Example #1
# Imports assumed from the full script (this excerpt omits them)
from os.path import join

from transformers import BertTokenizerFast


def main(args):
    # tokenizer_type and curpath are module-level names defined elsewhere
    # in the full script
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_type)

    ners = ['Component']

    add_marker_tokens(tokenizer, ners)

    train_annfiles, train_textfiles, train_corenlpfiles = read_inlist(
        args.train_inlist)
    print(f"Making Train data from {len(train_annfiles)} files")

    outdir = join(curpath, "ins/train")
    make_instances(tokenizer,
                   train_annfiles,
                   train_textfiles,
                   train_corenlpfiles,
                   outdir,
                   max_len=args.max_len,
                   is_training=True,
                   use_sys_ners=False)

    dev_annfiles, dev_textfiles, dev_corenlpfiles = read_inlist(
        args.dev_inlist)
    print(f"Making DEV data from {len(dev_annfiles)} files")
    outdir = join(curpath, "ins/dev/gold_ner")
    make_instances(tokenizer,
                   dev_annfiles,
                   dev_textfiles,
                   dev_corenlpfiles,
                   outdir,
                   max_len=args.max_len,
                   use_sys_ners=False)

    test_annfiles, test_textfiles, test_corenlpfiles = read_inlist(
        args.test_inlist)
    print(f"Making TEST data from {len(test_annfiles)} files")
    outdir = join(curpath, "ins/test/gold_ner")
    make_instances(tokenizer,
                   test_annfiles,
                   test_textfiles,
                   test_corenlpfiles,
                   outdir,
                   max_len=args.max_len,
                   use_sys_ners=False)

    vocab_dir = join(curpath, 'ins')
    tokenizer.save_vocabulary(vocab_dir)
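The helpers these excerpts call (add_marker_tokens, read_inlist, make_instances) are defined elsewhere in the project; the two sketches below are plausible reconstructions, not the project's actual code. First, add_marker_tokens, assuming a Hugging Face tokenizer and illustrative marker spellings:

def add_marker_tokens(tokenizer, ner_labels):
    # Hypothetical reconstruction: register entity-boundary markers so the
    # subword model never splits them. Marker spellings are illustrative,
    # not the project's actual strings.
    new_tokens = []
    for label in ner_labels:
        new_tokens.append(f"<ner_start={label.lower()}>")
        new_tokens.append(f"<ner_end={label.lower()}>")
    tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})

read_inlist presumably turns a list file into the three parallel path lists consumed above; a minimal sketch, assuming one whitespace-separated annotation/text/CoreNLP path triple per line (the file layout is a guess):

def read_inlist(inlist_path):
    # Hypothetical reconstruction of the list-file format
    annfiles, textfiles, corenlpfiles = [], [], []
    with open(inlist_path) as f:
        for line in f:
            if not line.strip():
                continue
            ann, text, corenlp = line.split()
            annfiles.append(ann)
            textfiles.append(text)
            corenlpfiles.append(corenlp)
    return annfiles, textfiles, corenlpfiles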
Example #2
    # This excerpt begins mid-parser; the --test_dir argument is
    # reconstructed from its surviving help string.
    parser.add_argument("--test_dir",
                        help='directory where testing instances are stored')
    parser.add_argument("--outdir",
                        default="./temp/prediction",
                        help="where to save the model to")
    parser.add_argument("--dropout", type=float, default=0)
    parser.add_argument("--analyze_dev",
                        default=0,
                        choices=[0, 1],
                        type=int,
                        help='whether to do analysis of the model predictions')

    args = parser.parse_args()

    ners = ['Component']

    # tokenizer is instantiated earlier in the full script (not shown)
    add_marker_tokens(tokenizer, ners)

    print("Loading data ")
    test_file = join(args.test_dir, "spanins.pkl")

    with open(test_file, "rb") as f:
        test_ins = pickle.load(f)

    with open(join(args.test_dir, "gold_spanins.pkl"), "rb") as f:
        test_gold_ins = pickle.load(f)
    """ ================ make dataset ================ """
    print("Making dataset ... ")
    test_dataset = MyDataset(test_ins)
    """ ================ make dataloader ============= """
    print("Making data loader ...")
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.batch_size)
    # (the excerpt is truncated here; batch_size is an assumed argument)
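MyDataset is defined elsewhere in the project; a minimal sketch, assuming it is a thin torch Dataset wrapper over the unpickled instance list (the class body is a guess, not the project's code):

from torch.utils.data import Dataset


class MyDataset(Dataset):
    """Hypothetical reconstruction: wraps a list of span instances."""

    def __init__(self, instances):
        self.instances = instances

    def __len__(self):
        return len(self.instances)

    def __getitem__(self, idx):
        return self.instances[idx]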
Example #3
    parser.add_argument("--max_grad_norm",
                        default=1.00,
                        type=float,
                        help="max gradient norm to clip")

    args = parser.parse_args()

    args.num_classes = len(label2ind)

    seed = args.seed

    seed_everything(seed)

    ners = ['Component']

    add_marker_tokens(tokenizer,
                      ners)  # add markers into the tokenizer's vocabulary

    print("Loading data ")

    # load data, this set is a instance-level training set
    with open(join(args.train_dir, "spanins.pkl"), "rb") as f:
        train_ins = pickle.load(f)
        pos = [r.relation_label for r in train_ins if r.relation_label != "O"]

        print(f"Training set contains {len(pos)}/{len(train_ins)} "
              f"({len(pos)/len(train_ins):.2f}) positive instances")

    with open(join(args.val_dir, "spanins.pkl"), "rb") as f:
        val_ins = pickle.load(f)
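seed_everything is not shown in these excerpts; it is a common reproducibility helper, and the sketch below shows what such a helper typically does (an assumption, not the project's exact code):

import os
import random

import numpy as np
import torch


def seed_everything(seed):
    # Seed every RNG the training run may touch so results are reproducible
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)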
Example #4
File: train.py Project: wkiri/MTE
    # This excerpt begins mid-parser; the --shuffle flag name is a guess
    # reconstructed from the surviving help string.
    parser.add_argument("--shuffle",
                        type=int,
                        default=1,
                        choices=[1, 0],
                        help="whether to shuffle the training data")

    parser.add_argument("--max_grad_norm",
                        default=1.00,
                        type=float,
                        help="max gradient norm to clip")

    args = parser.parse_args()

    args.num_classes = len(label2ind)
    seed = args.seed
    seed_everything(seed)

    add_marker_tokens(tokenizer, ['Target'])

    print("Loading data ")
    # load data, this set is instance based training set
    with open(join(args.train_dir, f"spanins.pkl"), "rb") as f:
        train_ins = pickle.load(f)
        pos = [r.relation_label for r in train_ins if r.relation_label != "O"]
        print(f"Training set contains {len(pos)}/{len(train_ins)} "
              f"({len(pos)/len(train_ins):.2f}) positive instances")

    with open(join(args.val_dir, "spanins.pkl"), "rb") as f:
        val_ins = pickle.load(f)

    with open(join(args.val_dir, "gold_spanins.pkl"), "rb") as f:
        val_gold_ins = pickle.load(f)
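label2ind, which sets args.num_classes in Examples #3 and #4, is also defined elsewhere in the project. A hypothetical two-class mapping consistent with the "O" (no-relation) label filtered in the positive-instance counts above; the relation name is a placeholder:

# Hypothetical label mapping; "O" marks negative (no-relation) pairs,
# matching the relation_label != "O" filter above. "Contains" is an
# illustrative name, not necessarily the project's actual label.
label2ind = {"O": 0, "Contains": 1}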