def main(args):
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_type)
    ners = ['Component']
    add_marker_tokens(tokenizer, ners)

    train_annfiles, train_textfiles, train_corenlpfiles = read_inlist(
        args.train_inlist)
    print(f"Making Train data from {len(train_annfiles)} files")
    outdir = join(curpath, "ins/train")
    make_instances(tokenizer,
                   train_annfiles,
                   train_textfiles,
                   train_corenlpfiles,
                   outdir,
                   max_len=args.max_len,
                   is_training=True,
                   use_sys_ners=False)

    dev_annfiles, dev_textfiles, dev_corenlpfiles = read_inlist(
        args.dev_inlist)
    print(f"Making DEV data from {len(dev_annfiles)} files")
    outdir = join(curpath, "ins/dev/gold_ner")
    make_instances(tokenizer,
                   dev_annfiles,
                   dev_textfiles,
                   dev_corenlpfiles,
                   outdir,
                   max_len=args.max_len,
                   use_sys_ners=False)

    test_annfiles, test_textfiles, test_corenlpfiles = read_inlist(
        args.test_inlist)
    print(f"Making TEST data from {len(test_annfiles)} files")
    outdir = join(curpath, "ins/test/gold_ner")
    make_instances(tokenizer,
                   test_annfiles,
                   test_textfiles,
                   test_corenlpfiles,
                   outdir,
                   max_len=args.max_len,
                   use_sys_ners=False)

    vocab_dir = join(curpath, 'ins')
    tokenizer.save_vocabulary(vocab_dir)
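# add_marker_tokens() is called in every script in this section but defined
# elsewhere in the repository. The sketch below is a hypothetical stand-in,
# not the project's actual helper: it assumes the function registers one
# opening and one closing marker token per NER type, so that marked spans
# survive tokenization as single vocabulary items. The marker strings are
# illustrative only.
def add_marker_tokens_sketch(tokenizer, ner_labels):
    markers = []
    for label in ner_labels:
        markers += [f"<ner_start={label}>", f"<ner_end={label}>"]
    # additional_special_tokens are kept atomic by HuggingFace tokenizers
    tokenizer.add_special_tokens({"additional_special_tokens": markers})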
                    help='directory where testing instances are stored')
parser.add_argument("--outdir",
                    default="./temp/prediction",
                    help="where to save the predictions to")
parser.add_argument("--dropout", type=float, default=0)
parser.add_argument("--analyze_dev",
                    default=0,
                    choices=[0, 1],
                    type=int,
                    help='whether to do analysis of the model predictions')
args = parser.parse_args()

ners = ['Component']
add_marker_tokens(tokenizer, ners)

print("Loading data ...")
test_file = join(args.test_dir, "spanins.pkl")
with open(test_file, "rb") as f:
    test_ins = pickle.load(f)
with open(join(args.test_dir, "gold_spanins.pkl"), "rb") as f:
    test_gold_ins = pickle.load(f)

""" ================ make dataset ================ """
print("Making dataset ...")
test_dataset = MyDataset(test_ins)

""" ================ make dataloader ============= """
print("Making data loader ...")
test_dataloader = DataLoader(test_dataset,
parser.add_argument("--max_grad_norm", default=1.00, type=float, help="max gradient norm to clip") args = parser.parse_args() args.num_classes = len(label2ind) seed = args.seed seed_everything(seed) ners = ['Component'] add_marker_tokens(tokenizer, ners) # add markers into the tokenizer's vocabulary print("Loading data ") # load data, this set is a instance-level training set with open(join(args.train_dir, "spanins.pkl"), "rb") as f: train_ins = pickle.load(f) pos = [r.relation_label for r in train_ins if r.relation_label != "O"] print( f"Training set contains {len(pos)}/{len(train_ins)}({len(pos)/len(train_ins):.2f}) positive instances " ) with open(join(args.val_dir, "spanins.pkl"), "rb") as f: val_ins = pickle.load(f)
                    default=1,
                    choices=[1, 0],
                    help="whether to shuffle the training data")
parser.add_argument("--max_grad_norm",
                    default=1.00,
                    type=float,
                    help="max gradient norm to clip")
args = parser.parse_args()
args.num_classes = len(label2ind)

seed = args.seed
seed_everything(seed)

add_marker_tokens(tokenizer, ['Target'])

print("Loading data ...")
# load data; this is an instance-level training set
with open(join(args.train_dir, "spanins.pkl"), "rb") as f:
    train_ins = pickle.load(f)
pos = [r.relation_label for r in train_ins if r.relation_label != "O"]
print(
    f"Training set contains {len(pos)}/{len(train_ins)} ({len(pos)/len(train_ins):.2f}) positive instances"
)
with open(join(args.val_dir, "spanins.pkl"), "rb") as f:
    val_ins = pickle.load(f)
with open(join(args.val_dir, "gold_spanins.pkl"), "rb") as f:
    val_gold_ins = pickle.load(f)
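# Neither the pickled span-instance objects nor the MyDataset wrapper used in
# the prediction script above is defined in these excerpts. The sketch below is
# a hypothetical stand-in for experimenting with the loading code: the instance
# only needs the .relation_label field accessed above (the real class
# presumably carries token ids, span offsets, and similar fields), and the
# dataset wrapper is assumed to simply index the pre-built instance list.
from dataclasses import dataclass

from torch.utils.data import Dataset


@dataclass
class SpanInstanceSketch:
    relation_label: str = "O"


class SpanInstanceDataset(Dataset):
    """Minimal Dataset over pre-built span instances (stand-in for MyDataset)."""

    def __init__(self, instances):
        self.instances = instances

    def __len__(self):
        return len(self.instances)

    def __getitem__(self, idx):
        return self.instances[idx]


# usage example: the positive-instance ratio printed by the training scripts
if __name__ == "__main__":
    train_ins = [SpanInstanceSketch("SomeRelation"), SpanInstanceSketch("O")]
    pos = [r.relation_label for r in train_ins if r.relation_label != "O"]
    print(f"{len(pos)}/{len(train_ins)} ({len(pos)/len(train_ins):.2f}) positive instances")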