def predict(s, max_seq_len=128):
    """Run the joint model on one input string and return the decoded text.

    The string is tokenized with the module-level ``tokenizer`` and written as
    a single tab-separated row: the space-joined tokens fill the second and
    third columns, the other three columns hold the placeholder ``'na'``.
    That file is loaded through ``get_dataloader`` with batch size 1 and the
    first batch is decoded by ``joint_model.inference_forward`` with
    ``beam_width=1`` under ``torch.no_grad()``.

    Args:
        s: Raw input text.
        max_seq_len: Upper bound on the decoding length. The limit actually
            passed to the model is the smaller of this value and the source
            length (``pre_len[0]``) plus 10. Defaults to 128.

    Returns:
        The predicted tokens (the first predicted id is skipped) joined by
        spaces, cut at the first ``'止'`` token, with ``'[PAD]'`` removed and
        every ``' ##'`` occurrence deleted. ``None`` if the data loader yields
        no batch.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    pre_toks = tokenizer.tokenize(s)
    tok_str = ' '.join(pre_toks)

    # A private temporary directory replaces the fixed file name 'tmp' in the
    # current working directory: concurrent calls no longer overwrite each
    # other's input, and nothing is left behind once prediction finishes.
    with tempfile.TemporaryDirectory() as tmp_dir:
        data_path = os.path.join(tmp_dir, 'tmp')
        with open(data_path, 'w') as f:
            f.write('\t'.join(['na', tok_str, tok_str, 'na', 'na']))

        # Iterate inside the `with` block so the file still exists even if
        # the loader reads it lazily.
        dl, _ = get_dataloader(data_path, tok2id, 1)
        for batch in dl:
            (pre_id, pre_mask, pre_len, _, _, pre_tok_label_id, _,
             rel_ids, pos_ids, categories) = batch

            # The id of '行' is handed to the model as the start id.
            post_start_id = tok2id['行']
            max_len = min(max_seq_len,
                          pre_len[0].detach().cpu().numpy() + 10)

            with torch.no_grad():
                predicted_toks, _ = joint_model.inference_forward(
                    pre_id, post_start_id, pre_mask, pre_len, max_len,
                    pre_tok_label_id,
                    rel_ids=rel_ids, pos_ids=pos_ids, categories=categories,
                    beam_width=1)

            # Skip the first predicted id, then stop at the first '止'.
            pred_seq = [id2tok[x] for x in predicted_toks[0][1:]]
            if '止' in pred_seq:
                pred_seq = pred_seq[:pred_seq.index('止')]

            pred_seq = ' '.join(pred_seq).replace('[PAD]', '').strip()
            # Deleting ' ##' glues pieces such as 'play ##ing' into 'playing'.
            return pred_seq.replace(' ##', '')

    # The loader produced no batch for this input.
    return None
# Script excerpt: evaluation data + seq2seq model construction.
# NOTE(review): line breaks were lost in this excerpt, so as written everything
# after the first '#' on the line below is parsed as a comment, and the final
# Seq2Seq(...) call is cut off. The intended statements, in order:
#   1. Import `model` / `utils` under the aliases joint_model / joint_utils and
#      assert that ARGS.inference_output is set.
#   2. DATA: load the BERT tokenizer named by ARGS.bert_model (cache dir is
#      ARGS.working_dir + '/cache'), use its vocab as tok2id and add a '<del>'
#      entry whose id is the vocab size before insertion. tok2id is the same
#      object as tokenizer.vocab, so the entry is added to that dict in place.
#   3. Build eval_dataloader / num_eval_examples from ARGS.test with test=True
#      and add_del_tok=ARGS.add_del_tok.
#   4. MODEL: debias_model is a PointerSeq2Seq when ARGS.pointer_generator is
#      truthy, otherwise a Seq2Seq (arguments of the latter are not visible).
import model as joint_model import utils as joint_utils assert ARGS.inference_output, "Need to specify inference_output arg!" # # # # # # # # ## # # # ## # # DATA # # # # # # # # ## # # # ## # # tokenizer = BertTokenizer.from_pretrained(ARGS.bert_model, cache_dir=ARGS.working_dir + '/cache') tok2id = tokenizer.vocab tok2id['<del>'] = len(tok2id) eval_dataloader, num_eval_examples = get_dataloader( ARGS.test, tok2id, ARGS.test_batch_size, ARGS.working_dir + '/test_data.pkl', test=True, add_del_tok=ARGS.add_del_tok) # # # # # # # # ## # # # ## # # MODEL # # # # # # # # ## # # # ## # # if ARGS.pointer_generator: debias_model = seq2seq_model.PointerSeq2Seq( vocab_size=len(tok2id), hidden_size=ARGS.hidden_size, emb_dim=768, dropout=0.2, tok2id=tok2id) # 768 = bert hidden size else: debias_model = seq2seq_model.Seq2Seq(vocab_size=len(tok2id),
# Script excerpt: data loading for a featurizer-based pipeline.
# NOTE(review): line breaks were lost in this excerpt and the trailing
# `def data_for_scipy(...)` is cut off after its first two statements.
# The intended statements, in order:
#   1. Append '.' to sys.path, then import get_dataloader from shared.data and
#      ARGS from shared.args.
#   2. Create ARGS.working_dir if it does not exist.
#   3. Load the BERT tokenizer named by ARGS.bert_model (cache dir is
#      ARGS.working_dir + '/cache'), use its vocab as tok2id and add a '<del>'
#      entry whose id is the vocab size before insertion.
#   4. Build the train and eval dataloaders; here get_dataloader receives
#      ARGS.max_seq_len as its fourth positional argument, and the eval call
#      passes test=True.
#   5. Construct a Featurizer from tok2id.
sys.path.append('.') from shared.data import get_dataloader from shared.args import ARGS if not os.path.exists(ARGS.working_dir): os.makedirs(ARGS.working_dir) print('LOADING DATA...') tokenizer = BertTokenizer.from_pretrained(ARGS.bert_model, cache_dir=ARGS.working_dir + '/cache') tok2id = tokenizer.vocab tok2id['<del>'] = len(tok2id) train_dataloader, num_train_examples = get_dataloader( ARGS.train, tok2id, ARGS.train_batch_size, ARGS.max_seq_len, ARGS.working_dir + '/train_data.pkl') eval_dataloader, num_eval_examples = get_dataloader(ARGS.test, tok2id, ARGS.test_batch_size, ARGS.max_seq_len, ARGS.working_dir + '/test_data.pkl', test=True) featurizer = Featurizer(tok2id) def data_for_scipy(dataloader, by_seq=False): outX = [] outY = []
# Script excerpt: summary writer + pretrain/train/eval data loading.
# NOTE(review): line breaks were lost in this excerpt, so as written everything
# after the first '#' on the line below is parsed as a comment, and the final
# get_dataloader(...) call is cut off. The intended statements, in order:
#   1. Create a SummaryWriter on ARGS.working_dir.
#   2. DATA: load the BERT tokenizer named by ARGS.bert_model (cache dir is
#      ARGS.working_dir + '/cache'), use its vocab as tok2id, add a '<del>'
#      entry whose id is the vocab size before insertion, and print the
#      resulting vocab size.
#   3. Only when ARGS.pretrain_data is truthy, build pretrain_dataloader from
#      it with noise=True.
#   4. Build train_dataloader from ARGS.train with
#      categories_path=ARGS.categories_file and add_del_tok=ARGS.add_del_tok.
#   5. Start building eval_dataloader from ARGS.test (remaining arguments are
#      not visible).
writer = SummaryWriter(ARGS.working_dir) # # # # # # # # ## # # # ## # # DATA # # # # # # # # ## # # # ## # # tokenizer = BertTokenizer.from_pretrained(ARGS.bert_model, cache_dir=ARGS.working_dir + '/cache') tok2id = tokenizer.vocab tok2id['<del>'] = len(tok2id) print("Vocab size: {}".format(len(tok2id))) if ARGS.pretrain_data: print("Loading pretrain data...") pretrain_dataloader, num_pretrain_examples = get_dataloader( ARGS.pretrain_data, tok2id, ARGS.train_batch_size, ARGS.working_dir + '/pretrain_data.pkl', noise=True) print("Loading train data...") train_dataloader, num_train_examples = get_dataloader( ARGS.train, tok2id, ARGS.train_batch_size, ARGS.working_dir + '/train_data.pkl', categories_path=ARGS.categories_file, add_del_tok=ARGS.add_del_tok) print("Loading eval data...") eval_dataloader, num_eval_examples = get_dataloader( ARGS.test, tok2id,
# Script excerpt: command logging, data loading and tagging-model selection.
# NOTE(review): line breaks were lost in this excerpt, so as written everything
# after the first '#' on the line below is parsed as a comment, and the final
# from_pretrained(...) call is cut off. The intended statements, in order:
#   1. Write the invoking command line to ARGS.working_dir + '/command.sh'.
#      NOTE(review): 'python' is concatenated directly with ' '.join(sys.argv),
#      so the file contains e.g. "pythonscript.py ..." with no space after
#      "python" -- looks like a missing separator; confirm and fix.
#   2. DATA: load the BERT tokenizer named by ARGS.bert_model (cache dir is
#      ARGS.working_dir + '/cache'), use its vocab as tok2id, add a '<del>'
#      entry whose id is the vocab size before insertion, and print ARGS.
#   3. Build the train and eval dataloaders, both with
#      categories_path=ARGS.categories_file; the eval call also passes
#      test=True.
#   4. MODEL: when ARGS.extra_features_top is truthy, load
#      tagging_model.BertForMultitaskWithFeaturesOnTop via from_pretrained
#      (arguments and the other branches are not visible).
with open(ARGS.working_dir + '/command.sh', 'w') as f: f.write('python' + ' '.join(sys.argv) + '\n') # # # # # # # # ## # # # ## # # DATA # # # # # # # # ## # # # ## # # print('LOADING DATA...') tokenizer = BertTokenizer.from_pretrained(ARGS.bert_model, cache_dir=ARGS.working_dir + '/cache') tok2id = tokenizer.vocab tok2id['<del>'] = len(tok2id) print(ARGS) train_dataloader, num_train_examples = get_dataloader( ARGS.train, tok2id, ARGS.train_batch_size, ARGS.working_dir + '/train_data.pkl', categories_path=ARGS.categories_file) eval_dataloader, num_eval_examples = get_dataloader( ARGS.test, tok2id, ARGS.test_batch_size, ARGS.working_dir + '/test_data.pkl', test=True, categories_path=ARGS.categories_file) # # # # # # # # ## # # # ## # # MODEL # # # # # # # # ## # # # ## # # print('BUILDING MODEL...') if ARGS.extra_features_top: model = tagging_model.BertForMultitaskWithFeaturesOnTop.from_pretrained(
# Script excerpt: batch-size selection and data loading.
# NOTE(review): line breaks were lost in this excerpt, so as written everything
# after the first '#' on the line below is parsed as a comment. The excerpt
# starts inside an if/else whose condition is not visible, and the final
# get_dataloader(...) call is cut off. The intended statements, in order:
#   1. Tail of the `if` branch: TEST_BATCH_SIZE = 16.
#   2. `else` branch: TRAIN_BATCH_SIZE comes from ARGS.train_batch_size and
#      TEST_BATCH_SIZE is ARGS.test_batch_size floor-divided by
#      ARGS.beam_width.
#   3. DATA: load the BERT tokenizer named by BERT_MODEL (cache dir is
#      ARGS.working_dir + '/cache'), use its vocab as tok2id and add a '<del>'
#      entry whose id is the vocab size before insertion.
#   4. Only when ARGS.pretrain_data is truthy, build pretrain_dataloader from
#      it with noise=True.
#   5. Build train_dataloader (add_del_tok=ARGS.add_del_tok) and start
#      building eval_dataloader with test=True (remaining arguments are not
#      visible).
TEST_BATCH_SIZE = 16 else: TRAIN_BATCH_SIZE = ARGS.train_batch_size TEST_BATCH_SIZE = ARGS.test_batch_size // ARGS.beam_width # # # # # # # # ## # # # ## # # DATA # # # # # # # # ## # # # ## # # tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, cache_dir=ARGS.working_dir + '/cache') tok2id = tokenizer.vocab tok2id['<del>'] = len(tok2id) if ARGS.pretrain_data: pretrain_dataloader, num_pretrain_examples = get_dataloader( ARGS.pretrain_data, tok2id, TRAIN_BATCH_SIZE, ARGS.working_dir + '/pretrain_data.pkl', noise=True) train_dataloader, num_train_examples = get_dataloader( ARGS.train, tok2id, TRAIN_BATCH_SIZE, ARGS.working_dir + '/train_data.pkl', add_del_tok=ARGS.add_del_tok) eval_dataloader, num_eval_examples = get_dataloader( ARGS.test, tok2id, TEST_BATCH_SIZE, ARGS.working_dir + '/test_data.pkl', test=True,
# Script excerpt: per-feature contingency counts over the training data.
# NOTE(review): line breaks were lost in this excerpt, so as written everything
# after the first '#' on the line below is parsed as a comment, and the final
# featurize_batch(...) call is cut off. The intended statements, in order:
#   1. Load the "bert-base-uncased" tokenizer (cache dir is
#      ARGS.working_dir + '/cache'), use its vocab as tok2id and add a '<del>'
#      entry whose id is the vocab size before insertion.
#   2. Build a Featurizer from tok2id and look up the index of the
#      'entailing' feature (TARGET_VARIABLE) in its feature names.
#   3. Build `dataloader` from ARGS.train.
#      NOTE(review): the path passed is '/pretrain_data.pkl' and the count is
#      named num_pretrain_examples although the data is ARGS.train -- confirm
#      this naming is intended.
#   4. feature_counts maps each key to four counters (n00, n01, n10, n11; see
#      the inline labels), each starting at 1. rather than 0 -- presumably
#      add-one smoothing; TODO confirm.
#   5. For every batch, unpack its ten fields and featurize pre_id (remaining
#      arguments and the counting logic are not visible).
BERT_MODEL = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, cache_dir=ARGS.working_dir + '/cache') tok2id = tokenizer.vocab tok2id['<del>'] = len(tok2id) featurizer = Featurizer(tok2id) feature_names = featurizer.get_feature_names() TARGET_VARIABLE = 'entailing' tgt_idx = feature_names.index(TARGET_VARIABLE) dataloader, num_pretrain_examples = get_dataloader( ARGS.train, tok2id, ARGS.train_batch_size, ARGS.working_dir + '/pretrain_data.pkl') feature_counts = defaultdict( lambda: { 'n00': 1., # docs without term, 0 label 'n01': 1., # docs without term, 1 label 'n10': 1., # docs with term, 0 label 'n11': 1. # docs with term, 1 label }) for batch in tqdm(dataloader): (pre_id, pre_mask, pre_len, post_in_id, post_out_id, pre_tok_label_id, post_tok_label_id, rel_ids, pos_ids, categories) = batch features = featurizer.featurize_batch(pre_id.detach().cpu().numpy(),