# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)

###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)


def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = len(data) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data[0:nbatch * bsz]

    # Evenly divide the data across the bsz batches.
    def list2batch(x_list):
        # Pad every sentence up to the longest one in the batch.
        maxlen = max([len(x) for x in x_list])
        input = torch.LongTensor(maxlen, bsz).zero_()
        mask = torch.FloatTensor(maxlen, bsz).zero_()
        target = torch.LongTensor(maxlen, bsz).zero_()
        for idx, x in enumerate(x_list):
            # (Assumed completion -- the original excerpt breaks off here.)
            # Each sentence fills one column; mask flags the real tokens, and
            # the target is the input shifted by one step for LM training.
            input[:len(x), idx] = torch.LongTensor(x)
            mask[:len(x), idx] = 1
            target[:len(x) - 1, idx] = torch.LongTensor(x[1:])
        return input, target, mask

    # (Assumed completion) Group bsz consecutive sentences into each batch.
    return [list2batch(data[i:i + bsz]) for i in range(0, len(data), bsz)]
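# Minimal sketch (not part of the original script): what the trimming step in
# batchify does. The `_demo_*` names are invented for illustration only.
_demo_data = list(range(103))
_demo_bsz = 10
_demo_nbatch = len(_demo_data) // _demo_bsz          # 10 full groups of 10
_demo_data = _demo_data[0:_demo_nbatch * _demo_bsz]  # the 3 leftovers are dropped
assert len(_demo_data) == 100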
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)

# Load model
with open(args.checkpoint, 'rb') as f:
    model = torch.load(f)
if args.cuda:
    model.cuda()
    torch.cuda.manual_seed(args.seed)
else:
    model.cpu()

# Load data
# We are going to test the PTB parses, so the default corpus is PTB.
corpus = data_ptb.Corpus('./data/ptb')

# Load the dictionary that was used in training.
if 'ptb' in args.data:
    c2_dict = corpus.dictionary
else:
    corpus2 = data.Corpus(args.data)
    c2_dict = corpus2.dictionary

sys.stdout.flush()

# test(model, corpus, args.cuda, mode='test', dictionary=None, prt=True)
test(model, corpus, args.cuda, mode=args.mode, dictionary=c2_dict, prt=True)
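# Minimal sketch (not in the original script): why the training-time dictionary
# is reloaded -- the model's embedding rows are tied to those word ids, so a
# freshly built dictionary would scramble them. `_demo_*` names are invented.
_demo_word2idx = {'<unk>': 0, 'the': 1, 'dog': 2}
_demo_ids = [_demo_word2idx.get(w, _demo_word2idx['<unk>'])
             for w in ['the', 'cat']]  # -> [1, 0]; unseen 'cat' maps to <unk>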
torch.manual_seed(args.seed)

# Load model
with open(args.checkpoint, 'rb') as f:
    model, _, _ = torch.load(f)
model.cpu()
if args.cuda:
    # Seed CUDA only when the GPU is actually going to be used.
    torch.cuda.manual_seed(args.seed)
    model.cuda()

# Load data
import hashlib
fn = 'corpus.{}.data'.format(hashlib.md5('data/penn'.encode()).hexdigest())
print('Loading cached dataset...')
corpus = torch.load(fn)
dictionary = corpus.dictionary

# test_batch_size = 1
# test_data = batchify(corpus.test, test_batch_size, args)
# test_loss = evaluate(test_data, test_batch_size)
# print('=' * 89)
# print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'.format(
#     test_loss, math.exp(test_loss), test_loss / math.log(2)))
# print('=' * 89)

print('Loading PTB dataset...')
corpus = data_ptb.Corpus(args.data)
corpus.dictionary = dictionary

test(model, corpus, args.cuda, prt=True)
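# Minimal sketch (not in the original script): the cache filename depends only
# on the hashed data path, so any script hashing the same string resolves to
# the same cache file the training run wrote. Purely illustrative.
_demo_key = hashlib.md5('data/penn'.encode()).hexdigest()
print('cache file: corpus.{}.data'.format(_demo_key))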
###############################################################################
# Load data
###############################################################################

def batchify(data, bsz, cuda=False):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    if cuda:
        data = data.cuda()
    return data


corpus = data.Corpus(hps['data'])

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = RNNModel(hps['model'], ntokens, hps['emsize'], hps['nhid'],
                 hps['nlayers'], hps['dropout'], hps['tied'])
if hps['cuda']:
    model.cuda()

criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0)

###############################################################################
# Training code
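# Minimal sketch (not in the original file) of the layout produced by the
# view(bsz, -1).t() call in batchify above: with 12 tokens and bsz=3, each
# *column* is one contiguous stream of 4 tokens. `_demo_*` names are invented.
_demo = torch.arange(12)
_demo_batched = _demo.view(3, -1).t().contiguous()  # shape (4, 3)
# _demo_batched[:, 0] is tensor([0, 1, 2, 3]) -- column 0 holds the first stream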
torch.cuda.synchronize()

# Load data
import hashlib
# fn = 'corpus.{}.data'.format(hashlib.md5('data/penn/'.encode()).hexdigest())
# fn = 'corpus.{}.data'.format(hashlib.md5((args.data + args.wvec).encode()).hexdigest())
fn = 'corpus.{}.data'.format(hashlib.md5((args.data_train + args.wvec).encode()).hexdigest())
tools.print_log(args.save, 'Loading cached dataset...')
corpus = torch.load(fn)
dictionary = corpus.dictionary

tools.print_log(args.save, 'Loading PTB dataset...')
if args.wvec:
    word2idx = tools.pkl_loader(os.path.join('data/wordvec', args.wvec, 'words2idx'))
    idx2word = tools.pkl_loader(os.path.join('data/wordvec', args.wvec, 'idx2words'))
    corpus = data_ptb.Corpus(args.data, args.wvec, word2idx, idx2word)
else:
    corpus = data_ptb.Corpus(args.data)
corpus.dictionary = dictionary

test(model, corpus, args.cuda, prt=True)
fn = 'corpus'
if os.path.exists(fn):
    print('Loading cached dataset...')
    corpus = torch.load(fn)
else:
    print('Producing dataset...')
    corpus = data.Corpus(args.data, max_span_length=args.max_span_length)
    torch.save(corpus, fn)

fn_ptb = 'corpus_ptb'
if os.path.exists(fn_ptb):
    print('Loading cached PTB dataset...')
    corpus_ptb = torch.load(fn_ptb)
else:
    print('Producing PTB dataset...')
    corpus_ptb = data_ptb.Corpus(args.data_ptb)
    torch.save(corpus_ptb, fn_ptb)

sys.stdout.flush()

eval_batch_size = 10
test_batch_size = 1
train_data, train_trees = batchify(corpus.train, args.batch_size, args, corpus.train_trees)
val_data, _ = batchify(corpus.valid, eval_batch_size, args)
test_data, _ = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################
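# Minimal sketch (not in the original file): the load-or-build pattern above,
# factored into a helper. `_load_or_build` and `build_fn` are invented names.
def _load_or_build(path, build_fn):
    if os.path.exists(path):
        print('Loading cached dataset...')
        return torch.load(path)
    print('Producing dataset...')
    obj = build_fn()
    torch.save(obj, path)
    return obj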
from nltk import Tree
from tqdm import tqdm
import json

import data_ptb

CORPUS_FILE = '/home/ritesh/Content_alignment/diora_snli/data/snli_1.0/snli_1.0_train.jsonl'
RULE_FILE = '/home/ritesh/Content_alignment/diora_snli/Basic-CYK-Parser/grammar_wsj_no_cnf.txt'

if __name__ == '__main__':
    rules = dict()
    # data = open(CORPUS_FILE, "r")
    corpus = data_ptb.Corpus("/home/ritesh/Content_alignment/Tree-Transformer/data/")
    # corpus.dictionary = dictionary
    dataset = zip(corpus.test_sens, corpus.test_trees, corpus.test_nltktrees)
    output = open("wsj_bracket.jsonl", "w")
    idx = 0
    for sen, sen_tree, sen_nltktree in tqdm(dataset):
        tree = sen_nltktree
        # Remove children whose leaves are all empty-category traces (-NONE-).
        for sub in tree.subtrees():
            # Iterate in reverse so deletions don't skip the following sibling.
            for n in range(len(sub) - 1, -1, -1):
                child = sub[n]
                if isinstance(child, str):
                    continue
                none_nodes = list(child.subtrees(filter=lambda x: x.label() == '-NONE-'))
                if len(none_nodes) == len(child.leaves()):
                    del sub[n]
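    # Minimal sketch (not in the original script): the -NONE- pruning rule on a
    # toy tree. The (NP (-NONE- *T*)) child is deleted because every one of its
    # leaves sits under a -NONE- node. `_demo_tree` is invented for illustration.
    _demo_tree = Tree.fromstring('(S (NP (-NONE- *T*)) (VP (VBZ runs)))')
    for _n in range(len(_demo_tree) - 1, -1, -1):
        _child = _demo_tree[_n]
        if (not isinstance(_child, str)
                and len(list(_child.subtrees(filter=lambda x: x.label() == '-NONE-')))
                == len(_child.leaves())):
            del _demo_tree[_n]
    assert _demo_tree == Tree.fromstring('(S (VP (VBZ runs)))')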
def test(model, corpus, sess, seq_len):
    prt = True
    # Note: the passed-in corpus is overridden with the PTB corpus below.
    corpus = data_ptb.Corpus('data/penn')
    prec_list = []
    reca_list = []
    f1_list = []

    pred_tree_list = []
    targ_tree_list = []

    nsens = 0
    word2idx = corpus.dict.word2idx
    if True:  # args.wsj10
        dataset = zip(corpus.train_sens, corpus.train_trees, corpus.train_nltktrees)
    else:
        dataset = zip(corpus.test_sens, corpus.test_trees, corpus.test_nltktrees)

    corpus_sys = {}
    corpus_ref = {}
    print(len(corpus.test_sens))
    for sen, sen_tree, sen_nltktree in dataset:
        if len(sen) > 12:  # args.wsj10 and len(sen) > 12
            continue

        # Map words to ids, falling back to <unk> for out-of-vocabulary words.
        input = numpy.array([word2idx[w] if w in word2idx else word2idx['<unk>']
                             for w in sen])
        # Pad the batch out to 80 rows with zeros; only row 0 is a real sentence.
        input = numpy.stack([input] + [numpy.zeros(input.shape) for i in range(79)])

        _, _, distance_forget, distance_input = \
            sess.run([model.cell.forward_propagate(input.shape[1])],
                     feed_dict={model.cell.input: input,
                                model.cell.seq_len: seq_len,
                                model.targets: numpy.zeros((80, 1))})[0]

        # Keep only the gate distances for the first (real) sequence.
        distance_forget = distance_forget[:, :, 0]
        distance_input = distance_input[:, :, 0]

        nsens += 1
        if prt and nsens % 100 == 0:
            for i in range(len(sen)):
                print('%15s\t%s\t%s' % (sen[i],
                                        str(distance_forget[:, i]),
                                        str(distance_input[:, i])))
            print('Standard output:', sen_tree)

        # Strip the boundary tokens before building the tree.
        sen_cut = sen[1:-1]
        for gates in [
            # distance[0],
            distance_forget[1],
            # distance[2],
            # distance.mean(axis=0)
        ]:
            depth = gates[1:-1]
            parse_tree = build_tree(depth, sen_cut)

            corpus_sys[nsens] = MRG(parse_tree)
            corpus_ref[nsens] = MRG_labeled(sen_nltktree)

            pred_tree_list.append(parse_tree)
            targ_tree_list.append(sen_tree)

            # Score the bracket overlap between the model parse and the gold parse.
            model_out, _ = get_brackets(parse_tree)
            std_out, _ = get_brackets(sen_tree)
            overlap = model_out.intersection(std_out)

            prec = float(len(overlap)) / (len(model_out) + 1e-8)
            reca = float(len(overlap)) / (len(std_out) + 1e-8)
            if len(std_out) == 0:
                reca = 1.
                if len(model_out) == 0:
                    prec = 1.
            f1 = 2 * prec * reca / (prec + reca + 1e-8)
            prec_list.append(prec)
            reca_list.append(reca)
            f1_list.append(f1)

            if prt and nsens % 1 == 0:
                print('Model output:', parse_tree)
                print('Prec: %f, Reca: %f, F1: %f' % (prec, reca, f1))

        if prt and nsens % 100 == 0:
            print('-' * 80)
            # Plot the forget/input gate distances for each of the three layers.
            _, axarr = plt.subplots(3, sharex=True,
                                    figsize=(distance_forget.shape[1] // 2, 6))
            for layer, label in enumerate(['1st layer', '2nd layer', '3rd layer']):
                axarr[layer].bar(numpy.arange(distance_forget.shape[1]) - 0.2,
                                 distance_forget[layer], width=0.4)
                axarr[layer].bar(numpy.arange(distance_input.shape[1]) + 0.2,
                                 distance_input[layer], width=0.4)
                axarr[layer].set_ylim([0., 1.])
                axarr[layer].set_ylabel(label)
            plt.sca(axarr[2])
            plt.xlim(xmin=-0.5, xmax=distance_forget.shape[1] - 0.5)
            plt.xticks(numpy.arange(distance_forget.shape[1]), sen,
                       fontsize=10, rotation=45)
            plt.savefig('figure/%d.png' % nsens)
            plt.close()

    prec_list = numpy.array(prec_list).reshape((-1, 1))
    reca_list = numpy.array(reca_list).reshape((-1, 1))
    f1_list = numpy.array(f1_list).reshape((-1, 1))
    if prt:
        print('-' * 80)
        numpy.set_printoptions(precision=4)
        print('Mean Prec:', prec_list.mean(axis=0),
              ', Mean Reca:', reca_list.mean(axis=0),
              ', Mean F1:', f1_list.mean(axis=0))
        print('Number of sentence: %i' % nsens)

        # Labelled constituent statistics for a few phrase types.
        correct, total = corpus_stats_labeled(corpus_sys, corpus_ref)
        print(correct)
        print(total)
        print('ADJP:', correct['ADJP'], total['ADJP'])
        print('NP:', correct['NP'], total['NP'])
        print('PP:', correct['PP'], total['PP'])
        print('INTJ:', correct['INTJ'], total['INTJ'])
        print(corpus_average_depth(corpus_sys))

        evalb(pred_tree_list, targ_tree_list)

    return f1_list.mean(axis=0)
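# Minimal sketch (not from the original file): the bracket precision/recall/F1
# arithmetic used above, on a toy example with plain span sets standing in for
# get_brackets(). `_demo_*` names are invented for illustration.
_demo_pred = {(0, 4), (0, 2), (2, 4)}    # predicted (start, end) brackets
_demo_gold = {(0, 4), (1, 4), (2, 4)}    # gold brackets
_demo_overlap = _demo_pred & _demo_gold  # {(0, 4), (2, 4)}
_demo_prec = len(_demo_overlap) / (len(_demo_pred) + 1e-8)  # ~0.667
_demo_reca = len(_demo_overlap) / (len(_demo_gold) + 1e-8)  # ~0.667
_demo_f1 = 2 * _demo_prec * _demo_reca / (_demo_prec + _demo_reca + 1e-8)  # ~0.667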