def eval_files(gold_file, test_file):
    gold_trees = PhraseTree.load_treefile(gold_file)
    test_trees = PhraseTree.load_treefile(test_file)
    accuracy = FScore()
    match_gold_trees = []
    match_pred_trees = []
    umatch_gold_trees = []
    umatch_pred_trees = []
    for gold, test in zip(gold_trees, test_trees):
        if len(gold) == len(test):
            match_gold_trees.append(gold)
            match_pred_trees.append(test)
        else:
            umatch_gold_trees.append(gold)
            umatch_pred_trees.append(test)
    print("***eval matched pair***")
    match_struct, label = eval_trees(
        match_gold_trees, match_pred_trees, verbose=True)
    accuracy += label
    print("***eval unmatched pair***")
    umatch_struct, label = eval_trees(
        umatch_gold_trees, umatch_pred_trees, verbose=True)
    accuracy += label
    print("sum struct score:", match_struct + umatch_struct)
    return accuracy
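A minimal usage sketch for eval_files; the file paths below are hypothetical, and eval_trees is assumed to be the companion scorer defined alongside this function:

# Hypothetical paths; prints per-group scores and returns the combined
# labeled FScore over matched and unmatched sentence pairs.
overall = eval_files('data/dev.gold.trees', 'data/dev.pred.trees')
print(overall)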
def label(self, nonterminals=()):
    # Wrap the top stack entry in one node per nonterminal; the first
    # element of `nonterminals` becomes the innermost node.
    for nt in nonterminals:
        (left, right, trees) = self.stack.pop()
        tree = PhraseTree(
            symbol=nt,
            children=trees,
            sentence=self.sentence,
        )
        self.stack.append((left, right, [tree]))
def write_predicted(fname, test_trees, fm, network, batch_size, k=5, ap=2.5, mc=3):
    """
    Input trees are used only to carry sentences.
    """
    f = open(fname, 'w')
    for i in range(0, len(test_trees), batch_size):
        batch = [
            test_trees[i + j].sentence
            for j in range(batch_size)
            if i + j < len(test_trees)  # guard the final, possibly short batch
        ]
        batch_predicted = parse_batch_variable_beam_stream(
            batch, fm, network, k, ap, mc)
        for predicted in batch_predicted:
            topped = PhraseTree(
                symbol='TOP',
                children=[predicted],
                sentence=predicted.sentence,
            )
            f.write(str(topped))
            f.write('\n')
    f.close()
def vocab_init(fname, verbose=True):
    """
    Learn vocabulary from file of strings.
    """
    tag_freq = defaultdict(int)
    trees = PhraseTree.load_treefile(fname)
    for i, tree in enumerate(trees):
        for (word, tag) in tree.sentence:
            tag_freq[tag] += 1
        if verbose:
            print('\rTree {}'.format(i), end='')
            sys.stdout.flush()
    if verbose:
        print('\r', end='')

    tags = ['XX'] + sorted(tag_freq)
    tdict = OrderedDict((t, i) for (i, t) in enumerate(tags))

    if verbose:
        print('Loading features from {}'.format(fname))
        print('({} tags)'.format(len(tdict)))

    return {
        'tdict': tdict,
    }
def shift(self):
    j = self.i  # (index of shifted word)
    treelet = PhraseTree(
        symbol=self.sentence[j][1],
        leaf=j,
        sentence=self.sentence,
    )
    self.stack.append((j, j, [treelet]))
    self.i += 1
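A self-contained toy mirroring the shift/label stack discipline above, with plain tuples standing in for PhraseTree; it is purely illustrative of the transition semantics, not the real parser state class:

sentence = [('dogs', 'NNS'), ('bark', 'VBP')]
stack = []
i = 0

def shift():
    global i
    j = i
    stack.append((j, j, [(sentence[j][1], j)]))  # treelet as (tag, leaf index)
    i += 1

def label(nonterminals):
    for nt in nonterminals:  # the first nonterminal becomes the innermost node
        (left, right, trees) = stack.pop()
        stack.append((left, right, [(nt, trees)]))

shift()
shift()
label(['VP'])  # wraps the top span (1, 1) in a VP node
print(stack)
# [(0, 0, [('NNS', 0)]), (1, 1, [('VP', [('VBP', 1)])])]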
def vocab_init(fname, verbose=True):
    """
    Learn vocabulary from file of strings.
    """
    word_freq = defaultdict(int)
    tag_freq = defaultdict(int)
    label_freq = defaultdict(int)
    trees = PhraseTree.load_treefile(fname)
    for i, tree in enumerate(trees):
        for (word, tag) in tree.sentence:
            word_freq[word] += 1
            tag_freq[tag] += 1
        for action in Parser.gold_actions(tree):
            if action.startswith('label-'):
                label = action[6:]
                label_freq[label] += 1
        if verbose:
            print('\rTree {}'.format(i), end='')
            sys.stdout.flush()
    if verbose:
        print('\r', end='')

    words = [
        FeatureMapper.UNK,
        FeatureMapper.START,
        FeatureMapper.STOP,
    ] + sorted(word_freq)
    wdict = OrderedDict((w, i) for (i, w) in enumerate(words))

    tags = [
        FeatureMapper.UNK,
        FeatureMapper.START,
        FeatureMapper.STOP,
    ] + sorted(tag_freq)
    tdict = OrderedDict((t, i) for (i, t) in enumerate(tags))

    labels = sorted(label_freq)
    ldict = OrderedDict((l, i) for (i, l) in enumerate(labels))

    if verbose:
        print('Loading features from {}'.format(fname))
        print('({} words, {} tags, {} nonterminal-chains)'.format(
            len(wdict),
            len(tdict),
            len(ldict),
        ))

    return {
        'wdict': wdict,
        'word_freq': word_freq,
        'tdict': tdict,
        'ldict': ldict,
    }
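A hedged usage sketch for this vocab_init; the treebank path is hypothetical, and FeatureMapper and Parser are assumed to be importable as in the surrounding module:

vocab = vocab_init('data/train.trees', verbose=False)
print(len(vocab['wdict']), 'words')
print(len(vocab['tdict']), 'tags')
print(len(vocab['ldict']), 'nonterminal-chains')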
def extract_origin_grammar(tree_file, out_file="grammar.out"):
    grammar_dict = defaultdict(int)
    trees = PhraseTree.load_treefile(tree_file)
    for tree in trees:
        tree.grammar(grammar_dict)
    grammar_list = list(grammar_dict)  # rule strings only; counts are not written
    write_docs(fname=out_file, docs=grammar_list)
    return grammar_dict
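For example (the path is hypothetical, and this assumes PhraseTree.grammar accumulates production counts into the supplied dict):

grammar_counts = extract_origin_grammar('data/train.trees')
# Inspect the ten most frequent productions (illustrative only).
for rule, count in sorted(grammar_counts.items(), key=lambda kv: -kv[1])[:10]:
    print(count, rule)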
def gold_data_from_file(self, fname):
    """
    Static oracle for file.
    """
    trees = PhraseTree.load_treefile(fname)
    result = []
    for tree in trees:
        sentence_data = self.gold_data(tree)
        result.append(sentence_data)
    return result
def write_predicted(fname, test_trees, fm, network):
    """
    Input trees are used only to carry sentences.
    """
    f = open(fname, 'w')
    for tree in test_trees:
        predicted = Parser.parse(tree.sentence, fm, network)
        topped = PhraseTree(
            symbol='TOP',
            children=[predicted],
            sentence=predicted.sentence,
        )
        f.write(str(topped))
        f.write('\n')
    f.close()
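A usage sketch for this greedy variant; the paths are hypothetical, and fm and network are assumed to have been built and loaded as elsewhere in this code:

test_trees = PhraseTree.load_treefile('data/test.trees')
write_predicted('out/test.pred.trees', test_trees, fm, network)
# Each line of out/test.pred.trees now holds one TOP-rooted bracketing.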
def shift(self):
    j = self.i  # (index of shifted word)
    treelet = PhraseTree(leaf=j)
    self.stack.append((j, j, [treelet]))
    self.i += 1
def train(
    feature_mapper,
    word_dims,
    tag_dims,
    lstm_units,
    hidden_units,
    epochs,
    batch_size,
    train_data_file,
    dev_data_file,
    model_save_file,
    droprate,
    unk_param,
    alpha=1.0,
    beta=0.0,
):
    start_time = time.time()

    fm = feature_mapper
    word_count = fm.total_words()
    tag_count = fm.total_tags()

    network = Network(
        word_count=word_count,
        tag_count=tag_count,
        word_dims=word_dims,
        tag_dims=tag_dims,
        lstm_units=lstm_units,
        hidden_units=hidden_units,
        struct_out=2,
        label_out=fm.total_label_actions(),
        droprate=droprate,
    )
    network.init_params()

    print('Hidden units: {}, per-LSTM units: {}'.format(
        hidden_units,
        lstm_units,
    ))
    print('Embeddings: word={} tag={}'.format(
        (word_count, word_dims),
        (tag_count, tag_dims),
    ))
    print('Dropout rate: {}'.format(droprate))
    print('Parameters initialized in [-0.01, 0.01]')
    print('Random UNKing parameter z = {}'.format(unk_param))
    print('Exploration: alpha={} beta={}'.format(alpha, beta))

    training_data = fm.gold_data_from_file(train_data_file)
    num_batches = -(-len(training_data) // batch_size)  # ceiling division
    print('Loaded {} training sentences ({} batches of size {})!'.format(
        len(training_data),
        num_batches,
        batch_size,
    ))
    parse_every = -(-num_batches // 4)  # validate four times per epoch

    dev_trees = PhraseTree.load_treefile(dev_data_file)
    print('Loaded {} validation trees!'.format(len(dev_trees)))

    best_acc = FScore()
    for epoch in xrange(1, epochs + 1):
        print('........... epoch {} ...........'.format(epoch))

        total_cost = 0.0
        total_states = 0
        training_acc = FScore()

        np.random.shuffle(training_data)

        for b in xrange(num_batches):
            batch = training_data[(b * batch_size):((b + 1) * batch_size)]

            explore = [
                Parser.exploration(
                    example,
                    fm,
                    network,
                    alpha=alpha,
                    beta=beta,
                ) for example in batch
            ]
            for (_, acc) in explore:
                training_acc += acc
            batch = [example for (example, _) in explore]

            dynet.renew_cg()
            network.prep_params()

            errors = []
            for example in batch:

                ## random UNKing ##
                for (i, w) in enumerate(example['w']):
                    if w <= 2:
                        continue
                    freq = fm.word_freq_list[w]
                    drop_prob = unk_param / (unk_param + freq)
                    r = np.random.random()
                    if r < drop_prob:
                        example['w'][i] = 0

                fwd, back = network.evaluate_recurrent(
                    example['w'],
                    example['t'],
                )

                for (left, right), correct in example['struct_data'].items():
                    scores = network.evaluate_struct(fwd, back, left, right)
                    probs = dynet.softmax(scores)
                    loss = -dynet.log(dynet.pick(probs, correct))
                    errors.append(loss)
                total_states += len(example['struct_data'])

                for (left, right), correct in example['label_data'].items():
                    scores = network.evaluate_label(fwd, back, left, right)
                    probs = dynet.softmax(scores)
                    loss = -dynet.log(dynet.pick(probs, correct))
                    errors.append(loss)
                total_states += len(example['label_data'])

            batch_error = dynet.esum(errors)
            total_cost += batch_error.scalar_value()
            batch_error.backward()
            network.trainer.update()

            mean_cost = total_cost / total_states
            print(
                '\rBatch {} Mean Cost {:.4f} [Train: {}]'.format(
                    b,
                    mean_cost,
                    training_acc,
                ),
                end='',
            )
            sys.stdout.flush()

            if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                dev_acc = Parser.evaluate_corpus(
                    dev_trees,
                    fm,
                    network,
                )
                print(' [Val: {}]'.format(dev_acc))

                if dev_acc > best_acc:
                    best_acc = dev_acc
                    network.save(model_save_file)
                    print(' [saved model: {}]'.format(model_save_file))

        current_time = time.time()
        runmins = (current_time - start_time) / 60.
        print(' Elapsed time: {:.2f}m'.format(runmins))
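A hedged invocation sketch for train; every hyperparameter value and path below is illustrative rather than a recommended or published setting, and fm is assumed to be a FeatureMapper built from the training treebank:

train(
    feature_mapper=fm,
    word_dims=50,  # illustrative sizes only
    tag_dims=20,
    lstm_units=200,
    hidden_units=200,
    epochs=10,
    batch_size=10,
    train_data_file='data/train.trees',
    dev_data_file='data/dev.trees',
    model_save_file='models/parser.model',
    droprate=0.5,
    unk_param=0.8375,
    alpha=1.0,
    beta=0.0,
)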
else:
    print('Must specify either --vocab-file or --train-data.')
    print(' (Use -h or --help flag for full option list.)')
    sys.exit()

if args.model is None:
    print('Must specify --model (or --write-vocab) parameter.')
    print(' (Use -h or --help flag for full option list.)')
    sys.exit()

if args.test is not None:
    from phrase_tree import PhraseTree
    from network import Network
    from parser import Parser

    test_trees = PhraseTree.load_treefile(args.test)
    print('Loaded test trees from {}'.format(args.test))

    network = Network.load(args.model)
    print('Loaded model from: {}'.format(args.model))

    accuracy = Parser.evaluate_corpus(test_trees, fm, network)
    print('Accuracy: {}'.format(accuracy))

elif args.train is not None:
    from network import Network

    if args.np_seed is not None:
        import numpy as np
        np.random.seed(args.np_seed)

    print('L2 regularization: {}'.format(args.dynet_l2))

    Network.train(feature_mapper=fm,
import argparse

from phrase_tree import PhraseTree, FScore

parser = argparse.ArgumentParser()
parser.add_argument('--gold', type=str)
parser.add_argument('--pred', type=str)
args = parser.parse_args()

gold_trees = PhraseTree.load_treefile(args.gold)
pred_trees = PhraseTree.load_treefile(args.pred)

accuracy = FScore()
for gold, pred in zip(gold_trees, pred_trees):
    local_accuracy = pred.compare(gold)
    accuracy += local_accuracy
print(accuracy)
def gold_data_from_file(self, fname):
    """
    Static oracle for file.
    """
    trees = PhraseTree.load_treefile(fname)
    return self.gold_data_from_trees(trees)
def train(
    feature_mapper,
    word_dims,
    tag_dims,
    lstm_units,
    hidden_units,
    epochs,
    batch_size,
    train_data_file,
    dev_data_file,
    model_save_file,
    droprate,
    unk_param,
    alpha=1.0,
    beta=0.0,
    GPU=None,
):
    start_time = time.time()

    fm = feature_mapper
    word_count = fm.total_words()
    tag_count = fm.total_tags()

    network = Network(
        word_count=word_count,
        tag_count=tag_count,
        word_dims=word_dims,
        tag_dims=tag_dims,
        lstm_units=lstm_units,
        hidden_units=hidden_units,
        struct_out=2,
        label_out=fm.total_label_actions(),
        droprate=droprate,
        GPU=GPU,
    )
    f_loss = nn.CrossEntropyLoss(size_average=False)
    if GPU is not None:
        f_loss = f_loss.cuda(GPU)

    random.seed(1)
    torch.manual_seed(1)

    print('Hidden units: {}, per-LSTM units: {}'.format(
        hidden_units,
        lstm_units,
    ))
    print('Embeddings: word={} tag={}'.format(
        (word_count, word_dims),
        (tag_count, tag_dims),
    ))
    print('Dropout rate: {}'.format(droprate))
    print('Parameters initialized in [-0.01, 0.01]')
    print('Random UNKing parameter z = {}'.format(unk_param))
    print('Exploration: alpha={} beta={}'.format(alpha, beta))

    training_data = fm.gold_data_from_file(train_data_file)
    num_batches = -(-len(training_data) // batch_size)  # ceiling division
    print('Loaded {} training sentences ({} batches of size {})!'.format(
        len(training_data),
        num_batches,
        batch_size,
    ))
    parse_every = -(-num_batches // 4)  # validate four times per epoch

    dev_trees = PhraseTree.load_treefile(dev_data_file)
    print('Loaded {} validation trees!'.format(len(dev_trees)))

    best_acc = FScore()
    network.init_hidden()

    for epoch in xrange(1, epochs + 1):
        print('........... epoch {} ...........'.format(epoch))

        total_cost = 0.0
        total_states = 0
        training_acc = FScore()

        np.random.shuffle(training_data)

        for b in xrange(num_batches):
            network.trainer.zero_grad()
            batch = training_data[(b * batch_size):((b + 1) * batch_size)]

            explore = [
                Parser.exploration(
                    example,
                    fm,
                    network,
                    alpha=alpha,
                    beta=beta,
                ) for example in batch
            ]
            for (_, acc) in explore:
                training_acc += acc
            batch = [example for (example, _) in explore]

            errors = []
            network.init_hidden()

            for example in batch:

                ## random UNKing ##
                for (i, w) in enumerate(example['w']):
                    if w <= 2:
                        continue
                    freq = fm.word_freq_list[w]
                    drop_prob = unk_param / (unk_param + freq)
                    r = np.random.random()
                    if r < drop_prob:
                        example['w'][i] = 0

                fwd, back = network.evaluate_recurrent(
                    example['w'],
                    example['t'],
                )

                indices, targets = [], []
                for (left, right), correct in example['struct_data'].items():
                    indices.append((left, right))
                    targets.append(correct)
                """
                print(example['w'])
                print(indices)
                print(targets)
                raw_input()
                """
                targets = autograd.Variable(torch.LongTensor(targets))
                if network.GPU is not None:
                    targets = targets.cuda(network.GPU)
                scores = network.evaluate_struct(fwd, back, indices)
                for i in xrange(len(targets)):
                    score = scores[i]
                    target = targets[i]
                    loss = f_loss(score, target)
                    errors.append(loss)
                total_states += len(example['struct_data'])

                indices, targets = [], []
                for (left, right), correct in example['label_data'].items():
                    indices.append((left, right))
                    targets.append(correct)
                targets = autograd.Variable(torch.LongTensor(targets))
                if network.GPU is not None:
                    targets = targets.cuda(network.GPU)
                scores = network.evaluate_label(fwd, back, indices)
                for i in xrange(len(targets)):
                    score = scores[i]
                    target = targets[i]
                    loss = f_loss(score, target)
                    errors.append(loss)
                total_states += len(example['label_data'])

            batch_loss = torch.sum(torch.cat(errors))
            #network.trainer.zero_grad()
            batch_loss.backward()
            network.trainer.step()
            total_cost += batch_loss.data[0]

            mean_cost = (total_cost / total_states)
            print(
                '\rBatch {} Mean Cost {:.4f} [Train: {}]'.format(
                    b,
                    mean_cost,
                    training_acc,
                ),
                end='',
            )
            sys.stdout.flush()

            if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                dev_acc = Parser.evaluate_corpus(
                    dev_trees,
                    fm,
                    network,
                )
                print(' [Val: {}]'.format(dev_acc))

                if dev_acc > best_acc:
                    best_acc = dev_acc
                    network.save(model_save_file)
                    print(' [saved model: {}]'.format(model_save_file))

        current_time = time.time()
        runmins = (current_time - start_time) / 60.
        print(' Elapsed time: {:.2f}m'.format(runmins))