def test(fm, args):
    test_trees = PhraseTree.load_trees(args.test)
    print('Loaded test trees from {}'.format(args.test))
    network = torch.load(args.model)
    print('Loaded model from: {}'.format(args.model))
    accuracy = Parser.evaluate_corpus(test_trees, fm, network)
    print('Accuracy: {}'.format(accuracy))
def vocab_init(fname, verbose=True):
    """
    Learn vocabulary from file of strings.
    """
    word_freq = defaultdict(int)
    tag_freq = defaultdict(int)
    label_freq = defaultdict(int)

    trees = PhraseTree.load_trees(fname)

    for i, tree in enumerate(trees):
        for (word, tag) in tree.sentence:
            word_freq[word] += 1
            tag_freq[tag] += 1
        for action in Parser.gold_actions(tree):
            if action.startswith('label-'):
                label = action[6:]
                label_freq[label] += 1
        if verbose:
            print('\rTree {}'.format(i), end='')
            sys.stdout.flush()
    if verbose:
        print('\r', end='')

    words = [
        FeatureMapper.UNK,
        FeatureMapper.START,
        FeatureMapper.STOP,
    ] + sorted(word_freq)
    wdict = OrderedDict((w, i) for (i, w) in enumerate(words))

    tags = [
        FeatureMapper.UNK,
        FeatureMapper.START,
        FeatureMapper.STOP,
    ] + sorted(tag_freq)
    tdict = OrderedDict((t, i) for (i, t) in enumerate(tags))

    labels = sorted(label_freq)
    ldict = OrderedDict((l, i) for (i, l) in enumerate(labels))

    if verbose:
        print('Loading features from {}'.format(fname))
        print('({} words, {} tags, {} nonterminal-chains)'.format(
            len(wdict),
            len(tdict),
            len(ldict),
        ))

    return {
        'wdict': wdict,
        'word_freq': word_freq,
        'tdict': tdict,
        'ldict': ldict,
    }
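# Usage sketch for vocab_init (the treebank path is hypothetical): build
# the index maps from a file of bracketed trees and inspect them. Ids 0-2
# of wdict/tdict are reserved for UNK/START/STOP by construction.
vocab = vocab_init('testdata/toy.clean')
print('{} words, {} tags, {} nonterminal-chains'.format(
    len(vocab['wdict']), len(vocab['tdict']), len(vocab['ldict'])))
print(vocab['wdict'][FeatureMapper.UNK])  # -> 0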
def gold_data_from_file(self, fname):
    """
    Static oracle for file.
    """
    trees = PhraseTree.load_trees(fname)
    result = []
    for tree in trees:
        sentence_data = self.gold_data(tree)
        result.append(sentence_data)
    return result
def write_raw_predicted(fname, sentences, fm, network):
    with open(fname, 'w') as f:
        for sentence in sentences:
            predicted = Parser.parse(sentence, fm, network)
            topped = PhraseTree(
                symbol='TOP',
                children=[predicted],
                sentence=predicted.sentence,
            )
            f.write(str(topped))
            f.write('\n')
def write_predicted(fname, trees, fm, network):
    """
    Input trees are used only to carry sentences.
    """
    accuracy = FScore()
    with open(fname, 'w') as f:
        for tree in trees:
            predicted = Parser.parse(tree.sentence, fm, network)
            local_accuracy = predicted.compare(tree)
            accuracy += local_accuracy
            topped = PhraseTree(
                symbol='TOP',
                children=[predicted],
                sentence=predicted.sentence,
            )
            f.write(str(topped))
            f.write('\n')
    return accuracy
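# Sketch: write TOP-rooted predictions for a dev treebank and report the
# F-score. Assumes `fm` (a FeatureMapper) and `network` are already loaded,
# e.g. as in test() above; the file paths are hypothetical.
dev_trees = PhraseTree.load_trees('testdata/toy.clean')
fscore = write_predicted('predicted.txt', dev_trees, fm, network)
print('Dev F-score: {}'.format(fscore))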
from core_nlp.models.parser.features import FeatureMapper
from core_nlp.data.phrase_tree import PhraseTree

fm = FeatureMapper.load_json(
    '/Users/qiwang/python-space/nju_nlp_tools/testdata/toy.vocab.json')
test_trees = PhraseTree.load_trees(
    '/Users/qiwang/python-space/nju_nlp_tools/testdata/toy.clean')
# test_trees[0].rotate_tree()
test_trees[0].draw_tree('tree.png')
def train(fm, args):
    train_data_file = args.train
    dev_data_file = args.dev
    epochs = args.epochs
    batch_size = args.batch_size
    unk_param = args.unk_param
    alpha = args.alpha
    beta = args.beta
    model_save_file = args.model

    print("this is train mode")
    start_time = time.time()

    network = Network(fm, args)
    optimizer = optimize.Adadelta(network.parameters(), eps=1e-7, rho=0.99)
    if GlobalNames.use_gpu:
        network.cuda()

    training_data = fm.gold_data_from_file(train_data_file)
    num_batches = -(-len(training_data) // batch_size)  # ceiling division
    print('Loaded {} training sentences ({} batches of size {})!'.format(
        len(training_data),
        num_batches,
        batch_size,
    ))
    parse_every = -(-num_batches // 4)  # evaluate on dev four times per epoch

    dev_trees = PhraseTree.load_trees(dev_data_file)
    print('Loaded {} validation trees!'.format(len(dev_trees)))

    best_acc = FScore()
    for epoch in range(1, epochs + 1):
        print('........... epoch {} ...........'.format(epoch))
        total_cost = 0.0
        total_states = 0
        training_acc = FScore()

        np.random.shuffle(training_data)

        for b in range(num_batches):
            network.zero_grad()
            batch = training_data[(b * batch_size):((b + 1) * batch_size)]
            batch_loss = None
            for example in batch:
                example_loss, example_states, acc = Parser.exploration(
                    example, fm, network, alpha, beta, unk_param)
                total_states += example_states
                if batch_loss is not None:
                    batch_loss += example_loss
                else:
                    batch_loss = example_loss
                training_acc += acc
            if GlobalNames.use_gpu:
                total_cost += batch_loss.cpu().data.numpy()[0]
            else:
                total_cost += batch_loss.data.numpy()[0]
            batch_loss.backward()
            optimizer.step()

            mean_cost = total_cost / total_states
            print(
                '\rBatch {} Mean Cost {:.4f} [Train: {}]'.format(
                    b,
                    mean_cost,
                    training_acc,
                ),
                end='',
            )
            sys.stdout.flush()

            if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                dev_acc = Parser.evaluate_corpus(dev_trees, fm, network)
                print(' [Dev: {}]'.format(dev_acc))
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    torch.save(network, model_save_file)
                    print(' [saved model: {}]'.format(model_save_file))

        current_time = time.time()
        runmins = (current_time - start_time) / 60.
        print(' Elapsed time: {:.2f}m'.format(runmins))
if args.vocab is not None:
    fm = FeatureMapper.load_json(args.vocab)
elif args.train is not None:
    fm = FeatureMapper(args.train)
    if args.vocab_output is not None:
        fm.save_json(args.vocab_output)
        print('Wrote vocabulary file {}'.format(args.vocab_output))
        sys.exit()
else:
    print('Must specify either --vocab-file or --train-data.')
    print(' (Use -h or --help flag for full option list.)')
    sys.exit()

if args.model is None:
    print('Must specify --model (or --write-vocab) parameter.')
    print(' (Use -h or --help flag for full option list.)')
    sys.exit()

if args.test is not None:
    from parser import Parser
    import torch
    test_trees = PhraseTree.load_trees(args.test)
    print('Loaded test trees from {}'.format(args.test))
    network = torch.load(args.model)
    print('Loaded model from: {}'.format(args.model))
    accuracy = Parser.evaluate_corpus(test_trees, fm, network)
    print('Accuracy: {}'.format(accuracy))
elif args.train is not None:
    train(fm, args)
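# A plausible argparse setup matching the attributes read above and in
# train(). The flag names and defaults here are assumptions for
# illustration, not the repo's actual CLI definition.
import argparse

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--vocab', help='read vocabulary from a JSON file')
arg_parser.add_argument('--vocab-output', help='write learned vocabulary to JSON and exit')
arg_parser.add_argument('--train', help='training treebank file')
arg_parser.add_argument('--dev', help='validation treebank file')
arg_parser.add_argument('--test', help='test treebank file')
arg_parser.add_argument('--model', help='path for loading/saving the model')
arg_parser.add_argument('--epochs', type=int, default=10)
arg_parser.add_argument('--batch-size', type=int, default=10)
arg_parser.add_argument('--unk-param', type=float, default=0.8375)
arg_parser.add_argument('--alpha', type=float, default=1.0)
arg_parser.add_argument('--beta', type=float, default=0.0)
args = arg_parser.parse_args()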
def train(fm, args):
    train_data_file = args.train
    dev_data_file = args.dev
    epochs = args.epochs
    batch_size = args.batch_size
    unk_param = args.unk_param
    alpha = args.alpha
    beta = args.beta
    model_save_file = args.model

    print("this is train mode")
    start_time = time.time()

    network = SpanParserNN(fm, args)
    optimizer = optimize.Adadelta(network.parameters(), eps=1e-7, rho=0.99)
    # network.cuda()

    training_data = fm.gold_data_from_file(train_data_file)
    num_batches = -(-len(training_data) // batch_size)  # ceiling division
    print('Loaded {} training sentences ({} batches of size {})!'.format(
        len(training_data),
        num_batches,
        batch_size,
    ))
    parse_every = -(-num_batches // 4)  # evaluate on dev four times per epoch

    dev_trees = PhraseTree.load_trees(dev_data_file)
    print('Loaded {} validation trees!'.format(len(dev_trees)))

    best_acc = FScore()
    for epoch in range(1, epochs + 1):
        print('........... epoch {} ...........'.format(epoch))
        total_cost = 0.0
        total_states = 0
        training_acc = FScore()

        np.random.shuffle(training_data)

        for b in range(num_batches):
            batch = training_data[(b * batch_size):((b + 1) * batch_size)]

            explore = [
                Parser.exploration(
                    example,
                    fm,
                    network,
                    alpha=alpha,
                    beta=beta,
                )
                for example in batch
            ]
            for (_, acc) in explore:
                training_acc += acc
            batch = [example for (example, _) in explore]

            sum_loss = np.zeros(1)
            for example in batch:

                ## random UNKing ##
                for (i, w) in enumerate(example['w']):
                    if w <= 2:
                        continue  # never drop the reserved UNK/START/STOP ids
                    freq = fm.word_freq_list[w]
                    drop_prob = unk_param / (unk_param + freq)
                    r = np.random.random()
                    if r < drop_prob:
                        example['w'][i] = 0  # replace with the UNK id

                fwd, back = network.evaluate_word(
                    example['w'],
                    example['t'],
                )

                for (left, right), correct in example['struct_data'].items():
                    scores = network(fwd, back, left, right, 'struct')
                    probs = F.softmax(scores, dim=0)
                    loss = -torch.log(probs[correct])
                    sum_loss += loss.data.numpy()
                    loss.backward(retain_graph=True)
                total_states += len(example['struct_data'])

                for (left, right), correct in example['label_data'].items():
                    scores = network(fwd, back, left, right, 'label')
                    probs = F.softmax(scores, dim=0)
                    loss = -torch.log(probs[correct])
                    sum_loss += loss.data.numpy()
                    loss.backward(retain_graph=True)
                total_states += len(example['label_data'])

            total_cost += float(sum_loss[0])
            optimizer.step()
            network.zero_grad()

            mean_cost = total_cost / total_states
            print(
                '\rBatch {} Mean Cost {:.4f} [Train: {}]'.format(
                    b,
                    mean_cost,
                    training_acc,
                ),
                end='',
            )
            sys.stdout.flush()

            if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                dev_acc = Parser.evaluate_corpus(dev_trees, fm, network)
                print(' [Val: {}]'.format(dev_acc))
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    torch.save(network, model_save_file)
                    print(' [saved model: {}]'.format(model_save_file))

        current_time = time.time()
        runmins = (current_time - start_time) / 60.
        print(' Elapsed time: {:.2f}m'.format(runmins))
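# Worked example of the random-UNKing probability used above: a word id is
# replaced by UNK (id 0) with probability unk_param / (unk_param + freq),
# so rare words are dropped often and frequent words almost never. The
# unk_param value here is illustrative.
unk_param = 0.8375
for freq in (1, 10, 100):
    print(freq, round(unk_param / (unk_param + freq), 3))
# 1 0.456
# 10 0.077
# 100 0.008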
def label(self, nonterminals=[]):
    # Wrap the top span of the stack in a chain of nonterminals,
    # innermost first; the span boundaries (left, right) are unchanged.
    for nt in nonterminals:
        (left, right, trees) = self.stack.pop()
        tree = PhraseTree(symbol=nt, children=trees)
        self.stack.append((left, right, [tree]))
def shift(self):
    j = self.i  # (index of shifted word)
    treelet = PhraseTree(leaf=j)
    self.stack.append((j, j, [treelet]))
    self.i += 1
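# Toy trace of the stack discipline (a hypothetical minimal State class that
# reuses the shift/label functions above; PhraseTree is this repo's class).
# shift pushes a single-word span (j, j); label wraps the top span in a
# chain of nonterminals.
from core_nlp.data.phrase_tree import PhraseTree

class State(object):
    def __init__(self):
        self.stack = []
        self.i = 0
    shift = shift
    label = label

s = State()
s.shift()          # stack: [(0, 0, [leaf 0])]
s.label(['NP'])    # wrap the top span: [(0, 0, [(NP leaf0)])]
s.shift()          # push the next leaf: [(0, 0, ...), (1, 1, [leaf 1])]
print([(l, r) for (l, r, _) in s.stack])  # -> [(0, 0), (1, 1)]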