def read_group_answers(path: str) -> list:
    """Collect, for every group, the distinct answer letters anyone gave.

    Each chunk produced by ``read_chunks(path)`` is treated as one group.
    Every character of every line in the chunk that matches
    ``ANSWER_PATTERN`` is added to that group's set.

    Args:
        path: Location handed straight to ``read_chunks``.

    Returns:
        A list holding one ``set`` of matching characters per chunk, in the
        order the chunks were produced.
    """
    return [
        {
            symbol
            for row in group
            for symbol in row
            if re.match(ANSWER_PATTERN, symbol)
        }
        for group in read_chunks(path)
    ]
def main_test():
    """Smoke-test the dynamic RBT tree builder on the trial data.

    Builds a label vocabulary from ``data/labels.txt`` (the empty tuple, each
    label, each label with a trailing apostrophe, and ``parse.EMPTY``), then
    asks ``latent_tree_builder`` for the dynamic RBT trees of the instances in
    ``data/trial.txt``.  For every tree the linearized form is printed, and
    ``convert()`` is applied twice in a row with a blank line printed after
    each conversion.
    """
    import util
    import vocabulary
    import parse

    labels = util.load_label_list('data/labels.txt')

    label_vocab = vocabulary.Vocabulary()
    label_vocab.index(())
    # Plain labels are registered first, then their apostrophe-suffixed twins.
    for label in labels:
        label_vocab.index((label,))
    for label in labels:
        label_vocab.index((label + "'",))
    label_vocab.index((parse.EMPTY,))
    label_vocab.freeze()

    builder = latent_tree_builder(label_vocab, 'city')
    instances = util.read_chunks('data/trial.txt')

    # Earlier experiment, kept for reference:
    # for k in range(3):
    #     trees = latent.build_latent_trees(insts)
    #     for tree in trees:
    #         print(tree.linearize())
    #     print()

    for _, tree, _, _ in builder.build_dynamicRBT_trees(instances):
        print(tree.linearize())
        once_converted = tree.convert()
        print()
        # The second conversion is applied to the result of the first one;
        # its own result is not used.
        once_converted.convert()
        print()


# main_test()
def run_test2(args):
    """Score a saved parser on the test set and print the result.

    Loads a parser checkpoint, parses every test instance, writes one
    linearized predicted tree per line to ``aaa.test.predtree.txt``, and
    evaluates the predicted chunks against the gold chunks with
    ``evaluate.eval_chunks2``.

    Args:
        args: Parsed command-line namespace.  Only ``test_path``, ``normal``
            and ``evalb_dir`` are read here.
    """
    model = dy.ParameterCollection()
    # NOTE(review): the checkpoint path is hard-coded.  The args-driven form
    # was: [parser] = dy.load(args.model_path_base, model)
    [parser] = dy.load(
        "models/chartdyRBTC-model_addr_dytree_giga_0.4_200_1_chartdyRBTC_dytree_1_houseno_0_0_dev=0.90",
        model)

    test_chunk_insts = util.read_chunks(args.test_path, args.normal)

    # NOTE(review): output names are hard-coded as well.  The args-driven form
    # used args.expname + '.test.predtree.txt' / args.expname + '.test.txt'.
    # The context manager guarantees the log is closed even if parsing raises.
    with open('aaa' + '.test.predtree.txt', 'w', encoding='utf-8') as ftreelog:
        test_start_time = time.time()

        test_gold = [util.inst2chunks(inst) for inst in test_chunk_insts]

        test_predicted = []
        for x, _ in test_chunk_insts:
            dy.renew_cg()
            # Every character gets the same placeholder tag.
            sentence = [(parse.XX, ch) for ch in x]
            predicted, _ = parser.parse(sentence)
            pred_tree = predicted.convert()
            ftreelog.write(pred_tree.linearize() + '\n')
            test_predicted.append(pred_tree.to_chunks())

    test_fscore = evaluate.eval_chunks2(
        args.evalb_dir, test_gold, test_predicted,
        output_filename='aaaabbbb' + '.test.txt')  # evalb

    print("test-fscore {} "
          "test-elapsed {} ".format(
              test_fscore,
              format_elapsed(test_start_time),
          ))
def count_unanimous_answers(path: str) -> list:
    """Count, per group, how many answers every member of the group gave.

    Each chunk produced by ``read_chunks(path)`` is one group and each item of
    the chunk is one member's response.  Characters matching
    ``ANSWER_PATTERN`` are tallied across the group; an answer is counted as
    unanimous when its tally equals the number of responses.

    Note that occurrences are tallied, not members, so this relies on a
    response listing each letter at most once.

    Args:
        path: Location handed straight to ``read_chunks``.

    Returns:
        A list with one integer per group: the number of unanimous answers.
    """
    unanimous_per_group = []
    for chunk in read_chunks(path):
        tally = defaultdict(int)
        members = list(chunk)
        for member in members:
            for symbol in member:
                if re.match(ANSWER_PATTERN, symbol):
                    tally[symbol] += 1

        group_size = len(members)
        unanimous_per_group.append(
            sum(1 for votes in tally.values() if votes == group_size))
    return unanimous_per_group
def run_train(args):
    """Train the chunk parser, periodically scoring it on dev and test data.

    The function loads the train/dev/test chunk instances, builds the tag,
    word and label vocabularies, constructs a
    ``parse.ChartDynamicRBTConstraintParser`` (optionally replaced by a
    checkpoint named in ``args.loadmodel``) and trains it with Adam in
    mini-batches.  After roughly every ``len(train) / args.checks_per_epoch``
    processed instances, and only once ``epoch > 7``, the parser is scored on
    the dev set.  When the dev F-score improves, the previous best checkpoint
    is deleted, the new one is saved and the test set is scored as well.

    Args:
        args: Parsed command-line namespace.  Attributes read here:
            ``trial``, ``train_path``, ``dev_path``, ``test_path``, ``normal``,
            ``labellist_path``, ``nontlabelstyle``, ``RBTlabel``,
            ``print_vocabs``, ``pretrainemb``, ``word_embedding_dim``,
            ``tag_embedding_dim``, ``lstm_layers``, ``lstm_dim``,
            ``label_hidden_dim``, ``dropout``, ``chunkencoding``, ``trainc``,
            ``zerocostchunk``, ``parser_type``, ``loadmodel``,
            ``checks_per_epoch``, ``evalb_dir``, ``expname``,
            ``model_path_base``, ``epochs``, ``batch_size``, ``maxllimit`` and
            ``explore``.  ``numpy_seed`` is overwritten, and the three data
            paths are overwritten when ``trial == 1``.
    """
    # NOTE(review): this replaces any seed supplied on ``args`` with the name
    # ``seed``, which is not defined inside this function (presumably a
    # module-level value) -- confirm that this override is intentional.
    args.numpy_seed = seed
    if args.numpy_seed is not None:
        print("Setting numpy random seed to {}...".format(args.numpy_seed))
        np.random.seed(args.numpy_seed)

    if args.trial == 1:
        # Trial mode: train, dev and test all point at the same small file.
        args.train_path = 'data/trial.txt'
        args.dev_path = 'data/trial.txt'
        args.test_path = 'data/trial.txt'

    print("Loading training trees from {}...".format(args.train_path))
    train_chunk_insts = util.read_chunks(args.train_path, args.normal)
    print("Loaded {:,} training examples.".format(len(train_chunk_insts)))

    print("Loading development trees from {}...".format(args.dev_path))
    dev_chunk_insts = util.read_chunks(args.dev_path, args.normal)
    print("Loaded {:,} development examples.".format(len(dev_chunk_insts)))

    print("Loading test trees from {}...".format(args.test_path))
    test_chunk_insts = util.read_chunks(args.test_path, args.normal)
    print("Loaded {:,} test examples.".format(len(test_chunk_insts)))

    print("Constructing vocabularies...")

    tag_vocab = vocabulary.Vocabulary()
    tag_vocab.index(parse.START)
    tag_vocab.index(parse.STOP)
    tag_vocab.index(parse.XX)

    word_vocab = vocabulary.Vocabulary()
    word_vocab.index(parse.START)
    word_vocab.index(parse.STOP)
    word_vocab.index(parse.UNK)
    word_vocab.index(parse.NUM)

    # The word vocabulary covers the characters of all three splits.
    for x, chunks in train_chunk_insts + dev_chunk_insts + test_chunk_insts:
        for ch in x:
            word_vocab.index(ch)

    label_vocab = vocabulary.Vocabulary()
    label_vocab.index(())

    label_list = util.load_label_list(args.labellist_path)
    for item in label_list:
        label_vocab.index((item, ))

    # Style 1 uses a single EMPTY label; every other style registers an
    # apostrophe-suffixed twin of each label instead.
    if args.nontlabelstyle != 1:
        for item in label_list:
            label_vocab.index((item + "'", ))

    if args.nontlabelstyle == 1:
        label_vocab.index((parse.EMPTY, ))

    tag_vocab.freeze()
    word_vocab.freeze()
    label_vocab.freeze()

    latent_tree = latent.latent_tree_builder(label_vocab, args.RBTlabel,
                                             args.nontlabelstyle)

    def print_vocabulary(name, vocab):
        """Print a vocabulary's size and values, special symbols first."""
        special = {parse.START, parse.STOP, parse.UNK}
        print("{} ({:,}): {}".format(
            name, vocab.size,
            sorted(value for value in vocab.values if value in special) +
            sorted(value for value in vocab.values if value not in special)))

    if args.print_vocabs:
        print_vocabulary("Tag", tag_vocab)
        print_vocabulary("Word", word_vocab)
        print_vocabulary("Label", label_vocab)

    print("Initializing model...")

    pretrain = {'giga': 'data/giga.vec100', 'none': 'none'}
    pretrainemb = util.load_pretrain(pretrain[args.pretrainemb],
                                     args.word_embedding_dim, word_vocab)

    model = dy.ParameterCollection()
    if args.parser_type == "chartdyRBTC":
        parser = parse.ChartDynamicRBTConstraintParser(
            model,
            tag_vocab,
            word_vocab,
            label_vocab,
            args.tag_embedding_dim,
            args.word_embedding_dim,
            args.lstm_layers,
            args.lstm_dim,
            args.label_hidden_dim,
            args.dropout,
            (args.pretrainemb, pretrainemb),
            args.chunkencoding,
            args.trainc == 1,
            True,
            (args.zerocostchunk == 1),
        )
    else:
        print('Model is not valid!')
        exit()

    if args.loadmodel != 'none':
        # A checkpoint replaces the freshly constructed parser.
        tmp = dy.load(args.loadmodel, model)
        parser = tmp[0]
        print('Model is loaded from ', args.loadmodel)

    trainer = dy.AdamTrainer(model)

    total_processed = 0
    current_processed = 0
    check_every = len(train_chunk_insts) / args.checks_per_epoch
    best_dev_fscore = -np.inf
    best_dev_model_path = None

    start_time = time.time()

    def check_dev():
        """Score on dev; on a new best, save the model and score on test."""
        nonlocal best_dev_fscore
        nonlocal best_dev_model_path

        dev_start_time = time.time()

        dev_gold = [util.inst2chunks(inst) for inst in dev_chunk_insts]

        dev_predicted = []
        for x, _ in dev_chunk_insts:
            dy.renew_cg()
            sentence = [(parse.XX, ch) for ch in x]
            predicted, _ = parser.parse(sentence)
            dev_predicted.append(predicted.convert().to_chunks())

        dev_fscore = evaluate.eval_chunks2(
            args.evalb_dir, dev_gold, dev_predicted,
            output_filename=args.expname + '.dev.txt')  # evalb

        print("dev-fscore {} "
              "dev-elapsed {} "
              "total-elapsed {}".format(
                  dev_fscore,
                  format_elapsed(dev_start_time),
                  format_elapsed(start_time),
              ))

        if dev_fscore.fscore <= best_dev_fscore:
            return

        # New best dev score: drop the files of the previous best checkpoint.
        if best_dev_model_path is not None:
            for ext in (".data", ".meta"):
                path = best_dev_model_path + ext
                if os.path.exists(path):
                    print("Removing previous model file {}...".format(path))
                    os.remove(path)

        best_dev_fscore = dev_fscore.fscore
        best_dev_model_path = "{}_dev={:.2f}".format(
            args.model_path_base + "_" + args.expname, dev_fscore.fscore)
        print("Saving new best model to {}...".format(best_dev_model_path))
        dy.save(best_dev_model_path, [parser])

        test_start_time = time.time()

        test_gold = [util.inst2chunks(inst) for inst in test_chunk_insts]

        test_predicted = []
        # The context manager closes the tree log even if parsing raises.
        with open(args.expname + '.test.predtree.txt', 'w',
                  encoding='utf-8') as ftreelog:
            for x, _ in test_chunk_insts:
                dy.renew_cg()
                sentence = [(parse.XX, ch) for ch in x]
                predicted, _ = parser.parse(sentence)
                pred_tree = predicted.convert()
                ftreelog.write(pred_tree.linearize() + '\n')
                test_predicted.append(pred_tree.to_chunks())

        test_fscore = evaluate.eval_chunks2(
            args.evalb_dir, test_gold, test_predicted,
            output_filename=args.expname + '.test.txt')  # evalb

        # ``epoch`` is the loop variable of the training loop below.
        print("epoch {:,} "
              "test-fscore {} "
              "test-elapsed {} "
              "total-elapsed {}".format(
                  epoch,
                  test_fscore,
                  format_elapsed(test_start_time),
                  format_elapsed(start_time),
              ))

    train_trees = latent_tree.build_dynamicRBT_trees(train_chunk_insts)
    train_trees = [(x, tree.convert(), chunks, latentscope)
                   for x, tree, chunks, latentscope in train_trees]

    for epoch in itertools.count(start=1):
        if args.epochs is not None and epoch > args.epochs:
            break

        # Shuffle the list the batches are actually sliced from.  Shuffling
        # ``train_chunk_insts`` here had no effect on the training order.
        np.random.shuffle(train_trees)
        epoch_start_time = time.time()

        for start_index in range(0, len(train_chunk_insts), args.batch_size):
            dy.renew_cg()
            batch_losses = []
            for x, tree, chunks, latentscope in train_trees[
                    start_index:start_index + args.batch_size]:

                # Skip instances containing a chunk longer than the limit.
                if any(chunk[2] - chunk[1] > args.maxllimit
                       for chunk in chunks):
                    continue

                sentence = [(parse.XX, ch) for ch in x]
                if args.parser_type == "top-down":
                    _, loss = parser.parse(sentence, tree, args.explore)
                else:
                    _, loss = parser.parse(sentence, tree, chunks,
                                           latentscope)
                batch_losses.append(loss)
                total_processed += 1
                current_processed += 1

            if not batch_losses:
                # Every instance of this batch was discarded; there is no
                # loss to average or back-propagate.
                continue

            batch_loss = dy.average(batch_losses)
            batch_loss_value = batch_loss.scalar_value()
            batch_loss.backward()
            trainer.update()

            print("Epoch {:,} "
                  "batch {:,}/{:,} "
                  "processed {:,} "
                  "batch-loss {:.4f} "
                  "epoch-elapsed {} "
                  "total-elapsed {}".format(
                      epoch,
                      start_index // args.batch_size + 1,
                      int(np.ceil(len(train_chunk_insts) / args.batch_size)),
                      total_processed,
                      batch_loss_value,
                      format_elapsed(epoch_start_time),
                      format_elapsed(start_time),
                  ),
                  flush=True)

            if current_processed >= check_every:
                current_processed -= check_every
                # Evaluation is skipped during the first seven epochs.
                if epoch > 7:
                    check_dev()