def sample_gen(args): from pytreex.core.document import Document opts, files = getopt(args, 'r:n:o:w:') num_to_generate = 1 oracle_eval_file = None fname_ttrees_out = None for opt, arg in opts: if opt == '-n': num_to_generate = int(arg) elif opt == '-o': oracle_eval_file = arg elif opt == '-w': fname_ttrees_out = arg if len(files) != 2: sys.exit(__doc__) fname_cand_model, fname_da_test = files # load model log_info('Initializing...') candgen = RandomCandidateGenerator.load_from_file(fname_cand_model) ranker = candgen tgen = SamplingPlanner({'candgen': candgen, 'ranker': ranker}) # generate log_info('Generating...') gen_doc = Document() das = read_das(fname_da_test) for da in das: for _ in xrange(num_to_generate): # repeat generation n times tgen.generate_tree(da, gen_doc) # evaluate if needed if oracle_eval_file is not None: log_info('Evaluating oracle F1...') log_info('Loading gold data from ' + oracle_eval_file) gold_trees = ttrees_from_doc(read_ttrees(oracle_eval_file), tgen.language, tgen.selector) gen_trees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector) log_info('Gold data loaded.') correct, predicted, gold = 0, 0, 0 for gold_tree, gen_trees in zip(gold_trees, chunk_list(gen_trees, num_to_generate)): # find best of predicted trees (in terms of F1) _, tc, tp, tg = max([(f1_from_counts(c, p, g), c, p, g) for c, p, g in map(lambda gen_tree: corr_pred_gold(gold_tree, gen_tree), gen_trees)], key=lambda x: x[0]) correct += tc predicted += tp gold += tg # evaluate oracle F1 log_info("Oracle Precision: %.6f, Recall: %.6f, F1: %.6f" % p_r_f1_from_counts(correct, predicted, gold)) # write output if fname_ttrees_out is not None: log_info('Writing output...') write_ttrees(gen_doc, fname_ttrees_out)
def asearch_gen(args): """A*search generation""" from pytreex.core.document import Document opts, files = getopt(args, 'e:d:w:c:s:') eval_file = None fname_ttrees_out = None cfg_file = None eval_selector = '' for opt, arg in opts: if opt == '-e': eval_file = arg elif opt == '-s': eval_selector = arg elif opt == '-d': set_debug_stream(file_stream(arg, mode='w')) elif opt == '-w': fname_ttrees_out = arg elif opt == '-c': cfg_file = arg if len(files) != 3: sys.exit('Invalid arguments.\n' + __doc__) fname_cand_model, fname_rank_model, fname_da_test = files log_info('Initializing...') candgen = RandomCandidateGenerator.load_from_file(fname_cand_model) ranker = PerceptronRanker.load_from_file(fname_rank_model) cfg = Config(cfg_file) if cfg_file else {} cfg.update({'candgen': candgen, 'ranker': ranker}) tgen = ASearchPlanner(cfg) log_info('Generating...') das = read_das(fname_da_test) if eval_file is None: gen_doc = Document() else: eval_doc = read_ttrees(eval_file) if eval_selector == tgen.selector: gen_doc = Document() else: gen_doc = eval_doc # generate and evaluate if eval_file is not None: # generate + analyze open&close lists lists_analyzer = ASearchListsAnalyzer() for num, (da, gold_tree) in enumerate(zip( das, trees_from_doc(eval_doc, tgen.language, eval_selector)), start=1): log_debug("\n\nTREE No. %03d" % num) gen_tree = tgen.generate_tree(da, gen_doc) lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list) if gen_tree != gold_tree: log_debug("\nDIFFING TREES:\n" + tgen.ranker.diffing_trees_with_scores( da, gold_tree, gen_tree) + "\n") log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %4f' % lists_analyzer.stats()) # evaluate the generated trees against golden trees eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector) gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector) log_info('Evaluating...') evaler = Evaluator() for eval_bundle, eval_ttree, gen_ttree, da in zip( eval_doc.bundles, eval_ttrees, gen_ttrees, das): # add some stats about the tree directly into the output file add_bundle_text( eval_bundle, tgen.language, tgen.selector + 'Xscore', "P: %.4f R: %.4f F1: %.4f" % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree))) # collect overall stats evaler.append(eval_ttree, gen_ttree, ranker.score(TreeData.from_ttree(eval_ttree), da), ranker.score(TreeData.from_ttree(gen_ttree), da)) # print overall stats log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1()) log_info("DEP precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP)) log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.size_stats()) log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats()) log_info( "Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" % evaler.common_substruct_stats()) # just generate else: for da in das: tgen.generate_tree(da, gen_doc) # write output if fname_ttrees_out is not None: log_info('Writing output...') write_ttrees(gen_doc, fname_ttrees_out)
def asearch_gen(args): """A*search generation""" from pytreex.core.document import Document opts, files = getopt(args, 'e:d:w:c:s:') eval_file = None fname_ttrees_out = None cfg_file = None eval_selector = '' for opt, arg in opts: if opt == '-e': eval_file = arg elif opt == '-s': eval_selector = arg elif opt == '-d': set_debug_stream(file_stream(arg, mode='w')) elif opt == '-w': fname_ttrees_out = arg elif opt == '-c': cfg_file = arg if len(files) != 3: sys.exit('Invalid arguments.\n' + __doc__) fname_cand_model, fname_rank_model, fname_da_test = files log_info('Initializing...') candgen = RandomCandidateGenerator.load_from_file(fname_cand_model) ranker = PerceptronRanker.load_from_file(fname_rank_model) cfg = Config(cfg_file) if cfg_file else {} cfg.update({'candgen': candgen, 'ranker': ranker}) tgen = ASearchPlanner(cfg) log_info('Generating...') das = read_das(fname_da_test) if eval_file is None: gen_doc = Document() else: eval_doc = read_ttrees(eval_file) if eval_selector == tgen.selector: gen_doc = Document() else: gen_doc = eval_doc # generate and evaluate if eval_file is not None: # generate + analyze open&close lists lists_analyzer = ASearchListsAnalyzer() for num, (da, gold_tree) in enumerate(zip(das, trees_from_doc(eval_doc, tgen.language, eval_selector)), start=1): log_debug("\n\nTREE No. %03d" % num) gen_tree = tgen.generate_tree(da, gen_doc) lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list) if gen_tree != gold_tree: log_debug("\nDIFFING TREES:\n" + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) + "\n") log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %4f' % lists_analyzer.stats()) # evaluate the generated trees against golden trees eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector) gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector) log_info('Evaluating...') evaler = Evaluator() for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees, gen_ttrees, das): # add some stats about the tree directly into the output file add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore', "P: %.4f R: %.4f F1: %.4f" % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree))) # collect overall stats evaler.append(eval_ttree, gen_ttree, ranker.score(TreeData.from_ttree(eval_ttree), da), ranker.score(TreeData.from_ttree(gen_ttree), da)) # print overall stats log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1()) log_info("DEP precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP)) log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.size_stats()) log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats()) log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" % evaler.common_substruct_stats()) # just generate else: for da in das: tgen.generate_tree(da, gen_doc) # write output if fname_ttrees_out is not None: log_info('Writing output...') write_ttrees(gen_doc, fname_ttrees_out)