def test_exp_sequence(self):
    """Test a sequence of experiments to make sure that changing timbl
    options works and that re-using the evaluator works.
    """
    # Create the setting and the temporary experiment dir *before* the
    # try block: the original created them inside it, so a failure during
    # setup made the finally clause crash with a NameError instead of
    # reporting the real error.
    st = create_setting()
    st.exp_dir = st.make_tmp_dir()
    try:
        st.validate = False
        st.extract = False
        shutil.copytree("inst", st.inst_dir)
        st.true_dir = "true"
        st.n = 1
        # on this limited featureset, changing -k or -w will not change
        # the scores, so we change the algorithm
        st.timbl_opts = "-a1 +D"
        exp(st)
        score1 = st.evaluator.measure_stats["f"]["micro"]["mean"]

        # run again with default timbl options; the score must change
        st.timbl_opts = None
        exp(st)
        score2 = st.evaluator.measure_stats["f"]["micro"]["mean"]
        self.assertNotEqual(score1, score2)

        # back to the original options: the score must be reproducible
        st.timbl_opts = "-a1 +D"
        exp(st)
        score3 = st.evaluator.measure_stats["f"]["micro"]["mean"]
        self.assertAlmostEqual(score1, score3, 4)
    finally:
        shutil.rmtree(st.exp_dir)
def test_extract_dev(self):
    """Feature extraction over the development data."""
    st = create_setting()
    st.validate = False
    # temporary output dirs for the instance and true-corpus files
    st.inst_dir = tempfile.mkdtemp()
    st.true_dir = tempfile.mkdtemp()

    extract(st)

    # one true file and one instance file per development part
    n_parts = len(st.dev_part_fns)
    self.assertEqual(len(st.dev_true_fns), n_parts)
    self.assertEqual(len(st.dev_inst_fns), n_parts)

    # the first corpus and instance files must load and agree in length
    corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])
    inst = CorpusInst()
    inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)
    self.assertEqual(len(corpus), len(inst))

    clean_inst(st)
    clean_true(st)
def test_create_partition_forced(self):
    """Partition creation with files forced into the validation set."""
    st = create_setting()
    # cannot reuse create_sample_partition because the pgc file list is
    # needed for the assertions below
    pgc_files = relglob(st.corpus_dir, "news/pgc/ma/2006-11/*.pgc")
    self.assertTrue(pgc_files)

    forced_fns = ['news/pgc/ma/2006-11/news-2006-11-aligned-part-02.pgc']
    (corpus_fns, corpus_sizes, dev_parts, val_parts,
     dev_sizes, val_sizes) = create_partition(pgc_files,
                                              corpus_dir=st.corpus_dir,
                                              dev_bins=4,
                                              val_bins=1,
                                              forced_fns=forced_fns)
    write_partition(corpus_fns, corpus_sizes, dev_parts, val_parts,
                    dev_sizes, val_sizes)

    self.assertEqual(len(dev_parts), 4)
    # 1 requested val bin plus 1 more, presumably for the forced files
    self.assertEqual(len(val_parts), 1 + 1)

    dev_fns = set()
    for part_list in dev_parts:
        dev_fns.update(part_list)
    val_fns = set()
    for part_list in val_parts:
        val_fns.update(part_list)

    # no files were lost ...
    self.assertEqual(len(dev_fns) + len(val_fns), len(pgc_files))
    # ... and development and validation do not overlap
    self.assertTrue(dev_fns.isdisjoint(val_fns))
    # every forced file must have ended up in the validation set
    for forced_fname in forced_fns:
        self.assertTrue(forced_fname in val_fns)
def test_extract_val_binary(self):
    """Binary-format feature extraction over the validation data."""
    st = create_setting()
    st.develop = False
    st.binary = True
    # temporary output dirs for the instance and true-corpus files
    st.inst_dir = tempfile.mkdtemp()
    st.true_dir = tempfile.mkdtemp()

    extract(st)

    # one true file and one instance file per validation part
    n_parts = len(st.val_part_fns)
    self.assertEqual(len(st.val_true_fns), n_parts)
    self.assertEqual(len(st.val_inst_fns), n_parts)

    # the first corpus and (binary) instance files must load and agree
    # in length
    corpus = ParallelGraphCorpus(inf=st.val_true_fns[0])
    inst = CorpusInst()
    inst.loadbin(st.val_inst_fns[0])
    self.assertEqual(len(corpus), len(inst))

    clean_inst(st)
    clean_true(st)
def setUp(self):
    """Copy the instance files to a temporary dir and shrink the run."""
    self.st = create_setting()
    # mkdtemp/rmdir yields a fresh, unused path name, which copytree
    # then creates itself (copytree requires a non-existing target)
    tmp_inst_dir = tempfile.mkdtemp()
    os.rmdir(tmp_inst_dir)
    shutil.copytree(self.st.inst_dir, tmp_inst_dir)
    # point the setting at the copy and reduce the number of runs
    self.st.inst_dir = tmp_inst_dir
    self.st.n = 2
def test_exp_dev_fast(self):
    """Fast development experiment on already-extracted data."""
    # Create the setting and the temporary eval dir *before* the try
    # block: the original created them inside it, so a failure during
    # setup made the finally clause crash with a NameError instead of
    # reporting the real error.
    st = create_setting()
    st.eval_dir = st.make_tmp_dir()
    try:
        st.validate = False
        st.extract = False
        st.classify = False
        exp_dev_fast(st)
        # the development evaluation file must have been written
        self.assertTrue(os.path.exists(st.dev_eval_fname))
    finally:
        shutil.rmtree(st.eval_dir)
def test_pickle(self):
    """Pickling the experiment setting and loading it back."""
    st = create_setting()
    st.part = False
    st.extract = False
    st.weight = False
    st.match = False
    st.merge = False
    st.eval_dir = st.make_tmp_dir()
    st.pickle = True
    st.pickle_dir = st.make_tmp_dir()
    try:
        exp(st)
        # use a context manager so the pickle file handle is not leaked
        # (the original never closed it)
        with open(st.pickle_fname, "rb") as pkl_file:
            st2 = pickle.load(pkl_file)
        # unpickling must yield a usable object
        self.assertTrue(st2 is not None)
    finally:
        # remove the temporary dirs the test created
        shutil.rmtree(st.eval_dir)
        shutil.rmtree(st.pickle_dir)
def test_extract_with_pp_graph_hooks(self):
    """
    test of extracting feature with preprocessing hook
    """
    st = create_setting()
    st.validate = False
    # temporary output dirs for the instance and true-corpus files
    st.inst_dir = tempfile.mkdtemp()
    st.true_dir = tempfile.mkdtemp()

    # a preprocessing function which inserts an attribute "x" with value
    # "y" on every node in the graphs
    def pp_hook1(graphs):
        for graph in graphs:
            for node_attrs in graph.node.values():
                node_attrs[u"x"] = u"y"

    # a feature function which relies on the pp_hook above
    def ff_x(nodes, graphs, **kwargs):
        return graphs.source.node[nodes.source][u"x"]

    # create a feature description; descriptor and extractor are
    # automatically derived from st.features
    f = Feat(ff_x, "S1", pp_graph_hooks=[pp_hook1])
    st.features = (f,)

    extract(st)

    # one true file and one instance file per development part
    n_parts = len(st.dev_part_fns)
    self.assertEqual(len(st.dev_true_fns), n_parts)
    self.assertEqual(len(st.dev_inst_fns), n_parts)

    # the first corpus and instance files must load and agree in length
    corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])
    inst = CorpusInst()
    inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)
    self.assertEqual(len(corpus), len(inst))

    # the hook's attribute value must show up in the extracted instances
    self.assertTrue(all(inst[0]["x"] == "y"))

    clean_inst(st)
    clean_true(st)
def test_create_parts_val(self):
    """Creation of validation parts only (development disabled)."""
    st = create_setting()
    st.develop = False
    st.part = True
    st.val_parts = partition.val_parts
    st.part_dir = st.make_tmp_dir()

    create_parts(st)

    self.assertTrue(st.val_parts)
    # one part file per validation part
    self.assertEqual(len(st.val_parts), len(st.val_part_fns))
    # the first part must be a readable corpus
    ParallelGraphCorpus(inf=st.val_part_fns[0])

    clean_parts(st)
def test_match_corpus(self):
    """Matching must fill in the match_relation field."""
    st = create_setting()
    corpus_inst = CorpusInst()
    corpus_inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)
    graph_inst = corpus_inst[0]
    # wipe the predicted relations, keeping a copy for comparison
    graph_inst["match_relation"] = str(None)
    match_before = graph_inst["match_relation"].copy()

    match_corpus(corpus_inst, Matcher())

    # at least one relation must now differ (i.e. not be None)
    self.assertTrue(any(graph_inst["match_relation"] != match_before))
def test_create_parts_val(self):
    """Validation parts are created, counted and readable."""
    st = create_setting()
    st.part = True
    st.develop = False
    st.val_parts = partition.val_parts
    st.part_dir = st.make_tmp_dir()

    create_parts(st)

    self.assertTrue(st.val_parts)
    self.assertEqual(len(st.val_parts), len(st.val_part_fns))
    # loading the first part verifies that it is a well-formed corpus
    ParallelGraphCorpus(inf=st.val_part_fns[0])

    clean_parts(st)
def test_extract_with_pp_graph_hooks(self):
    """
    test of extracting feature with preprocessing hook
    """
    st = create_setting()
    st.validate = False
    # temporary dirs for the extraction output
    st.inst_dir = tempfile.mkdtemp()
    st.true_dir = tempfile.mkdtemp()

    # preprocessing hook: tag every node in every graph with x="y"
    def pp_hook1(graphs):
        for graph in graphs:
            for attr_dict in graph.node.values():
                attr_dict[u"x"] = u"y"

    # feature function reading the attribute the hook inserted
    def ff_x(nodes, graphs, **kwargs):
        return graphs.source.node[nodes.source][u"x"]

    # feature description; adding it to st.features automatically
    # derives the descriptor and extractor
    f = Feat(ff_x, "S1", pp_graph_hooks=[pp_hook1])
    st.features = (f,)

    extract(st)

    # extraction must yield one true and one instance file per part
    self.assertEqual(len(st.dev_true_fns), len(st.dev_part_fns))
    self.assertEqual(len(st.dev_inst_fns), len(st.dev_part_fns))

    # both outputs must be loadable and of equal length
    corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])
    inst = CorpusInst()
    inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)
    self.assertEqual(len(corpus), len(inst))

    # every instance must carry the value produced by the hook
    self.assertTrue(all(inst[0]["x"] == "y"))

    clean_inst(st)
    clean_true(st)
def test_merge_corpus(self):
    """Merging predicted alignments into a true corpus."""
    st = create_setting()
    corpus_inst = CorpusInst()
    corpus_inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)
    true_corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0],
                                      graph_loading=LOAD_NONE)

    pred_corpus = merge_corpus(corpus_inst, true_corpus, Merger())

    self.assertTrue(len(pred_corpus))
    # every predicted relation must show up as an alignment in the
    # merged corpus
    for graph_inst, graph_pair in zip(corpus_inst, pred_corpus):
        for inst in graph_inst:
            rel = inst["match_relation"]
            if rel == str(None):
                continue
            nodes = Pair(inst["source_node"], inst["target_node"])
            self.assertEqual(graph_pair.get_align(nodes), rel)
def test_weight_corpus(self):
    """Weighting must fill in the pred_weight field."""
    st = create_setting()
    corpus_inst = CorpusInst()
    inst_fname = st.dev_inst_fns[0]
    corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
    graph_inst = corpus_inst[0]
    # clear predicted weights field
    graph_inst["pred_weight"] = 0.0
    # backup original for comparison later on
    weight_before = graph_inst["pred_weight"].copy()
    out_fname = st.dev_clas_fns[0]
    # use a context manager so the classifier output file is closed
    # (the original leaked the handle); weight_corpus runs inside the
    # with block in case parse_timbl_output reads the file lazily
    with open(out_fname) as out_file:
        timbl_out = parse_timbl_output(out_file)
        weight_corpus(corpus_inst, timbl_out, entropy_weight)
    # check that at least one weight is different (i.e. not 0.0)
    self.assertTrue(any(graph_inst["pred_weight"] != weight_before))
def test_merge_corpus(self):
    """All predicted relations must end up as alignments after merging."""
    st = create_setting()
    corpus_inst = CorpusInst()
    inst_fname = st.dev_inst_fns[0]
    corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
    true_fname = st.dev_true_fns[0]
    true_corpus = ParallelGraphCorpus(inf=true_fname,
                                      graph_loading=LOAD_NONE)

    pred_corpus = merge_corpus(corpus_inst, true_corpus, Merger())
    self.assertTrue(len(pred_corpus))

    for graph_inst, graph_pair in zip(corpus_inst, pred_corpus):
        for inst in graph_inst:
            rel = inst["match_relation"]
            if rel != str(None):
                # a predicted relation must match the merged alignment
                node_pair = Pair(inst["source_node"],
                                 inst["target_node"])
                self.assertEqual(graph_pair.get_align(node_pair), rel)
def test_exp(self):
    """Full experiment including the parting step: development run
    followed by a validation run.
    """
    # Create the setting and the temporary experiment dir *before* the
    # try block: the original created them inside it, so a failure
    # during setup made the finally clause crash with a NameError
    # instead of reporting the real error.
    st = create_setting()
    st.exp_dir = st.make_tmp_dir()
    try:
        # include parting step
        st.part = True
        st.dev_parts = partition.dev_parts
        st.val_parts = partition.val_parts
        st.part_max_size = 1
        st.validate = False
        exp(st)
        self.assertTrue(os.path.exists(st.dev_eval_fname))

        # now run validation instead of development
        st.develop = False
        st.validate = True
        exp(st)
        self.assertTrue(os.path.exists(st.val_eval_fname))
    finally:
        shutil.rmtree(st.exp_dir)
def test_exp(self):
    """Full experiment including the parting step: development run
    followed by a validation run.
    """
    # Setup is hoisted out of the try block: creating st or the temp
    # dir inside it meant that a setup failure hit the finally clause
    # with an undefined st (NameError), masking the real error.
    st = create_setting()
    st.exp_dir = st.make_tmp_dir()
    try:
        # include parting step
        st.part = True
        st.dev_parts = partition.dev_parts
        st.val_parts = partition.val_parts
        st.part_max_size = 1
        st.validate = False
        exp(st)
        self.assertTrue(os.path.exists(st.dev_eval_fname))

        # switch to the validation run
        st.develop = False
        st.validate = True
        exp(st)
        self.assertTrue(os.path.exists(st.val_eval_fname))
    finally:
        shutil.rmtree(st.exp_dir)
def setUp(self):
    """Enable sampling and point it at a temporary output dir."""
    st = self.st = create_setting()
    st.sample = True
    st.samp_dir = st.make_tmp_dir()
def setUp(self):
    """Redirect evaluation output to a temporary dir."""
    st = self.st = create_setting()
    # NOTE: the original comment here ("copy instances to tmp location")
    # was a copy-paste leftover; this only creates a temp eval dir
    st.eval_dir = st.make_tmp_dir()
def setUp(self):
    """Configure a small classification run with timbl logging on."""
    st = self.st = create_setting()
    # temporary dir for classifier output
    st.clas_dir = tempfile.mkdtemp()
    # enable the timbl log and override the classifier options
    st.timbl_log = True
    st.timbl_opts = "-k2"
    st.n = 2
def setUp(self):
    """Write predicted corpora files to a temporary dir."""
    st = self.st = create_setting()
    # change default dir for predicted corpus output to a temp dir
    st.pred_dir = tempfile.mkdtemp()