Example #1
    def test_exp_sequence(self):
        """
        test sequence of exps to make sure that changing timbl options works
        and that re-using evaluator works
        """
        try:
            st = create_setting()
            st.validate = False
            st.exp_dir = st.make_tmp_dir()
            st.extract = False
            shutil.copytree("inst", st.inst_dir)
            st.true_dir = "true"
            st.n = 1

            # on this limited feature set, changing -k or -w will not change
            # the scores, so we change the algorithm
            st.timbl_opts = "-a1 +D"
            exp(st)
            score1 = st.evaluator.measure_stats["f"]["micro"]["mean"]

            st.timbl_opts = None
            exp(st)
            score2 = st.evaluator.measure_stats["f"]["micro"]["mean"]
            self.assertNotEqual(score1, score2)

            st.timbl_opts = "-a1 +D"
            exp(st)
            score3 = st.evaluator.measure_stats["f"]["micro"]["mean"]
            self.assertAlmostEqual(score1, score3, 4)
        finally:
            shutil.rmtree(st.exp_dir)
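Note on the cleanup pattern above: if create_setting() or make_tmp_dir() raises, the finally clause itself fails, because st.exp_dir does not exist at that point. A slightly more defensive arrangement of the same test, sketched here against the same assumed helpers, creates the directory before entering the try block:

    def test_exp_sequence(self):
        st = create_setting()
        st.validate = False
        st.exp_dir = st.make_tmp_dir()  # created before the try block starts
        try:
            pass  # run the three experiments exactly as in the test above
        finally:
            # ignore_errors=True keeps a cleanup problem from masking a test failure
            shutil.rmtree(st.exp_dir, ignore_errors=True)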
Example #2
    def test_extract_dev(self):
        st = create_setting()
        st.validate = False
        # create tmp dirs for extraction output
        st.inst_dir = tempfile.mkdtemp()
        st.true_dir = tempfile.mkdtemp()

        extract(st)

        # check number of files
        self.assertEqual(len(st.dev_true_fns), len(st.dev_part_fns))
        self.assertEqual(len(st.dev_inst_fns), len(st.dev_part_fns))

        # test loading a corpus file
        corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])

        # test loading an instances file
        inst = CorpusInst()
        inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)
        self.assertEqual(len(corpus), len(inst))

        clean_inst(st)
        clean_true(st)
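The extraction tests create their temporary directories with tempfile.mkdtemp and only clean up at the very end, so partial output lingers if extract() raises. Since these are unittest.TestCase methods, one alternative arrangement (a sketch reusing only names that appear above, not the project's actual code) is to create the directories in setUp and register their removal immediately:

    def setUp(self):
        # Sketch: create the tmp dirs once and register their removal, so they
        # are deleted even when extract(st) fails half-way through a test.
        self.st = create_setting()
        self.st.inst_dir = tempfile.mkdtemp()
        self.st.true_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.st.inst_dir, ignore_errors=True)
        self.addCleanup(shutil.rmtree, self.st.true_dir, ignore_errors=True)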
Example #3
    def test_create_partition_forced(self):
        st = create_setting()
        # cannot reuse create_sample_partition because we need pgc_files
        # for the assertions below
        pgc_files = relglob(st.corpus_dir, "news/pgc/ma/2006-11/*.pgc")
        self.assertTrue(pgc_files)

        forced_fns = ['news/pgc/ma/2006-11/news-2006-11-aligned-part-02.pgc']

        corpus_fns, corpus_sizes, dev_parts, val_parts, dev_sizes, val_sizes = \
            create_partition(pgc_files, corpus_dir=st.corpus_dir,
                             dev_bins=4, val_bins=1, forced_fns=forced_fns)

        write_partition(corpus_fns, corpus_sizes, dev_parts, val_parts,
                        dev_sizes, val_sizes)

        self.assertEqual(len(dev_parts), 4)
        self.assertEqual(len(val_parts), 1 + 1)

        dev_fns = set(part_fname for part_list in dev_parts
                      for part_fname in part_list)
        val_fns = set(part_fname for part_list in val_parts
                      for part_fname in part_list)

        # check that no files were lost
        self.assertEqual(len(dev_fns) + len(val_fns), len(pgc_files))

        # check that there is no overlap
        self.assertTrue(dev_fns.isdisjoint(val_fns))

        # check that the forced files ended up in the validation partition
        for forced_fname in forced_fns:
            self.assertTrue(forced_fname in val_fns)
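The last three assertions encode the partition invariants: the dev and validation bins are disjoint and together account for every input file. A small standalone helper (hypothetical, not part of the framework) expresses the same checks for reuse:

    def check_partition(all_files, dev_parts, val_parts):
        # Mirror the checks in the test above: no overlap, nothing lost.
        dev_fns = {fn for part in dev_parts for fn in part}
        val_fns = {fn for part in val_parts for fn in part}
        assert dev_fns.isdisjoint(val_fns), "a file occurs in both dev and val"
        assert len(dev_fns) + len(val_fns) == len(all_files), "files were lost"

    # e.g. check_partition(pgc_files, dev_parts, val_parts)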
Example #4
    def test_extract_val_binary(self):
        st = create_setting()
        st.develop = False
        # create tmp dirs for extraction output
        st.inst_dir = tempfile.mkdtemp()
        st.true_dir = tempfile.mkdtemp()
        st.binary = True

        extract(st)

        # check number of files
        self.assertEqual(len(st.val_true_fns), len(st.val_part_fns))
        self.assertEqual(len(st.val_inst_fns), len(st.val_part_fns))

        # test loading a corpus file
        corpus = ParallelGraphCorpus(inf=st.val_true_fns[0])

        # test loading an instances file
        inst = CorpusInst()
        inst.loadbin(st.val_inst_fns[0])
        self.assertEqual(len(corpus), len(inst))

        clean_inst(st)
        clean_true(st)
Example #5
    def setUp(self):
        self.st = create_setting()
        # copy instances to a tmp location
        inst_dir = tempfile.mkdtemp()
        # copytree requires that the destination does not exist yet
        os.rmdir(inst_dir)
        shutil.copytree(self.st.inst_dir, inst_dir)
        # change defaults
        self.st.inst_dir = inst_dir
        self.st.n = 2
Example #6
    def test_exp_dev_fast(self):
        try:
            st = create_setting()
            st.validate = False
            st.extract = False
            st.classify = False
            st.eval_dir = st.make_tmp_dir()

            exp_dev_fast(st)

            self.assertTrue(os.path.exists(st.dev_eval_fname))
        finally:
            shutil.rmtree(st.eval_dir)
Example #7
    def test_pickle(self):
        st = create_setting()
        st.part = False
        st.extract = False
        st.weight = False
        st.match = False
        st.merge = False
        st.eval_dir = st.make_tmp_dir()
        st.pickle = True
        st.pickle_dir = st.make_tmp_dir()
        exp(st)
        pkl_file = open(st.pickle_fname, "rb")
        st2 = pickle.load(pkl_file)
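As written, test_pickle only checks that loading does not raise: the file handle stays open and st2 is never inspected. A slightly stricter ending for the test, sketched under the assumption that the settings object unpickles to an object of the same type, would be:

        # Sketch: close the file deterministically and add a minimal check
        # on the unpickled settings object.
        with open(st.pickle_fname, "rb") as pkl_file:
            st2 = pickle.load(pkl_file)
        self.assertIs(type(st2), type(st))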
Example #8
    def test_extract_with_pp_graph_hooks(self):
        """
        test of extracting feature with preprocessing hook
        """
        st = create_setting()
        st.validate = False
        # create tmp dirs for extraction output
        st.inst_dir = tempfile.mkdtemp()
        st.true_dir = tempfile.mkdtemp()
        
        # a preprocessing function which inserts an attribute "x" with value
        # "y" on every node in the graphs
        def pp_hook1(graphs):
            for g in graphs:
                for attrs in g.node.values():
                    attrs[u"x"] = u"y"
        
        # a feature function which relies on the pp_hook above
        def ff_x(nodes, graphs, **kwargs):
            return graphs.source.node[nodes.source][u"x"]
        
        # create a feature description
        f = Feat(ff_x, "S1", pp_graph_hooks=[pp_hook1])
        
        # add to features; descriptor and extractor are automatically derived
        st.features = (f,)
        
        extract(st)
        
        # check number of files
        self.assertEqual(len(st.dev_true_fns), 
                         len(st.dev_part_fns))
        self.assertEqual(len(st.dev_inst_fns), 
                         len(st.dev_part_fns))

        # test loading a corpus file
        corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])

        # test loading an instances file
        inst = CorpusInst()
        inst.loadtxt(st.dev_inst_fns[0],
                     st.descriptor.dtype)
        self.assertEqual(len(corpus), len(inst))
        
        # check values produced by preprocessing function
        self.assertTrue(all(inst[0]["x"] == "y"))

        clean_inst(st)
        clean_true(st)
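The hook and feature function above only touch the source graph. Assuming the target side of the Pair objects mirrors the source-side access used by ff_x (nodes.target and graphs.target are not exercised in this test), a companion feature reading the same injected attribute from the target graph could look like this:

        # Sketch under the assumption stated above; the hook already decorates
        # the nodes of both graphs, since pp_hook1 iterates over the whole pair.
        def ff_x_target(nodes, graphs, **kwargs):
            return graphs.target.node[nodes.target][u"x"]

        # st.features = (Feat(ff_x, "S1", pp_graph_hooks=[pp_hook1]),
        #                Feat(ff_x_target, "S1", pp_graph_hooks=[pp_hook1]))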
Example #9
    def test_create_parts_val(self):
        st = create_setting()
        st.part = True
        st.develop = False
        st.val_parts = partition.val_parts
        st.part_dir = st.make_tmp_dir()

        create_parts(st)

        self.assertTrue(st.val_parts)
        self.assertEqual(len(st.val_parts), len(st.val_part_fns))

        # test if the part is readable
        ParallelGraphCorpus(inf=st.val_part_fns[0])

        clean_parts(st)
Example #10
    def test_match_corpus(self):
        st = create_setting()
        corpus_inst = CorpusInst()
        inst_fname = st.dev_inst_fns[0]
        corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
        graph_inst = corpus_inst[0]

        # clear pred_match field
        graph_inst["match_relation"] = str(None)
        # backup original for comparison later on
        match_before = graph_inst["match_relation"].copy()

        match_corpus(corpus_inst, Matcher())

        # check that at least one relation is different (i.e. not None)
        self.assertTrue(any(graph_inst["match_relation"] != match_before))
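The reset-and-compare pattern above appears to rely on NumPy structured-array semantics (note the dtype passed to loadtxt, the .copy(), and the elementwise !=): assigning the scalar str(None) broadcasts "None" over the whole match_relation field, and any() becomes true as soon as the matcher changes a single entry. A tiny self-contained illustration of that idiom in plain NumPy, independent of CorpusInst:

    import numpy as np

    arr = np.zeros(3, dtype=[("match_relation", "U16")])
    arr["match_relation"] = str(None)            # broadcasts "None" over the field
    before = arr["match_relation"].copy()
    arr["match_relation"][1] = "equals"          # pretend the matcher set one relation
    print(any(arr["match_relation"] != before))  # True: one entry changed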
Example #11
    def test_merge_corpus(self):
        st = create_setting()

        corpus_inst = CorpusInst()
        inst_fname = st.dev_inst_fns[0]
        corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)

        true_fname = st.dev_true_fns[0]
        true_corpus = ParallelGraphCorpus(inf=true_fname, graph_loading=LOAD_NONE)
        pred_corpus = merge_corpus(corpus_inst, true_corpus, Merger())
        self.assertTrue(len(pred_corpus))

        for graph_inst, graph_pair in zip(corpus_inst, pred_corpus):
            for inst in graph_inst:
                rel = inst["match_relation"]
                if rel != str(None):
                    nodes = Pair(inst["source_node"], inst["target_node"])
                    self.assertEqual(graph_pair.get_align(nodes), rel)
Example #12
    def test_weight_corpus(self):
        st = create_setting()
        corpus_inst = CorpusInst()
        inst_fname = st.dev_inst_fns[0]
        corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
        graph_inst = corpus_inst[0]

        # clear predicted weights field
        graph_inst["pred_weight"] = 0.0
        # backup original for comparison later on
        weight_before = graph_inst["pred_weight"].copy()

        out_fname = st.dev_clas_fns[0]
        timbl_out = parse_timbl_output(open(out_fname))

        weight_corpus(corpus_inst, timbl_out, entropy_weight)

        # check that at least one weight is different (i.e. not 0.0)
        self.assertTrue(any(graph_inst["pred_weight"] != weight_before))
Example #13
    def test_exp(self):
        try:
            st = create_setting()
            # include the partitioning step
            st.part = True
            st.dev_parts = partition.dev_parts
            st.val_parts = partition.val_parts
            st.part_max_size = 1
            st.exp_dir = st.make_tmp_dir()

            st.validate = False
            exp(st)
            self.assertTrue(os.path.exists(st.dev_eval_fname))

            st.develop = False
            st.validate = True
            exp(st)
            self.assertTrue(os.path.exists(st.val_eval_fname))
        finally:
            shutil.rmtree(st.exp_dir)
Example #14
    def setUp(self):
        self.st = create_setting()
        self.st.sample = True
        self.st.samp_dir = self.st.make_tmp_dir()
Example #15
    def setUp(self):
        self.st = create_setting()
        # use a tmp dir for evaluation output
        self.st.eval_dir = self.st.make_tmp_dir()
Example #16
    def setUp(self):
        self.st = create_setting()
        self.st.clas_dir = tempfile.mkdtemp()
        self.st.timbl_log = True
        self.st.timbl_opts = "-k2"
        self.st.n = 2
Example #17
    def setUp(self):
        self.st = create_setting()
        # change default dir for writing predicted corpora files to a temp dir
        self.st.pred_dir = tempfile.mkdtemp()