Example #1
 def test_purge(self):
     # adding graph pairs with identical graphbanks
     pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
     pgc1 += pgc1
     graphbanks_before = pgc1._graphbanks()
     self.assertEqual(len(graphbanks_before), 2)
     pgc1.purge()
     graphbanks_after = pgc1._graphbanks()        
     self.assertEqual(graphbanks_before, graphbanks_after)
     
     # adding graph pairs with equal graphbanks
     pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
     pgc2 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
     pgc1 += pgc2
     graphbanks_before = pgc1._graphbanks()
     self.assertEqual(len(graphbanks_before), 4)
     pgc1.purge()
     graphbanks_after = pgc1._graphbanks()        
     self.assertEqual(len(graphbanks_after), 2)
     
     # adding graph pairs with different graphbanks
     pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
     pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
     pgc1 += pgc2
     graphbanks_before = pgc1._graphbanks()
     self.assertEqual(len(graphbanks_before), 4)
     pgc1.purge()
     graphbanks_after = pgc1._graphbanks()        
     self.assertEqual(graphbanks_before, graphbanks_after)
     
     # removing graphpairs and thus dependencies on graphbanks
     del pgc1[:]
     graphbanks = pgc1._graphbanks()
     self.assertEqual(len(graphbanks), 0)
Example #2
def eval_files(true_fns, pred_fns, eval_fname, align_eval=None, n=None):
    """
    Evaluate predicted against true parallel graph corpora files.
    
    @param true_fns: list of true corpora filenames
    
    @param pred_fns: list of predicted corpora filenames
    
    @keyword eval_fname: name of file to which evaluation output is written 
    
    @keyword align_eval: AlignEval instance
    
    @keyword n: limit evaluation to the first n files
    """
    assert ( len(true_fns[:n]) == 
             len(pred_fns[:n]) > 0 )
    
    # use iterators so that only one corpus is loaded into memory at a time
    true_corpora = ( ParallelGraphCorpus(inf=true_fname,
                                          graph_loading=LOAD_NONE)
                     for true_fname in true_fns[:n] )
    
    pred_corpora = ( ParallelGraphCorpus(inf=pred_fname,
                                         graph_loading=LOAD_NONE)
                     for pred_fname in pred_fns[:n] )
    
    names = ( os.path.basename(true_fname).split("_")[0]
              for true_fname in true_fns[:n] )
    
    return eval_corpora(true_corpora, pred_corpora, names,
                        eval_fname, align_eval, n)
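A minimal usage sketch for eval_files follows; the file lists and the output path are hypothetical placeholders, not taken from the source.

true_fns = ["exp/true/dev001_true.pgc", "exp/true/dev002_true.pgc"]
pred_fns = ["exp/pred/dev001_pred.pgc", "exp/pred/dev002_pred.pgc"]

# evaluate only the first pair of files and write the report to exp/eval
eval_files(true_fns, pred_fns, "exp/eval/dev.txt", n=1)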
Example #3
 def test__iadd__(self):
     self.pgc1 += self.pgc1
     self.assertEquals(len(self.pgc1), 6)
     
     pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
     pgc2 += self.pgc1
     self.assertEquals(len(pgc2), 9)
Example #4
def read_corpora(corpus_fns, words_only):
    corpora = []

    if words_only:
        graph_loading = LOAD_SPARSE
    else:
        graph_loading = LOAD_NONE

    for fn in corpus_fns:
        corpus = ParallelGraphCorpus(inf=fn, graph_loading=graph_loading)

        if words_only:
            # Remove any alignments involving a non-terminal node.
            # This is a bit of a hack, and also inefficient. However, I don't
            # want to complicate align_eval.add or introduce a WordAlignEval
            # class for an option that will rarely be used.
            for graph_pair in corpus:
                graphs = graph_pair.get_graphs()

                for nodes, relation in graph_pair.alignments():
                    if (graphs.source.node_is_non_terminal(nodes.source) or
                            graphs.target.node_is_non_terminal(nodes.target)):
                        graph_pair.del_align(nodes)

        corpora.append(corpus)
    return corpora
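A brief usage sketch, assuming the corpus paths below exist; with words_only=True only alignments between terminal (word) nodes survive the filtering loop above.

# hedged sketch: the corpus filenames are placeholders
corpora = read_corpora(["data/corpus-1.pgc", "data/corpus-2.pgc"],
                       words_only=True)

# count the remaining word-level alignments per corpus
for corpus in corpora:
    word_align_count = sum(len(graph_pair) for graph_pair in corpus)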
Example #5
    def test__add__(self):
        """
        corpus + other
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        pgc3 = self.pgc1 + pgc2

        self.assertEqual(len(pgc3), len(self.pgc1) + len(pgc2))
Example #6
    def test_aligner(self):
        descriptor = create_setting().descriptor
        classifier = TimblClassifier(descriptor, "exp/inst/dev001.inst")
        aligner = GraphAligner(descriptor=descriptor, classifier=classifier)

        corpus = ParallelGraphCorpus(inf="exp/true/dev001_true.pgc")

        for graph_pair in corpus[:3]:
            graph_inst = aligner.align(graph_pair, clear=True)
Example #7
 def setUp(self):
     corpus = ParallelGraphCorpus(
         inf="../exp/corpora/news/pgc/ma/2006-11/news-2006-11-aligned-part-00.pgc")
     self.graph_pair = corpus[0]
     
     descriptor = Descriptor(cornet_sim) 
     self.feat_extr = Extractor(
         descriptor,
         node_selector=select_visible_node)
Example #8
 def test_append(self):
     pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
     self.pgc1.append(pgc2[2])
     self.assertEqual(len(self.pgc1), 4)
     
     self.assertRaises(TypeError,
                       self.pgc1.append,
                       "x")
Example #9
    def test_pickle(self):
        true_corpus = pred_corpus = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        align_eval = AlignEval()
        align_eval.add(true_corpus, pred_corpus, "corpus-1")
        align_eval.run_eval()

        pickle_file = tempfile.TemporaryFile()
        pickle.dump(align_eval, pickle_file, 2)
        pickle_file.seek(0)
        align_eval_2 = pickle.load(pickle_file)
        align_eval_2.write()
Example #10
    def test_merge(self):
        corpus_inst = CorpusInst()
        dtype = create_setting().descriptor.dtype
        corpus_inst.loadtxt("exp/inst/dev001.inst", dtype)
        graph_inst = corpus_inst[0]

        pgc = ParallelGraphCorpus(inf="exp/true/dev001_true.pgc")
        graph_pair = pgc[0]

        gm = Merger()
        gm.merge(graph_inst, graph_pair)
Example #11
 def test__setslice__(self):
     pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
     self.pgc1[-1:] = pgc2[:2]
     self.assertEqual(len(self.pgc1), 4)
     
     self.assertRaises(TypeError,
                       ParallelGraphCorpus.__setslice__,
                       self.pgc1,
                       1,
                       1,
                       ["x"])
Example #12
 def __init__(self):
     self._corpus = ParallelGraphCorpus()
     # the domain model
     self._changed = False
     self._filename = None
     self._graph_pair = None
     self._graph_pair_index = None
     self._graphs = Pair(None, None)
     self._nodes = Pair(None, None)
     # the special relation which stands for "no relation"
     self._no_relation = "none"
     self._co_node_selection = False
Example #13
 def test_merge_val(self):
     self.st.develop = False
     merge(self.st)
     self.assertEqual(len(self.st.val_pred_fns), 
                      len(self.st.val_true_fns))
     
     # check that there are alignments
     for pred_fname in self.st.val_pred_fns:
         corpus = ParallelGraphCorpus(pred_fname, graph_loading=LOAD_NONE)
         align_count = sum( len(graph_pair)
                            for graph_pair in corpus)
         self.assertTrue(align_count)
Example #14
def extract_corpus(extractor, selector, corpus):
    corpus_inst = CorpusInst()
    # create an empty copy, because append() is faster than __del__() or
    # remove()
    true_corpus = ParallelGraphCorpus(relations=corpus.get_relations(),
                                      meta_data=corpus.get_meta_data())

    for graph_pair in corpus:
        if selector(graph_pair):
            true_corpus.append(graph_pair)
            corpus_inst.append(extractor.extract(graph_pair))

    return corpus_inst, true_corpus
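A hedged sketch of calling extract_corpus, following the Extractor and corpus constructions used elsewhere in these examples; whether Extractor can be built from a descriptor alone is an assumption, and the corpus path is a placeholder.

descriptor = create_setting().descriptor
extractor = Extractor(descriptor)   # assumes the default node selector
corpus = ParallelGraphCorpus(inf="exp/true/dev001_true.pgc")

# keep every graph pair; a stricter selector such as
# select_parsed_graph_pair could be used instead
corpus_inst, true_corpus = extract_corpus(extractor,
                                          lambda graph_pair: True,
                                          corpus)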
Example #15
def join_pgc(corpus_fnames):
    """
    join parallel graph corpora
    
    @param corpus_fnames: list of parallel graph corpora filenames
    
    @return: new ParallelGraphCorpus object
    
    Corpora are assumed to have the same relations.
    Graphbanks are not read, but graphbanks in the result are purged.
    """
    corpus = ParallelGraphCorpus(inf=corpus_fnames.pop(),
                                 graph_loading=LOAD_NONE)
    
    # suppress DaesoWarning: meta data of other corpus is discarded!
    warnings.filterwarnings('ignore', category=DaesoWarning)
    
    for fname in corpus_fnames:
        corpus += ParallelGraphCorpus(inf=fname, graph_loading=LOAD_NONE)
        
    # corpus.purge() not required, as it is called during corpus.write
    return corpus
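A short usage sketch; the glob pattern and the output filename are placeholders. Note that join_pgc pops filenames from the list it is given.

import glob

fnames = glob.glob("exp/true/*_true.pgc")
joined = join_pgc(fnames)

# write() purges redundant graphbanks, as noted in the docstring above
joined.write("exp/true/all_true.pgc", pprint=True)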
Example #16
def pgc_stats(files, 
              with_empty_nodes=False,
              with_failed_parses=False,
              with_punc=False,
              with_unaligned_roots=False,
              threshold=0,
              with_unaligned_graphs=False):
    
    pgc_table = PgcStatsTable(size=len(files))
    gb_table = GbStatsTable()
    pgc_row = gb_row = 0
    
    for fn in files:
        pgc = ParallelGraphCorpus()
        pgc.read(inf=fn, graph_loading=LOAD_ALL)
        graph_pair_stats(os.path.basename(fn),
                         pgc,
                         pgc_table, 
                         pgc_row,
                         with_empty_nodes,
                         with_failed_parses,
                         with_punc,
                         with_unaligned_roots,
                         threshold)
        
        graphbanks = pgc._graphbanks()
        gb_table.grow(len(graphbanks))
    
        # somewhat messy to process pgc and gb files intertwined, 
        # but otherwise all graphbanks must be kept in memory
        for gb in graphbanks:
            graph_stats(gb,
                        gb_table, 
                        gb_row,
                        with_empty_nodes,
                        with_failed_parses,
                        with_punc,
                        with_unaligned_roots,
                        threshold,
                        with_unaligned_graphs)
            gb_row += 1
            
        pgc_row += 1

    pgc_table.summarize()
    gb_table.summarize()
    
    return pgc_table, gb_table
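A minimal call sketch; the glob pattern is a placeholder.

import glob

pgc_table, gb_table = pgc_stats(glob.glob("data/*.pgc"),
                                with_punc=True)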
Example #17
    def test_create_parts_val(self):
        st = create_setting()
        st.part = True
        st.develop = False
        st.val_parts = partition.val_parts
        st.part_dir = st.make_tmp_dir()

        create_parts(st)

        self.assertTrue(st.val_parts)
        self.assertEqual(len(st.val_parts), len(st.val_part_fns))

        # test if the part is readable
        ParallelGraphCorpus(inf=st.val_part_fns[0])

        clean_parts(st)
Example #18
    def test_extract_with_pp_graph_hooks(self):
        """
        test of extracting features with a preprocessing hook
        """
        st = create_setting()
        st.validate = False
        # create tmp dirs for extraction output
        st.inst_dir = tempfile.mkdtemp()
        st.true_dir = tempfile.mkdtemp()

        # a preprocessing function which inserts an attribute "x" with value
        # "y" on every node in the graphs
        def pp_hook1(graphs):
            for g in graphs:
                for attrs in g.node.values():
                    attrs[u"x"] = u"y"

        # a feature function which relies on the pp_hook above
        def ff_x(nodes, graphs, **kwargs):
            return graphs.source.node[nodes.source][u"x"]

        # create a feature description
        f = Feat(ff_x, "S1", pp_graph_hooks=[pp_hook1])

        # add to features; descriptor and extractor are automatically derived
        st.features = (f, )

        extract(st)

        # check no of files
        self.assertEqual(len(st.dev_true_fns), len(st.dev_part_fns))
        self.assertEqual(len(st.dev_inst_fns), len(st.dev_part_fns))

        # test loading a corpus file
        corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])

        # test loading an instances file
        inst = CorpusInst()
        inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)
        self.assertEqual(len(corpus), len(inst))

        # check values produced by preprocessing function
        self.assertTrue(all(inst[0]["x"] == "y"))

        clean_inst(st)
        clean_true(st)
Example #19
def extract_files(extractor,
                  selector,
                  part_fns,
                  inst_fns,
                  true_fns,
                  binary=False):
    """
    Extract features from source corpus files, produce instance files and true
    corpus files.
    
    @param extractor: Extractor instance for feature extraction from graph
    pairs
    
    @param selector: boolean graph pair selection function
    
    @param part_fns: list of corpus filenames
    
    @param inst_fns: list of instance filenames to be created
    
    @param true_fns: list of true corpus filenames to be created
    
    @keyword binary: save corpus instances in binary rather than text format
    
    Note that the true corpus files may be substantially different from the
    original corpus files because of node and graph selection. 
    """
    # The reason for generating true corpus files is that it makes evaluation
    # against predicted corpus files much easier.
    assert isinstance(extractor, Extractor)
    assert len(part_fns)

    for part_fname, inst_fname, true_fname in zip(part_fns, inst_fns,
                                                  true_fns):
        part_corpus = ParallelGraphCorpus(inf=part_fname)
        corpus_inst, true_corpus = extract_corpus(extractor, selector,
                                                  part_corpus)

        log.info("saving instances file {0}".format(inst_fname))
        if binary:
            corpus_inst.savebin(inst_fname)
        else:
            corpus_inst.savetxt(inst_fname)

        log.info("saving true corpus file {0}".format(true_fname))
        true_corpus.write(true_fname, pprint=True)
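A hedged usage sketch; the extractor follows the construction sketched after extract_corpus above (an assumption about its signature), and all filenames are placeholders.

descriptor = create_setting().descriptor
extractor = Extractor(descriptor)

extract_files(extractor,
              lambda graph_pair: True,        # keep every graph pair
              part_fns=["exp/part/dev001.pgc"],
              inst_fns=["exp/inst/dev001.inst"],
              true_fns=["exp/true/dev001_true.pgc"])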
Example #20
    def test_align(self):
        # create graph aligner
        descriptor = create_setting().descriptor
        classifier = TimblClassifier(descriptor, "exp/inst/dev001.inst")
        graph_aligner = GraphAligner(descriptor=descriptor,
                                     classifier=classifier)

        # create corpus aligner
        corpus_aligner = CorpusAligner(graph_aligner=graph_aligner,
                                       graph_selector=select_parsed_graph_pair)

        # align part of corpus
        corpus = ParallelGraphCorpus(inf="exp/true/dev001_true.pgc")[:3]
        corpus_aligner.align(corpus[:3], clear=True)
        #corpus.write(pprint=True)

        self.assertTrue(corpus[0].alignments() or corpus[1].alignments()
                        or corpus[2].alignments())
Example #21
 def test_merge_corpus(self):
     st = create_setting()
     
     corpus_inst = CorpusInst()
     inst_fname = st.dev_inst_fns[0]
     corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
     
     true_fname = st.dev_true_fns[0]
     true_corpus = ParallelGraphCorpus(inf=true_fname,
                                       graph_loading=LOAD_NONE)
     pred_corpus = merge_corpus(corpus_inst, true_corpus, Merger()) 
     self.assertTrue(len(pred_corpus))
     
     for graph_inst, graph_pair in zip(corpus_inst, pred_corpus):
         for inst in graph_inst:
             rel = inst["match_relation"]
             if rel != str(None):
                 nodes = Pair(inst["source_node"], inst["target_node"] )
                 self.assertEqual(graph_pair.get_align(nodes), rel)    
Example #22
    def test_parser_load_relaxed(self):
        tmp_dir = tempfile.gettempdir()
        shutil.copy("data/corpus-2.pgc", tmp_dir + "/corpus-2.pgc")
        shutil.copy("../gb/data/source-gb-2.xml", tmp_dir + "/source-gb-2.xml")
        shutil.copy("../gb/data/target-gb-2.xml", tmp_dir + "/target-gb-2.xml")

        pg_corpus = ParallelGraphCorpus()
        pg_corpus.read(tmp_dir + "/corpus-2.pgc", relax_gb_paths=True)

        for graph_pair in pg_corpus:
            for graph in graph_pair._graphs:
                self.assertFalse(isinstance(graph, GraphStub))

            for bank in graph_pair._banks:
                self.assertEqual(bank.__class__, SparseGraphBank)
                self.assertEqual(len(bank), 3)

        os.remove(tmp_dir + "/corpus-2.pgc")
        os.remove(tmp_dir + "/source-gb-2.xml")
        os.remove(tmp_dir + "/target-gb-2.xml")
Example #23
def pgc_zip(zip_filename, pgc_filenames):
    zip_arch = zipfile.ZipFile(zip_filename, "w")
    arch_dir = os.path.splitext(os.path.basename(zip_filename))[0]

    for corpus_filename in multiglob(pgc_filenames):
        # add corpus to archive
        arch_filename = os.path.join(arch_dir,
                                     os.path.basename(corpus_filename))
        zip_arch.write(corpus_filename, arch_filename)

        corpus = ParallelGraphCorpus(inf=corpus_filename,
                                     graph_loading=LOAD_NONE)

        for gb in corpus._graphbanks():
            gb_filename = gb.get_file_path()
            # add graphbank files to archive
            arch_filename = os.path.join(arch_dir,
                                         os.path.basename(gb_filename))
            zip_arch.write(gb_filename, arch_filename)

    zip_arch.close()
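A one-line usage sketch; the archive name is a placeholder, and it is assumed that multiglob expands the glob pattern in the list.

pgc_zip("corpora.zip", ["data/*.pgc"])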
Example #24
def merge_files(inst_fns, true_fns, pred_fns, merger=Merger(),
                descriptor=None, n=None, binary=False):
    """
    Merge corpus instance files
    
    @param inst_fns: list of corpus instance filenames
    
    @param true_fns: list of corpus filenames containing the true alignments

    @param pred_fns: list of predicted corpus filenames to be created
    
    @param merger: instance of Merger class for merging instances into a graph
    pair
    
    @keyword descriptor: a Descriptor instance, required if corpus instances
    are loaded in text format
    
    @keyword n: limit merging to the first n files
    
    @keyword binary: corpus instances in binary rather than text format
    """
    assert isinstance(merger, Merger)
    assert len(inst_fns) == len(true_fns) > 0
    
    for inst_fname, true_fname, pred_fname in zip(inst_fns,
                                                  true_fns,
                                                  pred_fns)[:n]:
        corpus_inst = CorpusInst()
        
        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)
            
        true_corpus = ParallelGraphCorpus(inf=true_fname,
                                          graph_loading=LOAD_NONE)
        pred_corpus = merge_corpus(corpus_inst, true_corpus, merger)
        log.info("saving predictd corpus {0}".format(inst_fname))
        pred_corpus.write(pred_fname)
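A hedged usage sketch; the filenames are placeholders, and a descriptor is passed because the instance files are assumed to be in text format.

descriptor = create_setting().descriptor

merge_files(inst_fns=["exp/inst/dev001.inst"],
            true_fns=["exp/true/dev001_true.pgc"],
            pred_fns=["exp/pred/dev001_pred.pgc"],
            descriptor=descriptor)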
Example #25
def extract_phrases(file, delimiter="\t", verbose=False):
    corpus = ParallelGraphCorpus(inf=file)

    for graph_pair in corpus:
        graphs = graph_pair.get_graphs()

        for nodes, relation in graph_pair.alignments_iter():
            columns = [
                graphs.source.get_node_token_string(nodes.source), relation,
                graphs.target.get_node_token_string(nodes.target)
            ]

            if verbose:
                banks = graph_pair.get_banks()

                columns = [
                    banks.source.get_file_path(),
                    banks.target.get_file_path(), graphs.source.id,
                    graphs.target.id, nodes.source, nodes.target
                ] + columns

            print delimiter.join(columns).encode("utf-8")
Example #26
    def test_extract_dev(self):
        st = create_setting()
        st.validate = False
        # create tmp dirs for extraction output
        st.inst_dir = tempfile.mkdtemp()
        st.true_dir = tempfile.mkdtemp()

        extract(st)

        # check no of files
        self.assertEqual(len(st.dev_true_fns), len(st.dev_part_fns))
        self.assertEqual(len(st.dev_inst_fns), len(st.dev_part_fns))

        # test loading a corpus file
        corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])

        # test loading an instances file
        inst = CorpusInst()
        inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)
        self.assertEqual(len(corpus), len(inst))

        clean_inst(st)
        clean_true(st)
Example #27
    def open_corpus(self, filename):
        send(self.open_corpus, "statusDescription",
             "Loading corpus %s ..." % filename)

        # May raise errors such as IOError, not an XML file, corrupt format,
        # etc. Use of relax_gb_paths allows graphbank files to be located in
        # the same directory as the corpus file instead of the location
        # specified in the <file> element.
        corpus = ParallelGraphCorpus()
        corpus.read(inf=filename, relax_gb_paths=True)

        if not corpus:
            raise AlgraephException(
                "Parallel graph corpus contains no alignments")

        self._corpus = corpus
        self._filename = filename
        self._changed = False

        send(self.open_corpus, "statusDescription")
        send(self.open_corpus, "newCorpus")
        send(self.open_corpus, "newCorpusName")

        self.goto_graph_pair(0)
Example #28
    def test_extract_val_binary(self):
        st = create_setting()
        st.develop = False
        # create tmp dirs for extraction output
        st.inst_dir = tempfile.mkdtemp()
        st.true_dir = tempfile.mkdtemp()
        st.binary = True

        extract(st)

        # check no of files
        self.assertEqual(len(st.val_true_fns), len(st.val_part_fns))
        self.assertEqual(len(st.val_inst_fns), len(st.val_part_fns))

        # test loading a corpus file
        corpus = ParallelGraphCorpus(inf=st.val_true_fns[0])

        # test loading an instances file
        inst = CorpusInst()
        inst.loadbin(st.val_inst_fns[0])
        self.assertEqual(len(corpus), len(inst))

        clean_inst(st)
        clean_true(st)
Example #29
"""
run from the exp dir which contains a data subdir with the true pgc
files and an eval subdir for evaluation results
"""

import copy
import glob
import os

from daeso.pgc.corpus import ParallelGraphCorpus
from daeso.pgc.evaluate import AlignEval
from daeso_nl.ga.kb.baseline import greedy_align_equal_words, greedy_align_equal_words_roots

eval1 = AlignEval()
eval2 = AlignEval()

for pgc_fn in glob.glob("data/part*true.pgc"):
    true_corpus = ParallelGraphCorpus(inf=pgc_fn)
    pred_corpus = copy.deepcopy(true_corpus)

    greedy_align_equal_words(pred_corpus)
    eval1.add(true_corpus, pred_corpus, os.path.basename(pgc_fn))

    greedy_align_equal_words_roots(pred_corpus)
    eval2.add(true_corpus, pred_corpus, os.path.basename(pgc_fn))

eval1.run_eval()
eval1.write("eval/greedy_align_equals_words.txt")

eval2.run_eval()
eval2.write("eval/greedy_align_equals_words_roots.txt")
Example #30
 def setUp(self):
     self.corpus = ParallelGraphCorpus(
         inf="../exp/corpora/news/pgc/ma/2006-11/news-2006-11-aligned-part-00.pgc")