Esempio n. 1
0
def eval_files(true_fns, pred_fns, eval_fname, align_eval=None, n=None):
    """
    Evaluate predicted corpus files against the corresponding true files.

    Only the first n filenames of each list are used (all of them when n
    is None). The corpora are passed to eval_corpora as lazy generators,
    together with a name per file pair: the part of the true file's
    basename that precedes the first underscore.

    @param true_fns: list of true corpora filenames

    @param pred_fns: list of predicted corpora filenames

    @param eval_fname: name of file to which evaluation output is written
    (handed to eval_corpora unchanged)

    @keyword align_eval: AlignEval instance

    @keyword n: limit evaluation to the first n files

    @return: the result of eval_corpora
    """
    selected_true = true_fns[:n]
    selected_pred = pred_fns[:n]
    # both selections must have the same, non-zero, length
    assert len(selected_true) == len(selected_pred) > 0

    def lazy_corpora(fnames):
        # A generator: a corpus file is only read at the moment the
        # consumer asks for the next corpus.
        for fname in fnames:
            yield ParallelGraphCorpus(inf=fname, graph_loading=LOAD_NONE)

    names = ( os.path.basename(fname).split("_")[0]
              for fname in selected_true )

    return eval_corpora(lazy_corpora(selected_true),
                        lazy_corpora(selected_pred),
                        names,
                        eval_fname,
                        align_eval,
                        n)
Esempio n. 2
0
 def __init__(self):
     """
     Start with an empty corpus, no filename and nothing selected.
     """
     # the domain model, with its bookkeeping
     self._corpus = ParallelGraphCorpus()
     self._filename = None
     self._changed = False
     # the graph pair currently worked on
     # (presumably filled in once a corpus is opened -- TODO confirm)
     self._graph_pair_index = None
     self._graph_pair = None
     self._graphs = Pair(None, None)
     self._nodes = Pair(None, None)
     self._co_node_selection = False
     # the special relation which stands for "no relation"
     self._no_relation = "none"
Esempio n. 3
0
def extract_corpus(extractor, selector, corpus):
    """
    Extract instances from the selected graph pairs of a corpus.

    @param extractor: object whose extract(graph_pair) result is appended
    to the returned CorpusInst

    @param selector: callable; a graph pair is used only if the call
    selector(graph_pair) returns a true value

    @param corpus: iterable of graph pairs which also provides
    get_relations() and get_meta_data()

    @return: tuple (instances, selected corpus), the latter being a new
    ParallelGraphCorpus which holds only the selected graph pairs
    """
    instances = CorpusInst()
    # Build the selection from an empty corpus with the same relations and
    # meta data; according to the original author appending is faster than
    # deleting the rejected graph pairs.
    selected = ParallelGraphCorpus(relations=corpus.get_relations(),
                                   meta_data=corpus.get_meta_data())

    for pair in corpus:
        if not selector(pair):
            continue
        selected.append(pair)
        instances.append(extractor.extract(pair))

    return instances, selected
Esempio n. 4
0
def extract_corpus(extractor, selector, corpus):
    """
    Run the extractor over every graph pair accepted by the selector.

    @param extractor: provides extract(graph_pair)

    @param selector: boolean function of a graph pair

    @param corpus: source corpus; iterated once, and queried for its
    relations and meta data

    @return: (CorpusInst with one extraction result per accepted graph
    pair, new ParallelGraphCorpus with the accepted graph pairs)
    """
    extracted = CorpusInst()
    # An empty corpus carrying over relations and meta data is filled by
    # appending, which the original author found faster than removing
    # unwanted graph pairs from a full copy.
    relations = corpus.get_relations()
    meta_data = corpus.get_meta_data()
    accepted = ParallelGraphCorpus(relations=relations, meta_data=meta_data)

    for candidate in corpus:
        if selector(candidate):
            accepted.append(candidate)
            features = extractor.extract(candidate)
            extracted.append(features)

    return extracted, accepted
Esempio n. 5
0
def pgc_stats(files, 
              with_empty_nodes=False,
              with_failed_parses=False,
              with_punc=False,
              with_unaligned_roots=False,
              threshold=0,
              with_unaligned_graphs=False):
    """
    Collect statistics on parallel graph corpus files and their graphbanks.

    Every file is read with LOAD_ALL and passed to graph_pair_stats, which
    gets one row of the corpus table per file; each graphbank of the
    corpus is passed to graph_stats, which gets one row of the graphbank
    table per graphbank. Both tables are summarized before returning.

    @param files: sequence of corpus filenames

    The keyword options are forwarded unchanged to graph_pair_stats and
    graph_stats; with_unaligned_graphs goes to graph_stats only.

    @return: tuple (PgcStatsTable, GbStatsTable)
    """
    pgc_table = PgcStatsTable(size=len(files))
    gb_table = GbStatsTable()
    # options common to both statistics functions, in positional order
    shared_opts = (with_empty_nodes,
                   with_failed_parses,
                   with_punc,
                   with_unaligned_roots,
                   threshold)
    gb_opts = shared_opts + (with_unaligned_graphs,)
    gb_row = 0

    for pgc_row, fn in enumerate(files):
        pgc = ParallelGraphCorpus()
        pgc.read(inf=fn, graph_loading=LOAD_ALL)
        graph_pair_stats(os.path.basename(fn), pgc, pgc_table, pgc_row,
                         *shared_opts)

        # Corpus and graphbank statistics are interleaved on purpose:
        # doing them in separate passes would require keeping all
        # graphbanks in memory.
        graphbanks = pgc._graphbanks()
        gb_table.grow(len(graphbanks))

        for gb in graphbanks:
            graph_stats(gb, gb_table, gb_row, *gb_opts)
            gb_row += 1

    pgc_table.summarize()
    gb_table.summarize()

    return pgc_table, gb_table
Esempio n. 6
0
def read_corpora(corpus_fns, words_only):
    """
    Read a number of parallel graph corpora.

    @param corpus_fns: iterable of corpus filenames

    @param words_only: if true, corpora are read with LOAD_SPARSE and every
    alignment involving a non-terminal node is deleted; otherwise corpora
    are read with LOAD_NONE and left untouched

    @return: list of ParallelGraphCorpus objects, in the order of the
    filenames
    """
    graph_loading = LOAD_SPARSE if words_only else LOAD_NONE
    corpora = []

    for fn in corpus_fns:
        corpus = ParallelGraphCorpus(inf=fn, graph_loading=graph_loading)
        corpora.append(corpus)

        if not words_only:
            continue

        # Drop the alignments in which a non-terminal node takes part.
        # The original author regards this as an inefficient hack, chosen
        # to avoid complicating align_eval.add or adding a WordAlignEval
        # class for a rarely used option.
        for graph_pair in corpus:
            graphs = graph_pair.get_graphs()

            for nodes, _relation in graph_pair.alignments():
                involves_non_terminal = (
                    graphs.source.node_is_non_terminal(nodes.source) or
                    graphs.target.node_is_non_terminal(nodes.target))

                if involves_non_terminal:
                    graph_pair.del_align(nodes)

    return corpora
Esempio n. 7
0
 def test__iadd__(self):
     """
     corpus += other
     """
     # adding the corpus to itself; the expected length is 6
     self.pgc1 += self.pgc1
     self.assertEqual(len(self.pgc1), 6)
     
     # adding the enlarged corpus to another one; the expected length is 9
     pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
     pgc2 += self.pgc1
     self.assertEqual(len(pgc2), 9)
Esempio n. 8
0
    def test__add__(self):
        """
        corpus + other has the combined length of both operands
        """
        other = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        combined = self.pgc1 + other

        expected_len = len(self.pgc1) + len(other)
        self.assertEqual(len(combined), expected_len)
Esempio n. 9
0
    def test_aligner(self):
        """
        Smoke test: align the first three graph pairs of a true corpus.

        Nothing is asserted; the test passes if no call raises.
        """
        descriptor = create_setting().descriptor
        aligner = GraphAligner(
            descriptor=descriptor,
            classifier=TimblClassifier(descriptor, "exp/inst/dev001.inst"))

        true_corpus = ParallelGraphCorpus(inf="exp/true/dev001_true.pgc")

        for pair in true_corpus[:3]:
            # the return value is deliberately ignored
            aligner.align(pair, clear=True)
Esempio n. 10
0
 def setUp(self):
     """
     Provide the first graph pair of a news corpus part and a feature
     extractor built on the cornet_sim descriptor.
     """
     corpus_fname = ("../exp/corpora/news/pgc/ma/2006-11/"
                     "news-2006-11-aligned-part-00.pgc")
     self.graph_pair = ParallelGraphCorpus(inf=corpus_fname)[0]

     self.feat_extr = Extractor(Descriptor(cornet_sim),
                                node_selector=select_visible_node)
Esempio n. 11
0
    def test_parser_load_relaxed(self):
        """
        Read a corpus with relax_gb_paths=True from a directory which also
        holds copies of its graphbank files.

        Expects real graphs (no GraphStub) and SparseGraphBank banks of
        length 3 for every graph pair.
        """
        tmp_dir = tempfile.gettempdir()
        sources = ("data/corpus-2.pgc",
                   "../gb/data/source-gb-2.xml",
                   "../gb/data/target-gb-2.xml")
        copied = []

        try:
            for source in sources:
                target = os.path.join(tmp_dir, os.path.basename(source))
                shutil.copy(source, target)
                copied.append(target)

            pg_corpus = ParallelGraphCorpus()
            pg_corpus.read(os.path.join(tmp_dir, "corpus-2.pgc"),
                           relax_gb_paths=True)

            for graph_pair in pg_corpus:
                for graph in graph_pair._graphs:
                    self.assertFalse(isinstance(graph, GraphStub))

                for bank in graph_pair._banks:
                    self.assertEqual(bank.__class__, SparseGraphBank)
                    self.assertEqual(len(bank), 3)
        finally:
            # Remove whatever was copied, also when reading or an assertion
            # failed, so no stale files are left in the temp dir.
            for target in copied:
                os.remove(target)
Esempio n. 12
0
    def test_pickle(self):
        """
        An evaluated AlignEval instance survives a pickle round trip
        (protocol 2) and can still write its results afterwards.
        """
        # the same corpus object serves as both true and predicted corpus
        true_corpus = pred_corpus = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        align_eval = AlignEval()
        align_eval.add(true_corpus, pred_corpus, "corpus-1")
        align_eval.run_eval()

        pickle_file = tempfile.TemporaryFile()
        try:
            pickle.dump(align_eval, pickle_file, 2)
            # rewind before reading back what was just written
            pickle_file.seek(0)
            align_eval_2 = pickle.load(pickle_file)
        finally:
            # close (and thereby discard) the temp file even on failure
            pickle_file.close()

        align_eval_2.write()
Esempio n. 13
0
    def test_merge(self):
        """
        Smoke test: merge the first graph instance of an instance file into
        the first graph pair of the matching true corpus.

        Nothing is asserted; the test passes if no call raises.
        """
        instances = CorpusInst()
        instances.loadtxt("exp/inst/dev001.inst",
                          create_setting().descriptor.dtype)

        true_corpus = ParallelGraphCorpus(inf="exp/true/dev001_true.pgc")

        Merger().merge(instances[0], true_corpus[0])
Esempio n. 14
0
 def test__setslice__(self):
     """
     corpus[i:j] = other[k:l]
     """
     other = ParallelGraphCorpus(inf="data/corpus-2.pgc")
     # replace the last element by the first two of the other corpus;
     # the expected resulting length is 4
     replacement = other[:2]
     self.pgc1[-1:] = replacement
     self.assertEqual(len(self.pgc1), 4)

     # assigning a plain list to a slice must be refused
     self.assertRaises(TypeError, ParallelGraphCorpus.__setslice__,
                       self.pgc1, 1, 1, ["x"])
Esempio n. 15
0
 def test_append(self):
     """
     corpus.append(graph_pair)
     """
     other = ParallelGraphCorpus(inf="data/corpus-2.pgc")
     third_pair = other[2]
     self.pgc1.append(third_pair)
     self.assertEqual(len(self.pgc1), 4)

     # NOTE(review): this exercises __setslice__, not append -- it looks
     # copied from the slice assignment test; confirm whether append with
     # an invalid argument was meant to be checked here.
     self.assertRaises(TypeError, ParallelGraphCorpus.__setslice__,
                       self.pgc1, 1, 1, ["x"])
Esempio n. 16
0
def pgc_zip(zip_filename, pgc_filenames):
    """
    Pack parallel graph corpus files, together with the graphbank files
    they refer to, into a new zip archive.

    All files are stored flat under a single archive directory, named
    after the zip file without its extension.

    @param zip_filename: name of the zip archive to create (opened in
    "w" mode)

    @param pgc_filenames: corpus filenames, passed through multiglob
    """
    zip_arch = zipfile.ZipFile(zip_filename, "w")
    arch_dir = os.path.splitext( os.path.basename(zip_filename))[0]
    
    # try/finally guarantees that the archive is closed when reading a
    # corpus or adding a file fails; "with" is avoided because ZipFile
    # only became a context manager in Python 2.7
    try:
        for corpus_filename in multiglob(pgc_filenames):
            # add corpus to archive
            arch_filename = os.path.join( arch_dir,
                                          os.path.basename(corpus_filename) )
            zip_arch.write(corpus_filename, arch_filename)
            
            # graphs are not needed, only the paths of the graphbank files
            corpus = ParallelGraphCorpus(inf=corpus_filename,
                                         graph_loading=LOAD_NONE)
    
            for gb in corpus._graphbanks():
                gb_filename = gb.get_file_path()
                # add graphbank files to archive
                arch_filename = os.path.join( arch_dir,
                                              os.path.basename(gb_filename) )
                zip_arch.write(gb_filename, arch_filename)
    finally:
        zip_arch.close()
Esempio n. 17
0
def pgc_zip(zip_filename, pgc_filenames):
    """
    Create a zip archive containing parallel graph corpus files and the
    graphbank files of each corpus.

    Archive members are stored by basename under one directory whose name
    is the basename of the zip file minus its extension.

    @param zip_filename: path of the archive to write

    @param pgc_filenames: corpus filenames, expanded by multiglob
    """
    zip_arch = zipfile.ZipFile(zip_filename, "w")
    arch_dir = os.path.splitext(os.path.basename(zip_filename))[0]

    # Close the archive on every exit path, including errors raised while
    # reading a corpus or adding a file. try/finally is used because
    # ZipFile supports the "with" statement only from Python 2.7 on.
    try:
        for corpus_filename in multiglob(pgc_filenames):
            # add corpus to archive
            arch_filename = os.path.join(arch_dir,
                                         os.path.basename(corpus_filename))
            zip_arch.write(corpus_filename, arch_filename)

            # loaded without graphs: only the graphbank file paths are used
            corpus = ParallelGraphCorpus(inf=corpus_filename,
                                         graph_loading=LOAD_NONE)

            for gb in corpus._graphbanks():
                gb_filename = gb.get_file_path()
                # add graphbank files to archive
                arch_filename = os.path.join(arch_dir,
                                             os.path.basename(gb_filename))
                zip_arch.write(gb_filename, arch_filename)
    finally:
        zip_arch.close()
Esempio n. 18
0
 def __init__(self):
     """
     Create the model in its initial state: an empty corpus, no file
     name, no pending changes and no current graph pair.
     """
     # the domain model
     self._corpus = ParallelGraphCorpus()
     self._filename, self._changed = None, False
     # current position in the corpus and what is selected there
     # (presumably set when a graph pair is visited -- TODO confirm)
     self._graph_pair, self._graph_pair_index = None, None
     self._graphs = Pair(None, None)
     self._nodes = Pair(None, None)
     # the special relation which stands for "no relation"
     self._no_relation = "none"
     self._co_node_selection = False
Esempio n. 19
0
 def test_merge_val(self):
     """
     Merging in validation mode yields one predicted corpus file per true
     corpus file, each with a non-zero total graph pair length.
     """
     self.st.develop = False
     merge(self.st)

     n_pred = len(self.st.val_pred_fns)
     n_true = len(self.st.val_true_fns)
     self.assertEqual(n_pred, n_true)

     # check that there are alignments (assumes len(graph_pair) counts
     # the alignments of a graph pair -- TODO confirm)
     for pred_fname in self.st.val_pred_fns:
         pred_corpus = ParallelGraphCorpus(pred_fname,
                                           graph_loading=LOAD_NONE)
         align_count = 0
         for graph_pair in pred_corpus:
             align_count += len(graph_pair)
         self.assertTrue(align_count)
Esempio n. 20
0
def join_pgc(corpus_fnames):
    """
    join parallel graph corpora
    
    @param corpus_fnames: list of parallel graph corpora filenames; must not
    be empty, and is left unmodified
    
    @return: new ParallelGraphCorpus object
    
    Corpora are assumed to have the same relations.
    Graphbanks are not read, but graphbanks in the result are purged.

    The corpus from the last filename comes first in the result, followed
    by the others in list order.
    """
    # work on a copy, so the caller's list does not lose its last element
    remaining = list(corpus_fnames)
    corpus = ParallelGraphCorpus(inf=remaining.pop(),
                                 graph_loading=LOAD_NONE)
    
    # Suppress "DaesoWarning: meta data of other corpus is discarded!",
    # but only while joining: catch_warnings restores the previous warning
    # filters on exit instead of silencing DaesoWarning for the remainder
    # of the program.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DaesoWarning)
        
        for fname in remaining:
            corpus += ParallelGraphCorpus(inf=fname, graph_loading=LOAD_NONE)
        
    # corpus.purge() not required, as it is called during corpus.write
    return corpus
Esempio n. 21
0
    def open_corpus(self, filename):
        """
        Read a parallel graph corpus from file, make it the current corpus
        and go to its first graph pair.

        Raises AlgraephException if the corpus read is empty; errors raised
        while reading (I/O errors, not an xml file, corrupt format, etc.)
        are not caught here.
        """
        status_text = "Loading corpus %s ..." % filename
        send(self.open_corpus, "statusDescription", status_text)

        # relax_gb_paths allows the graphbank files to reside in the same
        # directory as the corpus file, instead of the location specified
        # in the <file> element
        new_corpus = ParallelGraphCorpus()
        new_corpus.read(inf=filename, relax_gb_paths=True)

        if not new_corpus:
            raise AlgraephException("Parallel graph corpus contains no alignments")

        # only a successfully read, non-empty corpus replaces the old state
        self._corpus, self._filename, self._changed = new_corpus, filename, False

        # the first message carries no text (presumably this resets the
        # status line -- TODO confirm); the others announce the new corpus
        for topic in ("statusDescription", "newCorpus", "newCorpusName"):
            send(self.open_corpus, topic)

        self.goto_graph_pair(0)
Esempio n. 22
0
 def test_parser_load_relaxed(self):
     """
     Reading with relax_gb_paths=True works on a corpus file which has
     been copied, along with its graphbank files, to the temp dir.

     Every graph must be a real graph (no GraphStub) and every bank a
     SparseGraphBank of length 3.
     """
     tmp_dir = tempfile.gettempdir()
     copied = []

     try:
         for source in ("data/corpus-2.pgc",
                        "../gb/data/source-gb-2.xml",
                        "../gb/data/target-gb-2.xml"):
             target = os.path.join(tmp_dir, os.path.basename(source))
             shutil.copy(source, target)
             copied.append(target)

         pg_corpus = ParallelGraphCorpus()
         pg_corpus.read(os.path.join(tmp_dir, "corpus-2.pgc"),
                        relax_gb_paths=True)

         for graph_pair in pg_corpus:
             for graph in graph_pair._graphs:
                 self.assertFalse(isinstance(graph, GraphStub))

             for bank in graph_pair._banks:
                 self.assertEqual(bank.__class__, SparseGraphBank)
                 self.assertEqual(len(bank), 3)
     finally:
         # clean up the copies made so far, whether or not the test passed
         for target in copied:
             os.remove(target)
Esempio n. 23
0
    def open_corpus(self, filename):
        """
        Load the corpus in the given file and make it the current one.

        Status and "new corpus" messages are sent around the load, and the
        first graph pair is visited at the end. An empty corpus is rejected
        with an AlgraephException; exceptions raised by reading the file
        propagate to the caller.
        """
        send(self.open_corpus,
             "statusDescription",
             "Loading corpus %s ..." % filename)

        # May raise errors such as IOErrors, not an xml file, corrupt
        # format, etc. With relax_gb_paths, graphbank files may be located
        # in the same directory as the corpus file rather than at the
        # location given in the <file> element.
        loaded = ParallelGraphCorpus()
        loaded.read(inf=filename, relax_gb_paths=True)

        if not loaded:
            raise AlgraephException(
                "Parallel graph corpus contains no alignments")

        self._corpus = loaded
        self._filename = filename
        self._changed = False

        # sent in this fixed order, each without further arguments
        for message in ("statusDescription", "newCorpus", "newCorpusName"):
            send(self.open_corpus, message)

        self.goto_graph_pair(0)
Esempio n. 24
0
    def test_create_parts_val(self):
        """
        Creating validation parts gives one part file per validation part,
        and the first part file can be read as a corpus.
        """
        setting = create_setting()
        setting.part = True
        setting.develop = False
        setting.val_parts = partition.val_parts
        setting.part_dir = setting.make_tmp_dir()

        create_parts(setting)

        self.assertTrue(setting.val_parts)
        self.assertEqual(len(setting.val_parts),
                         len(setting.val_part_fns))

        # test if the part is readable; constructing the corpus suffices
        first_part_fname = setting.val_part_fns[0]
        ParallelGraphCorpus(inf=first_part_fname)

        clean_parts(setting)
Esempio n. 25
0
    def test_extract_with_pp_graph_hooks(self):
        """
        Extract a feature whose value is produced by a graph preprocessing
        hook
        """
        setting = create_setting()
        setting.validate = False
        # extraction output goes to fresh temporary directories
        setting.inst_dir = tempfile.mkdtemp()
        setting.true_dir = tempfile.mkdtemp()

        # The names of the two nested functions are left as they are, in
        # case the feature name is derived from them (the check at the end
        # reads column "x").

        def pp_hook1(graphs):
            # preprocessing: set attribute "x" to "y" on every node of
            # every graph
            for graph in graphs:
                for node_attrs in graph.node.values():
                    node_attrs[u"x"] = u"y"

        def ff_x(nodes, graphs, **kwargs):
            # feature function: relies on pp_hook1 having set "x" on the
            # source node
            return graphs.source.node[nodes.source][u"x"]

        # a single feature description; descriptor and extractor are
        # automatically derived from the features
        feature = Feat(ff_x, "S1", pp_graph_hooks=[pp_hook1])
        setting.features = (feature, )

        extract(setting)

        # one true corpus file and one instance file per development part
        n_parts = len(setting.dev_part_fns)
        self.assertEqual(len(setting.dev_true_fns), n_parts)
        self.assertEqual(len(setting.dev_inst_fns), n_parts)

        # the first corpus file and instance file must load and agree in
        # length
        corpus = ParallelGraphCorpus(inf=setting.dev_true_fns[0])
        instances = CorpusInst()
        instances.loadtxt(setting.dev_inst_fns[0], setting.descriptor.dtype)
        self.assertEqual(len(corpus), len(instances))

        # check values produced by preprocessing function
        x_is_y = instances[0]["x"] == "y"
        self.assertTrue(all(x_is_y))

        clean_inst(setting)
        clean_true(setting)
Esempio n. 26
0
def extract_files(extractor,
                  selector,
                  part_fns,
                  inst_fns,
                  true_fns,
                  binary=False):
    """
    Extract features from corpus part files, writing for each part an
    instance file and a true corpus file.

    @param extractor: Extractor instance for feature extraction from graph
    pairs

    @param selector: boolean graph pair selection function

    @param part_fns: list of corpus filenames (must not be empty)

    @param inst_fns: list of instance filenames to be created

    @param true_fns: list of true corpus filenames to be created

    @keyword binary: save corpus instances in binary rather than text format

    The three filename lists are processed in parallel with zip.

    Note that the true corpus files may be substantially different from the
    original corpus files because of node and graph selection. They are
    written because that makes evaluation against predicted corpus files
    much easier.
    """
    assert isinstance(extractor, Extractor)
    assert len(part_fns)

    fname_triples = zip(part_fns, inst_fns, true_fns)

    for part_fname, inst_fname, true_fname in fname_triples:
        instances, true_corpus = extract_corpus(
            extractor,
            selector,
            ParallelGraphCorpus(inf=part_fname))

        log.info("saving instances file {0}".format(inst_fname))
        # pick the writer that matches the requested format
        save_instances = instances.savebin if binary else instances.savetxt
        save_instances(inst_fname)

        log.info("saving true corpus file {0}".format(true_fname))
        true_corpus.write(true_fname, pprint=True)
Esempio n. 27
0
    def test_align(self):
        """
        Align the first three graph pairs of a true corpus and check that
        at least one of them ends up with alignments.
        """
        # graph aligner on top of a Timbl classifier
        descriptor = create_setting().descriptor
        graph_aligner = GraphAligner(
            descriptor=descriptor,
            classifier=TimblClassifier(descriptor, "exp/inst/dev001.inst"))

        # corpus aligner wrapping the graph aligner
        corpus_aligner = CorpusAligner(
            graph_aligner=graph_aligner,
            graph_selector=select_parsed_graph_pair)

        # align part of corpus; the aligner is handed a second slice of
        # the already sliced corpus
        true_corpus = ParallelGraphCorpus(inf="exp/true/dev001_true.pgc")
        corpus = true_corpus[:3]
        corpus_aligner.align(corpus[:3], clear=True)

        self.assertTrue(corpus[0].alignments() or
                        corpus[1].alignments() or
                        corpus[2].alignments())
Esempio n. 28
0
 def test_merge_corpus(self):
     """
     Merging corpus instances into a true corpus gives a non-empty
     predicted corpus whose alignments agree with the match relations of
     the instances.
     """
     setting = create_setting()

     instances = CorpusInst()
     instances.loadtxt(setting.dev_inst_fns[0], setting.descriptor.dtype)

     true_corpus = ParallelGraphCorpus(inf=setting.dev_true_fns[0],
                                       graph_loading=LOAD_NONE)
     pred_corpus = merge_corpus(instances, true_corpus, Merger())
     self.assertTrue(len(pred_corpus))

     # instances whose match relation equals str(None) are skipped
     no_relation = str(None)

     for graph_inst, graph_pair in zip(instances, pred_corpus):
         for inst in graph_inst:
             relation = inst["match_relation"]
             if relation != no_relation:
                 nodes = Pair(inst["source_node"], inst["target_node"])
                 self.assertEqual(graph_pair.get_align(nodes), relation)
Esempio n. 29
0
def extract_phrases(file, delimiter="\t", verbose=False):
    corpus = ParallelGraphCorpus(inf=file)

    for graph_pair in corpus:
        graphs = graph_pair.get_graphs()

        for nodes, relation in graph_pair.alignments_iter():
            columns = [
                graphs.source.get_node_token_string(nodes.source), relation,
                graphs.target.get_node_token_string(nodes.target)
            ]

            if verbose:
                banks = graph_pair.get_banks()

                columns = [
                    banks.source.get_file_path(),
                    banks.target.get_file_path(), graphs.source.id,
                    graphs.target.id, nodes.source, nodes.target
                ] + columns

            print delimiter.join(columns).encode("utf-8")
Esempio n. 30
0
def merge_files(inst_fns, true_fns, pred_fns, merger=Merger(),
                descriptor=None, n=None, binary=False):
    """
    Merge corpus instance files
    
    For every (instance file, true corpus file) pair, the instances are
    merged into the true corpus by merge_corpus and the result is written
    to the corresponding predicted corpus file.
    
    @param inst_fns: list of corpus instance filenames
    
    @param true_fns: list of corpus filenames containing the true alignments

    @param pred_fns: list of predicted corpus filenames to be created
    
    @param merger: instance of Merger class for merging instances into a graph
    pair
    
    @keyword descriptor: a Descriptor instance, required if corpus instances
    are loaded in text format
    
    @keyword n: limit merging to the first n files
    
    @keyword binary: corpus instances in binary rather than text format
    """
    assert isinstance(merger, Merger)
    assert len(inst_fns) == len(true_fns) > 0
    
    # list() keeps the slice valid where zip() returns an iterator
    # instead of a list
    fname_triples = list(zip(inst_fns, true_fns, pred_fns))[:n]
    
    for inst_fname, true_fname, pred_fname in fname_triples:
        corpus_inst = CorpusInst()
        
        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)
            
        # graphs are not loaded for the true corpus
        true_corpus = ParallelGraphCorpus(inf=true_fname,
                                          graph_loading=LOAD_NONE)
        pred_corpus = merge_corpus(corpus_inst, true_corpus, merger)
        # report the file which is actually written
        log.info("saving predicted corpus {0}".format(pred_fname))
        pred_corpus.write(pred_fname)
Esempio n. 31
0
    def test_extract_dev(self):
        """
        Extraction on development data gives one true corpus file and one
        instance file per part, and the first of each agree in length.
        """
        setting = create_setting()
        setting.validate = False
        # extraction output goes to fresh temporary directories
        setting.inst_dir = tempfile.mkdtemp()
        setting.true_dir = tempfile.mkdtemp()

        extract(setting)

        # check number of files
        n_parts = len(setting.dev_part_fns)
        self.assertEqual(len(setting.dev_true_fns), n_parts)
        self.assertEqual(len(setting.dev_inst_fns), n_parts)

        # load the first corpus file and the first (text) instance file
        corpus = ParallelGraphCorpus(inf=setting.dev_true_fns[0])
        instances = CorpusInst()
        instances.loadtxt(setting.dev_inst_fns[0], setting.descriptor.dtype)
        self.assertEqual(len(corpus), len(instances))

        clean_inst(setting)
        clean_true(setting)
Esempio n. 32
0
    def test_extract_val_binary(self):
        """
        Extraction on validation data with binary instance files gives one
        true corpus file and one instance file per part, and the first of
        each agree in length.
        """
        setting = create_setting()
        setting.develop = False
        setting.binary = True
        # extraction output goes to fresh temporary directories
        setting.inst_dir = tempfile.mkdtemp()
        setting.true_dir = tempfile.mkdtemp()

        extract(setting)

        # check number of files
        n_parts = len(setting.val_part_fns)
        self.assertEqual(len(setting.val_true_fns), n_parts)
        self.assertEqual(len(setting.val_inst_fns), n_parts)

        # load the first corpus file and the first (binary) instance file
        corpus = ParallelGraphCorpus(inf=setting.val_true_fns[0])
        instances = CorpusInst()
        instances.loadbin(setting.val_inst_fns[0])
        self.assertEqual(len(corpus), len(instances))

        clean_inst(setting)
        clean_true(setting)
Esempio n. 33
0
run from the exp dir which contains a data subdir with the true pgc
files and an eval subdir for evaluation results
"""

import copy
import glob
import os

from daeso.pgc.corpus import ParallelGraphCorpus
from daeso.pgc.evaluate import AlignEval
from daeso_nl.ga.kb.baseline import greedy_align_equal_words, greedy_align_equal_words_roots

# one evaluation per baseline aligner
eval1 = AlignEval()
eval2 = AlignEval()

for pgc_fn in glob.glob("data/part*true.pgc"):
    part_name = os.path.basename(pgc_fn)
    true_corpus = ParallelGraphCorpus(inf=pgc_fn)
    # the baselines work on a deep copy, leaving the true corpus as read
    pred_corpus = copy.deepcopy(true_corpus)

    greedy_align_equal_words(pred_corpus)
    eval1.add(true_corpus, pred_corpus, part_name)

    # NOTE(review): the second baseline receives the same pred_corpus
    # object the first one was applied to, not a fresh copy -- confirm
    # that this is intended.
    greedy_align_equal_words_roots(pred_corpus)
    eval2.add(true_corpus, pred_corpus, part_name)

# evaluate and write the results, first baseline first
for evaluation, out_fn in (
        (eval1, "eval/greedy_align_equals_words.txt"),
        (eval2, "eval/greedy_align_equals_words_roots.txt")):
    evaluation.run_eval()
    evaluation.write(out_fn)
Esempio n. 34
0
 def setUp(self):
     """
     Read a fresh copy of the first test corpus for every test.
     """
     corpus = ParallelGraphCorpus(inf="data/corpus-1.pgc")
     self.pgc1 = corpus
Esempio n. 35
0
    help="verbose ouput to stderr"
    )


# Parse the command line; `parser` is set up before this point.
args = parser.parse_args()

# The corpus filenames to join (presumably multiglob expands the glob
# patterns given as file arguments -- TODO confirm).
pgc_fns = multiglob(args.file)

def log(s):
    """
    Write s, prefixed with "***", to stderr if args.verbose is set;
    otherwise do nothing.
    """
    if not args.verbose:
        return
    print >>sys.stderr, "***", s
        
        
# The first file provides the corpus to which all others are added.
first_fn = pgc_fns[0]
log("Reading corpus from " + first_fn)
corpus = ParallelGraphCorpus(inf=first_fn)

for fn in pgc_fns[1:]:
    log("Joining corpus from " + fn)
    # __iadd__ also checks if another corpus is compatible w.r.t. relations
    # and meta-data
    other_corpus = ParallelGraphCorpus(inf=fn)
    corpus += other_corpus

# Purge the corpus of duplicate graphbanks held in memory
log("Purging corpus")
corpus.purge()

# No file is passed to write(); args.format switches pretty printing.
log("Writing corpus")
corpus.write(pprint=args.format)
Esempio n. 36
0
 def test_purge(self):
     """
     purge() in three situations -- a corpus added to itself, two corpora
     read from the same file, two corpora read from different files --
     and the graphbanks left after deleting all graph pairs.
     """
     # Case 1: identical graphbanks (corpus added to itself).
     # Two graphbanks before purging, and the same ones after.
     corpus = ParallelGraphCorpus(inf="data/corpus-1.pgc")
     corpus += corpus
     before = corpus._graphbanks()
     self.assertEqual(len(before), 2)
     corpus.purge()
     after = corpus._graphbanks()
     self.assertEqual(before, after)

     # Case 2: equal graphbanks (same file read twice).
     # Four graphbanks before purging, two after.
     corpus = ParallelGraphCorpus(inf="data/corpus-1.pgc")
     same_file_corpus = ParallelGraphCorpus(inf="data/corpus-1.pgc")
     corpus += same_file_corpus
     before = corpus._graphbanks()
     self.assertEqual(len(before), 4)
     corpus.purge()
     after = corpus._graphbanks()
     self.assertEqual(len(after), 2)

     # Case 3: different graphbanks (two different files).
     # Four graphbanks before purging, and the same ones after.
     corpus = ParallelGraphCorpus(inf="data/corpus-1.pgc")
     other_file_corpus = ParallelGraphCorpus(inf="data/corpus-2.pgc")
     corpus += other_file_corpus
     before = corpus._graphbanks()
     self.assertEqual(len(before), 4)
     corpus.purge()
     after = corpus._graphbanks()
     self.assertEqual(before, after)

     # Deleting all graph pairs removes the dependencies on graphbanks,
     # so none must be reported any longer.
     del corpus[:]
     remaining = corpus._graphbanks()
     self.assertEqual(len(remaining), 0)
Esempio n. 37
0
else:
    target_is_dir = False
    
    if len(args.source) > 1:
        parser.print_usage()
        exit("error: too many arguments")
    
    
# Check every source in turn. A source that fails a check is reported on
# stderr with a warning and skipped; processing continues with the next.
for source in args.source:
    # check 1: the source must exist
    if not exists(source):
        stderr.write("warning: source " + repr(source) + " does not exist "
                     "(not copied)\n")
        continue

    # check 2: the source must be readable as a parallel graph corpus;
    # any exception raised while reading counts as "not valid" and its
    # message is written to stderr as well
    try:
        corpus = ParallelGraphCorpus(inf=source, graph_loading=LOAD_NONE)
    except Exception, inst:
        stderr.write(str(inst) + "\n")
        stderr.write("warning: source " + repr(source) + 
                     " is not a valid parallel graph corpus (not copied) \n")
        continue
    
    # if the target is a directory, the file keeps its basename inside it;
    # otherwise the target is used as the file name itself
    if isdir(args.target):
        target = join(args.target, basename(source))
    else:
        target = args.target
        
    # check 3: source and target must not be the same file
    if exists(target) and samefile(source, target):
        stderr.write("warning: source " + repr(source) + " and target " + 
                     repr(target) + " are the same file (not copied) \n")
        continue
Esempio n. 38
0
class Aligner(object):
    """
    the Algraeph application model 
    
    Keeps track of the currently opened parallel graph corpus, the current
    graph pair, and the currently selected pair of nodes. State changes are
    announced by calling send() with the originating method as sender and a
    message name such as "newCorpus" or "newGraphPair.viz".
    """
    
    def __init__(self):
        """
        Create a model holding an empty corpus, with no graph pair and no
        nodes selected
        """
        self._corpus = ParallelGraphCorpus()
        # the domain model
        self._changed = False
        self._filename = None
        self._graph_pair = None
        self._graph_pair_index = None
        self._graphs = Pair(None, None)
        self._nodes = Pair(None, None)
        # the special relation which stands for "no relation"
        self._no_relation = "none"
        self._co_node_selection = False
        
    # ------------------------------------------------------------------------------
    # Corpus
    # ------------------------------------------------------------------------------

    def open_corpus(self, filename):
        """
        Read a parallel graph corpus from file and make it the current corpus
        
        On success the first graph pair becomes the current graph pair and
        the "changed" flag is reset.
        
        @param filename: name of the parallel graph corpus file
        
        @raise AlgraephException: if the corpus read from file is empty
        """
        send(self.open_corpus, "statusDescription", "Loading corpus %s ..." % filename)

        # May raise errors such as IOErrors, not an xml file, corrupt format,
        # etc. Use of relax_gb_paths allows graphbank files to be located in
        # the same directory as the corpus file instead of the location
        # specified in the <file> element
        corpus = ParallelGraphCorpus()
        corpus.read(inf=filename, relax_gb_paths=True)
        
        if not corpus:
            raise AlgraephException("Parallel graph corpus contains no alignments")
        
        # only replace the current state once reading has succeeded
        self._corpus = corpus
        self._filename = filename
        self._changed = False
            
        send(self.open_corpus, "statusDescription")
        send(self.open_corpus, "newCorpus")
        send(self.open_corpus, "newCorpusName")

        self.goto_graph_pair(0)
        # implies send("newGraphPair"), and sets self._graph_pair,
        # self._graph_pair_index, self._graphs and self._nodes

        
    def save_corpus(self, filename=None):
        """
        Write the current corpus to file and reset the "changed" flag
        
        @keyword filename: new corpus filename; if omitted (or empty), the
        corpus is written to the filename currently stored
        """
        if filename:
            self._filename = filename
            send(self.save_corpus, "newCorpusName")
            
        send(self.save_corpus, "statusDescription", "Saving corpus %s ..." % self._filename)        
        
        self._corpus.write(self._filename, pprint=True)
        self._changed = False
            
        send(self.save_corpus, "statusDescription")
        
        
    def get_corpus_len(self):
        """
        returns the number of graph pairs in the current corpus
        """
        return len(self._corpus)
    

    def get_corpus_filename(self):
        """
        returns the filename of the current corpus, or None if no filename
        has been set
        """
        return self._filename

    
    def get_corpus_dir(self):
        """
        returns the directory part of the corpus filename, or None if it
        cannot be determined (e.g. no filename has been set yet)
        """
        try:
            return dirname(self._filename)
        except (AttributeError, TypeError):
            # raised by dirname when the filename is still None
            return None
    
    
    def corpus_changed(self):
        """
        returns True if the corpus has unsaved changes
        """
        return self._changed
        
    
    # ------------------------------------------------------------------------------
    # Treebanks
    # ------------------------------------------------------------------------------    
 
    def get_graphbanks_format(self):
        """
        returns the format of the source graphbank of the first graph pair,
        which is taken to be the format of all graphbanks
        """
        # The ParallelGraphCorpus class in principle supports graphbanks in
        # different formats, although untested for the time being. Formats are
        # therefore stored as a property of the graphbanks, but there is no
        # global format defined as a property of the corpus. So getting "the
        # graphbanks format" is not straightforward. We will make the
        # assumption that all graphbanks are in the same format, and therefore
        # it is sufficient to look at any graphbank linked to an arbitrary
        # graph pair.
        # NOTE(review): indexing [0] fails on an empty corpus -- callers are
        # presumably expected to open a corpus first; confirm.
        return self._corpus[0].get_source_bank().get_format()
        
    # ------------------------------------------------------------------------------
    # Graphs (GraphPair and DaesoGraph)
    # ------------------------------------------------------------------------------    
    
    def get_graph_pair(self):
        """
        returns the current graph pair, or None if there is none
        """
        return self._graph_pair
    
    
    def goto_prev_graph_pair(self):
        """
        Make the graph pair preceding the current one the current graph
        pair; has no effect if the current graph pair is the first one
        """
        self.goto_graph_pair(self._graph_pair_index - 1)

    def goto_next_graph_pair(self):
        """
        Make the graph pair following the current one the current graph
        pair; has no effect if the current graph pair is the last one
        """
        self.goto_graph_pair(self._graph_pair_index + 1)
        
        
    def goto_graph_pair(self, index):
        """
        Make the graph pair at the given position the current graph pair
        and clear the node selection
        
        An index outside the range of the corpus is silently ignored.
        
        @param index: zero-based position of the graph pair in the corpus
        """
        # don't use try-except here, because negative index is allowed for list
        if 0 <= index < len(self._corpus):
            self._graph_pair = self._corpus[index]
            self._graph_pair_index = index
            self._graphs = self._graph_pair.get_graphs()
            self._nodes = Pair(None, None)
            
            send(self.goto_graph_pair, "newGraphPair.viz")
            send(self.goto_graph_pair, "newGraphPair.gui")
    
        
    def get_from_graph(self):
        """
        returns the source graph of the current graph pair
        """
        return self._graphs.source
    
    def get_to_graph(self):
        """
        returns the target graph of the current graph pair
        """
        return self._graphs.target
    
        
    def get_from_graph_tokens(self):
        """
        returns the token string of the source graph
        """
        return self._graphs.source.get_graph_token_string()
        
    def get_to_graph_tokens(self):
        """
        returns the token string of the target graph
        """
        return self._graphs.target.get_graph_token_string()
    
    
    def get_graph_pair_counter(self):
        """
        returns a tuple of the one-based position of the current graph pair
        and the total number of graph pairs in the corpus
        """
        # counting starts from 1
        return (self._graph_pair_index + 1, len(self._corpus))
        
    
    # ------------------------------------------------------------------------------
    # Nodes
    # ------------------------------------------------------------------------------
    
    def co_node_selection_mode(self, state=False):
        """
        Switch co-node selection on or off; when on, selecting a node also
        selects the node aligned to it in the other graph
        
        @keyword state: True to enable, False to disable
        """
        self._co_node_selection = state
        
    
    def set_from_node(self, node=None):
        """
        Select a node in the source graph
        
        @keyword node: the node to select, or None to clear the selection
        """
        self._nodes.source = node
        
        if self._co_node_selection:
            # follow the alignment to select the counterpart as well
            self._nodes.target = self.get_aligned_to_node()
            
        send(self.set_from_node, "newNodeSelect.viz")
        send(self.set_from_node, "newNodeSelect.gui")

        
    def set_to_node(self, node=None):
        """
        Select a node in the target graph
        
        @keyword node: the node to select, or None to clear the selection
        """
        self._nodes.target = node
        
        if self._co_node_selection:
            # follow the alignment to select the counterpart as well
            self._nodes.source = self.get_aligned_from_node()
            
        send(self.set_to_node, "newNodeSelect.viz")
        send(self.set_to_node, "newNodeSelect.gui")
            
    
    def get_from_node(self):
        """
        returns the selected node in the source graph, or None
        """
        return self._nodes.source

    
    def get_to_node(self):
        """
        returns the selected node in the target graph, or None
        """
        return self._nodes.target
    
    
    def nodes_are_selected(self):
        """
        returns True if every member of the node pair is true, that is, if a
        node is selected in both graphs
        """
        # NOTE(review): relies on truth value, so a falsy node identifier
        # (e.g. 0 or "") would count as "not selected" -- confirm node types.
        return all(self._nodes)
    
    
    def get_from_node_tokens(self):
        """
        returns the token string of the selected source node, or the empty
        string if there is none
        """
        return ( self._graphs.source.get_node_token_string(self._nodes.source) or
                 "" )
        
    
    def get_to_node_tokens(self):
        """
        returns the token string of the selected target node, or the empty
        string if there is none
        """
        return ( self._graphs.target.get_node_token_string(self._nodes.target) or
                 "" )
    
    # ------------------------------------------------------------------------------
    # Alignment
    # ------------------------------------------------------------------------------
    
    def get_relation_set(self):
        """
        returns a list with the special "no relation" label followed by the
        relations defined by the corpus
        """
        try:
            return [self._no_relation] + self._corpus.get_relations()
        except TypeError:
            # the corpus relations cannot be appended to a list
            # (e.g. get_relations() returned None)
            return [self._no_relation]
        
        
    def get_node_pair_relation(self):
        """
        returns the relation between the selected nodes, or the special
        "no relation" label if they are not aligned
        """
        return self._graph_pair.get_align(self._nodes) or self._no_relation
    
        
    def set_node_pair_relation(self, relation):
        """
        Align the selected nodes with the given relation, or remove their
        alignment if the relation is the special "no relation" label
        
        Has no effect unless a node is selected in both graphs.
        
        @param relation: relation label
        """
        if self.nodes_are_selected():
            if relation != self._no_relation:
                self._graph_pair.add_align(self._nodes, relation)
            else:
                self._graph_pair.del_align(self._nodes)
                
            self._changed = True
                
            send(self.set_node_pair_relation, "newRelation.viz")
            send(self.set_node_pair_relation, "newRelation.gui")
    

    def get_aligned_to_node(self):
        """
        Get 'to' node aligned to the selected 'from' node
        """
        return self._graph_pair.get_aligned_target_node(self._nodes.source)
    
    
    def get_aligned_from_node(self):
        """
        Get 'from' node aligned to the selected 'to' node
        """
        return self._graph_pair.get_aligned_source_node(self._nodes.target)

    
    def get_auto_fold_equal_nodes(self):
        """
        Get lists of non-terminal 'from' and 'to' nodes aligned with an 
        'equals' relation
        """
        # ignoring terminals, so the list may be of unequal size
        from_nodes = []
        to_nodes = []
        
        for (nodes, rel) in self._graph_pair.alignments_iter():
            if rel == "equals":
                if self._graphs.source.node_is_non_terminal(nodes.source):
                    from_nodes.append(nodes.source)
                    
                if self._graphs.target.node_is_non_terminal(nodes.target):
                    to_nodes.append(nodes.target)
                    
        return from_nodes, to_nodes
        
    #------------------------------------------------------------------------------
    # Comments
    #------------------------------------------------------------------------------    
        
    def get_comment(self):
        """
        returns the text of the comment on the current graph pair, or the
        empty string if there is no graph pair, no comment element, or a
        comment element without text
        """
        try:
            text = self._graph_pair.get_meta_data().find("comment").text
        except AttributeError:
            # no current graph pair, or no <comment> element in its meta data
            return ""
        
        # an empty <comment/> element has text None
        return text or ""

    
    def set_comment(self, text):
        """
        Set the comment on the current graph pair and flag the corpus as
        changed
        
        @param text: comment text; if it contains only whitespace, an
        existing comment element is removed from the meta data
        """
        meta_data_elem = self._graph_pair.get_meta_data()
        comment_elem = meta_data_elem.find("comment")
        
        if text.strip():
            if comment_elem is None:
                comment_elem = SubElement(meta_data_elem, "comment")
            comment_elem.text = text
        elif comment_elem is not None:
            # Compare to None explicitly: the truth value of an element
            # depends on whether it has children, so a <comment> element
            # holding only text is false and would never be removed
            meta_data_elem.remove(comment_elem)
            
        self._changed = True
Esempio n. 39
0
 def setUp(self):
     """
     Load the aligned news corpus part that serves as test fixture
     """
     corpus_fname = (
         "../exp/corpora/news/pgc/ma/2006-11/news-2006-11-aligned-part-00.pgc"
     )
     self.corpus = ParallelGraphCorpus(inf=corpus_fname)
Esempio n. 40
0
def pgc_from_ptc(text_corpus_file,
                 source_graphbank_file, 
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Build a parallel graph corpus out of the alignments in a parallel text
    corpus, pairing the graphs from two graphbanks
    
    Only alignments between elements carrying the focus tags are used. An
    alignment is dropped when one of its elements has more than
    max_token_len tokens, or when min_token_diff is non-zero and the token
    difference between the two elements is below it.
    
    @param text_corpus_file: parallel text corpus filename
    @param source_graphbank_file: source graphbank filename
    @param target_graphbank_file: target graphbank filename
        
    @keyword focus_tags: pair of focus tags
    @keyword graph_formats: pair of graphbank formats
    @keyword relations: list of alignment relations
    @keyword min_token_diff: minimum number of different tokens
    @keyword max_token_len: maximum number of tokens per focus element 
    
    @return: ParallelGraphCorpus object
    """
    # parallel text corpus and its document trees
    ptc = HitaextDoc(file=text_corpus_file)
    trees = ptc.get_doc_trees(search=True)

    # both graphbanks are loaded up front
    source_bank = GraphBank(source_graphbank_file, graph_formats.source)
    source_bank.load()
    target_bank = GraphBank(target_graphbank_file, graph_formats.target)
    target_bank.load()
    banks = Pair(source_bank, target_bank)

    # starts out empty; one graph pair is appended per accepted alignment
    pgc = ParallelGraphCorpus(relations=relations)

    for alignment in ptc.alignment:
        has_focus_tags = ( alignment.get("from_tag") == focus_tags.source and
                           alignment.get("to_tag") == focus_tags.target )
        if not has_focus_tags:
            continue

        # the crucial assumption is that id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graph banks
        from_id = alignment.get("from_id")
        to_id = alignment.get("to_id")

        source_tokens = _get_elem_tokens(trees.source, focus_tags.source,
                                         from_id)
        target_tokens = _get_elem_tokens(trees.target, focus_tags.target,
                                         to_id)

        too_long = ( len(source_tokens) > max_token_len or
                     len(target_tokens) > max_token_len )
        if too_long:
            continue

        # the difference is only computed when a minimum is requested
        if min_token_diff:
            if _token_diff(source_tokens, target_tokens) < min_token_diff:
                continue

        graphs = Pair(source_bank.get_graph(from_id),
                      target_bank.get_graph(to_id))
        pgc.append(GraphPair(banks, graphs))

    return pgc
Esempio n. 41
0
class Test_ParallelGraphCorpus(unittest.TestCase):
    """
    Unit tests for the list-like interface and the graphbank bookkeeping of
    ParallelGraphCorpus, using the fixtures data/corpus-1.pgc and
    data/corpus-2.pgc
    
    The expected lengths below assume that each fixture contains three graph
    pairs linked to two graphbanks.
    """
    
    def setUp(self):
        """
        load a fresh copy of the first fixture corpus for every test
        """
        self.pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
    
        
    def test__init(self):
        """
        init from another corpus
        """
        ParallelGraphCorpus(self.pgc1, self.pgc1.get_relations())
        
    
    def test__add__(self):
        """
        corpus + other
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        pgc3 = self.pgc1 + pgc2

        self.assertEqual(len(pgc3), len(self.pgc1) + len(pgc2))
        
        
    def test__deepcopy__(self):
        """
        copy.deepcopy(corpus)
        """
        pgc2 = copy.deepcopy(self.pgc1)
        
        self.assertTrue(isinstance(pgc2, ParallelGraphCorpus))
        self.assertFalse(self.pgc1._relations is pgc2._relations)
        self.assertFalse(self.pgc1._meta_data is pgc2._meta_data)
        
        for gp1, gp2 in zip(self.pgc1, pgc2):
            self.assertFalse(gp1 is gp2)
            # however, graphbanks and graphs are still shared
            self.assertTrue(gp1._banks is gp2._banks)
            self.assertTrue(gp1._graphs is gp2._graphs)
            
            
    def test__delitem__(self):
        """
        del corpus[0]
        """
        pg = self.pgc1[0]
        del self.pgc1[0]
        self.assertFalse(pg in self.pgc1)
        
        
    def test__delslice__(self):
        """
        del corpus[:1] and del corpus[:]
        """
        pg = self.pgc1[0]
        del self.pgc1[:1]
        self.assertFalse(pg in self.pgc1)
        
        del self.pgc1[:]
        self.assertEqual(len(self.pgc1), 0)
        
        
    def test__eq__(self):
        """
        corpus == other, for the corpus itself, a slice copy and a deep copy
        """
        self.assertEqual(self.pgc1, self.pgc1)
        
        pgc2 = self.pgc1[:]
        self.assertEqual(self.pgc1, pgc2)
        
        pgc2 = copy.deepcopy(self.pgc1)
        self.assertEqual(self.pgc1, pgc2)
        
        
    def test__getitem__(self):
        """
        corpus[0]
        """
        self.assertTrue(isinstance(self.pgc1[0], GraphPair))
        
    
    def test__getslice__(self):
        """
        corpus[i:j:k]
        """
        # or shallow copy
        # NOTE(review): with list-like slice semantics [1:1:1] is empty, in
        # which case the loop below never checks anything -- confirm whether
        # [:] was intended.
        pgc2 = self.pgc1[1:1:1]
        
        self.assertTrue(isinstance(pgc2, ParallelGraphCorpus))
        self.assertTrue(self.pgc1._relations is pgc2._relations)
        self.assertTrue(self.pgc1._meta_data is pgc2._meta_data)
        
        for gp1, gp2 in zip(self.pgc1, pgc2):
            self.assertTrue(gp1 is gp2)
            
            
    def test__iadd__(self):
        """
        corpus += other
        """
        self.pgc1 += self.pgc1
        self.assertEqual(len(self.pgc1), 6)
        
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        pgc2 += self.pgc1
        self.assertEqual(len(pgc2), 9)
        
        
    def test__repr__(self):
        """
        repr(corpus) is non-empty
        """
        self.assertTrue(repr(self.pgc1))
        
        
    def test__str__(self):
        """
        str(corpus) is non-empty
        """
        self.assertTrue(str(self.pgc1))
        
        
    def test__setitem__(self):
        """
        corpus[0] = graph_pair
        """
        self.pgc1[0] = self.pgc1[-1]
        self.assertEqual(self.pgc1[0], self.pgc1[-1])
        
        # NOTE(review): this call passes an index but no value, so the
        # TypeError may stem from the missing argument rather than from a
        # type check on the value -- confirm intent.
        self.assertRaises(TypeError, 
                          ParallelGraphCorpus.__setitem__,
                          self.pgc1,
                          1)
        
        
    def test__setslice__(self):
        """
        corpus[i:] = other[:j]
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        self.pgc1[-1:] = pgc2[:2]
        self.assertEqual(len(self.pgc1), 4)
        
        # assigning something other than graph pairs must be rejected
        self.assertRaises(TypeError,
                          ParallelGraphCorpus.__setslice__,
                          self.pgc1,
                          1,
                          1,
                          ["x"])
        
        
    def test_append(self):
        """
        corpus.append(graph_pair)
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        self.pgc1.append(pgc2[2])
        self.assertEqual(len(self.pgc1), 4)
        
        # NOTE(review): this exercises __setslice__ rather than append and
        # looks copied from test__setslice__ -- confirm whether a check on
        # append with an invalid item was intended.
        self.assertRaises(TypeError,
                          ParallelGraphCorpus.__setslice__,
                          self.pgc1,
                          1,
                          1,
                          ["x"])
        
        
    def test_clear(self):
        """
        corpus.clear() empties the corpus but keeps its type
        """
        self.pgc1.clear()
        self.assertFalse(self.pgc1)
        self.assertTrue(isinstance(self.pgc1, ParallelGraphCorpus))
        
        
    def test_extend(self):
        """
        corpus.extend(iterable)
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        self.pgc1.extend(iter(pgc2))
        self.assertEqual(len(self.pgc1), 6)
        
        
    def test_purge(self):
        """
        corpus.purge() and its effect on the graphbanks the corpus depends on
        """
        # adding graph pairs with identical graphbanks
        pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc1 += pgc1
        graphbanks_before = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_before), 2)
        pgc1.purge()
        graphbanks_after = pgc1._graphbanks()        
        self.assertEqual(graphbanks_before, graphbanks_after)
        
        # adding graph pairs with equal graphbanks
        pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc2 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc1 += pgc2
        graphbanks_before = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_before), 4)
        pgc1.purge()
        graphbanks_after = pgc1._graphbanks()        
        self.assertEqual(len(graphbanks_after), 2)
        
        # adding graph pairs with different graphbanks
        pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        pgc1 += pgc2
        graphbanks_before = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_before), 4)
        pgc1.purge()
        graphbanks_after = pgc1._graphbanks()        
        self.assertEqual(graphbanks_before, graphbanks_after)
        
        # removing graphpairs and thus dependencies on graphbanks
        del pgc1[:]
        graphbanks = pgc1._graphbanks()
        self.assertEqual(len(graphbanks), 0)
            
        
    def test__graph_banks(self):
        """
        corpus._graphbanks() yields GraphBank instances
        """
        graphbanks = self.pgc1._graphbanks()
        self.assertEqual(len(graphbanks), 2)
        
        for gb in graphbanks:
            self.assertTrue(isinstance(gb, GraphBank)) 
            
            
    def test_annotator(self):
        """
        get_annotator() and set_annotator(), with and without appending
        """
        self.assertFalse(self.pgc1.get_annotator())
        self.pgc1.set_annotator("AA")
        self.assertEqual(self.pgc1.get_annotator(), "AA")
        # by default a new annotator is appended to the existing one
        self.pgc1.set_annotator("BB")
        self.assertEqual(self.pgc1.get_annotator(), "AA + BB")
        self.pgc1.set_annotator("CC", append=False)
        self.assertEqual(self.pgc1.get_annotator(), "CC")
Esempio n. 42
0
 def setUp(self):
     """
     Generate an element tree from the fixture corpus and write it to a
     temporary file, whose name is kept in self.tmpfn
     """
     # create an element tree which we can mutilate and save
     pg_corpus = ParallelGraphCorpus(inf="data/corpus-1.pgc")
     # Keep the temporary file on disk (delete=False) so that its name
     # stays reserved. Taking the name of a NamedTemporaryFile that is
     # discarded right away leaves a deleted file whose name another
     # process may claim before it is written to.
     tmp_file = tempfile.NamedTemporaryFile(delete=False)
     tmp_file.close()
     self.tmpfn = tmp_file.name
     generator = PGCGenerator()
     self.tree = generator.generate(pg_corpus, outf=self.tmpfn)
Esempio n. 43
0
# flag: remove the existing alignments (passed on to align() as `clear`)
parser.add_argument(
    "-x", "--clear",
    action="store_true",
    help="remove all existing alignments"
    )

# flag: write each aligned corpus back to the file it was read from
parser.add_argument(
    "-i", "--in-place",
    action="store_true",
    help="modify input file(s)"
    )

args = parser.parse_args()


# Build the corpus aligner: from the given config file when there is one,
# otherwise a CorpusAligner constructed without arguments.
if args.config:
    # the config file is loaded as a Python module named "config"
    config = imp.load_source("config", args.config)
    corpus_aligner = set_up_corpus_aligner(config)
else:
    # only imported when no config file is given
    from daeso_nl.ga.corpus import CorpusAligner
    corpus_aligner = CorpusAligner()
    

# Align every corpus file matched by the given file patterns.
for inf in multiglob(args.pgc_files):
    corpus = ParallelGraphCorpus(inf=inf)
    corpus_aligner.align(corpus, clear=args.clear)
    
    if args.in_place:
        # overwrite the input file
        corpus.write(outf=inf, pprint=True)
    else:
        # NOTE(review): no output file is given here; presumably write()
        # then falls back to standard output -- confirm.
        corpus.write(pprint=True)