Esempio n. 1
0
def extract_corpus(extractor, selector, corpus):
    """
    Extract instances from those graph pairs in a corpus that are accepted
    by a selector

    @PARAM extractor: object whose extract() method is called on every
    selected graph pair
    @PARAM selector: callable taking a graph pair; a true result selects
    the pair
    @PARAM corpus: iterable of graph pairs which also provides
    get_relations() and get_meta_data()

    @RETURN: tuple of (CorpusInst with the extraction results,
    ParallelGraphCorpus with the selected graph pairs)
    """
    instances = CorpusInst()
    # Start from an empty corpus carrying the same relations and meta data
    # and append the selected pairs, rather than deleting the rejected
    # pairs from a copy, because appending is faster.
    relations = corpus.get_relations()
    meta_data = corpus.get_meta_data()
    selected = ParallelGraphCorpus(relations=relations, meta_data=meta_data)

    for pair in corpus:
        if not selector(pair):
            continue
        selected.append(pair)
        instances.append(extractor.extract(pair))

    return instances, selected
Esempio n. 2
0
def extract_corpus(extractor, selector, corpus):
    """
    Build the sub-corpus of graph pairs accepted by selector together with
    the instances extracted from those pairs

    @PARAM extractor: object providing an extract() method that takes a
    graph pair
    @PARAM selector: predicate called with a graph pair
    @PARAM corpus: source corpus; it is iterated over and queried with
    get_relations() and get_meta_data()

    @RETURN: pair of a CorpusInst and a ParallelGraphCorpus, filled in
    parallel with one entry per selected graph pair
    """
    extracted = CorpusInst()
    # Filling a new, empty corpus with append() is faster than removing
    # the unwanted graph pairs from a copy of the original corpus.
    kept = ParallelGraphCorpus(relations=corpus.get_relations(),
                               meta_data=corpus.get_meta_data())

    # The generator filters lazily, so selecting and extracting stay
    # interleaved per graph pair.
    accepted = (candidate for candidate in corpus if selector(candidate))

    for candidate in accepted:
        kept.append(candidate)
        extracted.append(extractor.extract(candidate))

    return extracted, kept
Esempio n. 3
0
def pgc_from_ptc(text_corpus_file,
                 source_graphbank_file,
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Create a new parallel graph corpus from a parallel text corpus and a
    pair of graphbanks

    Only alignments between elements carrying the focus tags are used. An
    alignment is skipped when either element has more than max_token_len
    tokens, or when min_token_diff is non-zero and the token difference
    between the two elements is smaller than min_token_diff.

    @PARAM text_corpus_file: parallel text corpus filename
    @PARAM source_graphbank_file: source graphbank filename
    @PARAM target_graphbank_file: target graphbank filename

    @KEYWORD focus_tags: pair of focus tags
    @KEYWORD graph_formats: pair of graphbank formats
    @KEYWORD relations: list of alignment relations
    @KEYWORD min_token_diff: minimum number of different tokens
    @KEYWORD max_token_len: maximum number of tokens per focus element

    @RETURN: ParallelGraphCorpus object
    """
    # parallel text corpus and its document trees
    ptc = HitaextDoc(file=text_corpus_file)
    trees = ptc.get_doc_trees(search=True)

    # both graphbanks are loaded up front
    src_bank = GraphBank(source_graphbank_file, graph_formats.source)
    src_bank.load()
    tgt_bank = GraphBank(target_graphbank_file, graph_formats.target)
    tgt_bank.load()
    banks = Pair(src_bank, tgt_bank)

    # the result starts out empty
    pgc = ParallelGraphCorpus(relations=relations)

    for alignment in ptc.alignment:
        # only alignments between focus elements are of interest
        if alignment.get("from_tag") != focus_tags.source:
            continue
        if alignment.get("to_tag") != focus_tags.target:
            continue

        src_tokens = _get_elem_tokens(trees.source,
                                      focus_tags.source,
                                      alignment.get("from_id"))
        tgt_tokens = _get_elem_tokens(trees.target,
                                      focus_tags.target,
                                      alignment.get("to_id"))

        # drop pairs in which either side is too long
        if len(src_tokens) > max_token_len:
            continue
        if len(tgt_tokens) > max_token_len:
            continue

        # the difference is only computed when a minimum is requested
        if min_token_diff:
            if _token_diff(src_tokens, tgt_tokens) < min_token_diff:
                continue

        # the crucial assumption is that the id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graphbanks
        graphs = Pair(src_bank.get_graph(alignment.get("from_id")),
                      tgt_bank.get_graph(alignment.get("to_id")))

        pgc.append(GraphPair(banks, graphs))

    return pgc
Esempio n. 4
0
class Test_ParallelGraphCorpus(unittest.TestCase):
    """
    Unit tests for the container behaviour of ParallelGraphCorpus

    Every test starts from a corpus read from data/corpus-1.pgc; several
    tests additionally read data/corpus-2.pgc. The expected lengths in the
    assertions below imply that each of these files holds three graph pairs.
    """

    def setUp(self):
        # read again before every test, so tests can modify it in place
        self.pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")


    def test__init(self):
        """
        init from another corpus
        """
        # only checks that construction does not raise
        ParallelGraphCorpus(self.pgc1, self.pgc1.get_relations())


    def test__add__(self):
        """
        corpus + other
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        pgc3 = self.pgc1 + pgc2

        self.assertEqual(len(pgc3), len(self.pgc1) + len(pgc2))


    def test__deepcopy__(self):
        """
        copy.deepcopy(corpus)
        """
        pgc2 = copy.deepcopy(self.pgc1)

        self.assertTrue(isinstance(pgc2, ParallelGraphCorpus))
        # relations and meta data must be copied, not shared
        self.assertFalse(self.pgc1._relations is pgc2._relations)
        self.assertFalse(self.pgc1._meta_data is pgc2._meta_data)

        for gp1, gp2 in zip(self.pgc1, pgc2):
            self.assertFalse(gp1 is gp2)
            # however, graphbanks and graphs are still shared
            self.assertTrue(gp1._banks is gp2._banks)
            self.assertTrue(gp1._graphs is gp2._graphs)


    def test__delitem__(self):
        """
        del corpus[0]
        """
        pg = self.pgc1[0]
        del self.pgc1[0]
        self.assertFalse(pg in self.pgc1)


    def test__delslice__(self):
        """
        del corpus[:1] and del corpus[:]
        """
        pg = self.pgc1[0]
        del self.pgc1[:1]
        self.assertFalse(pg in self.pgc1)

        # deleting the full slice must leave an empty corpus
        del self.pgc1[:]
        self.assertEqual(len(self.pgc1), 0)


    def test__eq__(self):
        """
        corpus == other, for the corpus itself, a slice copy and a deep copy
        """
        self.assertEqual(self.pgc1, self.pgc1)

        pgc2 = self.pgc1[:]
        self.assertEqual(self.pgc1, pgc2)

        pgc2 = copy.deepcopy(self.pgc1)
        self.assertEqual(self.pgc1, pgc2)


    def test__getitem__(self):
        """
        corpus[0]
        """
        self.assertTrue(isinstance(self.pgc1[0], GraphPair))


    def test__getslice__(self):
        """
        corpus[i:j:k]
        """
        # or shallow copy
        # NOTE(review): if ParallelGraphCorpus follows list slicing
        # semantics, 1:1:1 selects nothing and the loop below never runs -
        # confirm whether a non-empty slice was intended
        pgc2 = self.pgc1[1:1:1]

        self.assertTrue(isinstance(pgc2, ParallelGraphCorpus))
        # a slice shares relations and meta data with the original
        self.assertTrue(self.pgc1._relations is pgc2._relations)
        self.assertTrue(self.pgc1._meta_data is pgc2._meta_data)

        for gp1, gp2 in zip(self.pgc1, pgc2):
            self.assertTrue(gp1 is gp2)


    def test__iadd__(self):
        """
        corpus += other
        """
        # assertEqual is used throughout; the old alias spelled with a
        # trailing "s" is deprecated and no longer exists in recent versions
        # of unittest
        self.pgc1 += self.pgc1
        self.assertEqual(len(self.pgc1), 6)

        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        pgc2 += self.pgc1
        self.assertEqual(len(pgc2), 9)


    def test__repr__(self):
        """
        repr(corpus) is non-empty
        """
        self.assertTrue(repr(self.pgc1))


    def test__str__(self):
        """
        str(corpus) is non-empty
        """
        self.assertTrue(str(self.pgc1))


    def test__setitem__(self):
        """
        corpus[0] = graph_pair
        """
        self.pgc1[0] = self.pgc1[-1]
        self.assertEqual(self.pgc1[0], self.pgc1[-1])

        # NOTE(review): __setitem__ is called here without a value argument,
        # so a TypeError for the missing argument is raised regardless of
        # any type checking on items - confirm this is the intended check
        self.assertRaises(TypeError,
                          ParallelGraphCorpus.__setitem__,
                          self.pgc1,
                          1)


    def test__setslice__(self):
        """
        corpus[i:j] = other[k:l]
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        # replaces the last graph pair by two others
        self.pgc1[-1:] = pgc2[:2]
        self.assertEqual(len(self.pgc1), 4)

        # assigning a slice of something other than graph pairs must fail
        self.assertRaises(TypeError,
                          ParallelGraphCorpus.__setslice__,
                          self.pgc1,
                          1,
                          1,
                          ["x"])


    def test_append(self):
        """
        corpus.append(graph_pair)
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        self.pgc1.append(pgc2[2])
        self.assertEqual(len(self.pgc1), 4)

        # NOTE(review): this exercises __setslice__ rather than append and
        # is identical to the check in test__setslice__ - presumably
        # appending a non graph pair was meant to be tested; confirm
        self.assertRaises(TypeError,
                          ParallelGraphCorpus.__setslice__,
                          self.pgc1,
                          1,
                          1,
                          ["x"])


    def test_clear(self):
        """
        corpus.clear() empties the corpus but keeps its type
        """
        self.pgc1.clear()
        self.assertFalse(self.pgc1)
        self.assertTrue(isinstance(self.pgc1, ParallelGraphCorpus))


    def test_extend(self):
        """
        corpus.extend(iterator)
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        # extend from a plain iterator instead of from a corpus
        self.pgc1.extend(iter(pgc2))
        self.assertEqual(len(self.pgc1), 6)


    def test_purge(self):
        """
        corpus.purge() and its effect on the result of _graphbanks()
        """
        # adding graph pairs with identical graphbanks
        pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc1 += pgc1
        graphbanks_before = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_before), 2)
        pgc1.purge()
        graphbanks_after = pgc1._graphbanks()
        self.assertEqual(graphbanks_before, graphbanks_after)

        # adding graph pairs with equal graphbanks
        # (the same file read twice; four graphbanks before purging and two
        # after)
        pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc2 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc1 += pgc2
        graphbanks_before = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_before), 4)
        pgc1.purge()
        graphbanks_after = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_after), 2)

        # adding graph pairs with different graphbanks
        pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        pgc1 += pgc2
        graphbanks_before = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_before), 4)
        pgc1.purge()
        graphbanks_after = pgc1._graphbanks()
        self.assertEqual(graphbanks_before, graphbanks_after)

        # removing graphpairs and thus dependencies on graphbanks
        del pgc1[:]
        graphbanks = pgc1._graphbanks()
        self.assertEqual(len(graphbanks), 0)


    def test__graph_banks(self):
        """
        corpus._graphbanks() yields GraphBank objects
        """
        graphbanks = self.pgc1._graphbanks()
        self.assertEqual(len(graphbanks), 2)

        for gb in graphbanks:
            self.assertTrue(isinstance(gb, GraphBank))


    def test_annotator(self):
        """
        get_annotator() and set_annotator(), with and without appending
        """
        self.assertFalse(self.pgc1.get_annotator())
        self.pgc1.set_annotator("AA")
        self.assertEqual(self.pgc1.get_annotator(), "AA")
        # by default a new annotator is appended to the existing value
        self.pgc1.set_annotator("BB")
        self.assertEqual(self.pgc1.get_annotator(), "AA + BB")
        # append=False replaces the existing value
        self.pgc1.set_annotator("CC", append=False)
        self.assertEqual(self.pgc1.get_annotator(), "CC")
Esempio n. 5
0
def pgc_from_ptc(text_corpus_file,
                 source_graphbank_file,
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Derive a parallel graph corpus from a parallel text corpus and a pair
    of graphbanks

    For every alignment in the text corpus that links two elements with
    the focus tags, the graphs with the ids of the aligned elements are
    fetched from the graphbanks and added to the corpus as a graph pair.
    Alignments are left out when one of the elements has more than
    max_token_len tokens, or when a non-zero min_token_diff is given and
    the token difference between the elements stays below it.

    @PARAM text_corpus_file: parallel text corpus filename
    @PARAM source_graphbank_file: source graphbank filename
    @PARAM target_graphbank_file: target graphbank filename

    @KEYWORD focus_tags: pair of focus tags
    @KEYWORD graph_formats: pair of graphbank formats
    @KEYWORD relations: list of alignment relations
    @KEYWORD min_token_diff: minimum number of different tokens
    @KEYWORD max_token_len: maximum number of tokens per focus element

    @RETURN: ParallelGraphCorpus object
    """
    # read parallel text corpus
    text_doc = HitaextDoc(file=text_corpus_file)
    doc_trees = text_doc.get_doc_trees(search=True)

    # read graph banks
    from_bank = GraphBank(source_graphbank_file, graph_formats.source)
    from_bank.load()
    to_bank = GraphBank(target_graphbank_file, graph_formats.target)
    to_bank.load()
    bank_pair = Pair(from_bank, to_bank)

    # graph pairs are collected in an initially empty corpus
    result = ParallelGraphCorpus(relations=relations)

    for alignment in text_doc.alignment:
        # skip anything that does not link two focus elements
        if not (alignment.get("from_tag") == focus_tags.source
                and alignment.get("to_tag") == focus_tags.target):
            continue

        from_tokens = _get_elem_tokens(
            doc_trees.source, focus_tags.source, alignment.get("from_id"))
        to_tokens = _get_elem_tokens(
            doc_trees.target, focus_tags.target, alignment.get("to_id"))

        # length filter on both sides
        if len(from_tokens) > max_token_len:
            continue
        if len(to_tokens) > max_token_len:
            continue

        # difference filter, applied only for a non-zero minimum
        if (min_token_diff
                and _token_diff(from_tokens, to_tokens) < min_token_diff):
            continue

        # the crucial assumption is that the id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graphbanks
        from_id = alignment.get("from_id")
        to_id = alignment.get("to_id")
        from_graph = from_bank.get_graph(from_id)
        to_graph = to_bank.get_graph(to_id)

        result.append(GraphPair(bank_pair, Pair(from_graph, to_graph)))

    return result