def extract_corpus(extractor, selector, corpus):
    """
    Run an extractor over the graph pairs of a corpus accepted by a selector

    @param extractor: object whose extract(graph_pair) result is collected
    for every accepted graph pair
    @param selector: callable applied to each graph pair; a true result
    means the pair is accepted
    @param corpus: iterable of graph pairs that also provides
    get_relations() and get_meta_data()

    @return: tuple of (CorpusInst holding the extraction results,
    ParallelGraphCorpus holding the accepted graph pairs), both in corpus
    order
    """
    instances = CorpusInst()

    # Start from an empty corpus initialised with the relations and meta
    # data of the input and fill it, because append() is faster than
    # __del__() or remove() on a copy.
    selected = ParallelGraphCorpus(relations=corpus.get_relations(),
                                   meta_data=corpus.get_meta_data())

    for pair in corpus:
        if not selector(pair):
            continue
        selected.append(pair)
        instances.append(extractor.extract(pair))

    return instances, selected
def extract_corpus(extractor, selector, corpus):
    """
    Run an extractor over the graph pairs of a corpus accepted by a selector

    @param extractor: object whose extract(graph_pair) result is collected
    for every accepted graph pair
    @param selector: callable applied to each graph pair; a true result
    means the pair is accepted
    @param corpus: iterable of graph pairs that also provides
    get_relations() and get_meta_data()

    @return: tuple of (CorpusInst holding the extraction results,
    ParallelGraphCorpus holding the accepted graph pairs), both in corpus
    order
    """
    instances = CorpusInst()

    # Start from an empty corpus initialised with the relations and meta
    # data of the input and fill it, because append() is faster than
    # __del__() or remove() on a copy.
    selected = ParallelGraphCorpus(relations=corpus.get_relations(),
                                   meta_data=corpus.get_meta_data())

    for pair in corpus:
        if not selector(pair):
            continue
        selected.append(pair)
        instances.append(extractor.extract(pair))

    return instances, selected
def pgc_from_ptc(text_corpus_file, source_graphbank_file,
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Build a parallel graph corpus from a parallel text corpus and a pair
    of graphbanks

    Every alignment in the text corpus whose tags equal the focus tags
    yields one graph pair, unless it is dropped by the token filters.

    @param text_corpus_file: parallel text corpus filename
    @param source_graphbank_file: source graphbank filename
    @param target_graphbank_file: target graphbank filename

    @keyword focus_tags: pair of focus tags
    @keyword graph_formats: pair of graphbank formats
    @keyword relations: list of alignment relations
    @keyword min_token_diff: minimum number of different tokens; a false
    value (the default 0) disables this filter
    @keyword max_token_len: maximum number of tokens per focus element

    @return: ParallelGraphCorpus object
    """
    # load the parallel text corpus
    ptc = HitaextDoc(file=text_corpus_file)
    trees = ptc.get_doc_trees(search=True)

    # load both graphbanks
    source_bank = GraphBank(source_graphbank_file, graph_formats.source)
    source_bank.load()
    target_bank = GraphBank(target_graphbank_file, graph_formats.target)
    target_bank.load()
    banks = Pair(source_bank, target_bank)

    # the corpus starts out empty and is filled one graph pair at a time
    pgc = ParallelGraphCorpus(relations=relations)

    for alignment in ptc.alignment:
        # only alignments between the requested focus elements count
        if alignment.get("from_tag") != focus_tags.source:
            continue
        if alignment.get("to_tag") != focus_tags.target:
            continue

        source_id = alignment.get("from_id")
        source_tokens = _get_elem_tokens(trees.source, focus_tags.source,
                                         source_id)
        target_id = alignment.get("to_id")
        target_tokens = _get_elem_tokens(trees.target, focus_tags.target,
                                         target_id)

        # drop pairs in which either side is too long
        if len(source_tokens) > max_token_len:
            continue
        if len(target_tokens) > max_token_len:
            continue

        # drop pairs whose two sides differ in too few tokens
        if min_token_diff:
            if _token_diff(source_tokens, target_tokens) < min_token_diff:
                continue

        # the crucial assumption is that id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graph banks
        graphs = Pair(source_bank.get_graph(source_id),
                      target_bank.get_graph(target_id))
        pgc.append(GraphPair(banks, graphs))

    return pgc
class Test_ParallelGraphCorpus(unittest.TestCase):
    """
    Unit tests for ParallelGraphCorpus

    The tests load the fixtures "data/corpus-1.pgc" and "data/corpus-2.pgc".
    The length assertions below (e.g. 6 after doubling corpus 1, 9 after
    adding that to corpus 2) only hold if each fixture contains three graph
    pairs.
    """

    def setUp(self):
        # a fresh corpus per test, because several tests modify it in place
        self.pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")

    def test__init(self):
        """
        init from another corpus
        """
        ParallelGraphCorpus(self.pgc1, self.pgc1.get_relations())

    def test__add__(self):
        """
        corpus + other
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        pgc3 = self.pgc1 + pgc2
        self.assertEqual(len(pgc3), len(self.pgc1) + len(pgc2))

    def test__deepcopy__(self):
        """
        copy.deepcopy(corpus)
        """
        pgc2 = copy.deepcopy(self.pgc1)
        self.assertTrue(isinstance(pgc2, ParallelGraphCorpus))
        self.assertFalse(self.pgc1._relations is pgc2._relations)
        self.assertFalse(self.pgc1._meta_data is pgc2._meta_data)

        for gp1, gp2 in zip(self.pgc1, pgc2):
            self.assertFalse(gp1 is gp2)
            # however, graphbanks and graphs are still shared
            self.assertTrue(gp1._banks is gp2._banks)
            self.assertTrue(gp1._graphs is gp2._graphs)

    def test__delitem__(self):
        """
        del corpus[1]
        """
        pg = self.pgc1[0]
        del self.pgc1[0]
        self.assertFalse(pg in self.pgc1)

    def test__delslice__(self):
        """
        del [:1]
        """
        pg = self.pgc1[0]
        del self.pgc1[:1]
        self.assertFalse(pg in self.pgc1)

        del self.pgc1[:]
        self.assertEqual(len(self.pgc1), 0)

    def test__eq__(self):
        """
        corpus == other, for the same object, a slice copy and a deep copy
        """
        self.assertEqual(self.pgc1, self.pgc1)

        pgc2 = self.pgc1[:]
        self.assertEqual(self.pgc1, pgc2)

        pgc2 = copy.deepcopy(self.pgc1)
        self.assertEqual(self.pgc1, pgc2)

    def test__getitem__(self):
        """
        corpus[0]
        """
        self.assertTrue(isinstance(self.pgc1[0], GraphPair))

    def test__getslice__(self):
        """
        corpus[i:j:k]
        """
        # or shallow copy
        pgc2 = self.pgc1[1:1:1]
        self.assertTrue(isinstance(pgc2, ParallelGraphCorpus))
        self.assertTrue(self.pgc1._relations is pgc2._relations)
        self.assertTrue(self.pgc1._meta_data is pgc2._meta_data)

        # NOTE(review): [1:1:1] looks like an empty slice, in which case
        # this loop body never runs - confirm the intended slice bounds
        for gp1, gp2 in zip(self.pgc1, pgc2):
            self.assertTrue(gp1 is gp2)

    def test__iadd__(self):
        """
        corpus += other
        """
        # assertEqual instead of the deprecated assertEquals alias, which
        # was removed from unittest in Python 3.12
        self.pgc1 += self.pgc1
        self.assertEqual(len(self.pgc1), 6)

        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        pgc2 += self.pgc1
        self.assertEqual(len(pgc2), 9)

    def test__repr__(self):
        """
        repr(corpus) is non-empty
        """
        self.assertTrue(repr(self.pgc1))

    def test__str__(self):
        """
        str(corpus) is non-empty
        """
        self.assertTrue(str(self.pgc1))

    def test__setitem__(self):
        """
        corpus[0] = graph_pair
        """
        self.pgc1[0] = self.pgc1[-1]
        self.assertEqual(self.pgc1[0], self.pgc1[-1])

        # NOTE(review): this call omits the value argument, so the
        # TypeError may come from the argument count rather than from any
        # type check on the assigned item - confirm what is meant
        self.assertRaises(TypeError,
                          ParallelGraphCorpus.__setitem__,
                          self.pgc1,
                          1)

    def test__setslice__(self):
        """
        corpus[i:j] = other
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        self.pgc1[-1:] = pgc2[:2]
        self.assertEqual(len(self.pgc1), 4)

        self.assertRaises(TypeError,
                          ParallelGraphCorpus.__setslice__,
                          self.pgc1,
                          1,
                          1,
                          ["x"])

    def test_append(self):
        """
        corpus.append(graph_pair)
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        self.pgc1.append(pgc2[2])
        self.assertEqual(len(self.pgc1), 4)

        # NOTE(review): this exercises __setslice__, not append(); it looks
        # copied from test__setslice__ - confirm whether append() of a
        # non-GraphPair was meant to be checked here
        self.assertRaises(TypeError,
                          ParallelGraphCorpus.__setslice__,
                          self.pgc1,
                          1,
                          1,
                          ["x"])

    def test_clear(self):
        """
        corpus.clear() empties the corpus but keeps its type
        """
        self.pgc1.clear()
        self.assertFalse(self.pgc1)
        self.assertTrue(isinstance(self.pgc1, ParallelGraphCorpus))

    def test_extend(self):
        """
        corpus.extend(iterator)
        """
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        self.pgc1.extend(iter(pgc2))
        self.assertEqual(len(self.pgc1), 6)

    def test_purge(self):
        """
        corpus.purge() and its effect on the result of _graphbanks()
        """
        # adding graph pairs with identical graphbanks
        pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc1 += pgc1
        graphbanks_before = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_before), 2)
        pgc1.purge()
        graphbanks_after = pgc1._graphbanks()
        self.assertEqual(graphbanks_before, graphbanks_after)

        # adding graph pairs with equal graphbanks
        pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc2 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc1 += pgc2
        graphbanks_before = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_before), 4)
        pgc1.purge()
        graphbanks_after = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_after), 2)

        # adding graph pairs with different graphbanks
        pgc1 = ParallelGraphCorpus(inf="data/corpus-1.pgc")
        pgc2 = ParallelGraphCorpus(inf="data/corpus-2.pgc")
        pgc1 += pgc2
        graphbanks_before = pgc1._graphbanks()
        self.assertEqual(len(graphbanks_before), 4)
        pgc1.purge()
        graphbanks_after = pgc1._graphbanks()
        self.assertEqual(graphbanks_before, graphbanks_after)

        # removing graphpairs and thus dependencies on graphbanks
        del pgc1[:]
        graphbanks = pgc1._graphbanks()
        self.assertEqual(len(graphbanks), 0)

    def test__graph_banks(self):
        """
        corpus._graphbanks() yields GraphBank objects
        """
        graphbanks = self.pgc1._graphbanks()
        self.assertEqual(len(graphbanks), 2)

        for gb in graphbanks:
            self.assertTrue(isinstance(gb, GraphBank))

    def test_annotator(self):
        """
        get_annotator() / set_annotator(), appending and replacing
        """
        self.assertFalse(self.pgc1.get_annotator())

        self.pgc1.set_annotator("AA")
        self.assertEqual(self.pgc1.get_annotator(), "AA")

        # by default a new annotator is appended to the existing value
        self.pgc1.set_annotator("BB")
        self.assertEqual(self.pgc1.get_annotator(), "AA + BB")

        # append=False replaces the existing value
        self.pgc1.set_annotator("CC", append=False)
        self.assertEqual(self.pgc1.get_annotator(), "CC")
def pgc_from_ptc(text_corpus_file, source_graphbank_file,
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Build a parallel graph corpus from a parallel text corpus and a pair
    of graphbanks

    Every alignment in the text corpus whose tags equal the focus tags
    yields one graph pair, unless it is dropped by the token filters.

    @param text_corpus_file: parallel text corpus filename
    @param source_graphbank_file: source graphbank filename
    @param target_graphbank_file: target graphbank filename

    @keyword focus_tags: pair of focus tags
    @keyword graph_formats: pair of graphbank formats
    @keyword relations: list of alignment relations
    @keyword min_token_diff: minimum number of different tokens; a false
    value (the default 0) disables this filter
    @keyword max_token_len: maximum number of tokens per focus element

    @return: ParallelGraphCorpus object
    """
    # load the parallel text corpus
    ptc = HitaextDoc(file=text_corpus_file)
    trees = ptc.get_doc_trees(search=True)

    # load both graphbanks
    source_bank = GraphBank(source_graphbank_file, graph_formats.source)
    source_bank.load()
    target_bank = GraphBank(target_graphbank_file, graph_formats.target)
    target_bank.load()
    banks = Pair(source_bank, target_bank)

    # the corpus starts out empty and is filled one graph pair at a time
    pgc = ParallelGraphCorpus(relations=relations)

    for alignment in ptc.alignment:
        # only alignments between the requested focus elements count
        if alignment.get("from_tag") != focus_tags.source:
            continue
        if alignment.get("to_tag") != focus_tags.target:
            continue

        source_id = alignment.get("from_id")
        source_tokens = _get_elem_tokens(trees.source, focus_tags.source,
                                         source_id)
        target_id = alignment.get("to_id")
        target_tokens = _get_elem_tokens(trees.target, focus_tags.target,
                                         target_id)

        # drop pairs in which either side is too long
        if len(source_tokens) > max_token_len:
            continue
        if len(target_tokens) > max_token_len:
            continue

        # drop pairs whose two sides differ in too few tokens
        if min_token_diff:
            if _token_diff(source_tokens, target_tokens) < min_token_diff:
                continue

        # the crucial assumption is that id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graph banks
        graphs = Pair(source_bank.get_graph(source_id),
                      target_bank.get_graph(target_id))
        pgc.append(GraphPair(banks, graphs))

    return pgc