def __init__(self, tokenizer=None, alpino=None, graph_aligner=None):
    """
    Initialize the aligner's collaborators.

    @keyword tokenizer: tokenizer instance, or None to use the default
    @keyword alpino: Alpino parser instance, or None to use the default
    @keyword graph_aligner: graph aligner instance, or None to use the default
    """
    # NOTE(review): the init_* helpers are defined elsewhere in this class;
    # their call order is assumed significant — do not reorder.
    self.init_tokenizer(tokenizer)
    self.init_alpino(alpino)
    self.init_graph_xml_parser()
    self.init_graph_aligner(graph_aligner)
    self.init_others()
    # a pair of graphbank dummies, which are needed when creating a new
    # GraphMapping instance
    self._graphbanks = Pair(GraphBank("", "alpino"),
                            GraphBank("", "alpino"))
def test_equal(self):
    """
    Two graphbanks compare equal when they are loaded from the same
    underlying file, even via different relative paths; banks loaded
    from different files compare unequal.
    """
    def load(path):
        # load a graphbank from the given path in alpino format
        bank = GraphBank(path, "alpino")
        bank.load()
        return bank

    gb1 = load("data/source-gb-1.xml")
    # identical path: equal
    self.assertTrue(gb1 == load("data/source-gb-1.xml"))
    # same file reached through a different relative path: still equal
    self.assertTrue(gb1 == load("../../test/gb/data/source-gb-1.xml"))
    # a different file: not equal
    self.assertFalse(gb1 == load("data/target-gb-1.xml"))
def gb_stats(files, format, with_empty_nodes=False, with_failed_parses=False,
             with_punc=False, threshold=0):
    """
    Collect statistics over a series of graphbank files.

    @PARAM files: sequence of graphbank filenames
    @PARAM format: graphbank format (e.g. "alpino"); note this parameter
        shadows the builtin of the same name, but is kept for
        backward compatibility with keyword callers
    @KEYWORD with_empty_nodes: include empty nodes in the counts
    @KEYWORD with_failed_parses: include failed parses in the counts
    @KEYWORD with_punc: include punctuation in the counts
    @KEYWORD threshold: threshold passed through to graph_stats
    @RETURN: a summarized GbStatsTable with one row per file
    """
    gb_table = GbStatsTable(size=len(files))
    # fix: removed dead local `gb_row` that was assigned but never used
    for i, fn in enumerate(files):
        bank = GraphBank(file_path=fn, format=format)
        bank.load()
        # row i of the table accumulates the stats for file fn
        graph_stats(bank, gb_table, i, with_empty_nodes, with_failed_parses,
                    with_punc, with_unaligned_roots=True, threshold=threshold,
                    with_unaligned_graphs=True)
    gb_table.summarize()
    return gb_table
def test__iter__(self):
    """Iterating over a loaded graphbank yields each of its graphs."""
    bank = GraphBank("data/source-gb-1.xml", "alpino")
    bank.load()
    # materialize the iterator; this fixture contains three graphs
    self.assertEqual(len(list(bank)), 3)
def test_init_1(self):
    """A loaded graphbank reports the number of graphs it contains."""
    bank = GraphBank("data/source-gb-1.xml", "alpino")
    bank.load()
    # this fixture contains three graphs
    self.assertEqual(len(bank), 3)
def pgc_from_ptc(text_corpus_file, source_graphbank_file,
                 target_graphbank_file, focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"), relations=RELATIONS,
                 min_token_diff=0, max_token_len=99999):
    """
    Create a new parallel graph corpus from a parallel text corpus and a
    pair of graphbanks

    @PARAM text_corpus_file: parallel text corpus filename
    @PARAM source_graphbank_file: source graphbank filename
    @PARAM target_graphbank_file: target graphbank filename

    @KEYWORD focus_tags: pair of focus tags
    @KEYWORD graph_formats: pair of graphbank formats
    @KEYWORD relations: list of alignment relations
    @KEYWORD min_token_diff: minimum number of different tokens
    @KEYWORD max_token_len: maximum number of tokens per focus element

    @RETURN: ParallelGraphCorpus object
    """
    # fix: docstring previously documented non-existent parameters
    # (source_bank/target_bank) and contained typos; code is unchanged.

    # read parallel text corpus
    text_corpus = HitaextDoc(file=text_corpus_file)
    doc_trees = text_corpus.get_doc_trees(search=True)

    # read graph banks
    source_bank = GraphBank(source_graphbank_file, graph_formats.source)
    source_bank.load()
    target_bank = GraphBank(target_graphbank_file, graph_formats.target)
    target_bank.load()
    graph_banks = Pair(source_bank, target_bank)

    # create an empty parallel graph corpus
    graph_corpus = ParallelGraphCorpus(relations=relations)

    for alignment in text_corpus.alignment:
        # only consider alignments between the requested focus tags
        if (alignment.get("from_tag") != focus_tags.source or
            alignment.get("to_tag") != focus_tags.target):
            continue

        source_tokens = _get_elem_tokens(doc_trees.source, focus_tags.source,
                                         alignment.get("from_id"))
        target_tokens = _get_elem_tokens(doc_trees.target, focus_tags.target,
                                         alignment.get("to_id"))

        # skip pairs where either side exceeds the token-length limit
        if (len(source_tokens) > max_token_len or
            len(target_tokens) > max_token_len):
            continue

        # optionally skip pairs whose token difference is below the minimum
        if (min_token_diff and
            _token_diff(source_tokens, target_tokens) < min_token_diff):
            continue

        # the crucial assumption is that id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graph banks
        source_graph_id = alignment.get("from_id")
        target_graph_id = alignment.get("to_id")
        graphs = Pair(source_bank.get_graph(source_graph_id),
                      target_bank.get_graph(target_graph_id))
        graph_pair = GraphPair(graph_banks, graphs)
        graph_corpus.append(graph_pair)

    return graph_corpus