Esempio n. 1
0
def gb_stats(files,
             format,
             with_empty_nodes=False,
             with_failed_parses=False,
             with_punc=False,
             threshold=0):
    """
    Collect statistics over a number of graphbank files in a single table

    Every file is loaded as a GraphBank and handed to graph_stats together
    with the shared table and the position of the file in the list. Unaligned
    roots and unaligned graphs are always included.

    @PARAM files: sequence of graphbank filenames (must support len())
    @PARAM format: graphbank format, passed on to GraphBank

    @KEYWORD with_empty_nodes: passed on to graph_stats
    @KEYWORD with_failed_parses: passed on to graph_stats
    @KEYWORD with_punc: passed on to graph_stats
    @KEYWORD threshold: passed on to graph_stats

    @RETURN: GbStatsTable object, after its summarize method has been called
    """
    gb_table = GbStatsTable(size=len(files))

    # the file index is presumably used by graph_stats as the table row
    for i, fn in enumerate(files):
        bank = GraphBank(file_path=fn, format=format)
        bank.load()
        graph_stats(bank,
                    gb_table,
                    i,
                    with_empty_nodes,
                    with_failed_parses,
                    with_punc,
                    with_unaligned_roots=True,
                    threshold=threshold,
                    with_unaligned_graphs=True)

    gb_table.summarize()
    return gb_table
Esempio n. 2
0
    def __init__(self, tokenizer=None, alpino=None, graph_aligner=None):
        """
        Initialise all components through the init_* methods and prepare a
        pair of placeholder graphbanks

        @KEYWORD tokenizer: passed on to init_tokenizer
        @KEYWORD alpino: passed on to init_alpino
        @KEYWORD graph_aligner: passed on to init_graph_aligner
        """
        # the order of these calls is kept as is, because later steps may
        # rely on what earlier ones have set up
        self.init_tokenizer(tokenizer)
        self.init_alpino(alpino)
        self.init_graph_xml_parser()
        self.init_graph_aligner(graph_aligner)
        self.init_others()

        # creating a new GraphMapping instance requires a pair of graphbanks,
        # so keep two dummies (empty file path, "alpino" format) around
        source_dummy = GraphBank("", "alpino")
        target_dummy = GraphBank("", "alpino")
        self._graphbanks = Pair(source_dummy, target_dummy)
Esempio n. 3
0
def gb_stats(files, format, with_empty_nodes=False, with_failed_parses=False, with_punc=False, threshold=0):
    """
    Collect statistics over a number of graphbank files in a single table

    Every file is loaded as a GraphBank and handed to graph_stats together
    with the shared table and the position of the file in the list. Unaligned
    roots and unaligned graphs are always included.

    @PARAM files: sequence of graphbank filenames (must support len())
    @PARAM format: graphbank format, passed on to GraphBank

    @KEYWORD with_empty_nodes: passed on to graph_stats
    @KEYWORD with_failed_parses: passed on to graph_stats
    @KEYWORD with_punc: passed on to graph_stats
    @KEYWORD threshold: passed on to graph_stats

    @RETURN: GbStatsTable object, after its summarize method has been called
    """
    gb_table = GbStatsTable(size=len(files))

    # the file index is presumably used by graph_stats as the table row
    for i, fn in enumerate(files):
        bank = GraphBank(file_path=fn, format=format)
        bank.load()
        graph_stats(
            bank,
            gb_table,
            i,
            with_empty_nodes,
            with_failed_parses,
            with_punc,
            with_unaligned_roots=True,
            threshold=threshold,
            with_unaligned_graphs=True,
        )

    gb_table.summarize()
    return gb_table
Esempio n. 4
0
 def test__iter__(self):
     # iterating over the loaded source graphbank must yield three graphs
     bank = GraphBank("data/source-gb-1.xml", "alpino")
     bank.load()
     n_graphs = sum(1 for _ in bank)
     self.assertEqual(n_graphs, 3)
Esempio n. 5
0
 def test_equal(self):
     # the graphbank every other bank is compared against
     reference = GraphBank("data/source-gb-1.xml", "alpino")
     reference.load()

     # first the identical path, then a second path that presumably leads
     # to the same file; both banks must compare equal to the reference
     for path in ("data/source-gb-1.xml",
                  "../../test/gb/data/source-gb-1.xml"):
         same = GraphBank(path, "alpino")
         same.load()
         self.assertTrue(reference == same)

     # a bank loaded from another file must not compare equal
     other = GraphBank("data/target-gb-1.xml", "alpino")
     other.load()
     self.assertFalse(reference == other)
Esempio n. 6
0
 def test_init_1(self):
     # after loading, len() of the source graphbank must be three
     bank = GraphBank("data/source-gb-1.xml", "alpino")
     bank.load()
     size = len(bank)
     self.assertEqual(size, 3)
Esempio n. 7
0
def pgc_from_ptc(text_corpus_file,
                 source_graphbank_file,
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Create a new parallel graph corpus from a parallel text corpus and a pair
    of graphbanks

    Only alignments between elements carrying the focus tags are used. An
    alignment is skipped when either side has more than max_token_len tokens,
    or when min_token_diff is non-zero and the token difference between the
    two sides is below it.

    @PARAM text_corpus_file: parallel text corpus filename
    @PARAM source_graphbank_file: source graphbank filename
    @PARAM target_graphbank_file: target graphbank filename

    @KEYWORD focus_tags: pair of focus tags
    @KEYWORD graph_formats: pair of graphbank formats
    @KEYWORD relations: list of alignment relations
    @KEYWORD min_token_diff: minimum number of different tokens
    (0 disables this check)
    @KEYWORD max_token_len: maximum number of tokens per focus element

    @RETURN: ParallelGraphCorpus object
    """
    # read parallel text corpus
    text_corpus = HitaextDoc(file=text_corpus_file)
    doc_trees = text_corpus.get_doc_trees(search=True)

    # read graph banks
    source_bank = GraphBank(source_graphbank_file, graph_formats.source)
    source_bank.load()
    target_bank = GraphBank(target_graphbank_file, graph_formats.target)
    target_bank.load()
    graph_banks = Pair(source_bank, target_bank)

    # create an empty parallel graph corpus
    graph_corpus = ParallelGraphCorpus(relations=relations)

    for alignment in text_corpus.alignment:
        # ignore alignments between anything but the focus elements
        if (alignment.get("from_tag") != focus_tags.source or
                alignment.get("to_tag") != focus_tags.target):
            continue

        # look up the element id's once; they serve both to find the tokens
        # in the text and to find the graphs in the graph banks
        source_id = alignment.get("from_id")
        target_id = alignment.get("to_id")

        source_tokens = _get_elem_tokens(doc_trees.source,
                                         focus_tags.source,
                                         source_id)
        target_tokens = _get_elem_tokens(doc_trees.target,
                                         focus_tags.target,
                                         target_id)

        if (len(source_tokens) > max_token_len or
                len(target_tokens) > max_token_len):
            continue

        if (min_token_diff and
                _token_diff(source_tokens, target_tokens) < min_token_diff):
            continue

        # the crucial assumption is that id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graph banks
        graphs = Pair(source_bank.get_graph(source_id),
                      target_bank.get_graph(target_id))

        graph_corpus.append(GraphPair(graph_banks, graphs))

    return graph_corpus
Esempio n. 8
0
def pgc_from_ptc(text_corpus_file,
                 source_graphbank_file,
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Create a new parallel graph corpus from a parallel text corpus and a pair
    of graphbanks

    Only alignments between elements carrying the focus tags are used. An
    alignment is skipped when either side has more than max_token_len tokens,
    or when min_token_diff is non-zero and the token difference between the
    two sides is below it.

    @PARAM text_corpus_file: parallel text corpus filename
    @PARAM source_graphbank_file: source graphbank filename
    @PARAM target_graphbank_file: target graphbank filename

    @KEYWORD focus_tags: pair of focus tags
    @KEYWORD graph_formats: pair of graphbank formats
    @KEYWORD relations: list of alignment relations
    @KEYWORD min_token_diff: minimum number of different tokens
    (0 disables this check)
    @KEYWORD max_token_len: maximum number of tokens per focus element

    @RETURN: ParallelGraphCorpus object
    """
    # read parallel text corpus
    text_corpus = HitaextDoc(file=text_corpus_file)
    doc_trees = text_corpus.get_doc_trees(search=True)

    # read graph banks
    source_bank = GraphBank(source_graphbank_file, graph_formats.source)
    source_bank.load()
    target_bank = GraphBank(target_graphbank_file, graph_formats.target)
    target_bank.load()
    graph_banks = Pair(source_bank, target_bank)

    # create an empty parallel graph corpus
    graph_corpus = ParallelGraphCorpus(relations=relations)

    for alignment in text_corpus.alignment:
        # ignore alignments between anything but the focus elements
        if (alignment.get("from_tag") != focus_tags.source
                or alignment.get("to_tag") != focus_tags.target):
            continue

        # look up the element id's once; they serve both to find the tokens
        # in the text and to find the graphs in the graph banks
        source_id = alignment.get("from_id")
        target_id = alignment.get("to_id")

        source_tokens = _get_elem_tokens(doc_trees.source, focus_tags.source,
                                         source_id)
        target_tokens = _get_elem_tokens(doc_trees.target, focus_tags.target,
                                         target_id)

        if (len(source_tokens) > max_token_len
                or len(target_tokens) > max_token_len):
            continue

        if (min_token_diff and
                _token_diff(source_tokens, target_tokens) < min_token_diff):
            continue

        # the crucial assumption is that id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graph banks
        graphs = Pair(source_bank.get_graph(source_id),
                      target_bank.get_graph(target_id))

        graph_corpus.append(GraphPair(graph_banks, graphs))

    return graph_corpus