Esempio n. 1
0
def pgc_stats(files,
              with_empty_nodes=False,
              with_failed_parses=False,
              with_punc=False,
              with_unaligned_roots=False,
              threshold=0,
              with_unaligned_graphs=False):
    """
    Gather statistics for parallel graph corpus files and their graphbanks.

    Every file in `files` is read into a fresh ParallelGraphCorpus (with
    graph_loading=LOAD_ALL). graph_pair_stats is called once per file with
    the file's index as row in the corpus table, and graph_stats is called
    once per graphbank of that corpus with a row number in the graphbank
    table that keeps counting across files.

    The option arguments are handed on unchanged to graph_pair_stats and
    graph_stats; with_unaligned_graphs is handed to graph_stats only.

    Returns the tuple (pgc_table, gb_table), after summarize() has been
    called on both tables.
    """
    pgc_table = PgcStatsTable(size=len(files))
    gb_table = GbStatsTable()

    # positional options shared by both statistics functions
    pair_options = (with_empty_nodes,
                    with_failed_parses,
                    with_punc,
                    with_unaligned_roots,
                    threshold)
    bank_options = pair_options + (with_unaligned_graphs,)

    # row in the graphbank table; not reset between corpus files
    next_gb_row = 0

    for corpus_row, path in enumerate(files):
        corpus = ParallelGraphCorpus()
        corpus.read(inf=path, graph_loading=LOAD_ALL)
        graph_pair_stats(os.path.basename(path),
                         corpus,
                         pgc_table,
                         corpus_row,
                         *pair_options)

        banks = corpus._graphbanks()
        gb_table.grow(len(banks))

        # Corpus and graphbank processing are interleaved on purpose:
        # doing them in separate passes would require keeping all
        # graphbanks in memory.
        for bank in banks:
            graph_stats(bank, gb_table, next_gb_row, *bank_options)
            next_gb_row += 1

    pgc_table.summarize()
    gb_table.summarize()

    return pgc_table, gb_table
Esempio n. 2
0
    def test_parser_load_relaxed(self):
        """
        Read a corpus with relax_gb_paths=True from a scratch directory.

        The corpus file and its source and target graphbank files are copied
        side by side into a private temporary directory, and the corpus is
        read from there. The test then checks that no graph in any graph
        pair is a GraphStub, and that every bank is a SparseGraphBank of
        length 3.
        """
        # A private directory (instead of the shared system temp dir) avoids
        # clashes with files from other runs, and removing it in `finally`
        # guarantees cleanup even when the read or an assertion fails.
        tmp_dir = tempfile.mkdtemp()

        try:
            corpus_path = os.path.join(tmp_dir, "corpus-2.pgc")
            shutil.copy("data/corpus-2.pgc", corpus_path)
            shutil.copy("../gb/data/source-gb-2.xml",
                        os.path.join(tmp_dir, "source-gb-2.xml"))
            shutil.copy("../gb/data/target-gb-2.xml",
                        os.path.join(tmp_dir, "target-gb-2.xml"))

            pg_corpus = ParallelGraphCorpus()
            pg_corpus.read(corpus_path, relax_gb_paths=True)

            for graph_pair in pg_corpus:
                for graph in graph_pair._graphs:
                    self.assertFalse(isinstance(graph, GraphStub))

                for bank in graph_pair._banks:
                    self.assertEqual(bank.__class__, SparseGraphBank)
                    self.assertEqual(len(bank), 3)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)
Esempio n. 3
0
    def open_corpus(self, filename):
        """
        Load the parallel graph corpus in `filename` and make it current.

        Sends a "statusDescription" signal with a loading message, reads the
        corpus, stores it together with its filename on this object, resets
        the changed flag, sends the "statusDescription", "newCorpus" and
        "newCorpusName" signals, and finally goes to graph pair 0.

        Raises AlgraephException when the corpus that was read is empty
        (tests false). Exceptions raised while reading are not caught here.
        """
        loading_message = "Loading corpus %s ..." % filename
        send(self.open_corpus, "statusDescription", loading_message)

        # Reading may fail in several ways: an IOError, a file that is not
        # XML, a corrupt format, etc.
        # With relax_gb_paths the graphbank files may be located in the same
        # directory as the corpus file rather than at the location given in
        # the <file> element.
        loaded = ParallelGraphCorpus()
        loaded.read(inf=filename, relax_gb_paths=True)

        if not loaded:
            raise AlgraephException("Parallel graph corpus contains no alignments")

        self._corpus, self._filename, self._changed = loaded, filename, False

        # "statusDescription" without a message presumably clears the status
        # text -- confirm against the receiver.
        for signal in ("statusDescription", "newCorpus", "newCorpusName"):
            send(self.open_corpus, signal)

        self.goto_graph_pair(0)
Esempio n. 4
0
    def open_corpus(self, filename):
        """
        Read a parallel graph corpus from `filename` and install it.

        The steps are: send "statusDescription" with a loading message, read
        the corpus with relaxed graphbank paths, keep the corpus and filename
        on self with the changed flag set to False, send "statusDescription",
        "newCorpus" and "newCorpusName", then go to the first graph pair
        (index 0).

        An AlgraephException is raised if the corpus tests false after
        reading; errors from the read itself propagate to the caller.
        """
        send(
            self.open_corpus,
            "statusDescription",
            "Loading corpus %s ..." % filename,
        )

        # The read can raise (IOError, input that is not XML, corrupt format,
        # and so on). Passing relax_gb_paths=True lets the graphbank files
        # live in the same directory as the corpus file, instead of at the
        # location specified in the <file> element.
        new_corpus = ParallelGraphCorpus()
        new_corpus.read(inf=filename, relax_gb_paths=True)

        if not new_corpus:
            no_alignments = "Parallel graph corpus contains no alignments"
            raise AlgraephException(no_alignments)

        self._corpus = new_corpus
        self._filename = filename
        self._changed = False

        # announce the new state to whoever receives these signals
        send(self.open_corpus, "statusDescription")
        send(self.open_corpus, "newCorpus")
        send(self.open_corpus, "newCorpusName")

        self.goto_graph_pair(0)
Esempio n. 5
0
 def test_parser_load_relaxed(self):
     """
     Read a corpus with relax_gb_paths=True from a scratch directory.

     The corpus file and its source and target graphbank files are copied
     side by side into a private temporary directory, and the corpus is
     read from there. The test then checks that no graph in any graph
     pair is a GraphStub, and that every bank is a SparseGraphBank of
     length 3.
     """
     # A private directory (instead of the shared system temp dir) avoids
     # clashes with files from other runs, and removing it in `finally`
     # guarantees cleanup even when the read or an assertion fails.
     tmp_dir = tempfile.mkdtemp()

     try:
         corpus_path = os.path.join(tmp_dir, "corpus-2.pgc")
         shutil.copy("data/corpus-2.pgc", corpus_path)
         shutil.copy("../gb/data/source-gb-2.xml",
                     os.path.join(tmp_dir, "source-gb-2.xml"))
         shutil.copy("../gb/data/target-gb-2.xml",
                     os.path.join(tmp_dir, "target-gb-2.xml"))

         pg_corpus = ParallelGraphCorpus()
         pg_corpus.read(corpus_path, relax_gb_paths=True)

         for graph_pair in pg_corpus:
             for graph in graph_pair._graphs:
                 self.assertFalse(isinstance(graph, GraphStub))

             for bank in graph_pair._banks:
                 self.assertEqual(bank.__class__, SparseGraphBank)
                 self.assertEqual(len(bank), 3)
     finally:
         shutil.rmtree(tmp_dir, ignore_errors=True)