Example #1
0
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    chunkmap=chunkmap_factory(pickle.load(
                            bz2.BZ2File('test_data/5th.chunkmap.bz2')))
                            
    setupstart=time.clock()
    semrep_reader=SemrepOutput(bz2.BZ2File('test_data/5th.semrep.out.bz2'),
                                ["semrep wrapper error"],
                                chunkmap)
    semrep_grapher=SemrepGraphBuilder()
    pr_algorithm=PageRanker()
    count=0
    parsestart=time.clock()
    for article in semrep_reader:
        print "Read article", article.set_id,
        graph=semrep_grapher.create_graph(article.lines)
        print "graphed it",
        matrixed=graph.as_mapped_link_matrix()
        print "matrixed it,",
        fake_e_vector=[0.0] * len(matrixed)
        if fake_e_vector==[]:
            print "didn't pagerank because it was empty."
        else:
            ranked=pr_algorithm.evaluate(matrixed, fake_e_vector)
            print "pageranked it. Stats:", pr_algorithm.stats
        count+=1
    endparse=time.clock()   
    print "Total time elapsed: %1.3f seconds (%1.7f seconds were setup) for "\
          "%d total articles, for a grand total of %1.3f compressed "\
          "articles/second turned into semantic graphs, link matrices," \
          " and finally pageranked." % (
          endparse-setupstart, parsestart-setupstart, count, 
          float(count)/(endparse-setupstart))
Example #2
0
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    chunkmap = chunkmap_factory(
        pickle.load(bz2.BZ2File('test_data/5th.chunkmap.bz2')))

    setupstart = time.clock()
    semrep_reader = SemrepOutput(bz2.BZ2File('test_data/5th.semrep.out.bz2'),
                                 ["semrep wrapper error"], chunkmap)
    semrep_grapher = SemrepGraphBuilder()
    pr_algorithm = PageRanker()
    count = 0
    parsestart = time.clock()
    for article in semrep_reader:
        print "Read article", article.set_id,
        graph = semrep_grapher.create_graph(article.lines)
        print "graphed it",
        matrixed = graph.as_mapped_link_matrix()
        print "matrixed it,",
        fake_e_vector = [0.0] * len(matrixed)
        if fake_e_vector == []:
            print "didn't pagerank because it was empty."
        else:
            ranked = pr_algorithm.evaluate(matrixed, fake_e_vector)
            print "pageranked it. Stats:", pr_algorithm.stats
        count += 1
    endparse = time.clock()
    print "Total time elapsed: %1.3f seconds (%1.7f seconds were setup) for "\
          "%d total articles, for a grand total of %1.3f compressed "\
          "articles/second turned into semantic graphs, link matrices," \
          " and finally pageranked." % (
          endparse-setupstart, parsestart-setupstart, count,
          float(count)/(endparse-setupstart))
Example #3
0
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    chunkmap=chunkmap_factory(pickle.load(
                            bz2.BZ2File(
                            'test_data/metamap.chunkmap.bz2')))
                            
    setupstart=time.clock()
    metamap_reader=MetamapOutput(bz2.BZ2File(
                                'test_data/metamap.out.bz2'),
                                DEFAULT_LINES_TO_IGNORE,
                                chunkmap)
    grapher=MetamapCoccurrenceGraphBuilder()
    # PageRank is not the correct algorithm for a matrix of adirectional nodes
    # but it'll do for now, to exercise the system
    # pr_algorithm=PageRanker()
    # TextRanker now written
    pr_algorithm=MappedRanker(TextRanker())
    count=0
    parsestart=time.clock()
    for article in metamap_reader:
        print "Read article", article.set_id,
        graph=grapher.create_graph(article.lines)
        print "graphed it",
        matrix=graph.as_mapped_link_matrix()
        print "turned it into a", matrix, 
        #fake_e_vector=[0.0] * len(matrix)
        if len(matrix)==0:
            print "didn't pagerank because it was empty."
        else:
            ranked=pr_algorithm.evaluate(matrix)
            print "TextRanked it. First results: %r Stats:" % \
                    [x for x in ranked][:5],  pr_algorithm.stats
        count+=1
    endparse=time.clock()   
    print "Total time elapsed: %1.3f seconds (%1.7f seconds were setup) "\
          "for %d total articles, for a grand total of %1.3f compressed "\
          "articles/second read, turned into link matrices, and " \
          " pageranked." \
          % (endparse-setupstart, parsestart-setupstart, count, 
             float(count)/(endparse-setupstart))
    
    count+=1
Example #4
0
    def setUp(self):
        """Create a MetamapOutput reader over a small in-memory fixture.

        Setup mirrors semrep.py, but this fixture deliberately contains
        both a relationship that should end up in the graph and one that
        should not.
        """
        import StringIO
        from file.chunkmap import chunkmap_factory
        from file.metamap import (MetamapOutput)
        # Enable while debugging a failing test:
        # logging.basicConfig(level=logging.DEBUG,
        #                  format='%(asctime)s %(levelname)s %(message)s')
        # logging.setLevel(ULTRADEBUG)
        raw_output = StringIO.StringIO("""
        >>>>> MMI
        * ERROR *
        0000000000|MM|530|Carboxyhemoglobin|C0007061| ["Carboxyhaemoglobin"-ti-1-"Carboxyhaemoglobin"]|TI
        0000000000|MM|223|Male population group|C0025266|["Men"-ti-1-"men"]|TI
        0000000000|MM|121|Levels|C0441889|["Levels"-ti-1-"levels"]|TI
        0000000000|MM|114|British|C0596227|["British"-ti-1-"British"]|TI
        0000000000|MM|114|Old|C0580836|["Old"-ti-1-"older"]|TI
        0000000000|MM|114|Old episode|C0677546|["Old"-ti-1-"older"]|TI
        <<<<< MMI
        >>>>> MMI
        0000000001|MM|585|Little's Disease|C0023882|["Little"-ti-1-"little"]|TI
        0000000001|MM|424|HAC protocol|C0062074|["HAD"-ti-1-"has"]|TI
        0000000001|MM|170|Background|C1706907|["Background"-ti-1-"BACKGROUND"]|TI
        0000000001|MM|170|General Population|C0683971|["General Population"-ti-1-"general population"]|TI
        0000000001|MM|170|Small|C0700321|["Little"-ti-1-"little"]|TI
        0000000001|MM|124|Levels|C0441889|["Levels"-ti-1-"levels"]|TI
        0000000001|MM|121|Exposure to|C0332157|["Exposure"-ti-1-"exposure"]|TI
        0000000001|MM|121|Injury due to exposure to external cause|C0274281|["Exposure, NOS"-ti-1-"exposure"]|TI
        0000000001|MM|121|Persons|C0027361|["People"-ti-1-"people"]|TI
        0000000001|MM|114|Old|C0580836|["Old"-ti-1-"older"]|TI
        0000000001|MM|114|Old episode|C0677546|["Old"-ti-1-"older"]|TI
        0000000001|MM|114|Known|C0205309|["Known"-ti-1-"known"]|TI
        <<<<< MMI
        """)
        fake_chunkmap = chunkmap_factory({'123.txt': [0, 1]})

        self.mo = MetamapOutput(raw_output,
                                [">>>>> MMI", "<<<<< MMI", "* error *"],
                                fake_chunkmap)
    def setUp(self):
        """Build the MetamapOutput under test from an in-memory sample.

        Borrowed from the semrep.py test setup; unlike that fixture,
        this one includes one relationship that belongs in the graph
        and one that must be excluded.
        """
        import StringIO
        from file.chunkmap import chunkmap_factory
        from file.metamap import (MetamapOutput)
        # Uncomment for verbose logging when diagnosing failures:
        # logging.basicConfig(level=logging.DEBUG,
        #                  format='%(asctime)s %(levelname)s %(message)s')
        # logging.setLevel(ULTRADEBUG)
        sample_stream = StringIO.StringIO("""
        >>>>> MMI
        * ERROR *
        0000000000|MM|530|Carboxyhemoglobin|C0007061| ["Carboxyhaemoglobin"-ti-1-"Carboxyhaemoglobin"]|TI
        0000000000|MM|223|Male population group|C0025266|["Men"-ti-1-"men"]|TI
        0000000000|MM|121|Levels|C0441889|["Levels"-ti-1-"levels"]|TI
        0000000000|MM|114|British|C0596227|["British"-ti-1-"British"]|TI
        0000000000|MM|114|Old|C0580836|["Old"-ti-1-"older"]|TI
        0000000000|MM|114|Old episode|C0677546|["Old"-ti-1-"older"]|TI
        <<<<< MMI
        >>>>> MMI
        0000000001|MM|585|Little's Disease|C0023882|["Little"-ti-1-"little"]|TI
        0000000001|MM|424|HAC protocol|C0062074|["HAD"-ti-1-"has"]|TI
        0000000001|MM|170|Background|C1706907|["Background"-ti-1-"BACKGROUND"]|TI
        0000000001|MM|170|General Population|C0683971|["General Population"-ti-1-"general population"]|TI
        0000000001|MM|170|Small|C0700321|["Little"-ti-1-"little"]|TI
        0000000001|MM|124|Levels|C0441889|["Levels"-ti-1-"levels"]|TI
        0000000001|MM|121|Exposure to|C0332157|["Exposure"-ti-1-"exposure"]|TI
        0000000001|MM|121|Injury due to exposure to external cause|C0274281|["Exposure, NOS"-ti-1-"exposure"]|TI
        0000000001|MM|121|Persons|C0027361|["People"-ti-1-"people"]|TI
        0000000001|MM|114|Old|C0580836|["Old"-ti-1-"older"]|TI
        0000000001|MM|114|Old episode|C0677546|["Old"-ti-1-"older"]|TI
        0000000001|MM|114|Known|C0205309|["Known"-ti-1-"known"]|TI
        <<<<< MMI
        """)
        test_chunkmap = chunkmap_factory({'123.txt': [0, 1]})

        self.mo = MetamapOutput(sample_stream,
                                [">>>>> MMI", "<<<<< MMI", "* error *"],
                                test_chunkmap)
Example #6
0
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    chunkmap = chunkmap_factory(
        pickle.load(bz2.BZ2File('test_data/metamap.chunkmap.bz2')))

    setupstart = time.clock()
    metamap_reader = MetamapOutput(bz2.BZ2File('test_data/metamap.out.bz2'),
                                   DEFAULT_LINES_TO_IGNORE, chunkmap)
    grapher = MetamapCoccurrenceGraphBuilder()
    # PageRank is not the correct algorithm for a matrix of adirectional nodes
    # but it'll do for now, to exercise the system
    # pr_algorithm=PageRanker()
    # TextRanker now written
    pr_algorithm = MappedRanker(TextRanker())
    count = 0
    parsestart = time.clock()
    for article in metamap_reader:
        print "Read article", article.set_id,
        graph = grapher.create_graph(article.lines)
        print "graphed it",
        matrix = graph.as_mapped_link_matrix()
        print "turned it into a", matrix,
        #fake_e_vector=[0.0] * len(matrix)
        if len(matrix) == 0:
            print "didn't pagerank because it was empty."
        else:
            ranked = pr_algorithm.evaluate(matrix)
            print "TextRanked it. First results: %r Stats:" % \
                    [x for x in ranked][:5],  pr_algorithm.stats
        count += 1
    endparse = time.clock()
    print "Total time elapsed: %1.3f seconds (%1.7f seconds were setup) "\
          "for %d total articles, for a grand total of %1.3f compressed "\
          "articles/second read, turned into link matrices, and " \
          " pageranked." \
          % (endparse-setupstart, parsestart-setupstart, count,
             float(count)/(endparse-setupstart))

    count += 1