def main(): logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') chunkmap=chunkmap_factory(pickle.load( bz2.BZ2File('test_data/5th.chunkmap.bz2'))) setupstart=time.clock() semrep_reader=SemrepOutput(bz2.BZ2File('test_data/5th.semrep.out.bz2'), ["semrep wrapper error"], chunkmap) semrep_grapher=SemrepGraphBuilder() pr_algorithm=PageRanker() count=0 parsestart=time.clock() for article in semrep_reader: print "Read article", article.set_id, graph=semrep_grapher.create_graph(article.lines) print "graphed it", matrixed=graph.as_mapped_link_matrix() print "matrixed it,", fake_e_vector=[0.0] * len(matrixed) if fake_e_vector==[]: print "didn't pagerank because it was empty." else: ranked=pr_algorithm.evaluate(matrixed, fake_e_vector) print "pageranked it. Stats:", pr_algorithm.stats count+=1 endparse=time.clock() print "Total time elapsed: %1.3f seconds (%1.7f seconds were setup) for "\ "%d total articles, for a grand total of %1.3f compressed "\ "articles/second turned into semantic graphs, link matrices," \ " and finally pageranked." % ( endparse-setupstart, parsestart-setupstart, count, float(count)/(endparse-setupstart))
def main(): logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') chunkmap = chunkmap_factory( pickle.load(bz2.BZ2File('test_data/5th.chunkmap.bz2'))) setupstart = time.clock() semrep_reader = SemrepOutput(bz2.BZ2File('test_data/5th.semrep.out.bz2'), ["semrep wrapper error"], chunkmap) semrep_grapher = SemrepGraphBuilder() pr_algorithm = PageRanker() count = 0 parsestart = time.clock() for article in semrep_reader: print "Read article", article.set_id, graph = semrep_grapher.create_graph(article.lines) print "graphed it", matrixed = graph.as_mapped_link_matrix() print "matrixed it,", fake_e_vector = [0.0] * len(matrixed) if fake_e_vector == []: print "didn't pagerank because it was empty." else: ranked = pr_algorithm.evaluate(matrixed, fake_e_vector) print "pageranked it. Stats:", pr_algorithm.stats count += 1 endparse = time.clock() print "Total time elapsed: %1.3f seconds (%1.7f seconds were setup) for "\ "%d total articles, for a grand total of %1.3f compressed "\ "articles/second turned into semantic graphs, link matrices," \ " and finally pageranked." % ( endparse-setupstart, parsestart-setupstart, count, float(count)/(endparse-setupstart))
def main(): logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') chunkmap=chunkmap_factory(pickle.load( bz2.BZ2File( 'test_data/metamap.chunkmap.bz2'))) setupstart=time.clock() metamap_reader=MetamapOutput(bz2.BZ2File( 'test_data/metamap.out.bz2'), DEFAULT_LINES_TO_IGNORE, chunkmap) grapher=MetamapCoccurrenceGraphBuilder() # PageRank is not the correct algorithm for a matrix of adirectional nodes # but it'll do for now, to exercise the system # pr_algorithm=PageRanker() # TextRanker now written pr_algorithm=MappedRanker(TextRanker()) count=0 parsestart=time.clock() for article in metamap_reader: print "Read article", article.set_id, graph=grapher.create_graph(article.lines) print "graphed it", matrix=graph.as_mapped_link_matrix() print "turned it into a", matrix, #fake_e_vector=[0.0] * len(matrix) if len(matrix)==0: print "didn't pagerank because it was empty." else: ranked=pr_algorithm.evaluate(matrix) print "TextRanked it. First results: %r Stats:" % \ [x for x in ranked][:5], pr_algorithm.stats count+=1 endparse=time.clock() print "Total time elapsed: %1.3f seconds (%1.7f seconds were setup) "\ "for %d total articles, for a grand total of %1.3f compressed "\ "articles/second read, turned into link matrices, and " \ " pageranked." \ % (endparse-setupstart, parsestart-setupstart, count, float(count)/(endparse-setupstart)) count+=1
def setUp(self):
    """Build a MetamapOutput reader over a canned two-article sample."""
    # Fixture borrowed from semrep.py's tests. Unlike that file's
    # fixture, this sample deliberately contains one relationship that
    # should become part of the graph and one that should not.
    from file.metamap import (MetamapOutput)
    from file.chunkmap import chunkmap_factory
    import StringIO
    fake_metamap_output = StringIO.StringIO("""
>>>>> MMI
* ERROR *
0000000000|MM|530|Carboxyhemoglobin|C0007061|
["Carboxyhaemoglobin"-ti-1-"Carboxyhaemoglobin"]|TI
0000000000|MM|223|Male population group|C0025266|["Men"-ti-1-"men"]|TI
0000000000|MM|121|Levels|C0441889|["Levels"-ti-1-"levels"]|TI
0000000000|MM|114|British|C0596227|["British"-ti-1-"British"]|TI
0000000000|MM|114|Old|C0580836|["Old"-ti-1-"older"]|TI
0000000000|MM|114|Old episode|C0677546|["Old"-ti-1-"older"]|TI
<<<<< MMI
>>>>> MMI
0000000001|MM|585|Little's Disease|C0023882|["Little"-ti-1-"little"]|TI
0000000001|MM|424|HAC protocol|C0062074|["HAD"-ti-1-"has"]|TI
0000000001|MM|170|Background|C1706907|["Background"-ti-1-"BACKGROUND"]|TI
0000000001|MM|170|General Population|C0683971|["General Population"-ti-1-"general population"]|TI
0000000001|MM|170|Small|C0700321|["Little"-ti-1-"little"]|TI
0000000001|MM|124|Levels|C0441889|["Levels"-ti-1-"levels"]|TI
0000000001|MM|121|Exposure to|C0332157|["Exposure"-ti-1-"exposure"]|TI
0000000001|MM|121|Injury due to exposure to external cause|C0274281|["Exposure, NOS"-ti-1-"exposure"]|TI
0000000001|MM|121|Persons|C0027361|["People"-ti-1-"people"]|TI
0000000001|MM|114|Old|C0580836|["Old"-ti-1-"older"]|TI
0000000001|MM|114|Old episode|C0677546|["Old"-ti-1-"older"]|TI
0000000001|MM|114|Known|C0205309|["Known"-ti-1-"known"]|TI
<<<<< MMI
""")
    fake_chunkmap = chunkmap_factory({'123.txt': [0, 1]})
    self.mo = MetamapOutput(fake_metamap_output,
                            [">>>>> MMI", "<<<<< MMI", "* error *"],
                            fake_chunkmap)
def setUp(self):
    """Prepare the MetamapOutput fixture used by the tests."""
    from file.metamap import (MetamapOutput)
    from file.chunkmap import chunkmap_factory
    import StringIO
    # Two-article sample adapted from metamap.py's tests (NOT identical
    # to the semrep.py fixture: it includes one relationship that belongs
    # in the graph and one that does not).
    sample = StringIO.StringIO("""
>>>>> MMI
* ERROR *
0000000000|MM|530|Carboxyhemoglobin|C0007061|
["Carboxyhaemoglobin"-ti-1-"Carboxyhaemoglobin"]|TI
0000000000|MM|223|Male population group|C0025266|["Men"-ti-1-"men"]|TI
0000000000|MM|121|Levels|C0441889|["Levels"-ti-1-"levels"]|TI
0000000000|MM|114|British|C0596227|["British"-ti-1-"British"]|TI
0000000000|MM|114|Old|C0580836|["Old"-ti-1-"older"]|TI
0000000000|MM|114|Old episode|C0677546|["Old"-ti-1-"older"]|TI
<<<<< MMI
>>>>> MMI
0000000001|MM|585|Little's Disease|C0023882|["Little"-ti-1-"little"]|TI
0000000001|MM|424|HAC protocol|C0062074|["HAD"-ti-1-"has"]|TI
0000000001|MM|170|Background|C1706907|["Background"-ti-1-"BACKGROUND"]|TI
0000000001|MM|170|General Population|C0683971|["General Population"-ti-1-"general population"]|TI
0000000001|MM|170|Small|C0700321|["Little"-ti-1-"little"]|TI
0000000001|MM|124|Levels|C0441889|["Levels"-ti-1-"levels"]|TI
0000000001|MM|121|Exposure to|C0332157|["Exposure"-ti-1-"exposure"]|TI
0000000001|MM|121|Injury due to exposure to external cause|C0274281|["Exposure, NOS"-ti-1-"exposure"]|TI
0000000001|MM|121|Persons|C0027361|["People"-ti-1-"people"]|TI
0000000001|MM|114|Old|C0580836|["Old"-ti-1-"older"]|TI
0000000001|MM|114|Old episode|C0677546|["Old"-ti-1-"older"]|TI
0000000001|MM|114|Known|C0205309|["Known"-ti-1-"known"]|TI
<<<<< MMI
""")
    delimiters = [">>>>> MMI", "<<<<< MMI", "* error *"]
    self.mo = MetamapOutput(sample, delimiters,
                            chunkmap_factory({'123.txt': [0, 1]}))
def main(): logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') chunkmap = chunkmap_factory( pickle.load(bz2.BZ2File('test_data/metamap.chunkmap.bz2'))) setupstart = time.clock() metamap_reader = MetamapOutput(bz2.BZ2File('test_data/metamap.out.bz2'), DEFAULT_LINES_TO_IGNORE, chunkmap) grapher = MetamapCoccurrenceGraphBuilder() # PageRank is not the correct algorithm for a matrix of adirectional nodes # but it'll do for now, to exercise the system # pr_algorithm=PageRanker() # TextRanker now written pr_algorithm = MappedRanker(TextRanker()) count = 0 parsestart = time.clock() for article in metamap_reader: print "Read article", article.set_id, graph = grapher.create_graph(article.lines) print "graphed it", matrix = graph.as_mapped_link_matrix() print "turned it into a", matrix, #fake_e_vector=[0.0] * len(matrix) if len(matrix) == 0: print "didn't pagerank because it was empty." else: ranked = pr_algorithm.evaluate(matrix) print "TextRanked it. First results: %r Stats:" % \ [x for x in ranked][:5], pr_algorithm.stats count += 1 endparse = time.clock() print "Total time elapsed: %1.3f seconds (%1.7f seconds were setup) "\ "for %d total articles, for a grand total of %1.3f compressed "\ "articles/second read, turned into link matrices, and " \ " pageranked." \ % (endparse-setupstart, parsestart-setupstart, count, float(count)/(endparse-setupstart)) count += 1