def main():
    """Run the SEMREP ranking workflow over the 5th test dataset.

    Loads a pickled chunkmap and bz2-compressed SEMREP output, builds
    TF*IDF statistics from the reader, constructs a co-occurrence graph
    builder, and runs the workflow, writing results to OUTPUT_FILE.
    """
    my_format_string = ('%(asctime)s %(levelname)s %(module)s.'
                        '%(funcName)s: %(message)s')
    logging.basicConfig(level=logging.INFO, format=my_format_string)
    chunkmap = chunkmap_factory(
        pickle.load(bz2.BZ2File('test_data/5th.chunkmap.bz2')))
    semrep_reader = SemrepOutput(bz2.BZ2File('test_data/5th.semrep.out.bz2'),
                                 DEFAULT_LINES_TO_IGNORE, chunkmap)
    tfidf = TF_IDF(file_mode="c")
    tfidf.build_tf_from_file(semrep_reader)
    # Building the TF counts consumes the reader, so reset it before
    # handing it to the workflow.
    semrep_reader.rewind()
    semrep_grapher = SemrepCooccurrenceGraphBuilder(
        node_weight_threshold=0.001,
        link_weight_threshold=0.003,
        tf_idf_provider=tfidf)
    eval_params = EvaluationParameters()
    eval_params.alpha = 0.65
    # FIX: the output handle was previously opened inline and never
    # closed; close it after the run so results are flushed to disk.
    output_file = open(OUTPUT_FILE, 'w')
    try:
        work = myWorkflow(semrep_reader, semrep_grapher, TextRanker(),
                          eval_params, PAGERANK_CUTOFF, MESH_TREE_FILE,
                          SAVCC_MATRIX_FILE,
                          # Distance weight: 1/e^x on [0, 5), zero elsewhere.
                          lambda x: 1.0 / math.exp(x) if 0 <= x < 5 else 0.0,
                          UMLS_CONVERTER_DATA, UMLS_CONCEPT_DATA,
                          output_file)
        work.run()
    finally:
        output_file.close()
def main():
    """Drive the SEMREP-based ranking pipeline over the 5th test dataset."""
    log_format = ('%(asctime)s %(levelname)s %(module)s.'
                  '%(funcName)s: %(message)s')
    logging.basicConfig(level=logging.INFO, format=log_format)
    # The chunkmap ships pre-pickled alongside the compressed SEMREP output.
    the_chunkmap = chunkmap_factory(
        pickle.load(bz2.BZ2File('test_data/5th.chunkmap.bz2')))
    reader = SemrepOutput(bz2.BZ2File('test_data/5th.semrep.out.bz2'),
                          DEFAULT_LINES_TO_IGNORE, the_chunkmap)
    tf_idf_provider = TF_IDF(file_mode="c")
    tf_idf_provider.build_tf_from_file(reader)
    # The TF pass exhausts the reader; rewind before the workflow reads it.
    reader.rewind()
    grapher = SemrepCooccurrenceGraphBuilder(
        node_weight_threshold=0.001,
        link_weight_threshold=0.003,
        tf_idf_provider=tf_idf_provider)
    evaluation = EvaluationParameters()
    evaluation.alpha = 0.65
    # Distance weighting: decays as 1/e^x, cut off outside [0, 5).
    distance_weight = lambda x: 1.0 / math.exp(x) if x >= 0 and x < 5 else 0.0
    workflow = myWorkflow(reader, grapher, TextRanker(), evaluation,
                          PAGERANK_CUTOFF, MESH_TREE_FILE, SAVCC_MATRIX_FILE,
                          distance_weight, UMLS_CONVERTER_DATA,
                          UMLS_CONCEPT_DATA, open(OUTPUT_FILE, 'w'))
    workflow.run()
def setUp(self):
    from MEDRank.file.chunkmap import chunkmap_factory
    import StringIO
    # The fixture reproduces a fragment of genuine SEMREP output.
    # No animals were harmed in the making of this test. The content is
    # intentionally tiny (and somewhat nonsensical) to keep the volume
    # of test data at a reasonable level.
    self.fakefile = StringIO.StringIO(
        "SE|0000000000||ti|2|entity|Affecting|"
        "ftcn|C0392760|involved||||1000|319|326\n"
        "SE|0000000000||ti|2|entity|Involvement"
        "with|ftcn|C1314939|involved||"
        "||1000|319|326\n"
        "SE|0000000000||ti|2|relation|||Steroid"
        "hormone|horm,strd|horm|C0301818|"
        "||||||||901|115|130||INTERACTS_WITH"
        "||379|385|||steroid hormone"
        "receptor|gngm,aapp,rcpt|gngm|C0597519"
        "||None|steroid hormone receptors|their"
        " respective steroid hormone"
        " receptors|||||890|390|431\n"
        "USELESS LINE!\n"
        "SE|0000000000||ti|3|text|Coactivator and"
        " corepressor proteins have recently been"
        " identified that interact with steroid"
        " hormone receptors and modulate"
        " transcriptional activation\n")
    # Map the single chunk file to line-set 0.
    self.fake_chunkmap = chunkmap_factory({'123.txt': [0]})
def setUp(self):
    import StringIO
    from MEDRank.file.chunkmap import chunkmap_factory
    # Data from an actual MTI/Metamap run, with a few additions: the
    # '* ERROR *' line and the '>>>>> MMI'/'<<<<< MMI' separators
    # exercise the line-skipping machinery.
    self.fakefile = StringIO.StringIO("""
>>>>> MMI
* ERROR *
0000000000|MM|530|Carboxyhemoglobin|C0007061| ["Carboxyhaemoglobin"-ti-1-"Carboxyhaemoglobin"]|TI
0000000000|MM|223|Male population group|C0025266|["Men"-ti-1-"men"]|TI
0000000000|MM|121|Levels|C0441889|["Levels"-ti-1-"levels"]|TI
0000000000|MM|114|British|C0596227|["British"-ti-1-"British"]|TI
0000000000|MM|114|Old|C0580836|["Old"-ti-1-"older"]|TI
0000000000|MM|114|Old episode|C0677546|["Old"-ti-1-"older"]|TI
<<<<< MMI
>>>>> MMI
0000000001|MM|585|Little's Disease|C0023882|["Little"-ti-1-"little"]|TI
0000000001|MM|424|HAC protocol|C0062074|["HAD"-ti-1-"has"]|TI
0000000001|MM|170|Background|C1706907|["Background"-ti-1-"BACKGROUND"]|TI
0000000001|MM|170|General Population|C0683971|["General Population"-ti-1-"general population"]|TI
0000000001|MM|170|Small|C0700321|["Little"-ti-1-"little"]|TI
0000000001|MM|124|Levels|C0441889|["Levels"-ti-1-"levels"]|TI
0000000001|MM|121|Exposure to|C0332157|["Exposure"-ti-1-"exposure"]|TI
0000000001|MM|121|Injury due to exposure to external cause|C0274281|["Exposure, NOS"-ti-1-"exposure"]|TI
0000000001|MM|121|Persons|C0027361|["People"-ti-1-"people"]|TI
0000000001|MM|114|Old|C0580836|["Old"-ti-1-"older"]|TI
0000000001|MM|114|Old episode|C0677546|["Old"-ti-1-"older"]|TI
0000000001|MM|114|Known|C0205309|["Known"-ti-1-"known"]|TI
<<<<< MMI
""")
    # NOTE(review): '* error *' is matched case-insensitively by the
    # reader, presumably — confirm against MetamapOutput's skip logic.
    self.lines_to_ignore = [">>>>> MMI", "<<<<< MMI", "* error *"]
    # Two chunk files, one per article in the fixture above.
    self.fake_chunkmap = chunkmap_factory({'123.txt': [0], '345.txt': [1]})
    self.mo = MetamapOutput(self.fakefile, self.lines_to_ignore,
                            self.fake_chunkmap)
def setUp(self):
    import StringIO
    from MEDRank.file.chunkmap import chunkmap_factory
    # Fixture built from real SEMREP output, trimmed down hard: the
    # records don't make total sense together, but they keep the test
    # data small. No animals were harmed in the making of this test.
    fake_semrep_output = (
        "SE|0000000000||ti|2|entity|Affecting|"
        "ftcn|C0392760|involved||||1000|319|326\n"
        "SE|0000000000||ti|2|entity|Involvement"
        "with|ftcn|C1314939|involved||"
        "||1000|319|326\n"
        "SE|0000000000||ti|2|relation|||Steroid"
        "hormone|horm,strd|horm|C0301818|"
        "||||||||901|115|130||INTERACTS_WITH"
        "||379|385|||steroid hormone"
        "receptor|gngm,aapp,rcpt|gngm|C0597519"
        "||None|steroid hormone receptors|their"
        " respective steroid hormone"
        " receptors|||||890|390|431\n"
        "USELESS LINE!\n"
        "SE|0000000000||ti|3|text|Coactivator and"
        " corepressor proteins have recently been"
        " identified that interact with steroid"
        " hormone receptors and modulate"
        " transcriptional activation\n")
    self.fakefile = StringIO.StringIO(fake_semrep_output)
    self.fake_chunkmap = chunkmap_factory({'123.txt': [0]})
def testChunkedOutput(self):
    """Chunked reader groups lines into per-article sets via the chunkmap.

    Two chunk files map to two line sets; the reader must yield one set
    per article, keyed by Pmid, preserving per-line ids.
    """
    from MEDRank.file.chunkmap import chunkmap_factory
    from MEDRank.pubmed.pmid import Pmid
    _cm = {'1.txt': [12345], '2.txt': [56789, 56790]}
    cm = chunkmap_factory(_cm)
    cno = ChunkedNLMOutput(self.fakefile, Line, self.lines_to_skip, cm)
    processed_sets = [x for x in cno]
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual(len(processed_sets), 2)
    self.assertEqual(len(processed_sets[0].lines), 1)
    self.assertEqual(len(processed_sets[1].lines), 2)
    self.assertEqual(processed_sets[0].set_id, Pmid(1))
    self.assertEqual(processed_sets[1].set_id, Pmid(2))
    self.assertEqual(processed_sets[1].lines[1].line_id, 56790)
def main():
    """Run the MTI ranking workflow over the all_abstracts test dataset.

    Loads a pickled chunkmap and bz2-compressed MTI output, then runs
    MtiWorkflow (no grapher/ranker needed), writing to OUTPUT_FILE.
    """
    my_format_string = ('%(asctime)s %(levelname)s %(module)s.'
                        '%(funcName)s: %(message)s')
    logging.basicConfig(level=logging.DEBUG, format=my_format_string)
    chunkmap = chunkmap_factory(
        pickle.load(bz2.BZ2File('test_data/all_abstracts.mti_chunkmap.bz2')))
    reader = MtiOutput(
        bz2.BZ2File('test_data/all_abstracts.mti.just_metamap.out.bz2'),
        DEFAULT_LINES_TO_IGNORE, chunkmap)
    eval_params = EvaluationParameters()
    eval_params.alpha = 0.65
    # FIX: the output handle was previously opened inline and never
    # closed; close it after the run so results are flushed to disk.
    output_file = open(OUTPUT_FILE, 'w')
    try:
        work = MtiWorkflow(
            reader, None, None, eval_params, PAGERANK_CUTOFF,
            MESH_TREE_FILE, SAVCC_MATRIX_FILE,
            # Distance weight: 1/e^x on [0, 5), zero elsewhere.
            lambda x: 1.0 / math.exp(x) if 0 <= x < 5 else 0.0,
            UMLS_CONVERTER_DATA, UMLS_CONCEPT_DATA, output_file)
        work.run()
    finally:
        output_file.close()
def main():
    """Drive the MTI-based ranking pipeline over the all_abstracts data."""
    log_format = ('%(asctime)s %(levelname)s %(module)s.'
                  '%(funcName)s: %(message)s')
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    the_chunkmap = chunkmap_factory(
        pickle.load(bz2.BZ2File('test_data/all_abstracts.mti_chunkmap.bz2')))
    mti_reader = MtiOutput(
        bz2.BZ2File('test_data/all_abstracts.mti.just_metamap.out.bz2'),
        DEFAULT_LINES_TO_IGNORE, the_chunkmap)
    evaluation = EvaluationParameters()
    evaluation.alpha = 0.65
    # Distance weighting: decays as 1/e^x, cut off outside [0, 5).
    distance_weight = lambda x: 1.0 / math.exp(x) if x >= 0 and x < 5 else 0.0
    # MtiWorkflow takes no grapher or ranker, hence the two Nones.
    workflow = MtiWorkflow(mti_reader, None, None, evaluation,
                           PAGERANK_CUTOFF, MESH_TREE_FILE,
                           SAVCC_MATRIX_FILE, distance_weight,
                           UMLS_CONVERTER_DATA, UMLS_CONCEPT_DATA,
                           open(OUTPUT_FILE, 'w'))
    workflow.run()
def setUp(self):
    """Build a SemrepOutput reader over a small in-memory SEMREP fixture.

    Test setup borrowed from semrep.py, but the fakefile is NOT the
    same: this one has a relationship that should not be part of the
    graph, and one that should.
    """
    from MEDRank.file.semrep import SemrepOutput
    from MEDRank.file.chunkmap import chunkmap_factory
    import StringIO
    # (Removed an unused import of MetamapOutput - nothing in this
    # fixture touches metamap.)
    sr_file = StringIO.StringIO(
        "SE|0000000000||ti|2|entity|Affecting|"
        "ftcn|C0392760|involved||||1000|319|326\n"
        "SE|0000000000||ti|2|entity|Involvement"
        "with|ftcn|C1314939|involved||"
        "||1000|319|326\n"
        "SE|0000000000||ti|2|relation|||Steroid"
        "hormone|horm,strd|horm|C0301818|"
        "||||||||901|115|130||INTERACTS_WITH"
        "||379|385|||steroid hormone"
        "receptor|gngm,aapp,rcpt|gngm|C0597519"
        "||None|steroid hormone receptors|their"
        " respective steroid hormone"
        " receptors|||||890|390|431\n"
        "SE|0000000000||ti|2|relation|||Affection"
        "|horm,strd|horm|C0392760|"
        "||||||||901|115|130||INTERACTS_WITH"
        "||379|385|||Involvement"
        "with|gngm,aapp,rcpt|gngm|C1314939"
        "||None|steroid hormone receptors|their"
        " respective steroid hormone"
        " receptors|||||890|390|431\n"
        "USELESS LINE!\n"
        "SE|0000000000||ti|3|text|Coactivator and"
        " corepressor proteins have recently been"
        " identified that interact with steroid"
        " hormone receptors and modulate"
        " transcriptional activation\n")
    fake_chunkmap = chunkmap_factory({'123.txt': [0]})
    self.sro = SemrepOutput(sr_file, ["USELESS LINE!"], fake_chunkmap)
def setUp(self):
    # Test setup borrowed from semrep.py. This fakefile is NOT the same
    # as semrep.py's: it contains one relationship that should not end
    # up in the graph and one that should.
    from MEDRank.file.semrep import (SemrepOutput)
    from MEDRank.file.metamap import (MetamapOutput)
    from MEDRank.file.chunkmap import chunkmap_factory
    import StringIO
    fake_semrep = StringIO.StringIO(
        "SE|0000000000||ti|2|entity|Affecting|"
        "ftcn|C0392760|involved||||1000|319|326\n"
        "SE|0000000000||ti|2|entity|Involvement"
        "with|ftcn|C1314939|involved||"
        "||1000|319|326\n"
        "SE|0000000000||ti|2|relation|||Steroid"
        "hormone|horm,strd|horm|C0301818|"
        "||||||||901|115|130||INTERACTS_WITH"
        "||379|385|||steroid hormone"
        "receptor|gngm,aapp,rcpt|gngm|C0597519"
        "||None|steroid hormone receptors|their"
        " respective steroid hormone"
        " receptors|||||890|390|431\n"
        "SE|0000000000||ti|2|relation|||Affection"
        "|horm,strd|horm|C0392760|"
        "||||||||901|115|130||INTERACTS_WITH"
        "||379|385|||Involvement"
        "with|gngm,aapp,rcpt|gngm|C1314939"
        "||None|steroid hormone receptors|their"
        " respective steroid hormone"
        " receptors|||||890|390|431\n"
        "USELESS LINE!\n"
        "SE|0000000000||ti|3|text|Coactivator and"
        " corepressor proteins have recently been"
        " identified that interact with steroid"
        " hormone receptors and modulate"
        " transcriptional activation\n")
    cm = chunkmap_factory({'123.txt': [0]})
    self.sro = SemrepOutput(fake_semrep, ["USELESS LINE!"], cm)
def setUp(self):
    import StringIO
    from MEDRank.file.chunkmap import chunkmap_factory
    # Data from an actual MTI run, with a few additions (the '------'
    # separator line exercises junk-line handling).
    self.fakefile = StringIO.StringIO("""
16606471|*Pancreatitis, Acute Necrotizing|C0267941|46152|MH|RtM via: Pancreatitis, Acute Necrotizing|TI;AB|MM;RC
16606471|*Randomized Controlled Trials as Topic|C0282440|19056|MH|RtM via: Randomized Controlled Trials as Topic|TI;AB|MM;RC
16606471|Pancreatitis|C0030305|6224|MH|RtM via: Pancreatitis|AB|MM;RC
16606471|Multicenter Studies as Topic|C0282439|4703|MH|||RC
16606471|Sample Size|C0242618|3713|MH|RtM via: Sample Size|AB|MM;RC
16606471|Double-Blind Method|C0013072|3524|MH|||RC
16606471|*Clinical Trials as Topic|C0008976|2835|MH|RtM via: Clinical Trials|TI|MM
16606471|Debridement|C0011079|1457|MH|RtM via: Debridement|AB|MM;RC
16606471|Pancreas|C0030274|1387|MH|||RC
16606471|Drainage|C0013103|1228|MH|RtM via: Drainage procedure|AB|MM;RC
------
16606471|Patient Selection|C0242802|918|MH|||RC
16606471|Laparotomy|C0023038|867|MH|RtM via: Laparotomy|AB|MM;RC
""")
    # The chunkmap maps the PMID to itself for this single-article file.
    self.fake_chunkmap = chunkmap_factory({16606471: 16606471})
def setUp(self):
    from MEDRank.file.chunkmap import chunkmap_factory
    import StringIO
    # Fixture lifted from a real MTI output file, plus a '------'
    # separator so the reader's junk-line handling gets exercised.
    mti_records = """
16606471|*Pancreatitis, Acute Necrotizing|C0267941|46152|MH|RtM via: Pancreatitis, Acute Necrotizing|TI;AB|MM;RC
16606471|*Randomized Controlled Trials as Topic|C0282440|19056|MH|RtM via: Randomized Controlled Trials as Topic|TI;AB|MM;RC
16606471|Pancreatitis|C0030305|6224|MH|RtM via: Pancreatitis|AB|MM;RC
16606471|Multicenter Studies as Topic|C0282439|4703|MH|||RC
16606471|Sample Size|C0242618|3713|MH|RtM via: Sample Size|AB|MM;RC
16606471|Double-Blind Method|C0013072|3524|MH|||RC
16606471|*Clinical Trials as Topic|C0008976|2835|MH|RtM via: Clinical Trials|TI|MM
16606471|Debridement|C0011079|1457|MH|RtM via: Debridement|AB|MM;RC
16606471|Pancreas|C0030274|1387|MH|||RC
16606471|Drainage|C0013103|1228|MH|RtM via: Drainage procedure|AB|MM;RC
------
16606471|Patient Selection|C0242802|918|MH|||RC
16606471|Laparotomy|C0023038|867|MH|RtM via: Laparotomy|AB|MM;RC
"""
    self.fakefile = StringIO.StringIO(mti_records)
    self.fake_chunkmap = chunkmap_factory({16606471: 16606471})