def search(self, *args, **kwds):
    """Yield the keyword hits of every file listed under ``self.dirname``.

    ``kwds`` is forwarded to ``Folder.listdir``; ``args`` is accepted but
    not used. A path ending in ``.pdf`` (case-insensitive) is replaced by
    the result of ``pdf2txt(path)`` before it is searched.

    One list is yielded per path. It holds a dict with the keys
    ``'keyword'``, ``'match'`` and ``'path'`` for every combination of a
    keyword from ``self.keywords`` and a match from
    ``self.get_matches(path)`` where the keyword is contained in the match.
    """
    for path in Folder.listdir(self.dirname, **kwds):
        is_pdf = path.lower().endswith('.pdf')
        if is_pdf:
            path = pdf2txt(path)
        hits = []
        for term in self.keywords:
            # get_matches is queried once per keyword.
            for match in self.get_matches(path):
                if term not in match:
                    continue
                hits.append({
                    'keyword': term,
                    'match': match,
                    'path': path,
                })
        yield hits
def main(generator=False):
    """Fetch the latest BioGRID release and rewrite ``interactions.txt``.

    Steps, in order:

    1. Change into the module-level ``path`` directory.
    2. Scrape the current release number from the BioGRID front page.
    3. Hand the tab2 and mitab archive URLs of that release to
       ``Folder.get`` and parse both files (presumably into lists of
       lines -- confirm against ``Folder``).
    4. Write one tab-separated row per tab2 interaction to
       ``interactions.txt``: aliases of A, aliases of B, experimental
       system type plus ``'direct'``, experimental system, interaction type
       (the parenthesised part of mitab column 11 at the same row index),
       modification, taxid A, taxid B, pmid and source.
    5. Append the release number and a timestamp to ``version.txt``.
    6. Delete every file in ``path`` whose name contains ``"BIOGRID"``.

    Args:
        generator: Unused; kept so existing callers keep working.

    Raises:
        RuntimeError: If no release number is found on the front page.
    """
    os.chdir(path)

    # Version:
    url = 'http://thebiogrid.org/'
    f = urllib.urlopen(url)
    try:
        contents = f.read()
    finally:
        # Release the connection even when the read fails.
        f.close()
    marker = '<div class="newspost-title">BioGRID Version '
    VERSION = None
    for line in contents.split('\n'):
        if marker in line:
            print(line)
            VERSION = line.split(marker)[1].split(' Release ')[0]
            break
    if VERSION is None:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise RuntimeError(
            'Could not find the BioGRID version on %s' % url)
    print(VERSION)

    # Urls:
    url = 'http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-'
    tab2url = url + VERSION + '/BIOGRID-ALL-' + VERSION + '.tab2.zip'
    mitaburl = url + VERSION + '/BIOGRID-ALL-' + VERSION + '.mitab.zip'

    # Files:
    folder = Folder()
    folder.get([tab2url, mitaburl])
    tab2 = folder.contains('tab2')[0].parse(printing=False, seperator=None)
    mitab = folder.contains('mitab')[0].parse(printing=False, seperator=None)

    # Parsing:
    # NOTE(review): this mapping is filled but never read or returned; it is
    # kept so the int() validation of columns 0 and 1 still runs -- confirm
    # whether it can be dropped or should be returned.
    interactions = {}
    with open(os.path.join(path, 'interactions.txt'), 'w') as output:
        for index, line in enumerate(tab2):
            # Skip the header row and empty lines.
            if line == "" or "#BioGRID Interaction ID" in line:
                continue
            columns = line.split('\t')
            interaction_id = int(columns[0])

            # Aliases: interactor id first, then systematic name, official
            # symbol and synonyms, skipping '-' placeholders and duplicates.
            aliases_a, aliases_b = [columns[1]], [columns[2]]
            candidates_a = [columns[5], columns[7]] + columns[9].split('|')
            candidates_b = [columns[6], columns[8]] + columns[10].split('|')
            for aliases, candidates in ((aliases_a, candidates_a),
                                        (aliases_b, candidates_b)):
                for candidate in candidates:
                    if candidate != '-' and candidate not in aliases:
                        aliases.append(candidate)

            experimental_system_type = [columns[12], 'direct']
            experimental_system = columns[11]
            # The mitab file is read at the same row index as the tab2 file.
            interaction_type = (
                mitab[index].split('\t')[11].split('(')[1].split(')')[0])
            pmid = int(columns[14])
            # 559292 is remapped to 4932 -- presumably a strain-level yeast
            # taxid folded into the species-level one; confirm.
            taxid_a = int(columns[15])
            if taxid_a == 559292:
                taxid_a = 4932
            taxid_b = int(columns[16])
            if taxid_b == 559292:
                taxid_b = 4932
            throughput = columns[17].split('|')
            modification = columns[19] if columns[19] != '-' else ''
            source = columns[23]  # 'BioGRID'

            output.write('\t'.join([
                '; '.join(aliases_a),
                '; '.join(aliases_b),
                '; '.join(experimental_system_type),
                experimental_system,
                interaction_type,
                modification,
                str(taxid_a),
                str(taxid_b),
                str(pmid),
                source + '\n',
            ]))
            interactions.setdefault(interaction_id, {})[int(columns[1])] = {
                'source': source,
                'experimental_system': experimental_system,
                'experimental_system_type': experimental_system_type,
                'taxid_a': taxid_a,
                'taxid_b': taxid_b,
                'pmid': pmid,
                'throughput': throughput,
            }

    new_entry = '\n%s %s' % (VERSION, datetime.datetime.now())
    with open(os.path.join(path, 'version.txt'), 'a') as version_file:
        version_file.write(new_entry)

    # Cleaning up:
    for filename in os.listdir(path):
        if "BIOGRID" in filename:
            print("Deleting %s" % filename)
            # Join with `path` so the removal does not depend on the
            # current working directory.
            os.remove(os.path.join(path, filename))
def main(interactions=False, download=True, parse=True, withdrawn=True, cleanup=True):
    """Download the MGI report files and build gene annotations and mappings.

    Args:
        interactions: Also fetch the protein-interaction data over FTP (and
            remove it again during cleanup).
        download: Fetch the MGI report files and register them in
            ``folder.downloads``.
        parse: Load the report files into the module-level ``genes``.
        withdrawn: Read ``MRK_List1.rpt`` when true, ``MRK_List2.rpt``
            otherwise.
        cleanup: Remove the downloaded files afterwards.

    TODO:
    - Inspect and eventually use the interaction file, else discard it from
      this module.
    - Also check whether other information from MGI is worth integrating,
      such as homology or phenotypes.
    """
    os.chdir(path)
    genes.name = 'MGI'
    genes.key = 'mgi'
    folder = Folder(path)

    if interactions:
        ftp = FTP(
            url='ftp://ftp.informatics.jax.org/pub/protein-interaction-data/',
            path=path)
        ftp.download(path)

    if download:
        base_url = "ftp://ftp.informatics.jax.org/pub/reports/"
        # Not fetched: MPheno_OBO.ontology, VOC_MammalianPhenotype.rpt,
        # MGI_PhenotypicAllele.rpt, HMD_HumanPhenotype.rpt
        report_names = (
            "MRK_List1.rpt",
            "MRK_List2.rpt",
            "MGI_Coordinate.rpt",
            "MRK_Sequence.rpt",
            "MRK_SwissProt_TrEMBL.rpt",
            "MRK_VEGA.rpt",
            "MRK_ENSEMBL.rpt",
            "MGI_EntrezGene.rpt",
        )
        for report_name in report_names:
            # Constructing the File automatically does the download.
            report = File(url=base_url + report_name)
            report.parse(header=True, printing=False)
            folder.downloads.append(report.name)

    if parse:
        folder.update()
        filename = "MRK_List1.rpt" if withdrawn else "MRK_List2.rpt"
        marker_data = folder[filename].parse(header=True, printing=False)
        genes.addData(marker_data, key='mgi', taxid=10090)

        coordinates = folder["MGI_Coordinate.rpt"].parse(
            header=True, printing=False)
        for record in coordinates:
            record = change_keys(record)
            record['taxid'] = 10090
            genes.add(record)

        sequences = folder['MRK_Sequence.rpt'].parse(
            header=True, printing=False)
        genes.addData(sequences, key='mgi', taxid=10090)

        # Column names supplied explicitly for the EntrezGene report.
        header = [
            "mgi", "symbol", "status", "name", "cm_position", "chromosome",
            "type", "secondary_accession_ids", "id", "synonyms",
            "feature_types", "start", "stop", "strand", "biotypes",
        ]
        entrez = folder["MGI_EntrezGene.rpt"].parse(
            header=header, printing=False)
        genes.addData(entrez, key="mgi", taxid=10090)
        print(len(genes))

    if cleanup:
        if interactions:
            ftp.remove(confirm=False)
        for downloaded in folder.downloads:
            folder.remove(downloaded)

    genes.keep("category", "Gene")
    genes.remove("name", "withdrawn")
    genes.save()
    genes.buildMappings()
def func(args):
    """Return ``Folder.table`` for ``args.dirname``, passing ``args.kwds`` as keywords."""
    directory = args.dirname
    options = args.kwds
    return Folder.table(directory, **options)
def main(interactions=False, download=True, parse=True, withdrawn=True, cleanup=True):
    """Download the MGI report files and build gene annotations and mappings.

    Args:
        interactions: Also fetch the protein-interaction data over FTP (and
            remove it again during cleanup).
        download: Fetch the MGI report files and register them in
            ``folder.downloads``.
        parse: Load the report files into the module-level ``genes``.
        withdrawn: Read ``MRK_List1.rpt`` when true, ``MRK_List2.rpt``
            otherwise.
        cleanup: Remove the downloaded files afterwards.

    TODO:
    - Inspect and eventually use the interaction file, else discard it from
      this module.
    - Also check whether other information from MGI is worth integrating,
      such as homology or phenotypes.
    """
    os.chdir(path)
    genes.name = "MGI"
    genes.key = "mgi"
    folder = Folder(path)

    if interactions:
        ftp = FTP(
            url="ftp://ftp.informatics.jax.org/pub/protein-interaction-data/",
            path=path,
        )
        ftp.download(path)

    if download:
        base_url = "ftp://ftp.informatics.jax.org/pub/reports/"
        # Not fetched: MPheno_OBO.ontology, VOC_MammalianPhenotype.rpt,
        # MGI_PhenotypicAllele.rpt, HMD_HumanPhenotype.rpt
        report_names = (
            "MRK_List1.rpt",
            "MRK_List2.rpt",
            "MGI_Coordinate.rpt",
            "MRK_Sequence.rpt",
            "MRK_SwissProt_TrEMBL.rpt",
            "MRK_VEGA.rpt",
            "MRK_ENSEMBL.rpt",
            "MGI_EntrezGene.rpt",
        )
        for report_name in report_names:
            # Constructing the File automatically does the download.
            report = File(url=base_url + report_name)
            report.parse(header=True, printing=False)
            folder.downloads.append(report.name)

    if parse:
        folder.update()
        filename = "MRK_List1.rpt" if withdrawn else "MRK_List2.rpt"
        marker_data = folder[filename].parse(header=True, printing=False)
        genes.addData(marker_data, key="mgi", taxid=10090)

        coordinates = folder["MGI_Coordinate.rpt"].parse(
            header=True, printing=False)
        for record in coordinates:
            record = change_keys(record)
            record["taxid"] = 10090
            genes.add(record)

        sequences = folder["MRK_Sequence.rpt"].parse(
            header=True, printing=False)
        genes.addData(sequences, key="mgi", taxid=10090)

        # Column names supplied explicitly for the EntrezGene report.
        header = [
            "mgi", "symbol", "status", "name", "cm_position", "chromosome",
            "type", "secondary_accession_ids", "id", "synonyms",
            "feature_types", "start", "stop", "strand", "biotypes",
        ]
        entrez = folder["MGI_EntrezGene.rpt"].parse(
            header=header, printing=False)
        genes.addData(entrez, key="mgi", taxid=10090)
        print(len(genes))

    if cleanup:
        if interactions:
            ftp.remove(confirm=False)
        for downloaded in folder.downloads:
            folder.remove(downloaded)

    genes.keep("category", "Gene")
    genes.remove("name", "withdrawn")
    genes.save()
    genes.buildMappings()
def main(generator=False):
    """Fetch the latest BioGRID release and rewrite ``interactions.txt``.

    Steps, in order:

    1. Change into the module-level ``path`` directory.
    2. Scrape the current release number from the BioGRID front page.
    3. Hand the tab2 and mitab archive URLs of that release to
       ``Folder.get`` and parse both files (presumably into lists of
       lines -- confirm against ``Folder``).
    4. Write one tab-separated row per tab2 interaction to
       ``interactions.txt``: aliases of A, aliases of B, experimental
       system type plus ``'direct'``, experimental system, interaction type
       (the parenthesised part of mitab column 11 at the same row index),
       modification, taxid A, taxid B, pmid and source.
    5. Append the release number and a timestamp to ``version.txt``.
    6. Delete every file in ``path`` whose name contains ``"BIOGRID"``.

    Args:
        generator: Unused; kept so existing callers keep working.

    Raises:
        RuntimeError: If no release number is found on the front page.
    """
    os.chdir(path)

    # Version:
    url = 'http://thebiogrid.org/'
    f = urllib.urlopen(url)
    try:
        contents = f.read()
    finally:
        # Release the connection even when the read fails.
        f.close()
    marker = '<div class="newspost-title">BioGRID Version '
    VERSION = None
    for line in contents.split('\n'):
        if marker in line:
            print(line)
            VERSION = line.split(marker)[1].split(' Release ')[0]
            break
    if VERSION is None:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise RuntimeError(
            'Could not find the BioGRID version on %s' % url)
    print(VERSION)

    # Urls:
    url = 'http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-'
    tab2url = url + VERSION + '/BIOGRID-ALL-' + VERSION + '.tab2.zip'
    mitaburl = url + VERSION + '/BIOGRID-ALL-' + VERSION + '.mitab.zip'

    # Files:
    folder = Folder()
    folder.get([tab2url, mitaburl])
    tab2 = folder.contains('tab2')[0].parse(printing=False, seperator=None)
    mitab = folder.contains('mitab')[0].parse(printing=False, seperator=None)

    # Parsing:
    # NOTE(review): this mapping is filled but never read or returned; it is
    # kept so the int() validation of columns 0 and 1 still runs -- confirm
    # whether it can be dropped or should be returned.
    interactions = {}
    with open(os.path.join(path, 'interactions.txt'), 'w') as output:
        for index, line in enumerate(tab2):
            # Skip the header row and empty lines.
            if line == "" or "#BioGRID Interaction ID" in line:
                continue
            columns = line.split('\t')
            interaction_id = int(columns[0])

            # Aliases: interactor id first, then systematic name, official
            # symbol and synonyms, skipping '-' placeholders and duplicates.
            aliases_a, aliases_b = [columns[1]], [columns[2]]
            candidates_a = [columns[5], columns[7]] + columns[9].split('|')
            candidates_b = [columns[6], columns[8]] + columns[10].split('|')
            for aliases, candidates in ((aliases_a, candidates_a),
                                        (aliases_b, candidates_b)):
                for candidate in candidates:
                    if candidate != '-' and candidate not in aliases:
                        aliases.append(candidate)

            experimental_system_type = [columns[12], 'direct']
            experimental_system = columns[11]
            # The mitab file is read at the same row index as the tab2 file.
            interaction_type = (
                mitab[index].split('\t')[11].split('(')[1].split(')')[0])
            pmid = int(columns[14])
            # 559292 is remapped to 4932 -- presumably a strain-level yeast
            # taxid folded into the species-level one; confirm.
            taxid_a = int(columns[15])
            if taxid_a == 559292:
                taxid_a = 4932
            taxid_b = int(columns[16])
            if taxid_b == 559292:
                taxid_b = 4932
            throughput = columns[17].split('|')
            modification = columns[19] if columns[19] != '-' else ''
            source = columns[23]  # 'BioGRID'

            output.write('\t'.join([
                '; '.join(aliases_a),
                '; '.join(aliases_b),
                '; '.join(experimental_system_type),
                experimental_system,
                interaction_type,
                modification,
                str(taxid_a),
                str(taxid_b),
                str(pmid),
                source + '\n',
            ]))
            interactions.setdefault(interaction_id, {})[int(columns[1])] = {
                'source': source,
                'experimental_system': experimental_system,
                'experimental_system_type': experimental_system_type,
                'taxid_a': taxid_a,
                'taxid_b': taxid_b,
                'pmid': pmid,
                'throughput': throughput,
            }

    new_entry = '\n%s %s' % (VERSION, datetime.datetime.now())
    with open(os.path.join(path, 'version.txt'), 'a') as version_file:
        version_file.write(new_entry)

    # Cleaning up:
    for filename in os.listdir(path):
        if "BIOGRID" in filename:
            print("Deleting %s" % filename)
            # Join with `path` so the removal does not depend on the
            # current working directory.
            os.remove(os.path.join(path, filename))