def converter(GENE_FILE, MONDO_FILE, namespace):
    omim2gene = defaultdict(set)  # OMIM gene entry -> gene node ID(s)
    gene2omim = defaultdict(set)  # gene node ID -> OMIM phenotype entries
    omim2mondo = {}  # OMIM id -> MONDO id (one value per key, so a plain dict)
    with handle(GENE_FILE) as genes:
        rows = csv.DictReader(genes, delimiter="\t")
        for row in rows:
            MID = row["#MIM number"]
            GID = row["GeneID"]
            if GID != "-":
                if row["type"] == "gene":
                    omim2gene[MID].add(f"{namespace}.id:{GID}")
                elif row["type"] == "phenotype":
                    gene2omim[f"{namespace}.id:{GID}"].add(MID)
    with handle(MONDO_FILE) as mf:
        omim_id = []
        mondo_id = ""
        is_obsolete = False
        for line in mf:
            if line.startswith("[Term]"):
                if mondo_id and omim_id and not is_obsolete:
                    for om in omim_id:
                        omim2mondo[om] = mondo_id
                omim_id = []
                mondo_id = ""
                is_obsolete = False
            elif line.startswith("id:"):
                mondo_id = line.strip().split("id: ")[1]
            elif line.startswith("xref: OMIM"):
                omim_id.append(line.split(" ")[1].split(":")[1])
            elif line.startswith("is_obsolete"):
                is_obsolete = True
        # Flush the final term: the loop above only emits a term when the
        # next "[Term]" header is seen
        if mondo_id and omim_id and not is_obsolete:
            for om in omim_id:
                omim2mondo[om] = mondo_id
    return omim2gene, gene2omim, omim2mondo
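# Hypothetical usage sketch (paths are the ones used in the __main__ block
# below; the GeneID is an arbitrary example): chain the three maps to get
# from a gene node to its MONDO disease IDs.
#
#   omim2gene, gene2omim, omim2mondo = converter(
#       "docker/data/ncbi_gene/mim2gene_medgen.gz",
#       "docker/data/mondo/mondo.obo", "nih.nlm.ncbi.gene")
#   gene = "nih.nlm.ncbi.gene.id:4340"
#   diseases = {omim2mondo[om] for om in gene2omim.get(gene, set())
#               if om in omim2mondo}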
def parseGenes(ctdGenesFile, entrez_gene):
    """
    Creates a map from CTD-assigned gene symbols to Entrez Gene IDs
    and also parses the set of human (taxid:9606) genes
    """
    logger.info('Parsing human genes file')
    sym2idDict = {}
    humanSet = set()
    found = False
    with parse_it.handle(entrez_gene) as geneFile:
        reader = csv.DictReader(geneFile, delimiter="\t")
        for row in reader:
            if row["#tax_id"] == "9606":
                found = True
                humanSet.add(row["GeneID"])
            elif found:
                # rows are grouped by tax_id, so stop once the 9606 block ends
                break
    logger.info(f"I found {len(humanSet)} human genes")
    with parse_it.handle(ctdGenesFile) as inFile:
        for line in inFile:
            if line.startswith('#'):
                continue
            line = line.split('\t')
            geneSymbol = line[0]
            geneID = line[2]
            if geneSymbol not in sym2idDict:
                sym2idDict[geneSymbol] = geneID
    return sym2idDict, humanSet
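# Hypothetical usage sketch (file names are illustrative): sym2idDict resolves
# the symbols CTD reports back to Entrez Gene IDs, and humanSet filters
# associations down to human genes.
#
#   sym2id, humans = parseGenes("CTD_genes.tsv", "gene_info")
#   if sym2id.get("BRCA1") in humans:
#       pass  # keep this association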
def parse(self):
    """
    Reads the file defined by the class instance and parses nodes
    and relationships
    """
    with handle(self.file) as infile:
        reader = csv.DictReader(decomment(infile), delimiter="\t")
        for row in reader:
            predicate = self.__check_predicate_exists(row)
            if predicate:
                nodes = self.parse_nodes(row)
                if self.nodes != nodes:
                    if self.nodes:
                        self.write_out()
                    self.nodes = nodes
                evidence = row.get("PubMedIDs") or ""
                self.parse_relationship(predicate, evidence.replace("|", ";"))
    self.write_out()
    self.outfile.close()
def taxParser(tsvFile, nodesFile, namespace):
    """
    Parses names.dmp and nodes.dmp files from NCBI Taxonomy to generate
    nodes and edges for Neo4j

    python oboParser.py -i NCBI_taxdump_9.29.16/names.dmp
        -c NCBI_taxdump_9.29.16/nodes.dmp -o neo4j_input/
        -n nih.nlm.ncbi.taxonomy
    """
    synTitles = set([
        'acronym', 'blast name', 'common name', 'equivalent name',
        'genbank acronym', 'genbank common name', 'genbank synonym',
        'synonym', 'scientific name'
    ])
    taxDict = defaultdict(dict)
    taxChildrenTree = defaultdict(set)
    files = [tsvFile, nodesFile]
    for dmp in files:
        with handle(dmp) as inFile:
            for line in inFile:
                line = line.replace('\"', '')
                column_List = [i.strip() for i in line.split('|')]
                if 'names.dmp' in dmp:
                    taxID = namespace + '.id:' + column_List[0]
                    name = column_List[1]
                    # Account for the different name/synonym types;
                    # the scientific name is the preferred term
                    if taxID not in taxDict:
                        taxDict[taxID] = {
                            'name': name,
                            'synonyms': [name],
                            'edges': [],
                            'labels': set(['Organism',
                                           namespace.replace('.', '_')])
                        }
                    else:
                        taxDict[taxID]['synonyms'].append(name)
                    if 'scientific name' == column_List[3]:
                        taxDict[taxID]['name'] = name
                if 'nodes.dmp' in dmp:
                    taxID = namespace + '.id:' + column_List[0]
                    parentID = namespace + '.id:' + column_List[1]
                    taxDict[taxID]['edges'].append(('is_a', parentID))
                    taxChildrenTree[parentID].add(taxID)
    # recursively find all children of Embryophyta to label as Plant
    outDict = defaultdict(dict)
    childSet = set()
    addLabel(taxChildrenTree, childSet, 'nih.nlm.ncbi.taxonomy.id:3193')
    for child in childSet:
        outDict[child] = taxDict[child]
        outDict[child]['labels'].add('Plant')
    logger.info(f"{len(taxDict)} total nodes, {len(outDict)} labeled nodes")
    return outDict
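# taxParser relies on an addLabel helper that is not defined in this section.
# A minimal sketch, assuming it recursively collects a root taxid and all of
# its descendants (name and behavior inferred from the call site above):
#
#   def addLabel(childrenTree, childSet, nodeID):
#       childSet.add(nodeID)
#       for child in childrenTree.get(nodeID, set()):
#           if child not in childSet:  # guard against cycles in dirty data
#               addLabel(childrenTree, childSet, child)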
def parseMEDIC(inFile, meshIDs):
    """
    Borrows obo parsing code from bareSourceParser.py to parse the MEDIC
    ontology. MEDIC maps MeSH diseases to OMIM diseases
    """
    # assocDict = {startID: set(endID)}
    assocDict = collections.defaultdict(set)
    myFile = parse_it.handle(inFile)
    parse_it.getTerm(myFile)
    # Breaks when the returned term is empty, indicating end of file
    while True:
        term = parse_it.parseTagValue(parse_it.getTerm(myFile))
        if len(term) != 0:
            subID = term['id'][0].split(':')
            # Find TYPEDEF stanzas and ignore them
            if len(subID) < 2:
                continue
            termID = term['id'][0].split(':')[-1]
            if 'alt_id' in term.keys() and termID in meshIDs:
                mimID = 'omim.disease.id:' + term['alt_id'][0].split(':')[-1]
                assocDict[termID].update([mimID])
        else:
            break
    logger.info(len(assocDict))
    return assocDict
def readIxns(filename):
    """
    CTD file is TSV; the header is a '#Fields:' line followed by a commented
    '#Field1<TAB>Field2...' line. Parses the unique file CTD_chem_gene_ixns.tsv

    assocDict = {(startID, endID): {interaction: set(pmid)}}
    """
    # Output columns:
    # :START_ID|:TYPE|source:string|occurrence:float|articles:string[]|:END_ID
    # ID | xref | Comparative Toxicogenomics Database | len(PMID) | PMID;PMID; | ID
    assocDict = collections.defaultdict(dict)
    geneFormset = set()
    with parse_it.handle(filename) as inFile:
        for line in inFile:
            if line.startswith('#'):
                continue
            line = line.split('\t')
            startID = line[1]
            endID = line[4]
            # there is always exactly one organism, based on the GeneID
            organismID = line[7]
            if organismID == '9606':
                interaction = set(
                    line[9].strip().replace('^', '_').split('|'))
                pmid = set(line[10].strip().split('|'))
                assocTuple = (startID, endID)
                for ixn in interaction:
                    if ixn not in assocDict[assocTuple]:
                        # copy, so each interaction owns its own PMID set
                        assocDict[assocTuple][ixn] = set(pmid)
                    else:
                        assocDict[assocTuple][ixn].update(pmid)
    return assocDict
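# Illustrative shape of the returned assocDict (IDs and PMIDs invented for
# illustration; D002110/1544 stand in for a caffeine-CYP1A2 pair):
#
#   {("D002110", "1544"): {"increases_activity": {"12345678", "23456789"},
#                          "decreases_expression": {"34567890"}}}
#
# One chemical-gene pair can carry several interaction types, each with its
# own PMID evidence set.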
def parseMESH(inFiles):
    """
    Creates the set of 'valid' MeSH IDs (versions can conflict)
    Input is an array of two files: d201*.bin and c201*.bin
    """
    meshIDs = set()
    for filename in inFiles:
        logger.info(f"Working on {filename}")
        myFile = parse_it.handle(filename)
        parse_it.getTerm(myFile)
        # Breaks when the returned term is empty, indicating end of file
        while True:
            # for descriptors
            term = parse_it.parseMeshValue(parse_it.getTerm(myFile))
            if len(term) != 0:
                termID = term['UI'][0]
                meshIDs.add(termID)
            else:
                break
    logger.info(f"I found {len(meshIDs)} Mesh IDs")
    return meshIDs
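# Hypothetical pipeline sketch (file names follow the docstring patterns; the
# MEDIC file name is an assumption): parseMESH runs first because its ID set
# gates which MEDIC terms parseMEDIC keeps.
#
#   meshIDs = parseMESH(["d2016.bin", "c2016.bin"])
#   medic = parseMEDIC("CTD_diseases.obo", meshIDs)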
                edgeType = "xref"
                if linkType in link2id.keys():
                    outNodeID = nodeOut.split(":")[1]
                    linkTypes.add(linkType)
                    edges.append(
                        ("xref", str(link2id[linkType] + ":" + outNodeID)))
            elif line.startswith("is_a"):
                nodeOut = line.split(" ")[1]
                linkType = nodeOut.split(":")[0].lower()
                if linkType in link2id.keys():
                    outNodeID = nodeOut.split(":")[1]
                    linkTypes.add(linkType)
                    edges.append(("is_a", nodeOut))
            elif line.startswith("is_obsolete"):
                obsolete = True
        self.nodeDict[nodeID] = {
            "name": name,
            "synonyms": list(synonyms),
            "edges": edges,
            "labels": labels
        }
        return self.nodeDict


if __name__ == "__main__":
    mFile = "docker/data/mondo/mondo.obo"
    parser = Parser()
    parser.parse(handle(mFile))
def geneParser(gene_info, gene2go, mimgen, mondo, namespace):
    """
    Parser function for the NCBI gene file
    """
    # EDGES REQUIRE OTHER FILES, all are XREF
    # python oboParser.py
    #   -i /Users/rlinchan/Google\ Drive/PhD/dissert/data/EntrezGene_01042017/gene_info
    #   -c /Users/rlinchan/Google\ Drive/PhD/dissert/data/EntrezGene_01042017/gene2go
    #   -o /Users/rlinchan/Google\ Drive/PhD/dissert/data/neo4j_input/v3/
    #   -n nih.nlm.ncbi.gene
    organisms = {
        "9606": "Homo sapiens",
        "10116": "Rattus norvegicus",
        "10090": "Mus musculus"
    }

    def form(item):
        """
        Removes quotation marks and semicolons from items
        """
        for a in "'\";":
            item = item.replace(a, "")
        return item

    geneDict = defaultdict(dict)
    with handle(gene_info) as infile:
        reader = csv.DictReader(infile, delimiter='\t')
        for row in reader:
            idee = row["#tax_id"]
            # Homo sapiens, Rattus norvegicus, Mus musculus
            if idee in organisms.keys():
                geneID = f"{namespace}.id:{row['GeneID']}"
                symbol_list = form(row["Symbol"]).split("|")
                symbol = symbol_list[0]
                syn = set(row["Synonyms"].split("|"))
                syn.update(symbol_list)
                geneType = form(row["type_of_gene"])
                organism = organisms[idee]
                syn.update(form(row["description"]).split("|"))
                if "-" in syn:
                    syn.remove("-")
                geneDict[geneID] = {
                    'name': symbol,
                    'gene_type': geneType,
                    'organism': organism,
                    'synonyms': list(syn),
                    'parents': set(),
                    'edges': [],
                    'labels': set(['Gene', namespace.replace('.', '_')])
                }
    logger.info("Nodes Dict created")
    # The PubMed ID column could also be captured here
    with handle(gene2go) as edgeFile:
        reader = csv.DictReader(edgeFile, delimiter='\t')
        for row in reader:
            idee = row["#tax_id"]
            if idee in organisms.keys():
                geneID = f"{namespace}.id:{row['GeneID']}"
                goID = row["GO_ID"]
                geneDict[geneID]['edges'].append(('xref', goID))
    logger.info(f"GO edges added for {len(geneDict)} genes")
    omim2gene, gene2omim, omim2mondo = converter(mimgen, mondo, namespace)
    # Find OMIM-associated phenotypes
    matches = 0
    for g in geneDict.keys():
        if g in gene2omim.keys():
            for oid in gene2omim[g]:
                if oid in omim2mondo.keys():
                    matches += 1
                    geneDict[g]['edges'].append(
                        ('associated_with', omim2mondo[oid]))
    logger.info(f"{matches} Gene to Phenotype associations")
    return geneDict
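# Illustrative shape of one geneDict entry (values invented; GeneID 4340/MOG
# chosen arbitrarily):
#
#   geneDict["nih.nlm.ncbi.gene.id:4340"] == {
#       "name": "MOG",
#       "gene_type": "protein-coding",
#       "organism": "Homo sapiens",
#       "synonyms": ["MOG", "myelin oligodendrocyte glycoprotein"],
#       "parents": set(),
#       "edges": [("xref", "GO:0019911"),
#                 ("associated_with", "MONDO:0005301")],
#       "labels": {"Gene", "nih_nlm_ncbi_gene"}}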
def main():
    args = readArgs()
    INFILE = args.infiles
    OUTPUT_DIRECTORY = args.outdir
    NAMESPACE = args.namespace
    # NAMESPACE is the UID prefix for: gene, taxonomy, omim, nal

    # MeSH parsing
    if NAMESPACE == "nih.nlm.mesh":
        logger.info("Parsing mesh files...")
        nodeOut = open(
            path.join(OUTPUT_DIRECTORY, f"{NAMESPACE}.nodes.csv"), "w")
        edgeOut = open(
            path.join(OUTPUT_DIRECTORY, f"{NAMESPACE}.edges.csv"), "w")
        parser = mesh.Parser(nodeOut, edgeOut)
        for F in INFILE:
            parser.read(F)
        parser.writeEdges()
        nodeOut.close()
        edgeOut.close()
        sys.exit(0)
    # NCBI Taxonomy parsing
    elif NAMESPACE == "nih.nlm.ncbi.taxonomy":
        # names.dmp holds the nodes, nodes.dmp the edges
        logger.info('Parsing NCBI Taxonomy files...')
        for F in INFILE:
            if F.endswith("names.dmp"):
                f1 = F
            else:
                f2 = F
        outDict = taxParser(f1, f2, NAMESPACE)
    # OMIM parsing
    elif NAMESPACE == "omim.disease":
        logger.info('Parsing omim files...')
        for F in INFILE:
            if F.endswith("genemap2.txt"):
                f1 = F
            else:
                f2 = F
        outDict = omim_parser(f1, f2, NAMESPACE)
    # NCBI Gene parsing
    elif NAMESPACE == "nih.nlm.ncbi.gene":
        logger.info('Parsing NCBI gene files...')
        for F in INFILE:
            if "gene_info" in F:
                gene_info = F
            elif "gene2go" in F:
                gene2go = F
            elif "mim2gene" in F:
                mimgen = F
            elif "mondo" in F:
                mon = F
            else:
                logger.error(
                    f"I'm not sure what to do with this file!\n file:{F}")
        outDict = geneParser(gene_info, gene2go, mimgen, mon, NAMESPACE)
    # NALT parsing
    elif NAMESPACE == "usda.nal.thesaurus":
        logger.info('Parsing NAL files...')
        parser = nal.Parser()
        outDict = parser.parse(handle(INFILE[0]))
    # CTD parsing
    elif NAMESPACE == "ctd":
        for F in INFILE:
            ct = ctd.Parser(F, OUTPUT_DIRECTORY)
            ct.parse()
        sys.exit(0)
    # Parse the mondo obo file
    elif NAMESPACE == "mondo":
        logger.info("Parsing mondo ontology...")
        parser = mondo.Parser()
        outDict = parser.parse(handle(INFILE[0]))
    # Parse a different obo file
    else:
        logger.info(f"Parsing {NAMESPACE}: {INFILE[0]}...")
        outDict = oboParser(INFILE[0], NAMESPACE)
    logger.info(f"Writing to:\t{OUTPUT_DIRECTORY}")
    csvNodeOutput(outDict, OUTPUT_DIRECTORY, NAMESPACE)
def tsvParser(tsv_File, new_rel_dict):
    """
    Input: Linguamatics TSV result files

    Each TSV file should follow the format
    (object, predicate, subject) --> (col 1-3, col 4, col 5-7)
    followed by hit text (col 7) and docID (col 10)

    Returns a relationship dictionary of the format
    {predicate: {(subjID, objID):
        [[objectTerm, subjectTerm, hitText, doc, location], ...]}}
    """
    # The file used has the object and subject reversed:
    # object (col 5-7), predicate (col 4), subject (col 1-3),
    # docID (col 12), hit text (col 16)
    verb_endings = ('s', 'es', 'er', 'e', 'ed', 't', 'ur', 'rn', 'ng', 'ing',
                    'fy', 'in', 'nd', 'ck', 'ic', 'ad', 'ld', 'ar', 'lp',
                    'ow', 'en', 'ish', 'ply', 'ize', 'or', 'ts', 'rd', 'ry',
                    'rk', 'ir', 'rm')
    rel_Dict = defaultdict(dict)
    tags = []
    unfiltered_preds = set()  # set of raw predicates
    filtered_preds = set()
    langs = set()  # languages seen in the article hits
    # Possible languages from Agricola
    langCodes = set([
        "spa", "fre", "rus", "swe", "chi", "jpn", "afr", "ger", "tur", "por",
        "ita", "nor", "ind", "cat", "fin", "dan"
    ])
    otherLang = 0
    unknown_location = False
    n = 2
    with handle(tsv_File) as i:
        reader = csv.DictReader(i, delimiter='\t')
        for row in reader:
            hit = row["Hit"].replace("'", '').replace('|', '').replace('"', '')
            hit_language = hit.split("... ")[-1]
            subjID = convertID(row["[SNID] Subject"], n)
            objID = convertID(row["[SNID] Object"], n)
            if hit_language.lower() in langCodes:
                # Exclude non-English hits
                langs.add(hit_language)
                continue
            elif subjID is None or objID is None:
                # Skip results with blank ids
                continue
            else:
                objectTerm = (row["[PT] Object"], row["Object"])
                subjectTerm = (row["[PT] Subject"], row["Subject"])
                try:
                    location = row["Location"]
                except KeyError:
                    if "abstract" in tsv_File.lower():
                        location = "abstract"
                    elif "title" in tsv_File.lower():
                        location = "title"
                    else:
                        unknown_location = True
                        location = "unknown"
                doc = row["Doc"]
                predicatePhrase = row["Predicate"]
                # Add to the overall, unfiltered set of predicates
                unfiltered_preds.add(predicatePhrase)
                # Filter the predicates
                predicate, tagged_text = filterPredicates(predicatePhrase)
                # Record the filtered predicate
                filtered_preds.add(predicate)
                tags.append(tagged_text)
                if predicate:
                    preds = [rel["predicate"]
                             for rel in new_rel_dict[(subjID, objID)]]
                    if predicate not in preds:
                        new_rel_dict[(subjID, objID)].append({
                            "predicate": predicate,
                            "pmid": [doc],
                            "file": set([tsv_File])
                        })
                    else:
                        ind = preds.index(predicate)
                        new_rel_dict[(subjID, objID)][ind]["pmid"].append(doc)
                        new_rel_dict[(subjID, objID)][ind]["file"].add(tsv_File)
                    if (subjID, objID) not in rel_Dict[predicate]:
                        rel_Dict[predicate][(subjID, objID)] = [[
                            objectTerm, subjectTerm, predicatePhrase, hit,
                            doc, location
                        ]]
                    else:
                        rel_Dict[predicate][(subjID, objID)].append([
                            objectTerm, subjectTerm, predicatePhrase, hit,
                            doc, location
                        ])
            n += 1
    if unknown_location:
        logging.warning(
            "Location of text mining hit could not be determined; "
            "set to unknown")
    return rel_Dict, langs, unfiltered_preds, filtered_preds, new_rel_dict
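# Illustrative shapes of the two returned relationship structures (IDs,
# predicate, and document values invented for illustration):
#
#   rel_Dict["inhibits"][(subjID, objID)] = [
#       [("caffeine", "Caffeine"), ("germination", "seed germination"),
#        "strongly inhibits", "...hit text...", "agricola:12345", "abstract"]]
#
#   new_rel_dict[(subjID, objID)] = [
#       {"predicate": "inhibits", "pmid": ["agricola:12345"],
#        "file": {"results_abstract.tsv"}}]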
    Some nodes have multiple parents
    """
    edges = []
    multiParent = 0
    for name in props["children"]:
        edges.append(("has_child", self.name2id[name]))
    for name in props["parents"]:
        parentID = self.name2id[name]
        if multiParent == 0:
            # first parent: move the node already in the tree
            self.tree.move_node(nodeID, parentID)
        else:
            # further parents: a tree allows only one parent per node,
            # so add a suffixed duplicate node under each extra parent
            self.tree.create_node(
                tag=props["name"],
                identifier=nodeID + ".{}".format(multiParent),
                parent=parentID)
        edges.append(("is_a", parentID))
        multiParent += 1
    return edges


def extractElem(node, tag):
    """
    Extracts an element's text values from an XML node
    """
    elemSet = set()
    for elem in node.findall(tag):
        elemSet.add(elem.text)
    return elemSet


if __name__ == "__main__":
    parser = Parser()
    mydict = parser.parse(handle("docker/data/nal/NAL_Thesaurus_2019_XML.zip"))
    print(mydict)
    #sub = parser.tree.subtree("usda.nal.thesaurus.id:858")
    #print(sub.all_nodes())
def readTSV(filename, entrez_gene, ctd_gene, writeFile, meshIDs, goldStd):
    """
    Reads all other CTD files listed in the methods section

    Exceptions:
    OK      CTD_Disease-GO_biological_process_associations.tsv  MESH 2 GO  FLIP edge direction
    OK      CTD_Disease-GO_cellular_component_associations.tsv  MESH 2 GO  FLIP edge direction
    OK      CTD_Disease-GO_molecular_function_associations.tsv  MESH 2 GO  FLIP edge direction
    OK      CTD_chem_go_enriched.tsv (4,629,998)                MESH 2 GO
    NA      CTD_chem_pathways_enriched.tsv                      MESH 2 KEGG,REACT
    NA      CTD_diseases_pathways.tsv  FLIP edge direction      MESH: or OMIM: to KEGG,REACT
    NA      CTD_genes_pathways.tsv                              NCBI GENE ID to KEGG,REACT
    LARGE, MODIFY  CTD_chemicals_diseases.tsv (4,338,326)       MESH 2 MESH:,OMIM:
    MODIFY  CTD_genes_diseases.tsv (46,613,048)                 NCBI GENE ID 2 MESH:,OMIM:
    """
    logger.info(f"Parsing {filename}")
    idArray = []
    source = 'Comparative Toxicogenomics Database'
    relType = 'xref'
    sym2id = {}
    humanSet = set()
    # noPMIDDict = {startID: set(endID)}
    noPMIDDict = collections.defaultdict(set)
    gene2goDict = collections.defaultdict(set)
    mod = 0
    checkBoth = 0
    goFilter = 0
    inferredCount = 0
    directCount = 0
    if 'Disease' in filename:
        # flag to flip IDs for the direction of the relationship
        mod = 1
        sym2id, humanSet = parseGenes(ctd_gene, entrez_gene)  # symbol-to-id map
    if 'chemicals' in filename:
        checkBoth = 1
    if 'genes_diseases' in filename:
        sym2id, humanSet = parseGenes(ctd_gene, entrez_gene)  # symbol-to-id map
    if 'chem_go' in filename:
        goFilter = 1
    with parse_it.handle(filename) as inFile:
        for line in inFile:
            if line.startswith('# Fields:'):
                line = next(inFile)
                header = line[2:].strip().split('\t')
                # define columns by the header line
                for column in header:
                    if column.endswith('ID'):       # 1,4  arrayPos = 0,1
                        idArray.append(header.index(column))
                    if column == 'OmimIDs':         # 8    arrayPos = 3
                        idArray.append(header.index(column))
                    if column == 'PubMedIDs':       # 9    arrayPos = 4
                        idArray.append(header.index(column))
                    if column == 'DirectEvidence':  # 5    arrayPos = 2
                        idArray.append(header.index(column))
            elif line.startswith('#'):
                # skip the file header
                continue
            else:
                # associations
                line = line.strip().split('\t')
                startID = ''
                endID = ''
                ontology = ''
                geneID = []
                if mod == 0:
                    # retain the original order
                    startID = line[idArray[0]]
                    endID = line[idArray[1]]
                    if goFilter == 1:
                        ontology = line[3]
                        # only include biological processes for CTD_chem_go
                        if ontology != 'Biological Process':
                            continue
                if mod == 1:
                    # flip the order, CTD_Disease-GO*.tsv
                    startID = line[idArray[1]]  # goID
                    endID = line[idArray[0]]    # diseaseID
                    geneSym = line[5].split('|')  # gene symbols
                    for gene in geneSym:
                        if gene in sym2id:
                            geneID.append(sym2id[gene])
                # account for non-prefixed omim
                if endID.isdigit():
                    endID = 'omim.disease.id:' + endID
                if len(idArray) == 2:
                    # all other files
                    if startID in meshIDs or endID in meshIDs:  # does this ignore OMIM?
                        noPMIDDict[startID].add(endID)
                        for gene in geneID:
                            # add gene-to-GO edges, filtered to human genes
                            if gene in humanSet:
                                noPMIDDict['nih.nlm.ncbi.gene.id:' +
                                           gene].add(startID)
                elif len(idArray) == 5:
                    # chemicals_diseases.tsv = MESH to MESH: or OMIM:
                    # genes_diseases.tsv = Entrez ID to MESH: or OMIM:
                    # idArray = [0:ID, 1:ID, 2:DirectEvidence, 3:OMIM, 4:PubMedIDs]
                    omim = line[idArray[3]].strip().split('|')
                    evidence = line[idArray[2]]
                    if evidence != '':
                        directCount += 1
                    else:
                        inferredCount += 1
                    # PMID may be empty, must check;
                    # occurrence defaults to 1 when there is no pmid evidence
                    pmids = ''
                    occurrence = '1'
                    # +1 because the index is 0-based
                    if len(line) == idArray[4] + 1:
                        pmids = line[idArray[4]].strip().split('|')
                        # occurrence based on citation evidence
                        occurrence = str(len(pmids))
                    if startID.isdigit():
                        # convert to a prefixed gene ID
                        startID = 'nih.nlm.ncbi.gene.id:' + startID
                    if 'MESH' in endID:
                        endID = endID.split(':')[-1]
                    if 'OMIM' in endID:
                        endID = 'omim.disease.id:' + endID.split(':')[-1]
                    if checkBoth == 1:
                        # chemicals_diseases.tsv
                        if startID in meshIDs and endID in meshIDs:
                            lineOut = '%s|%s|%s|%s|%s|%s\n' % (
                                startID, relType, source, occurrence,
                                ';'.join(pmids), endID)
                            if goldStd == 1 and evidence != '':
                                lineOut = '%s\t%s\n' % (startID, endID)
                                writeFile.write(lineOut)
                            if goldStd == 0:
                                writeFile.write(lineOut)
                            # when omimIDs exist, create a chemical-to-omim
                            # relationship
                            if omim[0] != '':
                                omimIDs = set(omim)
                                for mim in omimIDs:
                                    mimOut = '%s|%s|%s|%s|%s|%s\n' % (
                                        startID, relType, source, occurrence,
                                        ';'.join(pmids),
                                        'omim.disease.id:' + mim)
                                    if goldStd == 1 and evidence != '':
                                        writeFile.write(mimOut)
                                    if goldStd == 0:
                                        writeFile.write(mimOut)
                    elif checkBoth == 0 and endID in meshIDs:
                        # genes_diseases.tsv
                        geneID = startID.split(':')[1]
                        if geneID in humanSet:
                            lineOut = '%s|%s|%s|%s|%s|%s\n' % (
                                startID, relType, source, occurrence,
                                ';'.join(pmids), endID)
                            writeFile.write(lineOut)
                            # when omimIDs exist, create a gene-to-omim
                            # relationship
                            if omim[0] != '':
                                omimIDs = set(omim)
                                for mim in omimIDs:
                                    mimOut = '%s|%s|%s|%s|%s|%s\n' % (
                                        startID, relType, source, occurrence,
                                        ';'.join(pmids),
                                        'omim.disease.id:' + mim)
                                    writeFile.write(mimOut)
    if len(idArray) == 2:
        return noPMIDDict
    else:
        logger.info(
            f"Direct Associations:{directCount}\n"
            f"Inferred Associations:{inferredCount}")
        writeFile.flush()
        writeFile.close()
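# Hypothetical call sketch (paths and the goldStd flag value are
# illustrative): files with only two ID columns return an in-memory dict,
# while the large *_diseases.tsv files stream straight to writeFile and
# return None.
#
#   meshIDs = parseMESH(["d2016.bin", "c2016.bin"])
#   out = open("ctd_chem_disease.edges.csv", "w")
#   readTSV("CTD_chemicals_diseases.tsv", "gene_info", "CTD_genes.tsv",
#           out, meshIDs, 0)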
if __name__ == "__main__":
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format='%(asctime)s [%(funcName)s] %(levelname)s - %(message)s',
        datefmt='%d-%b-%y %H:%M:%S',
        level=logging.DEBUG)
    BASE_DIRECTORY = "docker/data"
    GENE_FILE = f"{BASE_DIRECTORY}/ncbi_gene/mim2gene_medgen.gz"
    MONDO_FILE = f"{BASE_DIRECTORY}/mondo/mondo.obo"
    CONVERTER_DIRECTORY = f"{BASE_DIRECTORY}/converter"
    omim2gene = defaultdict(set)   # OMIM gene entry -> GeneID(s)
    gene2omim = defaultdict(set)   # GeneID -> OMIM phenotype entries
    omim2mondo = defaultdict(set)  # OMIM id -> MONDO id
    with handle(GENE_FILE) as genes:
        rows = csv.DictReader(genes, delimiter="\t")
        for row in rows:
            MID = row["#MIM number"]
            GID = row["GeneID"]
            if GID != "-":
                if row["type"] == "gene":
                    omim2gene[MID].add(GID)
                elif row["type"] == "phenotype":
                    gene2omim[GID].add(MID)
    with handle(MONDO_FILE) as mf:
        omim_id = []
        mondo_id = ""
        is_obsolete = False
        for line in mf:
            if line.startswith("[Term]"):