Ejemplo n.º 1
0
def _record_mondo_xrefs(omim2mondo, mondo_id, omim_ids, is_obsolete):
    """Map every collected OMIM number to mondo_id unless the term is obsolete."""
    if mondo_id and omim_ids and not is_obsolete:
        for om in omim_ids:
            omim2mondo[om] = mondo_id


def converter(GENE_FILE, MONDO_FILE, namespace):
    """
    Build OMIM / gene / MONDO lookup tables.

    GENE_FILE is read as a tab-separated table with the columns
    "#MIM number", "GeneID" and "type"; rows whose GeneID is "-" are skipped.
    MONDO_FILE is read line by line as obo text: for every non-obsolete
    [Term] stanza, each "xref: OMIM..." entry is mapped to the stanza's id.

    Returns a tuple (omim2gene, gene2omim, omim2mondo):
        omim2gene  -- MIM number -> set of "<namespace>.id:<GeneID>"
                      (rows of type "gene")
        gene2omim  -- "<namespace>.id:<GeneID>" -> set of MIM numbers
                      (rows of type "phenotype")
        omim2mondo -- OMIM number -> MONDO id, stored as a single string
    """
    omim2gene = defaultdict(set)  # Gene to gene
    gene2omim = defaultdict(set)  # Gene to Phene
    # Values are plain MONDO id strings; the defaultdict type is kept so the
    # returned object behaves as before for existing callers.
    omim2mondo = defaultdict(set)  # Phene to Phene
    with handle(GENE_FILE) as genes:
        for row in csv.DictReader(genes, delimiter="\t"):
            MID = row["#MIM number"]
            GID = row["GeneID"]
            if GID == "-":
                continue
            if row["type"] == "gene":
                omim2gene[MID].add(f"{namespace}.id:{GID}")
            elif row["type"] == "phenotype":
                gene2omim[f"{namespace}.id:{GID}"].add(MID)
    with handle(MONDO_FILE) as mf:
        in_term = False
        mondo_id = ""
        omim_ids = []
        is_obsolete = False
        for line in mf:
            if line.startswith("["):
                # A new stanza begins: store the finished term, then reset the
                # state unconditionally so nothing leaks into the next stanza.
                if in_term:
                    _record_mondo_xrefs(omim2mondo, mondo_id, omim_ids,
                                        is_obsolete)
                in_term = line.startswith("[Term]")
                mondo_id = ""
                omim_ids = []
                is_obsolete = False
            elif not in_term:
                # Header lines and non-[Term] stanzas carry no term data.
                continue
            elif line.startswith("id:"):
                mondo_id = line.split("id:", 1)[1].strip()
            elif line.startswith("xref: OMIM"):
                # Whitespace split so an xref without a trailing annotation
                # does not keep its newline.
                omim_ids.append(line.split()[1].split(":")[1])
            elif line.startswith("is_obsolete"):
                is_obsolete = True
        # The last stanza has no following header to trigger the flush.
        if in_term:
            _record_mondo_xrefs(omim2mondo, mondo_id, omim_ids, is_obsolete)
    return omim2gene, gene2omim, omim2mondo
Ejemplo n.º 2
0
def parseGenes(ctdGenesFile, entrez_gene):
    """
    Build the gene lookups used when reading CTD association files.

    entrez_gene is read as a tab-separated table; the "GeneID" of every row
    whose "#tax_id" is "9606" is collected.
    ctdGenesFile is read as tab-separated text; lines starting with '#' and
    blank lines are skipped, column 0 is the gene symbol and column 2 the
    gene id. The first id seen for a symbol wins.

    Returns (sym2idDict, humanSet):
        sym2idDict -- gene symbol -> gene id
        humanSet   -- set of GeneID strings with tax id 9606
    """
    logger.info('Parsing human genes file')
    with parse_it.handle(entrez_gene) as geneFile:
        reader = csv.DictReader(geneFile, delimiter="\t")
        # Every row is scanned, so the result does not depend on the human
        # rows being contiguous in the file.
        humanSet = {
            row["GeneID"] for row in reader if row["#tax_id"] == "9606"
        }
    logger.info(f"I found {len(humanSet)} human genes")
    sym2idDict = {}
    with parse_it.handle(ctdGenesFile) as inFile:
        for line in inFile:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.rstrip('\r\n').split('\t')
            # setdefault keeps the first id encountered for a symbol.
            sym2idDict.setdefault(fields[0], fields[2])
    return sym2idDict, humanSet
Ejemplo n.º 3
0
 def parse(self):
     """
     Stream the instance's input file and emit nodes and relationships.

     Rows without a recognised predicate are ignored. Whenever the node
     pair of the current row differs from the previously stored
     (non-empty) one, the accumulated data is written out first. After the
     last row the remaining data is written and the output file is closed.
     """
     with handle(self.file) as infile:
         for row in csv.DictReader(decomment(infile), delimiter="\t"):
             predicate = self.__check_predicate_exists(row)
             if not predicate:
                 continue
             current = self.parse_nodes(row)
             # Flush only when leaving a previously stored, non-empty pair.
             if self.nodes != current and self.nodes:
                 self.write_out()
             self.nodes = current
             # A missing or empty PubMedIDs column means "no evidence".
             pubmed = row.get("PubMedIDs") or ""
             self.parse_relationship(predicate, pubmed.replace("|", ";"))
     self.write_out()
     self.outfile.close()
Ejemplo n.º 4
0
def taxParser(tsvFile, nodesFile, namespace):
    """
    Parses names.dmp and nodes.dmp files from NCBI Taxonomy to generate nodes and edges for Neo4J
    python oboParser.py -i NCBI_taxdump_9.29.16/names.dmp -c NCBI_taxdump_9.29.16/nodes.dmp -o neo4j_input/ -n nih.nlm.ncbi.taxonomy

    Both files are read as '|'-separated text with double quotes removed.
    A path containing 'names.dmp' contributes names and synonyms
    (column 0 = tax id, column 1 = name, column 3 = name class); a path
    containing 'nodes.dmp' contributes ('is_a', parent) edges
    (column 0 = tax id, column 1 = parent tax id).

    Returns a dict of "<namespace>.id:<taxid>" -> node record for every node
    collected by addLabel starting from tax id 3193; each returned record
    gets the extra label 'Plant'.
    """
    id_prefix = namespace + '.id:'
    taxDict = defaultdict(dict)
    taxChildrenTree = defaultdict(set)
    for dmp in (tsvFile, nodesFile):
        # The role of a file is decided by its path, once per file.
        is_names = 'names.dmp' in dmp
        is_nodes = 'nodes.dmp' in dmp
        with handle(dmp) as inFile:
            for line in inFile:
                columns = [c.strip() for c in line.replace('"', '').split('|')]
                taxID = id_prefix + columns[0]
                if is_names:
                    name = columns[1]
                    if taxID not in taxDict:
                        # The first name seen is the provisional preferred
                        # name until a 'scientific name' row replaces it.
                        taxDict[taxID] = {
                            'name': name,
                            'synonyms': [name],
                            'edges': [],
                            'labels': {'Organism',
                                       namespace.replace('.', '_')},
                        }
                    else:
                        taxDict[taxID]['synonyms'].append(name)
                        if columns[3] == 'scientific name':
                            taxDict[taxID]['name'] = name
                if is_nodes:
                    parentID = id_prefix + columns[1]
                    taxDict[taxID]['edges'].append(('is_a', parentID))
                    taxChildrenTree[parentID].add(taxID)
    # Collect the descendants of tax id 3193 (Embryophyta) to label as plant.
    # The root id is built from the namespace so it matches the tree keys.
    outDict = defaultdict(dict)
    childSet = set()
    addLabel(taxChildrenTree, childSet, id_prefix + '3193')
    for child in childSet:
        outDict[child] = taxDict[child]
        outDict[child]['labels'].add('Plant')
    logger.info(
        f"""{len(taxDict)} total nodes\nwith {len(outDict)} labeled nodes """)
    return outDict
Ejemplo n.º 5
0
def parseMEDIC(inFile, meshIDs):
    """
    Borrows obo parsing code from bareSourceParser.py to parse the MEDIC ontology
    MEDIC maps MeSH Diseases to OMIM Diseases

    For every term whose id (the part after the last ':') is in meshIDs and
    that has an 'alt_id', the first alt_id is recorded as
    'omim.disease.id:<n>'.

    Returns assocDict = {termID: set(mimID)}.
    """
    assocDict = collections.defaultdict(set)
    # The context manager guarantees the file is closed, also on errors.
    with parse_it.handle(inFile) as myFile:
        # The first stanza is read and discarded (presumably the file
        # header -- confirm against parse_it.getTerm).
        parse_it.getTerm(myFile)
        while True:
            term = parse_it.parseTagValue(parse_it.getTerm(myFile))
            # An empty term signals the end of the file.
            if not term:
                break
            idParts = term['id'][0].split(':')
            # Ids without a prefix belong to TYPEDEF stanzas; ignore them.
            if len(idParts) < 2:
                continue
            termID = idParts[-1]
            if 'alt_id' in term and termID in meshIDs:
                mimID = 'omim.disease.id:' + term['alt_id'][0].split(':')[-1]
                assocDict[termID].add(mimID)
    logger.info(len(assocDict))
    return assocDict
Ejemplo n.º 6
0
def readIxns(filename):
    """
    CTD File is TSV, has #Fields:\n#Field1\tField2\tetc.
    This method parses unique file:CTD_chem_gene_ixns.tsv

    Lines starting with '#' are skipped and only rows whose column 7
    (organism id) is '9606' are kept. Column 1 is the start id, column 4 the
    end id, column 9 the '|'-separated interactions ('^' replaced by '_')
    and column 10 the '|'-separated PubMed ids.

    Returns assocDict = {(startID,endID):{interaction:set(pmid)}}
    """
    assocDict = collections.defaultdict(dict)
    with parse_it.handle(filename) as inFile:
        for line in inFile:
            if line.startswith('#'):
                continue
            fields = line.split('\t')
            # will always be one organism, based on GeneID
            if fields[7] != '9606':
                continue
            interactions = set(fields[9].strip().replace('^', '_').split('|'))
            pmids = set(fields[10].strip().split('|'))
            ixns = assocDict[(fields[1], fields[4])]
            for ixn in interactions:
                # Each interaction owns its own set: sharing one set object
                # between interactions would let a later update on one of
                # them silently change the others.
                ixns.setdefault(ixn, set()).update(pmids)
    return assocDict
Ejemplo n.º 7
0
def parseMESH(inFiles):
    """
    Method creates set of 'valid' MESH IDs (conflicting versions)
    Input is array of 2, d201*.bin and c201*.bin

    Returns the set of the first 'UI' value of every term read from the
    given files.
    """
    meshIDs = set()
    for filename in inFiles:
        logger.info(f"Working on {filename}")
        # The context manager guarantees each file is closed, also on errors.
        with parse_it.handle(filename) as myFile:
            # The first stanza is read and discarded (presumably a header
            # -- confirm against parse_it.getTerm).
            parse_it.getTerm(myFile)
            while True:
                term = parse_it.parseMeshValue(parse_it.getTerm(myFile))
                # An empty term signals the end of the file.
                if not term:
                    break
                meshIDs.add(term['UI'][0])
    logger.info(f"I found {len(meshIDs)} Mesh IDs")
    return meshIDs
Ejemplo n.º 8
0
                edgeType = "xref"
                #predicate = line.split('\"')
                if linkType in link2id.keys():
                    outNodeID = nodeOut.split(":")[1]
                    linkTypes.add(linkType)
                    edges.append(
                        ("xref", str(link2id[linkType] + ":" + outNodeID)))
            elif line.startswith("is_a"):
                nodeOut = line.split(" ")[1]
                linkType = nodeOut.split(":")[0].lower()
                if linkType in link2id.keys():
                    outNodeID = nodeOut.split(":")[1]
                    linkTypes.add(linkType)
                    edges.append(("is_a", nodeOut))
            elif line.startswith("is_obsolete"):
                obsolete = True
        self.nodeDict[nodeID] = {
            "name": name,
            "synonyms": list(synonyms),
            "edges": edges,
            "labels": labels
        }

        return self.nodeDict


if __name__ == "__main__":
    # Ad-hoc run: feed the bundled MONDO obo file to a fresh Parser.
    mondo_obo = "docker/data/mondo/mondo.obo"
    Parser().parse(handle(mondo_obo))
Ejemplo n.º 9
0
def geneParser(gene_info, gene2go, mimgen, mondo, namespace):
    """
    Parser function for ncbi gene file

    gene_info and gene2go are read as tab-separated tables; only rows whose
    "#tax_id" is one of the supported organisms (9606, 10116, 10090) are
    used. gene_info supplies the nodes, gene2go adds ('xref', GO_ID) edges,
    and the tables produced by converter(mimgen, mondo, namespace) add
    ('associated_with', <MONDO id>) edges for genes with OMIM phenotypes.

    Returns a dict of "<namespace>.id:<GeneID>" -> node record with the keys
    'name', 'gene_type', 'organism', 'synonyms', 'parents', 'edges' and
    'labels'.
    """
    # EDGES REQUIRE OTHER FILES, all are XREF
    # python oboParser.py -i /Users/rlinchan/Google\
    # Drive/PhD/dissert/data/EntrezGene_01042017/gene_info -c
    # /Users/rlinchan/Google\
    # Drive/PhD/dissert/data/EntrezGene_01042017/gene2go -o
    # /Users/rlinchan/Google\ Drive/PhD/dissert/data/neo4j_input/v3/ -n
    # nih.nlm.ncbi.gene

    # tax id -> binomial species name stored on every gene node
    organisms = {
        "9606": "Homo sapiens",
        "10116": "Rattus norvegicus",
        "10090": "Mus musculus",
    }
    strip_table = str.maketrans("", "", "'\";")

    def form(item):
        """
        Removes quotation marks and ; from items
        """
        return item.translate(strip_table)

    geneDict = defaultdict(dict)
    with handle(gene_info) as infile:
        for row in csv.DictReader(infile, delimiter='\t'):
            idee = row["#tax_id"]
            if idee not in organisms:
                continue
            geneID = f"{namespace}.id:{row['GeneID']}"
            symbol_list = form(row["Symbol"]).split("|")
            syn = set(row["Synonyms"].split("|"))
            syn.update(symbol_list)
            syn.update(form(row["description"]).split("|"))
            # "-" is the placeholder for an empty field in the input table.
            syn.discard("-")
            geneDict[geneID] = {
                'name': symbol_list[0],
                "gene_type": form(row["type_of_gene"]),
                "organism": organisms[idee],
                'synonyms': list(syn),
                'parents': set(),
                'edges': [],
                'labels': {'Gene', namespace.replace('.', '_')},
            }
    logger.info("Nodes Dict created")
    # Can also take down PubMedId
    edge_count = 0
    with handle(gene2go) as edgeFile:
        for row in csv.DictReader(edgeFile, delimiter='\t'):
            if row["#tax_id"] not in organisms:
                continue
            # .get() avoids creating an empty record for a gene that is in
            # gene2go but was never seen in gene_info; such rows are skipped.
            record = geneDict.get(f"{namespace}.id:{row['GeneID']}")
            if record is None:
                continue
            record['edges'].append(('xref', row["GO_ID"]))
            edge_count += 1
    logger.info(f"{edge_count} edges added to Dict")

    omim2gene, gene2omim, omim2mondo = converter(mimgen, mondo, namespace)
    # Find OMIM associated phenotypes
    matches = 0
    for g, record in geneDict.items():
        for oid in gene2omim.get(g, ()):
            if oid in omim2mondo:
                matches += 1
                record['edges'].append(('associated_with', omim2mondo[oid]))
    logger.info(f"{matches} Gene to Phenotype associations")
    return geneDict
Ejemplo n.º 10
0
def _pick_by_suffix(files, suffix):
    """Return (match, other): the last file ending with suffix and the last one that does not."""
    match = other = None
    for name in files:
        if name.endswith(suffix):
            match = name
        else:
            other = name
    return match, other


def _require_inputs(found, what):
    """Log an error and exit with status 1 when any expected input file is missing."""
    missing = [label for label, value in found.items() if value is None]
    if missing:
        logger.error(
            f"Missing input file(s) for {what}: {', '.join(missing)}")
        sys.exit(1)


def main():
    """
    Command line entry point: parse the input files for one namespace.

    The namespace given on the command line selects the parser. The
    "nih.nlm.mesh" and "ctd" namespaces write their own output and exit;
    every other namespace produces a node dictionary that is written with
    csvNodeOutput into the output directory.
    """
    args = readArgs()
    INFILE = args.infiles
    OUTPUT_DIRECTORY = args.outdir
    NAMESPACE = args.namespace
    # NAMESPACE is UID prefix for:
    # gene, taxonomy, omim, nal
    # MeSH parsing
    if NAMESPACE == "nih.nlm.mesh":
        logger.info("Parsing mesh files...")
        node_path = path.join(OUTPUT_DIRECTORY, f"{NAMESPACE}.nodes.csv")
        edge_path = path.join(OUTPUT_DIRECTORY, f"{NAMESPACE}.edges.csv")
        # Both outputs are closed even when a parser call raises.
        with open(node_path, "w") as nodeOut, open(edge_path, "w") as edgeOut:
            parser = mesh.Parser(nodeOut, edgeOut)
            for F in INFILE:
                parser.read(F)
            parser.writeEdges()
        sys.exit(0)
    # NCBI Taxonomy parsing
    elif NAMESPACE == "nih.nlm.ncbi.taxonomy":
        # The file that is not names.dmp is treated as nodes.dmp (edges)
        logger.info('Parsing NCBI Taxonomy files...')
        f1, f2 = _pick_by_suffix(INFILE, "names.dmp")
        _require_inputs({"names.dmp": f1, "nodes.dmp": f2}, NAMESPACE)
        outDict = taxParser(f1, f2, NAMESPACE)
    # OMIM parsing
    elif NAMESPACE == "omim.disease":
        logger.info('Parsing omim files...')
        f1, f2 = _pick_by_suffix(INFILE, "genemap2.txt")
        _require_inputs({"genemap2.txt": f1, "second input file": f2},
                        NAMESPACE)
        outDict = omim_parser(f1, f2, NAMESPACE)
    # NCBI Gene parsing
    elif NAMESPACE == "nih.nlm.ncbi.gene":
        logger.info('Parsing NCBI gene files...')
        # Keys are matched as substrings of the path, in this order.
        found = {
            "gene_info": None,
            "gene2go": None,
            "mim2gene": None,
            "mondo": None,
        }
        for F in INFILE:
            for key in found:
                if key in F:
                    found[key] = F
                    break
            else:
                logger.error(
                    f"I'm not sure what to do with this file!\n file:{F}")
        _require_inputs(found, NAMESPACE)
        outDict = geneParser(found["gene_info"], found["gene2go"],
                             found["mim2gene"], found["mondo"], NAMESPACE)
    # NALT parsing
    elif NAMESPACE == "usda.nal.thesaurus":
        logger.info('Parsing NAL files...')
        parser = nal.Parser()
        outDict = parser.parse(handle(INFILE[0]))
    # CTD parsing: each parser writes its own output files
    elif NAMESPACE == "ctd":
        for F in INFILE:
            ct = ctd.Parser(F, OUTPUT_DIRECTORY)
            ct.parse()
        # sys.exit rather than the interactive exit() helper, which is only
        # installed by the site module.
        sys.exit(0)
    # Parse the mondo obo file
    elif NAMESPACE == "mondo":
        logger.info("Parsing mondo ontology...")
        parser = mondo.Parser()
        with handle(INFILE[0]) as obo:
            outDict = parser.parse(obo)
    # Parse a different obo file
    else:
        logger.info(f"Parsing {NAMESPACE}: {INFILE[0]}...")
        outDict = oboParser(INFILE[0], NAMESPACE)

    logger.info(f"Writing to:\t{OUTPUT_DIRECTORY}")
    csvNodeOutput(outDict, OUTPUT_DIRECTORY, NAMESPACE)
Ejemplo n.º 11
0
def tsvParser(tsv_File, new_rel_dict):
    """
    Method input: Linguamatics TSV Result files

    The file is read as a tab-separated table with a header row; the columns
    used are "Hit", "[SNID] Subject", "[SNID] Object", "[PT] Object",
    "Object", "[PT] Subject", "Subject", "Doc", "Predicate" and, when
    present, "Location". Rows whose hit text ends in a known non-English
    language code and rows with a blank subject or object id are skipped.

    new_rel_dict is updated in place; it must return a list for unseen keys
    (e.g. defaultdict(list)). It maps (subjID, objID) to a list of
        {"predicate": ..., "pmid": [doc, ...], "file": set(files)}

    Method returns (rel_Dict, langs, unfiltered_preds, filtered_preds,
    new_rel_dict) where rel_Dict has the format
        { predicate : { (subjID, objID) :
        [[objectTerm,subjectTerm,predicatePhrase,hitText,doc,location], ... ]}
    """
    rel_Dict = defaultdict(dict)
    unfiltered_preds = set()  # Set of predicates
    filtered_preds = set()
    langs = set()  # Languages being used in the article hits
    # Possible languages from Agricola
    langCodes = {
        "spa", "fre", "rus", "swe", "chi", "jpn", "afr", "ger", "tur", "por",
        "ita", "nor", "ind", "cat", "fin", "dan"
    }
    unknown_location = False
    with handle(tsv_File) as i:
        reader = csv.DictReader(i, delimiter='\t')
        # n is the row number handed to convertID; counting starts at 2
        # because the header occupies the first line. enumerate keeps it in
        # step with the file even for rows that are skipped below.
        for n, row in enumerate(reader, start=2):
            hit = row["Hit"].replace('\'', '').replace('|',
                                                       '').replace("\"", '')
            hit_language = hit.split("... ")[-1]
            subjID = convertID(row["[SNID] Subject"], n)
            objID = convertID(row["[SNID] Object"], n)
            if hit_language.lower() in langCodes:  # Exclude non-english hits
                langs.add(hit_language)
                continue
            if subjID is None or objID is None:  # Skip results with blank ids
                continue
            objectTerm = (row["[PT] Object"], row["Object"])
            subjectTerm = (row["[PT] Subject"], row["Subject"])
            try:
                location = row["Location"]
            except KeyError:
                # No Location column: fall back to the file name.
                lowered = tsv_File.lower()
                if "abstract" in lowered:
                    location = "abstract"
                elif "title" in lowered:
                    location = "title"
                else:
                    unknown_location = True
                    location = "unknown"
            doc = row["Doc"]
            predicatePhrase = row["Predicate"]
            # Add to overall, unfiltered set of predicates
            unfiltered_preds.add(predicatePhrase)
            # Filter the predicates; the tagged text is not used here
            predicate, _tagged_text = filterPredicates(predicatePhrase)
            filtered_preds.add(predicate)
            if not predicate:
                continue

            pair = (subjID, objID)
            rels = new_rel_dict[pair]
            for rel in rels:
                if rel["predicate"] == predicate:
                    rel["pmid"].append(doc)
                    rel["file"].add(tsv_File)
                    break
            else:
                rels.append({
                    "predicate": predicate,
                    "pmid": [doc],
                    "file": {tsv_File},
                })

            rel_Dict[predicate].setdefault(pair, []).append([
                objectTerm, subjectTerm, predicatePhrase, hit, doc, location
            ])
    if unknown_location:
        logging.warning(
            "Location of text mining hit undiscernable; set to unknown")
    return rel_Dict, langs, unfiltered_preds, filtered_preds, new_rel_dict
Ejemplo n.º 12
0
        Some nodes have multiple parents
        """
        edges = []
        multiParent = 0
        for name in props["children"]:
            edges.append(("has_child",self.name2id[name]))
        for name in props["parents"]:
            parentID = self.name2id[name]
            if multiParent == 0:
                self.tree.move_node(nodeID,parentID)
            else:
                self.tree.create_node(tag=props["name"],identifier=nodeID+".{}".format(multiParent),parent=parentID)
            edges.append(("is_a",parentID))
            multiParent += 1
        return edges

def extractElem(node,tag):
    """
    Return the set of .text values of every element yielded by
    node.findall(tag); an empty set when nothing matches.
    """
    return {match.text for match in node.findall(tag)}

if __name__ == "__main__":
    # Ad-hoc run: parse the bundled NAL thesaurus archive and dump the result.
    nal_archive = "docker/data/nal/NAL_Thesaurus_2019_XML.zip"
    parser = Parser()
    print(parser.parse(handle(nal_archive)))
    # To inspect a single branch instead:
    #   parser.tree.subtree("usda.nal.thesaurus.id:858").all_nodes()
Ejemplo n.º 13
0
def readTSV(filename, entrez_gene, ctd_gene, writeFile, meshIDs, goldStd):
    """
    This method reads all other CTD files listed in methods section
    Exceptions:
    OK CTD_Disease-GO_biological_process_associations.tsv MESH 2 GO FLIP edge direction
    OK CTD_Disease-GO_cellular_component_associations.tsv MESH 2 GO FLIP edge direction
    OK CTD_Disease-GO_molecular_function_associations.tsv MESH 2 GO FLIP edge direction
    OK CTD_chem_go_enriched.tsv (4,629,998) MESH 2 GO
    NA CTD_chem_pathways_enriched.tsv MESH 2 KEGG,REACT
    NA CTD_diseases_pathways.tsv FLIP edge direction MESH: or OMIM: ot KEGG,REACT
    NA CTD_genes_pathways.tsv NCBI GENE ID to KEGG,REACT
    LARGE
    MODIFY CTD_chemicals_diseases.tsv-> (4,338,326) MESH 2 MESH:,OMIM:
    MODIFY CTD_genes_diseases.tsv-> (46,613,048) NCBI GENE ID 2 MESH:,OMIM:

    Parameters:
        filename    -- path of the CTD file; substrings of the name
                       ('Disease', 'chemicals', 'genes_diseases', 'chem_go')
                       select how rows are interpreted
        entrez_gene -- passed to parseGenes (second argument)
        ctd_gene    -- passed to parseGenes (first argument)
        writeFile   -- open, writable file object; only used, flushed and
                       closed when the header yields five id columns
        meshIDs     -- container of accepted MeSH ids (membership tests only)
        goldStd     -- 1: for chemical rows write only direct-evidence
                       associations; 0: write every association

    Returns:
        noPMIDDict ({startID: set(endID)}) when the header yields exactly two
        id columns; otherwise nothing is returned (None) and the associations
        are written to writeFile, which is then closed.
    """
    logger.info(f"Parsing {filename}")
    # Column indices collected from the header line, in header order.
    idArray = []
    source = 'Comparative Toxicogenomics Database'
    relType = 'xref'
    #ctdGeneFile = '/Users/rlinchan/Google Drive/PhD/dissert/data/assocDBs/CTD_2242015/CTD_genes.tsv'
    sym2id = {}
    humanSet = set()
    #  noPMIDDict = {startID:set(endID)}
    noPMIDDict = collections.defaultdict(set)
    # NOTE(review): gene2goDict is never read or written below.
    gene2goDict = collections.defaultdict(set)
    mod = 0
    checkBoth = 0
    goFilter = 0
    inferredCount = 0
    directCount = 0
    if 'Disease' in filename:  # flag to flip IDs for direction of relationship
        mod = 1
        sym2id, humanSet = parseGenes(ctd_gene,
                                      entrez_gene)  # symbol to id map
    if 'chemicals' in filename:
        checkBoth = 1
    if 'genes_diseases' in filename:
        sym2id, humanSet = parseGenes(ctd_gene,
                                      entrez_gene)  # symbol to id map
    if 'chem_go' in filename:
        goFilter = 1
    with parse_it.handle(filename) as inFile:
        for line in inFile:
            if line.startswith('# Fields:'):
                # The column names are on the line after '# Fields:'; the
                # first two characters of that line are dropped.
                line = next(inFile)
                header = line[2:].strip().split('\t')
                # NOTE(review): header.index() returns the first occurrence,
                # so a repeated column name would be indexed twice at the
                # same position.
                for column in header:  # define columns by header line
                    if column.endswith('ID'):  # 1,4 arrayPos = 0,1
                        idArray.append(header.index(column))
                    if column == 'OmimIDs':  # 8 arrayPos = 3
                        idArray.append(header.index(column))
                    if column == 'PubMedIDs':  # 9 arrayPos = 4
                        idArray.append(header.index(column))
                    if column == 'DirectEvidence':  # 5 arrayPos = 2
                        idArray.append(header.index(column))
            elif line.startswith('#'):  # skip file header
                continue
            else:  # associations
                line = line.strip().split('\t')
                startID = ''
                endID = ''
                ontology = ''
                geneID = []
                if mod == 0:  # retain original order
                    startID = line[idArray[0]]
                    endID = line[idArray[1]]
                    if goFilter == 1:
                        ontology = line[3]
                        # only include bioProc for CTD_chem_go
                        if ontology != 'Biological Process':
                            continue
                if mod == 1:  # flip order, CTD_Disease-GO*.tsv
                    startID = line[idArray[1]]  # goID
                    endID = line[idArray[0]]  # diseaseID
                    geneSym = line[5].split('|')  # gene symbols
                    # Translate known symbols to ids; unknown symbols are
                    # dropped.
                    for gene in geneSym:
                        if gene in sym2id:
                            geneID.append(sym2id[gene])
                    # account for non-prefixed omim
                    if endID.isdigit():
                        endID = 'omim.disease.id:' + endID
                if len(idArray) == 2:  # all other files
                    if startID in meshIDs or endID in meshIDs:  # does this ignore OMIM?
                        # NOTE(review): both branches perform the same add.
                        if startID not in noPMIDDict:
                            noPMIDDict[startID].add(endID)
                        else:
                            noPMIDDict[startID].add(endID)
                        # NOTE(review): the two conditions below differ only
                        # in the noPMIDDict membership test and perform the
                        # same add, so the net effect is one add per gene
                        # that is in humanSet.
                        for gene in geneID:  # adds gene to GO
                            if gene not in noPMIDDict and gene in humanSet:  # filter human genes
                                noPMIDDict['nih.nlm.ncbi.gene.id:' +
                                           gene].add(startID)
                            if gene in noPMIDDict and gene in humanSet:  # filter human genes
                                noPMIDDict['nih.nlm.ncbi.gene.id:' +
                                           gene].add(startID)
                elif len(idArray) == 5:
                    # chemicals_diseases.tsv = MESH to MESH: or OMIM:
                    # genes_diseases.tsv = Entrez ID to MESH: or OMIM:
                    # idArray = [0:ID,1:ID,2:DirectEvidence,3:OMIM,4:PubMedIDs]
                    omim = line[idArray[3]].strip().split('|')
                    evidence = line[idArray[2]]
                    # A non-empty DirectEvidence field counts as direct,
                    # an empty one as inferred.
                    if evidence != '':
                        directCount += 1
                    if evidence == '':
                        inferredCount += 1
                    pmids = ''  # PMID may be empty, must check
                    occurrence = '1'  # occurrence set to 1 if no pmid evidence
                    # Addition of 1 because 0 based language
                    # PMIDs are only read when the PubMedIDs column is the
                    # last field present on the line.
                    if len(line) == idArray[4] + 1:
                        pmids = line[idArray[4]].strip().split('|')
                        # occurrence based on citation evidence
                        occurrence = str(len(pmids))
                    if startID.isdigit():  # convert to geneID
                        startID = 'nih.nlm.ncbi.gene.id:' + startID
                    if 'MESH' in endID:
                        endID = endID.split(':')[-1]
                    if 'OMIM' in endID:
                        endID = 'omim.disease.id:' + endID.split(':')[-1]
                    if checkBoth == 1:  # chemicals_diseases.tsv
                        if startID in meshIDs and endID in meshIDs:
                            lineOut = '%s|%s|%s|%s|%s|%s\n' % (
                                startID, relType, source, occurrence,
                                ';'.join(pmids), endID)
                            # Gold-standard mode writes a two-column,
                            # tab-separated line and only for direct evidence.
                            if goldStd == 1 and evidence != '':
                                lineOut = '%s\t%s\n' % (startID, endID)
                                writeFile.write(lineOut)
                            if goldStd == 0:
                                writeFile.write(lineOut)
                            # when omimIDs exist, create chemical to omim
                            # relationship
                            if omim[0] != '':
                                omimIDs = set(omim)
                                for mim in omimIDs:
                                    # NOTE(review): unlike lineOut above, the
                                    # OMIM line keeps the pipe-separated
                                    # format in gold-standard mode.
                                    mimOut = '%s|%s|%s|%s|%s|%s\n' % (
                                        startID, relType, source, occurrence,
                                        ';'.join(pmids),
                                        'omim.disease.id:' + mim)
                                    if goldStd == 1 and evidence != '':
                                        writeFile.write(mimOut)
                                    if goldStd == 0:
                                        writeFile.write(mimOut)
                    elif checkBoth == 0 and endID in meshIDs:  # genes_diseases.tsv
                        # startID carries the 'nih.nlm.ncbi.gene.id:' prefix
                        # here when the original value was numeric; the part
                        # after the first ':' is the bare gene id.
                        geneID = startID.split(':')[1]
                        if geneID in humanSet:
                            lineOut = startID + '|' + relType + '|' + source + '|' + \
                                occurrence + '|' + \
                                ';'.join(pmids) + '|' + endID + '\n'
                            writeFile.write(lineOut)
                            # when omimIDs exist, create gene to omim
                            # relationship
                            if omim[0] != '':
                                omimIDs = set(omim)
                                for mim in omimIDs:
                                    mimOut = startID + '|' + relType + '|' + source + '|' + \
                                        occurrence + '|' + \
                                        ';'.join(pmids) + '|' + \
                                        'omim.disease.id:' + mim + '\n'
                                    writeFile.write(mimOut)

    # Two id columns: associations were collected in memory and are returned.
    # Otherwise they were streamed to writeFile, which is closed here.
    if len(idArray) == 2:
        return noPMIDDict
    else:
        logger.info(
            f"Direct Associations:{directCount}\nInferred Associations:{inferredCount}"
        )
        writeFile.flush()
        writeFile.close()
Ejemplo n.º 14
0
if __name__ == "__main__":
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format='%(asctime)s [%(funcName)s] %(levelname)s - %(message)s',
        datefmt='%d-%b-%y %H:%M:%S',
        level=logging.DEBUG)
    BASE_DIRECTORY = "docker/data"
    GENE_FILE = f"{BASE_DIRECTORY}/ncbi_gene/mim2gene_medgen.gz"
    MONDO_FILE = f"{BASE_DIRECTORY}/mondo/mondo.obo"
    CONVERTER_DIRECTORY = f"{BASE_DIRECTORY}/converter"

    omim2gene = defaultdict(set)  #Gene to gene
    gene2omim = defaultdict(set)  # Gene to Phene
    omim2mondo = defaultdict(set)  # Phene to Phene
    with handle(GENE_FILE) as genes:
        rows = csv.DictReader(genes, delimiter="\t")
        for row in rows:
            MID = row["#MIM number"]
            GID = row["GeneID"]
            if GID != "-":
                if row["type"] == "gene":
                    omim2gene[MID].add(GID)
                elif row["type"] == "phenotype":
                    gene2omim[GID].add(MID)
    with handle(MONDO_FILE) as mf:
        omim_id = []
        mondo_id = ""
        is_obsolete = False
        for line in mf:
            if line.startswith("[Term]"):