def init(filename, __EXCLUDE_PSEUDOGENES=0):
    global __geneDB, __symbols, __gene_symbol_parents, __gene_symbol_synonyms
    
    __geneDB = pyCSV()
    __geneDB.load(filename,"\t")
    
    for row in xrange(1, __geneDB.rows+1):
        
        if __EXCLUDE_PSEUDOGENES and __geneDB.get(row, 2).count("pseudogene")>0:
            continue
        
        symbol = geneUtils.formatGeneSymbol(__geneDB.get(row, 1))
        
        __original_names[symbol] = __geneDB.get(row, 1)
        __symbols.add(symbol)
        
        __sym_rows[symbol] = row
        
        if __isApproved(symbol):
            __approved_symbols.add(symbol)
        
    print "Loaded:", len(__approved_symbols), "approved gene symbols..."
        
    for symbol in __approved_symbols:
        past_symbols = getPastSymbols(symbol)
        
        for child in past_symbols:
            try:
                __gene_symbol_parents[child].add(symbol)
            except KeyError:
                __gene_symbol_parents[child] = set([symbol])
    
        for synSym in getSynonyms(symbol):
            try:
                __gene_symbol_synonyms[synSym].add(symbol)
            except KeyError:
                __gene_symbol_synonyms[synSym] = set([symbol])
                
    for symbol in __gene_symbol_synonyms:
        remove = []
        
        for synGene in __gene_symbol_synonyms[symbol]:
            if synGene not in __approved_symbols:
                remove.append(synGene)
        
        for r in remove:
            __gene_symbol_synonyms[symbol].remove(r)
def mapTargetNames(targets_file,
                   __ENABLE_GENE_UPDATES=1,
                   __ENABLE_GENE_VERIFICATION=1):
    global __targets_unnamed, __targets, __target_names, __geneSet

    __targetCatalogue = pyCSV()
    __targetCatalogue.load(targets_file)
    rejectednames = 0

    representedids = set([])

    for r in xrange(1, __targetCatalogue.rows + 1):
        targetId = int(__targetCatalogue.get(r, 0))
        targetGene = geneUtils.formatGeneSymbol(__targetCatalogue.get(r, 2))

        representedids.add(targetId)

        if targetGene != None:
            parentSym = geneDB.findUpdatedSymbol(targetGene)
            if __ENABLE_GENE_UPDATES and parentSym != None:
                if __DEBUG > 1:
                    print "Updated:", targetGene, "to:", parentSym
                targetGene = parentSym
            if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(
                    targetGene):
                if __DEBUG > 1:
                    print "Rejected:", targetGene
                rejectednames += 1
                continue

            __geneSet.add(targetGene)
            __target_names[targetId] = targetGene

    for targetId in __targets:
        if targetId not in __target_names.keys():
            __targets_unnamed += 1

    if __DEBUG > 0:
        print "Rejected:        ", rejectednames
        print "Unrepresented:   ", len(set(__targets.keys()) - representedids)

    __drugSet = set([])
    for drugbankid in __drugs:
        drug = __drugs[drugbankid]

        for target in drug['targets']:
            targetId = target['partner']
            if targetId in __target_names:
                targetGene = __target_names[targetId]

                __drugSet.add(drugbankid)
                try:
                    __drugDict[targetGene].add(drugbankid)
                except KeyError:
                    __drugDict[targetGene] = set([drugbankid])

    removable_drugs = set([])
    for drugbankid in __drugs:
        if drugbankid not in __drugSet:
            removable_drugs.add(drugbankid)

    for drugbankid in removable_drugs:
        del __drugs[drugbankid]

    lenbefore = len(__geneSet)
    __geneSet = __geneSet & set(__drugDict.keys())
    lenafter = len(__geneSet)
    if __DEBUG > 0:
        print "Removed", (lenafter - lenbefore), "untargeted gene names"

    print "Total drugs with targets:   ", len(__drugSet), len(__drugs)
    print "Total geneset size:         ", len(__geneSet)
def mapTargetNames(targets_file,__ENABLE_GENE_UPDATES=1,__ENABLE_GENE_VERIFICATION=1):
    global __targets_unnamed, __targets, __target_names, __geneSet
    
    __targetCatalogue = pyCSV()
    __targetCatalogue.load(targets_file)
    rejectednames = 0

    representedids = set([])

    for r in xrange(1, __targetCatalogue.rows+1):
        targetId   = int(__targetCatalogue.get(r,0))
        targetGene = geneUtils.formatGeneSymbol(__targetCatalogue.get(r,2))
        
        representedids.add(targetId)

        if targetGene != None:
            parentSym = geneDB.findUpdatedSymbol(targetGene)
            if __ENABLE_GENE_UPDATES and parentSym != None:
                if __DEBUG>1:
                    print "Updated:", targetGene, "to:", parentSym
                targetGene = parentSym
            if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(targetGene):
                if __DEBUG>1:
                    print "Rejected:", targetGene
                rejectednames+=1
                continue

            __geneSet.add(targetGene)
            __target_names[targetId] = targetGene

    
    for targetId in __targets:
        if targetId not in __target_names.keys():
            __targets_unnamed += 1

    if __DEBUG>0:
        print "Rejected:        ", rejectednames
        print "Unrepresented:   ", len(set(__targets.keys()) - representedids)
    
    __drugSet = set([])
    for drugbankid in __drugs:
        drug = __drugs[drugbankid]

        for target in drug['targets']:
            targetId = target['partner']
            if targetId in __target_names:
                targetGene = __target_names[targetId]
                
                __drugSet.add(drugbankid)
                try:
                    __drugDict[targetGene].add(drugbankid)
                except KeyError:
                    __drugDict[targetGene] = set([drugbankid])
    
    removable_drugs = set([])
    for drugbankid in __drugs:
        if drugbankid not in __drugSet:
            removable_drugs.add(drugbankid)

    for drugbankid in removable_drugs:
        del __drugs[drugbankid]

    lenbefore = len(__geneSet)
    __geneSet = __geneSet & set(__drugDict.keys())
    lenafter = len(__geneSet)
    if __DEBUG>0:
        print "Removed", (lenafter - lenbefore), "untargeted gene names"
    
    print "Total drugs with targets:   ", len(__drugSet), len(__drugs)
    print "Total geneset size:         ", len(__geneSet)
Beispiel #4
0
def loadEvolutionaryGenes(filename, __ENABLE_GENE_VERIFICATION=0,
        __ENABLE_GENE_UPDATES=0, __CROSS_MATCH_LEVEL = 1, include_studies = [1,2,3,4,5]):
    global __DEBUG
    
    genesTSV = pyCSV()
    genesTSV.load(filename, "\t")
    
    bustamante = []
    vamathevan_human = []
    kosiol_human = []

    if 1 in include_studies:
        bustamante          = [item.lower() for item in geneUtils.columnToList(genesTSV, 1, 2)]
    if 2 in include_studies:
        vamathevan_human    = [item.lower() for item in geneUtils.columnToList(genesTSV, 3, 2)]
    if 3 in include_studies:
        kosiol_human        = [item.lower() for item in geneUtils.columnToList(genesTSV, 8, 2)]
    
    conflicts = []
    
    bakewell = []
    nielsen = []
    
    if 4 in include_studies:
        bakewell, c         = geneUtils.mergeColumns(genesTSV, 12, 13, 2)
        bakewell = [item.lower() for item in bakewell]
        conflicts.extend(c)

    if 5 in include_studies:
        nielsen, c          = geneUtils.mergeColumns(genesTSV, 17, 18, 2)
        nielsen = [item.lower() for item in nielsen]
        conflicts.extend(c)
    
    # verify gene symbols
    
    duplicates = 0
    
    geneCounts = {}
    geneUtils.geneFrequency(geneCounts, bustamante)
    geneUtils.geneFrequency(geneCounts, vamathevan_human)
    geneUtils.geneFrequency(geneCounts, kosiol_human)
    geneUtils.geneFrequency(geneCounts, bakewell)
    geneUtils.geneFrequency(geneCounts, nielsen)
    
    
    
    #duplicates+=geneUtils.addAll(geneSet, bustamante)
    #duplicates+=geneUtils.addAll(geneSet, vamathevan_human)
    #duplicates+=geneUtils.addAll(geneSet, kosiol_human)
    #duplicates+=geneUtils.addAll(geneSet, bakewell)
    #duplicates+=geneUtils.addAll(geneSet, nielsen)
    
    
    
    if __ENABLE_GENE_VERIFICATION:
        for pair in conflicts:
            g1 = pair[0].lower()
            g2 = pair[1].lower()
            
            if __ENABLE_GENE_UPDATES:
                g1parent = geneDB.findUpdatedSymbol(g1)
                g2parent = geneDB.findUpdatedSymbol(g2)
                
                if g1parent != None:
                    g1 = g1parent
                if g2parent != None:
                    g2 = g2parent
            
            if geneDB.isApproved(g1):
                if __DEBUG>1:
                    print "Gene", g2, "not approved, but",g1,"is fine"
                try:
                    geneCounts[g1]+=1
                except KeyError:
                    geneCounts[g1] = 1
                    
            elif geneDB.isApproved(g2):
                if __DEBUG>1:
                    print "Gene", g1, "not approved, but",g2,"is fine"
                try:
                    geneCounts[g2]+=1
                except KeyError:
                    geneCounts[g2] = 1
                    
            else:
                if __DEBUG>1:
                    print "Neither",g1,"nor",g2,"are valid"
        
    
    else:
        for pair in conflicts:
            g1 = pair[0].lower()
            g2 = pair[1].lower()
            
            try:
                geneCounts[g2]+=1
            except KeyError:
                geneCounts[g2] = 1
            
    duplicates, geneSet = geneUtils.addFilterFrequency(geneCounts, __CROSS_MATCH_LEVEL)
    ofile = open(os.sep.join(["results","log","geneSetDuplicateFrequency.txt"]),'w')
    
    glist = []
    
    for gene in geneCounts:
        glist.append((gene, geneCounts[gene]))
        
    for item in sorted(glist, key=lambda item: -item[1]):
        ofile.write("%-20s%d\n" % item)
    
    ofile.close()
    
    geneSet = set([geneUtils.formatGeneSymbol(geneSym) for geneSym in geneSet])
            
    if __ENABLE_GENE_UPDATES:
        geneUtils.updateGeneSet(geneSet)
    
    if __ENABLE_GENE_VERIFICATION:        
        geneUtils.removeInvalidGenes(geneSet)
    
    if __DEBUG>0:
        print "\n-----------------------------"
        print "Total Duplicates:      ", duplicates
        print "Total Name Conflicts:  ", len(conflicts)
        print "Total Genes Remaining: ", len(geneSet)
        print "-----------------------------\n"
        
    log_file = open(os.sep.join(["results","log","loaded_genelist.txt"]),'w')
    for gene in geneSet:
        log_file.write(gene+"\n")
    log_file.close()
    return geneSet
Beispiel #5
0
def init(filename, __ENABLE_GENE_VERIFICATION = 0, __ENABLE_GENE_UPDATES = 0, __INCLUDE_MAPPED_GENES = 0, trait_exclude_file = 0, pfilter = 0.05):
    global __DEBUG, __pValues, __gwasCatalogue, __studyByTrait, __geneSet, __traitDict, __studyByGene
    
    exclude_traits = set([])
    if trait_exclude_file != 0:
        ifile = open(trait_exclude_file,'r')
        for line in ifile:
            exclude_traits.add(line.strip())
        ifile.close()
    
    
    __gwasCatalogue.load(filename, "\t")

    invalidGeneSet = set([])
    updatedGeneSet = set([])
    
    for i in xrange(1, __gwasCatalogue.rows+1):
        geneString = __gwasCatalogue.get(i, 13).strip()
        geneTrait = __gwasCatalogue.get(i, 7).strip()
        
        pvalueText = __gwasCatalogue.get(i, 27)
        pvalue = 0
        
        try:
            pvalue = float(pvalueText)
        except ValueError:
            pvalue = -1
        
        if pvalue > pfilter:
            continue
        if geneTrait in exclude_traits:
            continue
        if geneString==None:
            continue
        if geneString == "":
            continue
        
        __pValues[i] = pvalue
        
        try:
            __studyByTrait[geneTrait].add(i)
        except KeyError:
            __studyByTrait[geneTrait] = set([i])
        
        geneItems = geneString.split(",")
        for item in geneItems:
            
            geneSymbols = item.split(" - ")
            
            for geneSym in geneSymbols:
                geneSym = geneUtils.formatGeneSymbol(geneSym.strip())
                __addGene(i,geneSym, geneTrait, __ENABLE_GENE_VERIFICATION, __ENABLE_GENE_UPDATES, updatedGeneSet, invalidGeneSet)
        
        if __INCLUDE_MAPPED_GENES:
            mappedGenes = __gwasCatalogue.get(i, 14)
            
            mappedItems = mappedGenes.split(";")
            
            for item in mappedItems:
                
                geneSymbols = item.split(" - ")
                
                for geneSym in geneSymbols:
                    geneSym = geneUtils.formatGeneSymbol(geneSym)
                    __addGene(i,geneSym, geneTrait, __ENABLE_GENE_VERIFICATION, __ENABLE_GENE_UPDATES, updatedGeneSet, invalidGeneSet)
        
    invalid_file = open(os.sep.join(["results","log","invalid_gwas.txt"]),'w')
    
    for geneSym in invalidGeneSet:
        invalid_file.write(geneSym+"\n")
    
    invalid_file.close()
    
    background_file = open(os.sep.join(["results", "log",
        "gwas_background.txt"]), 'w')

    for geneSym in __geneSet:
        background_file.write(geneDB.__original_names[geneSym] + "\n")
    background_file.close()

    
    if __DEBUG > 0:
        print "\n---------------------------------"
        print "GWAS Invalid Gene Symbols:  ", len(invalidGeneSet)
        print "GWAS Updated Gene Symbols:  ", len(updatedGeneSet)
        print "GWAS Total Genes Remaining: ", len(__geneSet)
        print "---------------------------------\n"
def getSynonyms(geneSym):
    """Return the formatted synonyms stored for ``geneSym``.

    The symbol is lower-cased and looked up in __sym_rows to find its row;
    the cell in the __synonyms column of that row is split with
    __strToList() and every entry is passed through
    geneUtils.formatGeneSymbol().  Raises KeyError when the lower-cased
    symbol has no entry in __sym_rows.
    """
    global __geneDB, __sym_rows, __synonyms

    row = __sym_rows[geneSym.lower()]
    raw_field = __geneDB.get(row, __synonyms)

    formatted = []
    for entry in __strToList(raw_field):
        formatted.append(geneUtils.formatGeneSymbol(entry))
    return formatted
def initTargets(targets_file, protein_file,__ENABLE_GENE_VERIFICATION=0, __ENABLE_GENE_UPDATES=0):
    global __DEBUG, __targetCatalogue, __geneSet, __geneNames, __drugDict
    
    __targetCatalogue.load(targets_file)
    
    rejectedSet = set([])
    updatedSet = set([])
    
    for r in xrange(1, __targetCatalogue.rows+1):
        geneId = int(__targetCatalogue.get(r,0))
        geneName = geneUtils.formatGeneSymbol(__targetCatalogue.get(r,2))
        
        if geneName != None:
            if __ENABLE_GENE_UPDATES:
                parentSym = geneDB.findUpdatedSymbol(geneName)
                if parentSym != None:
                    updatedSet.add(geneName)
                    geneName = parentSym
                
            if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(geneName):
                if __DEBUG>2 and geneName != "" or geneName == "papc":
                    print "Rejected:", geneName
                rejectedSet.add(geneName)
                continue
                
            __geneNames[geneId] = geneName
            __geneSet.add(geneName)
            __drugDict[geneName] = set([])
    
    invalid_file = open(os.sep.join(["results","log","invalid_drugbank.txt"]),'w')
    for geneName in rejectedSet:
        invalid_file.write(geneName+"\n")
    invalid_file.close()            
    
    proteins = parseFASTA(protein_file)
    
    __drugSet = set([])
    empty_gene_drug_targets = 0
    for fasta in proteins:
        items = fasta[1].split()
        geneId = int(items[0])
        
        if geneId in __geneNames:
            parenthetical = fasta[1][fasta[1].rfind("(")+1 : fasta[1].rfind(")")]
            
            drugs = parenthetical.split(";")
            
            for drug in drugs:
                drugbankid = drug.strip()
                __drugDict[__geneNames[geneId]].add(drugbankid)
                __drugSet.add(drugbankid)
    
    removable = set([])
    for gene in __geneSet:
        if gene not in __drugDict or len(__drugDict[gene]) == 0:
            removable.add(gene)

            empty_gene_drug_targets += 1
    __geneSet -= removable
    
    removable_drugs = set([])
    for drugbankid in __drugs:
        if drugbankid not in __drugSet:
            removable_drugs.add(drugbankid)
    
    for drugbankid in __drugSet:
        if drugbankid not in __drugs:
            __drugs[drugbankid] = {'name':drugbankid}

    for drugbankid in removable_drugs:
        del __drugs[drugbankid]

    if __DEBUG>0:
        print "\n------------------------------------------"
        print "Invalid Drug Target Gene Symbols:   ", len(rejectedSet)
        print "Updated Drug Target Gene Symbols:   ", len(updatedSet)
        print "Remaining Drug Target Gene Symbols: ", len(__geneSet)
        print "Drugs with targets:                 ", len(__drugSet), len(__drugs)
        print "Removed:", empty_gene_drug_targets, "genes without targeting drugs"
        print "------------------------------------------\n"