def main(cosmicParsedGene, cosmicParsedGene2Mutation, cosmicParsedMutation, databasePassword, schemaProteins, tableCOSMICGene, tableCOSMICGene2Mutation, tableCOSMICMutation): #=========================================================================== # Extract and format the parsed data. #=========================================================================== cosmicGene = utilities.file2list.main(cosmicParsedGene) cosmicGene = [eval(i) for i in cosmicGene] cosmicGene2Mutation = utilities.file2list.main(cosmicParsedGene2Mutation) cosmicGene2Mutation = [eval(i) for i in cosmicGene2Mutation] cosmicMutation = utilities.file2list.main(cosmicParsedMutation) cosmicMutation = [eval(i) for i in cosmicMutation] #=========================================================================== # Update the database tables. #=========================================================================== conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) values = '(' + ('%s,' * len(cosmicGene[0])) values = values[:-1] + ')' cursor = mysql.tableINSERT(cursor, tableCOSMICGene, values, cosmicGene) values = '(' + ('%s,' * len(cosmicMutation[0])) values = values[:-1] + ')' cursor = mysql.tableINSERT(cursor, tableCOSMICMutation, values, cosmicMutation) values = '(' + ('%s,' * len(cosmicGene2Mutation[0])) values = values[:-1] + ')' cursor = mysql.tableINSERT(cursor, tableCOSMICGene2Mutation, values, cosmicGene2Mutation) mysql.closeConnection(conn, cursor)
def update_gene_table(ensemblParsedTranscripts, schemaProteins, tableEnsemblGene, databasePassword): # =========================================================================== # Extract and format the parsed gene data. # =========================================================================== geneData = utilities.file2list.main(ensemblParsedTranscripts) geneData = [eval(i) for i in geneData] geneDict = dict([(i[0], i) for i in geneData]) values = "(" + ("%s," * len(geneData[0])) values = values[:-1] + ")" rowsToAdd = geneData conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor.execute("TRUNCATE " + tableEnsemblGene) cursor = mysql.tableINSERT(cursor, tableEnsemblGene, values, rowsToAdd) mysql.closeConnection(conn, cursor) print "\tEntries added to the Ensembl gene table: ", len(rowsToAdd)
def update_homolog_table(ensemblParsedHomology, schemaProteins, tableHomologs, databasePassword): # =========================================================================== # Extract and format the parsed gene data. # =========================================================================== conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor.execute("TRUNCATE " + tableHomologs) readVariants = open(ensemblParsedHomology, "r") count = 0 for line in readVariants: count += 1 line = eval(line) values = "(" + ("%s," * len(line)) values = values[:-1] + ")" try: cursor = mysql.tableINSERT(cursor, tableHomologs, values, [line]) except: print line print count raise readVariants.close() mysql.closeConnection(conn, cursor)
def main(UPPPIData, schemaProteins, tablePPI, databasePassword): #=========================================================================== # Extract and format the parsed gene data. #=========================================================================== ppiData = utilities.file2list.main(UPPPIData) ppiData = [eval(i) for i in ppiData] ppiDict = dict([(tuple([i[0], i[1]]), i) for i in ppiData]) #=========================================================================== # Extract the gene information recorded in the database. #=========================================================================== conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.tableSELECT(cursor, '*', tablePPI) results = cursor.fetchall() mysql.closeConnection(conn, cursor) #=========================================================================== # Compare the parsed data with the data recorded in the table. #=========================================================================== columnIndices = range(1, len(ppiData[0])) conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor.execute('SHOW COLUMNS FROM ' + tablePPI) columns = cursor.fetchall() mysql.closeConnection(conn, cursor) columns = [i[0] for i in columns] toRemove = [] toUpdate = {} toAdd = ppiDict.keys() for i in results: proteinOne = i[0] proteinTwo = i[1] dictKey = tuple([proteinOne, proteinTwo]) if ppiDict.has_key(dictKey): # If the key is in both the parsed file and the table, then it does not need to be added. toAdd.remove(dictKey) # Compare the row from the table with the parsed file, to determine if the table needs updating. for j in columnIndices: if i[j] != ppiDict[dictKey][j]: if not toUpdate.has_key(dictKey): toUpdate[dictKey] = [] toUpdate[dictKey].append(j) else: # If the key is in the table, but not in the parsed file, then the row needs to be removed. toRemove.append(dictKey) values = '(' + ('%s,' * len(ppiData[0])) values = values[:-1] + ')' #=========================================================================== # Remove rows from the table that are not in the parsed file. #=========================================================================== for i in toRemove: conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.rowDELETE(cursor, tablePPI, 'PPIProteinOne="' + i[0] + '" AND PPIProteinTwo="' + i[1] + '"') mysql.closeConnection(conn, cursor) print '\tEntries removed from the PPI table: ', len(toRemove) #=========================================================================== # Update rows that have different values in the parsed file and the table. #=========================================================================== for i in toUpdate.keys(): toSet = [] for j in toUpdate[i]: updateString = columns[j] + ' = "' + str(ppiDict[i][j]) + '"' toSet.append(updateString) toSet = ', '.join(toSet) conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.tableUPDATE(cursor, tablePPI, toSet, 'PPIProteinOne="' + i[0] + '" AND PPIProteinTwo="' + i[1] + '"') mysql.closeConnection(conn, cursor) print '\tEntries updated in the PPI table: ', len(toUpdate) #=========================================================================== # Add rows which are not in the table, but are in the parsed file. #=========================================================================== rowsToAdd = [ppiDict[i] for i in toAdd] conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.tableINSERT(cursor, tablePPI, values, rowsToAdd) mysql.closeConnection(conn, cursor) print '\tEntries added to the PPI table: ', len(toAdd)
def main(databasePassword, schemaPharmGKB, pharmGKBDiseases, pharmGKBDrugs, pharmGKBGenes, pharmGKBRelationships, pharmGKBInitialisation, MYSQLBIN): # Define the tables needed. tableDisease = schemaPharmGKB + '.disease' tableDrug = schemaPharmGKB + '.drug' tableGene = schemaPharmGKB + '.gene' tableDrug2Drug = schemaPharmGKB + '.drug2drug' tableDrug2DrugClass = schemaPharmGKB + '.drug2drugclass' tableDrug2Disease = schemaPharmGKB + '.drug2disease' tableDrug2Gene = schemaPharmGKB + '.drug2gene' tableDrugClass2Disease = schemaPharmGKB + '.drugclass2disease' tableDrugClass2Gene = schemaPharmGKB + '.drugclass2gene' tableGene2Disease = schemaPharmGKB + '.gene2disease' tableGene2Gene = schemaPharmGKB + '.gene2gene' tableDisease2MeSH = schemaPharmGKB + '.disease2mesh' tableDisease2SnoMED = schemaPharmGKB + '.disease2snomed' tableDisease2UMLS = schemaPharmGKB + '.disease2umls' # Create the schema and the tables. subprocess.call('mysql.exe -u root -p' + databasePassword + ' mysql < ' + pharmGKBInitialisation, shell=True, cwd=MYSQLBIN) # Parse the files that contain the data to enter into the database. diseaseTuples = [] meshTuples = [] snomedTuples = [] umlsTuples = [] readIn = open(pharmGKBDiseases, 'r') readIn.readline() # Strip off the header of the file. for line in readIn: chunks = line.split('\t') diseaseID = chunks[0] diseaseName = chunks[1] diseaseTuples.append(tuple([diseaseID, diseaseName])) externalVocab = chunks[4] mesh = re.findall('(?<=MeSH:)D[0-9]+', externalVocab) meshTuples.extend([tuple([diseaseID, i]) for i in mesh]) snomed = re.findall('(?<=SnoMedCT:)[0-9]+', externalVocab) snomedTuples.extend([tuple([diseaseID, i]) for i in snomed]) umls = re.findall('(?<=UMLS:)C[0-9]+', externalVocab) umlsTuples.extend([tuple([diseaseID, i]) for i in umls]) readIn.close() drugTuples = [] readIn = open(pharmGKBDrugs, 'r') readIn.readline() # Strip off the header of the file. for line in readIn: chunks = line.split('\t') drugID = chunks[0] drugName = chunks[1] compoundType = chunks[5] if compoundType == 'Drug/Small Molecule': compoundType = 'Drug' xrefs = chunks[6] drugBank = re.findall('(?<=drugBank:)DB[0-9]+', xrefs) drugBank = drugBank[0] if drugBank != [] else 'NA' pubchemCID = re.findall('(?<=pubChemCompound:)[0-9]+', xrefs) pubchemCID = pubchemCID[0] if pubchemCID != [] else 'NA' drugTuples.append(tuple([drugID, drugName, compoundType, drugBank, pubchemCID])) readIn.close() geneTuples = [] readIn = open(pharmGKBGenes, 'r') readIn.readline() # Strip off the header of the file. for line in readIn: chunks = line.split('\t') geneID = chunks[0] entrezID = chunks[1] if entrezID == '': entrezID = '0' geneTuples.append(tuple([geneID, entrezID])) readIn.close() drug2drugTuples = [] drug2drugClassTuples = {} drug2diseaseTuples = [] drugClass2diseaseTuples = [] drug2geneTuples = {} drugClass2geneTuples = {} gene2diseaseTuples = {} gene2geneTuples = {} readIn = open(pharmGKBRelationships, 'r') readIn.readline() # Strip off the header of the file. for line in readIn: chunks = line.split('\t') entityOne = chunks[0].split(':') entityTwo = chunks[2].split(':') evidence = chunks[5] publication = 'Y' if 'Publication' in evidence else 'N' pathway = 'Y' if 'Pathway' in evidence else 'N' variant = 'Y' if 'Variant' in evidence else 'N' if entityOne[0] == 'Drug': if entityTwo[0] == 'Drug': drug2drugTuples.append(tuple([entityOne[1], entityTwo[1], pathway, publication, variant])) elif entityTwo[0] == 'Drug Class': key = tuple([entityOne[1], entityTwo[1]]) if drug2drugClassTuples.has_key(key): pathEvidence = 'Y' if pathway == 'Y' or drug2drugClassTuples[key][2] == 'Y' else 'N' pubEvidence = 'Y' if publication == 'Y' or drug2drugClassTuples[key][3] == 'Y' else 'N' varEvidence = 'Y' if variant == 'Y' or drug2drugClassTuples[key][4] == 'Y' else 'N' drug2drugClassTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence]) else: drug2drugClassTuples[key] = tuple([key[0], key[1], pathway, publication, variant]) elif entityTwo[0] == 'Gene': key = tuple([entityOne[1], entityTwo[1]]) if drug2geneTuples.has_key(key): pathEvidence = 'Y' if pathway == 'Y' or drug2geneTuples[key][2] == 'Y' else 'N' pubEvidence = 'Y' if publication == 'Y' or drug2geneTuples[key][3] == 'Y' else 'N' varEvidence = 'Y' if variant == 'Y' or drug2geneTuples[key][4] == 'Y' else 'N' drug2geneTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence]) else: drug2geneTuples[key] = tuple([key[0], key[1], pathway, publication, variant]) elif entityTwo[0] == 'Disease': drug2diseaseTuples.append(tuple([entityOne[1], entityTwo[1], pathway, publication, variant])) elif entityOne[0] == 'Gene': if entityTwo[0] == 'Drug': key = tuple([entityTwo[1], entityOne[1]]) if drug2geneTuples.has_key(key): pathEvidence = 'Y' if pathway == 'Y' or drug2geneTuples[key][2] == 'Y' else 'N' pubEvidence = 'Y' if publication == 'Y' or drug2geneTuples[key][3] == 'Y' else 'N' varEvidence = 'Y' if variant == 'Y' or drug2geneTuples[key][4] == 'Y' else 'N' drug2geneTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence]) else: drug2geneTuples[key] = tuple([key[0], key[1], pathway, publication, variant]) elif entityTwo[0] == 'Drug Class': key = tuple([entityTwo[1], entityOne[1]]) if drugClass2geneTuples.has_key(key): pathEvidence = 'Y' if pathway == 'Y' or drugClass2geneTuples[key][2] == 'Y' else 'N' pubEvidence = 'Y' if publication == 'Y' or drugClass2geneTuples[key][3] == 'Y' else 'N' varEvidence = 'Y' if variant == 'Y' or drugClass2geneTuples[key][4] == 'Y' else 'N' drugClass2geneTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence]) else: drugClass2geneTuples[key] = tuple([key[0], key[1], pathway, publication, variant]) elif entityTwo[0] == 'Disease': key = tuple([entityOne[1], entityTwo[1]]) if gene2diseaseTuples.has_key(key): pathEvidence = 'Y' if pathway == 'Y' or gene2diseaseTuples[key][2] == 'Y' else 'N' pubEvidence = 'Y' if publication == 'Y' or gene2diseaseTuples[key][3] == 'Y' else 'N' varEvidence = 'Y' if variant == 'Y' or gene2diseaseTuples[key][4] == 'Y' else 'N' gene2diseaseTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence]) else: gene2diseaseTuples[key] = tuple([key[0], key[1], pathway, publication, variant]) elif entityTwo[0] == 'Gene': key = tuple(sorted([entityOne[1], entityTwo[1]])) if gene2geneTuples.has_key(key): pathEvidence = 'Y' if pathway == 'Y' or gene2geneTuples[key][2] == 'Y' else 'N' pubEvidence = 'Y' if publication == 'Y' or gene2geneTuples[key][3] == 'Y' else 'N' varEvidence = 'Y' if variant == 'Y' or gene2geneTuples[key][4] == 'Y' else 'N' gene2geneTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence]) else: gene2geneTuples[key] = tuple([key[0], key[1], pathway, publication, variant]) elif entityOne[0] == 'Disease': if entityTwo[0] == 'Drug': drug2diseaseTuples.append(tuple([entityTwo[1], entityOne[1], pathway, publication, variant])) elif entityTwo[0] == 'Drug Class': drugClass2diseaseTuples.append(tuple([entityTwo[1], entityOne[1], pathway, publication, variant])) elif entityTwo[0] == 'Gene': key = tuple([entityTwo[1], entityOne[1]]) if gene2diseaseTuples.has_key(key): pathEvidence = 'Y' if pathway == 'Y' or gene2diseaseTuples[key][2] == 'Y' else 'N' pubEvidence = 'Y' if publication == 'Y' or gene2diseaseTuples[key][3] == 'Y' else 'N' varEvidence = 'Y' if variant == 'Y' or gene2diseaseTuples[key][4] == 'Y' else 'N' gene2diseaseTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence]) else: gene2diseaseTuples[key] = tuple([key[0], key[1], pathway, publication, variant]) elif entityOne[0] == 'Drug Class': if entityTwo[0] == 'Drug': key = tuple([entityTwo[1], entityOne[1]]) if drug2drugClassTuples.has_key(key): pathEvidence = 'Y' if pathway == 'Y' or drug2drugClassTuples[key][2] == 'Y' else 'N' pubEvidence = 'Y' if publication == 'Y' or drug2drugClassTuples[key][3] == 'Y' else 'N' varEvidence = 'Y' if variant == 'Y' or drug2drugClassTuples[key][4] == 'Y' else 'N' drug2drugClassTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence]) else: drug2drugClassTuples[key] = tuple([key[0], key[1], pathway, publication, variant]) elif entityTwo[0] == 'Disease': drugClass2diseaseTuples.append(tuple([entityOne[1], entityTwo[1], pathway, publication, variant])) elif entityTwo[0] == 'Gene': key = tuple([entityOne[1], entityTwo[1]]) if drugClass2geneTuples.has_key(key): pathEvidence = 'Y' if pathway == 'Y' or drugClass2geneTuples[key][2] == 'Y' else 'N' pubEvidence = 'Y' if publication == 'Y' or drugClass2geneTuples[key][3] == 'Y' else 'N' varEvidence = 'Y' if variant == 'Y' or drugClass2geneTuples[key][4] == 'Y' else 'N' drugClass2geneTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence]) else: drugClass2geneTuples[key] = tuple([key[0], key[1], pathway, publication, variant]) readIn.close() # Enter the data into the database. conn, cursor = mysql.openConnection(databasePassword, schemaPharmGKB) values = '(' + ('%s,' * len(diseaseTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDisease, values, diseaseTuples) values = '(' + ('%s,' * len(meshTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDisease2MeSH, values, meshTuples) values = '(' + ('%s,' * len(snomedTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDisease2SnoMED, values, snomedTuples) values = '(' + ('%s,' * len(umlsTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDisease2UMLS, values, umlsTuples) values = '(' + ('%s,' * len(drugTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDrug, values, drugTuples) values = '(' + ('%s,' * len(geneTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableGene, values, geneTuples) values = '(' + ('%s,' * len(drug2drugTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDrug2Drug, values, drug2drugTuples) drug2drugClassTuples = [drug2drugClassTuples[i] for i in drug2drugClassTuples.keys()] values = '(' + ('%s,' * len(drug2drugClassTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDrug2DrugClass, values, drug2drugClassTuples) values = '(' + ('%s,' * len(drug2diseaseTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDrug2Disease, values, drug2diseaseTuples) values = '(' + ('%s,' * len(drugClass2diseaseTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDrugClass2Disease, values, drugClass2diseaseTuples) drug2geneTuples = [drug2geneTuples[i] for i in drug2geneTuples.keys()] values = '(' + ('%s,' * len(drug2geneTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDrug2Gene, values, drug2geneTuples) drugClass2geneTuples = [drugClass2geneTuples[i] for i in drugClass2geneTuples.keys()] values = '(' + ('%s,' * len(drugClass2geneTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableDrugClass2Gene, values, drugClass2geneTuples) gene2diseaseTuples = [gene2diseaseTuples[i] for i in gene2diseaseTuples.keys()] values = '(' + ('%s,' * len(gene2diseaseTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableGene2Disease, values, gene2diseaseTuples) gene2geneTuples = [gene2geneTuples[i] for i in gene2geneTuples.keys()] values = '(' + ('%s,' * len(gene2geneTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableGene2Gene, values, gene2geneTuples) mysql.closeConnection(conn, cursor)
def main(CGCParsed, cancerTargets, UPExternalLinks, UPHumanAccessionMap, TTDTarget2Drug, DBTargetIDs, schemaProteins, tableCancerGene, DATABASEPASSWORD): # Record the mapping of non-representative UniProt accessions to representative accessions. accMap = {} readIn = open(UPHumanAccessionMap, 'r') for line in readIn: chunks = (line.strip()).split() accMap[chunks[0]] = chunks[1] readIn.close() # Read in the CGC information. CGCData = {} readIn = open(CGCParsed, 'r') for line in readIn: chunks = (line.strip()).split('\t') geneID = chunks[0] germline = chunks[1] somatic = chunks[2] CGCData[geneID] = {'Cancer' : 'Y', 'Target' : 'N', 'Somatic' : somatic, 'Germline' : germline} readIn.close() cancerTTDTargets = set([]) cancerDrugBankDrugs = set([]) cancerDrugTargetUPAccs = set([]) # Read in the information about anti-neoplastic drugs. readIn = open(cancerTargets, 'r') readIn.readline() # Strip the header line. for line in readIn: chunks = line.split('\t') cancerTTDTargets.add(chunks[2]) cancerDrugBankDrugs.add(chunks[4]) readIn.close() cancerTTDTargets -= set(['']) cancerDrugBankDrugs -= set(['']) # Determine the UniProt accessions of the TTD targets. readIn = open(TTDTarget2Drug, 'r') for line in readIn: accessions = line.split('\t')[1] for i in accessions.split(','): cancerDrugTargetUPAccs.add(i) readIn.close() # Determine the UniProt accessions of the targets of the DrugBank drugs. readIn = open(DBTargetIDs, 'r') for line in readIn: chunks = line.split('\t') accession = chunks[0] drugs = chunks[1].split(';') for i in drugs: if i in cancerDrugBankDrugs: cancerDrugTargetUPAccs.add(accession) readIn.close() geneXRef = {} readIn = open(UPExternalLinks, 'r') for line in readIn: chunks = line.split(',') UPAcc = chunks[0] geneID = chunks[1] if geneID == '': pass else: geneID = geneID.split(';') for i in geneID: if geneXRef.has_key(i): geneXRef[i].add(UPAcc) else: geneXRef[i] = set([UPAcc]) readIn.close() # Create the set of all representative UniProt accessions. allUPAccessions = set([]) for i in accMap: allUPAccessions.add(accMap[i]) # For every representative UniProt accession, determine whether it is implicated in cancer and if it is a target for anti-neoplastic drugs. cancerProteins = dict([(i, {'Cancer' : 'N', 'Target' : 'N', 'Somatic' : 'N', 'Germline' : 'N'}) for i in allUPAccessions]) for i in CGCData.keys(): if geneXRef.has_key(i): for j in geneXRef[i]: cancerProteins[j]['Cancer'] = 'Y' if cancerProteins[j]['Somatic'] == 'Y': # Keep it as a yes. pass elif cancerProteins[j]['Somatic'] == 'N': if CGCData[i]['Somatic'] == 'Y': # Change it to a yes. cancerProteins[j]['Somatic'] = 'Y' if cancerProteins[j]['Germline'] == 'Y': # Keep it as a yes. pass elif cancerProteins[j]['Germline'] == 'N': if CGCData[i]['Germline'] == 'Y': # Change it to a yes. cancerProteins[j]['Germline'] = 'Y' for i in cancerDrugTargetUPAccs: # Record all the proteins that are targets of anti-neoplastic drugs as being implicated in cancer and being targets. try: # Only adding representative accessions. representativeAcc = accMap[i] cancerProteins[representativeAcc]['Cancer'] = 'Y' cancerProteins[representativeAcc]['Target'] = 'Y' except: pass dataRecords = [tuple([i, cancerProteins[i]['Cancer'], cancerProteins[i]['Target'], cancerProteins[i]['Somatic'], cancerProteins[i]['Germline']]) for i in cancerProteins] conn, cursor = mysql.openConnection(DATABASEPASSWORD, schemaProteins) cursor.execute('TRUNCATE TABLE ' + tableCancerGene) values = '(' + ('%s,' * len(dataRecords[0])) values = values[:-1] + ')' cursor = mysql.tableINSERT(cursor, tableCancerGene, values, dataRecords) mysql.closeConnection(conn, cursor)
def main(UPProteinInfo, schemaProteins, tableProteinInfo, folderSEG, SEGExe, folderEpestfind, epestfindExe, folderPepstats, pepstatsExe, folderBLAST, psiblastExe, makeBLASTDatabaseExe, tableBLASTResults, databasePassword): #=========================================================================== # Extract and format the parsed protein data. #=========================================================================== uniprotData = utilities.file2list.main(UPProteinInfo) uniprotData = [eval(i) for i in uniprotData] uniprotDict = dict([(i[0], i) for i in uniprotData]) sequenceOfAllProteinsDict = dict([(i, uniprotDict[i][-1]) for i in uniprotDict]) # A dictionary indexed by UniProt accession with the value for each index being the sequence of the protein. #=========================================================================== # Add the data to the database. #=========================================================================== rowsToAdd = [uniprotDict[i] for i in uniprotDict] values = '(' + ('%s,' * len(uniprotData[0])) values = values[:-1] + ')' conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.tableINSERT(cursor, tableProteinInfo, values, rowsToAdd) mysql.closeConnection(conn, cursor) print '\tEntries added to the UniProt table: ', len(toAdd) #=========================================================================== # Annotate the proteins which have just been added to the database. #=========================================================================== # Calculate the number of low complexity regions. proteinFasta = folderSEG + '/TempSEGFasta.fasta' SEGOutput = folderSEG + '/TempSEGOutput.txt' calculate_low_complexity(sequenceOfAllProteinsDict, SEGExe, proteinFasta, SEGOutput, schemaProteins, tableProteinInfo, databasePassword) os.remove(proteinFasta) os.remove(SEGOutput) # Calculate the number of pest motifs. proteinFasta = folderEpestfind + '/TempEpestfindFasta.fasta' epestfindOutput = folderEpestfind + '/TempEpestfindOutput.txt' calculate_pest_motif(sequenceOfAllProteinsDict, epestfindExe, proteinFasta, epestfindOutput, schemaProteins, tableProteinInfo, databasePassword) os.remove(proteinFasta) os.remove(epestfindOutput) # Calculate simple sequence statistics. proteinFasta = folderPepstats + '/TempPepstatsFasta.fasta' pepstatsOutput = folderPepstats + '/TempPepstatsOutput.txt' calculate_sequence_stats(sequenceOfAllProteinsDict, pepstatsExe, proteinFasta, pepstatsOutput, schemaProteins, tableProteinInfo, databasePassword) os.remove(proteinFasta) os.remove(pepstatsOutput) #=========================================================================================== # Use BLAST to determine the pairwise sequence identity of all the proteins in the table. #=========================================================================================== BLASTOutput = folderBLAST + '/ProcessedBLAST.txt' if os.path.isfile(BLASTOutput): os.remove(BLASTOutput) # Generate the two fasta file to make the BLAST database from. allProteinsFasta = folderBLAST + '/TempAllProteinsFasta.fasta' writeTo = open(allProteinsFasta, 'w') for i in sequenceOfAllProteinsDict.keys(): writeTo.write('>' + i + '\n') writeTo.write(sequenceOfAllProteinsDict[i] + '\n') writeTo.close() # Set the non-unique PSI-BLAST parameters tempQuery = folderBLAST + '/TempQuery.fasta' evalue = ' -evalue 1' inclusionEThresh = ' -inclusion_ethresh 0.0001' numIterations = ' -num_iterations 3' gapTrigger = ' -gap_trigger 18' numDescriptions = ' -num_descriptions 10000' numAlignments = ' -num_alignments 10000' dbsize = ' -dbsize 0' outputFormat = ' -outfmt "7 qseqid sseqid pident length evalue"' numThreads = ' -num_threads 2' # BLAST the proteins. tempBlastDatabaseFolder = folderBLAST + '/TempAllProtDB' os.mkdir(tempBlastDatabaseFolder) tempBlastDatabase = tempBlastDatabaseFolder + '/AllProt' makeDBArgs = makeBLASTDatabaseExe + ' -in ' + allProteinsFasta + ' -out ' + tempBlastDatabase subprocess.call(makeDBArgs) allBLASTOutput = folderBLAST + '/AllBLASTOutput.txt' out = ' -out ' + allBLASTOutput db = ' -db ' + tempBlastDatabase for i in sequenceOfAllProteinsDict.keys(): writeTo = open(tempQuery, 'w') writeTo.write('>' + i + '\n') writeTo.write(sequenceOfAllProteinsDict[i] + '\n') writeTo.close() query = ' -query ' + tempQuery argsPSI = (query + out + evalue + inclusionEThresh + numIterations + gapTrigger + numDescriptions + numAlignments + dbsize + db + outputFormat + numThreads) subprocess.call(psiblastExe + argsPSI, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) parsers.parsePSIBLAST.main(allBLASTOutput, BLASTOutput) shutil.rmtree(tempBlastDatabaseFolder) os.remove(allBLASTOutput) os.remove(tempQuery) os.remove(allProteinsFasta) # Enter the BLAST information into the blast results table. conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) matchesFound = {} readIn = open(BLASTOutput, 'r') for line in readIn: chunks = line.split('\t') index = tuple(sorted([chunks[0], chunks[1]])) query = index[0] hit = index[1] similarity = float(chunks[2]) length = int(chunks[3]) eValue = float(chunks[4]) if not matchesFound.has_key(index): # If this is the first time the (query, hit) pair has been found. matchesFound[index] = {} matchesFound[index]['Similarity'] = similarity matchesFound[index]['Tuple'] = tuple([query, hit, similarity, length, eValue]) else: # If the (query, hit) pair has been found previously. if similarity < matchesFound[index]['Similarity']: # If the similarity of the new (query, hit) pair is less than the previously found occurence of the # pair, then discard this pair. This is because we are looking for the situation where the # similarity is greatest, in order to know which pairs are redundant. Continue to next loop # iteration as we do not want to update the database with this pair, or add this pair. continue else: matchesFound[index]['Similarity'] = similarity matchesFound[index]['Tuple'] = tuple([query, hit, similarity, length, eValue]) readIn.close() tuplesToAdd = [] for i in matchesFound: query = matchesFound[i]['Tuple'][0] hit = matchesFound[i]['Tuple'][1] if not query == hit: tuplesToAdd.append(matchesFound[i]['Tuple']) if tuplesToAdd != []: values = '(' + ('%s,' * len(tuplesToAdd[0])) values = values[:-1] + ')' cursor.execute('TRUNCATE TABLE ' + tableBLASTResults) cursor = mysql.tableINSERT(cursor, tableBLASTResults, values, tuplesToAdd) mysql.closeConnection(conn, cursor)
def main(parsedGOOutput, schemaProteins, tableGOInfo, databasePassword): # =========================================================================== # Extract and format the parsed GO data. # =========================================================================== GOData = utilities.file2list.main(parsedGOOutput) GOData = [eval(i) for i in GOData] GODict = dict([(i[0], i) for i in GOData]) # =========================================================================== # Extract the GO information recorded in the database. # =========================================================================== conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.tableSELECT(cursor, "*", tableGOInfo) results = cursor.fetchall() mysql.closeConnection(conn, cursor) # =========================================================================== # Compare the parsed data with the data recorded in the table. # =========================================================================== columnIndices = range(1, len(GOData[0])) conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor.execute("SHOW COLUMNS FROM " + tableGOInfo) columns = cursor.fetchall() mysql.closeConnection(conn, cursor) columns = [i[0] for i in columns] toRemove = [] toUpdate = {} toAdd = GODict.keys() for i in results: geneID = i[0] if GODict.has_key(geneID): # If the key is in both the parsed file and the table, then it does not need to be added. toAdd.remove(i[0]) # Compare the row from the table with the parsed file, to determine if the table needs updating. for j in columnIndices: if i[j] != GODict[geneID][j]: if not toUpdate.has_key(geneID): toUpdate[geneID] = [] toUpdate[geneID].append(j) else: # If the key is in the table, but not in the parsed file, then the row needs to be removed. toRemove.append(i[0]) values = "(" + ("%s," * len(GOData[0])) values = values[:-1] + ")" # =========================================================================== # Remove rows from the table that are not in the parsed file. # =========================================================================== for i in toRemove: conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.rowDELETE(cursor, tableGOInfo, 'GOTermID="' + str(i) + '"') mysql.closeConnection(conn, cursor) print "\tEntries removed from the GO table: ", len(toRemove) # =========================================================================== # Update rows that have different values in the parsed file and the table. # =========================================================================== for i in toUpdate.keys(): toSet = [] for j in toUpdate[i]: updateString = columns[j] + ' = "' + GODict[i][j] + '"' toSet.append(updateString) toSet = ", ".join(toSet) conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.tableUPDATE(cursor, tableGOInfo, toSet, 'GOTermID="' + str(i) + '"') mysql.closeConnection(conn, cursor) print "\tEntries updated in the GO table: ", len(toUpdate) # =========================================================================== # Add rows which are not in the table, but are in the parsed file. # =========================================================================== # Split the records to be inserted into smaller chunks so the database connection is not lost # In order to get this to work you need to increase the size of max_allowed_packet for the # MySQL server. This is because the size of some of the paths is very large. # I did this by altering the default my.ini file to contain the line: # max_allowed_packets=32M # and put this line under the [mysqld] section rowsToAdd = [GODict[i] for i in toAdd] length = len(rowsToAdd) itemsInSplit = 5 numberOfSplits = length / itemsInSplit if length % itemsInSplit != 0: numberOfSplits += 1 recordsToInsert = [rowsToAdd[i * itemsInSplit : (i + 1) * itemsInSplit] for i in range(numberOfSplits)] for i in recordsToInsert: conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.tableINSERT(cursor, tableGOInfo, values, i) mysql.closeConnection(conn, cursor) print "\tEntries added to the GO table: ", len(toAdd)
def main(pathwayElements, UPHumanNames, UPHumanAccessionMap, databasePassword, schemaProteins, tablePathways): # Generate the human accession mapping: UPAccMap = utilities.file2list.main(UPHumanAccessionMap) # Make a dictionary where the index is the, possibly deprecated UniProt accession, and the entry is the # representative accession. allUPID = {} for i in UPAccMap: chunks = i.split() allUPID[chunks[0]] = chunks[1] # Extract the names of the valid proteins. validProteinNames = utilities.file2list.main(UPHumanNames) # Extract the information about what pathways each element is in. proteinData = {} readIn = open(pathwayElements, 'r') for line in readIn: line = line.rstrip() chunks = line.split('\t') pathway = chunks[0] elements = chunks[2:] for i in elements: # For every element in the pathway, extract the names and accessions of the protein. Also record the # name of the pathways that the protein is associated with. chunks = i.split(':') cpath = chunks[0] type = chunks[1] name = chunks[2] UPAccession = chunks[3] if proteinData.has_key(cpath): proteinData[cpath]['Names'] |= set(name.split(',')) proteinData[cpath]['UPAccessions'] |= set(UPAccession.split(',')) proteinData[cpath]['Pathways'].add(pathway) else: proteinData[cpath] = {'Names' : set(name.split(',')), 'UPAccessions' : set(UPAccession.split(',')), 'Pathways' : set([pathway])} readIn.close() # Consolidate the pathway element information extracted. This means combining records with UniProt accessions # that map to the same accession, and removing any proteins that can't be found to have a known accession or # a valid name. consolidatedProteinData = {} for i in proteinData.keys(): UPAccessions = proteinData[i]['UPAccessions'] pathways = proteinData[i]['Pathways'] for j in UPAccessions: j = j.split('-')[0] # Combine all isoforms into one record. # Convert each UniProt accession into its representative form. if allUPID.has_key(j): reprAcc = allUPID[j] if consolidatedProteinData.has_key(reprAcc): consolidatedProteinData[reprAcc]['UPAccessions'] |= UPAccessions consolidatedProteinData[reprAcc]['Pathways'] |= pathways else: consolidatedProteinData[reprAcc] = {'UPAccessions' : UPAccessions, 'Pathways' : pathways} else: # Don't use the names for anything right now. pass proteinTuples = [] for i in consolidatedProteinData.keys(): proteinTuples.append(tuple([i, len(consolidatedProteinData[i]['Pathways'])])) values = '(' + ('%s,' * len(proteinTuples[0])) values = values[:-1] + ')' conn, cursor = mysql.openConnection(databasePassword, schemaProteins) cursor.execute('TRUNCATE TABLE ' + tablePathways) mysql.tableINSERT(cursor, tablePathways, values, proteinTuples) mysql.closeConnection(conn, cursor)
def main(DBDrugIDs, DBTargetIDs, TTDUPAccessions, ChEMBLUPAccessions, UPHumanAccessionMap, UPDrugIDs, folderCulling, schemaProteins, tableProteinInfo, tableNonRedundant, tableBLASTResults, databasePassword, viewsDict): allTargetsDB = xref_drugbank_uniprot(DBTargetIDs, UPHumanAccessionMap) allTargetsTTD = xref_TTD_uniprot(TTDUPAccessions, UPHumanAccessionMap) allTargets = list(set(allTargetsChEMBL) | set(allTargetsDB) | set(allTargetsTTD) | set(allTargetsUP)) print '\tTotal number of unique targets found: ', len(allTargets) # Extract mode of action and clear the target information. conn, cursor = mysql.openConnection(databasePassword, schemaProteins) cursor = mysql.tableSELECT(cursor, 'UPAccession, ModeOfAction', tableProteinInfo) resultsModeOfAction = cursor.fetchall() for i in resultsModeOfAction: upid = i[0] mysql.tableUPDATE(cursor, tableProteinInfo, 'Target="N"', 'UPAccession="' + upid + '"') mysql.closeConnection(conn, cursor) # Generate the sets of proteins that are GPCRs, kinases, ion channels and proteases. gpcr = [] kinases = [] ionChannels = [] proteases = [] for i in resultsModeOfAction: if i[1] == 'G-protein coupled receptor': gpcr.append(i[0]) elif i[1] == 'Kinase': kinases.append(i[0]) elif i[1] == 'Ion Channel': ionChannels.append(i[0]) elif i[1] == 'Protease': proteases.append(i[0]) # Update the table to indicate which proteins are targets. conn, cursor = mysql.openConnection(databasePassword, schemaProteins) for i in allTargets: mysql.tableUPDATE(cursor, tableProteinInfo, 'Target="Y"', 'UPAccession="' + i + '"') mysql.closeConnection(conn, cursor) # Perform redundancy removal using Leaf. print '\tPerforming redundancy removal.' # Proteins have had their target status changed, so the redundancy needs to be recalculated. conn, cursor = mysql.openConnection(databasePassword, schemaProteins) # Set the number of columns in the nonredundant table. cursor.execute('SHOW COLUMNS FROM ' + tableNonRedundant) numberColumns = len(cursor.fetchall()) # Select all the proteins recorded in the database. The number of columns has one subtracted from it as the # UP accession column does not take the default value. cursor = mysql.tableSELECT(cursor, 'UPAccession', tableProteinInfo) allProteins = [tuple([i[0]] + (['N'] * (numberColumns - 1))) for i in cursor.fetchall()] # Wipe and refill the nonredundant table. cursor.execute('TRUNCATE TABLE ' + tableNonRedundant) values = '(' + ('%s,' * numberColumns) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableNonRedundant, values, allProteins) for column in sorted(viewsDict.keys()): print '\t\tRunning redundancy removal on ', column # For each set of proteins, run the Leaf program. inputLocation = folderCulling + '/' + column + '.txt' cursor = mysql.tableSELECT(cursor, '*', viewsDict[column]) results = cursor.fetchall() # Determine the accessions of the proteins in the current set. proteinSet = [i[0] for i in results] print '\t\t\tSize of Redundant Dataset: ', len(proteinSet) proteinSetString = '\',\''.join(proteinSet) # Select all the BLAST results where both the hit and query protein are in the set to cull. cursor = mysql.tableSELECT(cursor, '*', tableBLASTResults, 'ProteinA IN (\'' + proteinSetString + '\') AND ProteinB IN (\'' + proteinSetString + '\')') protResults = cursor.fetchall() # Generate the file that is going to be used to perform the culling. writeTo = open(inputLocation, 'w') for i in protResults: writeTo.write('\t'.join([str(j) for j in i]) + '\n') writeTo.close() # Perform the culling. adjMatrix, proteinNames = culling.adjlistcreation.main(inputLocation, cutoffPercent=20, maxEValue=1, minAlignLength=20) print '\t\t\tNumber of Proteins in Similarity Graph: ', len(proteinNames) proteinsToCull = culling.Leafcull.main(adjMatrix, proteinNames) print '\t\t\tNumber of Proteins to Cull: ', len(proteinsToCull) for i in proteinsToCull: mysql.tableUPDATE(cursor, tableNonRedundant, column + '="N"', 'UPAccession="' + str(i) + '"') proteinsToKeep = [i for i in proteinSet if i not in proteinsToCull] print '\t\t\tNumber of Proteins to Keep: ', len(proteinsToKeep) for i in proteinsToKeep: mysql.tableUPDATE(cursor, tableNonRedundant, column + '="Y"', 'UPAccession="' + str(i) + '"') mysql.closeConnection(conn, cursor)
def main(UPDrugIDs, DBDrugIDs, DBTargetIDs, TTDTarget2Drug, ChEMBLUPAccessions, ChEMBLCID, bindingParsed, UPHumanAccessionMap, databasePassword, schemaProteins, tableDrugs): # Generate the human accession mapping: UPAccMap = utilities.file2list.main(UPHumanAccessionMap) # Make a dictionary where the index is the, possibly deprecated UniProt accession, and the entry is the # representative accession. allUPID = {} for i in UPAccMap: chunks = i.split() allUPID[chunks[0]] = chunks[1] # Extract the DrugBank drug IDs, the CAS number and the CID for every approved DrugBank drug. approvedDrugBankDrugs = {} approvedDrugTuples = set([]) readDrugs = open(DBDrugIDs, 'r') for line in readDrugs: chunks = (line.replace('\n', '')).split('\t') if len(chunks) < 4 or not 'approved' in chunks[2]: continue name = chunks[1] CAS = chunks[3] CID = chunks[4] approvedDrugBankDrugs[chunks[0]] = tuple([name, CAS, CID]) approvedDrugTuples.add(tuple([name, CAS, CID])) readDrugs.close() approvedDrugs = approvedDrugBankDrugs.keys() # Extract the information about which DrugBank drugs target UniProt proteins, as recorded by UniProt. targetsUP = utilities.file2list.main(UPDrugIDs) targetDrugLinks = {} for i in targetsUP: chunks = i.split('\t') targetDrugLinks[chunks[0]] = set([i for i in chunks[1].split(';') if i in approvedDrugs]) # Extract the information about which DrugBank drugs target UniProt proteins, as recorded by DrugBank. targetsDB = utilities.file2list.main(DBTargetIDs) for i in targetsDB: chunks = i.split('\t') proteinAcc = chunks[0] if not allUPID.has_key(proteinAcc): continue proteinAcc = allUPID[proteinAcc] if targetDrugLinks.has_key(proteinAcc): targetDrugLinks[proteinAcc].union([i for i in chunks[1].split(';') if i in approvedDrugs]) #targetDrugLinks[proteinAcc].intersection([i for i in chunks[1].split(';') if i in approvedDrugs]) else: targetDrugLinks[proteinAcc] = [i for i in chunks[1].split(';') if i in approvedDrugs] # Convert DrugBank drug IDs to CIDs where possible. for i in targetDrugLinks.keys(): targetDrugLinks[i] = [approvedDrugBankDrugs[j] for j in targetDrugLinks[i]] # Extract the TTD target-drug relationship information. targetsTTD = utilities.file2list.main(TTDTarget2Drug) for i in targetsTTD: chunks = i.split('\t') proteinAccs = [allUPID[i] for i in chunks[1].split(',') if allUPID.has_key(i)] if proteinAccs == []: continue drugInfo = [tuple(j.split(',')) for j in chunks[2].split(';')] for j in drugInfo: approvedDrugTuples.add(j) for j in proteinAccs: if not targetDrugLinks.has_key(j): # If the UniProt accession associated with the TTD target is not already recorded as a drug target, # then add it along with the drugs that target it. targetDrugLinks[j] = drugInfo else: # The UniProt accession is already recorded in the target drug relationship dictionary. # Collect all the (Name, CAS, CID) tuples, and see if the drugs associated with the UniProt # accession in the TTD, are already accounted for. drugTuples = targetDrugLinks[j] drugNames = set([]) drugCASes = set([]) drugCIDs = set([]) for k in drugTuples: drugNames.add(k[0]) drugCASes.add(k[1]) drugCIDs.add(k[2]) emptySet = set(['']) drugNames -= emptySet drugCASes -= emptySet drugCIDs -= emptySet for k in drugInfo: if k[0] in drugNames or k[1] in drugCASes or k[2] in drugCIDs: # If this is True, then that means that one of the name, CAS or CID is already recorded in the # list of drugs that target the protein. Therefore, the drug is already recorded as targeting # the protein and should not be added. continue targetDrugLinks[j].append(k) # Extract the ChEMBL drug-target relationship information, along with the binding information. molregno2CID = {} readCIDs = open(ChEMBLCID, 'r') for line in readCIDs: chunks = (line.strip()).split('\t') molregno2CID[chunks[0]] = chunks[1] readCIDs.close() targetDrugBinding = {} targetsChEMBL = utilities.file2list.main(ChEMBLUPAccessions) for i in targetsChEMBL: chunks = i.split('\t') UPAccession = chunks[0] if not allUPID.has_key(UPAccession): # If the target protein is not a UniProt human protein. continue else: UPAccession = allUPID[UPAccession] molregno = chunks[1] drugID = molregno2CID[molregno] if molregno2CID.has_key(molregno) else 'm' + molregno name = chunks[2] # Update the drug-target relationship information. drugInfo = tuple([name, '', drugID]) if not targetDrugLinks.has_key(UPAccession): # If the UniProt accession associated with the TTD target is not already recorded as a drug target, # then add it along with the drugs that target it. targetDrugLinks[UPAccession] = [drugInfo] else: # The UniProt accession is already recorded in the target drug relationship dictionary. # Collect all the (Name, CAS, CID) tuples, and see if the drugs associated with the UniProt # accession in the TTD, are already accounted for. drugTuples = targetDrugLinks[UPAccession] drugNames = set([]) drugCASes = set([]) drugCIDs = set([]) for j in drugTuples: drugNames.add(j[0]) drugCASes.add(j[1]) drugCIDs.add(j[2]) emptySet = set(['']) drugNames -= emptySet drugCASes -= emptySet drugCIDs -= emptySet if drugInfo[0] in drugNames or drugInfo[1] in drugCASes or drugInfo[2] in drugCIDs: # If this is True, then that means that one of the name, CAS or CID is already recorded in the # list of drugs that target the protein. Therefore, the drug is already recorded as targeting # the protein and should not be added. pass else: targetDrugLinks[UPAccession].append(drugInfo) # Record drug binding information. if chunks[4] == 'None' or chunks[5] == 'None': continue value = float(chunks[4]) units = chunks[5] if units == 'M': # Convert the value to nM. value *= 1000000000 elif units == 'mM': # Convert the value to nM. value *= 1000000 elif units == 'uM': # Convert the value to nM. value *= 1000 type = chunks[6] bindingKey = tuple([UPAccession, drugID]) if type.lower() == 'ki': if not targetDrugBinding.has_key(bindingKey): targetDrugBinding[bindingKey] = {'Ki' : value, 'Kd' : float('Inf')} else: targetDrugBinding[bindingKey]['Ki'] = min(targetDrugBinding[bindingKey]['Ki'], value) elif type.lower() == 'kd': if not targetDrugBinding.has_key(bindingKey): targetDrugBinding[bindingKey] = {'Ki' : float('Inf'), 'Kd' : value} else: targetDrugBinding[bindingKey]['Kd'] = min(targetDrugBinding[bindingKey]['Kd'], value) # Extract binding information from BindingDB. bindingData = utilities.file2list.main(bindingParsed) for i in bindingData: chunks = i.split('\t') UPAccession = chunks[0] if allUPID.has_key(UPAccession): UPAccession = allUPID[UPAccession] else: continue CID = chunks[1] Ki = chunks[2] Kd = chunks[3] if CID == 'n/a' or (Ki == 'n/a' and Kd == 'n/a'): # If any of these are True then the binding information can not be cross-referenced (CID == 'n/a', # or there is no binding information. continue if Ki[0] in ['>', '<', '=']: Ki = float(Ki[1:]) else: try: Ki = float(Ki) except: pass if Kd[0] in ['>', '<', '=']: Ki = float(Kd[1:]) else: try: Kd = float(Kd) except: pass bindingKey = tuple([UPAccession, CID]) if Ki != 'n/a': if not targetDrugBinding.has_key(bindingKey): targetDrugBinding[bindingKey] = {'Ki' : Ki, 'Kd' : float('Inf')} else: targetDrugBinding[bindingKey]['Ki'] = min(targetDrugBinding[bindingKey]['Ki'], Ki) if Kd != 'n/a': if not targetDrugBinding.has_key(bindingKey): targetDrugBinding[bindingKey] = {'Ki' : float('Inf'), 'Kd' : Kd} else: targetDrugBinding[bindingKey]['Kd'] = min(targetDrugBinding[bindingKey]['Kd'], Kd) # Generate the tuples to insert into the database. tuplesToInsert = [] for i in targetDrugLinks: for j in targetDrugLinks[i]: drugName = j[0] drugID = j[2] bindingKey = tuple([i, drugID]) if targetDrugBinding.has_key(bindingKey): bindingData = targetDrugBinding[bindingKey] Ki = bindingData['Ki'] if Ki == float('inf'): Ki = -1 Kd = bindingData['Kd'] if Kd == float('inf'): Kd = -1 else: Ki = -1 Kd = -1 tuplesToInsert.append(tuple([i, drugID, drugName, Ki, Kd])) # Remove potential duplicates caused by the names of the drugs having different capital letters, and mysql # ignoring the capitalisation of the names. tuplesToInsert = [tuple([i[0], i[1], i[2].upper(), i[3], i[4]]) for i in tuplesToInsert] tuplesToInsert = list(set(tuplesToInsert)) # Insert the tuples into the database. values = '(' + ('%s,' * len(tuplesToInsert[0])) values = values[:-1] + ')' conn, cursor = mysql.openConnection(databasePassword, schemaProteins) cursor.execute('TRUNCATE TABLE ' + tableDrugs) # Remove the old data in the table first. mysql.tableINSERT(cursor, tableDrugs, values, tuplesToInsert) mysql.closeConnection(conn, cursor)
def main(UPExternalLinks, tableUniProt2Ensembl, tableUniProt2GO, tableUniProt2UniGene, tableUniProt2HGNC, tableDict, schemaProteins, databasePassword): # Determine all the external database links for each represetnative UniProt accession. proteinLinks = {} readIn = open(UPExternalLinks, 'r') for line in readIn: line = line.strip() chunks = line.split(',') entrezLinks = [i for i in chunks[1].split(';')] if chunks[1] != '' else [] unigeneLinks = [int(i.replace('Hs.', '')) for i in chunks[2].split(';')] if chunks[2] != '' else [] goLinks = [int(i.replace('GO:', '')) for i in chunks[3].split(';')] if chunks[3] != '' else [] hgncLinks = [i for i in chunks[4].split(';')] if chunks[4] != '' else [] ensemblLinks = [i.split('-') for i in chunks[5].split(';')] if chunks[4] != '' else [] proteinLinks[chunks[0]] = {'Entrez' : entrezLinks, 'UniGene' : unigeneLinks, 'GO' : goLinks, 'HGNC' : hgncLinks, 'Ensembl' : ensemblLinks } readIn.close() conn, cursor = mysql.openConnection(databasePassword, schemaProteins) validXrefIDs = {'EnsemblGeneID' : set([]), 'GeneID' : set([]), 'GOTermID' : set([]), 'UPAccession' : set([]), 'UniGeneID' : set([]), 'EnsemblTranscriptID' : set([])} for i in tableDict.keys(): for j in tableDict[i]: # For each of the external databases with its own table in the database (e.g. Ensembl genes, UniGene, GO terms), determine which of the cross-references recorded # are actually referencing a valid ID in the respective database. # For example, if the file of cross-references says that UniProt accession U is linked to GO term 123, then make sure that 123 is in fact a valid Go term ID (i.e. that # it is in the table that contains all the GO term IDs). cursor = mysql.tableSELECT(cursor, i, j) results = [k[0] for k in cursor.fetchall()] results = set(results) validXrefIDs[i] = validXrefIDs[i].union(results) # Determine all the UniProt accession cross-references that are referencing a valid external database identifier. entrezInsert = [] unigeneInsert = [] goInsert = [] hgncInsert = [] ensemblInsert = [] for i in proteinLinks.keys(): if not i in validXrefIDs['UPAccession']: continue for j in proteinLinks[i]['Entrez']: if j in validXrefIDs['GeneID']: entrezInsert.append(tuple([i, str(j)])) for j in proteinLinks[i]['UniGene']: if j in validXrefIDs['UniGeneID']: unigeneInsert.append(tuple([i, j])) for j in proteinLinks[i]['GO']: if j in validXrefIDs['GOTermID']: goInsert.append(tuple([i, j])) for j in proteinLinks[i]['HGNC']: hgncInsert.append(tuple([i, j])) for j in proteinLinks[i]['Ensembl']: if j[0] in validXrefIDs['EnsemblGeneID']:# and j[1] in validXrefIDs['EnsemblTranscriptID']: ensemblInsert.append(tuple([i] + j)) print '\tNow Recording UniGene Crossreferences.' cursor.execute('TRUNCATE TABLE ' + tableUniProt2UniGene) values = '(' + ('%s,' * len(unigeneInsert[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableUniProt2UniGene, values, unigeneInsert) print '\tNow Recording GO Crossreferences.' cursor.execute('TRUNCATE TABLE ' + tableUniProt2GO) values = '(' + ('%s,' * len(goInsert[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableUniProt2GO, values, goInsert) print '\tNow Recording HGNC Crossreferences.' cursor.execute('TRUNCATE TABLE ' + tableUniProt2HGNC) values = '(' + ('%s,' * len(hgncInsert[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableUniProt2HGNC, values, hgncInsert) print '\tNow Recording Ensembl Crossreferences.' cursor.execute('TRUNCATE TABLE ' + tableUniProt2Ensembl) values = '(' + ('%s,' * len(ensemblInsert[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableUniProt2Ensembl, values, ensemblInsert) mysql.closeConnection(conn, cursor)
def main(unigeneParsedOutput, unigeneParsedTotals, schemaProteins, tableUniGene, tableUniGeneTotals, databasePassword): #=========================================================================== # Extract and format the parsed UniGene data, and the UniGene expression totals. #=========================================================================== UGData = utilities.file2list.main(unigeneParsedOutput) UGData = [tuple([int(j) for j in eval(i)]) for i in UGData] UGDict = dict([(i[0], i) for i in UGData]) UGTotalsData = utilities.file2list.main(unigeneParsedTotals) UGTotalsData = [tuple([j[0], eval(j[1])]) for j in [eval(i) for i in UGTotalsData]] #=========================================================================== # Extract the UniGene information recorded in the database. #=========================================================================== conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.tableSELECT(cursor, '*', tableUniGene) results = cursor.fetchall() mysql.closeConnection(conn, cursor) #=========================================================================== # Compare the parsed data with the data recorded in the expression table. #=========================================================================== columnIndices = range(1, len(UGData[0])) conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor.execute('SHOW COLUMNS FROM ' + tableUniGene) columns = cursor.fetchall() mysql.closeConnection(conn, cursor) columns = [i[0] for i in columns] toRemove = [] toUpdate = {} toAdd = UGDict.keys() for i in results: UniGeneID = i[0] if UGDict.has_key(UniGeneID): # If the key is in both the parsed file and the expression table, then it does not need to be added. toAdd.remove(i[0]) # Compare the row from the expression table with the parsed file, to determine if the expression table needs updating. for j in columnIndices: if i[j] != UGDict[UniGeneID][j]: if not toUpdate.has_key(UniGeneID): toUpdate[UniGeneID] = [] toUpdate[UniGeneID].append(j) else: # If the key is in the expression table, but not in the parsed file, then the row needs to be removed. toRemove.append(i[0]) values = '(' + ('%s,' * len(UGData[0])) values = values[:-1] + ')' #=========================================================================== # Remove rows from the expression table that are not in the parsed file. #=========================================================================== for i in toRemove: conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.rowDELETE(cursor, tableUniGene, 'UniGeneID="' + i + '"') mysql.closeConnection(conn, cursor) print '\tEntries removed from the UniGene table: ', len(toRemove) #=========================================================================== # Update rows that have different values in the parsed file and the expression table. #=========================================================================== for i in toUpdate.keys(): toSet = [] for j in toUpdate[i]: updateString = columns[j] + ' = "' + UGDict[i][j] + '"' toSet.append(updateString) toSet = ', '.join(toSet) conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.tableUPDATE(cursor, tableUniGene, toSet, 'UniGeneID="' + i + '"') mysql.closeConnection(conn, cursor) print '\tEntries updated in the UniGene table: ', len(toUpdate) #=========================================================================== # Add rows which are not in the expression table, but are in the parsed file. #=========================================================================== rowsToAdd = [UGDict[i] for i in toAdd] conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor = mysql.tableINSERT(cursor, tableUniGene, values, rowsToAdd) mysql.closeConnection(conn, cursor) print '\tEntries added to the UniGene table: ', len(toAdd) #=========================================================================== # Enter the expression totals in the totals table. #=========================================================================== conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins) cursor.execute('TRUNCATE TABLE ' + tableUniGeneTotals) values = '(' + ('%s,' * len(UGTotalsData[0])) values = values[:-1] + ')' cursor = mysql.tableINSERT(cursor, tableUniGeneTotals, values, UGTotalsData) mysql.closeConnection(conn, cursor) print '\tUniGene totals table updated.'
def main(databasePassword, schemaProteins, tableProteinInfo, tableStability): # Define N-terminus half life values (explanation http://en.wikipedia.org/wiki/N-end_rule and the ProtParam tool). halfLife = {'A' : 4.4, 'C' : 1.2, 'D' : 1.1, 'E' : 1.0, 'F' : 1.1, 'G' : 30.0, 'H' : 3.5, 'I' : 20.0, 'K' : 1.3, 'L' : 5.5, 'M' : 30.0, 'N' : 1.4, 'P' : 20.0, 'Q' : 0.8, 'R' : 1.0, 'S' : 1.9, 'T' : 7.2, 'V' : 100.0, 'W' : 2.8, 'Y' : 2.8} # Extract all the sequences stored in the database. conn, cursor = mysql.openConnection(databasePassword, schemaProteins) cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableProteinInfo) results = cursor.fetchall() # Calculate the half life and instability index for each protein. stabilityTuples = [] for i in results: sequence = i[1] if halfLife.has_key(sequence[0]): protHalfLife = halfLife[sequence[0]] else: # This will occur when the N-terminal is not an amino acid with an associated half-life value (e.g. X, B, etc.) protHalfLife = -1 analysedSeq = ProteinAnalysis(sequence) try: instabilityIndex = analysedSeq.instability_index() except: instabilityIndex = -1 print '\tContains invalid aa code: ', i[0] stabilityTuples.append(tuple([i[0], protHalfLife, instabilityIndex])) cursor.execute('TRUNCATE TABLE ' + tableStability) values = '(' + ('%s,' * len(stabilityTuples[0])) values = values[:-1] + ')' mysql.tableINSERT(cursor, tableStability, values, stabilityTuples) mysql.closeConnection(conn, cursor) #def instability_index(prot, sequence): # # # A two dimentional dictionary for calculating the instability index. # # Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990). # # It is based on dipeptide values therefore the vale for the dipeptide DG is DIWV['D']['G']. # DIWV = {'A': {'A': 1.0, 'C': 44.94, 'E': 1.0, 'D': -7.49, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': -7.49, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, # 'C': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 20.26, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 33.60, # 'K': 1.0, 'M': 33.60, 'L': 20.26, 'N': 1.0, # 'Q': -6.54, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': 33.60, 'W': 24.68, 'V': -6.54, 'Y': 1.0}, # 'E': {'A': 1.0, 'C': 44.94, 'E': 33.60, 'D': 20.26, # 'G': 1.0, 'F': 1.0, 'I': 20.26, 'H': -6.54, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': 1.0, # 'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0}, # 'D': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0, # 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 1.0, 'P': 1.0, 'S': 20.26, 'R': -6.54, # 'T': -14.03, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, # 'F': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 13.34, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': -14.03, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 33.601}, # 'I': {'A': 1.0, 'C': 1.0, 'E': 44.94, 'D': 1.0, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 13.34, # 'K': -7.49, 'M': 1.0, 'L': 20.26, 'N': 1.0, # 'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, # 'G': {'A': -7.49, 'C': 1.0, 'E': -6.54, 'D': 1.0, # 'G': 13.34, 'F': 1.0, 'I': -7.49, 'H': 1.0, # 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': -7.49, # 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0, # 'T': -7.49, 'W': 13.34, 'V': 1.0, 'Y': -7.49}, # 'H': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': -9.37, 'F': -9.37, 'I': 44.94, 'H': 1.0, # 'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 24.68, # 'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0, # 'T': -6.54, 'W': -1.88, 'V': 1.0, 'Y': 44.94}, # 'K': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': -7.49, 'F': 1.0, 'I': -7.49, 'H': 1.0, # 'K': 1.0, 'M': 33.60, 'L': -7.49, 'N': 1.0, # 'Q': 24.64, 'P': -6.54, 'S': 1.0, 'R': 33.60, # 'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, # 'M': {'A': 13.34, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 58.28, # 'K': 1.0, 'M': -1.88, 'L': 1.0, 'N': 1.0, # 'Q': -6.54, 'P': 44.94, 'S': 44.94, 'R': -6.54, # 'T': -1.88, 'W': 1.0, 'V': 1.0, 'Y': 24.68}, # 'L': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 33.60, 'P': 20.26, 'S': 1.0, 'R': 20.26, # 'T': 1.0, 'W': 24.68, 'V': 1.0, 'Y': 1.0}, # 'N': {'A': 1.0, 'C': -1.88, 'E': 1.0, 'D': 1.0, # 'G': -14.03, 'F': -14.03, 'I': 44.94, 'H': 1.0, # 'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': -6.54, 'P': -1.88, 'S': 1.0, 'R': 1.0, # 'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 1.0}, # 'Q': {'A': 1.0, 'C': -6.54, 'E': 20.26, 'D': 20.26, # 'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': -6.54, 'Y': -6.54}, # 'P': {'A': 20.26, 'C': -6.54, 'E': 18.38, 'D': -6.54, # 'G': 1.0, 'F': 20.26, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': -6.54, 'L': 1.0, 'N': 1.0, # 'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': -6.54, # 'T': 1.0, 'W': -1.88, 'V': 20.26, 'Y': 1.0}, # 'S': {'A': 1.0, 'C': 33.60, 'E': 20.26, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 44.94, 'S': 20.26, 'R': 20.26, # 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, # 'R': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 20.26, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 13.34, 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 58.28, # 'T': 1.0, 'W': 58.28, 'V': 1.0, 'Y': -6.54}, # 'T': {'A': 1.0, 'C': 1.0, 'E': 20.26, 'D': 1.0, 'G': -7.49, 'F': 13.34, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': -14.03, 'Q': -6.54, 'P': 1.0, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0}, # 'W': {'A': -14.03, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -9.37, 'F': 1.0, 'I': 1.0, 'H': 24.68, # 'K': 1.0, 'M': 24.68, 'L': 13.34, 'N': 13.34, 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0, # 'T': -14.03, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, # 'V': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': -14.03, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': -1.88, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': -7.49, 'W': 1.0, 'V': 1.0, 'Y': -6.54}, # 'Y': {'A': 24.68, 'C': 1.0, 'E': -6.54, 'D': 24.68, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 13.34, # 'K': 1.0, 'M': 44.94, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 13.34, 'S': 1.0, 'R': -15.91, # 'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 13.34}, # } # # score = 0.0 # for i in range(len(sequence) - 1): # if DIWV.has_key(sequence[i]): # if DIWV[sequence[i]].has_key(sequence[i+1]): # score += DIWV[sequence[i]][sequence[i+1]] # return (10.0 / len(sequence)) * score