def calculate_pest_motif(sequenceOfAllProteinsDict, epestfindExe, proteinFasta, epestfindOutput, schemaProteins,
                         tableProteinInfo, databasePassword):
    """Run epestfind on every protein and store the parsed result in the PESTMotif column.

    For each protein a single-record FASTA file is written to proteinFasta, epestfind is run on it,
    epestfindOutput is parsed with parsers.parseepestfind and the parsed value is written to the
    row whose UPAccession matches the protein.

    Args:
        sequenceOfAllProteinsDict: mapping from protein accession to its sequence string.
        epestfindExe: location of the epestfind executable. It is passed to the process as a single
            argument, so it must not carry extra quoting or options.
        proteinFasta: path of the scratch FASTA file; overwritten for every protein.
        epestfindOutput: path epestfind writes to; overwritten for every protein.
        schemaProteins: database schema to connect to.
        tableProteinInfo: table holding the UPAccession and PESTMotif columns.
        databasePassword: password used to open the database connection.
    """
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    try:
        for UPAcc, seq in sequenceOfAllProteinsDict.items():
            # epestfind reads FASTA, so write the protein out as a one-record FASTA file.
            with open(proteinFasta, 'w') as epestfindInput:
                epestfindInput.write('>' + UPAcc + '\n')
                epestfindInput.write(seq)

            # An argument list (rather than one command string) runs without a shell on every
            # platform and keeps paths containing spaces intact.
            subprocess.call([epestfindExe, '-sequence', proteinFasta, '-outfile', epestfindOutput,
                             '-auto', '-window', '10', '-order', 'score', '-graph', 'none'])

            # Parse the epestfind output to determine how many valid PEST motifs were found.
            motifPresent = parsers.parseepestfind.main(epestfindOutput)

            cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'PESTMotif=' + str(motifPresent),
                                       'UPAccession = \'' + UPAcc + '\'')
    finally:
        # Release the connection even when a protein fails part-way through the run.
        mysql.closeConnection(conn, cursor)
def calculate_low_complexity(sequenceOfAllProteinsDict, SEGExe, proteinFasta, SEGOutput, schemaProteins,
                             tableProteinInfo, databasePassword):
    """Run segmasker on every protein and store the parsed result in the LowComplexity column.

    For each protein a single-record FASTA file is written to proteinFasta, segmasker is run on it,
    SEGOutput is parsed with parsers.parseSEG and the parsed value is written to the row whose
    UPAccession matches the protein.

    Args:
        sequenceOfAllProteinsDict: mapping from protein accession to its sequence string.
        SEGExe: location of the segmasker executable. It is passed to the process as a single
            argument, so it must not carry extra quoting or options.
        proteinFasta: path of the scratch FASTA file; overwritten for every protein.
        SEGOutput: path segmasker writes to; overwritten for every protein.
        schemaProteins: database schema to connect to.
        tableProteinInfo: table holding the UPAccession and LowComplexity columns.
        databasePassword: password used to open the database connection.
    """
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    try:
        for UPAcc, seq in sequenceOfAllProteinsDict.items():
            # segmasker reads FASTA, so write the protein out as a one-record FASTA file.
            with open(proteinFasta, 'w') as SEGInput:
                SEGInput.write('>' + UPAcc + '\n')
                SEGInput.write(seq)

            # An argument list (rather than one command string) runs without a shell on every
            # platform and keeps paths containing spaces intact.
            subprocess.call([SEGExe, '-in', proteinFasta, '-out', SEGOutput])

            # Parse the segmasker output to count the low complexity regions.
            numLowComplexity = parsers.parseSEG.main(SEGOutput)

            cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'LowComplexity=' + str(numLowComplexity),
                                       'UPAccession = \'' + UPAcc + '\'')
    finally:
        # Release the connection even when a protein fails part-way through the run.
        mysql.closeConnection(conn, cursor)
def main(UPPPIData, schemaProteins, tablePPI, databasePassword):
    """Bring the PPI table in line with the parsed protein-protein interaction file.

    Rows are keyed on their first two fields (PPIProteinOne, PPIProteinTwo). Rows only present in
    the table are deleted, rows present in both are updated column by column where the values
    differ, and rows only present in the file are inserted.

    Args:
        UPPPIData: path of the parsed PPI file; every line must evaluate to one row.
        schemaProteins: database schema to connect to.
        tablePPI: name of the PPI table.
        databasePassword: password used to open the database connection.

    Raises:
        IndexError: if the parsed file contains no rows.
    """
    #===========================================================================
    # Extract and format the parsed PPI data.
    #===========================================================================
    # NOTE(review): eval() executes whatever each line of the file contains. That is only safe while
    # the file is produced by this pipeline; consider ast.literal_eval if the rows are plain literals.
    ppiData = [eval(i) for i in utilities.file2list.main(UPPPIData)]
    ppiDict = dict((tuple([i[0], i[1]]), i) for i in ppiData)
    numColumns = len(ppiData[0])
    columnIndices = range(1, numColumns)
    values = '(' + ','.join(['%s'] * numColumns) + ')'

    #===========================================================================
    # Extract the rows and the column names recorded in the database.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    try:
        cursor = mysql.tableSELECT(cursor, '*', tablePPI)
        results = cursor.fetchall()
        cursor.execute('SHOW COLUMNS FROM ' + tablePPI)
        columns = [i[0] for i in cursor.fetchall()]
    finally:
        mysql.closeConnection(conn, cursor)

    #===========================================================================
    # Compare the parsed data with the data recorded in the table.
    #===========================================================================
    toRemove = []
    toUpdate = {}
    inTable = set()  # Keys found in both places; a set keeps the bookkeeping O(1) per row.
    for row in results:
        dictKey = tuple([row[0], row[1]])
        if dictKey not in ppiDict:
            # The key is in the table but not in the parsed file, so the row needs to be removed.
            toRemove.append(dictKey)
            continue
        inTable.add(dictKey)
        parsedRow = ppiDict[dictKey]
        changed = [j for j in columnIndices if row[j] != parsedRow[j]]
        if changed:
            toUpdate.setdefault(dictKey, []).extend(changed)

    #===========================================================================
    # Remove rows from the table that are not in the parsed file.
    #===========================================================================
    for key in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        try:
            cursor = mysql.rowDELETE(cursor, tablePPI,
                                     'PPIProteinOne="' + key[0] + '" AND PPIProteinTwo="' + key[1] + '"')
        finally:
            mysql.closeConnection(conn, cursor)
    print('%s %s' % ('\tEntries removed from the PPI table: ', len(toRemove)))

    #===========================================================================
    # Update rows that have different values in the parsed file and the table.
    #===========================================================================
    for key in toUpdate.keys():
        toSet = ', '.join([columns[j] + ' = "' + str(ppiDict[key][j]) + '"' for j in toUpdate[key]])
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        try:
            cursor = mysql.tableUPDATE(cursor, tablePPI, toSet,
                                       'PPIProteinOne="' + key[0] + '" AND PPIProteinTwo="' + key[1] + '"')
        finally:
            mysql.closeConnection(conn, cursor)
    print('%s %s' % ('\tEntries updated in the PPI table: ', len(toUpdate)))

    #===========================================================================
    # Add rows which are not in the table, but are in the parsed file.
    #===========================================================================
    rowsToAdd = [ppiDict[key] for key in ppiDict if key not in inTable]
    if rowsToAdd:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        try:
            cursor = mysql.tableINSERT(cursor, tablePPI, values, rowsToAdd)
        finally:
            mysql.closeConnection(conn, cursor)
    print('%s %s' % ('\tEntries added to the PPI table: ', len(rowsToAdd)))
def _redistribute_ambiguous(counts, ambiguous, first, second):
    """Share the count of an ambiguity code between the two residues it can stand for."""
    unresolved = counts[ambiguous]
    if not unresolved:
        return
    resolved = counts[first] + counts[second]
    # Split in proportion to the observed residues; fall back to an even split when neither is
    # present, which would otherwise be a division by zero.
    firstShare = counts[first] / resolved if resolved else 0.5
    counts[first] += unresolved * firstShare
    counts[second] += unresolved * (1.0 - firstShare)


def calculate_sequence_stats(sequenceOfAllProteinsDict, pepstatsExe, proteinFasta, pepstatsOutput, schemaProteins,
                             tableProteinInfo, databasePassword):
    """Store the isoelectric point and amino acid composition statistics of every protein.

    All proteins are written to one FASTA file and run through pepstats; the parsed 'pI' value is
    written to the Isoelectric column. Then, for every protein, the fraction of the sequence made
    up of each of the 20 coded amino acids, the mean Kyte and Doolittle hydrophobicity and the
    fraction belonging to each amino acid class are written to their columns. Every fraction is
    relative to the full sequence length, so O, U and X residues count towards the length but
    towards none of the reported amino acids.

    Args:
        sequenceOfAllProteinsDict: mapping from protein accession to its sequence string
            (upper-case one-letter codes).
        pepstatsExe: location of the pepstats executable. It is passed to the process as a single
            argument, so it must not carry extra quoting or options.
        proteinFasta: path of the FASTA file written for pepstats.
        pepstatsOutput: path pepstats writes its results to.
        schemaProteins: database schema to connect to.
        tableProteinInfo: table holding the UPAccession column and the statistics columns.
        databasePassword: password used to open the database connection.

    Raises:
        ValueError: if a sequence contains a character other than an upper-case letter.
        KeyError: if the parsed pepstats output has no entry for one of the proteins.
    """
    aminoAcids = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # trueAAs are the 20 amino acids that are coded for by the genetic code.
    trueAAs = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    # (column name, residues in the class), in the order the columns are written.
    aaClasses = [
        ('Tiny', ['A', 'C', 'G', 'S', 'T']),
        ('Small', ['A', 'C', 'D', 'G', 'N', 'P', 'S', 'T', 'V']),
        ('Aliphatic', ['I', 'L', 'V']),
        ('Aromatic', ['F', 'H', 'W', 'Y']),
        ('NonPolar', ['A', 'C', 'F', 'G', 'I', 'L', 'M', 'P', 'V', 'W', 'Y']),
        ('Polar', ['D', 'E', 'H', 'K', 'N', 'Q', 'R', 'S', 'T']),
        ('Charged', ['D', 'E', 'H', 'K', 'R']),
        ('Basic', ['H', 'K', 'R']),
        ('NegativelyCharged', ['D', 'E']),
        ('PositivelyCharged', ['H', 'K', 'R']),
    ]
    # The hydrophobicity of different amino acid residues, as measured by the Kyte and Doolittle scale.
    hydro = {'A' : 1.8, 'C' : 2.5, 'D' : -3.5, 'E' : -3.5, 'F' : 2.8, 'G' : -0.4, 'H' : -3.2, 'I' : 4.5,
             'K' : -3.9, 'L' : 3.8, 'M' : 1.9, 'N' : -3.5, 'P' : -1.6, 'Q' : -3.5, 'R' : -4.5, 'S' : -0.8,
             'T' : -0.7, 'V' : 4.2, 'W' : -0.9, 'Y' : -1.3}

    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    try:
        #===========================================================================
        # Run pepstats on all the proteins.
        #===========================================================================
        with open(proteinFasta, 'w') as pepstatsInput:
            for UPAcc, seq in sequenceOfAllProteinsDict.items():
                pepstatsInput.write('>' + UPAcc + '\n')
                pepstatsInput.write(seq + '\n')

        # An argument list (rather than one command string) runs without a shell on every platform
        # and keeps paths containing spaces intact.
        subprocess.call([pepstatsExe, '-sequence', proteinFasta, '-outfile', pepstatsOutput, '-auto'])

        # Parse the pepstats output file to get the isoelectric point of each protein.
        pIDict = parsers.parsePepstats.main(pepstatsOutput)
        for UPAcc in sequenceOfAllProteinsDict:
            pI = pIDict[UPAcc]['pI']
            cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Isoelectric=' + str(pI),
                                       'UPAccession = \'' + UPAcc + '\'')

        #===========================================================================
        # Calculate the sequence statistics for the proteins.
        #===========================================================================
        for UPAcc, seq in sequenceOfAllProteinsDict.items():
            seqLen = len(seq)
            if seqLen == 0:
                # No residues means no fractions can be computed; leave the row untouched.
                continue
            whereClause = 'UPAccession = \'' + UPAcc + '\''

            # Count each residue type. Floats keep every later division a true division.
            counts = dict.fromkeys(aminoAcids, 0.0)
            for residue in seq:
                if residue not in counts:
                    raise ValueError('Unexpected residue %r in the sequence of %s' % (residue, UPAcc))
                counts[residue] += 1

            # Resolve the ambiguity codes so that every ambiguous residue is counted exactly once:
            # B is asparagine (N) or aspartic acid (D), J is leucine (L) or isoleucine (I) and
            # Z is glutamine (Q) or glutamic acid (E).
            _redistribute_ambiguous(counts, 'B', 'N', 'D')
            _redistribute_ambiguous(counts, 'J', 'L', 'I')
            _redistribute_ambiguous(counts, 'Z', 'Q', 'E')

            # Frequency of each of the 20 amino acids coded for by the genome.
            for aa in trueAAs:
                cursor = mysql.tableUPDATE(cursor, tableProteinInfo, aa + '=' + str(counts[aa] / seqLen),
                                           whereClause)

            # Mean hydrophobicity of the sequence.
            hydroCalc = sum(counts[aa] * hydro[aa] for aa in trueAAs) / seqLen
            cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Hydrophobicity=' + str(hydroCalc), whereClause)

            # Fraction of the sequence that is made up of each class of amino acids.
            for columnName, members in aaClasses:
                fraction = sum(counts[aa] for aa in members) / seqLen
                cursor = mysql.tableUPDATE(cursor, tableProteinInfo, columnName + '=' + str(fraction),
                                           whereClause)
    finally:
        # Release the connection even when a protein fails part-way through the run.
        mysql.closeConnection(conn, cursor)
def main(parsedGOOutput, schemaProteins, tableGOInfo, databasePassword):
    """Bring the GO table in line with the parsed GO file.

    Rows are keyed on their first field (GOTermID). Rows only present in the table are deleted,
    rows present in both are updated column by column where the values differ, and rows only
    present in the file are inserted in small batches.

    Args:
        parsedGOOutput: path of the parsed GO file; every line must evaluate to one row.
        schemaProteins: database schema to connect to.
        tableGOInfo: name of the GO table.
        databasePassword: password used to open the database connection.

    Raises:
        IndexError: if the parsed file contains no rows.
    """
    # ===========================================================================
    # Extract and format the parsed GO data.
    # ===========================================================================
    # NOTE(review): eval() executes whatever each line of the file contains. That is only safe while
    # the file is produced by this pipeline; consider ast.literal_eval if the rows are plain literals.
    GOData = [eval(i) for i in utilities.file2list.main(parsedGOOutput)]
    GODict = dict((i[0], i) for i in GOData)
    numColumns = len(GOData[0])
    columnIndices = range(1, numColumns)
    values = "(" + ",".join(["%s"] * numColumns) + ")"

    # ===========================================================================
    # Extract the rows and the column names recorded in the database.
    # ===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    try:
        cursor = mysql.tableSELECT(cursor, "*", tableGOInfo)
        results = cursor.fetchall()
        cursor.execute("SHOW COLUMNS FROM " + tableGOInfo)
        columns = [i[0] for i in cursor.fetchall()]
    finally:
        mysql.closeConnection(conn, cursor)

    # ===========================================================================
    # Compare the parsed data with the data recorded in the table.
    # ===========================================================================
    toRemove = []
    toUpdate = {}
    inTable = set()  # Keys found in both places; a set keeps the bookkeeping O(1) per row.
    for row in results:
        geneID = row[0]
        if geneID not in GODict:
            # The key is in the table but not in the parsed file, so the row needs to be removed.
            toRemove.append(geneID)
            continue
        inTable.add(geneID)
        parsedRow = GODict[geneID]
        changed = [j for j in columnIndices if row[j] != parsedRow[j]]
        if changed:
            toUpdate.setdefault(geneID, []).extend(changed)

    # ===========================================================================
    # Remove rows from the table that are not in the parsed file.
    # ===========================================================================
    for key in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        try:
            cursor = mysql.rowDELETE(cursor, tableGOInfo, 'GOTermID="' + str(key) + '"')
        finally:
            mysql.closeConnection(conn, cursor)
    print("%s %s" % ("\tEntries removed from the GO table: ", len(toRemove)))

    # ===========================================================================
    # Update rows that have different values in the parsed file and the table.
    # ===========================================================================
    for key in toUpdate.keys():
        toSet = ", ".join([columns[j] + ' = "' + GODict[key][j] + '"' for j in toUpdate[key]])
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        try:
            cursor = mysql.tableUPDATE(cursor, tableGOInfo, toSet, 'GOTermID="' + str(key) + '"')
        finally:
            mysql.closeConnection(conn, cursor)
    print("%s %s" % ("\tEntries updated in the GO table: ", len(toUpdate)))

    # ===========================================================================
    # Add rows which are not in the table, but are in the parsed file.
    # ===========================================================================
    # The records are inserted in small chunks so the database connection is not lost. Because
    # some of the paths are very large this also needs a bigger max_allowed_packet on the MySQL
    # server, e.g. the line
    #     max_allowed_packet=32M
    # under the [mysqld] section of my.ini.
    rowsToAdd = [GODict[key] for key in GODict if key not in inTable]
    itemsInSplit = 5
    # Stepping through the list by stride avoids computing a chunk count with division.
    for start in range(0, len(rowsToAdd), itemsInSplit):
        chunk = rowsToAdd[start : start + itemsInSplit]
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        try:
            cursor = mysql.tableINSERT(cursor, tableGOInfo, values, chunk)
        finally:
            mysql.closeConnection(conn, cursor)
    print("%s %s" % ("\tEntries added to the GO table: ", len(rowsToAdd)))
def main(DBDrugIDs, DBTargetIDs, TTDUPAccessions, ChEMBLUPAccessions, UPHumanAccessionMap, UPDrugIDs, folderCulling,
         schemaProteins, tableProteinInfo, tableNonRedundant, tableBLASTResults, databasePassword, viewsDict):
    """Flag the drug target proteins and recompute the non-redundant protein sets.

    Every protein has its Target column reset to "N" and the proteins in the combined target list
    are set to "Y". The non-redundant table is then wiped and refilled with "N" for every protein,
    and for each view in viewsDict the BLAST results between the proteins of that view are culled
    with Leaf; culled proteins get "N" and the remaining proteins "Y" in the column named after
    the view.

    Args:
        DBDrugIDs: not used in the visible body.
        DBTargetIDs: passed to xref_drugbank_uniprot.
        TTDUPAccessions: passed to xref_TTD_uniprot.
        ChEMBLUPAccessions: not used in the visible body.
        UPHumanAccessionMap: passed to both cross-referencing helpers.
        UPDrugIDs: not used in the visible body.
        folderCulling: folder the per-view culling input files are written to.
        schemaProteins: database schema to connect to.
        tableProteinInfo: table holding the UPAccession and Target columns.
        tableNonRedundant: table holding one redundancy column per view.
        tableBLASTResults: table holding the ProteinA / ProteinB similarity results.
        databasePassword: password used to open the database connection.
        viewsDict: mapping from non-redundant column name to the view to select its proteins from.
    """
    allTargetsDB = xref_drugbank_uniprot(DBTargetIDs, UPHumanAccessionMap)
    allTargetsTTD = xref_TTD_uniprot(TTDUPAccessions, UPHumanAccessionMap)
    # NOTE(review): allTargetsChEMBL and allTargetsUP are never assigned in this function. Unless
    # they are module-level names this line raises NameError - confirm where they should come from
    # (ChEMBLUPAccessions and UPDrugIDs are otherwise unused).
    allTargets = list(set(allTargetsChEMBL) | set(allTargetsDB) | set(allTargetsTTD) | set(allTargetsUP))
    print('%s %s' % ('\tTotal number of unique targets found: ', len(allTargets)))

    # Clear the target information of every protein recorded in the table.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    try:
        cursor = mysql.tableSELECT(cursor, 'UPAccession, ModeOfAction', tableProteinInfo)
        resultsModeOfAction = cursor.fetchall()
        for row in resultsModeOfAction:
            mysql.tableUPDATE(cursor, tableProteinInfo, 'Target="N"', 'UPAccession="' + row[0] + '"')
    finally:
        mysql.closeConnection(conn, cursor)

    # Update the table to indicate which proteins are targets.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    try:
        for target in allTargets:
            mysql.tableUPDATE(cursor, tableProteinInfo, 'Target="Y"', 'UPAccession="' + target + '"')
    finally:
        mysql.closeConnection(conn, cursor)

    # Proteins have had their target status changed, so the redundancy needs to be recalculated.
    print('\tPerforming redundancy removal.')
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    try:
        # Determine the number of columns in the nonredundant table.
        cursor.execute('SHOW COLUMNS FROM ' + tableNonRedundant)
        numberColumns = len(cursor.fetchall())

        # Build one row per protein: the accession followed by 'N' for every other column.
        cursor = mysql.tableSELECT(cursor, 'UPAccession', tableProteinInfo)
        allProteins = [tuple([row[0]] + (['N'] * (numberColumns - 1))) for row in cursor.fetchall()]

        # Wipe and refill the nonredundant table.
        cursor.execute('TRUNCATE TABLE ' + tableNonRedundant)
        values = '(' + ','.join(['%s'] * numberColumns) + ')'
        mysql.tableINSERT(cursor, tableNonRedundant, values, allProteins)

        for column in sorted(viewsDict):
            print('%s %s' % ('\t\tRunning redundancy removal on ', column))
            inputLocation = folderCulling + '/' + column + '.txt'

            # Determine the accessions of the proteins in the current set.
            cursor = mysql.tableSELECT(cursor, '*', viewsDict[column])
            proteinSet = [row[0] for row in cursor.fetchall()]
            print('%s %s' % ('\t\t\tSize of Redundant Dataset: ', len(proteinSet)))
            proteinSetString = '\',\''.join(proteinSet)

            # Select all the BLAST results where both the hit and query protein are in the set to cull.
            cursor = mysql.tableSELECT(cursor, '*', tableBLASTResults,
                                       'ProteinA IN (\'' + proteinSetString + '\') AND ProteinB IN (\''
                                       + proteinSetString + '\')')
            protResults = cursor.fetchall()

            # Generate the file that is going to be used to perform the culling.
            with open(inputLocation, 'w') as writeTo:
                for result in protResults:
                    writeTo.write('\t'.join([str(field) for field in result]) + '\n')

            # Perform the culling.
            adjMatrix, proteinNames = culling.adjlistcreation.main(inputLocation, cutoffPercent=20, maxEValue=1,
                                                                   minAlignLength=20)
            print('%s %s' % ('\t\t\tNumber of Proteins in Similarity Graph: ', len(proteinNames)))
            proteinsToCull = culling.Leafcull.main(adjMatrix, proteinNames)
            print('%s %s' % ('\t\t\tNumber of Proteins to Cull: ', len(proteinsToCull)))
            for protein in proteinsToCull:
                mysql.tableUPDATE(cursor, tableNonRedundant, column + '="N"', 'UPAccession="' + str(protein) + '"')

            # A set makes the membership test constant time instead of a scan per protein.
            culled = set(proteinsToCull)
            proteinsToKeep = [protein for protein in proteinSet if protein not in culled]
            print('%s %s' % ('\t\t\tNumber of Proteins to Keep: ', len(proteinsToKeep)))
            for protein in proteinsToKeep:
                mysql.tableUPDATE(cursor, tableNonRedundant, column + '="Y"', 'UPAccession="' + str(protein) + '"')
    finally:
        # Release the connection even when the culling of one of the views fails.
        mysql.closeConnection(conn, cursor)
def main(unigeneParsedOutput, unigeneParsedTotals, schemaProteins, tableUniGene, tableUniGeneTotals,
         databasePassword):
    """Bring the UniGene expression table in line with the parsed file and refill the totals table.

    Expression rows are keyed on their first field (UniGeneID). Rows only present in the table are
    deleted, rows present in both are updated column by column where the values differ, and rows
    only present in the file are inserted. The totals table is truncated and refilled from the
    parsed totals file.

    Args:
        unigeneParsedOutput: path of the parsed expression file; every line must evaluate to a
            sequence of values that can be converted to int.
        unigeneParsedTotals: path of the parsed totals file; every line must evaluate to a pair
            whose second item is itself evaluated.
        schemaProteins: database schema to connect to.
        tableUniGene: name of the expression table.
        tableUniGeneTotals: name of the expression totals table.
        databasePassword: password used to open the database connection.

    Raises:
        IndexError: if either parsed file contains no rows.
    """
    #===========================================================================
    # Extract and format the parsed UniGene data, and the UniGene expression totals.
    #===========================================================================
    # NOTE(review): eval() executes whatever each line of the files contains. That is only safe
    # while the files are produced by this pipeline; consider ast.literal_eval for plain literals.
    UGData = [tuple([int(j) for j in eval(i)]) for i in utilities.file2list.main(unigeneParsedOutput)]
    UGDict = dict((i[0], i) for i in UGData)
    UGTotalsData = [tuple([j[0], eval(j[1])])
                    for j in [eval(i) for i in utilities.file2list.main(unigeneParsedTotals)]]
    numColumns = len(UGData[0])
    columnIndices = range(1, numColumns)
    values = '(' + ','.join(['%s'] * numColumns) + ')'
    # Built up front so an empty totals file fails before the totals table is truncated.
    totalsValues = '(' + ','.join(['%s'] * len(UGTotalsData[0])) + ')'

    #===========================================================================
    # Extract the rows and the column names recorded in the database.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    try:
        cursor = mysql.tableSELECT(cursor, '*', tableUniGene)
        results = cursor.fetchall()
        cursor.execute('SHOW COLUMNS FROM ' + tableUniGene)
        columns = [i[0] for i in cursor.fetchall()]
    finally:
        mysql.closeConnection(conn, cursor)

    #===========================================================================
    # Compare the parsed data with the data recorded in the expression table.
    #===========================================================================
    toRemove = []
    toUpdate = {}
    inTable = set()  # Keys found in both places; a set keeps the bookkeeping O(1) per row.
    for row in results:
        UniGeneID = row[0]
        if UniGeneID not in UGDict:
            # The key is in the expression table but not in the parsed file, so the row is removed.
            toRemove.append(UniGeneID)
            continue
        inTable.add(UniGeneID)
        parsedRow = UGDict[UniGeneID]
        changed = [j for j in columnIndices if row[j] != parsedRow[j]]
        if changed:
            toUpdate.setdefault(UniGeneID, []).extend(changed)

    #===========================================================================
    # Remove rows from the expression table that are not in the parsed file.
    #===========================================================================
    for key in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        try:
            # str() is required: the IDs are numbers and cannot be concatenated to a string.
            cursor = mysql.rowDELETE(cursor, tableUniGene, 'UniGeneID="' + str(key) + '"')
        finally:
            mysql.closeConnection(conn, cursor)
    print('%s %s' % ('\tEntries removed from the UniGene table: ', len(toRemove)))

    #===========================================================================
    # Update rows that have different values in the parsed file and the expression table.
    #===========================================================================
    for key in toUpdate.keys():
        # The parsed values are ints, so they are converted before being placed in the statement.
        toSet = ', '.join([columns[j] + ' = "' + str(UGDict[key][j]) + '"' for j in toUpdate[key]])
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        try:
            cursor = mysql.tableUPDATE(cursor, tableUniGene, toSet, 'UniGeneID="' + str(key) + '"')
        finally:
            mysql.closeConnection(conn, cursor)
    print('%s %s' % ('\tEntries updated in the UniGene table: ', len(toUpdate)))

    #===========================================================================
    # Add rows which are not in the expression table, but are in the parsed file.
    #===========================================================================
    rowsToAdd = [UGDict[key] for key in UGDict if key not in inTable]
    if rowsToAdd:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        try:
            cursor = mysql.tableINSERT(cursor, tableUniGene, values, rowsToAdd)
        finally:
            mysql.closeConnection(conn, cursor)
    print('%s %s' % ('\tEntries added to the UniGene table: ', len(rowsToAdd)))

    #===========================================================================
    # Enter the expression totals in the totals table.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    try:
        cursor.execute('TRUNCATE TABLE ' + tableUniGeneTotals)
        cursor = mysql.tableINSERT(cursor, tableUniGeneTotals, totalsValues, UGTotalsData)
    finally:
        mysql.closeConnection(conn, cursor)
    print('\tUniGene totals table updated.')