def calculate_pest_motif(sequenceOfAllProteinsDict, epestfindExe, proteinFasta, epestfindOutput, schemaProteins,
                         tableProteinInfo, databasePassword):
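    """Run epestfind on every protein sequence and record the number of valid PEST motifs found.

    sequenceOfAllProteinsDict maps UniProt accessions to sequences. Each protein is written to
    proteinFasta, epestfind is run on that file, and the parsed result is written to the PESTMotif
    column of tableProteinInfo in the schemaProteins database.
    """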

    # Connect to the specified schema.
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)

    for i in sequenceOfAllProteinsDict.keys():
        # Run every protein in the table through epestfind.
        UPAcc = i
        seq = sequenceOfAllProteinsDict[i]

        # Create a FASTA format file for the protein. This is the input format used for epestfind.
        epestfindInput = open(proteinFasta, 'w')
        epestfindInput.write('>' + UPAcc + '\n')
        epestfindInput.write(seq)
        epestfindInput.close()

        # Run epestfind on the fasta file just created.
        # The arguments are passed as a list so the call does not depend on shell parsing of the command string.
        subprocess.call([epestfindExe, '-sequence', proteinFasta, '-outfile', epestfindOutput, '-auto', '-window', '10', '-order', 'score', '-graph', 'none'])

        # Parse the epestfind output file to determine if there is a valid PEST motif.
        motifPresent = parsers.parseepestfind.main(epestfindOutput)

        # Write the number of valid PEST motifs into the PESTMotif column of the protein being analysed.
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'PESTMotif=' + str(motifPresent), 'UPAccession = \'' + UPAcc + '\'')


    mysql.closeConnection(conn, cursor)
def calculate_low_complexity(sequenceOfAllProteinsDict, SEGExe, proteinFasta, SEGOutput, schemaProteins,
                             tableProteinInfo, databasePassword):
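    """Run segmasker on every protein sequence and record the number of low complexity regions.

    Each protein in sequenceOfAllProteinsDict is written to proteinFasta, segmasker is run on that
    file, and the parsed count is written to the LowComplexity column of tableProteinInfo.
    """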

    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)

    for i in sequenceOfAllProteinsDict.keys():
        # Run every protein in the table through segmasker.
        UPAcc = i
        seq = sequenceOfAllProteinsDict[i]

        # Create a FASTA format file for the protein. This is the input format used for segmasker.
        SEGInput = open(proteinFasta, 'w')
        SEGInput.write('>' + UPAcc + '\n')
        SEGInput.write(seq)
        SEGInput.close()

        # Run segmasker on the fasta file just created.
        # The arguments are passed as a list so the call does not depend on shell parsing of the command string.
        subprocess.call([SEGExe, '-in', proteinFasta, '-out', SEGOutput])

        # Parse the segmasker output file to determine if there are any low complexity regions.
        numLowComplexity = parsers.parseSEG.main(SEGOutput)

        # Write the number of low complexity regions into the LowComplexity column of the protein being analysed.
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'LowComplexity=' + str(numLowComplexity), 'UPAccession = \'' + UPAcc + '\'')


    mysql.closeConnection(conn, cursor)
def main(UPPPIData, schemaProteins, tablePPI, databasePassword):
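    """Synchronise the PPI table with the parsed PPI data in UPPPIData.

    Rows present in the table but not in the parsed file are deleted, rows that differ are updated,
    and rows only present in the parsed file are inserted.
    """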

    #===========================================================================
    # Extract and format the parsed PPI data.
    #===========================================================================
    ppiData = utilities.file2list.main(UPPPIData)
    ppiData = [eval(i) for i in ppiData]
    ppiDict = dict([(tuple([i[0], i[1]]), i) for i in ppiData])

    #===========================================================================
    # Extract the PPI information recorded in the database.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableSELECT(cursor, '*', tablePPI)
    results = cursor.fetchall()
    mysql.closeConnection(conn, cursor)

    #===========================================================================
    # Compare the parsed data with the data recorded in the table.
    #===========================================================================
    columnIndices = range(1, len(ppiData[0]))
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute('SHOW COLUMNS FROM ' + tablePPI)
    columns = cursor.fetchall()
    mysql.closeConnection(conn, cursor)
    columns = [i[0] for i in columns]

    toRemove = []
    toUpdate = {}
    toAdd = ppiDict.keys()
    for i in results:
        proteinOne = i[0]
        proteinTwo = i[1]
        dictKey = tuple([proteinOne, proteinTwo])
        if ppiDict.has_key(dictKey):
            # If the key is in both the parsed file and the table, then it does not need to be added.
            toAdd.remove(dictKey)
            # Compare the row from the table with the parsed file, to determine if the table needs updating.
            for j in columnIndices:
                if i[j] != ppiDict[dictKey][j]:
                    if not toUpdate.has_key(dictKey):
                        toUpdate[dictKey] = []
                    toUpdate[dictKey].append(j)
        else:
            # If the key is in the table, but not in the parsed file, then the row needs to be removed.
            toRemove.append(dictKey)
    values = '(' + ('%s,' * len(ppiData[0]))
    values = values[:-1] + ')'

    #===========================================================================
    # Remove rows from the table that are not in the parsed file.
    #===========================================================================
    for i in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.rowDELETE(cursor, tablePPI, 'PPIProteinOne="' + i[0] + '" AND PPIProteinTwo="' + i[1] + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries removed from the PPI table: ', len(toRemove)

    #===========================================================================
    # Update rows that have different values in the parsed file and the table.
    #===========================================================================
    for i in toUpdate.keys():
        toSet = []
        for j in toUpdate[i]:
            updateString = columns[j] + ' = "' + str(ppiDict[i][j]) + '"'
            toSet.append(updateString)
        toSet = ', '.join(toSet)
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableUPDATE(cursor, tablePPI, toSet, 'PPIProteinOne="' + i[0] + '" AND PPIProteinTwo="' + i[1] + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries updated in the PPI table: ', len(toUpdate)

    #===========================================================================
    # Add rows which are not in the table, but are in the parsed file.
    #===========================================================================
    rowsToAdd = [ppiDict[i] for i in toAdd]
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableINSERT(cursor, tablePPI, values, rowsToAdd)
    mysql.closeConnection(conn, cursor)
    print '\tEntries added to the PPI table: ', len(toAdd)
def calculate_sequence_stats(sequenceOfAllProteinsDict, pepstatsExe, proteinFasta, pepstatsOutput, schemaProteins,
                             tableProteinInfo, databasePassword):
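    """Calculate sequence statistics for every protein and record them in tableProteinInfo.

    Pepstats is used to obtain the isoelectric point. The amino acid composition of each sequence
    is then used to derive the frequency of each of the 20 coded residues, the mean Kyte-Doolittle
    hydrophobicity and the fractions of tiny, small, aliphatic, aromatic, non-polar, polar, charged,
    basic, negatively charged and positively charged residues.
    """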

    # Create the lists of the different types of amino acids. trueAAs are the 20 amino acids that are coded for by the genetic code.
    aminoAcids = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    trueAAs = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    numAA = len(aminoAcids)
    tinyAAs = ['A', 'C', 'G', 'S', 'T']
    smallAAs = ['A', 'C', 'D', 'G', 'N', 'P', 'S', 'T', 'V']
    aliphaticAAs = ['I', 'L', 'V']
    aromaticAAs = ['F', 'H', 'W', 'Y']
    nonpolarAAs = ['A', 'C', 'F', 'G', 'I', 'L', 'M', 'P', 'V', 'W', 'Y']
    polarAAs = ['D', 'E', 'H', 'K', 'N', 'Q', 'R', 'S', 'T']
    chargedAAs = ['D', 'E', 'H', 'K', 'R']
    basicAAs = ['H', 'K', 'R']
    negativelyCharged = ['D', 'E']
    positivelyCharged = ['H', 'K', 'R']
    # The hydrophobicity of different amino acid residues, as measured by the Kyte and Doolittle scale.
    hydro = {'A' : 1.8, 'C' : 2.5, 'D' : -3.5, 'E' : -3.5, 'F' : 2.8, 'G' : -0.4, 'H' : -3.2, 'I' : 4.5,
             'K' : -3.9, 'L' : 3.8, 'M' : 1.9, 'N' : -3.5, 'P' : -1.6, 'Q' : -3.5, 'R' : -4.5, 'S' : -0.8,
             'T' : -0.7, 'V' : 4.2, 'W' : -0.9, 'Y' : -1.3}

    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)

    #===========================================================================
    # Run pepstats on all the proteins.
    #===========================================================================
    pepstatsInput = open(proteinFasta, 'w')
    for i in sequenceOfAllProteinsDict.keys():
        UPAcc = i
        seq = sequenceOfAllProteinsDict[i]
        pepstatsInput.write('>' + UPAcc + '\n')
        pepstatsInput.write(seq + '\n')
    pepstatsInput.close()

    # Run Pepstats on the newly created fasta file.
    # The arguments are passed as a list so the call does not depend on shell parsing of the command string.
    subprocess.call([pepstatsExe, '-sequence', proteinFasta, '-outfile', pepstatsOutput, '-auto'])

    # Parse the Pepstats output file to get the isoelectric point of the protein.
    pIDict = parsers.parsePepstats.main(pepstatsOutput)

    for i in sequenceOfAllProteinsDict.keys():
        UPAcc = i
        pI = pIDict[UPAcc]['pI']

        # Write the isoelectric point into the Isoelectric column of the protein being analysed.
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Isoelectric=' + str(pI), 'UPAccession = \'' + UPAcc + '\'')

    #===========================================================================
    # Calculate the sequence statistics for the proteins.
    #===========================================================================
    for i in sequenceOfAllProteinsDict.keys():
        stats = [0.0]*numAA  # The summation of the number of each type of amino acid in the protein.
        UPAcc = i
        seq = sequenceOfAllProteinsDict[i]
        seqLen = len(seq)

        # Go through the amino acids in the sequence and sum up the different types.
        for aa in seq:
            index = aminoAcids.index(aa)
            stats[index] += 1

        # Compensate for the fact that not all amino acids recorded in the sequence will be from the 20 coded for by the genome.

##        # Remove O, U and X from the count of amino acids
##        O = stats[aminoAcids.index('O')]
##        U = stats[aminoAcids.index('U')]
##        X = stats[aminoAcids.index('X')]
##        seqLen = seqLen - O - U - X

        # B corresponds to asparagine (N) or aspartic acid (D)
        # Get the number of N and the number of D and treat a B as N/(N+D) asparagines and D/(N+D) aspartic acids
        B = stats[aminoAcids.index('B')]
        N = stats[aminoAcids.index('N')]
        D = stats[aminoAcids.index('D')]
        if B != 0:
            # Split each B between N and D in proportion to their observed counts (evenly if neither is present).
            fracN = N / (N + D) if (N + D) != 0 else 0.5
            stats[aminoAcids.index('N')] += B * fracN
            stats[aminoAcids.index('D')] += B * (1 - fracN)

        # J corresponds to leucine (L) or isoleucine (I)
        # Get the number of L and the number of I and treat a J as L/(L+I) leucines and I/(L+I) isoleucines
        J = stats[aminoAcids.index('J')]
        L = stats[aminoAcids.index('L')]
        I = stats[aminoAcids.index('I')]
        if J != 0:
            # Split each J between L and I in proportion to their observed counts (evenly if neither is present).
            fracL = L / (L + I) if (L + I) != 0 else 0.5
            stats[aminoAcids.index('L')] += J * fracL
            stats[aminoAcids.index('I')] += J * (1 - fracL)

        # Z corresponds to glutamine (Q) or glutamic acid (E)
        # Get the number of Q and the number of E and treat a Z as Q/(Q+E) glutamines and E/(Q+E) glutamic acids
        Z = stats[aminoAcids.index('Z')]
        Q = stats[aminoAcids.index('Q')]
        E = stats[aminoAcids.index('E')]
        if Z != 0:
            # Split each Z between Q and E in proportion to their observed counts (evenly if neither is present).
            fracQ = Q / (Q + E) if (Q + E) != 0 else 0.5
            stats[aminoAcids.index('Q')] += Z * fracQ
            stats[aminoAcids.index('E')] += Z * (1 - fracQ)

        hydroCalc = 0
        tinySum = 0
        smallSum = 0
        aliphaticSum = 0
        aromaticSum = 0
        nonpolarSum = 0
        polarSum = 0
        chargedSum = 0
        basicSum = 0
        negativelyChargedSum = 0
        positivelyChargedSum = 0
        for aa in trueAAs:
            # For each of the 20 amino acids coded for by the genome, insert the amino acid frequency information into the table.
            insertValue = stats[aminoAcids.index(aa)] / seqLen
            cursor = mysql.tableUPDATE(cursor, tableProteinInfo, aa + '=' + str(insertValue), 'UPAccession = \'' + UPAcc + '\'')

            # Calculate the hydrophobicity information.
            hydroCalc = hydroCalc + (stats[aminoAcids.index(aa)] * hydro[aa])

            # Determine the number of tiny, small, etc. amino acids.
            if aa in tinyAAs:
                tinySum += stats[aminoAcids.index(aa)]
            if aa in smallAAs:
                smallSum += stats[aminoAcids.index(aa)]
            if aa in aliphaticAAs:
                aliphaticSum += stats[aminoAcids.index(aa)]
            if aa in aromaticAAs:
                aromaticSum += stats[aminoAcids.index(aa)]
            if aa in nonpolarAAs:
                nonpolarSum += stats[aminoAcids.index(aa)]
            if aa in polarAAs:
                polarSum += stats[aminoAcids.index(aa)]
            if aa in chargedAAs:
                chargedSum += stats[aminoAcids.index(aa)]
            if aa in basicAAs:
                basicSum += stats[aminoAcids.index(aa)]
            if aa in negativelyCharged:
                negativelyChargedSum += stats[aminoAcids.index(aa)]
            if aa in positivelyCharged:
                positivelyChargedSum += stats[aminoAcids.index(aa)]

        # Calculate the mean hydrophobicity of the sequence.
        hydroCalc /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Hydrophobicity=' + str(hydroCalc), 'UPAccession = \'' + UPAcc + '\'')
        # Calculate the fraction of the sequence that is made up of each class of amino acids.
        tinySum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Tiny=' + str(tinySum), 'UPAccession = \'' + UPAcc + '\'')
        smallSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Small=' + str(smallSum), 'UPAccession = \'' + UPAcc + '\'')
        aliphaticSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Aliphatic=' + str(aliphaticSum), 'UPAccession = \'' + UPAcc + '\'')
        aromaticSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Aromatic=' + str(aromaticSum), 'UPAccession = \'' + UPAcc + '\'')
        nonpolarSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'NonPolar=' + str(nonpolarSum), 'UPAccession = \'' + UPAcc + '\'')
        polarSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Polar=' + str(polarSum), 'UPAccession = \'' + UPAcc + '\'')
        chargedSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Charged=' + str(chargedSum), 'UPAccession = \'' + UPAcc + '\'')
        basicSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Basic=' + str(basicSum), 'UPAccession = \'' + UPAcc + '\'')
        negativelyChargedSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'NegativelyCharged=' + str(negativelyChargedSum), 'UPAccession = \'' + UPAcc + '\'')
        positivelyChargedSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'PositivelyCharged=' + str(positivelyChargedSum), 'UPAccession = \'' + UPAcc + '\'')

    mysql.closeConnection(conn, cursor)
def main(parsedGOOutput, schemaProteins, tableGOInfo, databasePassword):
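    """Synchronise the GO table with the parsed GO data in parsedGOOutput.

    Rows present in the table but not in the parsed file are deleted, rows that differ are updated,
    and new rows are inserted in small chunks so that each INSERT stays below the server's
    max_allowed_packet limit.
    """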

    # ===========================================================================
    # Extract and format the parsed GO data.
    # ===========================================================================
    GOData = utilities.file2list.main(parsedGOOutput)
    GOData = [eval(i) for i in GOData]
    GODict = dict([(i[0], i) for i in GOData])

    # ===========================================================================
    # Extract the GO information recorded in the database.
    # ===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableSELECT(cursor, "*", tableGOInfo)
    results = cursor.fetchall()
    mysql.closeConnection(conn, cursor)

    # ===========================================================================
    # Compare the parsed data with the data recorded in the table.
    # ===========================================================================
    columnIndices = range(1, len(GOData[0]))
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute("SHOW COLUMNS FROM " + tableGOInfo)
    columns = cursor.fetchall()
    mysql.closeConnection(conn, cursor)
    columns = [i[0] for i in columns]

    toRemove = []
    toUpdate = {}
    toAdd = GODict.keys()
    for i in results:
        geneID = i[0]
        if GODict.has_key(geneID):
            # If the key is in both the parsed file and the table, then it does not need to be added.
            toAdd.remove(i[0])
            # Compare the row from the table with the parsed file, to determine if the table needs updating.
            for j in columnIndices:
                if i[j] != GODict[geneID][j]:
                    if not toUpdate.has_key(geneID):
                        toUpdate[geneID] = []
                    toUpdate[geneID].append(j)
        else:
            # If the key is in the table, but not in the parsed file, then the row needs to be removed.
            toRemove.append(i[0])
    values = "(" + ("%s," * len(GOData[0]))
    values = values[:-1] + ")"

    # ===========================================================================
    # Remove rows from the table that are not in the parsed file.
    # ===========================================================================
    for i in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.rowDELETE(cursor, tableGOInfo, 'GOTermID="' + str(i) + '"')
        mysql.closeConnection(conn, cursor)
    print "\tEntries removed from the GO table: ", len(toRemove)

    # ===========================================================================
    # Update rows that have different values in the parsed file and the table.
    # ===========================================================================
    for i in toUpdate.keys():
        toSet = []
        for j in toUpdate[i]:
            updateString = columns[j] + ' = "' + str(GODict[i][j]) + '"'
            toSet.append(updateString)
        toSet = ", ".join(toSet)
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableUPDATE(cursor, tableGOInfo, toSet, 'GOTermID="' + str(i) + '"')
        mysql.closeConnection(conn, cursor)
    print "\tEntries updated in the GO table: ", len(toUpdate)

    # ===========================================================================
    # Add rows which are not in the table, but are in the parsed file.
    # ===========================================================================
    # Split the records to be inserted into smaller chunks so that the database connection is not lost.
    # For this to work the MySQL server's max_allowed_packet setting needs to be increased, because some
    # of the values (e.g. the GO paths) are very large. This was done by adding the line
    #     max_allowed_packet=32M
    # under the [mysqld] section of the default my.ini file.
    rowsToAdd = [GODict[i] for i in toAdd]
    length = len(rowsToAdd)
    itemsInSplit = 5
    numberOfSplits = length / itemsInSplit
    if length % itemsInSplit != 0:
        numberOfSplits += 1
    recordsToInsert = [rowsToAdd[i * itemsInSplit : (i + 1) * itemsInSplit] for i in range(numberOfSplits)]
    for i in recordsToInsert:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableINSERT(cursor, tableGOInfo, values, i)
        mysql.closeConnection(conn, cursor)
    print "\tEntries added to the GO table: ", len(toAdd)
def main(DBDrugIDs, DBTargetIDs, TTDUPAccessions, ChEMBLUPAccessions, UPHumanAccessionMap,
         UPDrugIDs, folderCulling, schemaProteins, tableProteinInfo, tableNonRedundant, tableBLASTResults,
         databasePassword, viewsDict):
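    """Flag drug targets in tableProteinInfo and regenerate the non-redundant protein sets.

    Targets from DrugBank, TTD, ChEMBL and UniProt are cross-referenced to UniProt accessions and
    marked with Target="Y". The non-redundant table is then rebuilt by running Leaf culling on the
    BLAST results for each protein set defined in viewsDict.
    """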

    allTargetsDB = xref_drugbank_uniprot(DBTargetIDs, UPHumanAccessionMap)
    allTargetsTTD = xref_TTD_uniprot(TTDUPAccessions, UPHumanAccessionMap)
    # The ChEMBL and UniProt target sets must also be built before taking the union below. The helper
    # names used here are assumptions, analogous to the DrugBank/TTD cross-referencing calls above.
    allTargetsChEMBL = xref_ChEMBL_uniprot(ChEMBLUPAccessions, UPHumanAccessionMap)  # assumed helper
    allTargetsUP = xref_UP_drugs(UPDrugIDs, UPHumanAccessionMap)  # assumed helper
    allTargets = list(set(allTargetsChEMBL) | set(allTargetsDB) | set(allTargetsTTD) | set(allTargetsUP))
    print '\tTotal number of unique targets found: ', len(allTargets)

    # Extract mode of action and clear the target information.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    cursor = mysql.tableSELECT(cursor, 'UPAccession, ModeOfAction', tableProteinInfo)
    resultsModeOfAction = cursor.fetchall()
    for i in resultsModeOfAction:
        upid = i[0]
        mysql.tableUPDATE(cursor, tableProteinInfo, 'Target="N"', 'UPAccession="' + upid + '"')
    mysql.closeConnection(conn, cursor)

    # Generate the sets of proteins that are GPCRs, kinases, ion channels and proteases.
    gpcr = []
    kinases = []
    ionChannels = []
    proteases = []
    for i in resultsModeOfAction:
        if i[1] == 'G-protein coupled receptor':
            gpcr.append(i[0])
        elif i[1] == 'Kinase':
            kinases.append(i[0])
        elif i[1] == 'Ion Channel':
            ionChannels.append(i[0])
        elif i[1] == 'Protease':
            proteases.append(i[0])

    # Update the table to indicate which proteins are targets.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    for i in allTargets:
        mysql.tableUPDATE(cursor, tableProteinInfo, 'Target="Y"', 'UPAccession="' + i + '"')
    mysql.closeConnection(conn, cursor)

    # Perform redundancy removal using Leaf.
    print '\tPerforming redundancy removal.'
    # Proteins have had their target status changed, so the redundancy needs to be recalculated.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    # Set the number of columns in the nonredundant table.
    cursor.execute('SHOW COLUMNS FROM ' + tableNonRedundant)
    numberColumns = len(cursor.fetchall())
    # Select all the proteins recorded in the database. One is subtracted from the number of columns
    # because the UPAccession column is filled with the accession itself rather than the default 'N'.
    cursor = mysql.tableSELECT(cursor, 'UPAccession', tableProteinInfo)
    allProteins = [tuple([i[0]] + (['N'] * (numberColumns - 1))) for i in cursor.fetchall()]
    # Wipe and refill the nonredundant table.
    cursor.execute('TRUNCATE TABLE ' + tableNonRedundant)
    values = '(' + ('%s,' * numberColumns)
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableNonRedundant, values, allProteins)
    for column in sorted(viewsDict.keys()):
        print '\t\tRunning redundancy removal on ', column
        # For each set of proteins, run the Leaf program.
        inputLocation = folderCulling + '/' + column + '.txt'
        cursor = mysql.tableSELECT(cursor, '*', viewsDict[column])
        results = cursor.fetchall()
        # Determine the accessions of the proteins in the current set.
        proteinSet = [i[0] for i in results]
        print '\t\t\tSize of Redundant Dataset: ', len(proteinSet)
        proteinSetString = '\',\''.join(proteinSet)
        # Select all the BLAST results where both the hit and query protein are in the set to cull.
        cursor = mysql.tableSELECT(cursor, '*', tableBLASTResults, 'ProteinA IN (\'' + proteinSetString + '\') AND ProteinB IN (\'' + proteinSetString + '\')')
        protResults = cursor.fetchall()
        # Generate the file that is going to be used to perform the culling.
        writeTo = open(inputLocation, 'w')
        for i in protResults:
            writeTo.write('\t'.join([str(j) for j in i]) + '\n')
        writeTo.close()
        # Perform the culling.
        adjMatrix, proteinNames = culling.adjlistcreation.main(inputLocation, cutoffPercent=20, maxEValue=1, minAlignLength=20)
        print '\t\t\tNumber of Proteins in Similarity Graph: ', len(proteinNames)
        proteinsToCull = culling.Leafcull.main(adjMatrix, proteinNames)
        print '\t\t\tNumber of Proteins to Cull: ', len(proteinsToCull)
        for i in proteinsToCull:
            mysql.tableUPDATE(cursor, tableNonRedundant, column + '="N"', 'UPAccession="' + str(i) + '"')
        proteinsToKeep = [i for i in proteinSet if i not in proteinsToCull]
        print '\t\t\tNumber of Proteins to Keep: ', len(proteinsToKeep)
        for i in proteinsToKeep:
            mysql.tableUPDATE(cursor, tableNonRedundant, column + '="Y"', 'UPAccession="' + str(i) + '"')
    mysql.closeConnection(conn, cursor)
def main(unigeneParsedOutput, unigeneParsedTotals, schemaProteins, tableUniGene, tableUniGeneTotals, databasePassword):
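    """Synchronise the UniGene expression table and its totals table with the parsed UniGene data.

    Rows present in the expression table but not in the parsed file are deleted, rows that differ
    are updated and new rows are inserted. The totals table is truncated and refilled from
    unigeneParsedTotals.
    """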

    #===========================================================================
    # Extract and format the parsed UniGene data, and the UniGene expression totals.
    #===========================================================================
    UGData = utilities.file2list.main(unigeneParsedOutput)
    UGData = [tuple([int(j) for j in eval(i)]) for i in UGData]
    UGDict = dict([(i[0], i) for i in UGData])

    UGTotalsData = utilities.file2list.main(unigeneParsedTotals)
    UGTotalsData = [tuple([j[0], eval(j[1])]) for j in [eval(i) for i in UGTotalsData]]

    #===========================================================================
    # Extract the UniGene information recorded in the database.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableSELECT(cursor, '*', tableUniGene)
    results = cursor.fetchall()
    mysql.closeConnection(conn, cursor)

    #===========================================================================
    # Compare the parsed data with the data recorded in the expression table.
    #===========================================================================
    columnIndices = range(1, len(UGData[0]))
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute('SHOW COLUMNS FROM ' + tableUniGene)
    columns = cursor.fetchall()
    mysql.closeConnection(conn, cursor)
    columns = [i[0] for i in columns]

    toRemove = []
    toUpdate = {}
    toAdd = UGDict.keys()
    for i in results:
        UniGeneID = i[0]
        if UGDict.has_key(UniGeneID):
            # If the key is in both the parsed file and the expression table, then it does not need to be added.
            toAdd.remove(i[0])
            # Compare the row from the expression table with the parsed file, to determine if the expression table needs updating.
            for j in columnIndices:
                if i[j] != UGDict[UniGeneID][j]:
                    if not toUpdate.has_key(UniGeneID):
                        toUpdate[UniGeneID] = []
                    toUpdate[UniGeneID].append(j)
        else:
            # If the key is in the expression table, but not in the parsed file, then the row needs to be removed.
            toRemove.append(i[0])
    values = '(' + ('%s,' * len(UGData[0]))
    values = values[:-1] + ')'

    #===========================================================================
    # Remove rows from the expression table that are not in the parsed file.
    #===========================================================================
    for i in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.rowDELETE(cursor, tableUniGene, 'UniGeneID="' + i + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries removed from the UniGene table: ', len(toRemove)

    #===========================================================================
    # Update rows that have different values in the parsed file and the expression table.
    #===========================================================================
    for i in toUpdate.keys():
        toSet = []
        for j in toUpdate[i]:
            updateString = columns[j] + ' = "' + str(UGDict[i][j]) + '"'
            toSet.append(updateString)
        toSet = ', '.join(toSet)
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableUPDATE(cursor, tableUniGene, toSet, 'UniGeneID="' + i + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries updated in the UniGene table: ', len(toUpdate)

    #===========================================================================
    # Add rows which are not in the expression table, but are in the parsed file.
    #===========================================================================
    rowsToAdd = [UGDict[i] for i in toAdd]
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableINSERT(cursor, tableUniGene, values, rowsToAdd)
    mysql.closeConnection(conn, cursor)
    print '\tEntries added to the UniGene table: ', len(toAdd)

    #===========================================================================
    # Enter the expression totals in the totals table.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute('TRUNCATE TABLE ' + tableUniGeneTotals)
    values = '(' + ('%s,' * len(UGTotalsData[0]))
    values = values[:-1] + ')'
    cursor = mysql.tableINSERT(cursor, tableUniGeneTotals, values, UGTotalsData)
    mysql.closeConnection(conn, cursor)
    print '\tUniGene totals table updated.'