def calculate_pest_motif(sequenceOfAllProteinsDict, epestfindExe, proteinFasta, epestfindOutput, schemaProteins,
                         tableProteinInfo, databasePassword):

    # Connect to the specified schema.
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)

    for i in sequenceOfAllProteinsDict.keys():
        # Run every protein in the table through epestfind.
        UPAcc = i
        seq = sequenceOfAllProteinsDict[i]

        # Create a FASTA format file for the protein. This is the input format used for epestfind.
        epestfindInput = open(proteinFasta, 'w')
        epestfindInput.write('>' + UPAcc + '\n')
        epestfindInput.write(seq)
        epestfindInput.close()
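        # The file now holds a single FASTA record, e.g. (accession and residues
        # purely illustrative):
        #     >P00001
        #     MKTAYIAKQR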

        # Run epestfind on the fasta file just created.
        subprocess.call(epestfindExe + ' -sequence ' + proteinFasta + ' -outfile ' + epestfindOutput + ' -auto -window 10 -order score -graph none')

        # Parse the epestfind output file to determine if there is a valid PEST motif.
        motifPresent = parsers.parseepestfind.main(epestfindOutput)

        # Write the number of valid PEST motifs into the PESTMotif column of the protein being analysed.
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'PESTMotif=' + str(motifPresent), 'UPAccession = \'' + UPAcc + '\'')


    mysql.closeConnection(conn, cursor)
def calculate_low_complexity(sequenceOfAllProteinsDict, SEGExe, proteinFasta, SEGOutput, schemaProteins,
                             tableProteinInfo, databasePassword):

    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)

    for i in sequenceOfAllProteinsDict.keys():
        # Run every protein in the table through segmasker.
        UPAcc = i
        seq = sequenceOfAllProteinsDict[i]

        # Create a FASTA format file for the protein. This is the input format used for segmasker.
        SEGInput = open(proteinFasta, 'w')
        SEGInput.write('>' + UPAcc + '\n')
        SEGInput.write(seq)
        SEGInput.close()

        # Run segmasker on the fasta file just created.
        subprocess.call(SEGExe + ' -in ' + proteinFasta + ' -out ' + SEGOutput)

        # Parse the segmasker output file to determine if there are any low complexity regions.
        numLowComplexity = parsers.parseSEG.main(SEGOutput)

        # Write the number of low complexity regions into the LowComplexity column of the protein being analysed.
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'LowComplexity=' + str(numLowComplexity), 'UPAccession = \'' + UPAcc + '\'')


    mysql.closeConnection(conn, cursor)
def update_gene_table(ensemblParsedTranscripts, schemaProteins, tableEnsemblGene, databasePassword):

    # ===========================================================================
    # Extract and format the parsed gene data.
    # ===========================================================================
    geneData = utilities.file2list.main(ensemblParsedTranscripts)
    geneData = [eval(i) for i in geneData]
    geneDict = dict([(i[0], i) for i in geneData])
    values = "(" + ("%s," * len(geneData[0]))
    values = values[:-1] + ")"
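    # values is now a parameter placeholder string such as "(%s,%s,%s)"; an
    # equivalent one-line construction is "(" + ",".join(["%s"] * len(geneData[0])) + ")".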

    rowsToAdd = geneData
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute("TRUNCATE " + tableEnsemblGene)
    cursor = mysql.tableINSERT(cursor, tableEnsemblGene, values, rowsToAdd)
    mysql.closeConnection(conn, cursor)
    print "\tEntries added to the Ensembl gene table: ", len(rowsToAdd)
def main(databasePassword, schemaChEMBL, completeChEMBLDatabase, MySQLBin):
    # Create the ChEMBL schema overwriting any schema already in the database.
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database='do not connect')
    mysql.schemaDrop(cursor, 'IF EXISTS ' + schemaChEMBL)
    mysql.schemaCreate(cursor, 'IF NOT EXISTS ' + schemaChEMBL)
    mysql.closeConnection(conn, cursor)
    # Load the downloaded ChEMBL database into the newly created ChEMBL schema.
    subprocess.call('mysql.exe -u root -p' + databasePassword + ' ' + schemaChEMBL + ' < ' + completeChEMBLDatabase, shell=True, cwd=MySQLBin)
def main(databasePassword, schemaGO, completeGODatabase, MySQLBin):
    # Connect to the MySQL database
    conn, cursor = mysql.openConnection(inputPass = databasePassword, database = 'do not connect')
    # Setup the schemas and tables needed to record the GO information. Delete schemas and tables that
    # already exist.
    mysql.schemaDrop(cursor, 'IF EXISTS ' + schemaGO)
    mysql.schemaCreate(cursor, 'IF NOT EXISTS ' + schemaGO)
    mysql.closeConnection(conn, cursor)
    # Load the downloaded GO database into the newly created GO schema.
    subprocess.call('mysql.exe -u root -p' + databasePassword + ' ' + schemaGO + ' < ' + completeGODatabase, shell=True, cwd=MySQLBin)
def main(cosmicParsedGene, cosmicParsedGene2Mutation, cosmicParsedMutation, databasePassword, schemaProteins,
         tableCOSMICGene, tableCOSMICGene2Mutation, tableCOSMICMutation):

    #===========================================================================
    # Extract and format the parsed data.
    #===========================================================================
    cosmicGene = utilities.file2list.main(cosmicParsedGene)
    cosmicGene = [eval(i) for i in cosmicGene]
    cosmicGene2Mutation = utilities.file2list.main(cosmicParsedGene2Mutation)
    cosmicGene2Mutation = [eval(i) for i in cosmicGene2Mutation]
    cosmicMutation = utilities.file2list.main(cosmicParsedMutation)
    cosmicMutation = [eval(i) for i in cosmicMutation]

    #===========================================================================
    # Update the database tables.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    values = '(' + ('%s,' * len(cosmicGene[0]))
    values = values[:-1] + ')'
    cursor = mysql.tableINSERT(cursor, tableCOSMICGene, values, cosmicGene)

    values = '(' + ('%s,' * len(cosmicMutation[0]))
    values = values[:-1] + ')'
    cursor = mysql.tableINSERT(cursor, tableCOSMICMutation, values, cosmicMutation)

    values = '(' + ('%s,' * len(cosmicGene2Mutation[0]))
    values = values[:-1] + ')'
    cursor = mysql.tableINSERT(cursor, tableCOSMICGene2Mutation, values, cosmicGene2Mutation)
    mysql.closeConnection(conn, cursor)
def update_homolog_table(ensemblParsedHomology, schemaProteins, tableHomologs, databasePassword):

    # ===========================================================================
    # Load the parsed homology data into the homolog table.
    # ===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute("TRUNCATE " + tableHomologs)
    readVariants = open(ensemblParsedHomology, "r")
    count = 0
    for line in readVariants:
        count += 1
        line = eval(line)
        values = "(" + ("%s," * len(line))
        values = values[:-1] + ")"
        try:
            cursor = mysql.tableINSERT(cursor, tableHomologs, values, [line])
        except:
            print line
            print count
            raise
    readVariants.close()
    mysql.closeConnection(conn, cursor)
def main(databasePassword, schemaProteins, tableProteinInfo, tableStability):

    # Define N-terminal half-life values (see http://en.wikipedia.org/wiki/N-end_rule and the ProtParam tool).
    halfLife = {'A' : 4.4, 'C' : 1.2, 'D' : 1.1, 'E' : 1.0, 'F' : 1.1, 'G' : 30.0, 'H' : 3.5, 'I' : 20.0, 'K' : 1.3,
                'L' : 5.5, 'M' : 30.0, 'N' : 1.4, 'P' : 20.0, 'Q' : 0.8, 'R' : 1.0, 'S' : 1.9, 'T' : 7.2,
                'V' : 100.0, 'W' : 2.8, 'Y' : 2.8}
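    # e.g. a protein whose sequence begins with methionine ('M') is assigned an
    # estimated N-terminal half-life of 30; these values appear to be the mammalian
    # reticulocyte half-lives, in hours, used by ProtParam.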

    # Extract all the sequences stored in the database.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableProteinInfo)
    results = cursor.fetchall()

    # Calculate the half life and instability index for each protein.
    stabilityTuples = []
    for i in results:
        sequence = i[1]
        if halfLife.has_key(sequence[0]):
            protHalfLife = halfLife[sequence[0]]
        else:
            # This occurs when the N-terminal residue has no associated half-life value (e.g. X, B, etc.).
            protHalfLife = -1
        analysedSeq = ProteinAnalysis(sequence)
        try:
            instabilityIndex = analysedSeq.instability_index()
        except KeyError:
            # Biopython raises a KeyError when the sequence contains a residue code
            # that is not in its dipeptide weight table.
            instabilityIndex = -1
            print '\tContains invalid aa code: ', i[0]
        stabilityTuples.append(tuple([i[0], protHalfLife, instabilityIndex]))

    cursor.execute('TRUNCATE TABLE ' + tableStability)
    values = '(' + ('%s,' * len(stabilityTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableStability, values, stabilityTuples)
    mysql.closeConnection(conn, cursor)

#def instability_index(prot, sequence):
#
#    # A two dimensional dictionary for calculating the instability index.
#    # Guruprasad K., Reddy B.V.B., Pandit M.W.    Protein Engineering 4:155-161(1990).
#    # It is based on dipeptide values, therefore the value for the dipeptide DG is DIWV['D']['G'].
#    DIWV = {'A': {'A': 1.0, 'C': 44.94, 'E': 1.0, 'D': -7.49,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': -7.49,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'C': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 20.26,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 33.60,
#                  'K': 1.0, 'M': 33.60, 'L': 20.26, 'N': 1.0,
#                  'Q': -6.54, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 33.60, 'W': 24.68, 'V': -6.54, 'Y': 1.0},
#            'E': {'A': 1.0, 'C': 44.94, 'E': 33.60, 'D': 20.26,
#                  'G': 1.0, 'F': 1.0, 'I': 20.26, 'H': -6.54,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': 1.0,
#                  'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0},
#            'D': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 1.0, 'S': 20.26, 'R': -6.54,
#                  'T': -14.03, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'F': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 13.34,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -14.03, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 33.601},
#            'I': {'A': 1.0, 'C': 1.0, 'E': 44.94, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 13.34,
#                  'K': -7.49, 'M': 1.0, 'L': 20.26, 'N': 1.0,
#                  'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'G': {'A': -7.49, 'C': 1.0, 'E': -6.54, 'D': 1.0,
#                  'G': 13.34, 'F': 1.0, 'I': -7.49, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': -7.49,
#                  'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': 13.34, 'V': 1.0, 'Y': -7.49},
#            'H': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': -9.37, 'F': -9.37, 'I': 44.94, 'H': 1.0,
#                  'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 24.68,
#                  'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': -6.54, 'W': -1.88, 'V': 1.0, 'Y': 44.94},
#            'K': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': -7.49, 'F': 1.0, 'I': -7.49, 'H': 1.0,
#                  'K': 1.0, 'M': 33.60, 'L': -7.49, 'N': 1.0,
#                  'Q': 24.64, 'P': -6.54, 'S': 1.0, 'R': 33.60,
#                  'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'M': {'A': 13.34, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 58.28,
#                  'K': 1.0, 'M': -1.88, 'L': 1.0, 'N': 1.0,
#                  'Q': -6.54, 'P': 44.94, 'S': 44.94, 'R': -6.54,
#                  'T': -1.88, 'W': 1.0, 'V': 1.0, 'Y': 24.68},
#            'L': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 33.60, 'P': 20.26, 'S': 1.0, 'R': 20.26,
#                  'T': 1.0, 'W': 24.68, 'V': 1.0, 'Y': 1.0},
#            'N': {'A': 1.0, 'C': -1.88, 'E': 1.0, 'D': 1.0,
#                  'G': -14.03, 'F': -14.03, 'I': 44.94, 'H': 1.0,
#                  'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': -6.54, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 1.0},
#            'Q': {'A': 1.0, 'C': -6.54, 'E': 20.26, 'D': 20.26,
#                  'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': -6.54, 'Y': -6.54},
#            'P': {'A': 20.26, 'C': -6.54, 'E': 18.38, 'D': -6.54,
#                  'G': 1.0, 'F': 20.26, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': -6.54, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': -6.54,
#                  'T': 1.0, 'W': -1.88, 'V': 20.26, 'Y': 1.0},
#            'S': {'A': 1.0, 'C': 33.60, 'E': 20.26, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 44.94, 'S': 20.26, 'R': 20.26,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'R': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 20.26,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 13.34, 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 58.28,
#                  'T': 1.0, 'W': 58.28, 'V': 1.0, 'Y': -6.54},
#            'T': {'A': 1.0, 'C': 1.0, 'E': 20.26, 'D': 1.0, 'G': -7.49, 'F': 13.34, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': -14.03, 'Q': -6.54, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0},
#            'W': {'A': -14.03, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -9.37, 'F': 1.0, 'I': 1.0, 'H': 24.68,
#                  'K': 1.0, 'M': 24.68, 'L': 13.34, 'N': 13.34, 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': -14.03, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'V': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': -14.03, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -1.88, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': 1.0, 'V': 1.0, 'Y': -6.54},
#            'Y': {'A': 24.68, 'C': 1.0, 'E': -6.54, 'D': 24.68, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 13.34,
#                  'K': 1.0, 'M': 44.94, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 13.34, 'S': 1.0, 'R': -15.91,
#                  'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 13.34},
#            }
#
#    score = 0.0
#    for i in range(len(sequence) - 1):
#        if DIWV.has_key(sequence[i]):
#            if DIWV[sequence[i]].has_key(sequence[i+1]):
#                score += DIWV[sequence[i]][sequence[i+1]]
#    return (10.0 / len(sequence)) * score
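#
#    # Worked example: for the sequence 'GLP' the dipeptides are GL and LP, so
#    # score = DIWV['G']['L'] + DIWV['L']['P'] = 1.0 + 20.26 = 21.26, giving an
#    # instability index of (10.0 / 3) * 21.26 ~= 70.87; scores above 40
#    # conventionally predict an unstable protein.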
def main(tableToUse, outputDirectory, predictionDirection, seqsPerFile, maxSeqLength, schemaProteins,
         databasePassword, columnWithPredictions):

    if predictionDirection != 'IN':
        # Clear the output directory, and then remake it.
        if os.path.exists(outputDirectory):
            shutil.rmtree(outputDirectory)
        os.mkdir(outputDirectory)

    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    if predictionDirection == 'OUTA':
        # Output all proteins, whether or not they have a prediction made for them.
        cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableToUse)
        results = cursor.fetchall()
        results = ['>' + '\n'.join([j[0], j[1]]) for j in results if len(j[1]) <= maxSeqLength]
        length = len(results)
        numberOfSplits = length / seqsPerFile
        if length%seqsPerFile != 0:
            numberOfSplits += 1
        fileOutput = [ results[i * seqsPerFile : (i+1) * seqsPerFile] for i in range(numberOfSplits)]
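        # e.g. 2500 sequences with seqsPerFile == 1000 gives numberOfSplits == 3,
        # so the slices produce files 0.fasta and 1.fasta with 1000 sequences each
        # and 2.fasta with the remaining 500.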
        for i in range(len(fileOutput)):
            utilities.list2file.main(fileOutput[i], outputDirectory + '/' + str(i) + '.fasta')
    elif predictionDirection == 'OUTS':
        # Output only those proteins that do not already have a prediction made for them.
        cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableToUse, columnWithPredictions + '="NA"')
        results = cursor.fetchall()
        results = ['>' + '\n'.join([j[0], j[1]]) for j in results if len(j[1]) <= maxSeqLength]
        length = len(results)
        numberOfSplits = length / seqsPerFile
        if length%seqsPerFile != 0:
            numberOfSplits += 1
        fileOutput = [ results[i * seqsPerFile : (i+1) * seqsPerFile] for i in range(numberOfSplits)]
        for i in range(len(fileOutput)):
            utilities.list2file.main(fileOutput[i], outputDirectory + '/' + str(i) + '.fasta')
    elif predictionDirection == 'IN':
        # Reading in predictions from the user.
        predFolderFiles = os.listdir(outputDirectory)
        fastaFiles = [i for i in predFolderFiles if i.endswith('.fasta')]
        fastaFilesNoExtension = [i.split('.')[0] for i in fastaFiles]
        predFiles = [(i, i.split('.')[0] + '.fasta') for i in predFolderFiles if i.split('.')[0] in fastaFilesNoExtension and i not in fastaFiles]
        if len(predFiles) > len(fastaFiles):
            # There are too many prediction files.
            print 'There are too many prediction files, or too many files with the same name as the fasta files.'
            sys.exit()
        updatesToPerform = []
        for i in predFiles:
            predictionFile = outputDirectory + '/' + i[0]
            fastaFile = outputDirectory + '/' + i[1]
            # Gather the accessions of the proteins that were supposed to have been predicted.
            outputProteins = utilities.file2list.main(fastaFile)[::2]
            outputProteins = [acc[1:] for acc in outputProteins]
            # Gather the input data.
            inputProteins = utilities.file2list.main(predictionFile)
            updateTuples = wolf_prediction_entry(outputProteins, inputProteins)
            updatesToPerform.extend(updateTuples)
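        # The lines below assemble the per-protein predictions into a single bulk
        # UPDATE statement, which for two predictions would look like (accessions
        # and prediction values illustrative):
        #     UPDATE <tableToUse>
        #         SET <columnWithPredictions> = CASE UPAccession
        #             WHEN "P00001" THEN "predictionOne"
        #             WHEN "P00002" THEN "predictionTwo"
        #         END
        #     WHERE UPAccession IN ("P00001","P00002")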
        if updatesToPerform:
            updateQuery = 'UPDATE ' + tableToUse + '\n\tSET ' + columnWithPredictions + ' = CASE UPAccession\n'
            for i in updatesToPerform:
                updateQuery += '\t\tWHEN "' + i[0] + '" THEN "' + i[1] + '"\n'
            updateQuery += '\tEND\nWHERE UPAccession IN ("' + '","'.join([i[0] for i in updatesToPerform]) + '")'
            cursor.execute(updateQuery)
    mysql.closeConnection(conn, cursor)
def main(UPPPIData, schemaProteins, tablePPI, databasePassword):

    #===========================================================================
    # Extract and format the parsed PPI data.
    #===========================================================================
    ppiData = utilities.file2list.main(UPPPIData)
    ppiData = [eval(i) for i in ppiData]
    ppiDict = dict([(tuple([i[0], i[1]]), i) for i in ppiData])

    #===========================================================================
    # Extract the gene information recorded in the database.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableSELECT(cursor, '*', tablePPI)
    results = cursor.fetchall()
    mysql.closeConnection(conn, cursor)

    #===========================================================================
    # Compare the parsed data with the data recorded in the table.
    #===========================================================================
    columnIndices = range(1, len(ppiData[0]))
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute('SHOW COLUMNS FROM ' + tablePPI)
    columns = cursor.fetchall()
    mysql.closeConnection(conn, cursor)
    columns = [i[0] for i in columns]

    toRemove = []
    toUpdate = {}
    toAdd = ppiDict.keys()
    for i in results:
        proteinOne = i[0]
        proteinTwo = i[1]
        dictKey = tuple([proteinOne, proteinTwo])
        if ppiDict.has_key(dictKey):
            # If the key is in both the parsed file and the table, then it does not need to be added.
            toAdd.remove(dictKey)
            # Compare the row from the table with the parsed file, to determine if the table needs updating.
            for j in columnIndices:
                if i[j] != ppiDict[dictKey][j]:
                    if not toUpdate.has_key(dictKey):
                        toUpdate[dictKey] = []
                    toUpdate[dictKey].append(j)
        else:
            # If the key is in the table, but not in the parsed file, then the row needs to be removed.
            toRemove.append(dictKey)
    values = '(' + ('%s,' * len(ppiData[0]))
    values = values[:-1] + ')'

    #===========================================================================
    # Remove rows from the table that are not in the parsed file.
    #===========================================================================
    for i in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.rowDELETE(cursor, tablePPI, 'PPIProteinOne="' + i[0] + '" AND PPIProteinTwo="' + i[1] + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries removed from the PPI table: ', len(toRemove)

    #===========================================================================
    # Update rows that have different values in the parsed file and the table.
    #===========================================================================
    for i in toUpdate.keys():
        toSet = []
        for j in toUpdate[i]:
            updateString = columns[j] + ' = "' + str(ppiDict[i][j]) + '"'
            toSet.append(updateString)
        toSet = ', '.join(toSet)
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableUPDATE(cursor, tablePPI, toSet, 'PPIProteinOne="' + i[0] + '" AND PPIProteinTwo="' + i[1] + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries updated in the PPI table: ', len(toUpdate)

    #===========================================================================
    # Add rows which are not in the table, but are in the parsed file.
    #===========================================================================
    rowsToAdd = [ppiDict[i] for i in toAdd]
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableINSERT(cursor, tablePPI, values, rowsToAdd)
    mysql.closeConnection(conn, cursor)
    print '\tEntries added to the PPI table: ', len(toAdd)
def main(databasePassword, schemaPharmGKB, pharmGKBDiseases, pharmGKBDrugs, pharmGKBGenes, pharmGKBRelationships,
         pharmGKBInitialisation, MYSQLBIN):

    # Define the tables needed.
    tableDisease = schemaPharmGKB + '.disease'
    tableDrug = schemaPharmGKB + '.drug'
    tableGene = schemaPharmGKB + '.gene'
    tableDrug2Drug = schemaPharmGKB + '.drug2drug'
    tableDrug2DrugClass = schemaPharmGKB + '.drug2drugclass'
    tableDrug2Disease = schemaPharmGKB + '.drug2disease'
    tableDrug2Gene = schemaPharmGKB + '.drug2gene'
    tableDrugClass2Disease = schemaPharmGKB + '.drugclass2disease'
    tableDrugClass2Gene = schemaPharmGKB + '.drugclass2gene'
    tableGene2Disease = schemaPharmGKB + '.gene2disease'
    tableGene2Gene = schemaPharmGKB + '.gene2gene'
    tableDisease2MeSH = schemaPharmGKB + '.disease2mesh'
    tableDisease2SnoMED = schemaPharmGKB + '.disease2snomed'
    tableDisease2UMLS = schemaPharmGKB + '.disease2umls'

    # Create the schema and the tables.
    subprocess.call('mysql.exe -u root -p' + databasePassword + ' mysql < ' + pharmGKBInitialisation, shell=True, cwd=MYSQLBIN)

    # Parse the files that contain the data to enter into the database.
    diseaseTuples = []
    meshTuples = []
    snomedTuples = []
    umlsTuples = []
    readIn = open(pharmGKBDiseases, 'r')
    readIn.readline()  # Strip off the header of the file.
    for line in readIn:
        chunks = line.split('\t')
        diseaseID = chunks[0]
        diseaseName = chunks[1]
        diseaseTuples.append(tuple([diseaseID, diseaseName]))
        externalVocab = chunks[4]
        mesh = re.findall('(?<=MeSH:)D[0-9]+', externalVocab)
        meshTuples.extend([tuple([diseaseID, i]) for i in mesh])
        snomed = re.findall('(?<=SnoMedCT:)[0-9]+', externalVocab)
        snomedTuples.extend([tuple([diseaseID, i]) for i in snomed])
        umls = re.findall('(?<=UMLS:)C[0-9]+', externalVocab)
        umlsTuples.extend([tuple([diseaseID, i]) for i in umls])
    readIn.close()
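    # For illustration, an external vocabulary field in the loop above shaped like
    # 'MeSH:D001943(Neoplasms),SnoMedCT:86049000,UMLS:C0027651' (identifiers here
    # are illustrative) would yield mesh == ['D001943'], snomed == ['86049000'] and
    # umls == ['C0027651'].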

    drugTuples = []
    readIn = open(pharmGKBDrugs, 'r')
    readIn.readline()  # Strip off the header of the file.
    for line in readIn:
        chunks = line.split('\t')
        drugID = chunks[0]
        drugName = chunks[1]
        compoundType = chunks[5]
        if compoundType == 'Drug/Small Molecule':
            compoundType = 'Drug'
        xrefs = chunks[6]
        drugBank = re.findall('(?<=drugBank:)DB[0-9]+', xrefs)
        drugBank = drugBank[0] if drugBank != [] else 'NA'
        pubchemCID = re.findall('(?<=pubChemCompound:)[0-9]+', xrefs)
        pubchemCID = pubchemCID[0] if pubchemCID != [] else 'NA'
        drugTuples.append(tuple([drugID, drugName, compoundType, drugBank, pubchemCID]))
    readIn.close()

    geneTuples = []
    readIn = open(pharmGKBGenes, 'r')
    readIn.readline()  # Strip off the header of the file.
    for line in readIn:
        chunks = line.split('\t')
        geneID = chunks[0]
        entrezID = chunks[1]
        if entrezID == '':
            entrezID = '0'
        geneTuples.append(tuple([geneID, entrezID]))
    readIn.close()

    drug2drugTuples = []
    drug2drugClassTuples = {}
    drug2diseaseTuples = []
    drugClass2diseaseTuples = []
    drug2geneTuples = {}
    drugClass2geneTuples = {}
    gene2diseaseTuples = {}
    gene2geneTuples = {}
    readIn = open(pharmGKBRelationships, 'r')
    readIn.readline()  # Strip off the header of the file.
    for line in readIn:
        chunks = line.split('\t')
        entityOne = chunks[0].split(':')
        entityTwo = chunks[2].split(':')
        evidence = chunks[5]
        publication = 'Y' if 'Publication' in evidence else 'N'
        pathway = 'Y' if 'Pathway' in evidence else 'N'
        variant = 'Y' if 'Variant' in evidence else 'N'
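        # Entity fields have the form 'Type:Accession' (e.g. 'Drug:PA10000', with an
        # illustrative accession), so entityOne[0]/entityTwo[0] give the entity type
        # and entityOne[1]/entityTwo[1] the PharmGKB accession.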
        if entityOne[0] == 'Drug':
            if entityTwo[0] == 'Drug':
                drug2drugTuples.append(tuple([entityOne[1], entityTwo[1], pathway, publication, variant]))
            elif entityTwo[0] == 'Drug Class':
                key = tuple([entityOne[1], entityTwo[1]])
                if drug2drugClassTuples.has_key(key):
                    pathEvidence = 'Y' if pathway == 'Y' or drug2drugClassTuples[key][2] == 'Y' else 'N'
                    pubEvidence = 'Y' if publication == 'Y' or drug2drugClassTuples[key][3] == 'Y' else 'N'
                    varEvidence = 'Y' if variant == 'Y' or drug2drugClassTuples[key][4] == 'Y' else 'N'
                    drug2drugClassTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence])
                else:
                    drug2drugClassTuples[key] = tuple([key[0], key[1], pathway, publication, variant])
            elif entityTwo[0] == 'Gene':
                key = tuple([entityOne[1], entityTwo[1]])
                if drug2geneTuples.has_key(key):
                    pathEvidence = 'Y' if pathway == 'Y' or drug2geneTuples[key][2] == 'Y' else 'N'
                    pubEvidence = 'Y' if publication == 'Y' or drug2geneTuples[key][3] == 'Y' else 'N'
                    varEvidence = 'Y' if variant == 'Y' or drug2geneTuples[key][4] == 'Y' else 'N'
                    drug2geneTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence])
                else:
                    drug2geneTuples[key] = tuple([key[0], key[1], pathway, publication, variant])
            elif entityTwo[0] == 'Disease':
                drug2diseaseTuples.append(tuple([entityOne[1], entityTwo[1], pathway, publication, variant]))
        elif entityOne[0] == 'Gene':
            if entityTwo[0] == 'Drug':
                key = tuple([entityTwo[1], entityOne[1]])
                if drug2geneTuples.has_key(key):
                    pathEvidence = 'Y' if pathway == 'Y' or drug2geneTuples[key][2] == 'Y' else 'N'
                    pubEvidence = 'Y' if publication == 'Y' or drug2geneTuples[key][3] == 'Y' else 'N'
                    varEvidence = 'Y' if variant == 'Y' or drug2geneTuples[key][4] == 'Y' else 'N'
                    drug2geneTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence])
                else:
                    drug2geneTuples[key] = tuple([key[0], key[1], pathway, publication, variant])
            elif entityTwo[0] == 'Drug Class':
                key = tuple([entityTwo[1], entityOne[1]])
                if drugClass2geneTuples.has_key(key):
                    pathEvidence = 'Y' if pathway == 'Y' or drugClass2geneTuples[key][2] == 'Y' else 'N'
                    pubEvidence = 'Y' if publication == 'Y' or drugClass2geneTuples[key][3] == 'Y' else 'N'
                    varEvidence = 'Y' if variant == 'Y' or drugClass2geneTuples[key][4] == 'Y' else 'N'
                    drugClass2geneTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence])
                else:
                    drugClass2geneTuples[key] = tuple([key[0], key[1], pathway, publication, variant])
            elif entityTwo[0] == 'Disease':
                key = tuple([entityOne[1], entityTwo[1]])
                if gene2diseaseTuples.has_key(key):
                    pathEvidence = 'Y' if pathway == 'Y' or gene2diseaseTuples[key][2] == 'Y' else 'N'
                    pubEvidence = 'Y' if publication == 'Y' or gene2diseaseTuples[key][3] == 'Y' else 'N'
                    varEvidence = 'Y' if variant == 'Y' or gene2diseaseTuples[key][4] == 'Y' else 'N'
                    gene2diseaseTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence])
                else:
                    gene2diseaseTuples[key] = tuple([key[0], key[1], pathway, publication, variant])
            elif entityTwo[0] == 'Gene':
                key = tuple(sorted([entityOne[1], entityTwo[1]]))
                if gene2geneTuples.has_key(key):
                    pathEvidence = 'Y' if pathway == 'Y' or gene2geneTuples[key][2] == 'Y' else 'N'
                    pubEvidence = 'Y' if publication == 'Y' or gene2geneTuples[key][3] == 'Y' else 'N'
                    varEvidence = 'Y' if variant == 'Y' or gene2geneTuples[key][4] == 'Y' else 'N'
                    gene2geneTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence])
                else:
                    gene2geneTuples[key] = tuple([key[0], key[1], pathway, publication, variant])
        elif entityOne[0] == 'Disease':
            if entityTwo[0] == 'Drug':
                drug2diseaseTuples.append(tuple([entityTwo[1], entityOne[1], pathway, publication, variant]))
            elif entityTwo[0] == 'Drug Class':
                drugClass2diseaseTuples.append(tuple([entityTwo[1], entityOne[1], pathway, publication, variant]))
            elif entityTwo[0] == 'Gene':
                key = tuple([entityTwo[1], entityOne[1]])
                if gene2diseaseTuples.has_key(key):
                    pathEvidence = 'Y' if pathway == 'Y' or gene2diseaseTuples[key][2] == 'Y' else 'N'
                    pubEvidence = 'Y' if publication == 'Y' or gene2diseaseTuples[key][3] == 'Y' else 'N'
                    varEvidence = 'Y' if variant == 'Y' or gene2diseaseTuples[key][4] == 'Y' else 'N'
                    gene2diseaseTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence])
                else:
                    gene2diseaseTuples[key] = tuple([key[0], key[1], pathway, publication, variant])
        elif entityOne[0] == 'Drug Class':
            if entityTwo[0] == 'Drug':
                key = tuple([entityTwo[1], entityOne[1]])
                if drug2drugClassTuples.has_key(key):
                    pathEvidence = 'Y' if pathway == 'Y' or drug2drugClassTuples[key][2] == 'Y' else 'N'
                    pubEvidence = 'Y' if publication == 'Y' or drug2drugClassTuples[key][3] == 'Y' else 'N'
                    varEvidence = 'Y' if variant == 'Y' or drug2drugClassTuples[key][4] == 'Y' else 'N'
                    drug2drugClassTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence])
                else:
                    drug2drugClassTuples[key] = tuple([key[0], key[1], pathway, publication, variant])
            elif entityTwo[0] == 'Disease':
                drugClass2diseaseTuples.append(tuple([entityOne[1], entityTwo[1], pathway, publication, variant]))
            elif entityTwo[0] == 'Gene':
                key = tuple([entityOne[1], entityTwo[1]])
                if drugClass2geneTuples.has_key(key):
                    pathEvidence = 'Y' if pathway == 'Y' or drugClass2geneTuples[key][2] == 'Y' else 'N'
                    pubEvidence = 'Y' if publication == 'Y' or drugClass2geneTuples[key][3] == 'Y' else 'N'
                    varEvidence = 'Y' if variant == 'Y' or drugClass2geneTuples[key][4] == 'Y' else 'N'
                    drugClass2geneTuples[key] = tuple([key[0], key[1], pathEvidence, pubEvidence, varEvidence])
                else:
                    drugClass2geneTuples[key] = tuple([key[0], key[1], pathway, publication, variant])
    readIn.close()
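    # The eight near-identical evidence-merging branches above could be expressed
    # with one helper along these lines (a sketch only, not wired in):
    #
    # def mergeEvidence(store, key, pathway, publication, variant):
    #     if store.has_key(key):
    #         old = store[key]
    #         pathway = 'Y' if pathway == 'Y' or old[2] == 'Y' else 'N'
    #         publication = 'Y' if publication == 'Y' or old[3] == 'Y' else 'N'
    #         variant = 'Y' if variant == 'Y' or old[4] == 'Y' else 'N'
    #     store[key] = tuple([key[0], key[1], pathway, publication, variant])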

    # Enter the data into the database.
    conn, cursor = mysql.openConnection(databasePassword, schemaPharmGKB)

    values = '(' + ('%s,' * len(diseaseTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDisease, values, diseaseTuples)

    values = '(' + ('%s,' * len(meshTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDisease2MeSH, values, meshTuples)

    values = '(' + ('%s,' * len(snomedTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDisease2SnoMED, values, snomedTuples)

    values = '(' + ('%s,' * len(umlsTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDisease2UMLS, values, umlsTuples)

    values = '(' + ('%s,' * len(drugTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDrug, values, drugTuples)

    values = '(' + ('%s,' * len(geneTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableGene, values, geneTuples)

    values = '(' + ('%s,' * len(drug2drugTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDrug2Drug, values, drug2drugTuples)

    drug2drugClassTuples = [drug2drugClassTuples[i] for i in drug2drugClassTuples.keys()]
    values = '(' + ('%s,' * len(drug2drugClassTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDrug2DrugClass, values, drug2drugClassTuples)

    values = '(' + ('%s,' * len(drug2diseaseTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDrug2Disease, values, drug2diseaseTuples)

    values = '(' + ('%s,' * len(drugClass2diseaseTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDrugClass2Disease, values, drugClass2diseaseTuples)

    drug2geneTuples = [drug2geneTuples[i] for i in drug2geneTuples.keys()]
    values = '(' + ('%s,' * len(drug2geneTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDrug2Gene, values, drug2geneTuples)

    drugClass2geneTuples = [drugClass2geneTuples[i] for i in drugClass2geneTuples.keys()]
    values = '(' + ('%s,' * len(drugClass2geneTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableDrugClass2Gene, values, drugClass2geneTuples)

    gene2diseaseTuples = [gene2diseaseTuples[i] for i in gene2diseaseTuples.keys()]
    values = '(' + ('%s,' * len(gene2diseaseTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableGene2Disease, values, gene2diseaseTuples)

    gene2geneTuples = [gene2geneTuples[i] for i in gene2geneTuples.keys()]
    values = '(' + ('%s,' * len(gene2geneTuples[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableGene2Gene, values, gene2geneTuples)
    mysql.closeConnection(conn, cursor)
def main(CGCParsed, cancerTargets, UPExternalLinks, UPHumanAccessionMap, TTDTarget2Drug, DBTargetIDs, schemaProteins, tableCancerGene, DATABASEPASSWORD):

    # Record the mapping of non-representative UniProt accessions to representative accessions.
    accMap = {}
    readIn = open(UPHumanAccessionMap, 'r')
    for line in readIn:
        chunks = (line.strip()).split()
        accMap[chunks[0]] = chunks[1]
    readIn.close()

    # Read in the CGC information.
    CGCData = {}
    readIn = open(CGCParsed, 'r')
    for line in readIn:
        chunks = (line.strip()).split('\t')
        geneID = chunks[0]
        germline = chunks[1]
        somatic = chunks[2]
        CGCData[geneID] = {'Cancer' : 'Y', 'Target' : 'N', 'Somatic' : somatic, 'Germline' : germline}
    readIn.close()

    cancerTTDTargets = set([])
    cancerDrugBankDrugs = set([])
    cancerDrugTargetUPAccs = set([])

    # Read in the information about anti-neoplastic drugs.
    readIn = open(cancerTargets, 'r')
    readIn.readline()  # Strip the header line.
    for line in readIn:
        chunks = line.split('\t')
        cancerTTDTargets.add(chunks[2])
        cancerDrugBankDrugs.add(chunks[4])
    readIn.close()
    cancerTTDTargets -= set([''])
    cancerDrugBankDrugs -= set([''])

    # Determine the UniProt accessions of the anti-neoplastic TTD targets. The first
    # tab separated field is taken to be the TTD target identifier, so only targets
    # found in the anti-neoplastic set contribute accessions.
    readIn = open(TTDTarget2Drug, 'r')
    for line in readIn:
        chunks = line.split('\t')
        if chunks[0] in cancerTTDTargets:
            for i in chunks[1].split(','):
                cancerDrugTargetUPAccs.add(i)
    readIn.close()

    # Determine the UniProt accessions of the targets of the DrugBank drugs.
    readIn = open(DBTargetIDs, 'r')
    for line in readIn:
        chunks = line.split('\t')
        accession = chunks[0]
        drugs = chunks[1].split(';')
        for i in drugs:
            if i in cancerDrugBankDrugs:
                cancerDrugTargetUPAccs.add(accession)
    readIn.close()

    geneXRef = {}
    readIn = open(UPExternalLinks, 'r')
    for line in readIn:
        chunks = line.split(',')
        UPAcc = chunks[0]
        geneID = chunks[1]
        if geneID != '':
            geneID = geneID.split(';')
            for i in geneID:
                if geneXRef.has_key(i):
                    geneXRef[i].add(UPAcc)
                else:
                    geneXRef[i] = set([UPAcc])
    readIn.close()

    # Create the set of all representative UniProt accessions.
    allUPAccessions = set([])
    for i in accMap:
        allUPAccessions.add(accMap[i])

    # For every representative UniProt accession, determine whether it is implicated in cancer and if it is a target for anti-neoplastic drugs.
    cancerProteins = dict([(i, {'Cancer' : 'N', 'Target' : 'N', 'Somatic' : 'N', 'Germline' : 'N'}) for i in allUPAccessions])
    for i in CGCData.keys():
        if geneXRef.has_key(i):
            for j in geneXRef[i]:
                cancerProteins[j]['Cancer'] = 'Y'
                # A 'Y' for somatic or germline evidence is sticky: once set by any
                # cross-referenced gene it is never reset to 'N'.
                if CGCData[i]['Somatic'] == 'Y':
                    cancerProteins[j]['Somatic'] = 'Y'
                if CGCData[i]['Germline'] == 'Y':
                    cancerProteins[j]['Germline'] = 'Y'

    for i in cancerDrugTargetUPAccs:
        # Record all the proteins that are targets of anti-neoplastic drugs as being implicated in cancer and being targets.
        try:
            # Only adding representative accessions.
            representativeAcc = accMap[i]
            cancerProteins[representativeAcc]['Cancer'] = 'Y'
            cancerProteins[representativeAcc]['Target'] = 'Y'
        except KeyError:
            pass

    dataRecords = [tuple([i, cancerProteins[i]['Cancer'], cancerProteins[i]['Target'], cancerProteins[i]['Somatic'], cancerProteins[i]['Germline']])
                   for i in cancerProteins]
    conn, cursor = mysql.openConnection(DATABASEPASSWORD, schemaProteins)
    cursor.execute('TRUNCATE TABLE ' + tableCancerGene)
    values = '(' + ('%s,' * len(dataRecords[0]))
    values = values[:-1] + ')'
    cursor = mysql.tableINSERT(cursor, tableCancerGene, values, dataRecords)
    mysql.closeConnection(conn, cursor)
def main(ChEMBLTargets, ChEMBLCID, databasePassword, ChEMBLSchema):
    """
    Produces two files.
        ChEMBLTargets contains mapping of approved drug information to the UniProt accessions of the proteins that the drug targets.
        ChEMBLCID contains mappings of ChEMBL compound IDs to PubChem CIDs.
    ChEMBLTargets - A tab separated (tsv) file, with seven elements on each line. There is one line for each protein that an approved drug targets.
        The first element is the UniProt accession for the target protein.
        The second element is the internal ChEMBL identifier (molregno) of the compound.
        The third element is the name of the protein in the first element.
        The fourth element is the activity relation (=, <, <=, > or >=).
        The fifth element is the value of the activity observed between the compound and the protein.
        The sixth element is the units of the value in the fifth element.
        The seventh element is the type of activity measured (Ki, Kd, EC50, etc.)
    ChEMBLCID - A tab separated (tsv) file, with two elements on each line.
        The first element is the ChEMBL compound ID.
        The second element is the PubChem CID that corresponds to the ChEMBL compound ID in the first element.
    """

    # Create the query for extracting targets of approved drugs.
    queryTarget2Compound = """
    SELECT
        td.protein_accession,
        md.molregno,
        md.chembl_id,
        td.pref_name,
        act.relation,
        act.standard_value,
        act.standard_units,
        act.standard_type
    FROM
        molecule_dictionary md,
        activities act,
        assays a,
        assay2target a2t,
        target_dictionary td
    WHERE
        md.max_phase = '4'
        and md.molecule_type = "Small molecule"
        and md.molregno = act.molregno
        and act.assay_id = a.assay_id
        and a.assay_id = a2t.assay_id
        and a2t.confidence_score = '9'
        and a2t.relationship_type = 'D'
        and a2t.tid = td.tid
        and td.target_type = 'PROTEIN'
        and td.db_source = 'SWISS-PROT'
    """

    # Connect to the ChEMBL schema in the database, and run the target/compound extraction query.
    conn, cursor = mysql.openConnection(databasePassword, ChEMBLSchema)
    cursor.execute(queryTarget2Compound)
    resultsTarget2Compound = cursor.fetchall()
    resultsTarget2Compound = list(set(resultsTarget2Compound))

    # Generate the output file of UniProt accession to drug information mappings.
    target2CompoundsDict = {}
    chemblID2Molregno = {}
    ChEMBLIDs = set([])
    writeOut = open(ChEMBLTargets, 'w')
    for i in resultsTarget2Compound:
        chemblID2Molregno[str(i[2])] = str(i[1])
        ChEMBLIDs.add(str(i[2]))
        writeOut.write(i[0] + '\t' + str(i[1]) + '\t' + str(i[3]) + '\t' + str(i[4]) + '\t' + str(i[5]) + '\t' + str(i[6]) + '\t' + str(i[7]) + '\n')
    writeOut.close()
    mysql.closeConnection(conn, cursor)

    # Use Entrez EUtils to get the PubChem CIDs for the ChEMBL IDs.
    molregno2CIDs = set([])
    for i in ChEMBLIDs:
        fetchURL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pccompound&term=' + i
        req = urllib2.Request(fetchURL)
        fetchResult = urllib2.urlopen(req)
        output = fetchResult.read()
        result = re.search('(?<=<Id>)[0-9]*(?=</Id>)', output)
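        # ESearch responds with XML listing matching PubChem CIDs as
        # <IdList><Id>1234567</Id>...</IdList> (the CID shown is illustrative);
        # re.search pulls out the first listed CID.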
        if result:
            result = result.group(0)
            molregno2CIDs.add(chemblID2Molregno[i] + '\t' + result)
        fetchResult.close()
        # Sleep for 0.6 seconds in order to stay within NCBI's guidelines of not submitting more than 3 HTTP requests a second
        time.sleep(0.6)

    utilities.list2file.main(list(molregno2CIDs), ChEMBLCID)
def main(goSchema, parsedGO):
    """
    Produces a file containing the parsed GO data.
    parsedGO - A file of 6-tuples, one on each line.
        The first element is the numerical identifier of the GO term.
        The second element is the name of the GO term.
        The third element is the category (biological_process, cellular_component or molecular_function) that the term belongs to.
        The fourth element is all the paths from the term to the category it belongs to. The paths are separated from one another using semi-colons, and the elements of each path are separated from one another using '#'.
        The fifth element is all the level one terms along the paths. These are all the terms that are in a path in element four and are direct descendants of the category in element three.
        The sixth element is all the level two terms along the paths. These are all the terms that are in a path in element four and are direct descendants of the terms in element five.
    """

    # Connect to the GO schema.
    connGO, cursorGO = mysql.openConnection('root', goSchema)
    # Extract the information from the GO schema that is necessary to build up the paths.
    cursorGO = mysql.tableSELECT(cursorGO, 'id, name, term_type, acc', 'term')
    result = cursorGO.fetchall()

    # Turn the information extracted from the GO schema into a dictionary indexed by the identifiers of the GO terms.
    resultsDict = dict([(x[0], x[1:]) for x in result])

    # recordsToInsert will contain the information to insert into the GO path schema.
    recordsToInsert = []

    # For every term work out the name of the term, which of the three types of GO term it is, all the paths from the
    # term to the root terms (which are the three types), all unique level 1 and level 2 terms along the path.
    # A level 1 term is a child term of the root. A level 2 term is a child of a level 1 term.
    for r in resultsDict.keys():
        if re.match('GO:[0-9]+$', resultsDict[r][2]):
            # Only process entries whose accession is a real GO identifier, rather than a
            # relationship or subset entry (e.g. part_of, goslim_plant or regulates).
            pathDict = {}
            entry = resultsDict[r]
            startID = r
            startName = entry[0]
            startType = entry[1]
            pathDict[startID] = ['#' + startName]
            toCheck = [startID]

            while toCheck != []:
                # In order to determine all of the paths from a term to the root nodes, we treat the set of GO terms
                # as a directed graph. The direction of the edges is from parent to child. In the traversal we only go
                # in the direction opposite to that of the arrows (i.e. up a tree). Starting with a single term B, we find
                # the set of it's parent terms, P. To each element of P we add B to the end. The path for B then looks like
                # #P1#B #P2#B ... #Pn#B. This is repeated until all paths are capped at the front with a root term (the
                # root term will be the same for every path of B). Not all paths for B will be of the same length.
                current = toCheck.pop()
                currentPath = pathDict[current]
                # Get the terms that current is a subterm of.
                # Every entry in the term2term table that is returned represents one superterm of the current term.
                cursorGO = mysql.tableSELECT(cursorGO, 'term1_id', 'term2term', 'term2_id = \'' + str(current) + '\'')
                crossrefResults = cursorGO.fetchall()
                if crossrefResults == ():
                    # If the current term is not a subterm of any other term, then the current term is a root term.
                    # This means that the current path is finished.
                    rootNode = current
                    continue
                for i in crossrefResults:
                    # Get the id and name of the superterm being examined.
                    uniqueID = i[0]
                    name = resultsDict[uniqueID][0]
                    path = []
                    for j in currentPath:
                        # For every entry in the current path stick the superterm at the front of it, as it is an
                        # ancestor of whatever the head of the current path is.
                        path.append('#' + name + j)
                    # Check whether this superterm has already had its descendants enumerated.
                    if uniqueID not in pathDict:
                        # If the current superterm has not already had its descendants enumerated, then simply record
                        # the paths just enumerated in the pathDict.
                        pathDict[uniqueID] = path
                    else:
                        # If the current superterm has already had its descendants enumerated, then add all the paths just
                        # enumerated to its record. Following this remove all duplicate paths.
                        pathDict[uniqueID].extend(path)
                        pathDict[uniqueID] = list(set(pathDict[uniqueID]))
                    # Add the superterm to the list of terms that need their superterms evaluating.
                    toCheck.append(uniqueID)

            # The final paths of interest are those recorded in the root node entry in the dictionary.
            final = pathDict[rootNode]
            levelOne = []
            levelTwo = []
            path = []

            for f in final:
                # For every path from the initial term to the root term, find the level 1 and level 2 terms.
                chunks = f.split('#')

                # Rather than taking the level 1 and level 2 terms to be the 2nd and 3rd path elements respectively, the 3rd and 4th are used.
                # This is because the GO database defines a term 'all' which is a dummy root node.
                if len(chunks) > 3:
                    levelOne.append(chunks[3])
                if len(chunks) > 4:
                    levelTwo.append(chunks[4])

                # Create the path that will be recorded in the database. Ignore the 'all' dummy root.
                path.append('#'.join(chunks[2:]))

            # Ensure only unique level 1 and level 2 terms are recorded.
            levelOne = list(set(levelOne))
            if levelOne == []:
                level1 = 'NA'
            else:
                level1 = ';'.join(levelOne)
            levelTwo = list(set(levelTwo))
            if levelTwo == []:
                level2 = 'NA'
            else:
                level2 = ';'.join(levelTwo)
            path = ';'.join(path)

            # Create the list of tuples to enter into the database.
            recordsToInsert.append((resultsDict[r][2][3:], startName, startType, path, level1, level2))

    mysql.closeConnection(connGO, cursorGO)

    utilities.list2file.main(recordsToInsert, parsedGO)
def calculate_sequence_stats(sequenceOfAllProteinsDict, pepstatsExe, proteinFasta, pepstatsOutput, schemaProteins,
                             tableProteinInfo, databasePassword):

    # Create the lists of the different types of amino acids. trueAAs are the 20 amino acids that are coded for by the genetic code.
    aminoAcids = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    trueAAs = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    numAA = len(aminoAcids)
    tinyAAs = ['A', 'C', 'G', 'S', 'T']
    smallAAs = ['A', 'C', 'D', 'G', 'N', 'P', 'S', 'T', 'V']
    aliphaticAAs = ['I', 'L', 'V']
    aromaticAAs = ['F', 'H', 'W', 'Y']
    nonpolarAAs = ['A', 'C', 'F', 'G', 'I', 'L', 'M', 'P', 'V', 'W', 'Y']
    polarAAs = ['D', 'E', 'H', 'K', 'N', 'Q', 'R', 'S', 'T']
    chargedAAs = ['D', 'E', 'H', 'K', 'R']
    basicAAs = ['H', 'K', 'R']
    negativelyCharged = ['D', 'E']
    positivelyCharged = ['H', 'K', 'R']
    # The hydrophobicity of different amino acid residues, as measured by the Kyte and Doolittle scale.
    hydro = {'A' : 1.8, 'C' : 2.5, 'D' : -3.5, 'E' : -3.5, 'F' : 2.8, 'G' : -0.4, 'H' : -3.2, 'I' : 4.5,
             'K' : -3.9, 'L' : 3.8, 'M' : 1.9, 'N' : -3.5, 'P' : -1.6, 'Q' : -3.5, 'R' : -4.5, 'S' : -0.8,
             'T' : -0.7, 'V' : 4.2, 'W' : -0.9, 'Y' : -1.3}
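    # On this scale a positive mean value indicates an overall hydrophobic sequence;
    # e.g. the dipeptide 'VW' has mean hydropathy (4.2 + -0.9) / 2 = 1.65.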

    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)

    #===========================================================================
    # Run pepstats on all the proteins.
    #===========================================================================
    pepstatsInput = open(proteinFasta, 'w')
    for i in sequenceOfAllProteinsDict.keys():
        UPAcc = i
        seq = sequenceOfAllProteinsDict[i]
        pepstatsInput.write('>' + UPAcc + '\n')
        pepstatsInput.write(seq + '\n')
    pepstatsInput.close()

    # Run Pepstats on the newly created fasta file.
    subprocess.call(pepstatsExe + ' -sequence ' + proteinFasta + ' -outfile ' + pepstatsOutput + ' -auto')

    # Parse the Pepstats output file to get the isoelectric point of the protein.
    pIDict = parsers.parsePepstats.main(pepstatsOutput)

    for i in sequenceOfAllProteinsDict.keys():
        UPAcc = i
        pI = pIDict[UPAcc]['pI']

        # Write the isoelectric point into the Isoelectric column of the protein being analysed.
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Isoelectric=' + str(pI), 'UPAccession = \'' + UPAcc + '\'')

    #===========================================================================
    # Calculate the sequence statistics for the proteins.
    #===========================================================================
    for i in sequenceOfAllProteinsDict.keys():
        stats = [0.0]*numAA  # The summation of the number of each type of amino acid in the protein.
        UPAcc = i
        seq = sequenceOfAllProteinsDict[i]
        seqLen = len(seq)

        # Go through the amino acids in the sequence and sum up the different types.
        for aa in seq:
            index = aminoAcids.index(aa)
            stats[index] += 1

        # Compensate for the fact that not all amino acids recorded in the sequence will be from the 20 coded for by the genome.

##        # Remove O, U and X from the count of amino acids
##        O = stats[aminoAcids.index('O')]
##        U = stats[aminoAcids.index('U')]
##        X = stats[aminoAcids.index('X')]
##        seqLen = seqLen - O - U - X

        # B corresponds to asparagine (N) or aspartic acid (D)
        # Get the number of N and the number of D and treat a B as N/(N+D) asparagines and D/(N+D) aspartic acids
        B = stats[aminoAcids.index('B')]
        N = stats[aminoAcids.index('N')]
        D = stats[aminoAcids.index('D')]
        if B != 0:
            if N + D > 0:
                extraN = N / (N + D)
                extraD = D / (N + D)
            else:
                # No unambiguous N or D residues to apportion by; split each B evenly.
                extraN = extraD = 0.5
            stats[aminoAcids.index('N')] += B * extraN
            stats[aminoAcids.index('D')] += B * extraD
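        # e.g. with B == 2, N == 6 and D == 2: extraN == 0.75 and extraD == 0.25,
        # so 1.5 of the B residues are counted as N and 0.5 as D.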

        # J corresponds to leucine (L) or isoleucine (I)
        # Get the number of L and the number of I and treat a J as L/(L+I) leucines and I/(L+I) isoleucines
        J = stats[aminoAcids.index('J')]
        L = stats[aminoAcids.index('L')]
        I = stats[aminoAcids.index('I')]
        if J != 0:
            # Split each J proportionally between L and I; fall back to an even split if neither occurs.
            extraL = L / (L + I) if (L + I) != 0 else 0.5
            extraI = I / (L + I) if (L + I) != 0 else 0.5
            stats[aminoAcids.index('L')] += J * extraL
            stats[aminoAcids.index('I')] += J * extraI

        # Z corresponds to glutamine (Q) or glutamic acid (E)
        # Get the number of Q and the number of E and treat a Z as Q/(Q+E) glutamines and E/(Q+E) glutamic acids
        Z = stats[aminoAcids.index('Z')]
        Q = stats[aminoAcids.index('Q')]
        E = stats[aminoAcids.index('E')]
        if Z != 0:
            # Split each Z proportionally between Q and E; fall back to an even split if neither occurs.
            extraQ = Q / (Q + E) if (Q + E) != 0 else 0.5
            extraE = E / (Q + E) if (Q + E) != 0 else 0.5
            stats[aminoAcids.index('Q')] += Z * extraQ
            stats[aminoAcids.index('E')] += Z * extraE
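
        # Worked example of the redistribution: with N = 6, D = 2 and B = 1, the single B is
        # counted as 6/8 = 0.75 of an asparagine and 2/8 = 0.25 of an aspartic acid, giving
        # adjusted counts of N = 6.75 and D = 2.25.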

        hydroCalc = 0
        tinySum = 0
        smallSum = 0
        aliphaticSum = 0
        aromaticSum = 0
        nonpolarSum = 0
        polarSum = 0
        chargedSum = 0
        basicSum = 0
        negativelyChargedSum = 0
        positivelyChargedSum = 0
        for aa in trueAAs:
            # For each of the 20 genome-encoded amino acids, insert its frequency in the
            # sequence into the corresponding column of the table.
            insertValue = stats[aminoAcids.index(aa)] / seqLen
            cursor = mysql.tableUPDATE(cursor, tableProteinInfo, aa + '=' + str(insertValue), 'UPAccession = \'' + UPAcc + '\'')

            # Accumulate the hydrophobicity contribution of this amino acid.
            hydroCalc = hydroCalc + (stats[aminoAcids.index(aa)] * hydro[aa])

            # Tally the amino acids belonging to each physicochemical class.
            if aa in tinyAAs:
                tinySum += stats[aminoAcids.index(aa)]
            if aa in smallAAs:
                smallSum += stats[aminoAcids.index(aa)]
            if aa in aliphaticAAs:
                aliphaticSum += stats[aminoAcids.index(aa)]
            if aa in aromaticAAs:
                aromaticSum += stats[aminoAcids.index(aa)]
            if aa in nonpolarAAs:
                nonpolarSum += stats[aminoAcids.index(aa)]
            if aa in polarAAs:
                polarSum += stats[aminoAcids.index(aa)]
            if aa in chargedAAs:
                chargedSum += stats[aminoAcids.index(aa)]
            if aa in basicAAs:
                basicSum += stats[aminoAcids.index(aa)]
            if aa in negativelyCharged:
                negativelyChargedSum += stats[aminoAcids.index(aa)]
            if aa in positivelyCharged:
                positivelyChargedSum += stats[aminoAcids.index(aa)]

        # Calculate the mean hydrophobicity of the sequence.
        hydroCalc /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Hydrophobicity=' + str(hydroCalc), 'UPAccession = \'' + UPAcc + '\'')
        # Calculate the fraction of the sequence that is made up of each class of amino acids.
        tinySum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Tiny=' + str(tinySum), 'UPAccession = \'' + UPAcc + '\'')
        smallSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Small=' + str(smallSum), 'UPAccession = \'' + UPAcc + '\'')
        aliphaticSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Aliphatic=' + str(aliphaticSum), 'UPAccession = \'' + UPAcc + '\'')
        aromaticSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Aromatic=' + str(aromaticSum), 'UPAccession = \'' + UPAcc + '\'')
        nonpolarSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'NonPolar=' + str(nonpolarSum), 'UPAccession = \'' + UPAcc + '\'')
        polarSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Polar=' + str(polarSum), 'UPAccession = \'' + UPAcc + '\'')
        chargedSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Charged=' + str(chargedSum), 'UPAccession = \'' + UPAcc + '\'')
        basicSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'Basic=' + str(basicSum), 'UPAccession = \'' + UPAcc + '\'')
        negativelyChargedSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'NegativelyCharged=' + str(negativelyChargedSum), 'UPAccession = \'' + UPAcc + '\'')
        positivelyChargedSum /= seqLen
        cursor = mysql.tableUPDATE(cursor, tableProteinInfo, 'PositivelyCharged=' + str(positivelyChargedSum), 'UPAccession = \'' + UPAcc + '\'')
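
        # As a sanity check on the class fractions: for the toy sequence 'KRDE' every residue
        # is charged (Charged = 1.0), K and R are basic (Basic = 0.5), and D and E are
        # negatively charged (NegativelyCharged = 0.5).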

    mysql.closeConnection(conn, cursor)
def main(UPExternalLinks, tableUniProt2Ensembl, tableUniProt2GO, tableUniProt2UniGene, tableUniProt2HGNC,
         tableDict, schemaProteins, databasePassword):

    # Determine all the external database links for each representative UniProt accession.
    proteinLinks = {}
    readIn = open(UPExternalLinks, 'r')
    for line in readIn:
        line = line.strip()
        chunks = line.split(',')
        entrezLinks = [i for i in chunks[1].split(';')] if chunks[1] != '' else []
        unigeneLinks = [int(i.replace('Hs.', '')) for i in chunks[2].split(';')] if chunks[2] != '' else []
        goLinks = [int(i.replace('GO:', '')) for i in chunks[3].split(';')] if chunks[3] != '' else []
        hgncLinks = [i for i in chunks[4].split(';')] if chunks[4] != '' else []
        ensemblLinks = [i.split('-') for i in chunks[5].split(';')] if chunks[5] != '' else []
        proteinLinks[chunks[0]] = {'Entrez' : entrezLinks, 'UniGene' : unigeneLinks, 'GO' : goLinks,
                                   'HGNC' : hgncLinks, 'Ensembl' : ensemblLinks
                                   }
    readIn.close()
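
    # Each line of the cross-reference file is therefore expected to look something like
    # (fields inferred from the parsing above; the values shown are placeholders):
    #     <UPAccession>,<Entrez1>;<Entrez2>,Hs.1234,GO:0008150,<HGNC1>,<ENSG>-<ENST>
    # with semicolon-separated multi-valued fields and an empty string where no link exists.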

    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)

    validXrefIDs = {'EnsemblGeneID' : set([]), 'GeneID' : set([]), 'GOTermID' : set([]), 'UPAccession' : set([]),
                    'UniGeneID' : set([]), 'EnsemblTranscriptID' : set([])}
    for i in tableDict.keys():
        for j in tableDict[i]:
            # For each external database with its own table in the database (e.g. Ensembl genes, UniGene, GO terms),
            # determine which of the recorded cross-references actually reference a valid ID in that database.
            # For example, if the file of cross-references says that UniProt accession U is linked to GO term 123,
            # then make sure that 123 is in fact a valid GO term ID (i.e. that it appears in the table of all GO term IDs).
            cursor = mysql.tableSELECT(cursor, i, j)
            results = [k[0] for k in cursor.fetchall()]
            results = set(results)
            validXrefIDs[i] = validXrefIDs[i].union(results)

    # Determine all the UniProt accession cross-references that are referencing a valid external database identifier.
    entrezInsert = []
    unigeneInsert = []
    goInsert = []
    hgncInsert = []
    ensemblInsert = []
    for i in proteinLinks.keys():
        if not i in validXrefIDs['UPAccession']:
            continue
        for j in proteinLinks[i]['Entrez']:
            if j in validXrefIDs['GeneID']:
                entrezInsert.append(tuple([i, str(j)]))
        for j in proteinLinks[i]['UniGene']:
            if j in validXrefIDs['UniGeneID']:
                unigeneInsert.append(tuple([i, j]))
        for j in proteinLinks[i]['GO']:
            if j in validXrefIDs['GOTermID']:
                goInsert.append(tuple([i, j]))
        for j in proteinLinks[i]['HGNC']:
            hgncInsert.append(tuple([i, j]))
        for j in proteinLinks[i]['Ensembl']:
            if j[0] in validXrefIDs['EnsemblGeneID']:# and j[1] in validXrefIDs['EnsemblTranscriptID']:
                ensemblInsert.append(tuple([i] + j))

    print '\tNow Recording UniGene Cross-references.'
    cursor.execute('TRUNCATE TABLE ' + tableUniProt2UniGene)
    values = '(' + ('%s,' * len(unigeneInsert[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableUniProt2UniGene, values, unigeneInsert)
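
    # The format string built above expands to one '%s' placeholder per column, e.g. a
    # two-column insert uses values == '(%s,%s)'; the same pattern is reused for each of
    # the cross-reference tables below.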

    print '\tNow Recording GO Cross-references.'
    cursor.execute('TRUNCATE TABLE ' + tableUniProt2GO)
    values = '(' + ('%s,' * len(goInsert[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableUniProt2GO, values, goInsert)

    print '\tNow Recording HGNC Cross-references.'
    cursor.execute('TRUNCATE TABLE ' + tableUniProt2HGNC)
    values = '(' + ('%s,' * len(hgncInsert[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableUniProt2HGNC, values, hgncInsert)

    print '\tNow Recording Ensembl Cross-references.'
    cursor.execute('TRUNCATE TABLE ' + tableUniProt2Ensembl)
    values = '(' + ('%s,' * len(ensemblInsert[0]))
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableUniProt2Ensembl, values, ensemblInsert)

    mysql.closeConnection(conn, cursor)
def main(UPProteinInfo, schemaProteins, tableProteinInfo, folderSEG, SEGExe, folderEpestfind, epestfindExe,
         folderPepstats, pepstatsExe, folderBLAST, psiblastExe, makeBLASTDatabaseExe, tableBLASTResults,
         databasePassword):

    #===========================================================================
    # Extract and format the parsed protein data.
    #===========================================================================
    uniprotData = utilities.file2list.main(UPProteinInfo)
    uniprotData = [eval(i) for i in uniprotData]
    uniprotDict = dict([(i[0], i) for i in uniprotData])
    sequenceOfAllProteinsDict = dict([(i, uniprotDict[i][-1]) for i in uniprotDict])  # A dictionary indexed by UniProt accession with the value for each index being the sequence of the protein.

    #===========================================================================
    # Add the data to the database.
    #===========================================================================
    rowsToAdd = [uniprotDict[i] for i in uniprotDict]
    values = '(' + ('%s,' * len(uniprotData[0]))
    values = values[:-1] + ')'
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableINSERT(cursor, tableProteinInfo, values, rowsToAdd)
    mysql.closeConnection(conn, cursor)
    print '\tEntries added to the UniProt table: ', len(rowsToAdd)

    #===========================================================================
    # Annotate the proteins which have just been added to the database.
    #===========================================================================
    # Calculate the number of low complexity regions.
    proteinFasta = folderSEG + '/TempSEGFasta.fasta'
    SEGOutput = folderSEG + '/TempSEGOutput.txt'
    calculate_low_complexity(sequenceOfAllProteinsDict, SEGExe, proteinFasta, SEGOutput, schemaProteins,
                             tableProteinInfo, databasePassword)
    os.remove(proteinFasta)
    os.remove(SEGOutput)

    # Calculate the number of pest motifs.
    proteinFasta = folderEpestfind + '/TempEpestfindFasta.fasta'
    epestfindOutput = folderEpestfind + '/TempEpestfindOutput.txt'
    calculate_pest_motif(sequenceOfAllProteinsDict, epestfindExe, proteinFasta, epestfindOutput, schemaProteins,
                         tableProteinInfo, databasePassword)
    os.remove(proteinFasta)
    os.remove(epestfindOutput)

    # Calculate simple sequence statistics.
    proteinFasta = folderPepstats + '/TempPepstatsFasta.fasta'
    pepstatsOutput = folderPepstats + '/TempPepstatsOutput.txt'
    calculate_sequence_stats(sequenceOfAllProteinsDict, pepstatsExe, proteinFasta, pepstatsOutput, schemaProteins,
                             tableProteinInfo, databasePassword)
    os.remove(proteinFasta)
    os.remove(pepstatsOutput)

    #===========================================================================================
    # Use BLAST to determine the pairwise sequence identity of all the proteins in the table.
    #===========================================================================================
    BLASTOutput = folderBLAST + '/ProcessedBLAST.txt'
    if os.path.isfile(BLASTOutput):
        os.remove(BLASTOutput)

    # Generate the fasta file from which the BLAST database will be made.
    allProteinsFasta = folderBLAST + '/TempAllProteinsFasta.fasta'
    writeTo = open(allProteinsFasta, 'w')
    for i in sequenceOfAllProteinsDict.keys():
        writeTo.write('>' + i + '\n')
        writeTo.write(sequenceOfAllProteinsDict[i] + '\n')
    writeTo.close()

    # Set the PSI-BLAST parameters that are shared across all queries.
    tempQuery = folderBLAST + '/TempQuery.fasta'
    evalue = ' -evalue 1'
    inclusionEThresh = ' -inclusion_ethresh 0.0001'
    numIterations = ' -num_iterations 3'
    gapTrigger = ' -gap_trigger 18'
    numDescriptions = ' -num_descriptions 10000'
    numAlignments = ' -num_alignments 10000'
    dbsize = ' -dbsize 0'
    outputFormat = ' -outfmt "7 qseqid sseqid pident length evalue"'
    numThreads = ' -num_threads 2'
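
    # Concatenated in the order used below, these options produce a command along the lines
    # of (paths elided):
    #     psiblast -query ... -out ... -evalue 1 -inclusion_ethresh 0.0001 -num_iterations 3
    #              -gap_trigger 18 -num_descriptions 10000 -num_alignments 10000 -dbsize 0
    #              -db ... -outfmt "7 qseqid sseqid pident length evalue" -num_threads 2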

    # BLAST the proteins.
    tempBlastDatabaseFolder = folderBLAST + '/TempAllProtDB'
    os.mkdir(tempBlastDatabaseFolder)
    tempBlastDatabase = tempBlastDatabaseFolder + '/AllProt'
    makeDBArgs = makeBLASTDatabaseExe + ' -in ' + allProteinsFasta + ' -out ' + tempBlastDatabase
    subprocess.call(makeDBArgs)
    allBLASTOutput = folderBLAST + '/AllBLASTOutput.txt'
    out = ' -out ' + allBLASTOutput
    db = ' -db ' + tempBlastDatabase
    for i in sequenceOfAllProteinsDict.keys():
        writeTo = open(tempQuery, 'w')
        writeTo.write('>' + i + '\n')
        writeTo.write(sequenceOfAllProteinsDict[i] + '\n')
        writeTo.close()
        query = ' -query ' + tempQuery
        argsPSI = (query + out + evalue + inclusionEThresh + numIterations + gapTrigger + numDescriptions +
                   numAlignments + dbsize + db + outputFormat + numThreads)
        subprocess.call(psiblastExe + argsPSI, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        parsers.parsePSIBLAST.main(allBLASTOutput, BLASTOutput)
    shutil.rmtree(tempBlastDatabaseFolder)
    os.remove(allBLASTOutput)
    os.remove(tempQuery)
    os.remove(allProteinsFasta)

    # Enter the BLAST information into the blast results table.
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    matchesFound = {}
    readIn = open(BLASTOutput, 'r')
    for line in readIn:
        chunks = line.split('\t')
        index = tuple(sorted([chunks[0], chunks[1]]))
        query = index[0]
        hit = index[1]
        similarity = float(chunks[2])
        length = int(chunks[3])
        eValue = float(chunks[4])
        if not matchesFound.has_key(index):
            # If this is the first time the (query, hit) pair has been found.
            matchesFound[index] = {}
            matchesFound[index]['Similarity'] = similarity
            matchesFound[index]['Tuple'] = tuple([query, hit, similarity, length, eValue])
        else:
            # If the (query, hit) pair has been found previously.
            if similarity < matchesFound[index]['Similarity']:
                # If the similarity of the new (query, hit) pair is less than that of a previously found
                # occurrence of the pair, then discard this pair. Only the greatest similarity is kept for
                # each pair, as that is what determines redundancy. Continue to the next iteration, as this
                # pair should neither update the record nor be added to it.
                continue
            else:
                matchesFound[index]['Similarity'] = similarity
                matchesFound[index]['Tuple'] = tuple([query, hit, similarity, length, eValue])
    readIn.close()
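
    # A note on the parsing above: each line of the processed BLAST output is assumed to hold
    # the five tab-separated fields requested via -outfmt (query ID, subject ID, percent
    # identity, alignment length, e-value). Sorting the (query, hit) pair means the A-vs-B and
    # B-vs-A hits collapse onto a single key, so only the highest-similarity hit survives.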

    tuplesToAdd = []
    for i in matchesFound:
        query = matchesFound[i]['Tuple'][0]
        hit = matchesFound[i]['Tuple'][1]
        if not query == hit:
            tuplesToAdd.append(matchesFound[i]['Tuple'])

    if tuplesToAdd != []:
        values = '(' + ('%s,' * len(tuplesToAdd[0]))
        values = values[:-1] + ')'
        cursor.execute('TRUNCATE TABLE ' + tableBLASTResults)
        cursor = mysql.tableINSERT(cursor, tableBLASTResults, values, tuplesToAdd)
    mysql.closeConnection(conn, cursor)
def main(unigeneParsedOutput, unigeneParsedTotals, schemaProteins, tableUniGene, tableUniGeneTotals, databasePassword):

    #===========================================================================
    # Extract and format the parsed UniGene data, and the UniGene expression totals.
    #===========================================================================
    UGData = utilities.file2list.main(unigeneParsedOutput)
    UGData = [tuple([int(j) for j in eval(i)]) for i in UGData]
    UGDict = dict([(i[0], i) for i in UGData])

    UGTotalsData = utilities.file2list.main(unigeneParsedTotals)
    UGTotalsData = [tuple([j[0], eval(j[1])]) for j in [eval(i) for i in UGTotalsData]]

    #===========================================================================
    # Extract the UniGene information recorded in the database.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableSELECT(cursor, '*', tableUniGene)
    results = cursor.fetchall()
    mysql.closeConnection(conn, cursor)

    #===========================================================================
    # Compare the parsed data with the data recorded in the expression table.
    #===========================================================================
    columnIndices = range(1, len(UGData[0]))
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute('SHOW COLUMNS FROM ' + tableUniGene)
    columns = cursor.fetchall()
    mysql.closeConnection(conn, cursor)
    columns = [i[0] for i in columns]

    toRemove = []
    toUpdate = {}
    toAdd = UGDict.keys()
    for i in results:
        UniGeneID = i[0]
        if UGDict.has_key(UniGeneID):
            # If the key is in both the parsed file and the expression table, then it does not need to be added.
            toAdd.remove(i[0])
            # Compare the row from the expression table with the parsed file, to determine if the expression table needs updating.
            for j in columnIndices:
                if i[j] != UGDict[UniGeneID][j]:
                    if not toUpdate.has_key(UniGeneID):
                        toUpdate[UniGeneID] = []
                    toUpdate[UniGeneID].append(j)
        else:
            # If the key is in the expression table, but not in the parsed file, then the row needs to be removed.
            toRemove.append(i[0])
    values = '(' + ('%s,' * len(UGData[0]))
    values = values[:-1] + ')'

    #===========================================================================
    # Remove rows from the expression table that are not in the parsed file.
    #===========================================================================
    for i in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.rowDELETE(cursor, tableUniGene, 'UniGeneID="' + str(i) + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries removed from the UniGene table: ', len(toRemove)

    #===========================================================================
    # Update rows that have different values in the parsed file and the expression table.
    #===========================================================================
    for i in toUpdate.keys():
        toSet = []
        for j in toUpdate[i]:
            updateString = columns[j] + ' = "' + str(UGDict[i][j]) + '"'
            toSet.append(updateString)
        toSet = ', '.join(toSet)
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableUPDATE(cursor, tableUniGene, toSet, 'UniGeneID="' + str(i) + '"')
        mysql.closeConnection(conn, cursor)
    print '\tEntries updated in the UniGene table: ', len(toUpdate)

    #===========================================================================
    # Add rows which are not in the expression table, but are in the parsed file.
    #===========================================================================
    rowsToAdd = [UGDict[i] for i in toAdd]
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableINSERT(cursor, tableUniGene, values, rowsToAdd)
    mysql.closeConnection(conn, cursor)
    print '\tEntries added to the UniGene table: ', len(toAdd)

    #===========================================================================
    # Enter the expression totals in the totals table.
    #===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute('TRUNCATE TABLE ' + tableUniGeneTotals)
    values = '(' + ('%s,' * len(UGTotalsData[0]))
    values = values[:-1] + ')'
    cursor = mysql.tableINSERT(cursor, tableUniGeneTotals, values, UGTotalsData)
    mysql.closeConnection(conn, cursor)
    print '\tUniGene totals table updated.'
def main(parsedGOOutput, schemaProteins, tableGOInfo, databasePassword):

    # ===========================================================================
    # Extract and format the parsed GO data.
    # ===========================================================================
    GOData = utilities.file2list.main(parsedGOOutput)
    GOData = [eval(i) for i in GOData]
    GODict = dict([(i[0], i) for i in GOData])

    # ===========================================================================
    # Extract the GO information recorded in the database.
    # ===========================================================================
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor = mysql.tableSELECT(cursor, "*", tableGOInfo)
    results = cursor.fetchall()
    mysql.closeConnection(conn, cursor)

    # ===========================================================================
    # Compare the parsed data with the data recorded in the table.
    # ===========================================================================
    columnIndices = range(1, len(GOData[0]))
    conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
    cursor.execute("SHOW COLUMNS FROM " + tableGOInfo)
    columns = cursor.fetchall()
    mysql.closeConnection(conn, cursor)
    columns = [i[0] for i in columns]

    toRemove = []
    toUpdate = {}
    toAdd = GODict.keys()
    for i in results:
        geneID = i[0]
        if GODict.has_key(geneID):
            # If the key is in both the parsed file and the table, then it does not need to be added.
            toAdd.remove(i[0])
            # Compare the row from the table with the parsed file, to determine if the table needs updating.
            for j in columnIndices:
                if i[j] != GODict[geneID][j]:
                    if not toUpdate.has_key(geneID):
                        toUpdate[geneID] = []
                    toUpdate[geneID].append(j)
        else:
            # If the key is in the table, but not in the parsed file, then the row needs to be removed.
            toRemove.append(i[0])
    values = "(" + ("%s," * len(GOData[0]))
    values = values[:-1] + ")"

    # ===========================================================================
    # Remove rows from the table that are not in the parsed file.
    # ===========================================================================
    for i in toRemove:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.rowDELETE(cursor, tableGOInfo, 'GOTermID="' + str(i) + '"')
        mysql.closeConnection(conn, cursor)
    print "\tEntries removed from the GO table: ", len(toRemove)

    # ===========================================================================
    # Update rows that have different values in the parsed file and the table.
    # ===========================================================================
    for i in toUpdate.keys():
        toSet = []
        for j in toUpdate[i]:
            updateString = columns[j] + ' = "' + str(GODict[i][j]) + '"'
            toSet.append(updateString)
        toSet = ", ".join(toSet)
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableUPDATE(cursor, tableGOInfo, toSet, 'GOTermID="' + str(i) + '"')
        mysql.closeConnection(conn, cursor)
    print "\tEntries updated in the GO table: ", len(toUpdate)

    # ===========================================================================
    # Add rows which are not in the table, but are in the parsed file.
    # ===========================================================================
    # Split the records to be inserted into smaller chunks so that the database connection is not lost.
    # For this to work you may also need to increase max_allowed_packet for the MySQL server, as some
    # of the GO paths are very large. This can be done by adding the line:
    #     max_allowed_packet=32M
    # under the [mysqld] section of the default my.ini file.
    rowsToAdd = [GODict[i] for i in toAdd]
    length = len(rowsToAdd)
    itemsInSplit = 5
    numberOfSplits = length / itemsInSplit
    if length % itemsInSplit != 0:
        numberOfSplits += 1
    recordsToInsert = [rowsToAdd[i * itemsInSplit : (i + 1) * itemsInSplit] for i in range(numberOfSplits)]
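    # Chunking example: 12 rows with itemsInSplit = 5 gives numberOfSplits = 3, i.e. batches
    # of 5, 5 and 2 rows, each inserted over its own short-lived connection.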
    for i in recordsToInsert:
        conn, cursor = mysql.openConnection(inputPass=databasePassword, database=schemaProteins)
        cursor = mysql.tableINSERT(cursor, tableGOInfo, values, i)
        mysql.closeConnection(conn, cursor)
    print "\tEntries added to the GO table: ", len(toAdd)
def main(pathwayElements, UPHumanNames, UPHumanAccessionMap, databasePassword, schemaProteins, tablePathways):

    # Generate the human accession mapping:
    UPAccMap = utilities.file2list.main(UPHumanAccessionMap)
    # Make a dictionary where the key is the (possibly deprecated) UniProt accession and the value is
    # the representative accession.
    allUPID = {}
    for i in UPAccMap:
        chunks = i.split()
        allUPID[chunks[0]] = chunks[1]

    # Extract the names of the valid proteins.
    validProteinNames = utilities.file2list.main(UPHumanNames)

    # Extract the information about what pathways each element is in.
    proteinData = {}
    readIn = open(pathwayElements, 'r')
    for line in readIn:
        line = line.rstrip()
        chunks = line.split('\t')
        pathway = chunks[0]
        elements = chunks[2:]
        for i in elements:
            # For every element in the pathway, extract the names and accessions of the protein. Also record the
            # name of the pathways that the protein is associated with.
            chunks = i.split(':')
            cpath = chunks[0]
            elementType = chunks[1]
            name = chunks[2]
            UPAccession = chunks[3]
            if proteinData.has_key(cpath):
                proteinData[cpath]['Names'] |= set(name.split(','))
                proteinData[cpath]['UPAccessions'] |= set(UPAccession.split(','))
                proteinData[cpath]['Pathways'].add(pathway)
            else:
                proteinData[cpath] = {'Names' : set(name.split(',')), 'UPAccessions' : set(UPAccession.split(',')), 'Pathways' : set([pathway])}
    readIn.close()
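
    # Each pathway element is therefore expected to be a colon-separated string of the form
    # (values are placeholders):
    #     <elementID>:<elementType>:<name1>,<name2>:<UPAccession1>,<UPAccession2>
    # with commas separating alternative names and accessions.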

    # Consolidate the pathway element information extracted. This means combining records with UniProt accessions
    # that map to the same accession, and removing any proteins that can't be found to have a known accession or
    # a valid name.
    consolidatedProteinData = {}
    for i in proteinData.keys():
        UPAccessions = proteinData[i]['UPAccessions']
        pathways = proteinData[i]['Pathways']
        for j in UPAccessions:
            j = j.split('-')[0]  # Combine all isoforms into one record.
            # Convert each UniProt accession into its representative form.
            if allUPID.has_key(j):
                reprAcc = allUPID[j]
                if consolidatedProteinData.has_key(reprAcc):
                    consolidatedProteinData[reprAcc]['UPAccessions'] |= UPAccessions
                    consolidatedProteinData[reprAcc]['Pathways'] |= pathways
                else:
                    consolidatedProteinData[reprAcc] = {'UPAccessions' : UPAccessions, 'Pathways' : pathways}
            else:
                # Don't use the names for anything right now.
                pass

    proteinTuples = []
    for i in consolidatedProteinData.keys():
        proteinTuples.append(tuple([i, len(consolidatedProteinData[i]['Pathways'])]))

    values = '(' + ('%s,' * len(proteinTuples[0]))
    values = values[:-1] + ')'
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    cursor.execute('TRUNCATE TABLE ' + tablePathways)
    mysql.tableINSERT(cursor, tablePathways, values, proteinTuples)
    mysql.closeConnection(conn, cursor)
def main(DBDrugIDs, DBTargetIDs, TTDUPAccessions, ChEMBLUPAccessions, UPHumanAccessionMap,
         UPDrugIDs, folderCulling, schemaProteins, tableProteinInfo, tableNonRedundant, tableBLASTResults,
         databasePassword, viewsDict):

    allTargetsDB = xref_drugbank_uniprot(DBTargetIDs, UPHumanAccessionMap)
    allTargetsTTD = xref_TTD_uniprot(TTDUPAccessions, UPHumanAccessionMap)
    # allTargetsChEMBL and allTargetsUP are used below but were never defined here; the two
    # calls that follow are assumptions (hypothetical helper names mirroring the xref_*
    # functions above) and should be checked against the actual cross-referencing functions.
    allTargetsChEMBL = xref_chembl_uniprot(ChEMBLUPAccessions, UPHumanAccessionMap)
    allTargetsUP = xref_uniprot_drugbank(UPDrugIDs, UPHumanAccessionMap)
    allTargets = list(set(allTargetsChEMBL) | set(allTargetsDB) | set(allTargetsTTD) | set(allTargetsUP))
    print '\tTotal number of unique targets found: ', len(allTargets)

    # Extract the mode of action information and reset every protein's target status to 'N'.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    cursor = mysql.tableSELECT(cursor, 'UPAccession, ModeOfAction', tableProteinInfo)
    resultsModeOfAction = cursor.fetchall()
    for i in resultsModeOfAction:
        upid = i[0]
        mysql.tableUPDATE(cursor, tableProteinInfo, 'Target="N"', 'UPAccession="' + upid + '"')
    mysql.closeConnection(conn, cursor)

    # Generate the sets of proteins that are GPCRs, kinases, ion channels and proteases.
    gpcr = []
    kinases = []
    ionChannels = []
    proteases = []
    for i in resultsModeOfAction:
        if i[1] == 'G-protein coupled receptor':
            gpcr.append(i[0])
        elif i[1] == 'Kinase':
            kinases.append(i[0])
        elif i[1] == 'Ion Channel':
            ionChannels.append(i[0])
        elif i[1] == 'Protease':
            proteases.append(i[0])

    # Update the table to indicate which proteins are targets.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    for i in allTargets:
        mysql.tableUPDATE(cursor, tableProteinInfo, 'Target="Y"', 'UPAccession="' + i + '"')
    mysql.closeConnection(conn, cursor)

    # Perform redundancy removal using Leaf.
    print '\tPerforming redundancy removal.'
    # Proteins have had their target status changed, so the redundancy needs to be recalculated.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    # Set the number of columns in the nonredundant table.
    cursor.execute('SHOW COLUMNS FROM ' + tableNonRedundant)
    numberColumns = len(cursor.fetchall())
    # Select all the proteins recorded in the database. One is subtracted from the number of columns
    # because the UPAccession column does not take the default value.
    cursor = mysql.tableSELECT(cursor, 'UPAccession', tableProteinInfo)
    allProteins = [tuple([i[0]] + (['N'] * (numberColumns - 1))) for i in cursor.fetchall()]
    # Wipe and refill the nonredundant table.
    cursor.execute('TRUNCATE TABLE ' + tableNonRedundant)
    values = '(' + ('%s,' * numberColumns)
    values = values[:-1] + ')'
    mysql.tableINSERT(cursor, tableNonRedundant, values, allProteins)
    for column in sorted(viewsDict.keys()):
        print '\t\tRunning redundancy removal on ', column
        # For each set of proteins, run the Leaf program.
        inputLocation = folderCulling + '/' + column + '.txt'
        cursor = mysql.tableSELECT(cursor, '*', viewsDict[column])
        results = cursor.fetchall()
        # Determine the accessions of the proteins in the current set.
        proteinSet = [i[0] for i in results]
        print '\t\t\tSize of Redundant Dataset: ', len(proteinSet)
        proteinSetString = '\',\''.join(proteinSet)
        # Select all the BLAST results where both the hit and query protein are in the set to cull.
        cursor = mysql.tableSELECT(cursor, '*', tableBLASTResults, 'ProteinA IN (\'' + proteinSetString + '\') AND ProteinB IN (\'' + proteinSetString + '\')')
        protResults = cursor.fetchall()
        # Generate the file that is going to be used to perform the culling.
        writeTo = open(inputLocation, 'w')
        for i in protResults:
            writeTo.write('\t'.join([str(j) for j in i]) + '\n')
        writeTo.close()
        # Perform the culling.
        adjMatrix, proteinNames = culling.adjlistcreation.main(inputLocation, cutoffPercent=20, maxEValue=1, minAlignLength=20)
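        # The thresholds passed to adjlistcreation presumably mean that an edge is added between
        # two proteins only when their alignment shows at least 20% identity, an e-value of at
        # most 1 and an alignment length of at least 20 residues; Leaf then culls proteins until
        # no edges (i.e. no redundant pairs) remain in the similarity graph.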
        print '\t\t\tNumber of Proteins in Similarity Graph: ', len(proteinNames)
        proteinsToCull = culling.Leafcull.main(adjMatrix, proteinNames)
        print '\t\t\tNumber of Proteins to Cull: ', len(proteinsToCull)
        for i in proteinsToCull:
            mysql.tableUPDATE(cursor, tableNonRedundant, column + '="N"', 'UPAccession="' + str(i) + '"')
        proteinsToKeep = [i for i in proteinSet if i not in proteinsToCull]
        print '\t\t\tNumber of Proteins to Keep: ', len(proteinsToKeep)
        for i in proteinsToKeep:
            mysql.tableUPDATE(cursor, tableNonRedundant, column + '="Y"', 'UPAccession="' + str(i) + '"')
    mysql.closeConnection(conn, cursor)
def main(UPDrugIDs, DBDrugIDs, DBTargetIDs, TTDTarget2Drug, ChEMBLUPAccessions, ChEMBLCID, bindingParsed,
         UPHumanAccessionMap, databasePassword, schemaProteins, tableDrugs):

    # Generate the human accession mapping:
    UPAccMap = utilities.file2list.main(UPHumanAccessionMap)
    # Make a dictionary where the key is the (possibly deprecated) UniProt accession and the value is
    # the representative accession.
    allUPID = {}
    for i in UPAccMap:
        chunks = i.split()
        allUPID[chunks[0]] = chunks[1]

    # Extract the DrugBank drug IDs, the CAS number and the CID for every approved DrugBank drug.
    approvedDrugBankDrugs = {}
    approvedDrugTuples = set([])
    readDrugs = open(DBDrugIDs, 'r')
    for line in readDrugs:
        chunks = (line.replace('\n', '')).split('\t')
        if len(chunks) < 5 or not 'approved' in chunks[2]:
            continue
        name = chunks[1]
        CAS = chunks[3]
        CID = chunks[4]
        approvedDrugBankDrugs[chunks[0]] = tuple([name, CAS, CID])
        approvedDrugTuples.add(tuple([name, CAS, CID]))
    readDrugs.close()
    approvedDrugs = approvedDrugBankDrugs.keys()

    # Extract the information about which DrugBank drugs target UniProt proteins, as recorded by UniProt.
    targetsUP = utilities.file2list.main(UPDrugIDs)
    targetDrugLinks = {}
    for i in targetsUP:
        chunks = i.split('\t')
        targetDrugLinks[chunks[0]] = set([i for i in chunks[1].split(';') if i in approvedDrugs])

    # Extract the information about which DrugBank drugs target UniProt proteins, as recorded by DrugBank.
    targetsDB = utilities.file2list.main(DBTargetIDs)
    for i in targetsDB:
        chunks = i.split('\t')
        proteinAcc = chunks[0]
        if not allUPID.has_key(proteinAcc):
            continue
        proteinAcc = allUPID[proteinAcc]
        if targetDrugLinks.has_key(proteinAcc):
            # Use |= so that the union is actually stored (set.union alone returns a new set
            # that would otherwise be discarded).
            targetDrugLinks[proteinAcc] |= set([i for i in chunks[1].split(';') if i in approvedDrugs])
        else:
            targetDrugLinks[proteinAcc] = set([i for i in chunks[1].split(';') if i in approvedDrugs])

    # Convert DrugBank drug IDs to CIDs where possible.
    for i in targetDrugLinks.keys():
        targetDrugLinks[i] = [approvedDrugBankDrugs[j] for j in targetDrugLinks[i]]

    # Extract the TTD target-drug relationship information.
    targetsTTD = utilities.file2list.main(TTDTarget2Drug)
    for i in targetsTTD:
        chunks = i.split('\t')
        proteinAccs = [allUPID[i] for i in chunks[1].split(',') if allUPID.has_key(i)]
        if proteinAccs == []:
            continue
        drugInfo = [tuple(j.split(',')) for j in chunks[2].split(';')]
        for j in drugInfo:
            approvedDrugTuples.add(j)
        for j in proteinAccs:
            if not targetDrugLinks.has_key(j):
                # If the UniProt accession associated with the TTD target is not already recorded as a drug target,
                # then add it along with the drugs that target it.
                targetDrugLinks[j] = drugInfo
            else:
                # The UniProt accession is already recorded in the target drug relationship dictionary.
                # Collect all the (Name, CAS, CID) tuples, and see if the drugs associated with the UniProt
                # accession in the TTD are already accounted for.
                drugTuples = targetDrugLinks[j]
                drugNames = set([])
                drugCASes = set([])
                drugCIDs = set([])
                for k in drugTuples:
                    drugNames.add(k[0])
                    drugCASes.add(k[1])
                    drugCIDs.add(k[2])
                emptySet = set([''])
                drugNames -= emptySet
                drugCASes -= emptySet
                drugCIDs -= emptySet
                for k in drugInfo:
                    if k[0] in drugNames or k[1] in drugCASes or k[2] in drugCIDs:
                        # One of the name, CAS or CID is already recorded in the list of drugs that
                        # target the protein, so the drug is already accounted for and should not be
                        # added again.
                        continue
                    targetDrugLinks[j].append(k)

    # Extract the ChEMBL drug-target relationship information, along with the binding information.
    molregno2CID = {}
    readCIDs = open(ChEMBLCID, 'r')
    for line in readCIDs:
        chunks = (line.strip()).split('\t')
        molregno2CID[chunks[0]] = chunks[1]
    readCIDs.close()
    targetDrugBinding = {}
    targetsChEMBL = utilities.file2list.main(ChEMBLUPAccessions)
    for i in targetsChEMBL:
        chunks = i.split('\t')
        UPAccession = chunks[0]
        if not allUPID.has_key(UPAccession):
            # If the target protein is not a UniProt human protein.
            continue
        else:
            UPAccession = allUPID[UPAccession]
        molregno = chunks[1]
        drugID = molregno2CID[molregno] if molregno2CID.has_key(molregno) else 'm' + molregno
        name = chunks[2]

        # Update the drug-target relationship information.
        drugInfo = tuple([name, '', drugID])
        if not targetDrugLinks.has_key(UPAccession):
            # If the UniProt accession associated with the ChEMBL target is not already recorded as a drug
            # target, then add it along with the drugs that target it.
            targetDrugLinks[UPAccession] = [drugInfo]
        else:
            # The UniProt accession is already recorded in the target drug relationship dictionary.
            # Collect all the (Name, CAS, CID) tuples, and see if the drugs associated with the UniProt
            # accession in ChEMBL are already accounted for.
            drugTuples = targetDrugLinks[UPAccession]
            drugNames = set([])
            drugCASes = set([])
            drugCIDs = set([])
            for j in drugTuples:
                drugNames.add(j[0])
                drugCASes.add(j[1])
                drugCIDs.add(j[2])
            emptySet = set([''])
            drugNames -= emptySet
            drugCASes -= emptySet
            drugCIDs -= emptySet
            if drugInfo[0] in drugNames or drugInfo[1] in drugCASes or drugInfo[2] in drugCIDs:
                # One of the name, CAS or CID is already recorded in the list of drugs that
                # target the protein, so the drug is already accounted for and should not be
                # added again.
                pass
            else:
                targetDrugLinks[UPAccession].append(drugInfo)

        # Record drug binding information.
        if chunks[4] == 'None' or chunks[5] == 'None':
            continue
        value = float(chunks[4])
        units = chunks[5]
        if units == 'M':
            # Convert the value to nM.
            value *= 1000000000
        elif units == 'mM':
            # Convert the value to nM.
            value *= 1000000
        elif units == 'uM':
            # Convert the value to nM.
            value *= 1000
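
        # Conversion factors: 1 M = 1e9 nM, 1 mM = 1e6 nM and 1 uM = 1e3 nM, so all binding
        # values are compared on a common nanomolar scale.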
        measurementType = chunks[6]
        bindingKey = tuple([UPAccession, drugID])
        if measurementType.lower() == 'ki':
            if not targetDrugBinding.has_key(bindingKey):
                targetDrugBinding[bindingKey] = {'Ki' : value, 'Kd' : float('Inf')}
            else:
                targetDrugBinding[bindingKey]['Ki'] = min(targetDrugBinding[bindingKey]['Ki'], value)
        elif measurementType.lower() == 'kd':
            if not targetDrugBinding.has_key(bindingKey):
                targetDrugBinding[bindingKey] = {'Ki' : float('Inf'), 'Kd' : value}
            else:
                targetDrugBinding[bindingKey]['Kd'] = min(targetDrugBinding[bindingKey]['Kd'], value)

    # Extract binding information from BindingDB.
    bindingData = utilities.file2list.main(bindingParsed)
    for i in bindingData:
        chunks = i.split('\t')
        UPAccession = chunks[0]
        if allUPID.has_key(UPAccession):
            UPAccession = allUPID[UPAccession]
        else:
            continue
        CID = chunks[1]
        Ki = chunks[2]
        Kd = chunks[3]
        if CID == 'n/a' or (Ki == 'n/a' and Kd == 'n/a'):
            # The record cannot be used: either the binding information cannot be
            # cross-referenced (CID == 'n/a'), or there is no binding information at all.
            continue
        if Ki[0] in ['>', '<', '=']:
            Ki = float(Ki[1:])
        else:
            try:
                Ki = float(Ki)
            except ValueError:
                pass
        if Kd[0] in ['>', '<', '=']:
            Kd = float(Kd[1:])
        else:
            try:
                Kd = float(Kd)
            except ValueError:
                pass
        bindingKey = tuple([UPAccession, CID])
        if Ki != 'n/a':
            if not targetDrugBinding.has_key(bindingKey):
                targetDrugBinding[bindingKey] = {'Ki' : Ki, 'Kd' : float('Inf')}
            else:
                targetDrugBinding[bindingKey]['Ki'] = min(targetDrugBinding[bindingKey]['Ki'], Ki)
        if Kd != 'n/a':
            if not targetDrugBinding.has_key(bindingKey):
                targetDrugBinding[bindingKey] = {'Ki' : float('Inf'), 'Kd' : Kd}
            else:
                targetDrugBinding[bindingKey]['Kd'] = min(targetDrugBinding[bindingKey]['Kd'], Kd)
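
    # Where a (protein, drug) pair has several measurements, only the smallest (i.e.
    # strongest-binding) Ki and Kd are retained, with float('Inf') standing in for a
    # measurement type that was never observed.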

    # Generate the tuples to insert into the database.
    tuplesToInsert = []
    for i in targetDrugLinks:
        for j in targetDrugLinks[i]:
            drugName = j[0]
            drugID = j[2]
            bindingKey = tuple([i, drugID])
            if targetDrugBinding.has_key(bindingKey):
                bindingData = targetDrugBinding[bindingKey]
                Ki = bindingData['Ki']
                if Ki == float('inf'):
                    Ki = -1
                Kd = bindingData['Kd']
                if Kd == float('inf'):
                    Kd = -1
            else:
                Ki = -1
                Kd = -1
            tuplesToInsert.append(tuple([i, drugID, drugName, Ki, Kd]))
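
    # Each resulting tuple has the form (values are placeholders):
    #     ('<UPAccession>', '<drugID>', '<drug name>', Ki, Kd)
    # where -1 marks a binding constant that was never measured.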

    # Remove potential duplicates caused by drug names that differ only in capitalisation,
    # since MySQL string comparisons ignore case by default.
    tuplesToInsert = [tuple([i[0], i[1], i[2].upper(), i[3], i[4]]) for i in tuplesToInsert]
    tuplesToInsert = list(set(tuplesToInsert))

    # Insert the tuples into the database.
    values = '(' + ('%s,' * len(tuplesToInsert[0]))
    values = values[:-1] + ')'
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    cursor.execute('TRUNCATE TABLE ' + tableDrugs)  # Remove the old data in the table first.
    mysql.tableINSERT(cursor, tableDrugs, values, tuplesToInsert)
    mysql.closeConnection(conn, cursor)