Example #1
0
def sequence_BLAST(processedBLAST, inputFile, database, BLASTLoc, SEG, cores):
    """Run PSI-BLAST on inputFile and hand the raw results to processPSIoutput.

    The raw PSI-BLAST output is written beside inputFile, in a file named like inputFile but with its final
    extension replaced by '.tmp'. That file and processedBLAST are then passed to processPSIoutput.main.
    The exit status of PSI-BLAST is not checked.

    @param processedBLAST: The location at which to write the output of the processing of the BLAST output.
    @type processedBLAST: string
    @param inputFile: The FASTA file which needs to be submitted to PSI-BLAST.
    @type inputFile: string
    @param database: The database to BLAST the inputFile protein against.
    @type database: string
    @param BLASTLoc: The location of the PSI-BLAST executable.
    @type BLASTLoc: string
    @param SEG: Set to True to use SEG to mask low complexity regions of the query.
    @type SEG: boolean
    @param cores: The number of threads to create to run BLAST with. None leaves the PSI-BLAST default in place.
    @type cores: integer or string

    """
    import os  # Imported locally so the function does not rely on a module-level import of os.

    # Only the final extension is replaced, so dots in directory names (or a missing extension) do not
    # corrupt the location of the temporary output file.
    outputLoc = os.path.splitext(inputFile)[0] + '.tmp'

    # Setup the parameters for the BLASTing.
    argsPSI = [
        '-query', inputFile,
        '-out', outputLoc,
        '-evalue', '1',
        '-inclusion_ethresh', '0.0001',
        '-num_iterations', '3',
        '-gap_trigger', '18',
        '-num_descriptions', '10000',
        '-num_alignments', '10000',
        '-dbsize', '0',
        '-db', database,
        '-outfmt', '7 qseqid sseqid pident length evalue',
        '-seg', 'yes' if SEG else 'no',
    ]
    if cores is not None:
        # Honour the documented thread count rather than silently ignoring it.
        argsPSI.extend(['-num_threads', str(cores)])

    # Perform the BLASTing. communicate() drains the pipe while waiting, so a chatty child process can not
    # block forever on a full pipe buffer (which subprocess.call with stdout=PIPE allows).
    blastProcess = subprocess.Popen([BLASTLoc] + argsPSI, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    blastProcess.communicate()

    # Process and cull the BLAST output.
    processPSIoutput.main(outputLoc, processedBLAST)
Example #2
0
def main(inputFile, blastOperationID, cores=2, minAlignLength=20, maxEValue=1.0, verboseOutput=False, SEG=False):
    """Perform the BLASTing of the proteins in an input file (inputFile) against each other.

    A temporary BLAST database is built from inputFile inside a working directory named blastOperationID,
    PSI-BLAST is run against it through sequence_BLAST, and the results are handed to processPSIoutput.main.
    Any existing directory called blastOperationID is deleted first, and the working directory is deleted again
    before returning.

    Returns the value produced by processPSIoutput.main, documented as a dictionary of the similarities between
    proteins, as determined by BLAST. The dictionary is indexed by an alphanumerically ordered tuple
    (index[0] < index[1]), and the entry for each index is the percentage sequence similarity.

    The BLAST version used must be the C++ version. If a different version is being used (i.e. the old C version), then
    parameters like the number of cores can not be used.

    :param inputFile:           The location of a FASTA format file of the proteins to BLAST against each other.
    :type inputFile:            string
    :param blastOperationID:    The name for the directory where the results of the BLASTing -> parsing will be stored.
    :type blastOperationID:     string
    :param cores:               The number of CPU cores on which BLAST will be run.
    :type cores:                integer
    :param minAlignLength:      The minimum permissible length for the BLAST sequence alignments.
    :type minAlignLength:       integer
    :param maxEValue:           The maximum permissible value which the BLAST EValue can take.
    :type maxEValue:            float
    :param verboseOutput:       Whether status updates of the BLASTing should be printed out to the user.
    :type verboseOutput:        boolean
    :param SEG:                 Whether SEG should be used to mask low complexity regions of the queries.
    :type SEG:                  boolean
    :returns :                  A record of the similarities between the proteins
    :type :                     dictionary

    """

    # Get the location of the BLAST executables.
    srcLocation = os.path.dirname(os.path.realpath(__file__))
    BLASTExecutables = os.path.join(srcLocation, 'BLASTExecutables')

    # Start from an empty working directory.
    outputLocation = blastOperationID
    if os.path.exists(outputLocation):
        shutil.rmtree(outputLocation)
    os.mkdir(outputLocation)

    # Generate BLAST database.
    databaseDir = outputLocation + '/TempDatabase'
    os.mkdir(databaseDir)
    makeDBArgs = [BLASTExecutables + '/makeblastdb', '-in', inputFile, '-out', databaseDir + '/TempDB', '-dbtype', 'prot']
    # communicate() drains the pipe while waiting, so makeblastdb can not stall on a full pipe buffer.
    subprocess.Popen(makeDBArgs, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()

    # Perform BLAST.
    if verboseOutput:
        print('Now BLASTing.')
    resultsBLAST = outputLocation + '/ResultsBLAST.txt'
    # sequence_BLAST takes the SEG flag before the core count, so both must be supplied.
    sequence_BLAST(resultsBLAST, inputFile, databaseDir + '/TempDB', BLASTExecutables + '/psiblast', SEG, cores)

    # Determine the similarities between proteins.
    if verboseOutput:
        print('Now determining similarities.')
    similarities = processPSIoutput.main(resultsBLAST, minAlignLength, maxEValue)

    # Remove the temporary directory used for manipulating and processing the BLAST output.
    try:
        shutil.rmtree(outputLocation)
    except OSError:
        # The first attempt can fail (e.g. a file is still in use), so wait and try once more.
        # A second failure is allowed to propagate.
        time.sleep(60)
        shutil.rmtree(outputLocation)

    return similarities
def sequence_BLAST(processedBLAST, inputFile, database, BLASTLoc, SEG, cores):
    """Will perform the process of BLAST -> PROCESS OUTPUT on inputFile.

    The raw PSI-BLAST output is written beside inputFile, in a file named like inputFile but with its final
    extension replaced by '.tmp'. That file and processedBLAST are then passed to processPSIoutput.main.
    The exit status of PSI-BLAST is not checked.

    @param processedBLAST: The location at which to write the output of the processing of the BLAST output.
    @type processedBLAST: string
    @param inputFile: The FASTA file which needs to be submitted to PSI-BLAST.
    @type inputFile: string
    @param database: The database to BLAST the inputFile protein against.
    @type database: string
    @param BLASTLoc: The location of the PSI-BLAST executable.
    @type BLASTLoc: string
    @param SEG: Set to True to use SEG to mask low complexity regions of the query.
    @type SEG: boolean
    @param cores: The number of threads to create to run BLAST with.
    @type cores: integer or string

    """
    import os  # Imported locally so the function does not rely on a module-level import of os.

    # Only the final extension is replaced. Cutting at the first dot anywhere in the path would mangle
    # locations such as './proteins.fasta' or directories containing a dot.
    outputLoc = os.path.splitext(inputFile)[0] + '.tmp'

    # Setup the parameters for the BLASTing. The command is an argument list rather than a single string, so
    # that it runs without a shell on every platform and paths containing spaces are kept intact.
    argsPSI = [
        BLASTLoc,
        '-query', inputFile,
        '-out', outputLoc,
        '-evalue', '1',
        '-inclusion_ethresh', '0.0001',
        '-num_iterations', '3',
        '-gap_trigger', '18',
        '-num_descriptions', '10000',
        '-num_alignments', '10000',
        '-dbsize', '0',
        '-db', database,
        '-outfmt', '7 qseqid sseqid pident length evalue',
        '-seg', 'yes' if SEG else 'no',
        '-num_threads', str(cores),
    ]

    # Perform the BLASTing. communicate() drains the pipe while waiting, so a chatty child process can not
    # block forever on a full pipe buffer (which subprocess.call with stdout=PIPE allows).
    blastProcess = subprocess.Popen(argsPSI, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    blastProcess.communicate()

    # Process the BLAST output.
    processPSIoutput.main(outputLoc, processedBLAST)
def _cleanField(record, key):
    """Return record[key] with surrounding whitespace removed, or '' when it is absent or a placeholder.

    The values '?' and '.' are treated as placeholders for missing data.

    """
    value = record[key].strip() if key in record else ''
    return '' if value in ('?', '.') else value


def main(mmCIFDir, parsedPDB, blastExecutables):
    """Process the entire PDB in order to extract the relevant information about the proteins in it.

    The work is done in three stages:
        1) Every file in every subdirectory of mmCIFDir is parsed with parsePDBmmCIF.main, and each protein chain
           is written to AllChains.fasta as a tab separated header line followed by its sequence.
        2) AllChains.fasta is read back in order to write Chains.tsv (one line per chain) and ReprChains.fasta
           (one entry per distinct sequence, named by the number of its sequence grouping).
        3) A BLAST database is built from ReprChains.fasta, and sequence_BLAST and processPSIoutput.main are called
           in order to produce Similarity.tsv.

    :param mmCIFDir:            The directory containing the mmCIF files for the PDB.
    :type mmCIFDir:             string
    :param parsedPDB:           The directory where the results of the parsing and culling will be written.
    :type parsedPDB:            string
    :param blastExecutables:    The location of the BLAST+ executables.
    :type blastExecutables:     string

    """

    # Define output files in the TSV format expected by the App Engine bulk uploader (TSV with header).
    if not os.path.exists(parsedPDB):
        os.mkdir(parsedPDB)
    fileChains = parsedPDB + '/Chains.tsv'
    fileSimilarity = parsedPDB + '/Similarity.tsv'
    fileAllFasta = parsedPDB + '/AllChains.fasta'
    fileReprFasta = parsedPDB + '/ReprChains.fasta'

    ##################################################################
    # Go through the mmCIF files and extract the desired information #
    ##################################################################
    uniqueSequences = set([])
    with open(fileAllFasta, 'w') as writeAllFasta:
        for subDir in os.listdir(mmCIFDir):
            mmCIFFolder = mmCIFDir + '/' + subDir

            # Process each mmCIF file in the subfolder.
            for fileName in os.listdir(mmCIFFolder):
                currentFile = mmCIFFolder + '/' + fileName
                (entryID, entityRecords, experimentalType, resolution,
                 rFactorObs, rFactorFree) = parsePDBmmCIF.main(currentFile)  # Parse the file.

                # For each record in the entry, examine the data about it. Each entry can have one or more records
                # depending on the different chains recorded in the entry.
                for recordID in entityRecords.keys():
                    record = entityRecords[recordID]
                    if 'type' not in record:
                        # Only those records with type information are of interest.
                        continue

                    chains = [entry + chain for entry in entryID for chain in record['chains']]  # The chains in the record.
                    if record['type'].strip() != 'Protein':
                        # Only interested in the record if it's a protein.
                        continue

                    dbCode = _cleanField(record, 'dbCode')  # External database identifier.
                    dbName = _cleanField(record, 'dbName')  # External database name.
                    description = _cleanField(record, 'description')  # Record description.
                    onlyAlphaCarbon = record['onlyAlphaCarbon']  # Whether the structure for the record contains only alpha carbons.
                    scientificName = _cleanField(record, 'scientificName')  # Scientific name of the organism the chain belongs to.
                    sequence = record['sequence'].upper()  # Sequence of the chain.

                    if not sequence:
                        # An empty sequence has no X fraction to compute, and a blank sequence line would cut short
                        # the two lines at a time reading of this file later on.
                        continue
                    if sequence.count('X') / float(len(sequence)) >= 0.5:
                        # If at least 50% of the amino acids in the chain are X, then the 'protein' is deemed to not
                        # be a protein.
                        continue

                    uniqueSequences.add(sequence)
                    # Everything on the header line after the chain identifier is the same for all chains in the record.
                    headerTail = '\t'.join([str(len(sequence)), experimentalType, str(resolution), str(rFactorObs),
                                            str(rFactorFree), 'no' if onlyAlphaCarbon == 0 else 'yes', description,
                                            '<' + dbName + ' ' + dbCode + '>', '[' + scientificName + ']'])
                    for chain in chains:
                        # Record the data about each chain.
                        writeAllFasta.write('>' + chain + '\t' + headerTail + '\n' + sequence + '\n')

    ####################################
    # Determine sequences for BLASTing #
    ####################################
    # Give every distinct sequence a number, which identifies the group of chains sharing that sequence.
    uniqueSequences = dict((seq, index) for index, seq in enumerate(uniqueSequences))
    sequencesUsed = set([])
    with open(fileChains, 'w') as writeChains, \
            open(fileReprFasta, 'w') as writeReprFasta, \
            open(fileAllFasta, 'r') as readAllFasta:
        # Write the header for the chains file.
        writeChains.write('\t'.join(['Chain', 'Res', 'RVal', 'SeqLen', 'NonXRay', 'AlphaCarbonOnly', 'ReprGroup']) + '\n')

        while True:
            # Read the file two lines at a time.
            identifierLine = readAllFasta.readline().strip()[1:]  # Strip off any whitespace and the > at the front.
            sequence = readAllFasta.readline().strip()
            if not sequence:
                # Reached the end of the file when there is no second line.
                break

            # Write the chain information into the App Engine TSV file.
            sequenceGrouping = uniqueSequences[sequence]
            chunks = identifierLine.split('\t')
            chain = chunks[0]
            resolution = chunks[3]
            rVal = chunks[4]
            nonXRay = 'no' if chunks[2] == 'XRAY' else 'yes'
            alphaCarbonOnly = chunks[6]
            writeChains.write('\t'.join([chain, resolution, rVal, str(len(sequence)), nonXRay, alphaCarbonOnly,
                                         str(sequenceGrouping)]) + '\n')

            # Record the representative fasta file for BLASTing.
            if sequenceGrouping not in sequencesUsed:
                # Only record the chain if no other chain from its sequence grouping has been recorded.
                writeReprFasta.write('>' + str(sequenceGrouping) + '\n' + sequence + '\n')
                # Update the record of the sequence groupings that have been recorded in the BLASTing file.
                sequencesUsed.add(sequenceGrouping)

    ################
    # Run BLASTing #
    ################
    # Generate BLAST database.
    databaseDir = parsedPDB + '/BLASTdatabase'
    if os.path.exists(databaseDir):
        shutil.rmtree(databaseDir)
    os.mkdir(databaseDir)
    makeDBArgs = [blastExecutables + '/makeblastdb', '-in', fileReprFasta, '-out', databaseDir + '/TempDB', '-dbtype', 'prot']
    # communicate() drains the pipe while waiting, so makeblastdb can not stall on a full pipe buffer.
    subprocess.Popen(makeDBArgs, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()

    # BLAST chains.
    resultsBLAST = parsedPDB + '/ResultsBLAST.txt'
    if os.path.exists(resultsBLAST):
        os.remove(resultsBLAST)
    # NOTE(review): this call passes five positional arguments with the FASTA file first. A sequence_BLAST with the
    # signature (processedBLAST, inputFile, database, BLASTLoc, SEG, cores) would reject it, so confirm which
    # sequence_BLAST is in scope here and what argument order it expects.
    sequence_BLAST(fileReprFasta, resultsBLAST, databaseDir + '/TempDB', blastExecutables + '/psiblast.exe', 2)
    processPSIoutput.main(resultsBLAST, fileSimilarity)