def sequence_BLAST(processedBLAST, inputFile, database, BLASTLoc, SEG, cores):
    """Will perform the process of BLAST -> PROCESS OUTPUT on inputFile.

    PSI-BLAST is run with inputFile as the query. Its tabular output is written to a temporary file that has the
    same path as inputFile, but with the extension replaced by '.tmp'. That temporary file is then passed, together
    with processedBLAST, to processPSIoutput.main. The temporary file is not deleted by this function.

    @param processedBLAST: The location at which to write the output of the processing of the BLAST output.
    @type processedBLAST: string
    @param inputFile: The FASTA file which needs to be submitted to PSI-BLAST.
    @type inputFile: string
    @param database: The database to BLAST the inputFile protein against
    @type database: string
    @param BLASTLoc: The location of the PSI-BLAST executable.
    @type BLASTLoc: string
    @param SEG: Set to True to use SEG to mask low complexity regions of the query.
    @type SEG: boolean
    @param cores: The number of threads to create to run BLAST with. It is converted with str(), so an integer or a
                  string of digits both work. A false value (None, 0, '') leaves the thread count at the BLAST default.
    @type cores: integer or string

    """
    import os  # Local import so that the function does not rely on the module importing os.

    # Setup the parameters for the BLASTing.
    # splitext only strips the final extension of the file name, so dots in directory names and files
    # without any extension are both handled.
    outputLoc = os.path.splitext(inputFile)[0] + '.tmp'
    argsPSI = [
        '-query', inputFile,
        '-out', outputLoc,
        '-evalue', '1',
        '-inclusion_ethresh', '0.0001',
        '-num_iterations', '3',
        '-gap_trigger', '18',
        '-num_descriptions', '10000',
        '-num_alignments', '10000',
        '-dbsize', '0',
        '-db', database,
        '-outfmt', '7 qseqid sseqid pident length evalue',
        '-seg', 'yes' if SEG else 'no',
    ]
    if cores:
        argsPSI.extend(['-num_threads', str(cores)])

    # Perform the BLASTing.
    # The console output is discarded. It is sent to the null device rather than to a pipe, because a pipe that
    # is never read from will stall the child process once the pipe buffer fills up.
    argToBLAST = [BLASTLoc] + argsPSI
    with open(os.devnull, 'w') as devnull:
        subprocess.call(argToBLAST, stdout=devnull, stderr=subprocess.STDOUT)

    # Process the BLAST output.
    processPSIoutput.main(outputLoc, processedBLAST)
def main(inputFile, blastOperationID, cores=2, minAlignLength=20, maxEValue=1.0, verboseOutput=False, SEG=False):
    """Perform the BLASTing of the proteins in an input file (inputFile) against each other.

    A temporary BLAST protein database is built from inputFile, inputFile is then PSI-BLASTed against that database
    and the results are handed to processPSIoutput.main. The value returned by processPSIoutput.main is returned
    unchanged. According to the original documentation of this function this is a dictionary of the similarities
    between proteins, indexed by an alphanumerically ordered tuple (index[0] < index[1]), where the entry for each
    index is the percentage sequence similarity.

    The BLAST executables (makeblastdb and psiblast) are looked for in a directory called BLASTExecutables that sits
    next to this source file.

    The BLAST version used must be the C++ version. If a different version is being used (i.e. the old C version),
    then parameters like the number of cores can not be used.

    :param inputFile:           The location of a FASTA format file of the proteins to BLAST against each other.
    :type inputFile:            string
    :param blastOperationID:    The name for the directory where the results of the BLASTing -> parsing will be
                                stored. Any existing directory with this name is deleted first, and the directory
                                is deleted again before returning.
    :type blastOperationID:     string
    :param cores:               The number of CPU cores on which BLAST will be run.
    :type cores:                integer
    :param minAlignLength:      The minimum permissible length for the BLAST sequence alignments.
    :type minAlignLength:       integer
    :param maxEValue:           The maximum permissible value which the BLAST EValue can take.
    :type maxEValue:            float
    :param verboseOutput:       Whether status updates of the BLASTing should be printed out to the user.
    :type verboseOutput:        boolean
    :param SEG:                 Whether SEG should be used to mask low complexity regions of the query. Defaults to
                                False, which corresponds to '-seg no'.
    :type SEG:                  boolean
    :returns :                  A record of the similarities between the proteins
    :type :                     dictionary

    """

    # Get the location of the BLAST executables.
    srcLocation = os.path.dirname(os.path.realpath(__file__))
    BLASTExecutables = os.path.join(srcLocation, 'BLASTExecutables')

    # Start from an empty working directory.
    outputLocation = blastOperationID
    if os.path.exists(outputLocation):
        shutil.rmtree(outputLocation)
    os.mkdir(outputLocation)

    # Generate BLAST database.
    databaseDir = os.path.join(outputLocation, 'TempDatabase')
    os.mkdir(databaseDir)
    database = os.path.join(databaseDir, 'TempDB')
    makeDBArgs = [os.path.join(BLASTExecutables, 'makeblastdb'), '-in', inputFile, '-out', database,
                  '-dbtype', 'prot']
    # The console output is discarded via the null device. An unread pipe would stall makeblastdb once the
    # pipe buffer filled up.
    with open(os.devnull, 'w') as devnull:
        subprocess.call(makeDBArgs, stdout=devnull, stderr=subprocess.STDOUT)

    # Perform BLAST.
    if verboseOutput:
        print('Now BLASTing.')
    resultsBLAST = os.path.join(outputLocation, 'ResultsBLAST.txt')
    sequence_BLAST(resultsBLAST, inputFile, database, os.path.join(BLASTExecutables, 'psiblast'), SEG, cores)

    # Determine the similarities between proteins.
    if verboseOutput:
        print('Now determining similarities.')
    similarities = processPSIoutput.main(resultsBLAST, minAlignLength, maxEValue)

    # Remove the temporary directory used for manipulating and processing the BLAST output.
    try:
        shutil.rmtree(outputLocation)
    except OSError:
        # Wait and try exactly once more. Presumably this gives anything still holding files in the directory
        # time to release them. A second failure is allowed to propagate to the caller.
        time.sleep(60)
        shutil.rmtree(outputLocation)

    return similarities
def sequence_BLAST(processedBLAST, inputFile, database, BLASTLoc, SEG, cores):
    """Will perform the process of BLAST -> PROCESS OUTPUT on inputFile.

    PSI-BLAST is run with inputFile as the query. Its tabular output is written to a temporary file that has the
    same path as inputFile, but with the extension replaced by '.tmp'. That temporary file is then passed, together
    with processedBLAST, to processPSIoutput.main. The temporary file is not deleted by this function.

    @param processedBLAST: The location at which to write the output of the processing of the BLAST output.
    @type processedBLAST: string
    @param inputFile: The FASTA file which needs to be submitted to PSI-BLAST.
    @type inputFile: string
    @param database: The database to BLAST the inputFile protein against
    @type database: string
    @param BLASTLoc: The location of the PSI-BLAST executable.
    @type BLASTLoc: string
    @param SEG: Set to True to use SEG to mask low complexity regions of the query.
    @type SEG: boolean
    @param cores: The number of threads to create to run BLAST with. It is converted with str(), so an integer or a
                  string of digits both work.
    @type cores: integer or string

    """
    import os  # Local import so that the function does not rely on the module importing os.

    # Setup the parameters for the BLASTing.
    # splitext only strips the final extension of the file name, so a path such as './proteins.fasta' or one
    # with dots in a directory name still gives a temporary file next to the input file.
    outputLoc = os.path.splitext(inputFile)[0] + '.tmp'

    # The command is built as a list of arguments rather than as one string. A single command string is only
    # understood without a shell on Windows, and even there it breaks when a path contains a space.
    argToBLAST = [
        BLASTLoc,
        '-query', inputFile,
        '-out', outputLoc,
        '-evalue', '1',
        '-inclusion_ethresh', '0.0001',
        '-num_iterations', '3',
        '-gap_trigger', '18',
        '-num_descriptions', '10000',
        '-num_alignments', '10000',
        '-dbsize', '0',
        '-db', database,
        '-outfmt', '7 qseqid sseqid pident length evalue',
        '-seg', 'yes' if SEG else 'no',
        '-num_threads', str(cores),
    ]

    # Perform the BLASTing.
    # The console output is discarded. It is sent to the null device rather than to a pipe, because a pipe that
    # is never read from will stall the child process once the pipe buffer fills up.
    with open(os.devnull, 'w') as devnull:
        subprocess.call(argToBLAST, stdout=devnull, stderr=subprocess.STDOUT)

    # Process the BLAST output.
    processPSIoutput.main(outputLoc, processedBLAST)
def _cleanOptionalField(record, key):
    """Return record[key] with whitespace stripped, or '' if the key is missing or the value is '?' or '.'."""
    value = record[key].strip() if key in record else ''
    return '' if value in ('?', '.') else value


def main(mmCIFDir, parsedPDB, blastExecutables):
    """Process the entire PDB in order to extract the relevant information about the proteins in it.

    The processing is done in three stages:
        1) Every file in every subdirectory of mmCIFDir is parsed with parsePDBmmCIF.main, and each chain of each
           protein record is written to AllChains.fasta.
        2) AllChains.fasta is read back in order to write Chains.tsv (one row per chain) and ReprChains.fasta (one
           entry per distinct sequence, named by the number of its sequence group).
        3) A BLAST database is made from ReprChains.fasta, the representative sequences are BLASTed against it and
           processPSIoutput.main is called to produce Similarity.tsv.

    :param mmCIFDir:            The directory containing the mmCIF files for the PDB. The files are expected to be
                                in subdirectories of this directory. Entries that are not directories are ignored.
    :type mmCIFDir:             string
    :param parsedPDB:           The directory where the results of the parsing and culling will be written. It is
                                created if it does not exist.
    :type parsedPDB:            string
    :param blastExecutables:    The location of the BLAST+ executables.
    :type blastExecutables:     string

    """

    # Define output files in the TSV format expected by the App Engine bulk uploader (TSV with header).
    if not os.path.exists(parsedPDB):
        os.mkdir(parsedPDB)
    fileChains = parsedPDB + '/Chains.tsv'
    fileSimilarity = parsedPDB + '/Similarity.tsv'
    fileAllFasta = parsedPDB + '/AllChains.fasta'
    fileReprFasta = parsedPDB + '/ReprChains.fasta'

    ##################################################################
    # Go through the mmCIF files and extract the desired information #
    ##################################################################
    uniqueSequences = set()
    with open(fileAllFasta, 'w') as writeAllFasta:
        for subDir in os.listdir(mmCIFDir):
            mmCIFFolder = mmCIFDir + '/' + subDir
            if not os.path.isdir(mmCIFFolder):
                # Stray files at the top level can not be listed, so skip them.
                continue

            # Process each mmCIF file in the subfolder.
            for fileName in os.listdir(mmCIFFolder):
                currentFile = mmCIFFolder + '/' + fileName
                (entryID, entityRecords, experimentalType,
                 resolution, rFactorObs, rFactorFree) = parsePDBmmCIF.main(currentFile)  # Parse the file.

                # For each record in the entry, examine the data about it. Each entry can have one or more record
                # depending on the different chains recorded in the entry.
                for record in entityRecords.values():
                    # Only those records with type information are of interest, and of those only the proteins.
                    if 'type' not in record or record['type'].strip() != 'Protein':
                        continue

                    chains = [entry + chain for entry in entryID for chain in record['chains']]  # The chains in the record.
                    dbCode = _cleanOptionalField(record, 'dbCode')  # External database identifier.
                    dbName = _cleanOptionalField(record, 'dbName')  # External database name.
                    description = _cleanOptionalField(record, 'description')  # Record description.
                    scientificName = _cleanOptionalField(record, 'scientificName')  # Scientific name of the organism the chain belongs to.
                    onlyAlphaCarbon = record['onlyAlphaCarbon']  # Whether the structure for the record contains only alpha carbons.
                    sequence = record['sequence'].upper()  # Sequence of the chain.

                    if not sequence:
                        # An empty sequence has no X fraction to calculate. It would also put an empty line in the
                        # FASTA file, and the reading loop below treats an empty sequence line as the end of the file.
                        continue
                    if sequence.count('X') / float(len(sequence)) >= 0.5:
                        # If at least 50% of the amino acids in the chain are X, then the 'protein' is deemed to
                        # not be a protein.
                        continue

                    uniqueSequences.add(sequence)

                    # Everything after the chain name is the same for all chains of the record.
                    chainInfo = '\t'.join([
                        str(len(sequence)),
                        experimentalType,
                        str(resolution),
                        str(rFactorObs),
                        str(rFactorFree),
                        'no' if onlyAlphaCarbon == 0 else 'yes',
                        description,
                        '<' + dbName + ' ' + dbCode + '>',
                        '[' + scientificName + ']',
                    ])
                    for chain in chains:  # Record the data about each chain.
                        writeAllFasta.write('>' + chain + '\t' + chainInfo + '\n' + sequence + '\n')

    ####################################
    # Determine sequences for BLASTing #
    ####################################
    # Give each distinct sequence a number. The set is replaced by the mapping so that both are not kept in memory.
    uniqueSequences = dict((sequence, index) for index, sequence in enumerate(uniqueSequences))
    sequencesUsed = set()
    with open(fileAllFasta, 'r') as readAllFasta, \
            open(fileChains, 'w') as writeChains, \
            open(fileReprFasta, 'w') as writeReprFasta:
        # Write the header for the chains file.
        writeChains.write('\t'.join(['Chain', 'Res', 'RVal', 'SeqLen', 'NonXRay', 'AlphaCarbonOnly', 'ReprGroup']) + '\n')

        while True:
            # Read the file two lines at a time.
            identifierLine = readAllFasta.readline().strip()[1:]  # Strip off any whitespace and the > at the front.
            sequence = readAllFasta.readline().strip()
            if not sequence:
                # Reached the end of the file when there is no second line.
                break

            # Write the chain information into the App Engine TSV file. The positions of the chunks are those of
            # the identifier lines written to the file of all chains above.
            sequenceGrouping = uniqueSequences[sequence]
            chunks = identifierLine.split('\t')
            chain = chunks[0]
            resolution = chunks[3]
            rVal = chunks[4]
            nonXRay = 'no' if chunks[2] == 'XRAY' else 'yes'
            alphaCarbonOnly = chunks[6]
            writeChains.write('\t'.join([chain, resolution, rVal, str(len(sequence)), nonXRay, alphaCarbonOnly,
                                         str(sequenceGrouping)]) + '\n')

            # Record the representative fasta file for BLASTing. Only record the chain if no other chain from its
            # sequence grouping has been recorded.
            if sequenceGrouping not in sequencesUsed:
                writeReprFasta.write('>' + str(sequenceGrouping) + '\n' + sequence + '\n')
                sequencesUsed.add(sequenceGrouping)

    ################
    # Run BLASTing #
    ################
    # Generate BLAST database.
    databaseDir = parsedPDB + '/BLASTdatabase'
    if os.path.exists(databaseDir):
        shutil.rmtree(databaseDir)
    os.mkdir(databaseDir)
    makeDBArgs = [blastExecutables + '/makeblastdb', '-in', fileReprFasta, '-out', databaseDir + '/TempDB',
                  '-dbtype', 'prot']
    # The console output is discarded via the null device. An unread pipe would stall makeblastdb once the
    # pipe buffer filled up.
    with open(os.devnull, 'w') as devnull:
        subprocess.call(makeDBArgs, stdout=devnull, stderr=subprocess.STDOUT)

    # BLAST chains.
    resultsBLAST = parsedPDB + '/ResultsBLAST.txt'
    if os.path.exists(resultsBLAST):
        os.remove(resultsBLAST)
    # sequence_BLAST takes (processedBLAST, inputFile, database, BLASTLoc, SEG, cores), so the results file goes
    # first and the FASTA file of representative sequences is the query. SEG masking is turned off and 2 threads
    # are used.
    # NOTE(review): the original call passed the FASTA file first and gave no SEG value. Confirm that SEG off is
    # what is wanted, and that processPSIoutput.main is meant to be run again on the file sequence_BLAST writes.
    sequence_BLAST(resultsBLAST, fileReprFasta, databaseDir + '/TempDB', blastExecutables + '/psiblast.exe', False, 2)
    processPSIoutput.main(resultsBLAST, fileSimilarity)