def _iter_tsv_rows(path):
    """Yield every line of a tab-separated file as a list of fields.

    Each line is stripped of surrounding whitespace before it is split on tabs.
    The file is closed as soon as the last line has been consumed.

    @param path: The location of the tab-separated file.
    @type path:  string

    """
    with open(path, "r") as handle:
        for line in handle:
            yield line.strip().split("\t")


def _collect_chains(proteinData, isSelected, passesFilters):
    """Scan the protein information file and gather the chains of interest.

    @param proteinData:   The location of the protein information file.
    @type proteinData:    string
    @param isSelected:    Called as isSelected(chain, entry); returns True if the chain is part of the input.
    @type isSelected:     callable
    @param passesFilters: Called with the fields of a selected line; returns True if the chain meets the quality criteria.
    @type passesFilters:  callable
    @return:              The list of all selected chains, and the set of selected chains that passed the filters.
    @rtype:               (list, set)

    """
    potentialChains = []
    chainsToCull = set()
    for chunks in _iter_tsv_rows(proteinData):
        chain = chunks[0]
        if not isSelected(chain, chunks[1]):
            continue
        potentialChains.append(chain)
        # The filter is only evaluated for selected chains, so the numeric fields of other lines are never parsed.
        if passesFilters(chunks):
            chainsToCull.add(chain)
    return potentialChains, chainsToCull


def _load_representatives(representativeData, chains):
    """Read the non-representative to representative chain mapping for the given chains.

    @param representativeData: The location of the file of (non-representative chain, representative chain) pairs.
    @type representativeData:  string
    @param chains:             The chains for which a representative should be looked up.
    @type chains:              set
    @return:                   A mapping from each non-representative chain in chains to its representative.
    @rtype:                    dictionary

    """
    representatives = {}
    for chunks in _iter_tsv_rows(representativeData):
        if chunks[0] in chains:
            representatives[chunks[0]] = chunks[1]
    return representatives


def _reverse_representatives(representatives):
    """Invert a non-representative -> representative mapping.

    For example, if a and b are both represented by q, then the result maps q to set([a, b]).

    @param representatives: A mapping from non-representative chains to their representative chain.
    @type representatives:  dictionary
    @return:                A mapping from each representative chain to the set of chains it represents.
    @rtype:                 dictionary

    """
    representativesReverse = {}
    for chain, reprChain in representatives.items():
        representativesReverse.setdefault(reprChain, set()).add(chain)
    return representativesReverse


def _choose_input_chains(keptReprChains, representativesReverse):
    """Convert kept representative chains back into chains from the input.

    If a kept representative chain represents chains from the input, then one of those chains is
    selected. Otherwise the representative chain was itself in the input, and is kept.

    @param keptReprChains:         The representative chains that survived the culling.
    @type keptReprChains:          iterable
    @param representativesReverse: A mapping from representative chains to the set of input chains they represent.
    @type representativesReverse:  dictionary
    @return:                       The input chains to keep.
    @rtype:                        set

    """
    keptInputChains = set()
    for reprChain in keptReprChains:
        if reprChain in representativesReverse:
            keptInputChains.add(next(iter(representativesReverse[reprChain])))
        else:
            keptInputChains.add(reprChain)
    return keptInputChains


def main(args):
    """Runs the protein culling.

    The command line is parsed and validated, the output directory is (re)created in the current
    working directory, the processed PDB data files are read, and the culling is performed either
    by chain or by entry. The results are written to Removed.txt, KeptList.txt and KeptFasta.fasta
    in the output directory. The program exits through sys.exit() if the input is invalid.

    @param args: The command line arguments.
    @type args:  A Python list.

    """

    # ===========================================================================
    # Parse the user's input.
    # ===========================================================================
    parser = argparse.ArgumentParser(
        description=(
            "Generate a non-redundant dataset of chains or entries from the PDB. "
            + "Please see the README for more information on how to use this program."
        ),
        epilog=(
            "This program is designed to cull a dataset of protein sequences so that no "
            + "two sequences have a sequence identity greater than the specified threshold "
            + "percentage. The method used is the Leaf heuristic, which is described in PAPER. "
            + "A server to perform the culling can be found at http://www.bioinf.manchester.ac.uk/leaf/."
        ),
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "-i",
        "--inputFile",
        help="The location of the input file. (Required type: %(type)s).",
        metavar="inputFile",
        type=str,
        default="",
    )
    group.add_argument(
        "-w",
        "--whole",
        help="Cull chains/entries from the entire PDB. (Default value: %(default)s).",
        action="store_true",
        default=False,
    )
    group.add_argument(
        "-n",
        "--organism",
        help="Cull chains/entries from a specified organism. Replace spaces with underscores ('_'). (Required type: %(type)s).",
        metavar="organismName",
        type=str,
        default="",
    )
    parser.add_argument(
        "-p",
        "--percent",
        help="The maximum percent sequence identity between sequences. 5 <= maxPercent < 100 must be true. (Required type: %(type)s, default value: %(default)s).",
        metavar="maxPercent",
        type=float,
        default=20,
        required=False,
    )
    parser.add_argument(
        "-q",
        "--minRes",
        help="The minimum resolution a chain/entry can have. 0 <= minResolution <= 100 must be true. Must not be greater than the maximum resolution. (Required type: %(type)s, default value: %(default)s).",
        metavar="minResolution",
        type=float,
        default=0.0,
        required=False,
    )
    parser.add_argument(
        "-r",
        "--maxRes",
        help="The maximum resolution a chain/entry can have. 0 <= maxResolution <= 100 must be true. Must not be less than the minimum resolution. (Required type: %(type)s, default value: %(default)s).",
        metavar="maxResolution",
        type=float,
        default=3.0,
        required=False,
    )
    parser.add_argument(
        "-l",
        "--rval",
        help="The maximum R value a chain/entry can have. 0 <= maxRValue <= 1 must be true. (Required type: %(type)s, default value: %(default)s).",
        metavar="maxRValue",
        type=float,
        default=0.5,
        required=False,
    )
    parser.add_argument(
        "-m",
        "--minLen",
        help="The minimum sequence length permissible. A negative value means not to use a minimum sequence length. Must not be greater than the maximum sequence length. (Required type: %(type)s, default value: Not Used).",
        metavar="minLength",
        type=int,
        required=False,
        default=-1,
    )
    parser.add_argument(
        "-a",
        "--maxLen",
        help="The maximum sequence length permissible. A negative value means not to use a maximum sequence length. Must not be less than the minimum sequence length. (Required type: %(type)s, default value: Not Used).",
        metavar="maxLength",
        type=int,
        required=False,
        default=-1,
    )
    parser.add_argument(
        "-x",
        "--usenonxray",
        help="Whether non-X-ray chains/entries should be included. (Default value: Not used).",
        action="store_true",
        default=False,
        required=False,
    )
    parser.add_argument(
        "-u",
        "--usealpha",
        help="Whether alpha carbon only chains/entries should be included. (Default value: Not used).",
        action="store_true",
        default=False,
        required=False,
    )
    parser.add_argument(
        "-c",
        "--cullbyentry",
        help="Whether the culling should be by entry. (Default value: Do not cull by entry).",
        action="store_true",
        default=False,
        required=False,
    )
    parser.add_argument(
        "-e",
        "--intraentry",
        help="Whether culling should be performed within entries. Only useable when culling by entry is being used. (Default value: No intra entry culling).",
        action="store_true",
        default=False,
        required=False,
    )
    parser.add_argument(
        "-s",
        "--intrapercent",
        help="The maximum percent sequence identity between chains in the same entry 5 <= maxIntraEntryPercent < 100 must be true. Only usable when culling by entry with intra entry culling is being used. (Required type: %(type)s, default value: %(default)s).",
        metavar="maxIntraEntryPercent",
        type=float,
        default=20,
        required=False,
    )
    parser.add_argument(
        "-d",
        "--dataloc",
        help="The location of the directory that contains the processed PDB data (required type: %(type)s, default value: in the directory this script is being called from).",
        metavar="dataFileDir",
        type=str,
        default="",
        required=False,
    )
    parser.add_argument(
        "-o",
        "--output",
        help="The name of the output directory to create in the current working directory. (Required type: %(type)s, default value: %(default)s).",
        metavar="outputFolder",
        type=str,
        default="PDBCullResults",
        required=False,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Whether status updates should be displayed. (Default value: No status updates).",
        action="store_true",
        default=False,
        required=False,
    )
    # NOTE(review): the args parameter is not consulted; the options are read from sys.argv. This is
    # kept as is because it is not visible here whether callers pass sys.argv or sys.argv[1:].
    args = parser.parse_args()

    wholePDB = args.whole
    cullByOrganism = " ".join(args.organism.split("_"))
    userInput = args.inputFile
    sequenceIdentity = args.percent
    minResolution = args.minRes
    maxResolution = args.maxRes
    maxRValue = args.rval
    minLength = args.minLen
    maxLength = args.maxLen
    skipNonXray = not args.usenonxray
    skipAlphaCarbon = not args.usealpha
    cullByChain = not args.cullbyentry
    performIntraEntryCulling = args.intraentry
    intraEntrySequenceIdentity = args.intrapercent
    dataFileDir = args.dataloc
    cullOperationID = args.output
    verboseOutput = args.verbose

    # ===========================================================================
    # Validate the user's input.
    # ===========================================================================
    toExit = False
    if not wholePDB:
        if userInput != "" and not os.path.isfile(userInput):
            print("The input location supplied is not a valid file location.")
            toExit = True
        elif userInput == "" and cullByOrganism == "":
            # An empty value for -i or -n satisfies the parser, but leaves nothing to cull.
            print("Either an input file, an organism name or the whole PDB option must be supplied.")
            toExit = True
    if dataFileDir == "":
        # Load the default location that contains the data directory.
        dataFileDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "PDBData")
    elif not os.path.isdir(dataFileDir):
        # Check that the user submitted a directory as the data directory.
        print("The location supplied for the parsed PDB data is not a valid directory location.")
        toExit = True
    if sequenceIdentity < 5 or sequenceIdentity >= 100:
        print("The maximum allowable percentage sequence similarity must be no less than 5, and less than 100.")
        toExit = True
    if minResolution < 0 or minResolution > 100:
        print("The valid range for the minimum resolution is 0 - 100.")
        toExit = True
    if maxResolution < 0 or maxResolution > 100:
        print("The valid range for the maximum resolution is 0 - 100.")
        toExit = True
    if minResolution > maxResolution:
        print("The minimum resolution must be less than or equal to the maximum resolution.")
        toExit = True
    if maxRValue < 0 or maxRValue > 1:
        print("The valid range for the maximum R value is 0 - 1.")
        toExit = True
    if performIntraEntryCulling:
        if cullByChain:
            print(
                "WARNING: Culling by entry is not enabled, but intra entry culling is selected. If you choose to continue the culling will NOT be done by entry. Continue (y for yes, anything else for no)?"
            )
            cont = raw_input("--> ")
            if cont.upper() != "Y":
                toExit = True
        elif intraEntrySequenceIdentity < 5 or intraEntrySequenceIdentity >= 100:
            print(
                "The maximum allowable intra entry percentage sequence similarity must be no less than 5, and less than 100."
            )
            toExit = True
    if minLength < 0:
        minLength = -1
    if maxLength < 0:
        maxLength = -1
    # A value of -1 means that the limit is not in use, so the two limits are only compared when both are set.
    if minLength != -1 and maxLength != -1 and minLength > maxLength:
        print("The minimum sequence length must not be greater than the maximum sequence length.")
        toExit = True

    if toExit:
        sys.exit()

    # Create the directory to store the output in. Anything already at that location is replaced.
    outputLocation = os.getcwd() + "/" + cullOperationID
    try:
        if os.path.isdir(outputLocation):
            shutil.rmtree(outputLocation)
        elif os.path.exists(outputLocation):
            os.remove(outputLocation)
        os.mkdir(outputLocation)
    except EnvironmentError:
        print(
            "The output directory could not be created. Please check the location specified in the input parameters."
        )
        print(
            "If you did not specify a location then consider changing the default output location (the variable cullOperationID)"
        )
        sys.exit()

    # ===========================================================================
    # Extract the PDB data.
    # ===========================================================================
    PDBEntriesData = dataFileDir + "/" + "AllPDBEntries.txt"
    chainTypeData = dataFileDir + "/" + "ChainType.txt"
    representativeData = dataFileDir + "/" + "Representative.txt"
    similarityData = dataFileDir + "/" + "Similarity.txt"
    proteinData = dataFileDir + "/" + "ProteinInformation.txt"

    with open(PDBEntriesData, "r") as readPDBEntriesData:
        PDBEntriesList = [line.strip() for line in readPDBEntriesData]

    # Maps the first column of the chain type file to the second column.
    chainTypeDict = {}
    for chunks in _iter_tsv_rows(chainTypeData):
        chainTypeDict[chunks[0]] = chunks[1]

    proteinDict = {}
    for chunks in _iter_tsv_rows(proteinData):
        proteinDict[chunks[0]] = {"entry": chunks[1], "organism": chunks[10]}

    # ===========================================================================
    # Process the PDB data.
    # ===========================================================================
    if verboseOutput:
        print("Now parsing the processed PDB data.")

    selectedIDs = None  # The chains/entries to cull from. None means that the whole PDB is used.
    if userInput != "":
        # Check that the user's input contains no invalid chains/entries. Only necessary if the whole PDB is not being culled.
        with open(userInput, "r") as userInputFile:
            inputList = [line.strip() for line in userInputFile.read().split("\n")]

        if cullByChain:
            # If the user has selected to cull 'by chain'.
            # Index the chains by entry once, rather than scanning every chain for each input line.
            chainsByEntry = {}
            for chain in proteinDict:
                chainsByEntry.setdefault(proteinDict[chain]["entry"], []).append(chain)

            processedInputList = set()
            for inputID in inputList:
                # Check the user input to see if any of the 'chains' supplied by the user might actually be entries.
                # Any entries supplied are converted into the chains that correspond to the entry supplied
                # (i.e. entry E will be converted to chains EA, EB, EC, etc.).
                chainsFromEntry = chainsByEntry.get(inputID, []) if len(inputID) == 4 else []
                if chainsFromEntry:
                    processedInputList.update(chainsFromEntry)
                else:
                    # Either not an entry, or a 4 character input with no known chains. Keep it, and
                    # let the input checking script raise an error if it is invalid.
                    processedInputList.add(inputID)
            retCode, retVal = checkPDBinput.main(
                list(processedInputList), allChains=chainTypeDict, checkType="chain"
            )
        else:
            # If the user has selected to cull 'by entry'.
            # chainTypeDict maps straight to the chain type string, so the value is compared directly.
            # NOTE(review): assumes the keys are chain IDs whose first four characters are the entry - confirm.
            allProtEntries = set(
                chain[:4] for chain, chainType in chainTypeDict.items() if chainType == "Protein"
            )
            retCode, retVal = checkPDBinput.main(
                inputList, allEntries=PDBEntriesList, allProtEntries=allProtEntries, checkType="entry"
            )

        if retCode != 0:
            # An error was found in the input.
            print(retVal)
            sys.exit()
        selectedIDs = set(retVal.split("\n"))
    elif cullByOrganism != "":
        # Collect all chains/entries belonging to the organism specified.
        organismChains = [i for i in proteinDict if proteinDict[i]["organism"] == cullByOrganism]
        if cullByChain:
            selectedIDs = set(organismChains)
        else:
            selectedIDs = set(proteinDict[i]["entry"] for i in organismChains)

        if len(selectedIDs) < 2:
            # Not enough chains/entries of the given organism type.
            print(
                "There are less than 2 chains/entries in the PDB from the organism you entered. This is possibly due "
                + "to a misspelling of the organism name."
            )
            sys.exit()

    # The start time is always recorded, as it is passed on to the culling whether or not status updates are shown.
    startTime = time.time()

    def passesFilters(chunks):
        """Return True if the chain described by chunks meets the user's quality criteria."""
        experimentType = chunks[2]
        resolution = float(chunks[3])
        rValueObs = float(chunks[4])
        alphaCarbonOnly = chunks[6] != "0"
        sequenceLength = len(chunks[11])
        invalid = (
            (experimentType != "XRAY" and skipNonXray)
            or (resolution < minResolution)
            or (resolution > maxResolution)
            or (rValueObs > maxRValue)
            or (alphaCarbonOnly and skipAlphaCarbon)
            or (minLength != -1 and sequenceLength < minLength)
            or (maxLength != -1 and sequenceLength > maxLength)
        )
        return not invalid

    if selectedIDs is None:
        # Every chain in the PDB is a candidate.
        def isSelected(chain, entry):
            return True
    elif cullByChain:
        # Only the chains in the input are candidates.
        def isSelected(chain, entry):
            return chain in selectedIDs
    else:
        # Only the chains that are members of an entry in the input are candidates.
        def isSelected(chain, entry):
            return entry in selectedIDs

    # potentialChains holds all redundant and non-redundant chains, chainsToCull the chains used in the culling.
    potentialChains, chainsToCull = _collect_chains(proteinData, isSelected, passesFilters)
    if not cullByChain:
        entriesToCull = set(chain[:4] for chain in chainsToCull)

    if verboseOutput:
        print("Potential chains:  %d Valid chains:  %d" % (len(potentialChains), len(chainsToCull)))

    # Determine representative chain information.
    # representatives records the non-representative to representative chain mapping for the non-representative
    # chains in the set of chains to cull.
    representatives = _load_representatives(representativeData, chainsToCull)
    # representativeChains records the set of representative chains that cover all the chains in the set of chains to cull.
    # A chain that is a representative itself is included directly, any other chain contributes its representative.
    representativeChains = set(representatives.get(chain, chain) for chain in chainsToCull)
    # representativesReverse records for each chain that represents at least one chain in chainsToCull, a set of the
    # non-representative chains in chainsToCull that it represents.
    representativesReverse = _reverse_representatives(representatives)

    if verboseOutput:
        print("Now beginning the culling. Time elapsed:  %s" % (time.time() - startTime))

    if not cullByChain:
        # Perform the 'by entry' culling.
        # Determine the redundant user input entries.
        removedInput = cull_main(
            similarityData,
            sequenceIdentity,
            representativeChains,
            "entry",
            representativesReverse,
            verboseOutput,
            startTime,
        )
        # Determine the non-redundant user input entries.
        removedEntries = set(removedInput)
        keptInput = set(entry for entry in entriesToCull if entry not in removedEntries)

        if performIntraEntryCulling and intraEntrySequenceIdentity < 100:
            # Perform intra-entry culling.
            if verboseOutput:
                print("Now performing intra-entry culling. Time elapsed:  %s" % (time.time() - startTime))

            entryToChain = {}  # Records all the chains within each non-redundant user input entry.
            chainsOfInterest = set()  # Records all the chains that are in a non-redundant user input entry.
            for chunks in _iter_tsv_rows(proteinData):
                # Determine all the chains for each non-redundant user input entry. Only the length limits apply here.
                chain = chunks[0]
                entry = chunks[1]
                sequenceLength = len(chunks[11])
                invalid = (minLength != -1 and sequenceLength < minLength) or (
                    maxLength != -1 and sequenceLength > maxLength
                )
                if entry in keptInput and not invalid:
                    chainsOfInterest.add(chain)
                    entryToChain.setdefault(entry, []).append(chain)

            # The representative information is recalculated for the chains in the non-redundant entries.
            representatives = _load_representatives(representativeData, chainsOfInterest)
            representativeChains = set(representatives.get(chain, chain) for chain in chainsOfInterest)
            representativesReverse = _reverse_representatives(representatives)
            # Maps entries to their representative chains.
            entryToRepChain = dict(
                (entry, set(representatives.get(chain, chain) for chain in chains))
                for entry, chains in entryToChain.items()
            )

            keptInputChains = set()
            for entry in keptInput:
                if len(entryToRepChain[entry]) == 1:
                    # If the entry's chains are all represented by one chain, then all the chains are identical.
                    # One chain from the entry is kept, and the entry needs no further culling.
                    keptInputChains.add(entryToChain[entry][0])
                    del entryToRepChain[entry]

            adjList, namesList = adjlistcreation.intra_entry_main(
                similarityData, intraEntrySequenceIdentity, representativeChains, entryToRepChain
            )
            for adjacency, names in zip(adjList, namesList):
                # Perform the intra-entry culling for each entry that needs it.
                culledReprChains = Leafcull.main(adjacency, names)
                keptReprChains = [j for j in names if j not in culledReprChains]
                # The lookup is keyed on the kept representative chain (not on the position of the entry).
                # NOTE(review): representativesReverse spans all kept entries, so the chain selected for a
                # representative may belong to a different entry - confirm that this is acceptable.
                keptInputChains.update(_choose_input_chains(keptReprChains, representativesReverse))
        else:
            # If intra-entry culling is not being used.
            keptInputChains = set(chain for chain in chainsToCull if chain[:4] in keptInput)
    else:
        # Perform the 'by chain' culling.
        # Calculate the redundant representative chains.
        removedReprChains = set(
            cull_main(similarityData, sequenceIdentity, representativeChains, "chain", {}, verboseOutput, startTime)
        )
        # Determine the non-redundant representative chains.
        keptReprChains = [i for i in representativeChains if i not in removedReprChains]
        # Determine the non-redundant user input chains.
        keptInputChains = _choose_input_chains(keptReprChains, representativesReverse)
        # Determine the redundant user input chains.
        removedInput = sorted(i for i in potentialChains if i not in keptInputChains)

    if verboseOutput:
        print("Now saving results. Time elapsed:  %s" % (time.time() - startTime))

    # Write out the redundant chains/entries.
    with open(outputLocation + "/Removed.txt", "w") as writeOutRem:
        writeOutRem.write("\n".join(removedInput))

    # Determine the non-redundant chain/entry data to be output.
    keptInputOutput = "IDs\tlength\tExptl.\tresolution\tR-factor\tFreeRvalue\n"
    keptChainRows = []
    fastaRecords = []
    entryStats = {}
    for chunks in _iter_tsv_rows(proteinData):
        chain = chunks[0]
        entry = chunks[1]
        experimentType = chunks[2]
        resolution = chunks[3]
        rValueObs = chunks[4]
        rValueFree = chunks[5]
        alphaCarbon = "no" if chunks[6] == "0" else "yes"
        description = chunks[7]
        dbName = chunks[8]
        dbCode = chunks[9]
        organism = chunks[10]
        sequence = chunks[11]
        stats = [str(len(sequence)), experimentType, resolution, rValueObs, rValueFree]
        if not cullByChain and entry in keptInput:
            # The statistics of the last chain read for an entry are the ones reported for the entry.
            entryStats[entry] = stats
        if chain in keptInputChains:
            if cullByChain:
                keptChainRows.append("\t".join([chain] + stats) + "\n")
            fastaHeader = [chain] + stats + [
                alphaCarbon,
                description,
                "<" + dbName + " " + dbCode + ">",
                "[" + organism + "]",
            ]
            fastaRecords.append(">" + "\t".join(fastaHeader) + "\n" + sequence + "\n")

    if cullByChain:
        keptInputOutput += "".join(keptChainRows)
    else:
        keptInputOutput += "\n".join("\t".join([entry] + entryStats[entry]) for entry in sorted(entryStats))

    # Write out the non-redundant chain/entry statistics.
    with open(outputLocation + "/KeptList.txt", "w") as writeOutKeepList:
        writeOutKeepList.write(keptInputOutput)
    # Write out the non-redundant FASTA file of chains.
    with open(outputLocation + "/KeptFasta.fasta", "w") as writeOutKeepFasta:
        writeOutKeepFasta.write("".join(fastaRecords))

    if verboseOutput:
        print("Results saved. Total time elapsed:  %s" % (time.time() - startTime))
def run(self): if self.requestType == 'seq': try: outputLocation = userseqcontroller.main(self.request.userInput.path, self.request.sequenceIdentity, self.request.minLength, self.request.maxLength, self.request.SEG, 'UserSeq' + str(self.request.id)) readOutput = open(outputLocation + '/KeptList.txt', 'r') self.request.nonredNoSeq.save('', ContentFile(readOutput.read())) readOutput.close() readOutput = open(outputLocation + '/KeptFasta.fasta', 'r') self.request.nonredSeq.save('', ContentFile(readOutput.read())) readOutput.close() readOutput = open(outputLocation + '/Removed.txt', 'r') self.request.removed.save('', ContentFile(readOutput.read())) readOutput.close() self.request.completed = True self.request.save() logger = open('/srv/www/vhosts.d/www.bioinf/html/doig/cgi-bin/django_projects/LeafWebApp/ErrorLogs/ERROR.log', 'a') logger.write('\tFINISH: thread for request ' + str(self.request.id) + ', of type ' + str(self.requestType) + ' on ' + strftime('%Y/%m/%d/ at %H:%M:%S', gmtime()) + '.\n') logger.close() except: logger = open('/srv/www/vhosts.d/www.bioinf/html/doig/cgi-bin/django_projects/LeafWebApp/ErrorLogs/ERROR.log', 'a') logger.write('\tERROR: thread for request ' + str(self.request.id) + ', of type ' + str(self.requestType) + ' on ' + strftime('%Y/%m/%d/ at %H:%M:%S', gmtime()) + '.\n') excType, excValue, excTrace = sys.exc_info() logger.write('\t\tException type: ' + str(excType) + '\n') logger.write('\t\tException Value: ' + str(excValue) + '\n') errors = traceback.format_exception(excType, excValue, excTrace) for i in errors: logger.write('\t\t' + i) logger.close() try: outputLocation == False except: pass else: try: shutil.rmtree(outputLocation) except: sleep(60) shutil.rmtree(outputLocation) elif self.requestType == 'pdb': try: if self.request.skipNonXray: skipNonXray = 'Yes' else: skipNonXray = 'No' if self.request.skipAlphaCarbon: skipAlphaCarbon = 'Yes' else: skipAlphaCarbon = 'No' if self.request.cullByChain: cullMethod = 'Chain' else: cullMethod 
= 'Entry' if self.request.performIntraEntryCulling: performIntraEntryCulling = True intraEntrySequenceIdentity = self.request.intraEntrySequenceIdentity intraEntryCull = 'Yes' intraEntrySequenceIdentity = 'Within entry culling threshold: ' + str(self.request.intraEntrySequenceIdentity) + '\n' else: performIntraEntryCulling = False intraEntryCull = 'No' intraEntrySequenceIdentity = '' if self.request.wholePDB: wholePDB = True else: wholePDB = False sequenceIdentity = self.request.sequenceIdentity minResolution = self.request.minResolution maxResolution = self.request.maxResolution maxRValue = self.request.maxRValue minLength = self.request.minLength maxLength = self.request.maxLength proteinData = DownloadableFiles.objects.filter(fileName__exact='ProteinInformation')[0].downloadFile.path similarityData = DownloadableFiles.objects.filter(fileName__exact='Similarity')[0].downloadFile.path representativeData = DownloadableFiles.objects.filter(fileName__exact='Representative')[0].downloadFile.path checkedUserInput = open(self.request.userInput.path, 'r') retVal = checkedUserInput.read() checkedUserInput.close() if cullMethod == 'Entry': # If the method of culling is 'by entry', record the entries and convert the entries to their corresponding chains. 
if not wholePDB: userInput = retVal.split('\n') entriesUsed = userInput potentialChains = [] chainsToCull = set([]) readProteinData = open(proteinData, 'r') for i in readProteinData: chunks = (i.strip()).split('\t') chain = chunks[0] entry = chunks[1] if entry in userInput: potentialChains.append(chain) experimentType = chunks[2] resolution = float(chunks[3]) rValueObs = float(chunks[4]) if chunks[6] == '0': alphaCarbonOnly = False else: alphaCarbonOnly = True sequence = chunks[11] invalid = ((experimentType != 'XRAY' and skipNonXray) or (resolution < minResolution) or (resolution > maxResolution) or (rValueObs > maxRValue) or (alphaCarbonOnly and skipAlphaCarbon) or (minLength != -1 and len(sequence) < minLength) or (maxLength != -1 and len(sequence) > maxLength) ) if not invalid: chainsToCull.add(chain) else: entriesUsed = set([]) potentialChains = [] chainsToCull = set([]) readProteinData = open(proteinData, 'r') for i in readProteinData: chunks = (i.strip()).split('\t') chain = chunks[0] entry = chunks[1] potentialChains.append(chain) entriesUsed.add(entry) experimentType = chunks[2] resolution = float(chunks[3]) rValueObs = float(chunks[4]) if chunks[6] == '0': alphaCarbonOnly = False else: alphaCarbonOnly = True sequence = chunks[11] invalid = ((experimentType != 'XRAY' and skipNonXray) or (resolution < minResolution) or (resolution > maxResolution) or (rValueObs > maxRValue) or (alphaCarbonOnly and skipAlphaCarbon) or (minLength != -1 and len(sequence) < minLength) or (maxLength != -1 and len(sequence) > maxLength) ) if not invalid: chainsToCull.add(chain) entriesUsed = list(entriesUsed) entriesToCull = set([i[:4] for i in chainsToCull]) elif cullMethod == 'Chain': # If the method of culling is 'by chain', record the chains input by the user. 
if not wholePDB: userInput = retVal.split('\n') potentialChains = [] chainsToCull = set([]) readProteinData = open(proteinData, 'r') for i in readProteinData: chunks = (i.strip()).split('\t') chain = chunks[0] if chain in userInput: potentialChains.append(chain) experimentType = chunks[2] resolution = float(chunks[3]) rValueObs = float(chunks[4]) if chunks[6] == '0': alphaCarbonOnly = False else: alphaCarbonOnly = True sequence = chunks[11] invalid = ((experimentType != 'XRAY' and skipNonXray) or (resolution < minResolution) or (resolution > maxResolution) or (rValueObs > maxRValue) or (alphaCarbonOnly and skipAlphaCarbon) or (minLength != -1 and len(sequence) < minLength) or (maxLength != -1 and len(sequence) > maxLength) ) if not invalid: chainsToCull.add(chain) readProteinData.close() else: potentialChains = [] chainsToCull = set([]) readProteinData = open(proteinData, 'r') for i in readProteinData: chunks = (i.strip()).split('\t') chain = chunks[0] potentialChains.append(chain) experimentType = chunks[2] resolution = float(chunks[3]) rValueObs = float(chunks[4]) if chunks[6] == '0': alphaCarbonOnly = False else: alphaCarbonOnly = True sequence = chunks[11] invalid = ((experimentType != 'XRAY' and skipNonXray) or (resolution < minResolution) or (resolution > maxResolution) or (rValueObs > maxRValue) or (alphaCarbonOnly and skipAlphaCarbon) or (minLength != -1 and len(sequence) < minLength) or (maxLength != -1 and len(sequence) > maxLength) ) if not invalid: chainsToCull.add(chain) readProteinData.close() # Determine representative chain information. # representatives records the non-representative to representative chain mapping for the non-representative # chains in the set of chains to cull. 
representatives = {} readRepresentativeData = open(representativeData, 'r') for i in readRepresentativeData: chunks = (i.strip()).split('\t') nonreprChain = chunks[0] reprChain = chunks[1] if nonreprChain in chainsToCull: representatives[nonreprChain] = reprChain readRepresentativeData.close() # representativeChains records the set of representative chains that cover all the chains in the set of chains to cull. # This means that if a chain in chainsToCull is a representative itself then it is in representativeChains, and if # a chain in chainsToCull is not a representative, then its representative chain is in representativeChains. representativeChains = set([]) for i in chainsToCull: if representatives.has_key(i): representativeChains.add(representatives[i]) else: representativeChains.add(i) # representativesReverse records for each chain that represents at least one chain in chainsToCull, a set of the # non-representative chains in chainsToCull that it represents. # For example, if chainsToCull == [a, b, c, d], and a and b are non-representative chains represented by chain q, # then representativesReverse[q] = set([a, b]). 
representativesReverse = {} for i in representatives.keys(): reprChain = representatives[i] if representativesReverse.has_key(reprChain): representativesReverse[reprChain].add(i) else: representativesReverse[reprChain] = set([i]) representativesReverseKeys = representativesReverse.keys() if cullMethod == 'Entry': removedInput = cull_main(similarityData, sequenceIdentity, representativeChains, 'entry', representativesReverse) keptInput = set([i[:4] for i in entriesToCull if i[:4] not in removedInput]) if performIntraEntryCulling and intraEntrySequenceIdentity < 100: entryToChain = {} chainsOfInterest = set([]) readProteinData = open(proteinData, 'r') for i in readProteinData: chunks = (i.strip()).split('\t') chain = chunks[0] entry = chunks[1] sequence = chunks[11] invalid = (minLength != -1 and len(sequence) < minLength) or (maxLength != -1 and len(sequence) > maxLength) if entry in keptInput and not invalid: chainsOfInterest.add(chain) if entryToChain.has_key(entry): entryToChain[entry].append(chain) else: entryToChain[entry] = [chain] readProteinData.close() representatives = {} readRepresentativeData = open(representativeData, 'r') for i in readRepresentativeData: chunks = (i.strip()).split('\t') nonreprChain = chunks[0] reprChain = chunks[1] if nonreprChain in chainsOfInterest: representatives[nonreprChain] = reprChain readRepresentativeData.close() representativeChains = set([]) for i in chainsOfInterest: if representatives.has_key(i): representativeChains.add(representatives[i]) else: representativeChains.add(i) representativesReverse = {} for i in representatives.keys(): reprChain = representatives[i] if representativesReverse.has_key(reprChain): representativesReverse[reprChain].add(i) else: representativesReverse[reprChain] = set([i]) entryToRepChain = dict([(i, set([])) for i in entryToChain.keys()]) for i in chainsOfInterest: entry = i[:4] if representatives.has_key(i): entryToRepChain[entry].add(representatives[i]) else: entryToRepChain[entry].add(i) 
keptInputChains = set([]) for i in keptInput: if len(entryToRepChain[i]) == 1: # If the entry's chains are all representated by one chain, then all the chains are identical. A random chain from the entry should be kept. keptInputChains.add(entryToChain[i][0]) del entryToRepChain[i] adjList, namesList = adjlistcreation.intra_entry_main(similarityData, intraEntrySequenceIdentity, representativeChains, entryToRepChain) for i in range(len(adjList)): chainsToCull = Leafcull.main(adjList[i], namesList[i]) keptReprChains = [j for j in namesList[i] if not j in chainsToCull] for j in keptReprChains: # Calculate the kept input chains. if representativesReverse.has_key(i): # If the representative chain that was kept has non-representative chains in the input, then select one of them. keptInputChains.add(iter(representativesReverse[j]).next()) else: # If the representative chain that was kept has no non-representative chains in the input, then the representative chain was in the input. Keep it. keptInputChains.add(j) else: keptInputChains = set([i for i in chainsToCull if i[:4] in keptInput]) else: removedReprChains = set(cull_main(similarityData, sequenceIdentity, representativeChains, 'chain', {})) keptReprChains = [i for i in representativeChains if i not in removedReprChains] keptInputChains = set([]) for i in keptReprChains: # Calculate the kept input chains. if representativesReverse.has_key(i): # If the representative chain that was kept has non-representative chains in the input, then select one of them. keptInputChains.add(iter(representativesReverse[i]).next()) else: # If the representative chain that was kept has no non-representative chains in the input, then the representative chain was in the input. Keep it. 
keptInputChains.add(i) removedInput = sorted([i for i in potentialChains if i not in keptInputChains]) keptInputOutput = 'IDs\tlength\tExptl.\tresolution\tR-factor\tFreeRvalue\n' fastaOutput = '' entryStats = {} readProteinData = open(proteinData, 'r') for i in readProteinData: chunks = (i.strip()).split('\t') chain = chunks[0] entry = chunks[1] experimentType = chunks[2] resolution = chunks[3] rValueObs = chunks[4] rValueFree = chunks[5] if chunks[6] == '0': alphaCarbon = 'no' else: alphaCarbon = 'yes' description = chunks[7] dbName = chunks[8] dbCode = chunks[9] organism = chunks[10] sequence = chunks[11] if cullMethod == 'Entry': if entry in keptInput: entryStats[entry] = {'len' : str(len(sequence)), 'expt' : experimentType, 'res' : resolution, 'rval' : rValueObs, 'freeRval' : rValueFree} if chain in keptInputChains: fastaOutput += '>' + '\t'.join([chain, str(len(sequence)), experimentType, resolution, rValueObs, rValueFree, alphaCarbon, description, '<' + dbName + ' ' + dbCode + '>', '[' + organism + ']']) + '\n' + sequence + '\n' else: if chain in keptInputChains: keptInputOutput += '\t'.join([chain, str(len(sequence)), experimentType, resolution, rValueObs, rValueFree]) + '\n' fastaOutput += '>' + '\t'.join([chain, str(len(sequence)), experimentType, resolution, rValueObs, rValueFree, alphaCarbon, description, '<' + dbName + ' ' + dbCode + '>', '[' + organism + ']']) + '\n' + sequence + '\n' readProteinData.close() if cullMethod == 'Entry': keptInputOutput += '\n'.join(['\t'.join([i, entryStats[i]['len'], entryStats[i]['expt'], entryStats[i]['res'], entryStats[i]['rval'], entryStats[i]['freeRval']]) for i in sorted(entryStats.keys())]) self.request.removed.save('', ContentFile('\n'.join(removedInput))) self.request.nonredNoSeq.save('', ContentFile(keptInputOutput)) self.request.nonredSeq.save('', ContentFile(fastaOutput)) self.request.completed = True self.request.save() logger = 
open('/srv/www/vhosts.d/www.bioinf/html/doig/cgi-bin/django_projects/LeafWebApp/ErrorLogs/ERROR.log', 'a') logger.write('\tFINISH: thread for request ' + str(self.request.id) + ', of type ' + str(self.requestType) + ' on ' + strftime('%Y/%m/%d/ at %H:%M:%S', gmtime()) + '.\n') logger.close() except: logger = open('/srv/www/vhosts.d/www.bioinf/html/doig/cgi-bin/django_projects/LeafWebApp/ErrorLogs/ERROR.log', 'a') logger.write('\tERROR: thread for request ' + str(self.request.id) + ', of type ' + str(self.requestType) + 'on ' + strftime('%Y/%m/%d/ at %H:%M:%S', gmtime()) + '.\n') excType, excValue, excTrace = sys.exc_info() logger.write('\t\tException type: ' + str(excType) + '\n') logger.write('\t\tException Value: ' + str(excValue) + '\n') errors = traceback.format_exception(excType, excValue, excTrace) for i in errors: logger.write('\t\t' + i) logger.close() else: logger = open('/srv/www/vhosts.d/www.bioinf/html/doig/cgi-bin/django_projects/LeafWebApp/ErrorLogs/ERROR.log', 'a') logger.write('\tERROR: Request not for sequence or PDB culling!!!\n') logger.close()