def validateTestSetResult(self, tsr):
    """Check that tsr's final positive/negative counts equal my reference counts.

    Positives are checked before negatives; the first mismatch raises
    pcssErrors.PcssGlobalException.
    """
    positiveCount = tsr.getFinalPositiveCount()
    if positiveCount != self.referencePositiveCount:
        raise pcssErrors.PcssGlobalException(
            "Error: test set did not have same number of positives (%s) as the reference (%s)"
            % (positiveCount, self.referencePositiveCount))

    negativeCount = tsr.getFinalNegativeCount()
    if negativeCount != self.referenceNegativeCount:
        raise pcssErrors.PcssGlobalException(
            "Error: test set did not have same number of negatives (%s) as the reference (%s)"
            % (negativeCount, self.referenceNegativeCount))
def readAnnotationFile(self, annotationFile):
    """Load proteins and their input attribute values from annotationFile.

    The first line is validated as the column header. Every later line is
    turned into a protein via getProteinFromLine(); unless that protein has
    errors, each column-sorted input attribute is set from the line's
    tab-separated columns.

    Raises pcssErrors.PcssGlobalException if the file does not exist, or if
    self.proteins is empty after the file has been processed.
    """
    if not os.path.exists(annotationFile):
        raise pcssErrors.PcssGlobalException(
            "Error: annotation file reader did not find expected annotation file\n%s"
            % annotationFile)

    fileLines = pcssTools.PcssFileReader(annotationFile).getLines()
    inputAttributes = self.pcssRunner.pfa.getColumnSortedInputAttributes()

    for lineNumber, line in enumerate(fileLines):
        if lineNumber == 0:
            # Header row: validated but carries no protein data.
            self.validateColumnLine(annotationFile, line)
            continue

        pcssProtein = self.getProteinFromLine(line)
        if pcssProtein.hasErrors():
            continue

        cols = line.split('\t')
        for attribute in inputAttributes:
            rawValue = self.getValueForAttributeName(attribute.name, cols)
            peptideStart = int(
                self.getValueForAttributeName("peptide_start", cols))
            attribute.setValueFromFile(rawValue, pcssProtein, peptideStart)

    # NOTE(review): the original indentation was lost; this assumes the
    # peptide length is set once, after all lines are read -- confirm.
    self.setPeptideLength()
    if len(self.proteins) == 0:
        raise pcssErrors.PcssGlobalException(
            "Did not read any proteins from annotation file")
def validateCounts(self):
    """Sanity-check the training/test set sizes stored on this object.

    Raises pcssErrors.PcssGlobalException when:
      * the positive test set count is below 1,
      * the test set has more positives than negatives,
      * training + test positives do not equal the total positives,
      * training + test negatives do not equal the total negatives.
    """
    if self.testSetPositiveCount < 1:
        # The count is now interpolated; previously the literal "%s" was
        # shown to the user because the format arguments were missing.
        raise pcssErrors.PcssGlobalException(
            "Positive test set count is %s (should be greater than 0)"
            % self.testSetPositiveCount)

    if self.testSetPositiveCount > self.testSetNegativeCount:
        raise pcssErrors.PcssGlobalException(
            "Test set should have more negatives than positives")

    if (self.testSetPositiveCount + self.trainingSetPositiveCount
            != self.totalPositiveCount):
        raise pcssErrors.PcssGlobalException(
            "Positive Training Set (%s) and Positive Test Set (%s) do not add up to total positives (%s)"
            % (self.trainingSetPositiveCount, self.testSetPositiveCount,
               self.totalPositiveCount))

    if (self.testSetNegativeCount + self.trainingSetNegativeCount
            != self.totalNegativeCount):
        raise pcssErrors.PcssGlobalException(
            "Negative Training Set (%s) and Negative Test Set (%s) do not add up to total negatives (%s)"
            % (self.trainingSetNegativeCount, self.testSetNegativeCount,
               self.totalNegativeCount))
def readResultFile(self):
    """Read the classify output file and pair each peptide with its score.

    Resets self.pstList, then appends one PeptideScoreTuple per peptide,
    taking the i-th line of the result file as the float score of the i-th
    peptide.

    Raises pcssErrors.PcssGlobalException if the result file is missing or
    its line count differs from the number of peptides.
    """
    resultFile = self.getClassifyOutputFile()
    self.pstList = []
    if not os.path.exists(resultFile):
        raise pcssErrors.PcssGlobalException(
            "Classify SVM could not read result file %s; \n"
            "check to make sure svm_classify completed as suggested" % resultFile)

    scoreLines = pcssTools.PcssFileReader(self.getClassifyOutputFile()).getLines()
    lineCount = len(scoreLines)
    peptideCount = len(self.peptides)
    if lineCount != peptideCount:
        raise pcssErrors.PcssGlobalException(
            "Result file has a different number of results (%s) than I have peptides (%s)"
            % (lineCount, peptideCount))

    # Counts are equal here, so pairing by position covers every peptide.
    for peptide, scoreLine in zip(self.peptides, scoreLines):
        self.pstList.append(self.PeptideScoreTuple(peptide, float(scoreLine)))
def validatePeptideSequences(self):
    """Verify every peptide's sequence matches my subsequence at its position.

    For each peptide, the subsequence from startPosition to endPosition + 1
    (as passed to getSubsequence) must equal peptide.sequence; otherwise
    pcssErrors.PcssGlobalException is raised.
    """
    for peptide in self.peptides.values():
        expected = self.getSubsequence(peptide.startPosition,
                                       peptide.endPosition + 1)
        if peptide.sequence == expected:
            continue
        raise pcssErrors.PcssGlobalException(
            "Protein %s subsequence %s doesn't match peptide sequence %s starting at position %s"
            % (self.modbaseSequenceId, expected, peptide.sequence,
               peptide.startPosition))
def seqBatchErrorExists(self, subDirName):
    """Raise if a seq batch run left an error file in subDirName.

    Looks first for the pcss error output file, then for the internal error
    output file (file names come from the runner's internal config). The
    first one found is parsed with pcssErrors.ErrorInfo and its message is
    re-raised; if neither exists the method returns None.
    """
    internalConfig = self.pcssRunner.internalConfig

    pcssErrorFile = os.path.join(subDirName,
                                 internalConfig["pcss_error_output_file"])
    if os.path.exists(pcssErrorFile):
        errorInfo = pcssErrors.ErrorInfo(pcssErrorFile)
        raise pcssErrors.PcssGlobalException(
            "Got pcss seq batch error %s\nin directory %s"
            % (errorInfo.msg, subDirName))

    internalErrorFile = os.path.join(
        subDirName, self.pcssRunner.internalConfig["internal_error_output_file"])
    if os.path.exists(internalErrorFile):
        errorInfo = pcssErrors.ErrorInfo(internalErrorFile)
        # NOTE(review): InternalException is referenced without the
        # pcssErrors prefix used everywhere else -- confirm it is imported
        # by name in this module.
        raise InternalException(
            "Got internal seq batch error %s\nin directory %s"
            % (errorInfo.msg, subDirName))
def getLength(self):
    """Return target_end minus target_beg for this model, as an int.

    Raises pcssErrors.PcssGlobalException when target_end is not strictly
    greater than target_beg.
    """
    targetEnd = int(self.getAttributeValue("target_end"))
    targetBegin = int(self.getAttributeValue("target_beg"))
    length = targetEnd - targetBegin
    if targetEnd > targetBegin:
        return length
    raise pcssErrors.PcssGlobalException(
        "Error in model table: model %s has target_end position %s "
        "before target_start position %s" % (self.getId(), targetEnd, targetBegin))
def trainAndApplyModel(self):
    """Train on the training SVM, then classify the single-peptide test SVM.

    Training happens first; afterwards the test SVM must hold at most one
    peptide, otherwise pcssErrors.PcssGlobalException is raised before any
    classification is attempted.
    """
    self.trainingSvm.trainModel()

    testSetSize = len(self.testSvm.peptides)
    if testSetSize > 1:
        raise pcssErrors.PcssGlobalException(
            "Error: Leave One Out Benchmarker has test set greater than size 1 (%s total)"
            % testSetSize)

    self.testSvm.classifySvm()
def finalizeFeature(self, peptide, feature, referencePeptideLength):
    """Advance self.featureNumber past the positions a short peptide lacks.

    Adds (referencePeptideLength - peptide length) * feature.getFeatureLength()
    to self.featureNumber.

    Raises pcssErrors.PcssGlobalException if the peptide is longer than
    referencePeptideLength.
    """
    peptideLength = peptide.getPeptideLength()
    if peptideLength > referencePeptideLength:
        # referencePeptideLength is a plain value compared with '>' above;
        # the original message tried to *call* it, which raised TypeError
        # and hid the intended error.
        raise pcssErrors.PcssGlobalException(
            "Peptide %s has length of %s which is greater than reference %s"
            % (peptide.startPosition, peptideLength, referencePeptideLength))

    lengthDifference = referencePeptideLength - peptideLength
    multiplier = feature.getFeatureLength()
    self.featureNumber += (lengthDifference * multiplier)
def runSubprocess(self, args, checkStdError=True):
    """Run python subprocess module command; by default, raise exception if anything was written to stderr

    args is passed to subprocess.Popen as an argument list (shell=False).
    Returns the (stdout, stderr) tuple from communicate(); stdout is not
    piped, so the first element is None.

    Raises pcssErrors.PcssGlobalException when checkStdError is true and
    the process wrote anything to stderr.
    """
    process = subprocess.Popen(args, shell=False, stderr=subprocess.PIPE)
    processOutput = process.communicate()
    # Use truthiness rather than `!= ""`: stderr may be a bytes object, and
    # b"" never compares equal to "", which would flag every run as failed.
    if checkStdError and processOutput[1]:
        raise pcssErrors.PcssGlobalException(
            "Got subprocess error.\nRan method args %s\nGot stderr %s" %
            (args, processOutput[1]))
    return processOutput
def validatePeptideTrainingStatus(self, status):
    """Return status lower-cased if it is the positive or negative keyword.

    Raises pcssErrors.PcssGlobalException for any other value.
    """
    normalized = status.lower()
    if (normalized != self.getPositiveKeyword()
            and normalized != self.getNegativeKeyword()):
        raise pcssErrors.PcssGlobalException(
            "Peptide status %s not valid status (needs to be %s, %s)" %
            (normalized, self.getPositiveKeyword(), self.getNegativeKeyword()))
    return normalized
def readBenchmarkFile(self, fileName):
    """Read benchmark lines from fileName and append ScoreTuples to self._results.

    The first and last lines are boundary markers checked with
    checkBoundaryLines() against "0" and "1" respectively. Each data line is
    whitespace-split into fpr, tpr and score (all floats); the score is
    validated with validateScore() before the tuple is stored.

    Raises pcssErrors.PcssGlobalException if the file has no lines or a
    boundary line is not as expected.
    """
    reader = pcssTools.PcssFileReader(fileName)
    lines = reader.getLines()
    if not lines:
        # Report an empty file explicitly instead of an IndexError below.
        raise pcssErrors.PcssGlobalException(
            "Benchmark file %s is empty" % fileName)

    firstLine = lines[0]
    lastLine = lines[-1]
    # The file name is now interpolated into both messages; previously the
    # raw "%s" placeholder was shown.
    if not self.checkBoundaryLines(firstLine, "0"):
        raise pcssErrors.PcssGlobalException(
            "Expected benchmark file %s to have first line of 0\t0" % fileName)
    if not self.checkBoundaryLines(lastLine, "1"):
        raise pcssErrors.PcssGlobalException(
            "Expected benchmark file %s to have last line of 1\t1" % fileName)

    # NOTE(review): this slice skips the first line and the last TWO lines.
    # If only the final line is a boundary marker, the last data line is
    # being dropped and the bound should be len(lines) - 1 -- confirm
    # against the benchmark file format before changing.
    for line in lines[1:len(lines) - 2]:
        cols = line.split()
        fpr = float(cols[0])
        tpr = float(cols[1])
        score = float(cols[2])
        self.validateScore(score)
        self._results.append(self.ScoreTuple(fpr, tpr, score))
def createModelStyle(self, runName, values, pdh):
    """Build the model style object named by values['style'].

    Returns NewModelStyle(pdh) or OldModelStyle(pdh); any other style name
    raises pcssErrors.PcssGlobalException mentioning runName.
    """
    styleName = values['style']
    if styleName == 'NewModelStyle':
        return NewModelStyle(pdh)
    if styleName == 'OldModelStyle':
        return OldModelStyle(pdh)
    raise pcssErrors.PcssGlobalException(
        "Model run info file has run %s with invalid model style %s; "
        "please change to either 'NewModelStyle' or 'OldModelStyle'" %
        (runName, values['style']))
def sleepUntilDone(self, fileName, predicate):
    """Sleep until predicate involving fileName is true; useful for avoiding race conditions in file manipulation

    Polls predicate(fileName) once per second while it returns a true value.
    Raises pcssErrors.PcssGlobalException once more than 10 seconds have
    been slept.
    """
    secondsSlept = 0
    while True:
        if not predicate(fileName):
            return
        print("sleep 1 second")
        time.sleep(1)
        secondsSlept += 1
        if secondsSlept > 10:
            raise pcssErrors.PcssGlobalException("Timeout on file %s" % fileName)
def getSample(self, peptides, count):
    """Return count peptides drawn from peptides.

    When the internal config flag "make_random_test_set" is true the sample
    is drawn with random.sample(); otherwise the first count peptides are
    returned in order.

    Raises pcssErrors.PcssGlobalException if count exceeds len(peptides).
    """
    poolSize = len(peptides)
    if count > poolSize:
        raise pcssErrors.PcssGlobalException(
            "getSample(): tried to sample %s peptides but there are only %s peptides in the pool"
            % (count, poolSize))

    if self.pcssRunner.internalConfig["make_random_test_set"]:
        print("RANDOM SAMPLE")
        return random.sample(peptides, int(count))

    # -- make internal config interpolation and test
    print("NON RANDOM SAMPLE")
    return peptides[0:int(count)]
def handleConfigError(self, results):
    """Print a summary of config validation failures, then raise it.

    Walks flatten_errors(self.pcssConfig, results); entries with a key are
    reported as failed validation, entries without one as missing sections.
    Always raises pcssErrors.PcssGlobalException carrying the full message.
    """
    parts = ["CONFIGURATION ERROR\n"]
    for (section_list, key, _) in flatten_errors(self.pcssConfig, results):
        sections = ', '.join(section_list)
        if key is None:
            parts.append('The following section was missing:%s ' % sections)
        else:
            parts.append(
                'The "%s" key in the section "%s" failed validation\n' % (
                    key, sections))
    msg = "".join(parts)
    print(msg)
    raise pcssErrors.PcssGlobalException(msg)
def trainModel(self):
    """Run the configured SVM training command on the training set file.

    Passes the gamma and c settings from the pcss config as "-g" and "-c"
    flags, followed by the training set file and the new model file.

    Raises pcssErrors.PcssGlobalException if the training set file does not
    exist.
    """
    runner = self.runner
    svmCommandName = runner.internalConfig['svm_train_command']
    trainingSetFileName = runner.pdh.getSvmTrainingSetFile()
    if not os.path.exists(trainingSetFileName):
        raise pcssErrors.PcssGlobalException(
            "Did not find training set input file in expected location -- searched for\n%s"
            % trainingSetFileName)

    modelFileName = runner.pdh.getSvmNewModelFile()
    # Each flag and its value are separate list items (no shell splitting).
    commandArgs = [
        svmCommandName,
        "-g", runner.pcssConfig["svm_training_gamma"],
        "-c", runner.pcssConfig["svm_training_c"],
        trainingSetFileName,
        modelFileName,
    ]
    runner.pdh.runSubprocess(commandArgs)
def validateColumnLine(self, annotationFile, line):
    """Validate the header row of an annotation file against input attributes.

    line must start with the niceName of the first column-sorted input
    attribute, every attribute niceName must appear among the tab-separated
    column names, and every column name must be a known attribute niceName.
    Any violation raises pcssErrors.PcssGlobalException.
    """
    sortedAttributes = self.pcssRunner.pfa.getColumnSortedInputAttributes()
    expectedFirstName = sortedAttributes[0].niceName
    if not line.startswith(expectedFirstName):
        raise pcssErrors.PcssGlobalException(
            "Error: read annotation file %s\n. Expected first row to be column header "
            "(starting with %s) but didn't find it; instead got\n%s" %
            (annotationFile, expectedFirstName, line))

    columnNames = line.split('\t')
    attributeNames = [attribute.niceName for attribute in sortedAttributes]

    # Every expected attribute must be present in the header...
    for attributeName in attributeNames:
        if attributeName not in columnNames:
            raise pcssErrors.PcssGlobalException(
                "Error: read annotation file %s\n. Expected input attribute %s but did not find it"
                % (annotationFile, attributeName))

    # ...and the header must not contain anything unexpected.
    for columnName in columnNames:
        if columnName not in attributeNames:
            raise pcssErrors.PcssGlobalException(
                "Error: read annotation file %s\n. Read column header %s that wasn't specified in attributes file"
                % (annotationFile, columnName))
def classifySvm(self):
    """Run the configured SVM classify command on my SVM input file.

    The command receives the classification input file, the model file and
    the score output file, in that order.

    Raises pcssErrors.PcssGlobalException if the input file does not exist.
    """
    runner = self.pcssRunner
    svmCommandName = runner.internalConfig['svm_classify_command']
    classificationFileName = self.getSvmInputFile()
    if not os.path.exists(classificationFileName):
        raise pcssErrors.PcssGlobalException(
            "Did not find test set file in expected location -- searched for\n%s"
            % classificationFileName)

    scoreFileName = self.getClassifyOutputFile()
    modelFile = self.getSvmModelFile()
    commandArgs = [svmCommandName, classificationFileName, modelFile,
                   scoreFileName]
    runner.pdh.runSubprocess(commandArgs)
def getSingleResidueFeatureList(self, residueCode, featureNumber, seqList):
    """Return a space-joined "number:flag" feature string for one residue.

    One entry is produced per item of self.residueOrder, numbered from
    featureNumber upward; the flag is 1 where the item equals residueCode
    and 0 elsewhere.

    Raises pcssErrors.PcssGlobalException (quoting the joined seqList) if
    residueCode matches nothing in self.residueOrder.
    """
    matches = [candidate == residueCode for candidate in self.residueOrder]
    featureList = [
        "%s:%s" % (featureNumber + offset, 1 if isMatch else 0)
        for offset, isMatch in enumerate(matches)
    ]
    if not any(matches):
        raise pcssErrors.PcssGlobalException(
            "Residue %s in sequence %s is not one of the 20 standard amino acids"
            % (residueCode, "".join(seqList)))
    return " ".join(featureList)
def getProteinValue(self, protein):
    """Get value of my attribute from input protein as a string

    Returns "" when the protein has no value and I am not mandatory; raises
    pcssErrors.PcssGlobalException when outputOptional is False and the
    value is missing.
    """
    attributeValue = protein.getAttributeOutputString(self.name)
    if attributeValue is not None:
        return attributeValue
    if self.outputOptional is not False:
        return ""
    raise pcssErrors.PcssGlobalException(
        "Protein %s never set mandatory attribute %s" %
        (protein.modbaseSequenceId, self.name))
def unzipFile(self, sourceFile):
    """Unzip sourceFile; expects .gz suffix

    Runs gunzip on sourceFile unless the unzipped file already exists, then
    waits (via sleepUntilDone) for the unzipped file to appear.

    Raises pcssErrors.PcssGlobalException if sourceFile does not end with
    ".gz".
    """
    suffix = ".gz"
    if not sourceFile.endswith(suffix):
        # might end up having to change this later to include other
        # filetypes, but this could wreak havoc if result file isn't
        # properly formatted
        raise pcssErrors.PcssGlobalException(
            "Attempted to unzip file %s that does not end with '.gz'"
            % sourceFile)

    # Slice the suffix off. str.rstrip(".gz") removes any trailing run of
    # the characters '.', 'g' and 'z', so "catalog.gz" became "catalo".
    resultFile = sourceFile[:-len(suffix)]

    if not os.path.exists(resultFile):
        # should never already exist since we wouldn't be here if it did,
        # but another process could possibly have put it here. There is
        # still somewhat of a race condition, as this check could return
        # false and another process could put the unzipped file in
        # immediately after, but chances are extremely low.
        self.runSubprocess(['gunzip', sourceFile])
        self.sleepUntilDone(resultFile, predicate=self.fileDoesNotExist)
def readProteinSequences(self, fastaFileName):
    """Attach sequences from a FASTA file to the proteins I already hold.

    Each record id must have the form "<modbaseId>|<uniprotId>"; records
    whose modbaseId is a key of self.proteins have their sequence set on
    that protein, others are ignored.

    Raises pcssErrors.PcssGlobalException if any protein still has no
    sequence after the whole file has been read.
    """
    # 'with' guarantees the handle is closed even when parsing or the
    # validation below raises; the original leaked it on every error path.
    with open(fastaFileName, 'r') as fh:
        for seqRecord in SeqIO.FastaIO.FastaIterator(fh):
            # Two-name unpacking also enforces exactly one '|' in the id.
            [modbaseId, uniprotId] = seqRecord.id.split('|')
            if modbaseId in self.proteins:
                self.proteins[modbaseId].setProteinSequence(seqRecord.seq)

    for protein in self.proteins.values():
        if protein.proteinSequence is None:
            raise pcssErrors.PcssGlobalException(
                "Protein %s has no sequence set" % protein.modbaseSequenceId)
def makePeptideFromCode(self, peptideCode, modbaseSeqId):
    """Build a PcssPeptide from a "start_sequence_status" peptide code.

    Returns None when the sequence field equals the configured
    "keyword_peptide_sequence_mismatch" value. Otherwise the status is
    validated by the runner and stored on the new peptide as the string
    attribute "status"; the peptide's end position is
    start + len(sequence) - 1.

    Raises pcssErrors.PcssGlobalException if the code does not split into
    exactly three '_'-separated fields.
    """
    fields = peptideCode.split('_')
    if len(fields) != 3:
        raise pcssErrors.PcssGlobalException(
            "Peptide code %s from protein %s is not proper form of peptideStart_peptideSequence_status"
            % (peptideCode, modbaseSeqId))
    peptideStart, peptideSequence, status = fields

    mismatchKeyword = self.pcssRunner.internalConfig[
        "keyword_peptide_sequence_mismatch"]
    if peptideSequence == mismatchKeyword:
        return None

    status = self.pcssRunner.validatePeptideCodeStatus(status, peptideCode)
    startPosition = int(peptideStart)
    endPosition = startPosition + len(peptideSequence) - 1
    peptide = pcssPeptide.PcssPeptide(peptideSequence, startPosition,
                                      endPosition, self.pcssRunner)
    peptide.addStringAttribute("status", status)
    return peptide
def createTrainingAndTestSets(self, peptides):
    """Split peptides into a one-peptide test set and a training set.

    The peptide at index self.currentPeptidePosition becomes the test set;
    all others, in order, become the training set. Both SVM input files are
    written and the position counter is advanced by one.

    Raises pcssErrors.PcssGlobalException if the counter is not smaller
    than len(peptides).
    """
    heldOutIndex = self.currentPeptidePosition
    if heldOutIndex >= len(peptides):
        msg = "Error: Leave one out benchmarker internal peptide counter (%s) must be smaller than input peptide set count (%s)" % (heldOutIndex, len(peptides))
        raise pcssErrors.PcssGlobalException(msg)

    trainingPeptideList = []
    for index, peptide in enumerate(peptides):
        if index != heldOutIndex:
            trainingPeptideList.append(peptide)
            continue
        self.testSvm.setPeptides([peptide])
        print("next test set peptide position %s status %s" % (
            peptide.startPosition, peptide.getAttributeOutputString("status")))

    self.trainingSvm.setPeptides(trainingPeptideList)
    self.trainingSvm.writeTrainingSetFile()
    self.testSvm.writeClassificationFile()
    self.currentPeptidePosition += 1
def initFromModelTableLine(self, line, modelTableColumns):
    """Initialize model from file.

    Use ModelTableColumns to get the names of each attribute in the line
    (which is read from the model table file) and save these attributes
    internally. The line is split on tabs; the i-th value is stored under
    the i-th column name.

    Raises pcssErrors.PcssGlobalException if the number of values differs
    from modelTableColumns.getColumnCount().
    """
    cols = line.split('\t')
    expectedCount = modelTableColumns.getColumnCount()
    if expectedCount != len(cols):
        raise pcssErrors.PcssGlobalException(
            "Model table column order file contains a "
            "different number of columns (%s) than model table (%s)\nline: %s"
            % (modelTableColumns.getColumnCount(), len(cols), line))

    for position, value in enumerate(cols):
        self.setAttribute(modelTableColumns.getColumnName(position), value)
def prepareTrainingBenchmarkRun(self):
    """Write a copy of the pcss config set up for a training benchmark run.

    The copy gets "input_annotation_file_name" pointed at the annotation
    output file in the run's output location and is written to the
    training benchmark config file (both names come from the internal
    config).

    Raises pcssErrors.PcssGlobalException if the annotation file does not
    exist.
    """
    runner = self.pcssRunner
    pdh = runner.pdh
    pcssCopy = copy.deepcopy(runner.pcssConfig)
    print("running cluster with output file %s" % pdh.getFullOutputFile(""))

    inputAnnotationFileName = pdh.getFullOutputFile(
        runner.internalConfig["annotation_output_file"])
    if not os.path.exists(inputAnnotationFileName):
        msg = "Did not find input annotation file name %s\n" % inputAnnotationFileName
        msg += "Please make sure this is file is in the run directory for this training benchmark run"
        raise pcssErrors.PcssGlobalException(msg)

    pcssCopy["input_annotation_file_name"] = pdh.getFullOutputFile(
        runner.internalConfig["annotation_output_file"])
    pcssCopy.filename = pdh.getFullOutputFile(
        runner.internalConfig["training_benchmark_config_file"])
    pcssCopy.write()
# Build and return this peptide's SVM input line: a space-joined string of
# the SVM feature strings produced for each feature name in the runner's
# SVM feature order.
#
# A fresh pcssSvm.SvmFeatureHandler is created and also stored on
# self.svmHandler. For every feature name:
#   * pcssErrors.PcssGlobalException is raised if I do not have that attribute;
#   * if the attribute is not initialized, or its output string is a peptide
#     error value, the handler's processEmptyFeature() is called and nothing
#     is appended to the output;
#   * otherwise the attribute's makeSvmFeature(handler) string is appended.
# The handler's finalizeFeature() is also called with the runner's peptide
# length.
#
# NOTE(review): indentation was lost in this source, so it is unclear whether
# finalizeFeature() runs only in the else-branch or for every feature --
# confirm against the original file before restructuring.
# NOTE(review): the local featureNumber is assigned 0 but never read here.
def makeSvmFileLine(self): svmFileStringList = [] featureNumber = 0 svmHandler = pcssSvm.SvmFeatureHandler() self.svmHandler = svmHandler for featureName in self.pcssRunner.getSvmFeatureOrder(): if (not self.hasAttribute(featureName)): raise pcssErrors.PcssGlobalException("Error: peptide tried to make svm feature for %s but does not have this feature" % featureName) if (not self.getAttribute(featureName).isInitialized() or pcssTools.isPeptideErrorValue(self.getAttributeOutputString(featureName))): svmHandler.processEmptyFeature(self.pcssRunner.getPeptideLength(), self.getAttribute(featureName)) else: svmFileStringList.append(self.getAttribute(featureName).makeSvmFeature(svmHandler)) svmHandler.finalizeFeature(self, self.getAttribute(featureName), self.pcssRunner.getPeptideLength()) return " ".join(svmFileStringList)
def getTrainingBenchmarkConfig(self):
    """Return a deep copy of the pcss config adjusted for a cluster benchmark run.

    The copy has "pcss_directory", "run_directory" and
    "input_annotation_file_name" replaced with their cluster-side values.
    The annotation file must already exist at its local full output path.

    Raises pcssErrors.PcssGlobalException if the annotation file is missing.
    """
    annotationKey = "annotation_output_file"
    baseConfig = copy.deepcopy(self.pcssConfig)
    baseConfig["pcss_directory"] = self.getPcssClusterBaseDirectory()
    baseConfig["run_directory"] = self.getClusterRunDirectory()

    # Existence is checked on the local path; the config gets the cluster path.
    inputAnnotationFileName = self.getFullOutputFile(
        self.internalConfig[annotationKey])
    if not os.path.exists(inputAnnotationFileName):
        msg = "Did not find input annotation file name %s\n" % inputAnnotationFileName
        msg += "Please make sure this is file is in the run directory for this training benchmark run"
        raise pcssErrors.PcssGlobalException(msg)

    clusterAnnotationFile = self.getFullClusterOutputFile(
        self.internalConfig[annotationKey])
    baseConfig["input_annotation_file_name"] = clusterAnnotationFile
    return baseConfig
def getPeptideValue(self, peptide):
    """Get value of my attribute from input peptide as a string

    For attributes of type 'model' the value comes from peptide.bestModel
    ("" when there is no best model). For all other attributes it comes
    from the peptide itself; a missing value yields "" unless
    outputOptional is False, in which case pcssErrors.PcssGlobalException
    is raised.
    """
    if self.attributeType == 'model':
        bestModel = peptide.bestModel
        if bestModel is None:
            return ""
        return peptide.bestModel.getAttributeValue(self.name)

    attributeValue = peptide.getAttributeOutputString(self.name)
    if attributeValue is not None:
        return attributeValue
    if self.outputOptional is not False:
        return ""
    raise pcssErrors.PcssGlobalException(
        "Peptide %s never set mandatory attribute %s" %
        (peptide.startPosition, self.name))