def TrainModel(self):
    """
    Our training data-set is in self.InputFeaturePath.  Let's train a model
    to predict which entries come from the true database.

    Two models are handled in parallel: one for the "Charge-2" training set
    and one for the "Charge-3" training set.  The flow is:

    - Parse the training file (and the testing file, if
      self.ModelTestFilePath is set) into feature sets.
    - If self.ModelType is "feature", "featurescatter" or "summary", run
      that report and return without training.
    - Build both model objects, loading them from disk when
      self.ReadModelFilePath2 is set.
    - If self.FeatureSelectionFlag == 3, delegate to TrainFacultative().
    - If self.FeatureSelectionFlag is false: train (unless loaded), score
      and report on the testing set if there is one, otherwise on the
      training set, write the scored features, and save the models when
      self.WriteModelFilePath2 is set.
    - Otherwise (feature selection): write the two training sets out to
      "PTMFeatures.2.txt" and "PTMFeatures.3.txt".

    Exits the process with status -1 if no input feature-file was given.
    """
    if not self.InputFeaturePath:
        print("* Please specify an input feature-file.")
        print(UsageInfo)
        sys.exit(-1)
    self.FeatureNames = {}
    # We have one set of features for facultative sites, and one for
    # constitutive.  Note that some features (modification rate, correlation
    # with unmodified peptide) are applicable to F but not C.
    # For constitutive modifications: modification rate, protein coverage,
    # and number of unmodified peptides are all off-limits.  (Those features
    # are "dead giveaways" that we have a non-shuffled protein!)
    self.FeaturesC = ValidFeatureIndices[:]
    # FeaturesF is an alias of FeaturesC (same list object), so the
    # intersection below is currently just a copy of that list.
    self.FeaturesF = self.FeaturesC
    self.FeaturesAll = [
        FeatureIndex for FeatureIndex in self.FeaturesF
        if FeatureIndex in self.FeaturesC
    ]
    # We can OVERRIDE the list of features here, to forbid the use of some:
    print("Permitted features all: %s" % (self.FeaturesAll,))
    # Parse the features from the TRAINING and TESTING files, into one
    # feature set per charge state.
    self.TrainingSet2 = Learning.FeatureSetClass()
    self.TrainingSet2.Type = "Charge-2"
    self.TrainingSet3 = Learning.FeatureSetClass()
    self.TrainingSet3.Type = "Charge-3"
    self.ParseFeatureFile(self.InputFeaturePath, self.TrainingSet2,
                          self.TrainingSet3, self.TrainingSetDBRatio)
    if self.ModelTestFilePath:
        # Built and parsed exactly like the training sets above: same
        # class (via the Learning module) and same argument list.
        self.TestingSet2 = Learning.FeatureSetClass()
        self.TestingSet3 = Learning.FeatureSetClass()
        self.ParseFeatureFile(self.ModelTestFilePath, self.TestingSet2,
                              self.TestingSet3, self.TestingSetDBRatio)
    # SPECIAL values for model, which don't actually cause training:
    if self.ModelType == "feature":
        print("\n\nSINGLE feature:")
        self.TrainOneFeature(self.TrainingSet2)
        self.TrainOneFeature(self.TrainingSet3)
        return
    if self.ModelType == "featurescatter":
        print("\n\nFeature+feature scatter-plots:")
        # NOTE(review): self.TrainingSetAll is not assigned anywhere in this
        # method; confirm it is populated elsewhere before relying on this
        # mode.
        self.ProduceFeatureScatterPlots(self.TrainingSetAll)
        return
    if self.ModelType == "summary":
        self.PerformFeatureSummary()
        return
    # Instantiate our models:
    self.Model2 = self.GetModelObject(self.FeaturesAll)
    self.Model3 = self.GetModelObject(self.FeaturesAll)
    # Load pre-trained models, if we received a path:
    if self.ReadModelFilePath2:
        self.Model2.LoadModel(self.ReadModelFilePath2)
        self.Model3.LoadModel(self.ReadModelFilePath3)
    #######################################################################
    # Special value for feature selection (3) means that we train a model on
    # all data, then use it to generate a sub-feature-set for a facultative
    # model!
    if self.FeatureSelectionFlag == 3:
        self.TrainFacultative()
        return
    #######################################################################
    # If we're not doing feature selection: Train on the training set,
    # and then (if we have a testing set) test on the testing set.
    if not self.FeatureSelectionFlag:
        # Train the models (unless we just loaded them in):
        if not self.ReadModelFilePath2:
            self.Model2.Train(self.TrainingSet2)
            self.Model3.Train(self.TrainingSet3)
        # Compute the score of each vector:
        if self.ModelTestFilePath:
            self.Model2.Test(self.TestingSet2)
            self.Model2.ReportAccuracy(self.TestingSet2)
            self.Model3.Test(self.TestingSet3)
            self.Model3.ReportAccuracy(self.TestingSet3)
            self.WriteScoredFeatureSet(self.TestingSet2, self.TestingSet3)
        else:
            # "PValues.txt" is copied after each report so the charge-2
            # copy is taken before the charge-3 report runs (presumably
            # ReportAccuracy rewrites that file - confirm).
            self.Model2.Test(self.TrainingSet2)
            self.Model2.ReportAccuracy(self.TrainingSet2)
            shutil.copyfile("PValues.txt", "PValues.chg2.txt")
            self.Model3.Test(self.TrainingSet3)
            self.Model3.ReportAccuracy(self.TrainingSet3)
            shutil.copyfile("PValues.txt", "PValues.chg3.txt")
            self.WriteScoredFeatureSet(self.TrainingSet2, self.TrainingSet3)
        if self.WriteModelFilePath2:
            self.Model2.SaveModel(self.WriteModelFilePath2)
            self.Model3.SaveModel(self.WriteModelFilePath3)
        return
    #######################################################################
    # We're doing feature selection.  Write out the feature files for the
    # two training sets.
    print("Feature names: %s" % (self.FeatureNames,))
    print("AllFeatures: %s" % (self.FeaturesAll,))
    self.WriteFeaturesToFile(self.TrainingSet2, "PTMFeatures.2.txt")
    self.WriteFeaturesToFile(self.TrainingSet3, "PTMFeatures.3.txt")
def TrainFacultative(self):
    """
    Train paired models for CONSTITUTIVE ("always") and FACULTATIVE
    ("sometimes") PTMs.

    Steps:

    1. Train self.Model on self.TrainingSetAll and score every vector with
       it.  These scores only serve as an ORACLE for the facultative model.
    2. Take the vectors whose SisterAnnotationFlag column is set, sort them
       by score, and relabel the lowest-scoring chunk as false and the
       highest-scoring chunk as true.  The chunk size is a quarter of the
       list, capped at 1000.
    3. Write that relabelled set to "FacultativeTrainingSet.txt" and train
       self.FacModel on it.
    4. Split self.TrainingSetAll into facultative and constitutive feature
       sets; score the former with self.FacModel, and train and score
       self.ConModel on the latter.
    5. If self.WriteModelFilePath is set, save both models next to it with
       ".con" and ".fac" extensions.
    6. Write every vector, with its model score and p-value filled in, to
       self.OutputFeaturePath.

    NOTE(review): this method reads self.Model, self.TrainingSetAll,
    self.HeaderLines and self.WriteModelFilePath; confirm that the caller
    populates them before invoking it.
    """
    # Train a model on all PTMs, to get initial scores for all PTMs.
    print("TRAIN model on all features:")
    self.Model.Train(self.TrainingSetAll)
    print("SCORE all features:")
    self.Model.Test(self.TrainingSetAll)
    ##############################################################
    print("Generate SUB-MODEL of only facultative features:")
    # Sort facultative instances by score:
    # NOTE(review): the flag is tested by truthiness; verify that the
    # parser stores it as a number or an empty/non-empty value rather
    # than the string "0".
    SortedList = []
    for Vector in self.TrainingSetAll.AllVectors:
        if not Vector.FileBits[FormatBits.SisterAnnotationFlag]:
            continue
        SortedList.append((Vector.Score, Vector))
    # Sort on the score alone: tied scores keep their input order rather
    # than falling back to comparing the vector objects themselves.
    SortedList.sort(key=lambda Pair: Pair[0])
    FacFeatureSet = Learning.FeatureSetClass()
    ChunkSize = min(len(SortedList) // 4, 1000)
    print("Sorted list of %s facultative features, chunk size is %s" % (len(SortedList), ChunkSize))
    LowScoring = SortedList[:ChunkSize]
    # An explicit start index is needed here: with ChunkSize == 0,
    # SortedList[-ChunkSize:] would select the WHOLE list, not nothing.
    HighScoring = SortedList[len(SortedList) - ChunkSize:]
    # False (low-scoring) vectors are added first, then the true ones.
    Labelled = (
        (0, LowScoring, FacFeatureSet.FalseVectors),
        (1, HighScoring, FacFeatureSet.TrueVectors),
    )
    for (TrueFlag, Chunk, Bucket) in Labelled:
        for (Score, Vector) in Chunk:
            # Copy the vector, so that relabelling it leaves the original
            # in self.TrainingSetAll untouched.
            NewVector = Learning.FeatureVector()
            NewVector.FileBits = Vector.FileBits[:]
            NewVector.Features = Vector.Features[:]
            NewVector.TrueFlag = TrueFlag
            FacFeatureSet.AllVectors.append(NewVector)
            Bucket.append(NewVector)
    FacFeatureSet.SetCounts()
    FacFeatureSet.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
    ##############################################################
    # Write out the FACULTATIVE feature set:
    FacTrainingFile = open("FacultativeTrainingSet.txt", "wb")
    try:
        for HeaderLine in self.HeaderLines:
            FacTrainingFile.write(HeaderLine)
        for Vector in FacFeatureSet.AllVectors:
            Bits = Vector.FileBits[:]
            if Vector.TrueFlag:
                Bits[FormatBits.TrueProteinFlag] = "1"
            else:
                Bits[FormatBits.TrueProteinFlag] = "0"
            FacTrainingFile.write("\t".join(Bits) + "\n")
    finally:
        FacTrainingFile.close()
    ##############################################################
    # Train the sub-model:
    self.FacModel = self.GetModelObject(self.FeaturesF)
    self.FacModel.Train(FacFeatureSet)
    self.FacModel.Test(FacFeatureSet)
    self.FacModel.ReportAccuracy(FacFeatureSet) # invokes ComputeOddsTrue
    ##############################################################
    # Apply the trained fac-model to *all* facultative features, and
    # train an overall model on all *constitutive* features:
    self.FeatureSetC = Learning.FeatureSetClass()
    self.FeatureSetF = Learning.FeatureSetClass()
    for Vector in self.TrainingSetAll.AllVectors:
        if Vector.FileBits[FormatBits.SisterAnnotationFlag]:
            FeatureSet = self.FeatureSetF
        else:
            FeatureSet = self.FeatureSetC
        FeatureSet.AllVectors.append(Vector)
        if Vector.TrueFlag:
            FeatureSet.TrueVectors.append(Vector)
        else:
            FeatureSet.FalseVectors.append(Vector)
    self.FeatureSetC.SetCounts()
    self.FeatureSetF.SetCounts()
    self.FeatureSetC.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
    self.FeatureSetF.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
    # Score facultative-feature, using facultative-model:
    self.FacModel.Test(self.FeatureSetF)
    # Train constitutive-ONLY model, and score constitutive features:
    self.ConModel = self.GetModelObject(self.FeaturesC)
    self.ConModel.Train(self.FeatureSetC)
    self.ConModel.Test(self.FeatureSetC)
    self.ConModel.ReportAccuracy(self.FeatureSetC) # to invoke ComputeOddsTrue
    ##############################################################
    # Save our models:
    if self.WriteModelFilePath:
        Stub = os.path.splitext(self.WriteModelFilePath)[0]
        ConModelPath = "%s.con"%Stub
        FacModelPath = "%s.fac"%Stub
        self.ConModel.SaveModel(ConModelPath)
        self.FacModel.SaveModel(FacModelPath)
    ##############################################################
    # Write out the scored features:
    # Pad each row far enough to hold both of the columns written below.
    LastColumn = max(FormatBits.ModelScore, FormatBits.ModelPValue)
    OutputFile = open(self.OutputFeaturePath, "wb")
    try:
        for Line in self.HeaderLines:
            OutputFile.write(Line)
        for Vector in self.TrainingSetAll.AllVectors:
            # The p-value comes from whichever model scored this vector.
            if Vector.FileBits[FormatBits.SisterAnnotationFlag]:
                PValue = self.FacModel.GetPValue(Vector.Score)
            else:
                PValue = self.ConModel.GetPValue(Vector.Score)
            while len(Vector.FileBits) <= LastColumn:
                Vector.FileBits.append("")
            Vector.FileBits[FormatBits.ModelScore] = str(Vector.Score)
            Vector.FileBits[FormatBits.ModelPValue] = str(PValue)
            OutputFile.write("\t".join(Vector.FileBits) + "\n")
    finally:
        OutputFile.close()