Example #1
0
 def TrainModel(self):
     """
     Our training data-set is in self.InputFeaturePath.
     Let's train a model to predict which entries come from the true database.
     """
     if not self.InputFeaturePath:
         print "* Please specify an input feature-file."
         print UsageInfo
         sys.exit(-1)
     # Load in features for a collection of TRUE and FALSE instances.
     File = open(self.InputFeaturePath, "rb")
     self.FeatureNames = {}
     FeatureCount = FormatBits.LastFeature - FormatBits.FirstFeature + 1
     # We have one set of features for facultative sites, and one for constitutive.
     # Note that some features (modification rate, correlation with unmodified peptide)
     # are applicable to F but not C.
     #self.FeaturesF = range(FeatureCount)
     # For constitutive modifications: Modification rate, protein coverage,
     # and number of unmodified peptides are all off-limits.  (Those features
     # are "dead giveaways" that we have a non-shuffled protein!)
     #self.FeaturesC = [2, 3, 5, 22, 24, 25, 26]
     self.FeaturesC = ValidFeatureIndices[:]
     #self.FeaturesC = range(FeatureCount)
     self.FeaturesF = self.FeaturesC
     self.FeaturesAll = []
     for FeatureIndex in self.FeaturesF:
         if FeatureIndex in self.FeaturesC:
             self.FeaturesAll.append(FeatureIndex)
     # We can OVERRIDE the list of features here, to forbid the use of some:
     print "Permitted features all:", self.FeaturesAll
     # Parse the features from the TRAINING and TESTING files.  We generate
     # training sets for the FACULTATIVE (F) and for CONSTITUTIVE (C) sites.
     self.TrainingSet2 = Learning.FeatureSetClass()
     self.TrainingSet2.Type = "Charge-2"
     self.TrainingSet3 = Learning.FeatureSetClass()
     self.TrainingSet3.Type = "Charge-3"
     #self.TrainingSetAll = Learning.FeatureSetClass()
     #self.TrainingSetAll.Type = "All"
     self.ParseFeatureFile(self.InputFeaturePath, self.TrainingSet2, self.TrainingSet3,
                           self.TrainingSetDBRatio)
     if self.ModelTestFilePath:
         self.TestingSet2 = FeatureSetClass()
         self.TestingSet3 = FeatureSetClass()
         self.ParseFeatureFile(self.ModelTestFilePath, self.TestingSet2, self.TestingSet3,
             self.TestingSetAll, self.TestingSetDBRatio)
     # SPECIAL values for model, which don't actually cause training:
     if self.ModelType == "feature":
         print "\n\nSINGLE feature:"
         self.TrainOneFeature(self.TrainingSet2)
         self.TrainOneFeature(self.TrainingSet3)
         return
     if self.ModelType == "featurescatter":
         print "\n\nFeature+feature scatter-plots:"
         self.ProduceFeatureScatterPlots(self.TrainingSetAll)
         return
     if self.ModelType == "summary":
         self.PerformFeatureSummary()
         return
     # Instantiate our model:
     self.Model2 = self.GetModelObject(self.FeaturesAll)
     self.Model3 = self.GetModelObject(self.FeaturesAll)
     # Load a pre-trained model, if we received a path:
     if self.ReadModelFilePath2:
         self.Model2.LoadModel(self.ReadModelFilePath2)
         self.Model3.LoadModel(self.ReadModelFilePath3)
     #######################################################################
     # Special value for feature selection (3) means that we train a model on
     # all data, then use it to generate a sub-feature-set for a facultative model!
     if self.FeatureSelectionFlag == 3:
         self.TrainFacultative()
         return
     #######################################################################
     # If we're not doing feature selection: Train on the training set,
     # and then (if we have a testing set) test on the testing set.
     if not self.FeatureSelectionFlag:
         # Train the model (unless we just loaded it in):
         if not self.ReadModelFilePath2:
             self.Model2.Train(self.TrainingSet2)
             self.Model3.Train(self.TrainingSet3)
         # Compute the score of each vector:
         if self.ModelTestFilePath:
             
             self.Model2.Test(self.TestingSet2)
             self.Model2.ReportAccuracy(self.TestingSet2)
             
             self.Model3.Test(self.TestingSet3)
             self.Model3.ReportAccuracy(self.TestingSet3)
             self.WriteScoredFeatureSet(self.TestingSet2, self.TestingSet3)
         else:
             
             self.Model2.Test(self.TrainingSet2)
             self.Model2.ReportAccuracy(self.TrainingSet2)
             shutil.copyfile("PValues.txt", "PValues.chg2.txt")
             
             self.Model3.Test(self.TrainingSet3)
             self.Model3.ReportAccuracy(self.TrainingSet3)
             shutil.copyfile("PValues.txt", "PValues.chg3.txt")
             #if self.ReportROCPath:
             #    self.Model.ReportROC(self.TrainingSetAll, self.ReportROCPath)
             self.WriteScoredFeatureSet(self.TrainingSet2, self.TrainingSet3)
         if self.WriteModelFilePath2:
             self.Model2.SaveModel(self.WriteModelFilePath2)
             self.Model3.SaveModel(self.WriteModelFilePath3)
         return
     #######################################################################
     # We're doing feature selection.  We'll need to write out feature files,
     # then call TrainMachineLearner
     print "Feature names:", self.FeatureNames
     print "AllFeatures:", self.FeaturesAll
     self.WriteFeaturesToFile(self.TrainingSet2, "PTMFeatures.2.txt")
     self.WriteFeaturesToFile(self.TrainingSet3, "PTMFeatures.3.txt")
Example #2
0
 def TrainFacultative(self):
     """
     Train paired models for CONSTITUTIVE ("always") and FACULTATIVE ("sometimes") PTMs.
     """
     # Train a model on all PTMs, to get initial scores for all PTMs.
     # The initial model uses only CONSTITUTIVE features, and its output
     # is used only to provide an ORACLE for the facultative model:
     print "TRAIN model on all features:"
     self.Model.Train(self.TrainingSetAll)
     print "SCORE all features:"
     self.Model.Test(self.TrainingSetAll)
     ##############################################################
     print "Generate SUB-MODEL of only facultative features:"
     # Sort facultative instances by score:
     SortedList = []
     for Vector in self.TrainingSetAll.AllVectors:
         if not Vector.FileBits[FormatBits.SisterAnnotationFlag]:
             continue
         SortedList.append((Vector.Score, Vector))
     SortedList.sort()
     FacFeatureSet = Learning.FeatureSetClass()
     ChunkSize = min(len(SortedList) / 4, 1000)
     print "Sorted list of %s facultative features, chunk size is %s"%(len(SortedList), ChunkSize)
     for (Score, Vector) in SortedList[:ChunkSize]:
         NewVector = Learning.FeatureVector()
         NewVector.FileBits = Vector.FileBits[:]
         NewVector.Features = Vector.Features[:]
         NewVector.TrueFlag = 0
         FacFeatureSet.AllVectors.append(NewVector)
         FacFeatureSet.FalseVectors.append(NewVector)
     for (Score, Vector) in SortedList[-ChunkSize:]:
         NewVector = Learning.FeatureVector()
         NewVector.FileBits = Vector.FileBits[:]
         NewVector.Features = Vector.Features[:]
         NewVector.TrueFlag = 1
         FacFeatureSet.AllVectors.append(NewVector)
         FacFeatureSet.TrueVectors.append(NewVector)
     FacFeatureSet.SetCounts()
     FacFeatureSet.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
     ##############################################################
     # Write out the FACULTATIVE feature set:
     FacTrainingFile = open("FacultativeTrainingSet.txt", "wb")
     for HeaderLine in self.HeaderLines:
         FacTrainingFile.write(HeaderLine)
     for Vector in FacFeatureSet.AllVectors:
         Bits = Vector.FileBits[:]
         if Vector.TrueFlag:
             Bits[FormatBits.TrueProteinFlag] = "1"
         else:
             Bits[FormatBits.TrueProteinFlag] = "0"
         Str = string.join(Bits, "\t")
         FacTrainingFile.write(Str + "\n")
     FacTrainingFile.close()
     ##############################################################
     # Train the sub-model:
     self.FacModel = self.GetModelObject(self.FeaturesF)
     self.FacModel.Train(FacFeatureSet)
     self.FacModel.Test(FacFeatureSet)
     self.FacModel.ReportAccuracy(FacFeatureSet) # invokes ComputeOddsTrue
     ##############################################################
     # Apply the trained fac-model to *all* facultative features, and
     # train an overall model on all *constitutive* features:
     self.FeatureSetC = Learning.FeatureSetClass()
     self.FeatureSetF = Learning.FeatureSetClass()
     for Vector in self.TrainingSetAll.AllVectors:
         if Vector.FileBits[FormatBits.SisterAnnotationFlag]:
             FeatureSet = self.FeatureSetF
         else:
             FeatureSet = self.FeatureSetC
         FeatureSet.AllVectors.append(Vector)
         if Vector.TrueFlag:
             FeatureSet.TrueVectors.append(Vector)
         else:
             FeatureSet.FalseVectors.append(Vector)
     self.FeatureSetC.SetCounts()
     self.FeatureSetF.SetCounts()
     self.FeatureSetC.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
     self.FeatureSetF.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
     # Score facultative-feature, using facultative-model:
     self.FacModel.Test(self.FeatureSetF)
     # Train constitutive-ONLY model, and score constitutive features:
     self.ConModel = self.GetModelObject(self.FeaturesC)
     self.ConModel.Train(self.FeatureSetC)
     self.ConModel.Test(self.FeatureSetC)
     self.ConModel.ReportAccuracy(self.FeatureSetC) # to invoke ComputeOddsTrue
     ##############################################################
     # Save our models:
     if self.WriteModelFilePath:
         (Stub, Extension) = os.path.splitext(self.WriteModelFilePath)
         ConModelPath = "%s.con"%Stub
         FacModelPath = "%s.fac"%Stub
         self.ConModel.SaveModel(ConModelPath)
         self.FacModel.SaveModel(FacModelPath)
     ##############################################################
     # Write out the scored features:
     OutputFile = open(self.OutputFeaturePath, "wb")
     for Line in self.HeaderLines:
         OutputFile.write(Line)
     for Vector in self.TrainingSetAll.AllVectors:
         if Vector.FileBits[FormatBits.SisterAnnotationFlag]:
             PValue = self.FacModel.GetPValue(Vector.Score)
         else:
             PValue = self.ConModel.GetPValue(Vector.Score)
         while len(Vector.FileBits) <= FormatBits.ModelPValue:
             Vector.FileBits.append("")
         Vector.FileBits[FormatBits.ModelScore] = str(Vector.Score)
         Vector.FileBits[FormatBits.ModelPValue] = str(PValue)
         Str = string.join(Vector.FileBits, "\t")
         OutputFile.write(Str + "\n")