예제 #1
0
 def ParseFeatureFile(self, FilePath, FeatureSet2, FeatureSet3, DBRatio):
     """
     Parse the tab-delimited feature file at FilePath into feature vectors.

     The first line is the header: it is appended to self.HeaderLines, and
     every column from FormatBits.FirstFeature onward is recorded in
     self.FeatureNames.  Later lines starting with "#" are also appended to
     self.HeaderLines.  Blank lines, and lines whose TrueProteinFlag column
     is missing or not an integer, are skipped.

     Each remaining line becomes a Learning.FeatureVector.  Vectors with
     charge above 2 go to FeatureSet3, all others go to FeatureSet2.  Within
     the chosen set, a vector is appended to TrueVectors or FalseVectors
     according to its TrueProteinFlag, and always to AllVectors.  Feature
     columns that are missing, blank, or "None" are stored as 0.  A line
     whose features cannot be converted is reported (with a traceback) and
     is not added to any set.

     After the file is read, SetCounts() and GetPriorProbabilityFalse(DBRatio)
     are called on both feature sets, and a summary is printed.

     Arguments:
       FilePath    - path of the feature file to read
       FeatureSet2 - feature set receiving vectors with charge <= 2
       FeatureSet3 - feature set receiving vectors with charge > 2
       DBRatio     - passed through to GetPriorProbabilityFalse()

     Raises ValueError or IndexError if the Charge column of an otherwise
     valid line is missing or not an integer.
     """
     File = open(FilePath, "rb")
     # try/finally (rather than a trailing close) releases the handle even
     # when a malformed line makes the parser raise:
     try:
         # Parse the header line specially:
         HeaderLine = File.readline()
         self.HeaderLines.append(HeaderLine)
         Bits = HeaderLine.strip().split("\t")
         for BitIndex in range(FormatBits.FirstFeature, len(Bits)):
             self.FeatureNames[BitIndex - FormatBits.FirstFeature] = Bits[BitIndex]
         # Iterate over non-header lines.  LineNumber counts lines AFTER the
         # header, so it is one less than the line's position in the file.
         LineNumber = 0
         for FileLine in File:
             LineNumber += 1
             if FileLine[0] == "#":
                 self.HeaderLines.append(FileLine)
                 continue # skip comment line
             if not FileLine.strip():
                 continue # skip blank line
             Bits = FileLine.replace("\r", "").replace("\n", "").split("\t")
             # If there are TOO MANY bits, then discard the extras:
             Bits = Bits[:FormatBits.LastFeature + 1]
             try:
                 TrueFlag = int(Bits[FormatBits.TrueProteinFlag])
             except (ValueError, IndexError):
                 continue # skip; not a valid instance line
             Charge = int(Bits[FormatBits.Charge])
             if Charge > 2:
                 FeatureSet = FeatureSet3
             else:
                 FeatureSet = FeatureSet2
             Vector = Learning.FeatureVector()
             # Reset per line, so the error report below never shows a column
             # index left over from an earlier line (or an unbound name):
             FeatureIndex = None
             try:
                 for FeatureBitIndex in range(FormatBits.FirstFeature, FormatBits.LastFeature + 1):
                     FeatureIndex = FeatureBitIndex - FormatBits.FirstFeature
                     if FeatureBitIndex < len(Bits) and Bits[FeatureBitIndex].strip() and Bits[FeatureBitIndex] != "None":
                         Vector.Features.append(float(Bits[FeatureBitIndex]))
                     else:
                         # Missing, blank, or "None" column: treat as zero.
                         Vector.Features.append(0)
                 Vector.FileBits = Bits
                 Vector.TrueFlag = TrueFlag
                 if TrueFlag:
                     FeatureSet.TrueVectors.append(Vector)
                 else:
                     FeatureSet.FalseVectors.append(Vector)
                 FeatureSet.AllVectors.append(Vector)
             except Exception:
                 # Best effort: report the bad line and keep parsing.  The
                 # vector is dropped, since it was never appended to a set.
                 traceback.print_exc()
                 print "** Error on line %s column %s of feature file"%(LineNumber, FeatureIndex)
                 print Bits
     finally:
         File.close()
     # Initialize counts:
     for FeatureSet in (FeatureSet2, FeatureSet3):
         FeatureSet.SetCounts()
         FeatureSet.GetPriorProbabilityFalse(DBRatio)
     print "CHARGE 1,2: Read in %s true and %s false vectors"%(FeatureSet2.TrueCount, FeatureSet2.FalseCount)
     print "CHARGE  3+: Read in %s true and %s false vectors"%(FeatureSet3.TrueCount, FeatureSet3.FalseCount)
예제 #2
0
 def TrainFacultative(self):
     """
     Train paired models for CONSTITUTIVE ("always") and FACULTATIVE ("sometimes") PTMs.
     """
     # Train a model on all PTMs, to get initial scores for all PTMs.
     # The initial model uses only CONSTITUTIVE features, and its output
     # is used only to provide an ORACLE for the facultative model:
     print "TRAIN model on all features:"
     self.Model.Train(self.TrainingSetAll)
     print "SCORE all features:"
     self.Model.Test(self.TrainingSetAll)
     ##############################################################
     print "Generate SUB-MODEL of only facultative features:"
     # Sort facultative instances by score:
     SortedList = []
     for Vector in self.TrainingSetAll.AllVectors:
         if not Vector.FileBits[FormatBits.SisterAnnotationFlag]:
             continue
         SortedList.append((Vector.Score, Vector))
     SortedList.sort()
     FacFeatureSet = Learning.FeatureSetClass()
     ChunkSize = min(len(SortedList) / 4, 1000)
     print "Sorted list of %s facultative features, chunk size is %s"%(len(SortedList), ChunkSize)
     for (Score, Vector) in SortedList[:ChunkSize]:
         NewVector = Learning.FeatureVector()
         NewVector.FileBits = Vector.FileBits[:]
         NewVector.Features = Vector.Features[:]
         NewVector.TrueFlag = 0
         FacFeatureSet.AllVectors.append(NewVector)
         FacFeatureSet.FalseVectors.append(NewVector)
     for (Score, Vector) in SortedList[-ChunkSize:]:
         NewVector = Learning.FeatureVector()
         NewVector.FileBits = Vector.FileBits[:]
         NewVector.Features = Vector.Features[:]
         NewVector.TrueFlag = 1
         FacFeatureSet.AllVectors.append(NewVector)
         FacFeatureSet.TrueVectors.append(NewVector)
     FacFeatureSet.SetCounts()
     FacFeatureSet.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
     ##############################################################
     # Write out the FACULTATIVE feature set:
     FacTrainingFile = open("FacultativeTrainingSet.txt", "wb")
     for HeaderLine in self.HeaderLines:
         FacTrainingFile.write(HeaderLine)
     for Vector in FacFeatureSet.AllVectors:
         Bits = Vector.FileBits[:]
         if Vector.TrueFlag:
             Bits[FormatBits.TrueProteinFlag] = "1"
         else:
             Bits[FormatBits.TrueProteinFlag] = "0"
         Str = string.join(Bits, "\t")
         FacTrainingFile.write(Str + "\n")
     FacTrainingFile.close()
     ##############################################################
     # Train the sub-model:
     self.FacModel = self.GetModelObject(self.FeaturesF)
     self.FacModel.Train(FacFeatureSet)
     self.FacModel.Test(FacFeatureSet)
     self.FacModel.ReportAccuracy(FacFeatureSet) # invokes ComputeOddsTrue
     ##############################################################
     # Apply the trained fac-model to *all* facultative features, and
     # train an overall model on all *constitutive* features:
     self.FeatureSetC = Learning.FeatureSetClass()
     self.FeatureSetF = Learning.FeatureSetClass()
     for Vector in self.TrainingSetAll.AllVectors:
         if Vector.FileBits[FormatBits.SisterAnnotationFlag]:
             FeatureSet = self.FeatureSetF
         else:
             FeatureSet = self.FeatureSetC
         FeatureSet.AllVectors.append(Vector)
         if Vector.TrueFlag:
             FeatureSet.TrueVectors.append(Vector)
         else:
             FeatureSet.FalseVectors.append(Vector)
     self.FeatureSetC.SetCounts()
     self.FeatureSetF.SetCounts()
     self.FeatureSetC.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
     self.FeatureSetF.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
     # Score facultative-feature, using facultative-model:
     self.FacModel.Test(self.FeatureSetF)
     # Train constitutive-ONLY model, and score constitutive features:
     self.ConModel = self.GetModelObject(self.FeaturesC)
     self.ConModel.Train(self.FeatureSetC)
     self.ConModel.Test(self.FeatureSetC)
     self.ConModel.ReportAccuracy(self.FeatureSetC) # to invoke ComputeOddsTrue
     ##############################################################
     # Save our models:
     if self.WriteModelFilePath:
         (Stub, Extension) = os.path.splitext(self.WriteModelFilePath)
         ConModelPath = "%s.con"%Stub
         FacModelPath = "%s.fac"%Stub
         self.ConModel.SaveModel(ConModelPath)
         self.FacModel.SaveModel(FacModelPath)
     ##############################################################
     # Write out the scored features:
     OutputFile = open(self.OutputFeaturePath, "wb")
     for Line in self.HeaderLines:
         OutputFile.write(Line)
     for Vector in self.TrainingSetAll.AllVectors:
         if Vector.FileBits[FormatBits.SisterAnnotationFlag]:
             PValue = self.FacModel.GetPValue(Vector.Score)
         else:
             PValue = self.ConModel.GetPValue(Vector.Score)
         while len(Vector.FileBits) <= FormatBits.ModelPValue:
             Vector.FileBits.append("")
         Vector.FileBits[FormatBits.ModelScore] = str(Vector.Score)
         Vector.FileBits[FormatBits.ModelPValue] = str(PValue)
         Str = string.join(Vector.FileBits, "\t")
         OutputFile.write(Str + "\n")