def epose_prediction(Arguments): ''' Score some new mutations with the ePOSE(s) you already created. First we'll take our ePOSEs and the mutations originally used create them to derive the linear equation necessary to convert POSE scores to predicted endophenotypes. Finally, we score the new mutations and use said linear equation to supply predictions. ''' #First get the cutoff from the mutations originally used to make the ePOSE Mutations, Sequences, ReferenceGene, Identities, ResidueBurial, Annotation = make_pose_input(Arguments) #These mutations are the ones we actually need. Mutations, ePOSEs = zip(*[(cPickle.load(open(File, "rb"))[0], cPickle.load(open(File, "rb"))[1][0]) \ for File in ls("./") if Arguments.Filename in File]) Mutations, Endophenotypes = Mutations[0].keys(), Mutations[0].values() LinearParameters = [] ePOSEScores = [] for ePOSE in ePOSEs: MSA = zip(*[Sequences[Gene] for Gene in ePOSE]) #Turn the list of genes into the MSA (sequence ensemble) Scores = get_pose_scores(MSA, list(ePOSE), Identities, Mutations, ResidueBurial, Annotation, Arguments) ePOSEScores.append(Scores) a, b, RR = linear_regression(Scores, Endophenotypes) LinearParameters.append((a, b)) ePOSEScores = [mean(Score) for Score in zip(*ePOSEScores)] #Now we are ready to get the new mutations. Mutations, Sequences, ReferenceGene, Identities, ResidueBurial, Annotation = make_pose_input(Arguments) Scores = [] for ePOSE in ePOSEs: MSA = zip(*[Sequences[Gene] for Gene in ePOSE]) #Turn the list of genes into the MSA (sequence ensemble) Scores.append(get_pose_scores(MSA, list(ePOSE), Identities, Mutations, ResidueBurial, Annotation, Arguments)) Scores = dict(zip(Mutations, zip(*Scores))) print "Mutation, Mean POSE score, Standard deviation, Predicted Endophenotype, Standard deviation" for Mutation in Mutations: PredictedEndophenotypes = [a*mean(Scores[Mutation]) + b for a, b in LinearParameters] print Mutation, "\t", "\t", round(mean(Scores[Mutation]), 3), "\t", "\t", round(std(Scores[Mutation]), 3), \ "\t", "\t", "\t", round(mean(PredictedEndophenotypes), 3), "\t", "\t", "\t", round(std(PredictedEndophenotypes), 3) print " " print "~~~~~~~~~~ Performance Metrics ~~~~~~~~~~~~~" print " " print "R-squared =", linear_regression(ePOSEScores, Endophenotypes)[2] print "Pearson correlation =", correlation(ePOSEScores, Endophenotypes) print "P-value =", correlation_pvalue(ePOSEScores, Endophenotypes) return
def validate_epose(Arguments): ''' Validate the ePOSE. See the validate function (below) for detailed description. ''' CrossValidations = [cPickle.load(open(File, "rb")) for File in ls("./") if Arguments.Filename + ".CrossValidation." in File] Mutations, Sequences, ReferenceGene, Identities, ResidueBurial, Annotation = make_pose_input(Arguments) Predictions = {} for CrossValidation in CrossValidations: TestingMutations, ePOSEs = list(CrossValidation.keys()[0]), CrossValidation.values()[0] TrainingMutations = dict([(Mutation, Endophenotype) for Mutation, Endophenotype in Mutations.items() \ if Mutation not in TestingMutations]) TestScores = [] PredictedEndophenotypes = [] #this is where will put ePOSE scores that are converted to their corresponding endophenotype prediction for ePOSE in ePOSEs: MSA = zip(*[Sequences[Gene] for Gene in ePOSE]) #Turn the list of genes into the MSA (sequence ensemble) test_scores = get_pose_scores(MSA, list(ePOSE), Identities, TestingMutations, ResidueBurial, Annotation, Arguments) TestScores.append(test_scores) TrainScores = get_pose_scores(MSA, list(ePOSE), Identities, TrainingMutations.keys(), ResidueBurial, Annotation, Arguments) a, b, RR = linear_regression(TrainScores, TrainingMutations.values()) PredictedEndophenotypes.append([a*x + b for x in test_scores]) PredictedEndophenotypes = dict(zip(TestingMutations, zip(*PredictedEndophenotypes))) #If you had more than one trial in you MakePOSE, you have more than one score associated with each mutation. Here we collect em for Mutation, Scores in dict(zip(TestingMutations, zip(*TestScores))).items(): #Now we get the mean and standard deviation of scores associated with each mutation (relevant if trial > 1 in you initial MakePOSE). Predictions[Mutation] = [round(mean(Scores), 3), round(std(Scores), 3), \ round(mean(PredictedEndophenotypes[Mutation]), 3), round(std(PredictedEndophenotypes[Mutation]), 3)] Scores = [Predictions[Mutation][0] for Mutation in Mutations.keys()] #make sure scores are ordered just like Mutations.values() print ", ".join(["Mutation", "Measurement", "Score", "Std dev", "Prediction", "Std dev"]) for Mutation in Mutations.keys(): print Mutation + "\t" + "\t" + str(Mutations[Mutation]) + "\t" + "\t ".join(map(str, Predictions[Mutation])) print " " print "~~~~~~~~~~ Performance Metrics ~~~~~~~~~~~~~" print " " print "R-squared =", linear_regression(Scores, Mutations.values())[2] print "Pearson correlation =", correlation(Scores, Mutations.values()) print "P-value =", correlation_pvalue(Scores, Mutations.values()) return
def validate_pose(Arguments): ''' Validate the POSE. See the validate function (below) for detailed description. ''' CrossValidations = [cPickle.load(open(File, "rb")) for File in ls("./") if Arguments.Filename + ".CrossValidation." in File] Mutations, Sequences, ReferenceGene, Identities, ResidueBurial, Annotation = make_pose_input(Arguments) Predictions = {} for CrossValidation in CrossValidations: TestingMutations, POSEs = list(CrossValidation.keys()[0]), CrossValidation.values()[0] TestScores = [] for POSE in POSEs: MSA = zip(*[Sequences[Gene] for Gene in POSE]) #Turn the list of genes into the MSA (sequence ensemble) TestScores.append(get_pose_scores(MSA, list(POSE), Identities, TestingMutations, ResidueBurial, Annotation, Arguments)) #If you had more than one trial in you MakePOSE, you have more than one score associated with each mutation. Here we collect em for Mutation, Scores in dict(zip(TestingMutations, zip(*TestScores))).items(): #Now we get the mean and standard deviation of scores associated with each mutation (relevant if trial > 1 in you initial MakePOSE). Predictions[Mutation] = [round(mean(Scores), 3), round(std(Scores), 3)] print ", ".join(["Mutation", "'Known' phenotype", "POSE score", "Std dev (if multiple 'trials' were used)"]) for Mutation in Mutations.keys(): print Mutation + "\t" + "\t" + ["Positive" if Mutations[Mutation] else "Negative"][0] \ + "\t" + "\t ".join(map(str, Predictions[Mutation])) print " " print "~~~~~~~~~~ Performance Metrics ~~~~~~~~~~~~~" Scores, Phenotypes = zip(*[(Predictions[Mutation][0], int(Mutations[Mutation])) for Mutation in Mutations.keys()]) AUC, Metrix = ROC(Scores, Phenotypes, Arguments.Correlated) Optimal = sorted(Metrix, key=lambda Metric: Metric.Sensitivity + Metric.Specificity, reverse=True)[0] print "Optimal cutoff for separating the classes =", round(Optimal.CutOff, 2) print " " print "Performance using optimal cutoff:" print " " print "\t", "Area under the ROC curve =", round(AUC, 2) print "\t", "Accuracy =", Optimal.Accuracy print "\t", "Sensitivity =", Optimal.Sensitivity print "\t", "Specificity =", Optimal.Specificity print "\t", "Negative predictive value =", Optimal.NPV print "\t", "Positive predictive value =", Optimal.PPV print "\t", "Mathews correlation coefficient =", Optimal.MCC return
def pose_prediction(Arguments): ''' Score some new mutations with the POSE(s) you already created. First we need to determine the optimal POSE score cutoff for predicting a mutants phenotype; this is done using the mutations used to create the POSE in the first place. Next, we score the mutations and apply said cutoffs to make predictions. ''' #First get the cutoff from the mutations originally used to make the POSE Mutations, Sequences, ReferenceGene, Identities, ResidueBurial, Annotation = make_pose_input(Arguments) #These mutations are the ones we actually need. Mutations, POSEs = zip(*[(cPickle.load(open(File, "rb"))[0], cPickle.load(open(File, "rb"))[1][0]) \ for File in ls("./") if Arguments.Filename in File]) Mutations, Phenotypes = Mutations[0].keys(), Mutations[0].values() Scores = [] for POSE in POSEs: MSA = zip(*[Sequences[Gene] for Gene in POSE]) #Turn the list of genes into the MSA (sequence ensemble) Scores.append(get_pose_scores(MSA, list(POSE), Identities, Mutations, ResidueBurial, Annotation, Arguments)) Scores = [mean(Score) for Score in zip(*Scores)] AUC, Metrix = ROC(Scores, Phenotypes, Arguments.Correlated) Optimal = sorted(Metrix, key=lambda Metric: Metric.Sensitivity + Metric.Specificity, reverse=True)[0] #Now we are ready to get the new mutations. Mutations, Sequences, ReferenceGene, Identities, ResidueBurial, Annotation = make_pose_input(Arguments) Scores = [] for POSE in POSEs: MSA = zip(*[Sequences[Gene] for Gene in POSE]) #Turn the list of genes into the MSA (sequence ensemble) Scores.append(get_pose_scores(MSA, list(POSE), Identities, Mutations, ResidueBurial, Annotation, Arguments)) Scores = dict(zip(Mutations, zip(*Scores))) print "Mutation, Mean POSE score, Standard deviation, Predicted phenotype" for Mutation in Mutations: if Arguments.Correlated and mean(Scores[Mutation]) >= Optimal.CutOff: print Mutation, "\t", "\t", round(mean(Scores[Mutation]), 3), "\t", "\t", round(std(Scores[Mutation]), 3), "\t", "\t", "Positive" if Arguments.Correlated and mean(Scores[Mutation]) < Optimal.CutOff: print Mutation, "\t", "\t", round(mean(Scores[Mutation]), 3), "\t", "\t", round(std(Scores[Mutation]), 3), "\t", "\t", "Negative" if not Arguments.Correlated and mean(Scores[Mutation]) <= Optimal.CutOff: print Mutation, "\t", "\t", round(mean(Scores[Mutation]), 3), "\t", "\t", round(std(Scores[Mutation]), 3), "\t", "\t", "Positive" if not Arguments.Correlated and mean(Scores[Mutation]) > Optimal.CutOff: print Mutation, "\t", "\t", round(mean(Scores[Mutation]), 3), "\t", "\t", round(std(Scores[Mutation]), 3), "\t", "\t", "Negative" print " " print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" print "Cutoff used for separating the classes (determined when POSEs were initially created!!!) =", round(Optimal.CutOff, 2) print " " print "Performance acheived on the mutations used to train the initial POSE:" print " " print "\t", "Area under the ROC curve =", round(AUC, 2) print "\t", "Accuracy =", Optimal.Accuracy print "\t", "Sensitivity =", Optimal.Sensitivity print "\t", "Specificity =", Optimal.Specificity print "\t", "Negative predictive value =", Optimal.NPV print "\t", "Positive predictive value =", Optimal.PPV print "\t", "Mathews correlation coefficient =", Optimal.MCC return