def validate_markers(Arguments):
    '''
    Re-score a previously generated list of markers against the supervised dataset 
    and write the results to <Arguments.Filename>.csv. For every phenotype, each 
    marker's setwork is reassembled from its Union/Intersection/Difference feature 
    lists, a contingency table is built against the phenotype response vector, and 
    p-value, odds ratio, effect size, and classification-performance metrics are 
    written as one CSV row.
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)
    #Previously saved marker results; Header gives the column order used below
    Header, Results = load_results_file(get_path("ValidateMarkers"))

    #"wb" is required for the csv module under Python 2
    CSVfile = open(Arguments.Filename + ".csv", "wb")
    CSVwriter = csv.writer(CSVfile, dialect='excel')
    CSVwriter.writerow(["Union","Intersection","Difference","Interaction", "Phenotype",
                        "P-value", "Odds Ratio", "Effect Size",
                        "Sensitivity","Specificity","PPV","NPV","Accuracy", "MCC",
                        "Sample Count", "Case Count"])

    for Phenotype in Phenotypes:
        #The phenotype's variate vector is the response the markers are tested against
        Response = Variates[Features.index(Phenotype)]
        for Marker in Results:
            try:
                #filter(None, ...) drops empty strings left by splitting empty cells
                Predictor = assemble_setwork(Features, Variates,
                                             filter(None, Marker[Header.index("Union")].split(", ")),
                                             filter(None, Marker[Header.index("Intersection")].split(", ")),
                                             filter(None, Marker[Header.index("Difference")].split(", ")),
                                             Arguments)

                TP,FP,FN,TN = contingency_table(Predictor, Response, NA=Arguments.NA)
                performance = Performance(Marker[Header.index("Interaction")], TP,FP,FN,TN)
                effect_size = EffectSize(Marker[Header.index("Interaction")], TP,FP,FN,TN)

                #Phenotype[:Phenotype.index(":")] strips the trailing ":"-delimited suffix
                CSVwriter.writerow([Marker[Header.index("Union")],
                                    Marker[Header.index("Intersection")],
                                    Marker[Header.index("Difference")],
                                    Marker[Header.index("Interaction")],
                                    Phenotype[:Phenotype.index(":")],
                                    "%0.2e" %fisher(TP,FP,FN,TN).two_tail,
                                    "%0.2f" %effect_size.odds_ratio,
                                    "%0.2f" %effect_size.difference_of_proportions,
                                    "%0.2f" %performance.sensitivity,
                                    "%0.2f" %performance.specificity,
                                    "%0.2f" %performance.PPV,
                                    "%0.2f" %performance.NPV,
                                    "%0.2f" %performance.accuracy,
                                    "%0.2f" %performance.MCC,
                                    TP+FP+FN+TN, TP+FN])
            except ValueError:
                #Marker couldn't be scored (e.g. a feature missing from this dataset);
                #still record the setwork with an "NA" placeholder
                CSVwriter.writerow([Marker[Header.index("Union")],
                                    Marker[Header.index("Intersection")],
                                    Marker[Header.index("Difference")],
                                    "NA"])

    CSVfile.close()

    return
def get_rowsidematrix(Features, Arguments):
    '''
    Map each feature's p-value (read from the RowSideColors file) onto a gray-scale 
    color name, based on the base-10 exponent of the p-value, and hand the colors to 
    string_matrix() under the single label "Mesenchymal".
    '''
    #Exponent of the p-value -> shade; more significant (more negative) = darker
    GrayScale = {-9: "gray10", -8: "gray20", -7: "gray30", -6: "gray40",
                 -5: "gray50", -4: "gray60", -3: "gray70", -2: "gray80"}

    #RowSideColors file: first column is the feature name, second its p-value
    Matrix = dict([(Row.split()[0], Row.split()[1]) for Row in file(get_path("RowSideColors"))])

    PValues = [Matrix[Feature] for Feature in Features]

    Label = ["Mesenchymal"]

    Colors = []
    for PValue in PValues:
        #"%0.2e" forces scientific notation; the piece after "e" is the exponent
        Exponent = int(("%0.2e" %float(PValue)).split("e")[1])
        if Exponent <= -10:
            Colors.append("black")
        elif Exponent >= -1:
            Colors.append("white")
        else:
            Colors.append(GrayScale[Exponent])

    return string_matrix(Features, Label, [Colors], Arguments)
def make_priors(Arguments):
    '''
    This function can make a Priors file from any setwork output (probably from 
    either a Setworks run, or a ValidateBiomarkers run). This Priors file can then 
    be used to bias future searches. Can be useful for focusing setwork selection 
    from "big data". For instance, do a Setworks run to see which individual features 
    are being selected with high frequency. Next, make the Priors file and rerun 
    Setworks forcing MOCA to bias its search around the individual features predicted 
    to be important in your previous run. This can be useful, because some data is too 
    big to search thru with the normal MOCA optimization process, without a little 
    help-- this would be that help.
    '''

    #Columns 0-2 of each marker row hold the comma-separated Union, Intersection, 
    #and Difference feature lists, in that order
    Header, Markers = load_results_file(get_path("Priors"))

    UnionFeatures, IntersectionFeatures, DifferenceFeatures = [], [], []
    for Marker in Markers:
        #Empty cells are falsey; skip them. strip(",") removes the separators 
        #left behind by splitting "feat1, feat2, ..." on whitespace.
        if Marker[0]: UnionFeatures.extend([Feature.strip(",") for Feature in Marker[0].split()])
        if Marker[1]: IntersectionFeatures.extend([Feature.strip(",") for Feature in Marker[1].split()])
        if Marker[2]: DifferenceFeatures.extend([Feature.strip(",") for Feature in Marker[2].split()])

    Priors = open(Arguments.Filename, "w")
    try:
        Priors.write("%s %s \n" %("Union =", " ".join(UnionFeatures)))
        Priors.write("%s %s \n" %("Intersection =", " ".join(IntersectionFeatures)))
        Priors.write("%s %s \n" %("Difference =", " ".join(DifferenceFeatures)))
    finally:
        #Close the Priors file even if one of the writes raises
        Priors.close()

    return
def load_setworks(Arguments):
    '''
    Load the setworks pickled dictionary of dictionaries, and return lists that have 
    been truncated to specified number of results (TopInteractions = 100, by default). 
    The list can additionally be truncated using performance metric, thru the 
    MinimumPerformance argument.
    '''

    Results = cPickle.load(open(get_path("MOCA.results") + "/" + Arguments.Filename, "rb"))

    PValues = Results["PValues"]
    QValues = Results["QValues"]
    Performances = Results["Performances"]
    Interactions = Results["Interactions"]
    FeatureVectors = Results["FeatureVectors"]
    UnionFeatures = Results["UnionFeatures"]
    IntersectionFeatures = Results["IntersectionFeatures"]
    DifferenceFeatures = Results["DifferenceFeatures"]
    SampleCounts = Results["SampleCounts"]
    CaseCounts = Results["CaseCounts"]
    EffectSizes = Results["EffectSizes"]
    Barcodes = Results["Barcodes"]
    Report = Results["Report"]
    Labels = Results["Labels"]

    #Get rid of barcodes for setworks that didn't pass a performance threshold (if provided)
    Barcodes = [Barcode for Barcode in Barcodes if minimum_performance(Performances[Barcode], Arguments)]

    #Bug fix: the filtered Barcodes list used to be discarded, because rank() was handed 
    #the full Performances dict. Rank only the performances of the surviving barcodes so 
    #MinimumPerformance actually takes effect (assumes rank() ranks the keys of whatever 
    #performance mapping it is given -- TODO confirm against rank()).
    FilteredPerformances = dict([(Barcode, Performances[Barcode]) for Barcode in Barcodes])

    #Initial sort will be done by decreasing balanced accuracy
    Barcodes = rank(FilteredPerformances, Arguments.RankMethod, Arguments.TopInteractions)

    return Barcodes, PValues, QValues, Performances, Interactions, \
        UnionFeatures, IntersectionFeatures, DifferenceFeatures, \
        FeatureVectors, SampleCounts, CaseCounts, EffectSizes, Report, Labels
def load_validation_data(Arguments):
    '''
    Load the python pickles you made from LeaveSomeOut cross-validation calculations
    '''

    #Every pickle whose filename contains "<Filename>.Validation" is one fold
    ResultFiles = [File for File in os.listdir(get_path("MOCA.results"))
                   if Arguments.Filename + ".Validation" in File]

    #Keys copied through from each fold's Results dict unchanged
    PassThruKeys = ["Barcodes", "PValues", "QValues", "Performances", "Interactions",
                    "FeatureVectors", "UnionFeatures", "IntersectionFeatures",
                    "DifferenceFeatures", "Report"]

    CrossValidations = {}
    for CrossValidation in ResultFiles:
        Results = cPickle.load(open(get_path("MOCA.results") + "/" + CrossValidation, "rb"))

        Fold = {}
        #Cases/Controls are pulled from inside the Report (second element of each entry)
        Fold["Cases"] = Results["Report"]["Cases"][1]
        Fold["Controls"] = Results["Report"]["Controls"][1]
        for Key in PassThruKeys:
            Fold[Key] = Results[Key]

        CrossValidations[CrossValidation] = Fold

    return CrossValidations
def get_rowsidematrix(Features, Arguments):
    '''
    Translate each feature's p-value (looked up in the RowSideColors file) into a 
    gray-scale color determined by the p-value's base-10 exponent, then build the 
    row-side matrix via string_matrix() with the single label "Mesenchymal".
    '''
    #Darker shades for smaller (more significant) p-values
    Shades = {-9:"gray10", -8:"gray20", -7:"gray30", -6:"gray40",
              -5:"gray50", -4:"gray60", -3:"gray70", -2:"gray80"}

    #Feature name (column 0) -> p-value string (column 1)
    Matrix = {}
    for Row in file(get_path("RowSideColors")):
        Matrix[Row.split()[0]] = Row.split()[1]

    PValues = []
    for Feature in Features:
        PValues.append(Matrix[Feature])

    Label = ["Mesenchymal"]

    def shade(PValue):
        #Scientific notation puts the exponent after the "e"
        Exponent = int(("%0.2e" %float(PValue)).split("e")[1])
        if Exponent <= -10:
            return "black"
        if Exponent >= -1:
            return "white"
        return Shades[Exponent]

    Variates = [shade(PValue) for PValue in PValues]

    return string_matrix(Features, Label, [Variates], Arguments)
def pairwise(Arguments):
    '''
    Print results from your MOCA pairwise runs. Loads the pickled results file, 
    refuses to proceed if it did not come from a pairwise calculation, then 
    dispatches to the continuous, supervised, or unsupervised report printer.
    '''

    Results = cPickle.load(open(get_path("MOCA.results") + "/" + Arguments.Filename, "rb"))

    #Guard: the pickle must have been produced by a pairwise run
    if not Results["Report"]["Pairwise"]:
        #Trailing comma joins the two print statements onto one output line (Python 2)
        print "You have 'Pairwise = True' in your arguments file, yet the file you pointed to (MOCA.results/",
        print Arguments.Filename + ") did not result from a pairwise MOCA calculation."
        print "Exiting..."
        exit()

    #Dispatch on the kind of pairwise calculation recorded in the report
    if Results["Report"]["Continuous-valued correlation"]: pairwise_continuous(Results, Arguments)
    elif type(Results["Report"]["Supervised"]) == list: supervised(Results, Arguments)
    else: unsupervised(Results, Arguments)

    return
def pairwise(Arguments): ''' Print results from you MOCA pairwise runs. ''' Results = cPickle.load( open(get_path("MOCA.results") + "/" + Arguments.Filename, "rb")) if not Results["Report"]["Pairwise"]: print "You have 'Pairwise = True' in your arguments file, yet the file you pointed to (MOCA.results/", print Arguments.Filename + ") did not result from a pairwise MOCA calculation." print "Exiting..." exit() if Results["Report"]["Continuous-valued correlation"]: pairwise_continuous(Results, Arguments) elif type(Results["Report"]["Supervised"]) == list: supervised(Results, Arguments) else: unsupervised(Results, Arguments) return
def load_setworks(Arguments):
    '''
    Load the setworks pickled dictionary of dictionaries, and return lists that have 
    been truncated to specified number of results (TopInteractions = 100, by default). 
    The list can additionally be truncated using performance metric, thru the 
    MinimumPerformance argument.
    '''

    Results = cPickle.load(open(get_path("MOCA.results") + "/" + Arguments.Filename, "rb"))

    PValues = Results["PValues"]
    QValues = Results["QValues"]
    Performances = Results["Performances"]
    Interactions = Results["Interactions"]
    FeatureVectors = Results["FeatureVectors"]
    UnionFeatures = Results["UnionFeatures"]
    IntersectionFeatures = Results["IntersectionFeatures"]
    DifferenceFeatures = Results["DifferenceFeatures"]
    SampleCounts = Results["SampleCounts"]
    CaseCounts = Results["CaseCounts"]
    EffectSizes = Results["EffectSizes"]
    Barcodes = Results["Barcodes"]
    Report = Results["Report"]
    Labels = Results["Labels"]

    #Get rid of barcodes for setworks that didn't pass a performance threshold (if provided)
    Barcodes = [Barcode for Barcode in Barcodes if minimum_performance(Performances[Barcode], Arguments)]

    #Bug fix: previously rank() received the complete Performances dict, silently 
    #undoing the minimum_performance filter above. Restrict the ranking to the 
    #barcodes that survived the filter (assumes rank() ranks the keys of the 
    #performance mapping it receives -- TODO confirm against rank()).
    PassingPerformances = dict([(Barcode, Performances[Barcode]) for Barcode in Barcodes])

    #Initial sort will be done by decreasing balanced accuracy
    Barcodes = rank(PassingPerformances, Arguments.RankMethod, Arguments.TopInteractions)

    return Barcodes, PValues, QValues, Performances, Interactions, \
        UnionFeatures, IntersectionFeatures, DifferenceFeatures, \
        FeatureVectors, SampleCounts, CaseCounts, EffectSizes, Report, Labels
def leave_some_out(Arguments):
    '''
    Function for selecting biomarkers from cross-validation output. Strict in that it only returns 
    biomarkers that were selected during each cross validation. Might have the benefit, relative to 
    vote-based prediction, that these were so predictive that they will translate better to future 
    predictions. Also lends itself to simple clinical use because they require little or no computational 
    support for subsequent prediction...it is simply the marker 
    '''

    CrossValidations = load_validation_data(Arguments)

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)
    #Only the first phenotype is considered here
    Phenotype = Phenotypes[0]
    Response = Variates[Features.index(Phenotype)]

    #Collect every setwork seen across the folds; the tuple of feature lists plus 
    #interaction type identifies a setwork, and the value lists the barcodes it 
    #appeared under (one per fold it was selected in)
    Setworks = {}
    for CrossValidation in CrossValidations:
        for Barcode in CrossValidations[CrossValidation]["Barcodes"]:
            Setwork = (CrossValidations[CrossValidation]["UnionFeatures"][Barcode], \
                       CrossValidations[CrossValidation]["IntersectionFeatures"][Barcode], \
                       CrossValidations[CrossValidation]["DifferenceFeatures"][Barcode], \
                       CrossValidations[CrossValidation]["Interactions"][Barcode])
            if Setworks.has_key(Setwork): Setworks[Setwork].append(Barcode)
            else: Setworks[Setwork] = [Barcode]

    #Per-barcode result dictionaries, rebuilt by re-scoring on the full dataset
    PValues = {}
    QValues = {}
    Performances = {}
    Interactions = {}
    FeatureVectors = {}
    UnionFeatures = {}
    IntersectionFeatures = {}
    DifferenceFeatures = {}
    SampleCounts = {}
    CaseCounts = {}
    EffectSizes = {}
    Barcodes = []
    for Setwork in Setworks:
        if len(Setworks[Setwork]) == len(CrossValidations): #Sework had to be selected in each cross validation!!!
            Union, Intersection, Difference, Interaction = Setwork
            #Re-assemble and re-score the setwork against the full dataset
            Predictor = assemble_setwork(Features, Variates, Union, Intersection, Difference, Arguments)
            TP,FP,FN,TN = contingency_table(Predictor, Response, NA=Arguments.NA)
            #Any one of the fold barcodes will do as this setwork's identifier
            Barcode = Setworks[Setwork][0]
            Barcodes.append(Barcode)
            PValues[Barcode] = fisher(TP,FP,FN,TN).two_tail
            #Q-values are not recomputed for the merged setworks
            QValues[Barcode] = "NA"
            Performances[Barcode] = Performance(Interaction, TP,FP,FN,TN)
            Interactions[Barcode] = Interaction
            EffectSizes[Barcode] = EffectSize(Interactions[Barcode], TP,FP,FN,TN)
            FeatureVectors[Barcode] = Predictor
            UnionFeatures[Barcode] = Union
            IntersectionFeatures[Barcode] = Intersection
            DifferenceFeatures[Barcode] = Difference
            SampleCounts[Barcode] = TP + FP + FN + TN
            CaseCounts[Barcode] = TP + FN

    Results = {}
    Results["PValues"] = PValues
    Results["QValues"] = QValues
    Results["Performances"] = Performances
    Results["Interactions"] = Interactions
    Results["FeatureVectors"] = FeatureVectors
    Results["UnionFeatures"] = UnionFeatures
    Results["IntersectionFeatures"] = IntersectionFeatures
    Results["DifferenceFeatures"] = DifferenceFeatures
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["EffectSizes"] = EffectSizes
    #Doesn't matter which index we use we just need one report. The last accessed 'CrossValidation' will do. 
    Results["Report"] = make_report(Labels, CrossValidations[CrossValidation]["Report"])
    Results["Labels"] = Labels
    Results["Barcodes"] = Barcodes
    Results["Phenotype"] = Response

    #Derive the output pickle's name: either auto-generated from the run parameters 
    #("default") or from the user-supplied filename plus the phenotype name
    if Arguments.Filename.lower() == "default":
        DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
        Pickle = "_".join(["Phenotype=" + Phenotype[:Phenotype.index(":")],
                           "_".join(sorted(DataTypes)), str(Arguments.FeatureMin),
                           Arguments.CorrectionMethod,
                           "".join(map(str, Arguments.BooleanSets)),
                           "".join(map(str, Arguments.Optimization))])
    else:
        Pickle = Arguments.Filename + "_" + Phenotype[:Phenotype.index(":")]

    cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    #NOTE(review): dead CSV-export code retained below as an inert string literal; 
    #consider deleting it outright in a future cleanup.
    '''
    #Get rid of barcodes that for setworks that didn't pass a performance threshold (if provided)
    Barcodes = [Barcode for Barcode in Barcodes if minimum_performance(Performances[Barcode], Arguments)]

    #Initial sort will be done by decreasing balanced accuracy
    Barcodes = sorted(Barcodes, key=lambda Barcode: \
        (Performances[Barcode].sensitivity + Performances[Barcode].specificity)/2, reverse=True)

    CSVfile = open(Arguments.Filename + ".csv", "wb")
    CSVwriter = csv.writer(CSVfile, dialect='excel')

    #Right the excel header
    CSVwriter.writerow(["Union","Intersection","Difference","Interaction", "Phenotype",
                        "Sensitivity","Specificity","PPV","NPV","Accuracy", "Sample Count"])

    for Barcode in Barcodes:
        p = Performances[Barcode]
        Sens, Spec, PPV, NPV, Accuracy = p.sensitivity, p.specificity, p.PPV, p.NPV, p.accuracy
        CSVwriter.writerow([", ".join(UnionFeatures[Barcode]), ", ".join(IntersectionFeatures[Barcode]),
                            ", ".join(DifferenceFeatures[Barcode]), Interactions[Barcode],
                            Phenotype[:Phenotype.index(":")],
                            "%0.2f" %Sens, "%0.2f" %Spec, "%0.2f" %PPV, "%0.2f" %NPV, "%0.2f" %Accuracy,
                            SampleCount[Barcode]])

    CSVfile.close()
    '''

    return
exit() else: print "To run MOCA you need to choose either a mode, request a report, or call your own function from MyMOCA." print "Please see the 'Arguments file' and/or execute moca.py --help. Exiting..." exit() # Check to see that you have a paths file called 'Paths' in your current working directory if not os.path.isfile("Paths"): print "You must have a paths file in your current-working directory, and it must be called 'Paths'" print "Exiting..." exit() # MOCA data and results directories must exist for any MOCA calculation try: os.makedirs(get_path("MOCA.data")) except OSError: pass try: os.makedirs(get_path("MOCA.results")) except OSError: pass # vvv Doesn't work well in MultiprocessMode vvv # if not os.path.exists(get_path("MOCA.results")): os.makedirs(get_path("MOCA.results")) # Test the speed of MOCA... if get_arguments().Profile: cProfile.run("main(get_arguments())", sort="cumulative") else: # ...or just do a MOCA run
exit() else: print "To run MOCA you need to choose either a mode, request a report, or call your own function from MyMOCA." print "Please see the 'Arguments file' and/or execute moca.py --help. Exiting..." exit() #Check to see that you have a paths file called 'Paths' in your current working directory if not os.path.isfile("Paths"): print "You must have a paths file in your current-working directory, and it must be called 'Paths'" print "Exiting..." exit() #MOCA data and results directories must exist for any MOCA calculation try: os.makedirs(get_path("MOCA.data")) except OSError: pass try: os.makedirs(get_path("MOCA.results")) except OSError: pass # vvv Doesn't work well in MultiprocessMode vvv #if not os.path.exists(get_path("MOCA.results")): os.makedirs(get_path("MOCA.results")) #Test the speed of MOCA... if get_arguments().Profile: cProfile.run("main(get_arguments())", sort="cumulative") else: #...or just do a MOCA run