def leave_some_out(Arguments):
    '''
    Makes the data splits before sending data off for cross validation.

    Every label gets used once. To the extent possible, the same number of
    cases and the same number of controls go into every split, and cases and
    controls are balanced within each individual split.

    Example:
    Split 1: Cancer10, Cancer28, Cancer15, Healthy2 Healthy3
    Split 2: Cancer2, Cancer12, Cancer1, Healthy10, Healthy1
    Split 3: Cancer1, Cancer3, Healthy9, Healthy0
    etc etc

    Arguments -- run configuration; 'LeaveSomeOut' (labels held out per split)
                 is read here, and the object is forwarded unchanged to
                 get_supervised_dataset and cross_validation.

    Nothing is returned; the splits are passed to cross_validation as two
    dicts (cases and controls) keyed by split number.
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    #Only one phenotype at a time for cross validation
    Phenotype = Phenotypes[0]

    #Important: if you are in Multiprocess mode you need to set the seed or the data splits won't make sense!!!
    CrossValidations = int(ceil(len(Labels) / float(Arguments.LeaveSomeOut)))

    #Look the phenotype vector up once rather than once per label
    PhenotypeVector = Variates[Features.index(Phenotype)]

    #Get the cases ("1"s) and controls ("0"s) from the Phenotype vector.
    #Each label is classified by the value at its own position in the vector.
    Cases = [Label for Position, Label in enumerate(Labels) if PhenotypeVector[Position]]
    Controls = [Label for Position, Label in enumerate(Labels) if not PhenotypeVector[Position]]

    #Shuffle to get rid of bias that MIGHT be inherent in the original data structure,
    #then deal the labels out as evenly as possible among the cross-validations.
    #Cases are shuffled before controls so a seeded run consumes the RNG in a fixed order.
    shuffle(Cases)
    Cases = {Iteration: Cases[Iteration::CrossValidations] for Iteration in range(CrossValidations)}

    shuffle(Controls)
    Controls = {Iteration: Controls[Iteration::CrossValidations] for Iteration in range(CrossValidations)}

    #Only for the leave ONE out case do we do cases and controls in series.
    #With one split per label the non-empty case bins and the non-empty control
    #bins both sit at the lowest split numbers; reversing the control bins moves
    #them to the highest split numbers so each split holds exactly one label.
    #Both dicts have exactly CrossValidations entries here, so comparing their
    #lengths cannot pick a side; the controls are always the ones reversed.
    if Arguments.LeaveSomeOut == 1:
        SplitNumbers = list(Controls.keys())
        ReversedBins = list(Controls.values())[::-1]
        Controls = dict(zip(SplitNumbers, ReversedBins))

    cross_validation(Arguments, Labels, Features, Variates,
                     Markers, Phenotype, CrossValidations, Cases, Controls)
def leave_some_out(Arguments):
    '''
    Makes the data splits before sending data off for cross validation.

    Every label gets used once. To the extent possible, the same number of
    cases and the same number of controls go into every split, and cases and
    controls are balanced within each individual split.

    Example:
    Split 1: Cancer10, Cancer28, Cancer15, Healthy2 Healthy3
    Split 2: Cancer2, Cancer12, Cancer1, Healthy10, Healthy1
    Split 3: Cancer1, Cancer3, Healthy9, Healthy0
    etc etc

    Arguments -- run configuration; 'LeaveSomeOut' (labels held out per split)
                 is read here, and the object is forwarded unchanged to
                 get_supervised_dataset and cross_validation.

    Nothing is returned; the splits are passed to cross_validation as two
    dicts (cases and controls) keyed by split number.
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    #Only one phenotype at a time for cross validation
    Phenotype = Phenotypes[0]

    #Important: if you are in Multiprocess mode you need to set the seed or the data splits won't make sense!!!
    CrossValidations = int(ceil(len(Labels) / float(Arguments.LeaveSomeOut)))

    #Look the phenotype vector up once rather than once per label
    PhenotypeVector = Variates[Features.index(Phenotype)]

    #Get the cases ("1"s) and controls ("0"s) from the Phenotype vector.
    #Each label is classified by the value at its own position in the vector.
    Cases = [Label for Position, Label in enumerate(Labels) if PhenotypeVector[Position]]
    Controls = [Label for Position, Label in enumerate(Labels) if not PhenotypeVector[Position]]

    #Shuffle to get rid of bias that MIGHT be inherent in the original data structure,
    #then deal the labels out as evenly as possible among the cross-validations.
    #Cases are shuffled before controls so a seeded run consumes the RNG in a fixed order.
    shuffle(Cases)
    Cases = {Iteration: Cases[Iteration::CrossValidations] for Iteration in range(CrossValidations)}

    shuffle(Controls)
    Controls = {Iteration: Controls[Iteration::CrossValidations] for Iteration in range(CrossValidations)}

    #Only for the leave ONE out case do we do cases and controls in series.
    #With one split per label the non-empty case bins and the non-empty control
    #bins both sit at the lowest split numbers; reversing the control bins moves
    #them to the highest split numbers so each split holds exactly one label.
    #Both dicts have exactly CrossValidations entries here, so comparing their
    #lengths cannot pick a side; the controls are always the ones reversed.
    if Arguments.LeaveSomeOut == 1:
        SplitNumbers = list(Controls.keys())
        ReversedBins = list(Controls.values())[::-1]
        Controls = dict(zip(SplitNumbers, ReversedBins))

    cross_validation(Arguments, Labels, Features, Variates,
                     Markers, Phenotype, CrossValidations, Cases, Controls)
def setworks(Arguments):
    '''
    Default implementation for building the MOCA Boolean set networks (setworks).

    For each phenotype assigned to this node, setworks are generated with
    get_setworks, their P-values are adjusted with p_adjust, and only the
    setworks below 'Arguments.FDR' that also satisfy minimum_performance are
    kept. Surviving results are pickled into the directory returned by
    get_path("MOCA.results").

    When 'Arguments.PermutePhenotype' is set, a guidance message is printed
    for the first phenotype processed and the interpreter exits via exit();
    nothing is written in that mode.

    Arguments -- run configuration. Fields read here: Optimization (3 items),
                 BooleanSets (3 items), MultiProcessMode (node, total nodes),
                 CorrectionMethod, FDR, PermutePhenotype, Filename, Data,
                 Phenotype and FeatureMin.

    Nothing is returned.
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    Trials = int(Arguments.Optimization[0])
    RepopulateFrequency = int(Arguments.Optimization[1])
    PercentToRepopulate = float(Arguments.Optimization[2])

    UnionFeatures = int(Arguments.BooleanSets[0])
    IntersectionFeatures = int(Arguments.BooleanSets[1])
    DifferenceFeatures = int(Arguments.BooleanSets[2])

    #MultiProcessMode support if called. Each Phenotype gets its own node
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:

        #The same marker list is supplied for the union, intersection and difference pools
        PValues, Performances, Interactions, FeatureVectors, Setworks, SampleCounts, CaseCounts, EffectSizes = \
            get_setworks(Arguments,
                         Features, Variates, Phenotype,
                         Markers, Markers, Markers,
                         Trials, RepopulateFrequency, PercentToRepopulate,
                         UnionFeatures, IntersectionFeatures, DifferenceFeatures)

        #We only need the intersection of unique setworks passing the FDR threshold.
        #p_adjust's result is indexed by P-value, so go Barcode -> P-value -> adjusted value.
        Adjusted = p_adjust(PValues, Arguments.CorrectionMethod)
        QValues = {Barcode: Adjusted[PValue] for Barcode, PValue in PValues.items()
                   if Adjusted[PValue] < Arguments.FDR}
        Barcodes = list(set(Setworks.keys()).intersection(set(QValues.keys())))

        #finally, if we desire we can filter by performance at this stage. We could do it later, but we'll get a bigger Pickle now.
        Barcodes = [Barcode for Barcode in Barcodes
                    if minimum_performance(Performances[Barcode], Arguments)]

        if Arguments.PermutePhenotype:
            try:
                #min() raises ValueError when nothing passed the FDR filter
                QValue = min(QValues.values())
            except ValueError:
                Message = (
                    "You ran with 'PermutePhenotype = True' and no setworks could be generated that passed your filters --",
                    "this is a great start! You should do this a minimum of 10 times and set your FDR threshold (i.e., 'FDR = threshold'",
                    "in your Arguments file) AT LEAST one order of magnitude lower than the lowest observed during permutation testing.",
                    "This conservative threshold will help ensure that results observed during your 'real' setworks run are statisically",
                    "reliable. Exiting...",
                )
            else:
                Message = (
                    "Permutation test failed: you ran with 'PermutePhenotype = True' and setworks could be generated that passed your filters!!!",
                    "This means that your current FDR cutoff is not sufficient for this data. The minimum FDR observed during",
                    "this permutation test was " + str(QValue) + ". You should do this a minimum of 10 times and set your FDR",
                    "threshold (i.e., 'FDR = threshold' in your Arguments file) AT LEAST one order of magnitude lower than the",
                    "lowest observed during permutation testing. This conservative threshold will help ensure that results",
                    "observed during your 'real' setworks run are statisically reliable. The setworks that passed your filters",
                    "for this permutation testing have been saved; if you care to see what features made it thru you can use",
                    "the standard 'Mode = PostProcess' to veiw them. Exiting...",
                )

            #The fragments are emitted as one space-separated line followed by a newline
            print(" ".join(Message))

            exit()

        if Barcodes:
            Results = {}
            Results["PValues"] = {Barcode: PValues[Barcode] for Barcode in Barcodes}
            Results["QValues"] = {Barcode: QValues[Barcode] for Barcode in Barcodes}
            Results["Performances"] = {Barcode: Performances[Barcode] for Barcode in Barcodes}
            Results["Interactions"] = {Barcode: Interactions[Barcode] for Barcode in Barcodes}
            Results["FeatureVectors"] = {Barcode: FeatureVectors[Barcode] for Barcode in Barcodes}
            #Each Setworks entry is indexed 0/1/2 for its union/intersection/difference features
            Results["UnionFeatures"] = {Barcode: Setworks[Barcode][0] for Barcode in Barcodes}
            Results["IntersectionFeatures"] = {Barcode: Setworks[Barcode][1] for Barcode in Barcodes}
            Results["DifferenceFeatures"] = {Barcode: Setworks[Barcode][2] for Barcode in Barcodes}
            Results["SampleCounts"] = {Barcode: SampleCounts[Barcode] for Barcode in Barcodes}
            Results["CaseCounts"] = {Barcode: CaseCounts[Barcode] for Barcode in Barcodes}
            Results["EffectSizes"] = {Barcode: EffectSizes[Barcode] for Barcode in Barcodes}
            Results["Report"] = make_report(Labels, Phenotype, Barcodes, Arguments)
            Results["Labels"] = Labels
            Results["Barcodes"] = Barcodes
            Results["Phenotype"] = Variates[Features.index(Phenotype)]

            #Only the part of the phenotype name before the first ":" goes into the file name
            PhenotypeName = Phenotype[:Phenotype.index(":")]

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(["Phenotype=" + PhenotypeName,
                                   "_".join(sorted(DataTypes)),
                                   str(Arguments.FeatureMin),
                                   Arguments.CorrectionMethod,
                                   "".join(map(str, Arguments.BooleanSets)),
                                   "".join(map(str, Arguments.Optimization))])
            else:
                Pickle = Arguments.Filename + "_" + PhenotypeName

            #Close the output file deterministically instead of leaking the handle
            with open(get_path("MOCA.results") + "/" + Pickle, "wb") as PickleFile:
                cPickle.dump(Results, PickleFile, -1)

        else:
            NoResults = (
                "No setworks were generated. This could mean your data set is not sufficiently powered for deriving setworks",
                "or that you set your filters unreasonably strict. Exiting...",
            )
            #Python 2 print statement: the trailing comma suppresses the newline
            print(" ".join(NoResults)),
def supervised(Arguments):
    '''
    MOCA pairwise calculations executed if a 'Phenotype' is provided in the
    Arguments file.

    Not technically supervised 'learning', as there is no optimization (every
    possible pairwise comparison is tested). Each marker is compared with the
    phenotype through a contingency table and Fisher's test; markers whose
    adjusted P-value is below 'Arguments.FDR' are kept together with their
    interaction, performance, effect size and sample/case counts. If any
    marker survives, the results are pickled into the directory returned by
    get_path("MOCA.results").

    Arguments -- run configuration. Fields read here: MultiProcessMode (node,
                 total nodes), NA, CorrectionMethod, FDR, Filename, Data,
                 Phenotype and FeatureMin.

    Nothing is returned.
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    #Clustermode support if called
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:

        PValues = {}
        Interactions = {}
        Performances = {}
        SampleCounts = {}
        CaseCounts = {} #just the positive class here
        EffectSizes = {}

        #The phenotype vector is the same for every marker, so look it up once
        PhenotypeVector = Variates[Features.index(Phenotype)]

        for Marker in Markers:
            TP, FP, FN, TN = contingency_table(Variates[Features.index(Marker)],
                                               PhenotypeVector, NA=Arguments.NA)
            PValue = fisher(TP, FP, FN, TN)
            PValues[Marker] = PValue.two_tail
            Interaction = interaction(PValue)
            Interactions[Marker] = Interaction
            Performances[Marker] = Performance(Interaction, TP, FP, FN, TN)
            EffectSizes[Marker] = EffectSize(Interaction, TP, FP, FN, TN)
            SampleCounts[Marker] = TP + FP + FN + TN
            CaseCounts[Marker] = TP + FN

        #p_adjust's result is indexed by P-value, not by marker
        FDRs = p_adjust(PValues, Arguments.CorrectionMethod)

        #Drop every marker that does not pass the FDR threshold. Iterate over a
        #snapshot of the keys because entries are removed while we go.
        for Marker in list(PValues):
            FDR = FDRs[PValues[Marker]]
            if not FDR < Arguments.FDR:
                for Table in (PValues, Interactions, Performances,
                              SampleCounts, CaseCounts, EffectSizes):
                    Table.pop(Marker, None)

        if PValues:
            #Only the part of the phenotype name before the first ":" is used in names
            PhenotypeName = Phenotype[:Phenotype.index(":")]

            Results = {}
            Results["Report"] = make_report(Labels, PValues.keys(), Arguments, Supervised=PhenotypeName)
            Results["PValues"] = PValues
            Results["Interactions"] = Interactions
            Results["Performances"] = Performances
            Results["FDRs"] = FDRs #unfiltered: still holds every adjusted value
            Results["SampleCounts"] = SampleCounts
            Results["CaseCounts"] = CaseCounts
            Results["EffectSizes"] = EffectSizes

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(["Pairwise", "Phenotype=" + PhenotypeName,
                                   "_".join(sorted(DataTypes)), str(Arguments.FeatureMin),
                                   Arguments.CorrectionMethod])
            else:
                Pickle = Arguments.Filename + "_" + PhenotypeName

            #Close the output file deterministically instead of leaking the handle
            with open(get_path("MOCA.results") + "/" + Pickle, "wb") as PickleFile:
                cPickle.dump(Results, PickleFile, -1)
def supervised(Arguments):
    """
    MOCA pairwise calculations executed if a 'Phenotype' is provided in the
    Arguments file.

    Not technically supervised 'learning', as there is no optimization (every
    possible pairwise comparison is tested). Each marker is compared with the
    phenotype through a contingency table and Fisher's test; markers whose
    adjusted P-value is below 'Arguments.FDR' are kept together with their
    interaction, performance, effect size and sample/case counts. If any
    marker survives, the results are pickled into the directory returned by
    get_path("MOCA.results").

    Arguments -- run configuration. Fields read here: MultiProcessMode (node,
                 total nodes), NA, CorrectionMethod, FDR, Filename, Data,
                 Phenotype and FeatureMin.

    Nothing is returned.
    """

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    # Clustermode support if called
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:

        PValues = {}
        Interactions = {}
        Performances = {}
        SampleCounts = {}
        CaseCounts = {}  # just the positive class here
        EffectSizes = {}

        # The phenotype vector is the same for every marker, so look it up once
        PhenotypeVector = Variates[Features.index(Phenotype)]

        for Marker in Markers:
            TP, FP, FN, TN = contingency_table(
                Variates[Features.index(Marker)], PhenotypeVector, NA=Arguments.NA
            )
            PValue = fisher(TP, FP, FN, TN)
            PValues[Marker] = PValue.two_tail
            Interaction = interaction(PValue)
            Interactions[Marker] = Interaction
            Performances[Marker] = Performance(Interaction, TP, FP, FN, TN)
            EffectSizes[Marker] = EffectSize(Interaction, TP, FP, FN, TN)
            SampleCounts[Marker] = TP + FP + FN + TN
            CaseCounts[Marker] = TP + FN

        # p_adjust's result is indexed by P-value, not by marker
        FDRs = p_adjust(PValues, Arguments.CorrectionMethod)

        # Drop every marker that does not pass the FDR threshold. Iterate over a
        # snapshot of the keys because entries are removed while we go.
        for Marker in list(PValues):
            FDR = FDRs[PValues[Marker]]
            if not FDR < Arguments.FDR:
                for Table in (
                    PValues,
                    Interactions,
                    Performances,
                    SampleCounts,
                    CaseCounts,
                    EffectSizes,
                ):
                    Table.pop(Marker, None)

        if PValues:
            # Only the part of the phenotype name before the first ":" is used in names
            PhenotypeName = Phenotype[: Phenotype.index(":")]

            Results = {}
            Results["Report"] = make_report(
                Labels, PValues.keys(), Arguments, Supervised=PhenotypeName
            )
            Results["PValues"] = PValues
            Results["Interactions"] = Interactions
            Results["Performances"] = Performances
            Results["FDRs"] = FDRs  # unfiltered: still holds every adjusted value
            Results["SampleCounts"] = SampleCounts
            Results["CaseCounts"] = CaseCounts
            Results["EffectSizes"] = EffectSizes

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(
                    [
                        "Pairwise",
                        "Phenotype=" + PhenotypeName,
                        "_".join(sorted(DataTypes)),
                        str(Arguments.FeatureMin),
                        Arguments.CorrectionMethod,
                    ]
                )
            else:
                Pickle = Arguments.Filename + "_" + PhenotypeName

            # Close the output file deterministically instead of leaking the handle
            with open(get_path("MOCA.results") + "/" + Pickle, "wb") as PickleFile:
                cPickle.dump(Results, PickleFile, -1)