def get_setworks(Arguments,
                 Features, Variates, Phenotype,
                 UnionMarkers, IntersectionMarkers, DifferenceMarkers,
                 Trials, RepopulateFrequency, PercentToRepopulate,
                 UnionFeatures, IntersectionFeatures, DifferenceFeatures):
    '''
    Build MOCA setworks (networks of features combined using Boolean set
    operations) and score every assembled setwork against a phenotype.

    Features, Variates = parallel lists; the response vector is the entry of
    Variates found at the position of Phenotype in Features.
    Phenotype = the thing you are selecting markers for.
    UnionMarkers, IntersectionMarkers, DifferenceMarkers = the initial pool for
    each Boolean type. These lists are extended in place every time the pools
    are repopulated from the ranked setworks.
    Trials = number of sampling rounds.
    RepopulateFrequency = the pools are repopulated on every trial whose index
    is a non-zero multiple of this value.
    PercentToRepopulate = multiplied by the number of scored setworks to get
    the count handed to rank().
    UnionFeatures, IntersectionFeatures, DifferenceFeatures = max number of
    markers sampled for each operation in a single trial.

    If Arguments.PermutePhenotype is set, the response vector is shuffled in
    place (it is the same object that sits inside Variates).

    Returns the tuple (PValues, Performances, Interactions, FeatureVectors,
    Setworks, SampleCounts, CaseCounts, EffectSizes), each a dictionary keyed
    by barcode. Ejected passengers are removed from Setworks and Performances
    only, and Setworks is additionally stripped of redundant entries, so those
    dictionaries can hold fewer barcodes than the others.
    '''

    def _sample_and_combine(Pool, Limit):
        #Draw at most Limit markers from the pool, then enumerate their combinations
        Sample = weighted_sample(Pool, min(Limit, len(Pool)))
        return combine_features(Sample, 0, len(Sample))

    Setworks = {}
    FeatureVectors = {}
    PValues = {}
    Interactions = {}
    Performances = {}
    EffectSizes = {}
    SampleCounts = {}
    CaseCounts = {} #positive class only

    #Priors are read once, up front, so that they are not re-read on every trial
    if Arguments.Priors:
        Priors = get_priors(Arguments)

    #The response is fetched before the trials start, so a permutation happens only once
    Response = Variates[Features.index(Phenotype)]
    if Arguments.PermutePhenotype:
        shuffle(Response)

    for Trial in range(Trials):

        #Feed the markers of the best-ranked setworks back into the pools
        if Trial > 0 and Trial % RepopulateFrequency == 0:
            TopCount = int(len(Performances) * PercentToRepopulate)
            for Barcode in rank(Performances, Arguments.RankMethod, TopCount):
                Unions, Intersections, Differences = Setworks[Barcode]
                UnionMarkers.extend(Unions)
                IntersectionMarkers.extend(Intersections)
                DifferenceMarkers.extend(Differences)

        UnionCombinations = _sample_and_combine(UnionMarkers, UnionFeatures)
        IntersectionCombinations = _sample_and_combine(IntersectionMarkers, IntersectionFeatures)
        DifferenceCombinations = _sample_and_combine(DifferenceMarkers, DifferenceFeatures)

        if Arguments.Priors:
            UnionCombinations, IntersectionCombinations, DifferenceCombinations = priors(
                Priors, UnionCombinations, IntersectionCombinations, DifferenceCombinations, Arguments)

        TrialBarcodes = [] #handed to eject_passengers at the end of this trial
        for UnionCombination in UnionCombinations:
            for IntersectionCombination in IntersectionCombinations:
                for DifferenceCombination in DifferenceCombinations:

                    FeatureVector = assemble_setwork(
                        Features, Variates,
                        UnionCombination, IntersectionCombination, DifferenceCombination,
                        Arguments)
                    if not FeatureVector:
                        continue

                    TP, FP, FN, TN = contingency_table(FeatureVector, Response, NA=Arguments.NA)
                    PValue = fisher(TP, FP, FN, TN)
                    Setwork = [sorted(UnionCombination),
                               sorted(IntersectionCombination),
                               sorted(DifferenceCombination)]

                    #Note: both checks below 'break', which abandons the remaining difference
                    #combinations for the current union/intersection pair
                    if Arguments.ForceCooccurring and interaction(PValue) == "MutuallyExclusive":
                        break

                    #Outside of Bandwidth mode a feature may appear only once per setwork
                    if not Arguments.Bandwidth:
                        Multiplicity = max(Counter(reduced_features(Setwork)).values())
                        if Multiplicity != 1:
                            break

                    Barcode = barcode()
                    Setworks[Barcode] = tuple(map(tuple, Setwork))
                    PValues[Barcode] = PValue.two_tail
                    SampleCounts[Barcode] = TP + FP + FN + TN
                    CaseCounts[Barcode] = TP + FN
                    Kind = interaction(PValue)
                    Interactions[Barcode] = Kind
                    Performances[Barcode] = Performance(Kind, TP, FP, FN, TN)
                    EffectSizes[Barcode] = EffectSize(Kind, TP, FP, FN, TN)
                    FeatureVectors[Barcode] = FeatureVector
                    TrialBarcodes.append(Barcode)

        if Arguments.EjectPassengers:
            for Barcode in eject_passengers(TrialBarcodes, Setworks, PValues, Arguments):
                Setworks.pop(Barcode, None)
                #Dropped from Performances too, so passengers cannot repopulate the pools
                Performances.pop(Barcode, None)

    #Key by setwork for a moment so that the dictionary collapses redundant setworks,
    #then flip back so that barcodes are the keys again
    BarcodeBySetwork = {}
    for Barcode, Setwork in Setworks.items():
        BarcodeBySetwork[Setwork] = Barcode
    Setworks = {Barcode: Setwork for Setwork, Barcode in BarcodeBySetwork.items()}

    return (PValues, Performances, Interactions, FeatureVectors,
            Setworks, SampleCounts, CaseCounts, EffectSizes)
def unsupervised(Arguments):
    '''
    Pairwise MOCA calculations that are executed if the Phenotype argument is False
    (the default). Similar to so-called 'supervised' pairwise mode, except that no
    performance metrics are calculated (sens, spec, PPV, NPV, etc.). In unsupervised
    mode, you can compare all inter-datatype pairs for two datatypes, or all
    intra-datatype pairs for a single datatype.

    Arguments = the MOCA arguments object; this function reads its Data, NA,
    CorrectionMethod, FDR, Filename and FeatureMin fields.

    Every pair whose corrected value (as returned by p_adjust) is not below
    Arguments.FDR is dropped. The surviving results are pickled into the
    directory given by get_path("MOCA.results"). Nothing is returned. If more
    than two datatypes are supplied, a message is printed and the program exits.
    '''

    #NOTE(review): a second function named 'unsupervised' is defined later in this
    #file; at import time that later definition replaces this one.

    if len(Arguments.Data) > 2:
        print("Unsupervised pairwise calculations can consider no more that two datatypes at a time.")
        print("If you provide only one datatype, all intra-datatype pairs will be considered. If you")
        print("provide two datatypes, all inter-datatype comparisons will be made. Please change the")
        print("'Data = ' field. Exiting...")
        exit()

    Data = load_data(Arguments)

    Features = list(chain(*Data.Transformed.Features.values()))
    Variates = list(chain(*Data.Transformed.Variates.values()))

    if len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features

    if len(Arguments.Data) == 2:
        Features1 = Data.Transformed.Features[Arguments.Data[0]]
        Features2 = Data.Transformed.Features[Arguments.Data[1]]

    #Record the first position of every feature once, rather than scanning the
    #list with Features.index() for each pair
    Position = {}
    for Index, Feature in enumerate(Features):
        Position.setdefault(Feature, Index)

    PValues = {}
    Interactions = {}
    SampleCounts = {}
    CaseCounts = {} #just the positive class here
    Performances = {}
    EffectSizes = {}

    #A set gives constant-time lookups; it stops a feature from being paired with
    #itself or with a feature that has already served as Feature1
    Tested = set()
    for Feature1 in Features1:
        Tested.add(Feature1)
        Variate1 = Variates[Position[Feature1]]
        for Feature2 in Features2:
            if Feature2 in Tested:
                continue
            Pair = (Feature1, Feature2)
            a, b, c, d = contingency_table(Variate1, Variates[Position[Feature2]],
                                           NA=Arguments.NA)
            PValue = fisher(a, b, c, d)
            PValues[Pair] = PValue.two_tail
            Interactions[Pair] = interaction(PValue)
            SampleCounts[Pair] = a + b + c + d
            CaseCounts[Pair] = a + c
            #A placeholder solely to make pairwise post-processing generalizable
            Performances[Pair] = "NA"
            EffectSizes[Pair] = "NA"

    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)

    #Iterate over a snapshot because entries are removed from PValues inside the loop
    for Pair, PValue in list(PValues.items()):
        if not FDRs[PValue] < Arguments.FDR:
            for Table in (PValues, Interactions, SampleCounts,
                          CaseCounts, Performances, EffectSizes):
                Table.pop(Pair, None)

    Results = {}
    Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments)
    Results["PValues"] = PValues
    Results["Interactions"] = Interactions
    Results["FDRs"] = FDRs
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["Performances"] = Performances
    Results["EffectSizes"] = EffectSizes

    if Arguments.Filename.lower() == "default":
        Pickle = "_".join(["Pairwise", "_".join(sorted(Arguments.Data)),
                           str(Arguments.FeatureMin), Arguments.CorrectionMethod])
    else:
        Pickle = Arguments.Filename

    #The 'with' block guarantees the results file is flushed and closed
    with open(get_path("MOCA.results") + "/" + Pickle, "wb") as Handle:
        cPickle.dump(Results, Handle, -1)

    return
def unsupervised(Arguments):
    """
    Pairwise MOCA calculations that are executed if the Phenotype argument is False
    (the default). Similar to so-called 'supervised' pairwise mode, except that no
    performance metrics are calculated (sens, spec, PPV, NPV, etc.). In unsupervised
    mode, you can compare all inter-datatype pairs for two datatypes, or all
    intra-datatype pairs for a single datatype.

    Arguments = the MOCA arguments object; this function reads its Data, NA,
    CorrectionMethod, FDR, Filename and FeatureMin fields.

    Every pair whose corrected value (as returned by p_adjust) is not below
    Arguments.FDR is dropped. The surviving results are pickled into the
    directory given by get_path("MOCA.results"). Nothing is returned. If more
    than two datatypes are supplied, a message is printed and the program exits.
    """

    # NOTE(review): this re-defines 'unsupervised', which is also defined earlier
    # in this file; being the later definition, this is the one left in effect.

    if len(Arguments.Data) > 2:
        print("Unsupervised pairwise calculations can consider no more that two datatypes at a time.")
        print("If you provide only one datatype, all intra-datatype pairs will be considered. If you")
        print("provide two datatypes, all inter-datatype comparisons will be made. Please change the")
        print("'Data = ' field. Exiting...")
        exit()

    Data = load_data(Arguments)

    Features = list(chain(*Data.Transformed.Features.values()))
    Variates = list(chain(*Data.Transformed.Variates.values()))

    if len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features

    if len(Arguments.Data) == 2:
        Features1 = Data.Transformed.Features[Arguments.Data[0]]
        Features2 = Data.Transformed.Features[Arguments.Data[1]]

    # Record the first position of every feature once, rather than scanning the
    # list with Features.index() for each pair
    Position = {}
    for Index, Feature in enumerate(Features):
        Position.setdefault(Feature, Index)

    PValues = {}
    Interactions = {}
    SampleCounts = {}
    CaseCounts = {}  # just the positive class here
    Performances = {}
    EffectSizes = {}

    # A set gives constant-time lookups; it stops a feature from being paired with
    # itself or with a feature that has already served as Feature1
    Tested = set()
    for Feature1 in Features1:
        Tested.add(Feature1)
        Variate1 = Variates[Position[Feature1]]
        for Feature2 in Features2:
            if Feature2 in Tested:
                continue
            Pair = (Feature1, Feature2)
            a, b, c, d = contingency_table(
                Variate1, Variates[Position[Feature2]], NA=Arguments.NA
            )
            PValue = fisher(a, b, c, d)
            PValues[Pair] = PValue.two_tail
            Interactions[Pair] = interaction(PValue)
            SampleCounts[Pair] = a + b + c + d
            CaseCounts[Pair] = a + c
            # A placeholder solely to make pairwise post-processing generalizable
            Performances[Pair] = "NA"
            EffectSizes[Pair] = "NA"

    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)

    # Iterate over a snapshot because entries are removed from PValues inside the loop
    for Pair, PValue in list(PValues.items()):
        if not FDRs[PValue] < Arguments.FDR:
            for Table in (PValues, Interactions, SampleCounts, CaseCounts, Performances, EffectSizes):
                Table.pop(Pair, None)

    Results = {}
    Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments)
    Results["PValues"] = PValues
    Results["Interactions"] = Interactions
    Results["FDRs"] = FDRs
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["Performances"] = Performances
    Results["EffectSizes"] = EffectSizes

    if Arguments.Filename.lower() == "default":
        Pickle = "_".join(
            ["Pairwise", "_".join(sorted(Arguments.Data)), str(Arguments.FeatureMin), Arguments.CorrectionMethod]
        )
    else:
        Pickle = Arguments.Filename

    # The 'with' block guarantees the results file is flushed and closed
    with open(get_path("MOCA.results") + "/" + Pickle, "wb") as Handle:
        cPickle.dump(Results, Handle, -1)

    return
def supervised(Arguments):
    '''
    MOCA pairwise calculations executed if a 'Phenotype' is provided in the Arguments
    file. Not technically supervised 'learning', as there is no optimization (every
    possible pairwise comparison is tested). Output includes performance metrics such
    as sensitivity, specificity, PPV, and NPV, for each feature's ability to predict
    the phenotype.

    Arguments = the MOCA arguments object; this function reads its MultiProcessMode,
    NA, CorrectionMethod, FDR, Filename, Data, Phenotype and FeatureMin fields.

    Each phenotype handled by this node is tested against every marker. Markers
    whose corrected value (as returned by p_adjust) is not below Arguments.FDR are
    dropped; if any remain, one results file per phenotype is pickled into the
    directory given by get_path("MOCA.results"). Nothing is returned.
    '''

    #NOTE(review): a second function named 'supervised' is defined later in this
    #file; at import time that later definition replaces this one.

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    #Clustermode support if called: this node takes every TotalNodes-th phenotype,
    #starting at position Node
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    #Record the first position of every feature once, rather than scanning the
    #list with Features.index() for each marker of each phenotype
    Position = {}
    for Index, Feature in enumerate(Features):
        Position.setdefault(Feature, Index)

    for Phenotype in Phenotypes[Node::TotalNodes]:

        PValues = {}
        Interactions = {}
        Performances = {}
        SampleCounts = {}
        CaseCounts = {} #just the positive class here
        EffectSizes = {}

        #The response vector is the same for every marker, so look it up once
        Response = Variates[Position[Phenotype]]

        for Marker in Markers:
            TP, FP, FN, TN = contingency_table(Variates[Position[Marker]], Response,
                                               NA=Arguments.NA)
            PValue = fisher(TP, FP, FN, TN)
            PValues[Marker] = PValue.two_tail
            Interaction = interaction(PValue)
            Interactions[Marker] = Interaction
            Performances[Marker] = Performance(Interaction, TP, FP, FN, TN)
            EffectSizes[Marker] = EffectSize(Interaction, TP, FP, FN, TN)
            SampleCounts[Marker] = TP + FP + FN + TN
            CaseCounts[Marker] = TP + FN

        FDRs = p_adjust(PValues, Arguments.CorrectionMethod)

        #Walk a snapshot of PValues rather than Markers: a marker listed twice would
        #otherwise be looked up again after it had already been removed
        for Marker, PValue in list(PValues.items()):
            if not FDRs[PValue] < Arguments.FDR:
                for Table in (PValues, Interactions, Performances,
                              SampleCounts, CaseCounts, EffectSizes):
                    Table.pop(Marker, None)

        if PValues:
            #Phenotype identifiers are expected to contain ':'; only the part before it
            #is used for the report and the file name
            PhenotypeName = Phenotype[:Phenotype.index(":")]

            Results = {}
            Results["Report"] = make_report(Labels, PValues.keys(), Arguments,
                                            Supervised=PhenotypeName)
            Results["PValues"] = PValues
            Results["Interactions"] = Interactions
            Results["Performances"] = Performances
            Results["FDRs"] = FDRs
            Results["SampleCounts"] = SampleCounts
            Results["CaseCounts"] = CaseCounts
            Results["EffectSizes"] = EffectSizes

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(["Pairwise", "Phenotype=" + PhenotypeName,
                                   "_".join(sorted(DataTypes)),
                                   str(Arguments.FeatureMin), Arguments.CorrectionMethod])
            else:
                Pickle = Arguments.Filename + "_" + PhenotypeName

            #The 'with' block guarantees each results file is flushed and closed
            with open(get_path("MOCA.results") + "/" + Pickle, "wb") as Handle:
                cPickle.dump(Results, Handle, -1)

    return
def supervised(Arguments):
    """
    MOCA pairwise calculations executed if a 'Phenotype' is provided in the Arguments
    file. Not technically supervised 'learning', as there is no optimization (every
    possible pairwise comparison is tested). Output includes performance metrics such
    as sensitivity, specificity, PPV, and NPV, for each feature's ability to predict
    the phenotype.

    Arguments = the MOCA arguments object; this function reads its MultiProcessMode,
    NA, CorrectionMethod, FDR, Filename, Data, Phenotype and FeatureMin fields.

    Each phenotype handled by this node is tested against every marker. Markers
    whose corrected value (as returned by p_adjust) is not below Arguments.FDR are
    dropped; if any remain, one results file per phenotype is pickled into the
    directory given by get_path("MOCA.results"). Nothing is returned.
    """

    # NOTE(review): this re-defines 'supervised', which is also defined earlier
    # in this file; being the later definition, this is the one left in effect.

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    # Clustermode support if called: this node takes every TotalNodes-th phenotype,
    # starting at position Node
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    # Record the first position of every feature once, rather than scanning the
    # list with Features.index() for each marker of each phenotype
    Position = {}
    for Index, Feature in enumerate(Features):
        Position.setdefault(Feature, Index)

    for Phenotype in Phenotypes[Node::TotalNodes]:

        PValues = {}
        Interactions = {}
        Performances = {}
        SampleCounts = {}
        CaseCounts = {}  # just the positive class here
        EffectSizes = {}

        # The response vector is the same for every marker, so look it up once
        Response = Variates[Position[Phenotype]]

        for Marker in Markers:
            TP, FP, FN, TN = contingency_table(
                Variates[Position[Marker]], Response, NA=Arguments.NA
            )
            PValue = fisher(TP, FP, FN, TN)
            PValues[Marker] = PValue.two_tail
            Interaction = interaction(PValue)
            Interactions[Marker] = Interaction
            Performances[Marker] = Performance(Interaction, TP, FP, FN, TN)
            EffectSizes[Marker] = EffectSize(Interaction, TP, FP, FN, TN)
            SampleCounts[Marker] = TP + FP + FN + TN
            CaseCounts[Marker] = TP + FN

        FDRs = p_adjust(PValues, Arguments.CorrectionMethod)

        # Walk a snapshot of PValues rather than Markers: a marker listed twice would
        # otherwise be looked up again after it had already been removed
        for Marker, PValue in list(PValues.items()):
            if not FDRs[PValue] < Arguments.FDR:
                for Table in (PValues, Interactions, Performances, SampleCounts, CaseCounts, EffectSizes):
                    Table.pop(Marker, None)

        if PValues:
            # Phenotype identifiers are expected to contain ':'; only the part before it
            # is used for the report and the file name
            PhenotypeName = Phenotype[: Phenotype.index(":")]

            Results = {}
            Results["Report"] = make_report(
                Labels, PValues.keys(), Arguments, Supervised=PhenotypeName
            )
            Results["PValues"] = PValues
            Results["Interactions"] = Interactions
            Results["Performances"] = Performances
            Results["FDRs"] = FDRs
            Results["SampleCounts"] = SampleCounts
            Results["CaseCounts"] = CaseCounts
            Results["EffectSizes"] = EffectSizes

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(
                    [
                        "Pairwise",
                        "Phenotype=" + PhenotypeName,
                        "_".join(sorted(DataTypes)),
                        str(Arguments.FeatureMin),
                        Arguments.CorrectionMethod,
                    ]
                )
            else:
                Pickle = Arguments.Filename + "_" + PhenotypeName

            # The 'with' block guarantees each results file is flushed and closed
            with open(get_path("MOCA.results") + "/" + Pickle, "wb") as Handle:
                cPickle.dump(Results, Handle, -1)

    return