Esempio n. 1
0
def setworks(Arguments):
    '''
    Default implementation for building the MOCA Boolean set networks (setworks). 
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(
        Arguments)

    Trials = int(Arguments.Optimization[0])
    RepopulateFrequency = int(Arguments.Optimization[1])
    PercentToRepopulate = float(Arguments.Optimization[2])

    UnionFeatures = int(Arguments.BooleanSets[0])
    IntersectionFeatures = int(Arguments.BooleanSets[1])
    DifferenceFeatures = int(Arguments.BooleanSets[2])

    #MultiProcessMode support if called. Each Phenotype gets its own node
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:

        PValues, Performances, Interactions, FeatureVectors, Setworks, SampleCounts, CaseCounts, EffectSizes = \
            get_setworks(Arguments, \
                             Features, Variates, Phenotype, \
                             Markers, Markers, Markers, \
                             Trials, RepopulateFrequency, PercentToRepopulate, \
                             UnionFeatures, IntersectionFeatures, DifferenceFeatures)

        #We only need the intersection of unique setworks passing the FDR threshold
        QValues = p_adjust(PValues, Arguments.CorrectionMethod)
        QValues = dict([(Barcode, QValues[PValue]) for Barcode, PValue in PValues.items() \
                            if QValues[PValue] < Arguments.FDR])
        Barcodes = list(
            set.intersection(set(Setworks.keys()), set(QValues.keys())))

        #finally, if we desire we can filter by performance at this stage. We could do it later, but we'll get a bigger Pickle now.
        Barcodes = [
            Barcode for Barcode in Barcodes
            if minimum_performance(Performances[Barcode], Arguments)
        ]

        if Arguments.PermutePhenotype:
            try:
                QValue = min(QValues.values())
                print "Permutation test failed: you ran with 'PermutePhenotype = True' and setworks could be generated that passed your filters!!!",
                print "This means that your current FDR cutoff is not sufficient for this data. The minimum FDR observed during",
                print "this permutation test was " + str(
                    QValue
                ) + ". You should do this a minimum of 10 times and set your FDR",
                print "threshold (i.e., 'FDR = threshold' in your Arguments file) AT LEAST one order of magnitude lower than the",
                print "lowest observed during permutation testing. This conservative threshold will help ensure that results",
                print "observed during your 'real' setworks run are statisically reliable. The setworks that passed your filters",
                print "for this permutation testing have been saved; if you care to see what features made it thru you can use",
                print "the standard 'Mode = PostProcess' to veiw them. Exiting..."

            except ValueError:
                print "You ran with 'PermutePhenotype = True' and no setworks could be generated that passed your filters --",
                print "this is a great start! You should do this a minimum of 10 times and set your FDR threshold (i.e., 'FDR = threshold'",
                print "in your Arguments file) AT LEAST one order of magnitude lower than the lowest observed during permutation testing.",
                print "This conservative threshold will help ensure that results observed during your 'real' setworks run are statisically",
                print "reliable. Exiting..."
                exit()

        if len(Barcodes):

            Results = {}
            Results["PValues"] = dict([(Barcode, PValues[Barcode])
                                       for Barcode in Barcodes])
            Results["QValues"] = dict([(Barcode, QValues[Barcode])
                                       for Barcode in Barcodes])
            Results["Performances"] = dict([(Barcode, Performances[Barcode])
                                            for Barcode in Barcodes])
            Results["Interactions"] = dict([(Barcode, Interactions[Barcode])
                                            for Barcode in Barcodes])
            Results["FeatureVectors"] = dict([
                (Barcode, FeatureVectors[Barcode]) for Barcode in Barcodes
            ])
            Results["UnionFeatures"] = dict([(Barcode, Setworks[Barcode][0])
                                             for Barcode in Barcodes])
            Results["IntersectionFeatures"] = dict([
                (Barcode, Setworks[Barcode][1]) for Barcode in Barcodes
            ])
            Results["DifferenceFeatures"] = dict([
                (Barcode, Setworks[Barcode][2]) for Barcode in Barcodes
            ])
            Results["SampleCounts"] = dict([(Barcode, SampleCounts[Barcode])
                                            for Barcode in Barcodes])
            Results["CaseCounts"] = dict([(Barcode, CaseCounts[Barcode])
                                          for Barcode in Barcodes])
            Results["EffectSizes"] = dict([(Barcode, EffectSizes[Barcode])
                                           for Barcode in Barcodes])
            Results["Report"] = make_report(Labels, Phenotype, Barcodes,
                                            Arguments)
            Results["Labels"] = Labels
            Results["Barcodes"] = Barcodes
            Results["Phenotype"] = Variates[Features.index(Phenotype)]

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(
                    set([Arguments.Phenotype]))
                Pickle = "_".join([
                    "Phenotype=" + Phenotype[:Phenotype.index(":")],
                    "_".join(sorted(DataTypes)),
                    str(Arguments.FeatureMin), Arguments.CorrectionMethod,
                    "".join(map(str, Arguments.BooleanSets)),
                    "".join(map(str, Arguments.Optimization))
                ])
            else:
                Pickle = Arguments.Filename + "_" + Phenotype[:Phenotype.
                                                              index(":")]

                cPickle.dump(
                    Results,
                    open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

        else:
            print "No setworks were generated. This could mean your data set is not sufficiently powered for deriving setworks",
            print "or that you set your filters unreasonably strict. Exiting...",

    return
Esempio n. 2
0
def cross_validation(Arguments, \
                         Labels, Features, Variates, \
                         Markers, Phenotype, CrossValidations, Cases, Controls):
    '''
    Run the training half of a cross-validation experiment for one Phenotype.

    Labels, Features, and Variates have the usual MOCA meanings; Markers is
    every feature not designated as the Phenotype (one Phenotype per
    LeaveSomeOut run). CrossValidations is the integer fold count (e.g.,
    leave-one-out or ten-fold). Cases and Controls map each fold index to the
    labels withheld from training for that fold. For every fold handled by
    this node, setworks are built from the training split, filtered by FDR
    and minimum performance, and pickled to MOCA.results as
    "<name>.Validation<fold>".
    '''

    #Setwork optimization parameters
    Trials = int(Arguments.Optimization[0])
    RepopulateFrequency = int(Arguments.Optimization[1])
    PercentToRepopulate = float(Arguments.Optimization[2])

    #Boolean set parameters
    UnionFeatures = int(Arguments.BooleanSets[0])
    IntersectionFeatures = int(Arguments.BooleanSets[1])
    DifferenceFeatures = int(Arguments.BooleanSets[2])

    #Cross-validation is compute intensive: one Phenotype at a time, with the
    #individual folds distributed across processors under MultiProcessMode
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Fold in range(CrossValidations)[Node::TotalNodes]:

        #Train on every label not withheld for this fold
        HeldOut = Cases[Fold] + Controls[Fold]
        TrainLabels = list(set(Labels) - set(HeldOut))
        TrainVariates = get_ordered_matrix(TrainLabels, Labels, Variates)

        #Build setworks from the training data only
        PValues, Performances, Interactions, FeatureVectors, Setworks, SampleCounts, CaseCounts, EffectSizes = \
            get_setworks(Arguments, \
                             Features, TrainVariates, Phenotype, \
                             Markers, Markers, Markers, \
                             Trials, RepopulateFrequency, PercentToRepopulate, \
                             UnionFeatures, IntersectionFeatures, DifferenceFeatures)

        #First row: held-out case/control labels; second row: phenotype vector.
        #NOTE(review): this matrix is built but never referenced again below --
        #possibly an unfinished piece of the docstring's promised output; confirm.
        CrossValidationFeatureMatrix = [HeldOut]
        PhenotypeVector = [1 for Case in Cases[Fold]] + [0 for Control in Controls[Fold]]
        CrossValidationFeatureMatrix.append(PhenotypeVector)

        #Keep only unique setworks whose corrected p-value clears the FDR cutoff
        QValues = p_adjust(PValues, Arguments.CorrectionMethod)
        PassingQValues = {}
        for Barcode, PValue in PValues.items():
            if QValues[PValue] < Arguments.FDR:
                PassingQValues[Barcode] = QValues[PValue]
        QValues = PassingQValues
        Barcodes = list(set(Setworks.keys()) & set(QValues.keys()))

        #Filter by performance now; waiting until later just means a bigger Pickle
        Barcodes = [Barcode for Barcode in Barcodes
                    if minimum_performance(Performances[Barcode], Arguments)]

        #Restrict every per-setwork record to the surviving barcodes
        Results = {}
        for Key, Record in [("PValues", PValues), ("QValues", QValues),
                            ("Performances", Performances),
                            ("Interactions", Interactions),
                            ("FeatureVectors", FeatureVectors)]:
            Results[Key] = dict((Barcode, Record[Barcode]) for Barcode in Barcodes)
        for Index, Key in enumerate(["UnionFeatures", "IntersectionFeatures",
                                     "DifferenceFeatures"]):
            Results[Key] = dict((Barcode, Setworks[Barcode][Index])
                                for Barcode in Barcodes)
        Results["Report"] = make_report(Cases, Controls, Phenotype, Fold, Arguments)
        Results["Barcodes"] = Barcodes

        if Arguments.Filename.lower() == "default":
            DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
            Pickle = "_".join(["Phenotype=" + Phenotype[:Phenotype.index(":")],
                               "_".join(sorted(DataTypes)),
                               str(Arguments.FeatureMin), Arguments.CorrectionMethod,
                               "".join(map(str, Arguments.BooleanSets)),
                               "".join(map(str, Arguments.Optimization)),
                               ".Validation" + str(Fold)])
        else:
            Pickle = Arguments.Filename + ".Validation" + str(Fold)

        cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
Esempio n. 3
0
def unsupervised(Arguments):
    '''
    Pairwise MOCA calculations that are executed if the Phenotype argument is False (the default). Similar to 
    so-called 'supervised' pairwise mode, except that no performance metrics are calculated (sens, spec, PPV, NPV, etc.).
    In unspervised mode, you can compare all inter-datatype pairs for two datatypes, or all intra-datatype pairs for 
    a single datatype. 
    '''

    if len(Arguments.Data) > 2:
        print "Unsupervised pairwise calculations can consider no more that two datatypes at a time."
        print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
        print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
        print "'Data = ' field. Exiting..."
        exit()

    Data = load_data(Arguments)

    Features = list(chain(*Data.Transformed.Features.values()))
    Variates = list(chain(*Data.Transformed.Variates.values()))
    if len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features

    if len(Arguments.Data) == 2:
        Features1 = Data.Transformed.Features[Arguments.Data[0]]
        Features2 = Data.Transformed.Features[Arguments.Data[1]]

    PValues = {}
    Interactions = {}
    SampleCounts = {}
    CaseCounts = {} #just the positive class here
    Performances = {}
    EffectSizes  = {}
    Tested = []
    for Feature1 in Features1:
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                a,b,c,d = contingency_table(Variates[Features.index(Feature1)], Variates[Features.index(Feature2)],
                                            NA=Arguments.NA)
                PValue = fisher(a,b,c,d)
                PValues[tuple([Feature1, Feature2])] = PValue.two_tail
                Interactions[tuple([Feature1, Feature2])] = interaction(PValue)
                SampleCounts[tuple([Feature1, Feature2])] = a + b + c + d
                CaseCounts[tuple([Feature1, Feature2])] = a + c
                #A placeholder solely to make pairwise post-processing generalizable
                Performances[tuple([Feature1, Feature2])] = "NA"
                EffectSizes[tuple([Feature1, Feature2])] = "NA"
                
    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    for Pair, PValue in PValues.items():
        if FDRs[PValue] < Arguments.FDR:
            pass
        else:
            PValues.pop(Pair, None)
            Interactions.pop(Pair, None)
            SampleCounts.pop(Pair, None)
            CaseCounts.pop(Pair, None)
            Performances.pop(Pair, None)
            EffectSizes.pop(Pair, None)

    Results = {}
    Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments) 
    Results["PValues"] = PValues
    Results["Interactions"] = Interactions
    Results["FDRs"] = FDRs
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["Performances"] = Performances
    Results["EffectSizes"] = EffectSizes

    if Arguments.Filename.lower() == "default":
        Pickle = "_".join(["Pairwise", "_".join(sorted(Arguments.Data)), str(Arguments.FeatureMin),
                           Arguments.CorrectionMethod])
    else:
        Pickle = Arguments.Filename

    cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)
        
    return
Esempio n. 4
0
def cross_validation(Arguments, \
                         Labels, Features, Variates, \
                         Markers, Phenotype, CrossValidations, Cases, Controls):
    """
    Build and pickle setworks for each cross-validation fold of a single
    Phenotype.

    Labels, Features, and Variates carry their usual MOCA meanings; Markers
    is every feature not designated as the Phenotype (one Phenotype per
    LeaveSomeOut run). CrossValidations is the integer fold count
    (leave-one-out, ten-fold, ...). Cases and Controls map fold index to the
    labels withheld from training in that fold. Folds are distributed across
    nodes when MultiProcessMode is on; each fold's FDR- and
    performance-filtered results are pickled as "<name>.Validation<fold>".
    """

    #Setwork optimization parameters
    Trials = int(Arguments.Optimization[0])
    RepopulateFrequency = int(Arguments.Optimization[1])
    PercentToRepopulate = float(Arguments.Optimization[2])

    #Boolean set parameters
    UnionFeatures = int(Arguments.BooleanSets[0])
    IntersectionFeatures = int(Arguments.BooleanSets[1])
    DifferenceFeatures = int(Arguments.BooleanSets[2])

    #Cross-validation is compute intensive enough that only one Phenotype is
    #handled per run; MultiProcessMode spreads the folds over processors
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for CV in range(CrossValidations)[Node::TotalNodes]:

        #Training labels are everything not held out for this fold
        Withheld = Cases[CV] + Controls[CV]
        TrainLabels = list(set(Labels) - set(Withheld))
        TrainVariates = get_ordered_matrix(TrainLabels, Labels, Variates)

        #Derive setworks from the training split only
        PValues, Performances, Interactions, FeatureVectors, Setworks, SampleCounts, CaseCounts, EffectSizes = \
            get_setworks(Arguments, \
                             Features, TrainVariates, Phenotype, \
                             Markers, Markers, Markers, \
                             Trials, RepopulateFrequency, PercentToRepopulate, \
                             UnionFeatures, IntersectionFeatures, DifferenceFeatures)

        #Header row: held-out labels; second row: their phenotype vector.
        #NOTE(review): this matrix is never referenced again below -- confirm
        #whether it was meant to be added to the pickled Results.
        CrossValidationFeatureMatrix = [Withheld]
        PhenotypeVector = [1 for Case in Cases[CV]] + [0 for Control in Controls[CV]]
        CrossValidationFeatureMatrix.append(PhenotypeVector)

        #Retain only setworks whose corrected p-value beats the FDR cutoff
        QValues = p_adjust(PValues, Arguments.CorrectionMethod)
        Passing = {}
        for Barcode, PValue in PValues.items():
            if QValues[PValue] < Arguments.FDR:
                Passing[Barcode] = QValues[PValue]
        QValues = Passing
        Barcodes = list(set(Setworks.keys()) & set(QValues.keys()))

        #Apply the performance filter now rather than later: smaller Pickle
        Barcodes = [Barcode for Barcode in Barcodes
                    if minimum_performance(Performances[Barcode], Arguments)]

        #Project each per-setwork record down to the surviving barcodes
        Results = {}
        for Name, Source in (("PValues", PValues), ("QValues", QValues),
                             ("Performances", Performances),
                             ("Interactions", Interactions),
                             ("FeatureVectors", FeatureVectors)):
            Results[Name] = dict((Barcode, Source[Barcode]) for Barcode in Barcodes)
        for Position, Name in enumerate(("UnionFeatures",
                                         "IntersectionFeatures",
                                         "DifferenceFeatures")):
            Results[Name] = dict((Barcode, Setworks[Barcode][Position])
                                 for Barcode in Barcodes)
        Results["Report"] = make_report(Cases, Controls, Phenotype, CV, Arguments)
        Results["Barcodes"] = Barcodes

        if Arguments.Filename.lower() == "default":
            DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
            Pickle = "_".join(["Phenotype=" + Phenotype[:Phenotype.index(":")],
                               "_".join(sorted(DataTypes)),
                               str(Arguments.FeatureMin),
                               Arguments.CorrectionMethod,
                               "".join(map(str, Arguments.BooleanSets)),
                               "".join(map(str, Arguments.Optimization)),
                               ".Validation" + str(CV)])
        else:
            Pickle = Arguments.Filename + ".Validation" + str(CV)

        cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
Esempio n. 5
0
def pairwise_continuous(Arguments):
    '''
    '''

    if len(Arguments.Data) > 2:
        print "Unsupervised pairwise calculations can consider no more that two datatypes at a time."
        print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
        print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
        print "'Data = ' field. Exiting..."
        exit()

    Data = load_data(Arguments)

    Features = list(chain(*Data.Features.values()))
    Variates = list(chain(*Data.Variates.values()))

    if Arguments.Phenotype:
        Features1 = [Feature for Feature in Features if Arguments.Phenotype in Feature]
        Features2 = [Feature for Feature in Features if Arguments.Phenotype not in Feature]

    else:

        if len(Arguments.Data) == 1:
            Features1 = Features
            Features2 = Features

        if len(Arguments.Data) == 2:
            Features1 = Data.Features[Arguments.Data[0]]
            Features2 = Data.Features[Arguments.Data[1]]

    PValues = {}
    Correlations = {}
    Tested = []
    for Feature1 in Features1:
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                PValues[tuple([Feature1, Feature2])] = correlation_pvalue(Variates[Features.index(Feature1)],
                                                                          Variates[Features.index(Feature2)])
                Correlations[tuple([Feature1, Feature2])] = correlation(Variates[Features.index(Feature1)],
                                                                       Variates[Features.index(Feature2)])
    
    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    for Pair, PValue in PValues.items():
        if FDRs[PValue] < Arguments.FDR:
            pass
        else:
            PValues.pop(Pair, None)
            Correlations.pop(Pair, None)

    if len(PValues.keys()):
        Results = {}
        Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments, Supervised=Arguments.Phenotype)
        Results["PValues"] = PValues
        Results["Correlations"] = Correlations
        Results["FDRs"] = FDRs

        if Arguments.Filename.lower() == "default":
            Pickle = "_".join(["_".join(sorted(Arguments.Data)), Arguments.CorrectionMethod])
        else:
            Pickle = Arguments.Filename
                
        cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)
    
    return
Esempio n. 6
0
def supervised(Arguments):
    '''
    Pairwise MOCA calculations executed when a 'Phenotype' is provided in the
    Arguments file. Not technically supervised 'learning' -- there is no
    optimization; every possible marker-phenotype pair is tested. Output
    includes performance metrics (sensitivity, specificity, PPV, NPV) for
    each marker's ability to predict the phenotype.
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    #Clustermode support if called
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:
        #The phenotype vector is the same for every marker; look it up once
        PhenotypeVariates = Variates[Features.index(Phenotype)]

        PValues = {}
        Interactions = {}
        Performances = {}
        SampleCounts = {}
        CaseCounts = {}  #just the positive class here
        EffectSizes = {}
        for Marker in Markers:
            TP, FP, FN, TN = contingency_table(Variates[Features.index(Marker)],
                                               PhenotypeVariates, NA=Arguments.NA)
            FisherResult = fisher(TP, FP, FN, TN)
            PValues[Marker] = FisherResult.two_tail
            MarkerInteraction = interaction(FisherResult)
            Interactions[Marker] = MarkerInteraction
            Performances[Marker] = Performance(MarkerInteraction, TP, FP, FN, TN)
            EffectSizes[Marker] = EffectSize(MarkerInteraction, TP, FP, FN, TN)
            SampleCounts[Marker] = TP + FP + FN + TN
            CaseCounts[Marker] = TP + FN

        #Drop every marker whose corrected p-value misses the FDR cutoff
        FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
        for Marker in Markers:
            if not (FDRs[PValues[Marker]] < Arguments.FDR):
                for Record in (PValues, Interactions, Performances,
                               SampleCounts, CaseCounts, EffectSizes):
                    Record.pop(Marker, None)

        if len(PValues):
            Results = {
                "Report": make_report(Labels, PValues.keys(), Arguments,
                                      Supervised=Phenotype[:Phenotype.index(":")]),
                "PValues": PValues,
                "Interactions": Interactions,
                "Performances": Performances,
                "FDRs": FDRs,
                "SampleCounts": SampleCounts,
                "CaseCounts": CaseCounts,
                "EffectSizes": EffectSizes,
            }

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(["Pairwise",
                                   "Phenotype=" + Phenotype[:Phenotype.index(":")],
                                   "_".join(sorted(DataTypes)),
                                   str(Arguments.FeatureMin),
                                   Arguments.CorrectionMethod])
            else:
                Pickle = Arguments.Filename + "_" + Phenotype[:Phenotype.index(":")]

            cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
Esempio n. 7
0
def unsupervised(Arguments):
    """
    Pairwise MOCA calculations that are executed if the Phenotype argument is False (the default). Similar to 
    so-called 'supervised' pairwise mode, except that no performance metrics are calculated (sens, spec, PPV, NPV, etc.).
    In unspervised mode, you can compare all inter-datatype pairs for two datatypes, or all intra-datatype pairs for 
    a single datatype. 
    """

    if len(Arguments.Data) > 2:
        print "Unsupervised pairwise calculations can consider no more that two datatypes at a time."
        print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
        print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
        print "'Data = ' field. Exiting..."
        exit()

    Data = load_data(Arguments)

    Features = list(chain(*Data.Transformed.Features.values()))
    Variates = list(chain(*Data.Transformed.Variates.values()))
    if len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features

    if len(Arguments.Data) == 2:
        Features1 = Data.Transformed.Features[Arguments.Data[0]]
        Features2 = Data.Transformed.Features[Arguments.Data[1]]

    PValues = {}
    Interactions = {}
    SampleCounts = {}
    CaseCounts = {}  # just the positive class here
    Performances = {}
    EffectSizes = {}
    Tested = []
    for Feature1 in Features1:
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                a, b, c, d = contingency_table(
                    Variates[Features.index(Feature1)], Variates[Features.index(Feature2)], NA=Arguments.NA
                )
                PValue = fisher(a, b, c, d)
                PValues[tuple([Feature1, Feature2])] = PValue.two_tail
                Interactions[tuple([Feature1, Feature2])] = interaction(PValue)
                SampleCounts[tuple([Feature1, Feature2])] = a + b + c + d
                CaseCounts[tuple([Feature1, Feature2])] = a + c
                # A placeholder solely to make pairwise post-processing generalizable
                Performances[tuple([Feature1, Feature2])] = "NA"
                EffectSizes[tuple([Feature1, Feature2])] = "NA"

    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    for Pair, PValue in PValues.items():
        if FDRs[PValue] < Arguments.FDR:
            pass
        else:
            PValues.pop(Pair, None)
            Interactions.pop(Pair, None)
            SampleCounts.pop(Pair, None)
            CaseCounts.pop(Pair, None)
            Performances.pop(Pair, None)
            EffectSizes.pop(Pair, None)

    Results = {}
    Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments)
    Results["PValues"] = PValues
    Results["Interactions"] = Interactions
    Results["FDRs"] = FDRs
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["Performances"] = Performances
    Results["EffectSizes"] = EffectSizes

    if Arguments.Filename.lower() == "default":
        Pickle = "_".join(
            ["Pairwise", "_".join(sorted(Arguments.Data)), str(Arguments.FeatureMin), Arguments.CorrectionMethod]
        )
    else:
        Pickle = Arguments.Filename

    cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
Esempio n. 8
0
def pairwise_continuous(Arguments):
    """
    """

    if len(Arguments.Data) > 2:
        print "Unsupervised pairwise calculations can consider no more that two datatypes at a time."
        print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
        print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
        print "'Data = ' field. Exiting..."
        exit()

    Data = load_data(Arguments)

    Features = list(chain(*Data.Features.values()))
    Variates = list(chain(*Data.Variates.values()))

    if Arguments.Phenotype:
        Features1 = [Feature for Feature in Features if Arguments.Phenotype in Feature]
        Features2 = [Feature for Feature in Features if Arguments.Phenotype not in Feature]

    else:

        if len(Arguments.Data) == 1:
            Features1 = Features
            Features2 = Features

        if len(Arguments.Data) == 2:
            Features1 = Data.Features[Arguments.Data[0]]
            Features2 = Data.Features[Arguments.Data[1]]

    PValues = {}
    Correlations = {}
    Tested = []
    for Feature1 in Features1:
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                PValues[tuple([Feature1, Feature2])] = correlation_pvalue(
                    Variates[Features.index(Feature1)], Variates[Features.index(Feature2)]
                )
                Correlations[tuple([Feature1, Feature2])] = correlation(
                    Variates[Features.index(Feature1)], Variates[Features.index(Feature2)]
                )

    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    for Pair, PValue in PValues.items():
        if FDRs[PValue] < Arguments.FDR:
            pass
        else:
            PValues.pop(Pair, None)
            Correlations.pop(Pair, None)

    if len(PValues.keys()):
        Results = {}
        Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments, Supervised=Arguments.Phenotype)
        Results["PValues"] = PValues
        Results["Correlations"] = Correlations
        Results["FDRs"] = FDRs

        if Arguments.Filename.lower() == "default":
            Pickle = "_".join(["_".join(sorted(Arguments.Data)), Arguments.CorrectionMethod])
        else:
            Pickle = Arguments.Filename

        cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
Esempio n. 9
0
def supervised(Arguments):
    """
    Pairwise MOCA calculations executed when a 'Phenotype' is provided in the
    Arguments file. Not technically supervised 'learning' -- every possible
    marker-phenotype pair is tested, with no optimization. Output includes
    performance metrics (sensitivity, specificity, PPV, NPV) describing how
    well each marker predicts the phenotype.
    """

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    # Clustermode support if called
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:
        # Both the short label and the variate vector are loop invariants
        PhenotypeLabel = Phenotype[:Phenotype.index(":")]
        PhenotypeVariates = Variates[Features.index(Phenotype)]

        PValues = {}
        Interactions = {}
        Performances = {}
        SampleCounts = {}
        CaseCounts = {}  # positive-class counts only
        EffectSizes = {}
        for Marker in Markers:
            TP, FP, FN, TN = contingency_table(Variates[Features.index(Marker)],
                                               PhenotypeVariates, NA=Arguments.NA)
            FisherResult = fisher(TP, FP, FN, TN)
            PValues[Marker] = FisherResult.two_tail
            MarkerInteraction = interaction(FisherResult)
            Interactions[Marker] = MarkerInteraction
            Performances[Marker] = Performance(MarkerInteraction, TP, FP, FN, TN)
            EffectSizes[Marker] = EffectSize(MarkerInteraction, TP, FP, FN, TN)
            SampleCounts[Marker] = TP + FP + FN + TN
            CaseCounts[Marker] = TP + FN

        # Drop any marker whose corrected p-value misses the FDR cutoff
        FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
        for Marker in Markers:
            if not (FDRs[PValues[Marker]] < Arguments.FDR):
                for Record in (PValues, Interactions, Performances,
                               SampleCounts, CaseCounts, EffectSizes):
                    Record.pop(Marker, None)

        if len(PValues):
            Results = {
                "Report": make_report(Labels, PValues.keys(), Arguments,
                                      Supervised=PhenotypeLabel),
                "PValues": PValues,
                "Interactions": Interactions,
                "Performances": Performances,
                "FDRs": FDRs,
                "SampleCounts": SampleCounts,
                "CaseCounts": CaseCounts,
                "EffectSizes": EffectSizes,
            }

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(["Pairwise",
                                   "Phenotype=" + PhenotypeLabel,
                                   "_".join(sorted(DataTypes)),
                                   str(Arguments.FeatureMin),
                                   Arguments.CorrectionMethod])
            else:
                Pickle = Arguments.Filename + "_" + PhenotypeLabel

            cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return