Example #1
#Standard-library imports used below; get_supervised_dataset and
#cross_validation are MOCA-internal helpers
from math import ceil
from random import shuffle

def leave_some_out(Arguments):
    '''
    Makes the data splits before sending the data off for cross-validation.
    Every label gets used exactly once. To the extent possible, put the same
    number of cases in every split and the same number of controls in every
    split, and balance the number of cases and controls within each split.
    Example:

    Split 1: Cancer10, Cancer28, Cancer15, Healthy2, Healthy3
    Split 2: Cancer2, Cancer12, Cancer1, Healthy10, Healthy1
    Split 3: Cancer1, Cancer3, Healthy9, Healthy0
    etc.
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(
        Arguments)

    #Only one phenotype at a time for cross-validation
    Phenotype = Phenotypes[0]

    #Important: in MultiProcess mode you must set the random seed, or the
    #data splits won't be consistent across processes!
    CrossValidations = int(ceil(len(Labels) / float(Arguments.LeaveSomeOut)))

    #Get the cases ("1"s) and controls ("0"s) from the Phenotype vector
    Cases = [
        Label for Label in Labels
        if Variates[Features.index(Phenotype)][Labels.index(Label)]
    ]
    Controls = [
        Label for Label in Labels
        if not Variates[Features.index(Phenotype)][Labels.index(Label)]
    ]

    shuffle(Cases) #Get rid of bias that MIGHT be inherent in the original data structure
    #Split as evenly as possible among the cross-validations
    Cases = dict((Iteration, Cases[Iteration::CrossValidations])
                 for Iteration in range(CrossValidations))

    shuffle(Controls) #Get rid of bias that MIGHT be inherent in the original data structure
    #Split as evenly as possible among the cross-validations
    Controls = dict((Iteration, Controls[Iteration::CrossValidations])
                    for Iteration in range(CrossValidations))

    #Only for the leave-ONE-out case do we process cases and controls in series
    if Arguments.LeaveSomeOut == 1:
        if len(Cases) < len(Controls):
            Cases = dict(zip(Cases.keys(), list(reversed(Cases.values()))))
        else:
            Controls = dict(
                zip(Controls.keys(), list(reversed(Controls.values()))))

    cross_validation(Arguments, Labels, Features, Variates, \
                         Markers, Phenotype, CrossValidations, Cases, Controls)

    return
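
The balanced splitting above hinges on Python's extended slicing: after shuffling, labels[i::n] deals the labels out round-robin, so the n splits differ in size by at most one, for cases and controls independently. A minimal, self-contained sketch of the same idea on toy labels (illustrative only, not MOCA code; the seed call stands in for the MultiProcess warning above):

from random import seed, shuffle

def deal_into_splits(Labels, NSplits):
    #Round-robin deal: split i gets every NSplits-th label, starting at i
    Labels = list(Labels)
    shuffle(Labels) #remove any ordering bias, as leave_some_out does
    return dict((i, Labels[i::NSplits]) for i in range(NSplits))

seed(0) #fix the seed so parallel processes would build identical splits
Cases = deal_into_splits(["Cancer%d" % i for i in range(7)], 3)
Controls = deal_into_splits(["Healthy%d" % i for i in range(5)], 3)
for i in range(3):
    print Cases[i] + Controls[i] #2-3 cases and 1-2 controls per split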
Example #2
#cPickle is the Python 2 pickle module; get_setworks, p_adjust, and friends
#are MOCA-internal helpers
import cPickle

def setworks(Arguments):
    '''
    Default implementation for building the MOCA Boolean set networks (setworks). 
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(
        Arguments)

    Trials = int(Arguments.Optimization[0])
    RepopulateFrequency = int(Arguments.Optimization[1])
    PercentToRepopulate = float(Arguments.Optimization[2])

    UnionFeatures = int(Arguments.BooleanSets[0])
    IntersectionFeatures = int(Arguments.BooleanSets[1])
    DifferenceFeatures = int(Arguments.BooleanSets[2])

    #MultiProcessMode support if called. Each Phenotype gets its own node
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:

        PValues, Performances, Interactions, FeatureVectors, Setworks, SampleCounts, CaseCounts, EffectSizes = \
            get_setworks(Arguments, \
                             Features, Variates, Phenotype, \
                             Markers, Markers, Markers, \
                             Trials, RepopulateFrequency, PercentToRepopulate, \
                             UnionFeatures, IntersectionFeatures, DifferenceFeatures)

        #We only need the intersection of unique setworks passing the FDR threshold
        QValues = p_adjust(PValues, Arguments.CorrectionMethod)
        QValues = dict([(Barcode, QValues[PValue]) for Barcode, PValue in PValues.items() \
                            if QValues[PValue] < Arguments.FDR])
        Barcodes = list(
            set.intersection(set(Setworks.keys()), set(QValues.keys())))

        #Optionally filter by performance at this stage; we could do it later, but then we'd save a bigger Pickle now
        Barcodes = [
            Barcode for Barcode in Barcodes
            if minimum_performance(Performances[Barcode], Arguments)
        ]

        if Arguments.PermutePhenotype:
            try:
                QValue = min(QValues.values())
                print "Permutation test failed: you ran with 'PermutePhenotype = True' and setworks could be generated that passed your filters!!!",
                print "This means that your current FDR cutoff is not sufficient for this data. The minimum FDR observed during",
                print "this permutation test was " + str(QValue) + ". You should do this a minimum of 10 times and set your FDR",
                print "threshold (i.e., 'FDR = threshold' in your Arguments file) AT LEAST one order of magnitude lower than the",
                print "lowest observed during permutation testing. This conservative threshold will help ensure that results",
                print "observed during your 'real' setworks run are statistically reliable. The setworks that passed your filters",
                print "for this permutation testing have been saved; if you care to see what features made it through, you can use",
                print "the standard 'Mode = PostProcess' to view them. Exiting..."

            except ValueError:
                print "You ran with 'PermutePhenotype = True' and no setworks could be generated that passed your filters --",
                print "this is a great start! You should do this a minimum of 10 times and set your FDR threshold (i.e., 'FDR = threshold'",
                print "in your Arguments file) AT LEAST one order of magnitude lower than the lowest observed during permutation testing.",
                print "This conservative threshold will help ensure that results observed during your 'real' setworks run are statistically",
                print "reliable. Exiting..."
                exit()

        if len(Barcodes):

            Results = {}
            Results["PValues"] = dict([(Barcode, PValues[Barcode]) for Barcode in Barcodes])
            Results["QValues"] = dict([(Barcode, QValues[Barcode]) for Barcode in Barcodes])
            Results["Performances"] = dict([(Barcode, Performances[Barcode]) for Barcode in Barcodes])
            Results["Interactions"] = dict([(Barcode, Interactions[Barcode]) for Barcode in Barcodes])
            Results["FeatureVectors"] = dict([(Barcode, FeatureVectors[Barcode]) for Barcode in Barcodes])
            Results["UnionFeatures"] = dict([(Barcode, Setworks[Barcode][0]) for Barcode in Barcodes])
            Results["IntersectionFeatures"] = dict([(Barcode, Setworks[Barcode][1]) for Barcode in Barcodes])
            Results["DifferenceFeatures"] = dict([(Barcode, Setworks[Barcode][2]) for Barcode in Barcodes])
            Results["SampleCounts"] = dict([(Barcode, SampleCounts[Barcode]) for Barcode in Barcodes])
            Results["CaseCounts"] = dict([(Barcode, CaseCounts[Barcode]) for Barcode in Barcodes])
            Results["EffectSizes"] = dict([(Barcode, EffectSizes[Barcode]) for Barcode in Barcodes])
            Results["Report"] = make_report(Labels, Phenotype, Barcodes, Arguments)
            Results["Labels"] = Labels
            Results["Barcodes"] = Barcodes
            Results["Phenotype"] = Variates[Features.index(Phenotype)]

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(["Phenotype=" + Phenotype[:Phenotype.index(":")],
                                   "_".join(sorted(DataTypes)),
                                   str(Arguments.FeatureMin), Arguments.CorrectionMethod,
                                   "".join(map(str, Arguments.BooleanSets)),
                                   "".join(map(str, Arguments.Optimization))])
            else:
                Pickle = Arguments.Filename + "_" + Phenotype[:Phenotype.index(":")]

            #Save the results for either filename choice
            cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

        else:
            print "No setworks were generated. This could mean your data set is not sufficiently powered for deriving setworks,",
            print "or that you set your filters unreasonably strict. Exiting..."

    return
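
The three counts configured through Arguments.BooleanSets correspond to the three feature tuples stored per setwork (Setworks[Barcode][0], [1], and [2] above). As one way to read those slots, here is a toy sketch of how union, intersection, and difference combinations act on binary feature vectors (an illustration of the concept, not MOCA's get_setworks logic):

def combine(UnionFeatures, IntersectionFeatures, DifferenceFeatures):
    #OR together the union features, AND in the intersection features,
    #then zero out any sample where a difference feature fires
    Length = len((UnionFeatures + IntersectionFeatures + DifferenceFeatures)[0])
    Vector = [0]*Length if UnionFeatures else [1]*Length
    for Feature in UnionFeatures:
        Vector = [a | b for a, b in zip(Vector, Feature)]
    for Feature in IntersectionFeatures:
        Vector = [a & b for a, b in zip(Vector, Feature)]
    for Feature in DifferenceFeatures:
        Vector = [a & (1 - b) for a, b in zip(Vector, Feature)]
    return Vector

print combine([[1, 0, 0, 1], [0, 1, 0, 0]], #union
              [[1, 1, 0, 1]],               #intersection
              [[0, 0, 0, 1]])               #difference -> prints [1, 1, 0, 0]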
Example #3
#cPickle is the Python 2 pickle module; contingency_table, fisher, p_adjust,
#and friends are MOCA-internal helpers
import cPickle

def supervised(Arguments):
    '''
    MOCA pairwise calculations, executed if a 'Phenotype' is provided in the Arguments file. Not technically
    supervised 'learning', as there is no optimization (every possible pairwise comparison is tested).
    Output includes performance metrics, such as sensitivity, specificity, PPV, and NPV, for each feature's
    ability to predict the phenotype.
    '''
    
    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    #MultiProcessMode (cluster) support if called
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:
        PValues = {}
        Interactions = {}
        Performances = {}
        SampleCounts = {}
        CaseCounts = {} #just the positive class here
        EffectSizes = {}
        for Marker in Markers:
            TP,FP,FN,TN = contingency_table(Variates[Features.index(Marker)], Variates[Features.index(Phenotype)],
                                            NA=Arguments.NA)
            PValue = fisher(TP,FP,FN,TN)
            PValues[Marker] = PValue.two_tail
            Interaction = interaction(PValue)
            Interactions[Marker] = Interaction
            Performances[Marker] = Performance(Interaction, TP,FP,FN,TN)
            EffectSizes[Marker] = EffectSize(Interaction, TP,FP,FN,TN)
            SampleCounts[Marker] = TP + FP + FN + TN
            CaseCounts[Marker] = TP + FN

        FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
        for Marker in Markers:
            if FDRs[PValues[Marker]] >= Arguments.FDR:
                PValues.pop(Marker, None)
                Interactions.pop(Marker, None)
                Performances.pop(Marker, None)
                SampleCounts.pop(Marker, None)
                CaseCounts.pop(Marker, None)
                EffectSizes.pop(Marker, None)

        if len(PValues.keys()):
            Results = {}
            Results["Report"] = make_report(Labels, PValues.keys(), Arguments, Supervised=Phenotype[:Phenotype.index(":")])
            Results["PValues"] = PValues
            Results["Interactions"] = Interactions
            Results["Performances"] = Performances
            Results["FDRs"] = FDRs
            Results["SampleCounts"] = SampleCounts
            Results["CaseCounts"] = CaseCounts
            Results["EffectSizes"] = EffectSizes
            
            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(["Pairwise", "Phenotype=" + Phenotype[:Phenotype.index(":")],
                                   "_".join(sorted(DataTypes)), str(Arguments.FeatureMin), Arguments.CorrectionMethod])
            else:
                Pickle = Arguments.Filename + "_" + Phenotype[:Phenotype.index(":")]
                
            cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)
        
    return
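
Each marker above is scored from a 2x2 contingency table via Fisher's exact test. Below is a self-contained sketch of one such pairwise test, with scipy.stats.fisher_exact standing in for MOCA's fisher helper and NA handling omitted; the metrics are the standard definitions of the sensitivity, specificity, PPV, and NPV named in the docstring:

from scipy.stats import fisher_exact #stand-in for MOCA's fisher()

def pairwise_test(Marker, Phenotype):
    #Tally the 2x2 contingency table of a binary marker vs. a binary phenotype
    TP = sum(1 for m, p in zip(Marker, Phenotype) if m and p)
    FP = sum(1 for m, p in zip(Marker, Phenotype) if m and not p)
    FN = sum(1 for m, p in zip(Marker, Phenotype) if not m and p)
    TN = sum(1 for m, p in zip(Marker, Phenotype) if not m and not p)
    OddsRatio, PValue = fisher_exact([[TP, FP], [FN, TN]]) #two-sided by default
    return {"PValue": PValue,
            "Sensitivity": TP / float(TP + FN), "Specificity": TN / float(TN + FP),
            "PPV": TP / float(TP + FP), "NPV": TN / float(TN + FN)}

print pairwise_test([1, 1, 1, 0, 0, 1, 0, 0],  #marker
                    [1, 1, 0, 0, 0, 1, 0, 1])  #phenotype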
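
Both setworks() and supervised() index the result of p_adjust by raw p-value (FDRs[PValues[Marker]]), so it evidently returns a mapping from each p-value to its adjusted value. Here is a minimal sketch with that interface, assuming Benjamini-Hochberg as the CorrectionMethod (one common choice, shown for illustration; this is not MOCA's p_adjust):

def bh_adjust(PValues):
    #Benjamini-Hochberg: map each raw p-value to its adjusted q-value
    Sorted = sorted(PValues.values())
    N = len(Sorted)
    QValues, Running = {}, 1.0
    for Rank in range(N, 0, -1): #walk from the largest p-value down
        Running = min(Running, Sorted[Rank - 1] * N / float(Rank))
        QValues[Sorted[Rank - 1]] = Running
    return QValues

print bh_adjust({"GeneA": 0.001, "GeneB": 0.02, "GeneC": 0.5})
#-> {0.5: 0.5, 0.02: 0.03, 0.001: 0.003}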