Example #1
def get_setworks(Arguments, \
                     Features, Variates, Phenotype, \
                     UnionMarkers, IntersectionMarkers, DifferenceMarkers, \
                     Trials, RepopulateFrequency, PercentToRepopulate, \
                     UnionFeatures, IntersectionFeatures, DifferenceFeatures):
    '''
    The core engine for building MOCA setworks (networks of features combined using 
    Boolean set operations). Very customizable. You can build your own implementation 
    using MyMOCA.py, or you can run the simple default mode by calling the setworks function 
    via the Arguments file or the command line (Setworks = True and --setworks True, respectively).

    Phenotype is the feature you're selecting markers for
    Markers of each Boolean type represent the initial pool for that type
    Trials, RepopulateFrequency, PercentToRepopulate = Optimization parameters (see arguments or UsersManual)
    UnionFeatures, IntersectionFeatures, DifferenceFeatures = max number of each operation in a single comparison.
    '''

    PValues = {}
    Performances = {}
    Interactions = {}
    FeatureVectors = {}
    Setworks = {}
    SampleCounts = {}
    CaseCounts = {}  #just the positive class here
    EffectSizes = {}

    #Do this outside of the for loop to prevent re-reading
    if Arguments.Priors: Priors = get_priors(Arguments)

    #Get response outside of the loop, in case we want to permute the phenotype
    Response = Variates[Features.index(Phenotype)]
    if Arguments.PermutePhenotype: shuffle(Response)

    for Trial in range(Trials):
        if Trial and not Trial % RepopulateFrequency:
            for Barcode in rank(
                    Performances, Arguments.RankMethod,
                    int(len(Performances.keys()) * PercentToRepopulate)):
                Marker = Setworks[Barcode]
                UnionMarkers.extend(Marker[0])
                IntersectionMarkers.extend(Marker[1])
                DifferenceMarkers.extend(Marker[2])

        #Define the min and max number of features to combine via each Boolean set operation
        UnionSample = weighted_sample(UnionMarkers,
                                      min(UnionFeatures, len(UnionMarkers)))
        UnionCombinations = combine_features(UnionSample, 0, len(UnionSample))

        IntersectionSample = weighted_sample(
            IntersectionMarkers,
            min(IntersectionFeatures, len(IntersectionMarkers)))
        IntersectionCombinations = combine_features(IntersectionSample, 0,
                                                    len(IntersectionSample))

        DifferenceSample = weighted_sample(
            DifferenceMarkers, min(DifferenceFeatures, len(DifferenceMarkers)))
        DifferenceCombinations = combine_features(DifferenceSample, 0,
                                                  len(DifferenceSample))

        if Arguments.Priors:  #Are we using priors?
            UnionCombinations, IntersectionCombinations, DifferenceCombinations = \
                priors(Priors, UnionCombinations, IntersectionCombinations, DifferenceCombinations, Arguments)

        LocalInteractions = []  #We'll use this to eject passengers at the end of each trial
        for UnionCombination in UnionCombinations:
            for IntersectionCombination in IntersectionCombinations:
                for DifferenceCombination in DifferenceCombinations:

                    FeatureVector = assemble_setwork(Features, Variates,
                                                     UnionCombination,
                                                     IntersectionCombination,
                                                     DifferenceCombination,
                                                     Arguments)
                    if FeatureVector:

                        Predictor = FeatureVector
                        TP, FP, FN, TN = contingency_table(Predictor,
                                                           Response,
                                                           NA=Arguments.NA)
                        PValue = fisher(TP, FP, FN, TN)
                        Setwork = [sorted(UnionCombination), sorted(IntersectionCombination), \
                                       sorted(DifferenceCombination)]

                        if Arguments.ForceCooccurring and interaction(
                                PValue) == "MutuallyExclusive":
                            break

                        #We don't want more than one of the same feature in a single setwork unless we are in Bandwidth mode
                        if not Arguments.Bandwidth:
                            if max(Counter(reduced_features(Setwork)).values()) > 1:
                                break

                        Barcode = barcode()
                        Setworks[Barcode] = tuple(map(tuple, Setwork))
                        PValues[Barcode] = PValue.two_tail
                        SampleCounts[Barcode] = TP + FP + FN + TN
                        CaseCounts[Barcode] = TP + FN
                        Interactions[Barcode] = interaction(PValue)
                        Performances[Barcode] = Performance(
                            Interactions[Barcode], TP, FP, FN, TN)
                        EffectSizes[Barcode] = EffectSize(
                            Interactions[Barcode], TP, FP, FN, TN)
                        FeatureVectors[Barcode] = FeatureVector
                        LocalInteractions.append(Barcode)

        if Arguments.EjectPassengers:
            EjectedPassengers = eject_passengers(LocalInteractions, Setworks,
                                                 PValues, Arguments)

            for Barcode in EjectedPassengers:
                Setworks.pop(Barcode, None)
                Performances.pop(
                    Barcode, None
                )  #Have to get rid of these to prevent repopulating with passengers

    #Temporarily make the setworks the keys so that the Python dictionary removes redundancies
    InvertedSetworks = dict(zip(Setworks.values(), Setworks.keys()))
    Setworks = dict(
        zip(InvertedSetworks.values(),
            InvertedSetworks.keys()))  #Turn the barcodes back into keys

    return PValues, Performances, Interactions, FeatureVectors, Setworks, SampleCounts, CaseCounts, EffectSizes
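As a rough illustration of what a setwork predictor amounts to, the sketch below combines binary feature vectors element-wise, reading union as OR, intersection as AND, and difference as AND NOT. The helper and its inputs are hypothetical, not MOCA's assemble_setwork, which also handles NA values and Arguments-driven options.

def toy_setwork_vector(union_vectors, intersection_vectors, difference_vectors):
    #Hypothetical sketch (not MOCA's assemble_setwork): combine binary feature vectors
    #with Boolean set operations: union = OR, intersection = AND, difference = AND NOT
    n_samples = len((union_vectors + intersection_vectors + difference_vectors)[0])
    combined = []
    for i in range(n_samples):
        union_hit = any(v[i] for v in union_vectors) if union_vectors else True
        intersection_hit = all(v[i] for v in intersection_vectors)  #all() of nothing is True
        difference_hit = any(v[i] for v in difference_vectors)      #any() of nothing is False
        combined.append(int(union_hit and intersection_hit and not difference_hit))
    return combined

#Two 'union' features and one 'difference' feature across six samples
print(toy_setwork_vector([[1, 0, 1, 0, 1, 0], [0, 1, 0, 0, 1, 0]], [], [[0, 0, 1, 0, 0, 0]]))
#[1, 1, 0, 0, 1, 0]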
Example #2
def unsupervised(Arguments):
    '''
    Pairwise MOCA calculations that are executed if the Phenotype argument is False (the default). Similar to 
    so-called 'supervised' pairwise mode, except that no performance metrics are calculated (sens, spec, PPV, NPV, etc.).
    In unsupervised mode, you can compare all inter-datatype pairs for two datatypes, or all intra-datatype pairs for 
    a single datatype. 
    '''

    if len(Arguments.Data) > 2:
        print "Unsupervised pairwise calculations can consider no more that two datatypes at a time."
        print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
        print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
        print "'Data = ' field. Exiting..."
        exit()

    Data = load_data(Arguments)

    Features = list(chain(*Data.Transformed.Features.values()))
    Variates = list(chain(*Data.Transformed.Variates.values()))
    if len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features

    if len(Arguments.Data) == 2:
        Features1 = Data.Transformed.Features[Arguments.Data[0]]
        Features2 = Data.Transformed.Features[Arguments.Data[1]]

    PValues = {}
    Interactions = {}
    SampleCounts = {}
    CaseCounts = {} #just the positive class here
    Performances = {}
    EffectSizes  = {}
    Tested = []
    for Feature1 in Features1:
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                a,b,c,d = contingency_table(Variates[Features.index(Feature1)], Variates[Features.index(Feature2)],
                                            NA=Arguments.NA)
                PValue = fisher(a,b,c,d)
                PValues[tuple([Feature1, Feature2])] = PValue.two_tail
                Interactions[tuple([Feature1, Feature2])] = interaction(PValue)
                SampleCounts[tuple([Feature1, Feature2])] = a + b + c + d
                CaseCounts[tuple([Feature1, Feature2])] = a + c
                #A placeholder solely to make pairwise post-processing generalizable
                Performances[tuple([Feature1, Feature2])] = "NA"
                EffectSizes[tuple([Feature1, Feature2])] = "NA"
                
    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    for Pair, PValue in PValues.items():
        if FDRs[PValue] < Arguments.FDR:
            pass
        else:
            PValues.pop(Pair, None)
            Interactions.pop(Pair, None)
            SampleCounts.pop(Pair, None)
            CaseCounts.pop(Pair, None)
            Performances.pop(Pair, None)
            EffectSizes.pop(Pair, None)

    Results = {}
    Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments) 
    Results["PValues"] = PValues
    Results["Interactions"] = Interactions
    Results["FDRs"] = FDRs
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["Performances"] = Performances
    Results["EffectSizes"] = EffectSizes

    if Arguments.Filename.lower() == "default":
        Pickle = "_".join(["Pairwise", "_".join(sorted(Arguments.Data)), str(Arguments.FeatureMin),
                           Arguments.CorrectionMethod])
    else:
        Pickle = Arguments.Filename

    cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)
        
    return
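The statistic behind each pair is just a 2x2 contingency table plus Fisher's exact test. Below is a minimal sketch using scipy.stats.fisher_exact; the a/b/c/d layout (both positive, only Feature1, only Feature2, neither) is an assumption consistent with the SampleCounts and CaseCounts sums above, and the real code uses MOCA's own contingency_table and fisher helpers.

from scipy.stats import fisher_exact  #illustration only; MOCA wraps its own fisher()

def toy_pairwise_test(variate1, variate2):
    #Hypothetical sketch: tabulate two binary variates and run Fisher's exact test.
    #NA handling (Arguments.NA) is omitted.
    a = sum(1 for x, y in zip(variate1, variate2) if x and y)          #both positive
    b = sum(1 for x, y in zip(variate1, variate2) if x and not y)      #only variate1
    c = sum(1 for x, y in zip(variate1, variate2) if not x and y)      #only variate2
    d = sum(1 for x, y in zip(variate1, variate2) if not x and not y)  #neither
    odds_ratio, p_two_tail = fisher_exact([[a, b], [c, d]])
    return (a, b, c, d), p_two_tail

table, p_two_tail = toy_pairwise_test([1, 1, 1, 0, 0, 0, 1, 0], [1, 1, 0, 0, 0, 1, 1, 0])
#table == (3, 1, 1, 3); p_two_tail is the two-tailed Fisher p-value for that table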
Example #3
def unsupervised(Arguments):
    """
    Pairwise MOCA calculations that are executed if the Phenotype argument is False (the default). Similar to 
    so-called 'supervised' pairwise mode, except that no performance metrics are calculated (sens, spec, PPV, NPV, etc.).
    In unsupervised mode, you can compare all inter-datatype pairs for two datatypes, or all intra-datatype pairs for 
    a single datatype. 
    """

    if len(Arguments.Data) > 2:
        print "Unsupervised pairwise calculations can consider no more that two datatypes at a time."
        print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
        print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
        print "'Data = ' field. Exiting..."
        exit()

    Data = load_data(Arguments)

    Features = list(chain(*Data.Transformed.Features.values()))
    Variates = list(chain(*Data.Transformed.Variates.values()))
    if len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features

    if len(Arguments.Data) == 2:
        Features1 = Data.Transformed.Features[Arguments.Data[0]]
        Features2 = Data.Transformed.Features[Arguments.Data[1]]

    PValues = {}
    Interactions = {}
    SampleCounts = {}
    CaseCounts = {}  # just the positive class here
    Performances = {}
    EffectSizes = {}
    Tested = []
    for Feature1 in Features1:
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                a, b, c, d = contingency_table(
                    Variates[Features.index(Feature1)], Variates[Features.index(Feature2)], NA=Arguments.NA
                )
                PValue = fisher(a, b, c, d)
                PValues[tuple([Feature1, Feature2])] = PValue.two_tail
                Interactions[tuple([Feature1, Feature2])] = interaction(PValue)
                SampleCounts[tuple([Feature1, Feature2])] = a + b + c + d
                CaseCounts[tuple([Feature1, Feature2])] = a + c
                # A placeholder solely to make pairwise post-processing generalizable
                Performances[tuple([Feature1, Feature2])] = "NA"
                EffectSizes[tuple([Feature1, Feature2])] = "NA"

    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    for Pair, PValue in PValues.items():
        if FDRs[PValue] < Arguments.FDR:
            pass
        else:
            PValues.pop(Pair, None)
            Interactions.pop(Pair, None)
            SampleCounts.pop(Pair, None)
            CaseCounts.pop(Pair, None)
            Performances.pop(Pair, None)
            EffectSizes.pop(Pair, None)

    Results = {}
    Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments)
    Results["PValues"] = PValues
    Results["Interactions"] = Interactions
    Results["FDRs"] = FDRs
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["Performances"] = Performances
    Results["EffectSizes"] = EffectSizes

    if Arguments.Filename.lower() == "default":
        Pickle = "_".join(
            ["Pairwise", "_".join(sorted(Arguments.Data)), str(Arguments.FeatureMin), Arguments.CorrectionMethod]
        )
    else:
        Pickle = Arguments.Filename

    cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
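One subtlety worth noting: the filtering loop looks adjusted values up as FDRs[PValue], so p_adjust evidently returns a mapping from raw p-value to corrected value. A minimal Benjamini-Hochberg sketch with that same shape, purely as an illustration (MOCA's p_adjust also supports the other Arguments.CorrectionMethod options):

def toy_bh_adjust(pvalues_by_pair):
    # Hypothetical Benjamini-Hochberg sketch returning {raw p-value: adjusted p-value},
    # matching the FDRs[PValue] lookup in the filtering loop above.
    ordered = sorted(pvalues_by_pair.values())
    n = len(ordered)
    adjusted = {}
    running_min = 1.0
    # Walk from the largest raw p-value down, keeping the adjusted values monotone
    for rank in range(n, 0, -1):
        running_min = min(running_min, ordered[rank - 1] * n / float(rank))
        adjusted[ordered[rank - 1]] = running_min
    return adjusted

FDRs = toy_bh_adjust({("GeneA", "GeneB"): 0.001, ("GeneA", "GeneC"): 0.04, ("GeneB", "GeneC"): 0.9})
# Adjusted values: 0.001 -> 0.003, 0.04 -> 0.06, 0.9 -> 0.9; at Arguments.FDR = 0.05,
# only the ("GeneA", "GeneB") pair would survive the filter above.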
Example #4
def supervised(Arguments):
    '''
    MOCA pairwise calculations executed if a 'Phenotype' is provided in the Arguments file. Not technically
    supervised 'learning', as there is no optimization (every possible pairwise comparison is tested). 
    Output includes performance metrics such as sensitivity, specificity, PPV, and NPV, for each feature's 
    ability to predict the phenotype. 
    '''
    
    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    #Clustermode support if called
    Node = int(Arguments.MultiProcessMode[0]) 
    TotalNodes = int(Arguments.MultiProcessMode[1]) 

    for Phenotype in Phenotypes[Node::TotalNodes]:
        PValues = {}
        Interactions = {}
        Performances = {}
        SampleCounts = {}
        CaseCounts = {} #just the positive class here
        EffectSizes  = {}
        for Marker in Markers:
            TP,FP,FN,TN = contingency_table(Variates[Features.index(Marker)], Variates[Features.index(Phenotype)],
                                            NA=Arguments.NA)
            PValue = fisher(TP,FP,FN,TN)
            PValues[Marker] = PValue.two_tail
            Interaction = interaction(PValue)
            Interactions[Marker] = Interaction
            Performances[Marker] = Performance(Interaction, TP,FP,FN,TN)
            EffectSizes[Marker] = EffectSize(Interaction, TP,FP,FN,TN)
            SampleCounts[Marker] = TP + FP + FN + TN
            CaseCounts[Marker] = TP + FN

        FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
        for Marker in Markers:
            FDR = FDRs[PValues[Marker]]
            if FDR < Arguments.FDR:
                pass
            else:
                PValues.pop(Marker, None)
                Interactions.pop(Marker, None)
                Performances.pop(Marker, None)
                SampleCounts.pop(Marker, None)
                CaseCounts.pop(Marker, None)
                EffectSizes.pop(Marker, None)

        if len(PValues.keys()):
            Results = {}
            Results["Report"] = make_report(Labels, PValues.keys(), Arguments, Supervised=Phenotype[:Phenotype.index(":")])
            Results["PValues"] = PValues
            Results["Interactions"] = Interactions
            Results["Performances"] = Performances
            Results["FDRs"] = FDRs
            Results["SampleCounts"] = SampleCounts
            Results["CaseCounts"] = CaseCounts
            Results["EffectSizes"] = EffectSizes
            
            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(["Pairwise", "Phenotype=" + Phenotype[:Phenotype.index(":")],
                                   "_".join(sorted(DataTypes)), str(Arguments.FeatureMin), Arguments.CorrectionMethod])
            else:
                Pickle = Arguments.Filename + "_" + Phenotype[:Phenotype.index(":")]
                
            cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)
        
    return
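Performance and EffectSize are MOCA classes; as a rough sketch, the metrics named in the docstring (sensitivity, specificity, PPV, NPV) follow directly from the same TP/FP/FN/TN counts. The helper below is hypothetical and ignores the interaction type that the real Performance class takes into account.

def toy_performance(TP, FP, FN, TN):
    #Hypothetical sketch of the standard metrics named in the docstring above
    sensitivity = TP / float(TP + FN)  #fraction of phenotype-positive samples the marker catches
    specificity = TN / float(TN + FP)  #fraction of phenotype-negative samples correctly excluded
    ppv = TP / float(TP + FP)          #positive predictive value
    npv = TN / float(TN + FN)          #negative predictive value
    return {"sens": sensitivity, "spec": specificity, "PPV": ppv, "NPV": npv}

print(toy_performance(TP=30, FP=10, FN=5, TN=55))
#sens ~0.86, spec ~0.85, PPV 0.75, NPV ~0.92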
Example #5
def supervised(Arguments):
    """
    MOCA pairwise calculations executed if a 'Phenotype' is provided in the Arguments file. Not technically
    supervised 'learning', as there is no optimization (every possible pairwise comparison is tested). 
    Output includes performance metrics such as sensitivity, specificity, PPV, and NPV, for each feature's 
    ability to predict the phenotype. 
    """

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    # Clustermode support if called
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:
        PValues = {}
        Interactions = {}
        Performances = {}
        SampleCounts = {}
        CaseCounts = {}  # just the positive class here
        EffectSizes = {}
        for Marker in Markers:
            TP, FP, FN, TN = contingency_table(
                Variates[Features.index(Marker)], Variates[Features.index(Phenotype)], NA=Arguments.NA
            )
            PValue = fisher(TP, FP, FN, TN)
            PValues[Marker] = PValue.two_tail
            Interaction = interaction(PValue)
            Interactions[Marker] = Interaction
            Performances[Marker] = Performance(Interaction, TP, FP, FN, TN)
            EffectSizes[Marker] = EffectSize(Interaction, TP, FP, FN, TN)
            SampleCounts[Marker] = TP + FP + FN + TN
            CaseCounts[Marker] = TP + FN

        FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
        for Marker in Markers:
            FDR = FDRs[PValues[Marker]]
            if FDR < Arguments.FDR:
                pass
            else:
                PValues.pop(Marker, None)
                Interactions.pop(Marker, None)
                Performances.pop(Marker, None)
                SampleCounts.pop(Marker, None)
                CaseCounts.pop(Marker, None)
                EffectSizes.pop(Marker, None)

        if len(PValues.keys()):
            Results = {}
            Results["Report"] = make_report(
                Labels, PValues.keys(), Arguments, Supervised=Phenotype[: Phenotype.index(":")]
            )
            Results["PValues"] = PValues
            Results["Interactions"] = Interactions
            Results["Performances"] = Performances
            Results["FDRs"] = FDRs
            Results["SampleCounts"] = SampleCounts
            Results["CaseCounts"] = CaseCounts
            Results["EffectSizes"] = EffectSizes

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(
                    [
                        "Pairwise",
                        "Phenotype=" + Phenotype[: Phenotype.index(":")],
                        "_".join(sorted(DataTypes)),
                        str(Arguments.FeatureMin),
                        Arguments.CorrectionMethod,
                    ]
                )
            else:
                Pickle = Arguments.Filename + "_" + Phenotype[: Phenotype.index(":")]

            cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
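The Phenotypes[Node::TotalNodes] slice is what spreads work across nodes in MultiProcessMode: each node takes every TotalNodes-th phenotype starting at its own offset, so together the nodes cover the list exactly once with no overlap. A quick illustration with hypothetical phenotype names (the ':' suffix mirrors the Phenotype.index(":") splitting above):

Phenotypes = ["Response:0", "Survival:1", "Relapse:2", "Grade:3", "Stage:4"]
TotalNodes = 2
# Node 0 runs Phenotypes[0::2] -> ['Response:0', 'Relapse:2', 'Stage:4']
# Node 1 runs Phenotypes[1::2] -> ['Survival:1', 'Grade:3']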