Exemple #1
0
def validate_markers(Arguments):
    '''
    Re-score every marker listed in the "ValidateMarkers" results file
    against every phenotype in the supervised dataset, writing one CSV row
    of performance statistics per (phenotype, marker) pair to
    "<Arguments.Filename>.csv".

    A marker that cannot be scored (any ValueError raised while assembling
    the setwork or computing its statistics) gets a row containing its set
    definitions followed by "NA", so the output still accounts for every
    input marker.
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)
    Header, Results = load_results_file(get_path("ValidateMarkers"))

    # Hoist the column lookups out of the loops: list.index is a linear scan
    # and the columns never change between markers (the original repeated up
    # to seven lookups per row). If a column is missing this now fails fast,
    # before the CSV is created, instead of mid-write.
    Union = Header.index("Union")
    Intersection = Header.index("Intersection")
    Difference = Header.index("Difference")
    Interaction = Header.index("Interaction")

    # "wb" is the correct csv writer mode for Python 2 (this codebase uses
    # print statements and cPickle elsewhere, so it is Python 2).
    CSVfile = open(Arguments.Filename + ".csv", "wb")
    try:
        CSVwriter = csv.writer(CSVfile, dialect='excel')
        CSVwriter.writerow(["Union","Intersection","Difference","Interaction", "Phenotype", "P-value", "Odds Ratio", "Effect Size",
                            "Sensitivity","Specificity","PPV","NPV","Accuracy", "MCC", "Sample Count", "Case Count"])
        for Phenotype in Phenotypes:
            Response = Variates[Features.index(Phenotype)]
            for Marker in Results:
                try:
                    Predictor = assemble_setwork(Features, Variates,
                                                 filter(None, Marker[Union].split(", ")),
                                                 filter(None, Marker[Intersection].split(", ")),
                                                 filter(None, Marker[Difference].split(", ")), Arguments)

                    TP,FP,FN,TN = contingency_table(Predictor, Response, NA=Arguments.NA)
                    performance = Performance(Marker[Interaction], TP,FP,FN,TN)
                    effect_size = EffectSize(Marker[Interaction], TP,FP,FN,TN)
                    CSVwriter.writerow([Marker[Union], Marker[Intersection], Marker[Difference],
                                        Marker[Interaction], Phenotype[:Phenotype.index(":")], "%0.2e" %fisher(TP,FP,FN,TN).two_tail,
                                        "%0.2f" %effect_size.odds_ratio, "%0.2f" %effect_size.difference_of_proportions, "%0.2f" %performance.sensitivity,
                                        "%0.2f" %performance.specificity, "%0.2f" %performance.PPV, "%0.2f" %performance.NPV,
                                        "%0.2f" %performance.accuracy, "%0.2f" %performance.MCC, TP+FP+FN+TN, TP+FN])
                except ValueError:
                    # Scoring failed for this (phenotype, marker) pair:
                    # record the set definitions with "NA" and move on.
                    CSVwriter.writerow([Marker[Union], Marker[Intersection], Marker[Difference], "NA"])
    finally:
        # Close the handle even if an unexpected exception escapes the loops
        # (the original leaked it on error).
        CSVfile.close()

    return
Exemple #2
0
def get_rowsidematrix(Features, Arguments):
    '''
    Look up a p-value for each feature in the "RowSideColors" file and map
    its base-10 exponent onto a gray-scale color name: "black" for exponents
    <= -10, "white" for exponents >= -1, and a gray shade in between. The
    colors are handed to string_matrix under the single label "Mesenchymal".
    '''

    GrayScale = {-9: "gray10", -8: "gray20", -7: "gray30", -6: "gray40",
                 -5: "gray50", -4: "gray60", -3: "gray70", -2: "gray80"}

    # Each row of the RowSideColors file is "<feature> <value> ...";
    # later duplicates of a feature overwrite earlier ones, as before.
    Matrix = {}
    for Row in file(get_path("RowSideColors")):
        Fields = Row.split()
        Matrix[Fields[0]] = Fields[1]

    PValues = [Matrix[Feature] for Feature in Features]

    Label = ["Mesenchymal"]

    Variates = []
    for PValue in PValues:
        # "%0.2e" % 3.4e-07 -> "3.40e-07"; the piece after "e" is the exponent.
        Exponent = int(("%0.2e" % float(PValue)).split("e")[1])
        if Exponent <= -10:
            Variates.append("black")
        elif Exponent >= -1:
            Variates.append("white")
        else:
            Variates.append(GrayScale[Exponent])

    return string_matrix(Features, Label, [Variates], Arguments)
Exemple #3
0
def make_priors(Arguments):
    '''
    Build a Priors file from any setwork output (typically a Setworks or
    ValidateBiomarkers run). The resulting file can be fed back into MOCA to
    bias future searches toward individual features that were selected with
    high frequency in a previous run -- useful when the data is too big to
    search thru with the normal optimization process unaided. The file has
    three lines: "Union = ...", "Intersection = ...", "Difference = ...".
    '''

    Header, Markers = load_results_file(get_path("Priors"))

    # Column 0 = union, 1 = intersection, 2 = difference. Each non-empty
    # cell is a whitespace-separated list of comma-decorated feature names.
    UnionFeatures = []
    IntersectionFeatures = []
    DifferenceFeatures = []
    Columns = ((0, UnionFeatures), (1, IntersectionFeatures), (2, DifferenceFeatures))
    for Marker in Markers:
        for Index, Features in Columns:
            if Marker[Index]:
                Features.extend([Feature.strip(",") for Feature in Marker[Index].split()])

    Priors = open(Arguments.Filename, "w")
    Priors.write("Union = %s \n" % " ".join(UnionFeatures))
    Priors.write("Intersection = %s \n" % " ".join(IntersectionFeatures))
    Priors.write("Difference = %s \n" % " ".join(DifferenceFeatures))
    Priors.close()

    return
Exemple #4
0
def load_setworks(Arguments):
    '''
    Load the setworks pickled dictionary of dictionaries, and return lists that have been truncated
    to specified number of results (TopInteractions = 100, by default). The list can additionally be
    truncated using performance metric, thru the MinimumPerformance argument. 
    '''

    # Unpickle the results dictionary written by a previous Setworks run.
    Results = cPickle.load(open(get_path("MOCA.results") + "/" + Arguments.Filename, "rb"))

    # Every entry below is a per-barcode dictionary keyed by the same
    # barcodes listed in Results["Barcodes"].
    PValues = Results["PValues"] 
    QValues = Results["QValues"] 
    Performances = Results["Performances"]
    Interactions = Results["Interactions"] 
    FeatureVectors = Results["FeatureVectors"]
    UnionFeatures = Results["UnionFeatures"]
    IntersectionFeatures = Results["IntersectionFeatures"] 
    DifferenceFeatures = Results["DifferenceFeatures"]
    SampleCounts = Results["SampleCounts"]
    CaseCounts = Results["CaseCounts"]
    EffectSizes = Results["EffectSizes"]
    Barcodes = Results["Barcodes"]
    Report = Results["Report"]
    Labels = Results["Labels"]

    #Get rid of barcodes that for setworks that didn't pass a performance threshold (if provided)
    Barcodes = [Barcode for Barcode in Barcodes if minimum_performance(Performances[Barcode], Arguments)]

    # NOTE(review): the filtered list built above is immediately overwritten --
    # rank() is called with the full Performances dict, so the
    # MinimumPerformance filter appears to have no effect here. Confirm
    # whether rank() should instead operate on the filtered barcodes.
    #Initial sort will be done by decreasing balanced accuracy
    Barcodes = rank(Performances, Arguments.RankMethod, Arguments.TopInteractions)
    
    return Barcodes, PValues, QValues, Performances, Interactions, \
        UnionFeatures, IntersectionFeatures, DifferenceFeatures, \
        FeatureVectors, SampleCounts, CaseCounts, EffectSizes, Report, Labels
Exemple #5
0
def load_validation_data(Arguments):
    '''
    Load the python pickles produced by LeaveSomeOut cross-validation runs.
    Every file in MOCA.results whose name contains
    "<Arguments.Filename>.Validation" is unpickled, and selected entries are
    copied into a dict keyed by file name.
    '''

    ResultsDir = get_path("MOCA.results")
    Tag = Arguments.Filename + ".Validation"

    # One sub-dict per matching cross-validation pickle.
    CrossValidations = {}
    for File in os.listdir(ResultsDir):
        if Tag in File:
            CrossValidations[File] = {}

    for CrossValidation in CrossValidations.keys():
        Results = cPickle.load(open(ResultsDir + "/" + CrossValidation, "rb"))
        Entry = CrossValidations[CrossValidation]

        # Case/control counts live inside the report; everything else is
        # copied over verbatim under its own key.
        Entry["Cases"] = Results["Report"]["Cases"][1]
        Entry["Controls"] = Results["Report"]["Controls"][1]
        for Key in ("Barcodes", "PValues", "QValues", "Performances",
                    "Interactions", "FeatureVectors", "UnionFeatures",
                    "IntersectionFeatures", "DifferenceFeatures", "Report"):
            Entry[Key] = Results[Key]

    return CrossValidations
Exemple #6
0
def get_rowsidematrix(Features, Arguments):
    '''
    Translate each feature's p-value (read from the "RowSideColors" file)
    into a gray-scale color name based on its base-10 exponent: "black" at
    or below -10, "white" at or above -1, and one of eight gray shades in
    between. Passes the colors to string_matrix under the label
    "Mesenchymal".
    '''

    GrayScale = {
        -9: "gray10", -8: "gray20", -7: "gray30", -6: "gray40",
        -5: "gray50", -4: "gray60", -3: "gray70", -2: "gray80",
    }

    # First two whitespace-separated fields of each row: feature -> value.
    Matrix = dict((Line.split()[0], Line.split()[1])
                  for Line in file(get_path("RowSideColors")))

    PValues = [Matrix[Feature] for Feature in Features]

    Label = ["Mesenchymal"]

    def shade(PValue):
        # "%0.2e" renders e.g. "3.40e-07"; the text after "e" is the exponent.
        Exponent = int(("%0.2e" % float(PValue)).split("e")[1])
        if Exponent <= -10:
            return "black"
        if Exponent >= -1:
            return "white"
        return GrayScale[Exponent]

    Variates = [shade(PValue) for PValue in PValues]

    return string_matrix(Features, Label, [Variates], Arguments)
Exemple #7
0
def pairwise(Arguments):
    '''
    Print results from you MOCA pairwise runs. 
    '''

    Results = cPickle.load(open(get_path("MOCA.results") + "/" + Arguments.Filename, "rb"))

    if not Results["Report"]["Pairwise"]:
        print "You have 'Pairwise = True' in your arguments file, yet the file you pointed to (MOCA.results/",
        print Arguments.Filename + ") did not result from a pairwise MOCA calculation."
        print "Exiting..."
        exit()

    if Results["Report"]["Continuous-valued correlation"]:
        pairwise_continuous(Results, Arguments)
    elif type(Results["Report"]["Supervised"]) == list:
        supervised(Results, Arguments)
    else:
        unsupervised(Results, Arguments)
    
    return 
Exemple #8
0
def pairwise(Arguments):
    '''
    Print the results of a MOCA pairwise run. Dispatches to the printer
    matching the run type recorded in the pickled Report: continuous-valued
    correlation, supervised, or unsupervised. Exits if the pickle did not
    come from a pairwise calculation.
    '''

    # Unpickle the results dictionary from the MOCA.results directory.
    Results = cPickle.load(
        open(get_path("MOCA.results") + "/" + Arguments.Filename, "rb"))

    # Sanity check: refuse pickles not produced by a pairwise calculation.
    if not Results["Report"]["Pairwise"]:
        print "You have 'Pairwise = True' in your arguments file, yet the file you pointed to (MOCA.results/",
        print Arguments.Filename + ") did not result from a pairwise MOCA calculation."
        print "Exiting..."
        exit()

    # Dispatch on the run type; a list-valued "Supervised" entry marks a
    # supervised run.
    if Results["Report"]["Continuous-valued correlation"]:
        pairwise_continuous(Results, Arguments)
    elif type(Results["Report"]["Supervised"]) == list:
        supervised(Results, Arguments)
    else:
        unsupervised(Results, Arguments)

    return
Exemple #9
0
def load_setworks(Arguments):
    '''
    Unpickle the dictionary-of-dictionaries written by a Setworks run and
    return its components. The barcode list is ranked and truncated to
    Arguments.TopInteractions (100 by default); it can additionally be
    filtered by a performance metric via the MinimumPerformance argument.
    '''

    Pickle = get_path("MOCA.results") + "/" + Arguments.Filename
    Results = cPickle.load(open(Pickle, "rb"))

    # Unpack the per-barcode dictionaries in one multi-target assignment.
    (PValues, QValues, Performances, Interactions, FeatureVectors,
     UnionFeatures, IntersectionFeatures, DifferenceFeatures,
     SampleCounts, CaseCounts, EffectSizes, Report, Labels) = [
        Results[Key] for Key in (
            "PValues", "QValues", "Performances", "Interactions",
            "FeatureVectors", "UnionFeatures", "IntersectionFeatures",
            "DifferenceFeatures", "SampleCounts", "CaseCounts",
            "EffectSizes", "Report", "Labels")]

    # Drop barcodes whose setworks missed the performance threshold, if one
    # was provided.
    Barcodes = [Barcode for Barcode in Results["Barcodes"]
                if minimum_performance(Performances[Barcode], Arguments)]

    # Initial sort is by decreasing balanced accuracy.
    # NOTE(review): rank() receives the full Performances dict and replaces
    # the filtered list above; preserved as-is to keep behavior identical,
    # but confirm whether rank() should see only the filtered barcodes.
    Barcodes = rank(Performances, Arguments.RankMethod,
                    Arguments.TopInteractions)

    return Barcodes, PValues, QValues, Performances, Interactions, \
        UnionFeatures, IntersectionFeatures, DifferenceFeatures, \
        FeatureVectors, SampleCounts, CaseCounts, EffectSizes, Report, Labels
Exemple #10
0
def leave_some_out(Arguments):
    ''' 
    Function for selecting biomarkers from cross-validation output. Strict in that it only returns 
    biomarkers that were selected during each cross validation. Might have the benefit, relative 
    to vote-based prediction, that these were so predictive that they will translate better to future 
    predictions. Also lends itself to simple clinical use because they require little or no computational 
    support for subsequent prediction...it is simply the marker
    '''

    CrossValidations = load_validation_data(Arguments)
    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)
    # Only the first phenotype is scored -- presumably a single-phenotype run
    # is assumed here; TODO confirm for multi-phenotype datasets.
    Phenotype = Phenotypes[0]
    Response = Variates[Features.index(Phenotype)]

    # Group barcodes by their setwork definition (union, intersection,
    # difference, interaction). The tuple is the identity of a setwork, so
    # the same setwork selected in several cross-validations collects all of
    # its barcodes under one key.
    Setworks = {}
    for CrossValidation in CrossValidations:
        for Barcode in CrossValidations[CrossValidation]["Barcodes"]:
            Setwork = (CrossValidations[CrossValidation]["UnionFeatures"][Barcode], \
                           CrossValidations[CrossValidation]["IntersectionFeatures"][Barcode], \
                           CrossValidations[CrossValidation]["DifferenceFeatures"][Barcode], \
                           CrossValidations[CrossValidation]["Interactions"][Barcode])

            # dict.has_key is Python 2 only.
            if Setworks.has_key(Setwork): Setworks[Setwork].append(Barcode)
            else: Setworks[Setwork] = [Barcode]

    # Per-barcode result dictionaries, re-scored on the full dataset below.
    PValues = {}
    QValues = {}
    Performances = {}
    Interactions = {}
    FeatureVectors = {}
    UnionFeatures = {}
    IntersectionFeatures = {}
    DifferenceFeatures = {}
    SampleCounts = {}
    CaseCounts = {}
    EffectSizes = {}
    
    Barcodes = []
    for Setwork in Setworks: 
        if len(Setworks[Setwork]) == len(CrossValidations): #Sework had to be selected in each cross validation!!!
            Union, Intersection, Difference, Interaction = Setwork
            Predictor = assemble_setwork(Features, Variates, Union, Intersection, Difference, Arguments)
            TP,FP,FN,TN = contingency_table(Predictor, Response, NA=Arguments.NA)

            # Any one of the grouped barcodes can represent the setwork; take
            # the first.
            Barcode = Setworks[Setwork][0] 
            Barcodes.append(Barcode)
            PValues[Barcode] = fisher(TP,FP,FN,TN).two_tail
            # No multiple-testing correction is applied at this stage.
            QValues[Barcode] = "NA"
            Performances[Barcode] = Performance(Interaction, TP,FP,FN,TN)
            Interactions[Barcode] = Interaction
            EffectSizes[Barcode] = EffectSize(Interactions[Barcode], TP,FP,FN,TN)
            FeatureVectors[Barcode] = Predictor
            UnionFeatures[Barcode] = Union
            IntersectionFeatures[Barcode] = Intersection
            DifferenceFeatures[Barcode] = Difference
            SampleCounts[Barcode] = TP + FP + FN + TN
            CaseCounts[Barcode] = TP + FN

    # Assemble the results dictionary in the same shape load_setworks expects.
    Results = {}
    Results["PValues"] = PValues
    Results["QValues"] = QValues
    Results["Performances"] = Performances 
    Results["Interactions"] = Interactions
    Results["FeatureVectors"] = FeatureVectors
    Results["UnionFeatures"] = UnionFeatures
    Results["IntersectionFeatures"] = IntersectionFeatures
    Results["DifferenceFeatures"] = DifferenceFeatures
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["EffectSizes"] = EffectSizes
    #Doesn't matter which index we use we just need one report. The last accessed 'CrossValidation' will do. 
    # (Relies on the loop variable 'CrossValidation' leaking out of the loop
    # above -- valid Python, but note it breaks if CrossValidations is empty.)
    Results["Report"] = make_report(Labels, CrossValidations[CrossValidation]["Report"])
    Results["Labels"] = Labels
    Results["Barcodes"] = Barcodes
    Results["Phenotype"] = Response

    # Build the output pickle name: either an auto-generated descriptive name
    # ("default") or the user-supplied filename plus the phenotype.
    if Arguments.Filename.lower() == "default":
        DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
        Pickle = "_".join(["Phenotype=" + Phenotype[:Phenotype.index(":")],
                           "_".join(sorted(DataTypes)), str(Arguments.FeatureMin), Arguments.CorrectionMethod,
                           "".join(map(str, Arguments.BooleanSets)), "".join(map(str, Arguments.Optimization))])
    else:
        Pickle = Arguments.Filename + "_" + Phenotype[:Phenotype.index(":")]
        
    cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    # NOTE(review): the triple-quoted block below is dead code (a legacy CSV
    # writer, disabled by turning it into a bare string literal); left intact.
    '''
    #Get rid of barcodes that for setworks that didn't pass a performance threshold (if provided)
    Barcodes = [Barcode for Barcode in Barcodes if minimum_performance(Performances[Barcode], Arguments)]

    #Initial sort will be done by decreasing balanced accuracy
    Barcodes = sorted(Barcodes, key=lambda Barcode: \
                      (Performances[Barcode].sensitivity + Performances[Barcode].specificity)/2, reverse=True)

    CSVfile = open(Arguments.Filename + ".csv", "wb")
    CSVwriter = csv.writer(CSVfile, dialect='excel')

    #Right the excel header
    CSVwriter.writerow(["Union","Intersection","Difference","Interaction", "Phenotype",
                        "Sensitivity","Specificity","PPV","NPV","Accuracy", "Sample Count"])
    
    for Barcode in Barcodes:
        p = Performances[Barcode]
        Sens, Spec, PPV, NPV, Accuracy = p.sensitivity, p.specificity, p.PPV, p.NPV, p.accuracy 
        CSVwriter.writerow([", ".join(UnionFeatures[Barcode]), ", ".join(IntersectionFeatures[Barcode]),
                           ", ".join(DifferenceFeatures[Barcode]), Interactions[Barcode],
                            Phenotype[:Phenotype.index(":")], "%0.2f" %Sens, "%0.2f" %Spec,
                            "%0.2f" %PPV, "%0.2f" %NPV, "%0.2f" %Accuracy, SampleCount[Barcode]])

    CSVfile.close()
    '''
    
    return 
Exemple #11
0
            exit()
    else:
        print "To run MOCA you need to choose either a mode, request a report, or call your own function from MyMOCA."
        print "Please see the 'Arguments file' and/or execute moca.py --help. Exiting..."
        exit()


# Check to see that you have a paths file called 'Paths' in your current working directory
if not os.path.isfile("Paths"):
    print "You must have a paths file in your current-working directory, and it must be called 'Paths'"
    print "Exiting..."
    exit()

# MOCA data and results directories must exist for any MOCA calculation
# (Python 2 os.makedirs has no exist_ok; an already-existing directory raises
# OSError, which is treated as success. NOTE(review): this also swallows
# other OSErrors, e.g. permission failures -- confirm that is intended.)
try:
    os.makedirs(get_path("MOCA.data"))
except OSError:
    pass

try:
    os.makedirs(get_path("MOCA.results"))
except OSError:
    pass

# vvv Doesn't work well in MultiprocessMode vvv
# if not os.path.exists(get_path("MOCA.results")): os.makedirs(get_path("MOCA.results"))

# Test the speed of MOCA...
if get_arguments().Profile:
    cProfile.run("main(get_arguments())", sort="cumulative")
else:  # ...or just do a MOCA run
Exemple #12
0
            exit()
    else:
        print "To run MOCA you need to choose either a mode, request a report, or call your own function from MyMOCA."
        print "Please see the 'Arguments file' and/or execute moca.py --help. Exiting..."
        exit()


#Check to see that you have a paths file called 'Paths' in your current working directory
if not os.path.isfile("Paths"):
    print "You must have a paths file in your current-working directory, and it must be called 'Paths'"
    print "Exiting..."
    exit()

#MOCA data and results directories must exist for any MOCA calculation
#(Python 2 os.makedirs has no exist_ok; an already-existing directory raises
#OSError, which is treated as success. NOTE(review): this also swallows other
#OSErrors such as permission failures -- confirm that is intended.)
try:
    os.makedirs(get_path("MOCA.data"))
except OSError:
    pass

try:
    os.makedirs(get_path("MOCA.results"))
except OSError:
    pass

# vvv Doesn't work well in MultiprocessMode vvv
#if not os.path.exists(get_path("MOCA.results")): os.makedirs(get_path("MOCA.results"))

#Test the speed of MOCA...
if get_arguments().Profile:
    cProfile.run("main(get_arguments())", sort="cumulative")
else:  #...or just do a MOCA run