Esempio n. 1
0
def validate_markers(Arguments):
    '''
    Re-score stored markers against every phenotype and write the results to a CSV file.

    The markers come from load_results_file(get_path("ValidateMarkers")), which yields a
    header row and the marker rows. For each phenotype returned by get_supervised_dataset
    and each marker row, the "Union", "Intersection" and "Difference" fields are split on
    ", " (empty entries dropped), combined by assemble_setwork into a predictor, and
    compared with the phenotype's variate via contingency_table. One row of statistics is
    written per (phenotype, marker) pair to Arguments.Filename + ".csv".

    Arguments -- option object; Arguments.Filename and Arguments.NA are read here and the
                 whole object is forwarded to get_supervised_dataset and assemble_setwork.

    Returns None. If scoring a marker raises ValueError, a short row holding the three
    feature fields followed by "NA" is written for that marker instead.
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)
    Header, Results = load_results_file(get_path("ValidateMarkers"))

    # "wb" is the mode the Python 2 csv module expects. The with-block guarantees the file
    # is closed even if an exception other than the handled ValueError escapes the loops.
    with open(Arguments.Filename + ".csv", "wb") as CSVfile:
        CSVwriter = csv.writer(CSVfile, dialect='excel')
        CSVwriter.writerow(["Union","Intersection","Difference","Interaction", "Phenotype", "P-value", "Odds Ratio", "Effect Size",
                            "Sensitivity","Specificity","PPV","NPV","Accuracy", "MCC", "Sample Count", "Case Count"])

        for Phenotype in Phenotypes:
            Response = Variates[Features.index(Phenotype)]
            for Marker in Results:
                # Needed by both the normal row and the "NA" fallback row, so look them up once.
                # A header missing one of these columns raises ValueError out of the function,
                # exactly as it did when the lookup was repeated inside the except handler.
                Union = Marker[Header.index("Union")]
                Intersection = Marker[Header.index("Intersection")]
                Difference = Marker[Header.index("Difference")]

                try:
                    Interaction = Marker[Header.index("Interaction")]

                    # List comprehensions instead of filter(None, ...): same list on Python 2,
                    # and still a real list (not a lazy iterator) on Python 3.
                    Predictor = assemble_setwork(Features, Variates,
                                                 [Name for Name in Union.split(", ") if Name],
                                                 [Name for Name in Intersection.split(", ") if Name],
                                                 [Name for Name in Difference.split(", ") if Name], Arguments)

                    TP,FP,FN,TN = contingency_table(Predictor, Response, NA=Arguments.NA)
                    performance = Performance(Interaction, TP,FP,FN,TN)
                    effect_size = EffectSize(Interaction, TP,FP,FN,TN)

                    # The phenotype is reported as the text before its first ":"; a phenotype
                    # without ":" raises ValueError here and therefore yields the "NA" row.
                    CSVwriter.writerow([Union, Intersection, Difference,
                                        Interaction, Phenotype[:Phenotype.index(":")], "%0.2e" %fisher(TP,FP,FN,TN).two_tail,
                                        "%0.2f" %effect_size.odds_ratio, "%0.2f" %effect_size.difference_of_proportions, "%0.2f" %performance.sensitivity,
                                        "%0.2f" %performance.specificity, "%0.2f" %performance.PPV, "%0.2f" %performance.NPV,
                                        "%0.2f" %performance.accuracy, "%0.2f" %performance.MCC, TP+FP+FN+TN, TP+FN])
                except ValueError:
                    # Deliberate best effort: keep going and flag the marker that could not be scored.
                    CSVwriter.writerow([Union, Intersection, Difference, "NA"])
Esempio n. 2
0
def leave_some_out(Arguments):
    '''
    Select biomarkers (setworks) from cross-validation output and pickle their scores.

    Strict selection: a setwork is kept only if it was reported as many times as there are
    cross validations, i.e. it is intended to have been selected in every one of them.
    Relative to vote-based prediction, such markers might translate better to future
    predictions, and they lend themselves to simple clinical use because they need little
    or no computational support afterwards: the prediction is simply the marker.

    Each kept setwork is re-assembled on the full dataset from get_supervised_dataset and
    scored against the first phenotype only (Phenotypes[0]). The per-barcode results are
    gathered in a dict and dumped with cPickle under get_path("MOCA.results").

    Arguments -- option object; read here: NA, Filename and, when Filename is "default"
                 (case-insensitive), Data, Phenotype, FeatureMin, CorrectionMethod,
                 BooleanSets and Optimization. It is also forwarded to load_validation_data,
                 get_supervised_dataset and assemble_setwork.

    Returns None.
    '''

    CrossValidations = load_validation_data(Arguments)
    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)
    Phenotype = Phenotypes[0]
    Response = Variates[Features.index(Phenotype)]

    # Map every setwork (union, intersection, difference, interaction) to the list of
    # barcodes under which the cross validations reported it.
    Setworks = {}
    for CrossValidation in CrossValidations:
        Fold = CrossValidations[CrossValidation]
        for Barcode in Fold["Barcodes"]:
            Setwork = (Fold["UnionFeatures"][Barcode],
                       Fold["IntersectionFeatures"][Barcode],
                       Fold["DifferenceFeatures"][Barcode],
                       Fold["Interactions"][Barcode])
            Setworks.setdefault(Setwork, []).append(Barcode)

    PValues = {}
    QValues = {}
    Performances = {}
    Interactions = {}
    FeatureVectors = {}
    UnionFeatures = {}
    IntersectionFeatures = {}
    DifferenceFeatures = {}
    SampleCounts = {}
    CaseCounts = {}
    EffectSizes = {}

    Barcodes = []
    for Setwork in Setworks:
        # The setwork had to be selected in each cross validation.
        if len(Setworks[Setwork]) != len(CrossValidations):
            continue

        Union, Intersection, Difference, Interaction = Setwork
        Predictor = assemble_setwork(Features, Variates, Union, Intersection, Difference, Arguments)
        TP,FP,FN,TN = contingency_table(Predictor, Response, NA=Arguments.NA)

        # Any of the setwork's barcodes would do as its key; the first one is used.
        Barcode = Setworks[Setwork][0]
        Barcodes.append(Barcode)
        PValues[Barcode] = fisher(TP,FP,FN,TN).two_tail
        QValues[Barcode] = "NA"  # no multiple-testing correction is computed here
        Performances[Barcode] = Performance(Interaction, TP,FP,FN,TN)
        Interactions[Barcode] = Interaction
        EffectSizes[Barcode] = EffectSize(Interaction, TP,FP,FN,TN)
        FeatureVectors[Barcode] = Predictor
        UnionFeatures[Barcode] = Union
        IntersectionFeatures[Barcode] = Intersection
        DifferenceFeatures[Barcode] = Difference
        SampleCounts[Barcode] = TP + FP + FN + TN
        CaseCounts[Barcode] = TP + FN

    Results = {
        "PValues": PValues,
        "QValues": QValues,
        "Performances": Performances,
        "Interactions": Interactions,
        "FeatureVectors": FeatureVectors,
        "UnionFeatures": UnionFeatures,
        "IntersectionFeatures": IntersectionFeatures,
        "DifferenceFeatures": DifferenceFeatures,
        "SampleCounts": SampleCounts,
        "CaseCounts": CaseCounts,
        "EffectSizes": EffectSizes,
        # Doesn't matter which index we use, we just need one report. The last accessed
        # 'CrossValidation' (left over from the loop above) will do.
        # NOTE(review): with no cross validations at all that loop variable is never bound
        # and this line raises NameError -- confirm callers never pass empty validation data.
        "Report": make_report(Labels, CrossValidations[CrossValidation]["Report"]),
        "Labels": Labels,
        "Barcodes": Barcodes,
        "Phenotype": Response,
    }

    # The phenotype is named by the text before its first ":".
    PhenotypeName = Phenotype[:Phenotype.index(":")]
    if Arguments.Filename.lower() == "default":
        DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
        Pickle = "_".join(["Phenotype=" + PhenotypeName,
                           "_".join(sorted(DataTypes)), str(Arguments.FeatureMin), Arguments.CorrectionMethod,
                           "".join(map(str, Arguments.BooleanSets)), "".join(map(str, Arguments.Optimization))])
    else:
        Pickle = Arguments.Filename + "_" + PhenotypeName

    # Protocol -1 selects the highest pickle protocol available. The with-block closes the
    # output file deterministically instead of leaving the handle to the garbage collector.
    with open(get_path("MOCA.results") + "/" + Pickle, "wb") as PickleFile:
        cPickle.dump(Results, PickleFile, -1)

    # Disabled CSV export, kept verbatim for reference; this bare string is never executed.
    '''
    #Get rid of barcodes that for setworks that didn't pass a performance threshold (if provided)
    Barcodes = [Barcode for Barcode in Barcodes if minimum_performance(Performances[Barcode], Arguments)]

    #Initial sort will be done by decreasing balanced accuracy
    Barcodes = sorted(Barcodes, key=lambda Barcode: \
                      (Performances[Barcode].sensitivity + Performances[Barcode].specificity)/2, reverse=True)

    CSVfile = open(Arguments.Filename + ".csv", "wb")
    CSVwriter = csv.writer(CSVfile, dialect='excel')

    #Right the excel header
    CSVwriter.writerow(["Union","Intersection","Difference","Interaction", "Phenotype",
                        "Sensitivity","Specificity","PPV","NPV","Accuracy", "Sample Count"])
    
    for Barcode in Barcodes:
        p = Performances[Barcode]
        Sens, Spec, PPV, NPV, Accuracy = p.sensitivity, p.specificity, p.PPV, p.NPV, p.accuracy 
        CSVwriter.writerow([", ".join(UnionFeatures[Barcode]), ", ".join(IntersectionFeatures[Barcode]),
                           ", ".join(DifferenceFeatures[Barcode]), Interactions[Barcode],
                            Phenotype[:Phenotype.index(":")], "%0.2f" %Sens, "%0.2f" %Spec,
                            "%0.2f" %PPV, "%0.2f" %NPV, "%0.2f" %Accuracy, SampleCount[Barcode]])

    CSVfile.close()
    '''

    return