# NOTE(review): the two prints below appear to be the tail of an off-screen
# correlation helper (presumably show_correlation_with_column, called in
# __main__ below) — `column_name` and `corr_matrix` are not defined in this
# chunk; confirm against the full file.
print('Correlation with {column_name}'.format(column_name=column_name))
print(corr_matrix[column_name].sort_values(ascending=False))


def save_fig(fig_id, tight_layout=True):
    '''
    Save the current matplotlib figure to the ./images folder.
    The file name includes the current timestamp so repeated saves
    of the same figure id never collide.
    '''
    fig_id += '_' + str(time.time())
    path = os.path.join('.', "images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)


if __name__ == '__main__':
    housing = load_data(HOUSING_PATH, 'housing.csv')
    print(housing.head())
    print(housing.describe())

    scatter_plot_by_column(housing, 'longitude', 'latitude', 'population', 'median_house_value')

    attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
    scatter_matrix_for_attributes(housing, attributes)

    # example for feature mapping - better correlation with the target value
    housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
    housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
    housing["population_per_household"] = housing["population"] / housing["households"]

    # see correlation
    show_correlation_with_column(housing, 'median_house_value')
def unsupervised(Arguments):
    '''
    Pairwise MOCA calculations that are executed if the Phenotype argument
    is False (the default). Similar to so-called 'supervised' pairwise mode,
    except that no performance metrics are calculated (sens, spec, PPV, NPV,
    etc.). In unsupervised mode, you can compare all inter-datatype pairs
    for two datatypes, or all intra-datatype pairs for a single datatype.

    Side effect: pickles a Results dict into the MOCA.results directory.
    '''

    # At most two datatypes are supported in pairwise mode
    if len(Arguments.Data) > 2:
        print "Unsupervised pairwise calculations can consider no more that two datatypes at a time."
        print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
        print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
        print "'Data = ' field. Exiting..."
        exit()

    Data = load_data(Arguments)
    # Parallel flattened lists: Features.index(...) below relies on Features
    # and Variates being built from the same dicts in the same order.
    Features = list(chain(*Data.Transformed.Features.values()))
    Variates = list(chain(*Data.Transformed.Variates.values()))

    # One datatype -> all intra-datatype pairs; two -> inter-datatype only
    if len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features
    if len(Arguments.Data) == 2:
        Features1 = Data.Transformed.Features[Arguments.Data[0]]
        Features2 = Data.Transformed.Features[Arguments.Data[1]]

    PValues = {}
    Interactions = {}
    SampleCounts = {}
    CaseCounts = {} #just the positive class here
    Performances = {}
    EffectSizes = {}

    Tested = []
    for Feature1 in Features1:
        # Appending Feature1 before the inner loop also skips the self-pair
        # and symmetric duplicates when Features1 and Features2 overlap
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                a,b,c,d = contingency_table(Variates[Features.index(Feature1)], Variates[Features.index(Feature2)], NA=Arguments.NA)
                PValue = fisher(a,b,c,d)
                PValues[tuple([Feature1, Feature2])] = PValue.two_tail
                Interactions[tuple([Feature1, Feature2])] = interaction(PValue)
                SampleCounts[tuple([Feature1, Feature2])] = a + b + c + d
                CaseCounts[tuple([Feature1, Feature2])] = a + c
                #A placeholder solely to make pairwise post-processing generalizable
                Performances[tuple([Feature1, Feature2])] = "NA"
                EffectSizes[tuple([Feature1, Feature2])] = "NA"

    # Multiple-testing correction; indexed below by the raw p-value
    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    # Drop every pair failing the FDR cutoff from all result maps.
    # (Python 2 items() returns a list snapshot, so popping while iterating
    # is safe here.)
    for Pair, PValue in PValues.items():
        if FDRs[PValue] < Arguments.FDR:
            pass
        else:
            PValues.pop(Pair, None)
            Interactions.pop(Pair, None)
            SampleCounts.pop(Pair, None)
            CaseCounts.pop(Pair, None)
            Performances.pop(Pair, None)
            EffectSizes.pop(Pair, None)

    Results = {}
    Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments)
    Results["PValues"] = PValues
    Results["Interactions"] = Interactions
    Results["FDRs"] = FDRs
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["Performances"] = Performances
    Results["EffectSizes"] = EffectSizes

    # Default output name encodes mode, datatypes, feature cutoff, correction
    if Arguments.Filename.lower() == "default":
        Pickle = "_".join(["Pairwise", "_".join(sorted(Arguments.Data)), str(Arguments.FeatureMin), Arguments.CorrectionMethod])
    else:
        Pickle = Arguments.Filename

    cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
def pairwise_continuous(Arguments):
    '''
    Pairwise mode for continuous data: for every candidate feature pair,
    compute a correlation and its p-value, FDR-filter the results, and
    (only if any pair survives) pickle them into the MOCA.results directory.

    NOTE(review): the original docstring was empty; the description above is
    inferred from the visible calls (correlation, correlation_pvalue,
    p_adjust, make_report) — confirm against project docs.
    '''

    # At most two datatypes are supported in pairwise mode
    if len(Arguments.Data) > 2:
        print "Unsupervised pairwise calculations can consider no more that two datatypes at a time."
        print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
        print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
        print "'Data = ' field. Exiting..."
        exit()

    Data = load_data(Arguments)
    # Parallel flattened lists; Features.index(...) below depends on order
    Features = list(chain(*Data.Features.values()))
    Variates = list(chain(*Data.Variates.values()))

    if Arguments.Phenotype:
        # Supervised: phenotype features on one side, everything else on the other
        Features1 = [Feature for Feature in Features if Arguments.Phenotype in Feature]
        Features2 = [Feature for Feature in Features if Arguments.Phenotype not in Feature]
    else:
        # Unsupervised: intra-datatype (one datatype) or inter-datatype (two)
        if len(Arguments.Data) == 1:
            Features1 = Features
            Features2 = Features
        if len(Arguments.Data) == 2:
            Features1 = Data.Features[Arguments.Data[0]]
            Features2 = Data.Features[Arguments.Data[1]]

    PValues = {}
    Correlations = {}

    Tested = []
    for Feature1 in Features1:
        # Appending first skips the self-pair and symmetric duplicates
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                PValues[tuple([Feature1, Feature2])] = correlation_pvalue(Variates[Features.index(Feature1)], Variates[Features.index(Feature2)])
                Correlations[tuple([Feature1, Feature2])] = correlation(Variates[Features.index(Feature1)], Variates[Features.index(Feature2)])

    # Multiple-testing correction; indexed below by the raw p-value
    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    # Drop pairs failing the FDR cutoff (Py2 items() snapshot: safe to pop)
    for Pair, PValue in PValues.items():
        if FDRs[PValue] < Arguments.FDR:
            pass
        else:
            PValues.pop(Pair, None)
            Correlations.pop(Pair, None)

    # Only write output when at least one pair passed the filter
    if len(PValues.keys()):
        Results = {}
        Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments, Supervised=Arguments.Phenotype)
        Results["PValues"] = PValues
        Results["Correlations"] = Correlations
        Results["FDRs"] = FDRs

        if Arguments.Filename.lower() == "default":
            Pickle = "_".join(["_".join(sorted(Arguments.Data)), Arguments.CorrectionMethod])
        else:
            Pickle = Arguments.Filename

        cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
def unsupervised(Arguments): """ Pairwise MOCA calculations that are executed if the Phenotype argument is False (the default). Similar to so-called 'supervised' pairwise mode, except that no performance metrics are calculated (sens, spec, PPV, NPV, etc.). In unspervised mode, you can compare all inter-datatype pairs for two datatypes, or all intra-datatype pairs for a single datatype. """ if len(Arguments.Data) > 2: print "Unsupervised pairwise calculations can consider no more that two datatypes at a time." print "If you provide only one datatype, all intra-datatype pairs will be considered. If you" print "provide two datatypes, all inter-datatype comparisons will be made. Please change the" print "'Data = ' field. Exiting..." exit() Data = load_data(Arguments) Features = list(chain(*Data.Transformed.Features.values())) Variates = list(chain(*Data.Transformed.Variates.values())) if len(Arguments.Data) == 1: Features1 = Features Features2 = Features if len(Arguments.Data) == 2: Features1 = Data.Transformed.Features[Arguments.Data[0]] Features2 = Data.Transformed.Features[Arguments.Data[1]] PValues = {} Interactions = {} SampleCounts = {} CaseCounts = {} # just the positive class here Performances = {} EffectSizes = {} Tested = [] for Feature1 in Features1: Tested.append(Feature1) for Feature2 in Features2: if Feature2 not in Tested: a, b, c, d = contingency_table( Variates[Features.index(Feature1)], Variates[Features.index(Feature2)], NA=Arguments.NA ) PValue = fisher(a, b, c, d) PValues[tuple([Feature1, Feature2])] = PValue.two_tail Interactions[tuple([Feature1, Feature2])] = interaction(PValue) SampleCounts[tuple([Feature1, Feature2])] = a + b + c + d CaseCounts[tuple([Feature1, Feature2])] = a + c # A placeholder solely to make pairwise post-processing generalizable Performances[tuple([Feature1, Feature2])] = "NA" EffectSizes[tuple([Feature1, Feature2])] = "NA" FDRs = p_adjust(PValues, Arguments.CorrectionMethod) for Pair, PValue in PValues.items(): if 
FDRs[PValue] < Arguments.FDR: pass else: PValues.pop(Pair, None) Interactions.pop(Pair, None) SampleCounts.pop(Pair, None) CaseCounts.pop(Pair, None) Performances.pop(Pair, None) EffectSizes.pop(Pair, None) Results = {} Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments) Results["PValues"] = PValues Results["Interactions"] = Interactions Results["FDRs"] = FDRs Results["SampleCounts"] = SampleCounts Results["CaseCounts"] = CaseCounts Results["Performances"] = Performances Results["EffectSizes"] = EffectSizes if Arguments.Filename.lower() == "default": Pickle = "_".join( ["Pairwise", "_".join(sorted(Arguments.Data)), str(Arguments.FeatureMin), Arguments.CorrectionMethod] ) else: Pickle = Arguments.Filename cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1) return
def pairwise_continuous(Arguments): """ """ if len(Arguments.Data) > 2: print "Unsupervised pairwise calculations can consider no more that two datatypes at a time." print "If you provide only one datatype, all intra-datatype pairs will be considered. If you" print "provide two datatypes, all inter-datatype comparisons will be made. Please change the" print "'Data = ' field. Exiting..." exit() Data = load_data(Arguments) Features = list(chain(*Data.Features.values())) Variates = list(chain(*Data.Variates.values())) if Arguments.Phenotype: Features1 = [Feature for Feature in Features if Arguments.Phenotype in Feature] Features2 = [Feature for Feature in Features if Arguments.Phenotype not in Feature] else: if len(Arguments.Data) == 1: Features1 = Features Features2 = Features if len(Arguments.Data) == 2: Features1 = Data.Features[Arguments.Data[0]] Features2 = Data.Features[Arguments.Data[1]] PValues = {} Correlations = {} Tested = [] for Feature1 in Features1: Tested.append(Feature1) for Feature2 in Features2: if Feature2 not in Tested: PValues[tuple([Feature1, Feature2])] = correlation_pvalue( Variates[Features.index(Feature1)], Variates[Features.index(Feature2)] ) Correlations[tuple([Feature1, Feature2])] = correlation( Variates[Features.index(Feature1)], Variates[Features.index(Feature2)] ) FDRs = p_adjust(PValues, Arguments.CorrectionMethod) for Pair, PValue in PValues.items(): if FDRs[PValue] < Arguments.FDR: pass else: PValues.pop(Pair, None) Correlations.pop(Pair, None) if len(PValues.keys()): Results = {} Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments, Supervised=Arguments.Phenotype) Results["PValues"] = PValues Results["Correlations"] = Correlations Results["FDRs"] = FDRs if Arguments.Filename.lower() == "default": Pickle = "_".join(["_".join(sorted(Arguments.Data)), Arguments.CorrectionMethod]) else: Pickle = Arguments.Filename cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1) return
true_positive += 1 else: false_positive += 1 elif y[i] == 0: if prediction == 0: true_negative += 1 else: false_negative += 1 precision = true_positive / (true_positive + false_positive) recall = true_positive / (true_positive + false_negative) return (2 * precision * recall) / (precision + recall) data = load_data() train_X = [d[:24] for d in data[:int(len(data) * .8)]] train_y = [d[24] for d in data[:int(len(data) * .8)]] test_X = [d[:24] for d in data[int(len(data) * 0.2):]] test_y = [d[24] for d in data[int(len(data) * 0.2):]] clf = SVC(kernel='linear') clf.fit(train_X, train_y) print("SVC linear training set f-measure: ", f_measure(clf, train_X, train_y)) print("SVC linear test set f-measure: ", f_measure(clf, test_X, test_y)) clf = SVC(kernel='rbf') clf.fit(train_X, train_y)