Example #1
def CalcCorrelation(percentage, N, index):
    CreateTempResFile(percentage, N)
    getTrecEval(measure, index)
    x = [res.std for Qnr, res in QueriesRes.iteritems()]
    y = [res.trecScore for Qnr, res in QueriesRes.iteritems()]
    std_p = pearsonr(x, y)[0]
    std_s = spearmanr(x, y)[0]
    x = [
        res.std / math.sqrt(len(Qterms[Qnr].split()))
        for Qnr, res in QueriesRes.iteritems()
    ]
    std_n_p = pearsonr(x, y)[0]
    std_n_s = spearmanr(x, y)[0]
    x = [res.MAD for Qnr, res in QueriesRes.iteritems()]
    mad_p = pearsonr(x, y)[0]
    mad_s = spearmanr(x, y)[0]
    x = [
        res.MAD / math.sqrt(len(Qterms[Qnr].split()))
        for Qnr, res in QueriesRes.iteritems()
    ]
    mad_n_p = pearsonr(x, y)[0]
    mad_n_s = spearmanr(x, y)[0]
    if debug:
        print "N", N, "----", "Percentage", percentage
        print "std pearson      ", std_p
        print "std spearman     ", std_s
        print "std norm pearson ", std_n_p
        print "std norm spearman", std_n_s
        print "MAD pearson      ", mad_p
        print "MAD spearman     ", mad_s
        print "MAD norm pearson ", mad_n_p
        print "MAD norm spearman", mad_n_s
    return (std_p, std_s, std_n_p, std_n_s, mad_p, mad_s, mad_n_p, mad_n_s)
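Both pearsonr and spearmanr from scipy.stats return a (statistic, p-value) tuple, which is why the example above indexes with [0] throughout to keep only the coefficient. A minimal standalone sketch of that pattern, on made-up data rather than the QueriesRes globals the example assumes:

from scipy.stats import pearsonr, spearmanr

# Made-up predictor/score pairs standing in for the QueriesRes values.
x = [0.12, 0.40, 0.33, 0.85, 0.27]
y = [0.10, 0.45, 0.30, 0.90, 0.20]

rho_p = pearsonr(x, y)[0]   # keep only the Pearson coefficient
rho_s = spearmanr(x, y)[0]  # keep only the Spearman coefficient
print(rho_p, rho_s)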
Example #2
def plotSpecific(A, y):
    change_threshold = np.arange(100) * 0.01
    samples = A[:, 5]
    (before, after) = getPrevNext(y, 10)
    (beforeE, afterE) = getPrevNext(y, 80)
    corr = []
    corrE = []
    for ch_th in change_threshold:
        B = (samples > ch_th).astype(int)
        (co, p) = spearmanr(B[10:], after[10:])
        corr.append(co)
        (coE, pE) = spearmanr(B[80:], afterE[80:])
        corrE.append(coE)
    l1, = plt.plot(change_threshold, corr, 'b')
    l2, = plt.plot(change_threshold, corrE, 'r')
    plt.legend([l1, l2], ['Next 10 Builds', 'Next 80 Builds'], loc=1)
    plt.xlim([0, 1])
    plt.ylabel('Correlation')
    plt.xlabel('Change Threshold')
    plt.title('Spearman: ' + featureList[5] + ' vs Builds')
    plt.show()

    next_threshold = np.arange(1, 250)
    B = (samples > 0.0).astype(int)
    corr = []
    for nx_th in next_threshold:
        (before, after) = getPrevNext(y, nx_th)
        (co, p) = spearmanr(B[nx_th:], before[nx_th:])
        corr.append(co)
    plt.plot(next_threshold, corr)
    plt.xlim([1, 250])
    plt.ylabel('Correlation')
    plt.xlabel('Previous Builds')
    plt.title('Spearman: ' + featureList[5] + ' at (0.0) vs Previous n Builds')
    plt.show()
Example #3
def test_similarity_2(model, vocab):
    """Test the model for similarity. Method: get correlation between model similarity
    and similarity of items in the test set.
    
    This method is using data from Ruts et al. (2004)"""
    d = ruts_etal_similarity.get_similarity_dict()
    results = {category: {"skipped": set()} for category in d}
    pred_overall = []
    actual_overall = []
    for category in d:
        predicted_values = []
        actual_values = []
        for pair, score in d[category].items():
            if set(pair).issubset(vocab):
                predicted_values.append(model.similarity(*pair))
                actual_values.append(score)
            else:
                results[category]["skipped"].update(set(pair) - vocab)
        pred_overall += predicted_values
        actual_overall += actual_values
        results[category]["pairs_tested"] = len(predicted_values)
        results[category]["pearsonr"] = pearsonr(predicted_values, actual_values)
        results[category]["spearmanr"] = spearmanr(predicted_values, actual_values)
    results["overall"] = dict()
    results["overall"]["pairs_tested"] = len(predicted_values)
    results["overall"]["pearsonr"] = pearsonr(pred_overall, actual_overall)
    results["overall"]["spearmanr"] = spearmanr(pred_overall, actual_overall)
    return results
Example #4
def calc_auc_on_flat_results(all_y_train, all_scores_train, all_test_real_tags,
                             all_test_score_tags):
    try:
        test_auc = metrics.roc_auc_score(all_test_real_tags,
                                         all_test_score_tags)
        test_rho, p_value = stats.spearmanr(all_test_real_tags,
                                            all_test_score_tags)

        train_auc = metrics.roc_auc_score(all_y_train, all_scores_train)
        train_rho, pval_train = stats.spearmanr(all_y_train, all_scores_train)
        print("summary-----------------------")
        print("test_auc: " + str(test_auc))
        print("test_rho: " + str(test_rho))
        print("train_auc: " + str(train_auc))
        print("train_rho: " + str(train_rho))
    except ValueError:
        # Compute ROC curve and ROC area for each class
        print("train classification_report")
        train_auc = metrics.classification_report(all_y_train,
                                                  all_scores_train)
        for row in train_auc.split("\n"):
            print(row)
        print("test classification_report")
        test_auc = metrics.classification_report(all_test_real_tags,
                                                 all_test_score_tags)
        for row in test_auc.split("\n"):
            print(row)

        train_rho, pval_train = stats.spearmanr(all_y_train, all_scores_train)
        test_rho, p_value = stats.spearmanr(all_test_real_tags,
                                            all_test_score_tags)

    return train_auc, test_auc, train_rho, test_rho
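The try/except ValueError above falls back to a classification report when roc_auc_score rejects the inputs (e.g. non-binary labels). A small sketch of the binary happy path on toy arrays (illustration only, not part of the original code):

import numpy as np
from scipy import stats
from sklearn import metrics

y_true = np.array([0, 0, 1, 1, 1, 0])
scores = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])

auc = metrics.roc_auc_score(y_true, scores)  # ranking quality of the scores
rho, p = stats.spearmanr(y_true, scores)     # rank correlation with the labels
print("auc:", auc, "rho:", rho)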
Example #5
def CalcCorrelation(percentage, N, index):
    CreateTempResFile(percentage, N)
    getTrecEval(measure, index)
    x = [res.std for Qnr, res in QueriesRes.iteritems()]
    y = [res.trecScore for Qnr, res in QueriesRes.iteritems()]
    std_p = pearsonr(x, y)[0]
    std_s = spearmanr(x, y)[0]
    x = [res.std / math.sqrt(len(Qterms[Qnr].split())) for Qnr, res in QueriesRes.iteritems()]
    std_n_p = pearsonr(x, y)[0]
    std_n_s = spearmanr(x, y)[0]
    x = [res.MAD for Qnr, res in QueriesRes.iteritems()]
    mad_p = pearsonr(x, y)[0]
    mad_s = spearmanr(x, y)[0]
    x = [res.MAD / math.sqrt(len(Qterms[Qnr].split())) for Qnr, res in QueriesRes.iteritems()]
    mad_n_p = pearsonr(x, y)[0]
    mad_n_s = spearmanr(x, y)[0]
    if debug:
        print "N", N, "----", "Percentage", percentage
        print "std pearson      ", std_p
        print "std spearman     ", std_s
        print "std norm pearson ", std_n_p
        print "std norm spearman", std_n_s
        print "MAD pearson      ", mad_p 
        print "MAD spearman     ", mad_s
        print "MAD norm pearson ", mad_n_p
        print "MAD norm spearman", mad_n_s
    return (std_p, std_s, std_n_p, std_n_s, mad_p, mad_s, mad_n_p, mad_n_s)
Example #6
def eval(m, tok, task_name):
    with open(task_name, "r", encoding="utf-8") as f:
        lines = f.readlines()
    lines = [line.strip().split("\t") for line in lines]
    ys = [float(line[2]) for line in lines]

    input_term = []
    for line in lines:
        input_term.append(line[0])
    for line in lines:
        input_term.append(line[1])

    if tok is not None:
        preds_cls, preds_mean = get_simlarity_bert(input_term[0:len(lines)],
                                                   input_term[len(lines):], m,
                                                   tok)
        c_cls, p_cls = spearmanr(preds_cls, ys)
        print(task_name, "CLS", c_cls, p_cls)
        c_mean, p_mean = spearmanr(preds_mean, ys)
        print(task_name, "MEAN", c_mean, p_mean)
    else:
        try:
            dim = m.values()[0].shape[0]
        except BaseException:
            try:
                dim = m.vector_size
            except BaseException:
                dim = 300
        preds = get_simlarity(input_term[0:len(lines)],
                              input_term[len(lines):], m, dim)
        c, p = spearmanr(preds, ys)
        print(task_name, c, p)
Example #7
def correlation(human_df, vectors, corpus):
    all_values = defaultdict(list)
    facet_human_vals = defaultdict(list)
    facet_pred_vals = defaultdict(list)

    for i, row in human_df.iterrows():
        pred_df = get_facet_sims_of_books(vectors, corpus, row['ID_A'],
                                          row['ID_B'])

        real_val = row["Similarity"]
        pred_val = pred_df[row["Facet"]].values[0]

        facet_human_vals[row["Facet"]].append(real_val)
        facet_pred_vals[row["Facet"]].append(pred_val)

        all_values['real'].append(real_val)
        all_values['predicted'].append(pred_val)

    # noinspection PyTypeChecker
    complete_correlation = stats.spearmanr(all_values['real'],
                                           all_values['predicted'])

    facet_correlation = {}
    for facet in facet_human_vals:
        # noinspection PyTypeChecker
        facet_correlation[facet] = stats.spearmanr(facet_human_vals[facet],
                                                   facet_pred_vals[facet])

    return complete_correlation, facet_correlation
Example #8
def gen_input_goatools(filename, infos, exps, cutoff, q_info, q_exp, gos):
    out = open(filename, "w")
    datas = []
    ids = []
    pro_gos = []
    ccs = []
    for info, exp in zip(infos, exps):
        if info != q_info:
            if "positive" in filename:
                if (float(spearmanr(exp, q_exp)[0]) >= cutoff):
                    if get_pro_id(info) is not None:
                        out.write(get_pro_id(info) + "\n")
                    detect = False
                    for pro_id, go_list in gos.items():
                        if pro_id in info:
                            pro_gos.append(go_list)
                            detect = True
                            break
                    if not detect:
                        pro_gos.append("NA")
                    datas.append(exp)
                    ids.append(info)
                    ccs.append("{0:.5f}".format(float(spearmanr(exp, q_exp)[0])))
            elif "negative" in filename:
                if (float(spearmanr(exp, q_exp)[0]) <= cutoff):
                    detect = False
                    if get_pro_id(info) is not None:
                        out.write(get_pro_id(info) + "\n")
                    for pro_id, go_list in gos.items():
                        if pro_id in info:
                            pro_gos.append(go_list)
                            detect = True
                            break
                    if not detect:
                        pro_gos.append("NA")
                    datas.append(exp)
                    ids.append(info)
                    ccs.append("{0:.5f}".format(float(spearmanr(exp, q_exp)[0])))
    out.close()
    out_go = open(filename + "_go", "w")
    call(["python3", args.goatools_path, "--pval=0.05", "--indent",
          "--obo=" + args.obo_file, filename, args.population_file,
          args.go_association], stdout=out_go)
    out_go.close()
    fh = open(filename + "_go", "r")
    start = False
    enrichs = []
    for row in csv.reader(fh, delimiter='\t'):
        if start:
            if row[2] == "e":
                enrichs.append(row[0].replace(".", ""))
        if row[0] == "GO":
            start = True
    fh.close()
    os.remove(filename)
    os.remove(filename + "_go")
    return datas, ids, pro_gos, enrichs, ccs
Example #9
def plot_correlation(regr, name, X, y, loo):
    
    regr.fit(X, y)   
    
    y_pred, y_true = [], []
    for train_index, test_index in loo.split(X):

        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = y[train_index], y[test_index]

        _=regr.fit(X_train, Y_train)

        pred = regr.predict(X_test)
        y_pred.append(np.squeeze(pred))
        y_true.append(np.squeeze(Y_test))

    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    
    r2 = round(metrics.r2_score(y_true, y_pred),4)
    pearson = round(pearsonr(y_true, y_pred)[0],4)
    spearman = round(spearmanr(y_true, y_pred)[0],4)
    
    print('R-squared:', metrics.r2_score(y_true, y_pred))
    print('Pearson:', pearsonr(y_true, y_pred))
    print(spearmanr(y_true, y_pred),'\n')

    trace_1 = go.Scatter(
        x = y_true, y = y_pred, mode = 'markers', name='Scatter',
        marker = dict(size = 12, opacity = 0.5)
    )
    
    xs, ys = np.array(y_true), np.array(y_pred)

    regr = linear_model.LinearRegression()
    regr.fit(xs.reshape(-1, 1), ys.reshape(-1, 1))

    ys_pred = regr.predict(xs.reshape(-1, 1))
    trace_2 = go.Scatter(
        x = xs, y = np.squeeze(ys_pred), name='Regression',
        mode = 'lines', line = dict(width = 4)
    )

    name += 'R-squared: ' + str(r2) + \
    ', Pearson: ' + str(pearson) + \
    ', Spearman: ' + str(spearman)
  
    layout = go.Layout(
        title=name,
        width=650,
        yaxis= dict(title='Predicted'),
        xaxis= dict(title='Breteau index'),
        font=dict(size=16)
    )
    fig = go.Figure(data=[trace_1, trace_2], layout=layout)
    iplot(fig)
Example #10
def spearman(set_1, set_2, onlyfound=False):
    if onlyfound:
        set1 = []
        set2 = []
        for s1, s2 in zip(set_1, set_2):
            if s1 != -1 and s2 != -1:
                set1.append(s1)
                set2.append(s2)
        return spearmanr(set1, set2)[0]
    else:
        return spearmanr(set_1, set_2)[0]
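Here -1 acts as a not-found sentinel, and onlyfound=True drops any pair where either side is missing before correlating. A standalone sketch of the same filtering on invented rank lists:

from scipy.stats import spearmanr

# Invented rank lists; -1 marks items missing from one ranking.
ranks_a = [1, 2, -1, 4, 5]
ranks_b = [2, 1, 3, -1, 5]

kept = [(a, b) for a, b in zip(ranks_a, ranks_b) if a != -1 and b != -1]
xs, ys = zip(*kept)
print(spearmanr(xs, ys)[0])  # correlation over the found pairs only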
Example #11
def outputResults(out1_epsilon, out2_epsilon, kernel, train_lt, test_lt):
    # Output the results to the appropriate output files
    writeFloatList(out1_epsilon, TRAINPREDICTIONSEPSILONFILENAME)
    writeFloatList(out2_epsilon, VALIDATIONPREDICTIONSEPSILONFILENAME)
    print "Pearson correlation between training labels and predictions, epsilon SVR:"
    print pearsonr(train_lt, out1_epsilon)
    print "Spearman correlation between training labels and predictions, epsilon SVR:"
    print spearmanr(train_lt, out1_epsilon)
    print "Pearson correlation between validation labels and predictions, epsilon SVR:"
    print pearsonr(test_lt, out2_epsilon)
    print "Spearman correlation between validation labels and predictions, epsilon SVR:"
    print spearmanr(test_lt, out2_epsilon)
Example #12
def outputResults(out1_epsilon, out2_epsilon, kernel, train_lt, test_lt):
	# Output the results to the appropriate output files
	writeFloatList(out1_epsilon, TRAINPREDICTIONSEPSILONFILENAME)
	writeFloatList(out2_epsilon, VALIDATIONPREDICTIONSEPSILONFILENAME)
	print "Pearson correlation between training labels and predictions, epsilon SVR:"
	print pearsonr(train_lt, out1_epsilon)
	print "Spearman correlation between training labels and predictions, epsilon SVR:"
	print spearmanr(train_lt, out1_epsilon)
	print "Pearson correlation between validation labels and predictions, epsilon SVR:"
	print pearsonr(test_lt, out2_epsilon)
	print "Spearman correlation between validation labels and predictions, epsilon SVR:"
	print spearmanr(test_lt, out2_epsilon)
Example #13
def find_spearman_score(y, pred_y):
    if np.ndim(y) == 2:
        count = 0
        total = 0
        for i in range(pred_y.shape[1]):
            corr = stats.spearmanr(y[:, i], pred_y[:, i])[0]
            if np.isnan(corr):
                continue
            count = count + 1
            total = total + corr
        return total / count
    else:
        corr = stats.spearmanr(y, pred_y)[0]
    return corr
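For 2-D targets the function averages Spearman's rho column by column and skips columns where the statistic is undefined (spearmanr returns NaN for, e.g., a constant column). A self-contained sketch of the same idea on synthetic arrays:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
y = rng.normal(size=(50, 4))
pred = y + rng.normal(scale=0.5, size=(50, 4))
pred[:, 3] = 1.0  # constant column -> spearmanr yields NaN

cols = [stats.spearmanr(y[:, i], pred[:, i])[0] for i in range(y.shape[1])]
valid = [c for c in cols if not np.isnan(c)]
print(sum(valid) / len(valid))  # mean rho over the well-defined columns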
Example #14
def main(kdts_path, drop_zeros):
    # Read in kdts data
    with open(kdts_path, "rb") as infile:
        slice_idx_to_data = pkl.load(infile)
    # Reduce to flat lists of distances
    gk = ('wlst', 'logical_time', 5)
    slice_indices = sorted(slice_idx_to_data.keys())
    nd_fraction_labels = [0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    flat_dists_seq = get_distances_seq(slice_idx_to_data, slice_indices, gk)

    if drop_zeros:
        for i in range(1, len(flat_dists_seq)):
            dists = flat_dists_seq[i]
            without_zeros = list(filter(lambda x: x != 0, dists))
            flat_dists_seq[i] = without_zeros

    #for i in range(len(flat_dists_seq)):
    #    dists = flat_dists_seq[i]
    #    n_zeros = 0
    #    for d in dists:
    #        if d == 0.0:
    #            n_zeros += 1
    #    percent_zeros = n_zeros / len(dists)
    #    print("% ND: {} --> # zeros: {}, % zeros: {}".format(nd_fraction_labels[i],n_zeros, percent_zeros))

    # Associate each kernel distance with the non-determinism fraction of the
    # runs its generating graphs represent
    nd_fraction_seq = []
    dist_seq = []
    for i in range(len(nd_fraction_labels)):
        for d in flat_dists_seq[i]:
            nd_fraction_seq.append(nd_fraction_labels[i])
            dist_seq.append(d)

    pearson_r, pearson_p = pearsonr(nd_fraction_seq, dist_seq)
    spearman_r, spearman_p = spearmanr(nd_fraction_seq, dist_seq)
    print("Kernel distance vs. % ND --> Pearson-R = {}, p = {}".format(
        pearson_r, pearson_p))
    print("Kernel distance vs. % ND --> Spearman-R = {}, p = {}".format(
        spearman_r, spearman_p))

    all_stats_seq = get_stats_seq(flat_dists_seq)
    for stat in ["mean", "median", "max", "variance"]:
        stats_seq = [s[stat] for s in all_stats_seq]
        pearson_r, pearson_p = pearsonr(nd_fraction_labels, stats_seq)
        spearman_r, spearman_p = spearmanr(nd_fraction_labels, stats_seq)
        print("Kernel distance {} vs. % ND --> Pearson-R = {}, p = {}".format(
            stat, pearson_r, pearson_p))
        print("Kernel distance {} vs. % ND --> Spearman-R = {}, p = {}".format(
            stat, spearman_r, spearman_p))
Example #15
def calc_auc_on_joined_results(Cross_validation, y_trains, y_train_preds,
                               y_tests, y_test_preds):
    all_y_train = []
    for i in range(Cross_validation):
        all_y_train = all_y_train + y_trains[i]

    all_predictions_train = []
    for i in range(Cross_validation):
        all_predictions_train = all_predictions_train + list(y_train_preds[i])

    all_test_real_tags = []
    for i in range(Cross_validation):
        all_test_real_tags = all_test_real_tags + y_tests[i]

    all_test_pred_tags = []
    for i in range(Cross_validation):
        all_test_pred_tags = all_test_pred_tags + list(y_test_preds[i])

    try:
        train_auc = metrics.roc_auc_score(all_y_train, all_predictions_train)
        #fpr, tpr, thresholds = metrics.roc_auc_score(all_test_real_tags, all_test_pred_tags)
        # test_auc = metrics.auc(fpr, tpr)
        test_auc = metrics.roc_auc_score(all_test_real_tags,
                                         all_test_pred_tags)
        train_rho, pval_train = stats.spearmanr(
            all_y_train, np.array(all_predictions_train))
        test_rho, p_value = stats.spearmanr(all_test_real_tags,
                                            np.array(all_test_pred_tags))
    except ValueError:
        # Compute ROC curve and ROC area for each class
        print("train classification_report")
        train_auc = metrics.classification_report(all_y_train,
                                                  all_predictions_train)
        for row in train_auc.split("\n"):
            print(row)
        print("test classification_report")
        test_auc = metrics.classification_report(all_test_real_tags,
                                                 all_test_pred_tags)
        for row in test_auc.split("\n"):
            print(row)

        train_rho, pval_train = stats.spearmanr(
            all_y_train, np.array(all_predictions_train))
        test_rho, p_value = stats.spearmanr(all_test_real_tags,
                                            np.array(all_test_pred_tags))

    return all_y_train, all_predictions_train, all_test_real_tags, all_test_pred_tags,\
           train_auc, test_auc, train_rho, test_rho
Example #16
def getStatistics(A, y):

    prNx_threshold = [2, 3, 5, 10]
    change_threshold = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.5]

    for feature in range(A.shape[1]):
        print('\n')
        print('#' * 150)
        print(featureList[feature])

        samples = A[:, feature]

        print('M vs Out ' + str(pearsonr(samples, y)))

        for ch_th in change_threshold:
            B = (A[:, feature] > ch_th).astype(int)
            print('Changes over Threshold ' + str(ch_th) + ': ' +
                  str((B == 1).sum()))
            if ((B == 1).sum()) > 0:
                print('Ch (' + str(ch_th) + ') vs Out : ' +
                      str(spearmanr(B, y)))
                failsIfChange = 0
                for i in range(len(B)):
                    if B[i] == 1 and y[i] != 0:
                        failsIfChange += 1
                print('P(fail | change): ' + str(failsIfChange) + '/' +
                      str((B == 1).sum()) + ' = ' + str(failsIfChange /
                                                        (B == 1).sum()))
                print('P(change | fail): ' + str(failsIfChange) + '/' +
                      str((y != 0).sum()) + ' = ' + str(failsIfChange /
                                                        (y != 0).sum()))

        for pr_th in prNx_threshold:
            (before, after) = getPrevNext(y, pr_th)
            print('M vs Bef (' + str(pr_th) + '): ' +
                  str(pearsonr(samples[pr_th:], before[pr_th:])))
            print('M vs Nxt (' + str(pr_th) + '): ' +
                  str(pearsonr(samples[pr_th:], after[pr_th:])))

            for ch_th in change_threshold:
                B = (A[:, feature] > ch_th).astype(int)
                if ((B == 1).sum()) > 0:
                    print('Ch (' + str(ch_th) + ') vs Bef (' + str(pr_th) +
                          '): ' + str(spearmanr(B[pr_th:], before[pr_th:])))
                    print('Ch (' + str(ch_th) + ') vs Nxt (' + str(pr_th) +
                          '): ' + str(spearmanr(B[pr_th:], after[pr_th:])))

        print('#' * 150)
Example #17
def npccf(x, y, method="spearmanr", min_lag=-10, max_lag=10):
    """ Compute cross correlation of time series x and y from min_lag to max_lag 
    (based on nonparametric correlation). r(lag) = corr(x[t-lag], y[t]).
    
    Parameters
    ----------
    x: time series
    y: time series
    method: "spearmanr" or "kendalltau"
    min_lag : int, default -10
    max_lag : int, default 10

    Returns
    ----------
    a dictionary with keys "corrs" (correlation coefficient corresponding to the lags),
    "lags" (corresponding lags), "lb" (lower bound) and "ub" (upper bound).
    """
    n1 = len(x)
    n2 = len(y)
    assert (n1 == n2
            ), "The length of time series x and time series y must be equal!"
    assert (min_lag <= max_lag), "min_lag must be less than or equal to max_lag!"
    nlags = max_lag - min_lag + 1
    corrs = np.empty(nlags)
    if method == "spearmanr":
        for k, lag in enumerate(range(min_lag, (max_lag + 1))):
            if lag == 0:
                corrs[k] = spearmanr(x, y)[0]
            if lag < 0:
                corrs[k] = spearmanr(x[(-lag):], y[:lag])[0]
            if lag > 0:
                corrs[k] = spearmanr(x[:(-lag)], y[lag:])[0]
    elif method == "kendalltau":
        for k, lag in enumerate(range(min_lag, (max_lag + 1))):
            if lag == 0:
                corrs[k] = kendalltau(x, y)[0]
            if lag < 0:
                corrs[k] = kendalltau(x[(-lag):], y[:lag])[0]
            if lag > 0:
                corrs[k] = kendalltau(x[:(-lag)], y[lag:])[0]
    else:
        raise ValueError("The method %s is not supported." % method)
    return {
        "corrs": corrs,
        "lags": range(min_lag, (max_lag + 1)),
        "lb": np.repeat(-1 / np.sqrt(n1), nlags),
        "ub": np.repeat(1 / np.sqrt(n1), nlags)
    }
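A quick usage sketch, assuming the npccf definition above is in scope along with its scipy imports: y is built to trail x by two steps, so with r(lag) = corr(x[t-lag], y[t]) the peak correlation should land near lag = 2 (synthetic data, illustration only):

import numpy as np

rng = np.random.default_rng(1)
x = rng.normal(size=200)
y = np.roll(x, 2) + rng.normal(scale=0.1, size=200)  # y follows x by 2 steps

res = npccf(x, y, method="spearmanr", min_lag=-5, max_lag=5)
best_lag, best_corr = max(zip(res["lags"], res["corrs"]), key=lambda t: t[1])
print(best_lag, best_corr)  # expect a lag near 2 with rho close to 1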
Example #18
            def optimize_dist(nf, optimize=1):
                dist_vec = []  # accuracies for each pair within each LOOCV fold

                def nf_select(nf):
                    #fselector = mvpa2.FixedNElementTailSelector(np.round(nf), tail='upper',mode='select', sort=False)
                    #sbfs = mvpa2.SensitivityBasedFeatureSelection(mvpa2.OneWayAnova(), fselector, enable_ca=['sensitivities'], auto_train=True)
                    if (optimize == 1):
                        not_test_ds = ds[ds.chunks != chunk]
                        train_ds = not_test_ds[not_test_ds.chunks != val_chunk]
                        #sbfs.train(train_ds)
                        #ds2 = sbfs(not_test_ds) #optimize nf & include validation set for computing dists
                    elif (optimize == 0):
                        train_ds = ds[ds.chunks != chunk]  # retrain with all data if not optimizing
                        #sbfs.train(train_ds)
                        #ds2 = sbfs(ds) #pick top features with training & use whole dataset for computing dists
                    # NOTE: with the feature-selection lines commented out, ds2 is
                    # never assigned, so this return raises NameError as written.
                    return ds2

                #ds2 = nf_select(nf)
                for y in range(0, len(pair_list2)):

                    def mask(y, ds):
                        stim_mask_train0 = (ds.targets == pair_list2[y][0])
                        stim_mask_train1 = (ds.targets == pair_list2[y][1])
                        ds_stim1 = ds[stim_mask_train0]
                        ds_stim2 = ds[stim_mask_train1]
                        return ds_stim1, ds_stim2

                    ds_stim1, ds_stim2 = mask(y, ds)
                    dist_vec.append(
                        distance_funcs(np.mean(ds_stim1, axis=0),
                                       np.mean(ds_stim2, axis=0), ds.samples,
                                       ds_stim1, ds_stim2, dist))
                if (optimize == 1):
                    corr_test = spearmanr(val_accs, dist_vec)
                    #corr_test = pearsonr(val_accs[i],dist_vec)
                elif (optimize == 0):
                    corr_test = spearmanr(test_accs, dist_vec)
                    #corr_test = pearsonr(test_accs[i],dist_vec)
                corr = corr_test[0]
                pval = corr_test[1]
                #print corr, ',', pval, 'distance:', dist, np.round(nf), ', features,', 'chunk', chunk
                if (optimize == 1):
                    return 1 - corr
                elif (optimize == 0):
                    return corr, pval, dist_vec
Example #19
def checarspearman_paralelo():
    lista = [
        [rbc_gel, rbc_amb],
        [muc_amb, muc_gel],
        [caoxd_amb, caoxd_gel],
        [hya_amb, hya_gel],
        [bac_amb, bac_gel],
        [pat_amb, pat_gel],
        [wbc_amb, wbc_gel],
        [epi_amb, epi_gel],
        [tri_amb, tri_gel],
        [uri_amb, uri_gel],
        [yea_amb, yea_gel],
        [amo_amb, amo_gel],
    ]
    relatorio = []
    print(___l)
    print()
    print("             RELATÓRIO DE CORRELAÇÕES (SPEARMAN) PARALELO")
    print()
    print(___l)
    for ex in lista:
        exame1, exame2 = ex[0], ex[1]
        print()
        print("RESULTADOS DE", list(Series(exame1["EXAME"]))[0], ":")
        for i in ["H4", "H8", "H12", "H24"]:
            p = ""
            resultado = spearmanr(np.array(exame1[i].map(float)), np.array(exame2[i].map(float)))
            relatorio.append(resultado)
            if resultado[1] > 0.05:
                p = "***"
            print("Entre", i, "e", i, ":", resultado[0], "(Spearman ρ), e ", resultado[1], "(valor de p)", p)
    print(___l)
    return relatorio
Example #20
def correlations_ground_truth():
    print 'ground truth'
    #load network
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering.xml.gz")
    #read counts with zeros
    article_counts  =  pd.read_csv(TMP+'article_counts.tsv', sep='\t')
    cor = {}
    for damping in [0.8,0.9]:
        page_rank = pagerank(wikipedia, damping=damping)
        wikipedia.vertex_properties['page_rank_'+str(damping)] = page_rank
        page_rank_values = list()
        counts = list()
        correlations_values = {}
        for index, row in article_counts.iterrows():
            counts.append(float(row['counts']))
            page_rank_values.append(page_rank[wikipedia.vertex(int(row['target_article_id']))])
        print 'pearson'
        p = pearsonr(page_rank_values, counts)
        print p
        correlations_values['pearson']=p
        print 'spearmanr'
        s = spearmanr(page_rank_values, counts)
        print s
        correlations_values['spearmanr']=s
        print 'kendalltau'
        k = kendalltau(page_rank_values, counts)
        print k
        correlations_values['kendalltau']=k
        cor['page_rank_'+str(damping)]=correlations_values
    write_pickle(HOME+'output/correlations/correlations_pagerank.obj', cor)
Example #21
def correlations_weighted_unweighted(labels):
    #load network
    print 'weighted vs unweighted'
    name = '_'.join(labels)
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    #read counts with zeros

    wikipedia_u = load_graph("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")
    correlations_weighted_pagerank = {}
    for label in labels:
        for damping in [0.8,0.85,0.9]:
            correlations_values={}
            key_weighted = label+"_page_rank_weighted_"+str(damping)
            pagerank_weighted = wikipedia.vertex_properties[key_weighted]
            key_unweighted = "page_rank"+str(damping)
            pagerank_unweighted = wikipedia_u.vertex_properties[key_unweighted]
            print 'pearson'
            p = pearsonr(pagerank_weighted.a, pagerank_unweighted.a)
            print p
            correlations_values['pearson']=p
            print 'spearmanr'
            s = spearmanr(pagerank_weighted.a, pagerank_unweighted.a)
            print s
            correlations_values['spearmanr']=s
            print 'kendalltau'
            k = kendalltau(pagerank_weighted.a, pagerank_unweighted.a)
            print k
            correlations_values['kendalltau']=k
            correlations_weighted_pagerank[label+str(damping)]=correlations_values

    write_pickle(HOME+'output/correlations/correlations_pagerank_weightedvsunweighted'+name+'.obj', correlations_weighted_pagerank)
Example #22
def control_followers(db, ids, feats, data, repins, dataset):

	divs = [1, 200, 600, 1700, 4700, 13000, 1000000]

	n = len(divs)
	pins = db.get_pins_info(ids)
	
	followers = np.asarray([pins[pid][2] for pid in ids])

	groups = []
	for i in xrange(n-1) :
		g = np.nonzero((followers>=divs[i]) & (followers<divs[i+1]))[0]
		groups.append(g)

		print "%d < followers < %d (%d)\t" % (divs[i], divs[i+1], len(g))


	corrs = np.ones((len(feats), len(groups)), float)
	for i in xrange(len(feats)) :
		print "Feature:", feats[i]

		for j in xrange(len(groups)):

			_data = data[groups[j],i]
			_repins = repins[groups[j]]

			corrs[i,j] = spearmanr(_data, _repins)[1]  # [1] is the p-value, not rho
			print corrs[i,j],
Example #23
def mono_bin(Y, X, n=20):
    r = 0
    good=Y.sum()
    bad=Y.count()-good
    while np.abs(r) < 1:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X.rank(method='first'), n)})

        d2 = d1.groupby('Bucket', as_index = True)
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
        n = n - 1
    d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
    d3['min']=d2.min().X
    d3['max'] = d2.max().X
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['rate'] = d2.mean().Y
    d3['woe']=np.log((d3['rate']/(1-d3['rate']))/(good/bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    d4 = (d3.sort_values(by='min')).reset_index(drop=True)
    print("=" * 60)
    print(d4)
    cut = []
    cut.append(float('-inf'))
    for i in range(1, n + 1):
        qua = X.quantile(i / (n + 1))
        cut.append(round(qua, 4))
    cut.append(float('inf'))
    woe = list(d4['woe'].round(3))
    return iv,cut,woe
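mono_bin keeps shrinking the bucket count n until the per-bucket means of X and Y are perfectly rank-correlated (|rho| = 1), then derives weight of evidence (WOE) and information value (IV) from the resulting monotonic bins. A hedged usage sketch on synthetic credit-style data, assuming the mono_bin above and its pandas/numpy/scipy.stats imports are in scope:

import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
score = pd.Series(rng.normal(size=1000))
# Default probability rises with the score, so the bins come out monotone.
target = pd.Series((rng.random(1000) < 1 / (1 + np.exp(-score))).astype(int))

iv, cut, woe = mono_bin(target, score, n=10)
print("IV:", iv)
print("cut points:", cut)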
Example #24
def get_performance_simple(d, cutoff=CUTOFF_AFFINITY_LOG):
    """
    INPUT:
      y     = A list of measured affinities in log10(ic50 nM) units.
      ypred = A list of predicted affinities in log10(ic50 nM) units.
    OUTPUT:
        1) pearson
        2) aroc
        3) rmsd
    """
    meas = d['y']
    labels = [x < cutoff for x in meas] # 1 = binder; 0 = nonbinder
    pred = d['ypred']
    
    # Get AUC:
    auc_model = ROC(pred, labels)
   
    
    # Get Pearson's correlation:
    # cor_pearson, cor_pearson_pai
    cor_pearson = pearsonr(meas, pred)
    cor_spearman = spearmanr(meas, pred)
    rmsd = get_rmsd(meas, pred)
    
    row = (cor_pearson[0], auc_model[0], rmsd)
    return row
Example #25
def get_bigrams_for_feature(word_feature, neus, capacity):
    """
    Extract unigram features. The most frequent $capacity number of bigrams will be selected.
    Then these bigrams will be filted based on pearson regression.
    """
    top_bigrams = word_feature.get_top_bigrams(capacity)
    print 'most frequent bigrams: ', top_bigrams

    candicate_map = dict()
    for bigram in top_bigrams:
        bg_counts = word_feature.get_feauture_by_bigram(bigram)
        p = spearmanr(bg_counts, neus)
        if not math.isnan(p[0]):
            candicate_map[bigram] = p

    selected_bigrams = list()
    for candicate_bigram in candicate_map.keys():
        idx = 0
        while idx < len(selected_bigrams):
            if abs(candicate_map[selected_bigrams[idx]][0]) < abs(candicate_map[candicate_bigram][0]):
                break
            idx += 1
        selected_bigrams.insert(idx, candicate_bigram)
        if len(selected_bigrams) > bigram_capacity:
            selected_bigrams = selected_bigrams[:bigram_capacity]

    print '======== selected bigrams for feature ========'
    for selected_bigram in selected_bigrams:
        print selected_bigram, ':', candicate_map.get(selected_bigram)
    return selected_bigrams
Example #26
def get_unigrams_for_feature(word_feature, neus, capacity):
    """
    Extract unigram features. The most frequent $capacity number of unigrams will be selected.
    Then these words will be filted based on spearman rank correlation.
    """
    top_unigrams = word_feature.get_top_unigrams(capacity)
    print 'most frequent unigrams: ', top_unigrams
    candicate_map = dict()
    for unigram in top_unigrams:
        w_counts = word_feature.get_feauture_by_unigram(unigram)
        p = spearmanr(w_counts, neus)
        if not math.isnan(p[0]):
            candicate_map[unigram] = p

    selected_unigrams = list()
    for candicate_unigram in candicate_map.keys():
        if len(selected_unigrams) == 0:
            selected_unigrams.append(candicate_unigram)
            continue
        idx = 0
        while idx < len(selected_unigrams):
            if abs(candicate_map[selected_unigrams[idx]][0]) < abs(candicate_map[candicate_unigram][0]):
                break
            idx += 1
        selected_unigrams.insert(idx, candicate_unigram)
        if len(selected_unigrams) > unigram_capacity:
            selected_unigrams = selected_unigrams[:unigram_capacity]

    print '======== selected unigrams for feature ========'
    for selected_unigram in selected_unigrams:
        print selected_unigram, ':', candicate_map.get(selected_unigram)
    return selected_unigrams
Example #27
 def get_metrics(self, y, yhat, name):
     mse = self.compute_mse(y, yhat)
     pearson = pearsonr(y, yhat)[0][0]
     kendall = kendalltau(y, yhat)[0]
     spearman = spearmanr(y, yhat)[0]
     return {"lat": self.lat, "lon": self.lon, "model": name, "mse": mse,
             "pearson": pearson, "kendall": kendall, "spearman": spearman}
Example #28
    def correlationBetweenAllFeaturesAndMOS(self):
        w, h = self.datafortrain.shape
        PLCC = []
        SROCC = []
        for i in range(h - 1):
            x = self.datafortrain[:, 0]
            y = self.datafortrain[:, i + 1]
            plcc, pval = statstool.pearsonr(x, y)
            srocc, pval = statstool.spearmanr(x, y)
            PLCC.append(plcc), SROCC.append(srocc)
        N = 15

        ind = np.arange(N)  # the x locations for the groups
        width = 0.35  # the width of the bars

        fig, ax = plt.subplots()

        # add axis labels
        ax.set_xlabel(u'Feature index', fontsize=18)
        ax.set_ylabel(u'Correlation coefficient', fontsize=18)
        # ax.set_title('The Correlation between features and mos')

        rects1 = ax.bar(ind, tuple(PLCC), width, color='r')
        rects2 = ax.bar(ind + width, tuple(SROCC), width, color='b')
        ax.set_xticks(ind)
        ax.set_xticklabels(
            ('f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15'))
        ax.legend((rects1[0], rects2[0]), ('PLCC', 'SROCC'))
        plt.legend(loc='center right')
        plt.show()
Example #29
def test_goodness(model, vocab):
    """Tests the model on its ability to create a goodness ranking for a category.
    Method: get spearman (rank) correlation between the predicted and the actual ranking.
    
    This method is using data from De Deyne et al. (2008)"""
    d = dedeyne_etal_goodness.get_goodness_rankings()
    results = {category: dict() for category in d}
    categories = set(d.keys()) & vocab
    for category in categories:
        exemplars = set(d[category]) & vocab
        sorted_exemplars = [
            b for a, b in sorted([(model.similarity(category, ex), ex) for ex in exemplars], reverse=True)
        ]
        predicted_ranking = []
        actual_ranking = []
        for exemplar in exemplars:
            actual_ranking.append(d[category].index(exemplar))
            predicted_ranking.append(sorted_exemplars.index(exemplar))
        results[category]["spearman"] = spearmanr(predicted_ranking, actual_ranking)
        results[category]["kendall"] = kendalltau(predicted_ranking, actual_ranking)
        results[category]["num_items"] = len(exemplars)
    avg_spearman = float(sum(abs(results[cat]["spearman"][0]) for cat in categories)) / len(categories)
    avg_kendall = float(sum(abs(results[cat]["kendall"][0]) for cat in categories)) / len(categories)
    results["overall"] = dict()
    results["overall"]["avg_spearman"] = avg_spearman
    results["overall"]["avg_kendall"] = avg_kendall
    return results
Example #30
def spearman_with_errors(x, y, yerr, Nmc=1000, plotflag=False, verbose=False):
    ysim = np.zeros(Nmc, 'f')
    rhosim = np.zeros(Nmc, 'f')
    psim = np.zeros(Nmc, 'f')

    for i in range(Nmc):
        ysim = np.random.normal(y, scale=yerr, size=len(y))
        rhosim[i], psim[i] = spearmanr(x, ysim)
    cave = np.mean(rhosim)
    cstd = np.std(rhosim)
    q1 = 50 - 34  # mean minus one std
    lower = np.percentile(rhosim, q1)
    q2 = 50 + 34  # mean plus one std
    upper = np.percentile(rhosim, q2)
    print 'mean (median) = %5.2f (%5.2f), std = %5.2f' % (
        cave, np.median(rhosim), cstd)
    print 'confidence interval from sorted list of MC fit values:'
    print 'lower = %5.2f (%5.2f), upper = %5.2f (%5.2f)' % (lower, cave - cstd,
                                                            upper, cave + cstd)
    k, pnorm = normaltest(rhosim)
    print 'probability that distribution of slopes is normal = %5.2f' % (pnorm)
    if plotflag:
        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        plt.hist(rhosim, bins=10, normed=True)
        plt.xlabel(r'$Spearman \ \rho $')
        plt.axvline(x=cave, ls='-', color='k')
        plt.axvline(x=lower, ls='--', color='k')
        plt.axvline(x=upper, ls='--', color='k')
        plt.subplot(1, 2, 2)
        plt.hist(np.log10(psim), bins=10, normed=True)
        plt.xlabel(r'$\log_{10}(p \ value)$')
    return rhosim, psim
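The Monte Carlo loop resamples y within its quoted errors and recomputes rho on each draw, so the spread of rhosim shows how much measurement noise blurs the correlation. A standalone Python 3 sketch of the same idea on synthetic data (the function above is Python 2, given its print statements):

import numpy as np
from scipy.stats import spearmanr

rng = np.random.default_rng(3)
x = np.linspace(0, 1, 40)
y = x + rng.normal(scale=0.1, size=40)  # noisy monotone relation
yerr = np.full(40, 0.1)                 # quoted 1-sigma errors

rhosim = np.array([spearmanr(x, rng.normal(y, yerr))[0] for _ in range(1000)])
print(np.mean(rhosim), np.percentile(rhosim, [16, 84]))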
Example #31
def third_order_poly_fit_plot(x, y, outname, yerror):
    def func(x, p1, p2, p3, p4):
        return p1 + p2 * x + p3 * x**2 + p4 * x**3
    
    xdata = np.array(x) 
    ydata = np.array(y)
    ymedian = np.median(y) 
    xnew = np.arange(1, max(x), 0.001) 
    popt, pcov, infodict, mesg, ier = curve_fit(func, xdata, ydata,p0=(1, 1, 1, 1),full_output=1) 
    ynew = [func(i, popt[0], popt[1], popt[2], popt[3]) for i in xnew]
    #plt.figure()
    fig, ax = plt.subplots()
    plt.errorbar(x, y, marker='x', yerr=yerror, ls="None")
    plt.plot(xnew, ynew)
    #plt.plot(x, y, 'x', xnew, ynew)
    plt.axis([.5, 9.5, 0, max( max(y), max(ynew) ) + 1])
    ax.set_xticklabels(['', '80S', 'poly2', 'poly3', 'poly4', 'poly5', 'poly6', 'poly7', 'poly8', 'cyto']) 
    plt.legend(['Input', 'Third order polynomial'])
    residuals = sum(infodict['fvec']**2)
    plt.title("Sq. Resid.: %5.4f; Res/Median: %5.4f" % (residuals, residuals/ymedian))
    perr = np.sqrt(np.diag(pcov))
    perr_percent = [ np.fabs(perr[i]/popt[i]) for i in range(len(popt))]
    avg_percent_error = np.mean(perr_percent)
    total_percent_error = sum(perr_percent)
    weighted_perr = sum([ perr_percent[i] * np.fabs(popt[i]) for i in range(len(popt))])
    prsn = pearsonr( [func(i, popt[0], popt[1], popt[2], popt[3]) for i in x], y)[0]
    sprmn = spearmanr( [func(i, popt[0], popt[1], popt[2], popt[3]) for i in x], y)[0]
    def prt(inp): #"pretty" 
        return ["%3.2f" % inp[i] for i in range(len(inp))]

    plt.text(.75, 1, "Parms %s\nerrors %s\n%% error %s\nmean %%: %3.2f sum %%: %3.2f weighted %%: %3.2f pearson: %3.2f spearman: %3.2f" % 
              (prt(popt), prt(perr), prt(perr_percent), avg_percent_error, total_percent_error, weighted_perr, prsn, sprmn),
              fontsize=8)
    plt.savefig(outname)
    plt.close(fig) 
Example #32
def compare_models(m1, tl1, m2, tl2):
    "Test how well the two models correlate"
    # Ensure overlap between the two tag lists:
    overlap = set(tl1) & set(tl2)
    # Get the row indices for the tags:
    m1_indices = {name: i for i, name in enumerate(tl1)}
    m2_indices = {name: i for i, name in enumerate(tl2)}
    # Prepare lists to collect data:
    m1_values = []
    m2_values = []
    differences = []
    # Create shorthand for the cosine similarity function:
    cosine = lambda x, y: float(pairwise_distances(x, y, metric='cosine'))
    # For all combinations of tags, compute the distances
    for a, b in combinations(overlap, 2):
        m1_pred = cosine(m1[m1_indices[a]], m1[m1_indices[b]])
        m2_pred = cosine(m2[m2_indices[a]], m2[m2_indices[b]])
        m1_values.append(m1_pred)
        m2_values.append(m2_pred)
        differences.append((abs(m1_pred - m2_pred), a + ' ' + b))
    # Correlate the two sets of distances
    correlation, sig = spearmanr(m1_values, m2_values)
    return {
        "correlation": correlation,
        "significance": sig,
        "differences": sorted(differences, reverse=True)
    }
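pairwise_distances returns a distance matrix, so the float(...) cast above presumably relies on single-row 2-D inputs (e.g. sparse-matrix rows). A minimal standalone sketch of the same model comparison on invented dense toy vectors:

import numpy as np
from itertools import combinations
from sklearn.metrics import pairwise_distances
from scipy.stats import spearmanr

rng = np.random.default_rng(4)
m1 = rng.normal(size=(4, 16))                  # toy model 1: 4 tag vectors
m2 = m1 + rng.normal(scale=0.2, size=(4, 16))  # toy model 2: perturbed copy

cosine = lambda a, b: pairwise_distances(a[None, :], b[None, :],
                                         metric='cosine')[0, 0]
d1 = [cosine(m1[i], m1[j]) for i, j in combinations(range(4), 2)]
d2 = [cosine(m2[i], m2[j]) for i, j in combinations(range(4), 2)]
print(spearmanr(d1, d2))  # how well the two models' distance rankings agree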
Example #33
def evaluate_word2vec(model, measure='men'):
    "Evaluates a model on the basis of the provided similarity measure."
    # Load the similarity measure, if possible.
    try:
        sim_dict = __resource__[measure]
    except KeyError:
        return None

    # Select pairs that can be used for testing.
    sim_words = {word for pair in sim_dict for word in pair}
    usable_words = set(model.vocab.keys()) & sim_words
    usable_pairs = {
        key
        for key in sim_dict.keys() if set(key).issubset(usable_words)
    }

    # Gather lists of actual values and 'predictions'
    actual_values = []
    predicted_values = []
    for a, b in usable_pairs:
        actual_values.append(sim_dict[(a, b)])
        predicted_values.append(model.similarity(a, b))

    # Compute the correlation:
    correlation, sig = spearmanr(actual_values, predicted_values)
    return {
        "correlation": correlation,
        "explained": correlation * correlation,
        "significance": sig,
        "test_pairs": len(usable_pairs),
        "predictions": dict(zip(usable_pairs, predicted_values))
    }
Example #34
def get_unigrams_for_feature(word_feature, neus, capacity):
    """
    Extract unigram features. The most frequent $capacity number of unigrams will be selected.
    Then these words will be filted based on spearman rank correlation.
    """
    top_unigrams = word_feature.get_top_unigrams(capacity)
    print 'most frequent unigrams: ', top_unigrams
    candicate_map = dict()
    for unigram in top_unigrams:
        w_counts = word_feature.get_feauture_by_unigram(unigram)
        p = spearmanr(w_counts, neus)
        if not math.isnan(p[0]):
            candicate_map[unigram] = p

    selected_unigrams = list()
    for candicate_unigram in candicate_map.keys():
        if len(selected_unigrams) == 0:
            selected_unigrams.append(candicate_unigram)
            continue
        idx = 0
        while idx < len(selected_unigrams):
            if abs(candicate_map[selected_unigrams[idx]][0]) < abs(
                    candicate_map[candicate_unigram][0]):
                break
            idx += 1
        selected_unigrams.insert(idx, candicate_unigram)
        if len(selected_unigrams) > unigram_capacity:
            selected_unigrams = selected_unigrams[:unigram_capacity]

    print '======== selected unigrams for feature ========'
    for selected_unigram in selected_unigrams:
        print selected_unigram, ':', candicate_map.get(selected_unigram)
    return selected_unigrams
Example #35
 def evaluate(self, embs, data):
     details = []
     results = []
     cnt_found_pairs_total = 0
     for (x, y), sim in data:
         x = x.lower()
         y = y.lower()
         # print(x,y)
         if embs.has_word(x) and embs.has_word(y) and not math.isnan(
                 embs.get_vector(x).dot(embs.get_vector(y))):
             # print(m.get_row(x).dot(m.get_row(y)))
             v = embs.get_vector(x).dot(embs.get_vector(y))
             results.append((v, sim))
             cnt_found_pairs_total += 1
             details.append([x, y, float(v), float(sim)])
         else:
             if not self.ignore_oov:
                 # results.append((-1, sim))
                 # details.append([x, y, str(-1), str(sim)])
                 results.append((0, sim))
                 details.append([x, y, str(0), str(sim)])
                 # print('oov')
                 pass
     if len(results) <= 2:
         return -1, cnt_found_pairs_total, []
     actual, expected = zip(*results)
     # print(actual)
     return spearmanr(actual, expected)[0], cnt_found_pairs_total, details
Example #36
def calculateMetrics(obs, mod, obsStart, obsEnd, modStart, modEnd, obsStep,
                     modStep, modTimes):
    if obsStep > 1 or modStep > 1:
        obs, mod, plotTimes = matchSeriesMonth(obs, mod, obsStart, obsEnd,
                                               modStart, modEnd, modTimes)
    if obsStep <= 1 and modStep <= 1:
        obs, mod, plotTimes = matchSeriesDay(obs, mod, obsStart, obsEnd,
                                             modStart, modEnd, modTimes)
    if len(obs) > timeSize and len(mod) > timeSize:
        obsSel = np.isnan(obs) == False
        modSel = np.isnan(mod) == False
        sel = obsSel & modSel
        if len(obs[sel]) > timeSize and len(mod[sel]) > timeSize:
            R = spearmanr(obs[sel], mod[sel])[0]
            NS = nashSutcliffe(obs[sel], mod[sel])
            RMSE = rmse(obs[sel], mod[sel])
            Bias, numPoints = bias(obs[sel], mod[sel])
            KGE, CC, Alpha, Beta = kge(obs[sel], mod[sel])
            AC = anomalyCorrelation(obs[sel], mod[sel])
            return R, AC, KGE, CC, Alpha, Beta, NS, RMSE, Bias, numPoints
        else:
            return np.zeros((10))
    else:
        return np.zeros((10))
Example #37
def spearman_with_errors(x,y,yerr,Nmc=1000,plotflag=False,verbose=False):
    ysim=np.zeros(Nmc,'f')
    rhosim=np.zeros(Nmc,'f')
    psim=np.zeros(Nmc,'f')

    for i in range(Nmc):
        ysim=np.random.normal(y,scale=yerr,size=len(y))
        rhosim[i],psim[i] = spearmanr(x,ysim)
    cave=np.mean(rhosim)
    cstd=np.std(rhosim)
    q1=50-34 # mean minus one std
    lower=np.percentile(rhosim,q1)
    q2=50+34 # mean plus one std
    upper=np.percentile(rhosim,q2)
    print 'mean (median) = %5.2f (%5.2f), std = %5.2f'%(cave,np.median(rhosim),cstd)
    print 'confidence interval from sorted list of MC fit values:'
    print 'lower = %5.2f (%5.2f), upper = %5.2f (%5.2f)'%(lower,cave-cstd, upper,cave+cstd)
    k,pnorm=normaltest(rhosim)
    print 'probability that distribution of slopes is normal = %5.2f'%(pnorm)
    if plotflag:
        plt.figure(figsize=(10,4))
        plt.subplot(1,2,1)
        plt.hist(rhosim,bins=10,normed=True)
        plt.xlabel(r'$Spearman \ \rho $')
        plt.axvline(x=cave,ls='-',color='k')
        plt.axvline(x=lower,ls='--',color='k')
        plt.axvline(x=upper,ls='--',color='k')
        plt.subplot(1,2,2)
        plt.hist(np.log10(psim),bins=10,normed=True)
        plt.xlabel(r'$\log_{10}(p \ value)$')
    return rhosim,psim
Example #38
        def get_ranking_correlations(grp):

            grp = pd.pivot_table(grp,
                                 index=self.platform_col,
                                 values='value',
                                 columns=self.content_col).fillna(0)

            platform1s = []
            platform2s = []
            corrs = []

            for i, p1 in enumerate(platforms):
                for p2 in platforms[i + 1:]:

                    if p1 in grp.index and p2 in grp.index:
                        corr = spearmanr(grp.loc[p1].values,
                                         grp.loc[p2].values)[0]

                        platform1s.append(p1)
                        platform2s.append(p2)
                        corrs.append(corr)

            corr = pd.DataFrame({
                'platform1': platform1s,
                'platform2': platform2s,
                'value': corrs
            })

            return corr
Example #39
def mono_bin(Y, X, n=20):
    r = 0
    good = Y.sum()
    bad = Y.count() - good
    while np.abs(r) < 1:
        # pd.qcut picks evenly spaced bins based on the frequency of the values
        pdqcut = pd.qcut(X, n).value_counts()
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        d2 = d1.groupby('Bucket', as_index=True)
        # Spearman rank correlation as the measure of monotonicity
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
        n = n - 1
    d3 = pd.DataFrame(d2.X.min(), columns=['min'])
    d3['min'] = d2.min().X
    d3['max'] = d2.max().X
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['rate'] = d2.mean().Y
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    d4 = (d3.sort_values(by='min'))
    print("=" * 60)
    print(d4)
    cut = []
    cut.append(float('-inf'))
    for i in range(1, n + 1):
        qua = X.quantile(i / (n + 1))
        cut.append(round(qua, 4))
    cut.append(float('inf'))
    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe
Example #40
def per_class_scatter(fold_dict, metadata, stat_name, md_name, md_dict, param_val=0.1, smooth=True, avg=False,
                   axis_label_size=14, legend_title_size=14, legend_label_size=12, title_size=16, save_file=None):
    data_list = []
    for epoch_num in sorted(fold_dict):
        if epoch_num == '5':
            for fold in fold_dict[epoch_num]:
                for entry in fold_dict[epoch_num][fold]:
                    param_val_i, stat_list = entry
                    if param_val_i == param_val:
                        for cls_idx, stat_val in enumerate(stat_list):
                            cls = metadata['idx_map'][str(cls_idx)]
                            md_val = md_dict[str(fold)][cls][md_name]
                            data_list.append((md_val, stat_val))
                    
    print('Num points: {}'.format(len(data_list)))
    arr = np.array(data_list).T
    cc, pv = pearsonr(*arr)
    sc, sv = spearmanr(*arr)
    print('Pearson Correlation: {:.3f}, {:.5f}'.format(cc, pv))
    print('Spearman Correlation: {:.3f}, {:.5f}'.format(sc, sv))
    plt.figure()
    plt.scatter(*arr, edgecolors='black')
    plt.xscale('log')
    plt.title('{} vs. {} for 35 Classes'.format(stat_name, metric_title_dict[md_name]), fontsize=title_size)
    plt.xlabel('{}'.format(metric_axis_dict[md_name]), fontsize=axis_label_size)
    plt.ylabel('{}'.format(stat_name), fontsize=axis_label_size)
    
    if save_file is not None:
        fig_path = os.path.join(FIG_DIR, save_file)
        plt.savefig(fig_path, bbox_inches="tight", format='png', dpi=300)
Example #41
def per_peak_scatter(fold_dict, metadata, param_name, md_name, md_dict, smooth=True, avg=False):
    data_list = []
    for epoch_num in sorted(fold_dict):
        if epoch_num == '5':
            for fold in sorted(fold_dict[epoch_num])[:-1]:
                entry_list = sorted(fold_dict[epoch_num][fold], key=lambda x: x[0])
                param_arr = np.array(list(zip(*entry_list))[0])
                entry_arr = np.array(list(zip(*entry_list))[1])
                max_idx = np.argmax(entry_arr, axis=0)
                max_list = param_arr[max_idx]
                for cls_idx, max_val in enumerate(max_list):
                    cls = metadata['idx_map'][str(cls_idx)]
                    md_val = md_dict[str(fold)][cls][md_name]
                    data_list.append((md_val, max_val))
                    
    print('Num points: {}'.format(len(data_list)))
    arr = np.array(data_list).T
    cc, pv = pearsonr(*arr)
    sc, sv = spearmanr(*arr)
    print('Pearson Correlation: {:.3f}, {:.5f}'.format(cc, pv))
    print('Spearman Correlation: {:.3f}, {:.5f}'.format(sc, sv))
    plt.figure()
    plt.scatter(*arr)
    plt.xscale('log')
    plt.title('{} vs. {} for 35 Classes'.format(param_name, md_name))
    plt.xlabel('{}'.format(md_name))
    plt.ylabel('{}'.format(param_name))
Example #42
def compare_localness_lenses():
    localness = collections.defaultdict(lambda: [(0, 0), (0, 0)])

    for (i, path) in enumerate([EDITOR_COUNTS, SOURCE_COUNTS]):
        totals = collections.defaultdict(int)
        locals = collections.defaultdict(int)
        for row in sg_open_csvr(path):
            c = int(row['count'])
            key = (row['project'], row['article_country'])
            totals[key] += c
            if row['article_country'] == row['other_country']:
                locals[key] += c

        for key in locals:
            localness[key][i] = (1.0 * locals[key] / totals[key], totals[key])

    X = []
    Y = []
    for ((editor_p, editor_n), (source_p, source_n)) in localness.values():
        if editor_n > 100 and source_n > 100:
            X.append(editor_p)
            Y.append(source_p)

    from scipy.stats.stats import pearsonr, spearmanr

    print 'n = ', len(X)
    print 'spearman', spearmanr(X, Y)
    print 'pearson', pearsonr(X, Y)
    print 'num where source locality is higher: ', len([1 for (x, y) in zip(X, Y) if y > x])
Example #43
def evaluate_model(matrix, feature_names, measure='men'):
    "Evaluates a model on the basis of the provided similarity measure."
    # Load the similarity measure, if possible.
    try:
        sim_dict = __resource__[measure]
    except KeyError:
        return None

    # Select pairs that can be used for testing.
    sim_words = {word for pair in sim_dict for word in pair}
    usable_words = set(feature_names) & sim_words
    usable_pairs = {
        key
        for key in sim_dict.keys() if set(key).issubset(usable_words)
    }

    # Gather lists of actual values and 'predictions'
    actual_values = []
    predicted_values = []
    indices = {name: i for i, name in enumerate(feature_names)}
    cosine = lambda x, y: float(pairwise_distances(x, y, metric='cosine'))
    for a, b in usable_pairs:
        actual_values.append(sim_dict[(a, b)])
        predicted_values.append(cosine(matrix[indices[a]], matrix[indices[b]]))

    # Compute the correlation:
    correlation, sig = spearmanr(actual_values, predicted_values)
    return {
        "correlation": correlation,
        "explained": correlation * correlation,
        "significance": sig,
        "test_pairs": len(usable_pairs),
        "predictions": dict(zip(usable_pairs, predicted_values))
    }
Example #44
def correlation(X, Y):
    indices = [i for i in range(len(X)) if X[i] is not None]

    x1 = [X[i] for i in indices]
    x2 = [Y[i] for i in indices]
    print(min(x1), max(x1))
    return spearmanr(x1, x2), pearsonr(x1, x2)
Example #45
    def dist_to_string(self, judgements):
        output_string = []
        human_similarities = []
        cosine_similarities = []
        for judgement in judgements:
            if not judgement.strip():
                continue
            line = judgement.split(",")
            word_1 = line[0]
            word_1_index = self.get_word_id(word_1)
            word_2 = line[1]
            word_2_index = self.get_word_id(word_2)
            human_similarities.append(float(line[2]))

            word_1_context = self.distributional_model[word_1_index]
            word_2_context = self.distributional_model[word_2_index]
            length = min(10, len(word_1_context), len(word_2_context))

            word_1_top_10 = sorted(word_1_context.items(), key=lambda kv: (-kv[1], kv[0]))[:length]
            word_2_top_10 = sorted(word_2_context.items(), key=lambda kv: (-kv[1], kv[0]))[:length]
            output_string.append(word_1 + " " + " ".join(['%s: %i' % (self.word2idx[k], v) for k, v in word_1_top_10]))
            output_string.append(word_2 + " " + " ".join(['%s: %i' % (self.word2idx[k], v) for k, v in word_2_top_10]))

            word_1_values = [t[1] for t in word_1_top_10]
            word_2_values = [t[1] for t in word_2_top_10]
            cosine_similarity = 1 - spatial.distance.cosine(word_1_values, word_2_values)
            cosine_similarities.append(cosine_similarity)

            output_string.append(word_1 + "," + word_2 + ":" + str(cosine_similarity))
        output_string.append("correlation:" + str(spearmanr(cosine_similarities, human_similarities)[0]))
        return "\n".join(output_string)
Example #46
0
    def generate_statistics(self,
                            data_x,
                            data_e,
                            data_t,
                            name,
                            session_dict,
                            save=True):
        #        self.saver.restore(sess=self.session, save_path=self.save_path)
        ci, cost, rae, ranking, gen, reg, disc, layer_one_recon, t_reg, t_mse = \
            self.predict_concordance_index(x=data_x,
                                           e=data_e,
                                           t=data_t,
                                           outcomes=session_dict['outcomes'])

        observed_idx = self.extract_observed_death(name=name,
                                                   observed_e=data_e,
                                                   observed_t=data_t,
                                                   save=save)

        median_predicted_time = self.median_predict_time(session_dict)
        if name == 'Train':
            self.predicted_time_train = median_predicted_time
        if name == 'Valid':
            self.val_loss = cost
        observed_empirical = data_t[observed_idx]
        observed_predicted = median_predicted_time[observed_idx]
        observed_ci = concordance_index(
            event_times=observed_empirical,
            predicted_scores=np.nan_to_num(observed_predicted),
            event_observed=data_e[observed_idx])

        corr = spearmanr(observed_empirical, observed_predicted)
        ##### ibs / ibll #####
        time_grid = np.linspace(data_t.min(), data_t.max(), 100)
        ds = np.array(time_grid - np.array([0.0] + time_grid[:-1].tolist()))
        bs, bll = get_scores(y_train=self.train_t,
                             delta_train=self.train_e,
                             y_test=data_t,
                             delta_test=data_e,
                             pred_train=self.predicted_time_train,
                             pred_test=median_predicted_time,
                             time_grid=time_grid,
                             surv_residual=False,
                             cens_residual=False)

        ibs = sum(bs * ds) / (time_grid.max() - time_grid.min())
        ibll = sum(bll * ds) / (time_grid.max() - time_grid.min())
        ######################
        results = "{} || loss: {}, CI: {}, IBS: {}, IBLL: {}".format(
            name, np.round(cost, 4), np.round(ci, 4), np.round(ibs, 4),
            np.round(ibll, 4))
        #        logging.debug(results)
        print(results)

        if name == 'Test':
            self.ctd = ci
            self.ibs = ibs
            self.nbll = ibll
Example #47
0
def get_bigrams_for_feature(word_feature, neus, capacity):
    """
    Extract unigram features. The most frequent $capacity number of bigrams will be selected.
    Then these bigrams will be filted based on pearson regression.
    """
    top_bigrams = word_feature.get_top_bigrams(capacity)
    print 'most frequent bigrams: ', top_bigrams

    candidate_map = dict()
    for bigram in top_bigrams:
        bg_counts = word_feature.get_feauture_by_bigram(bigram)
        p = spearmanr(bg_counts, neus)
        if not math.isnan(p[0]):
            candidate_map[bigram] = p

    selected_bigrams = list()
    for candidate_bigram in candidate_map.keys():
        idx = 0
        while idx < len(selected_bigrams):
            if abs(candidate_map[selected_bigrams[idx]][0]) < abs(
                    candidate_map[candidate_bigram][0]):
                break
            idx += 1
        selected_bigrams.insert(idx, candidate_bigram)
        if len(selected_bigrams) > bigram_capacity:
            selected_bigrams = selected_bigrams[:bigram_capacity]

    print '======== selected bigrams for feature ========'
    for selected_bigram in selected_bigrams:
        print selected_bigram, ':', candidate_map.get(selected_bigram)
    return selected_bigrams
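The insertion loop above simply keeps the candidates ordered by the absolute value of their correlation, trimmed to the apparently module-level bigram_capacity. An equivalent and arguably clearer formulation (same output up to the ordering of ties):

def select_top_bigrams(candidate_map, limit):
    # candidate_map: bigram -> (rho, p-value); rank by |rho|, descending.
    ranked = sorted(candidate_map,
                    key=lambda bg: abs(candidate_map[bg][0]),
                    reverse=True)
    return ranked[:limit]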
Example #48
0
def correlations_speed(cur, variable1, variable2, table):
    """
    Correlation of 2 variables (including scatter plot)
    """
    x = select(cur, variable1, table)
    y = select(cur, variable2, table)

    # Scatterplot
    #    mpl.style.use('ggplot')
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel("Gap")
    ax.set_ylabel("Sentiment magnitude")
    fig.suptitle('Correlation funding gap and sentiment magnitude')
    plt.scatter(x, y)
    plt.show()

    # Pearson correlation and p-value
    p_corr_speed_length = pearsonr(x, y)
    print("Pearson: ", p_corr_speed_length)

    # Spearman correlation and p-value
    s_corr_speed_length = spearmanr(x, y)
    print("Spearman: ", s_corr_speed_length)

    # Kendall correlation and p-value
    k_corr_speed_length = kendalltau(x, y)
    print("Kendall: ", k_corr_speed_length)
def byGene(geneSpanFN, wigDir1, wigDir2, chrom, strand, outFN, simulation = False):
    '''the HeLa data must be passed as wigDir2 because its strand is flipped'''
    strand = str(strand) #undo autocast
   
    print 'loading wigs'
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    print 'calculating bin values'
    f = open(geneSpanFN, 'r')
    fOut = open(outFN, 'w')
    for line in f:
        ls = line.strip().split('\t')
        sChrom, sStrand = ls[1], ls[2]
        if sChrom != chrom or sStrand != strand:
            continue
        geneName = ls[0]
        geneStarts = [int(x) for x in ls[3].split(',')]
        geneEnds = [int(x) for x in ls[4].split(',')]
        spanPairs = zip(geneStarts, geneEnds)

        frameLength = 10
        skipAmount = 2
        theSpan = fullSpanFromPairs(spanPairs)
        spanLength = len(theSpan)

        binAvgs1 = []
        binAvgs2 = []

        for theBinAvg, theCoord_Val in [(binAvgs1, coord_value1), (binAvgs2, coord_value2)]:
            #mix up bins if simulation
            if simulation:
                newSpan = mixSpanByBin(theSpan, frameLength)
            else:
                newSpan = theSpan
            
            i = 0
            while (i+frameLength) < (spanLength+1):
                binNums = newSpan[i:(i + frameLength)]
                theBinAvg.append(binAvg(theCoord_Val, binNums))
                i = i + skipAmount

        #get rid of all 0,0 pairs for correlation 
        editPairs = zip(binAvgs1, binAvgs2)
        newPairs = [pair for pair in editPairs if not (pair[0] == 0 and pair[1] == 0)]
        newX = [pair[0] for pair in newPairs]
        newY = [pair[1] for pair in newPairs]

        dataLoad = sum(binAvgs1) + sum(binAvgs2)
        dataLoad = float(dataLoad)/2
        pcc = pStats.pearsonr(binAvgs1, binAvgs2)
        scc, pVal = pStats.spearmanr(binAvgs1, binAvgs2)
        outString = [geneName, pcc[0], ','.join([str(x) for x in binAvgs1]), ','.join([str(x) for x in binAvgs2]), '%s:%s:%s' % (sChrom, sStrand, theSpan[0]), dataLoad, scc]  
        fOut.write('\t'.join([str(x) for x in outString]) + '\n')

    fOut.close()
    f.close()
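byGene leans on helpers (fullSpanFromPairs, binAvg, mixSpanByBin) and on cgWig/bioLibCG loaders that are not shown. Plausible minimal implementations of the three helpers, offered purely as assumptions about their contracts:

import random

def fullSpanFromPairs(spanPairs):
    # Flatten (start, end) pairs into one ordered coordinate list.
    span = []
    for start, end in spanPairs:
        span.extend(range(start, end))
    return span

def binAvg(coord_value, binNums):
    # Mean wig signal over a bin's coordinates, treating absent ones as 0.
    return sum(coord_value.get(c, 0.0) for c in binNums) / float(len(binNums))

def mixSpanByBin(span, frameLength):
    # Shuffle the span bin-by-bin to build the simulation baseline.
    bins = [span[i:i + frameLength] for i in range(0, len(span), frameLength)]
    random.shuffle(bins)
    return [c for b in bins for c in b]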
Example #50
0
def evaluate(representation, data):
    results = []
    for (x, y), sim in data:
        # if representation.oov(x) or representation.oov(y):
        #     continue
        results.append((representation.similarity(x, y), sim))
    actual, expected = zip(*results)
    return spearmanr(actual, expected)[0]
def spearmanc(sites, data, ages, deltas=False):
    res = {}
    random_ages = list(permutations(ages)) if deltas else gen_random_ages()

    for s in sites:
        values = (getvd if deltas else getvf)(data, s)
        if len(set(values)) == 1:
            continue

        res[s] = {}
        res[s]['score'], res[s]['pval'] = spearmanr(values, ages)

        res[s]['rscore'] = json.dumps([spearmanr(values, rage)[0] for rage in random_ages])

        res[s]['data'] = json.dumps(values)
    return res
Example #52
0
def innerQuery(name, dest, count, doprint, stopwords, trecoutput, sorm, normalized):
    with open(dest, "w") as f:
        f.write("<parameters>\n")
        f.write("<index>/home/ginger/Documents/IR/Project2-qpp/" + name + "-index</index>\n")
        f.write("<runID>2016</runID>\n")
        f.write("<trecFormat>true</trecFormat>\n")
        f.write("<stemmer>Krovetz</stemmer>\n")
        f.write("<count>" + str(count) + "</count>\n")
        f.write("<baseline>okapi,k1:1.2,b:0.75,k3:7</baseline>\n")
        counter1, min1, max1, wcount1 = makeQueries("../../IR2016/queries/topics.301-350", dest, f, doprint)
        counter2, min2, max2, wcount2 = makeQueries("../../IR2016/queries/topics.351-400", dest, f, doprint)
        counter3, min3, max3, wcount3 = makeQueries("../../IR2016/queries/topics.401-450", dest, f, doprint)
        f.write(stopwords)
        f.write("</parameters>\n")

    wordcounts = wcount1
    wordcounts.extend(wcount2)
    wordcounts.extend(wcount3)

    if doprint:
        counter = sum([counter1, counter2, counter3])
        minval = min([min1, min2, min3])
        maxval = max([max1, max2, max3])
        print "queries,150,", 1. * counter / 150, ",", minval, ",", maxval
        #trecoutput.write( "name,k,recip,p10")
    #do indri stuff
    qrelsfile = "../qrels/qrels.txt"
    namefile = str(name) + str(count)
    resfile = "../queries/results/" + namefile + ".txt"
    queryfile = "../queries/" + namefile + ".txt"
    if sorm:
        fun = mad
        namefile = "MAD" + namefile
    else:
        fun = np.std

    os.system( "IndriRunQuery " + queryfile + " > " + resfile)
    correlations0 = "Pearsons "
    correlations1 = "Spearman "
    for cut in np.arange(0.1,1,0.2):
        newresfile = "../queries/results/" + namefile + "." + str(cut) + ".txt"
        trecfile = "../trec/" + namefile  + "." + str(cut) + ".txt"
        sds = applyCut(resfile,newresfile,count,cut,fun,normalized)
        os.system( "../trec_eval -q " + qrelsfile + " " + newresfile + " > " + trecfile)
        p10s = parseTrec_Eval(trecfile, name + "," + str(count),trecoutput)
        correlations0 += " & " + str(round(pearsonr(sds,p10s)[0],4))
        correlations1 += " & " + str(round(spearmanr(sds, p10s)[0],4))

    lineending = " \\\\ \\hline"
    print '''\\begin{table}[h!]
\\centering
\\begin{tabular}{|l|l|l|l|l|l|}
\\hline'''
    print name + " " + str(count) + " & $\sigma_{0.1\%}$ & $\sigma_{0.3\%}$ & $\sigma_{0.5\%}$ & $\sigma_{0.7\%}$ & $\sigma_{0.9\%}$" + lineending
    print correlations0 + lineending
    print correlations1 + lineending
    print '''\\end{tabular}
\\caption{$\\sigma_{\\%}$ correlations for ''' + namefile + '''}
\\end{table}'''
def generate_graphs(team_alias):
    fig = plt.figure(figsize=(15, 6))
    ax = fig.add_subplot(121)
    times_to_cross, results = get_data(team_alias)
    ax.hist(times_to_cross, bins=8 * 10, range=(0, 8), facecolor="green")
    plt.xlabel("Time to cross halfcourt (s)")
    plt.ylabel("Normalized frequency")
    plt.title("Histogram of time to cross halfcourt for %s" % (team_alias))
    mean = np.mean(times_to_cross)
    std = np.std(times_to_cross)
    n = len(times_to_cross)

    text = ax.text(
        0.05, 0.85, "Mean: {:.2f} \nStd Dev: {:.2f} \n# Possessions: {: d}".format(mean, std, n), transform=ax.transAxes
    )

    ax2 = fig.add_subplot(122)
    points, times = np.histogram(times_to_cross, bins=8 * 10, range=(0.00, 8.00), weights=results)
    num_crosses, times = np.histogram(times_to_cross, bins=8 * 10, range=(0.00, 8.00))
    to_remove = []
    for index, ncross in enumerate(num_crosses):
        if abs(ncross) == 0:
            to_remove.append(index)

    points = np.delete(points, to_remove)
    num_crosses = np.delete(num_crosses, to_remove)
    times = np.delete(times, to_remove)
    # hist1 = np.array(map(lambda x: 1.0 if abs(x) == 0 else float(x), hist1))
    avg_points = points / num_crosses
    times = np.delete(times, -1)
    ax2.scatter(times, avg_points)
    # avg = np.mean(avg_points)
    # std = np.std(avg_points)
    # corr, pvals = pearsonr(times, avg_points)
    # ax2.text(0.05,0.85, 'Avg: {:.2f} \nStd Dev: {:.2f} \nPearson\' corr: {:.2f}'.format(avg, std, corr), transform=ax2.transAxes)

    avg_all = np.mean(avg_points)
    std = np.std(avg_points)
    pcorr_all, p_val = pearsonr(times_to_cross, results)
    spcorr_all, sp_val = spearmanr(times_to_cross, results)
    ax2.text(
        0.05,
        0.85,
        "Pearson Corr: {:.2f}\n\
            Pearson 2t_val: {:.2f}\n\
            Spearman Corr: {:.2f}\n\
            Spearman 2t_val: {:.2f}".format(
            pcorr_all, p_val, spcorr_all, sp_val
        ),
        transform=ax2.transAxes,
    )

    # plt.bar(left=bar_x, height=final_hist, width=0.1)
    plt.xlabel("Time to cross halfcourt (s)")
    plt.ylabel("Average points per possession")
    plt.title("Avg points vs time to cross halfcourt for %s" % (team_alias))
    fig.savefig("csvs/%s/graphs.png" % (team_alias))
Example #54
0
def spearman_boot(x, y, N=5000, cont_int=68.):
    # cont_int (a confidence-interval width) is accepted but currently unused
    boot_rho = zeros(N, 'f')
    boot_p = zeros(N, 'f')
    for i in range(N):
        # numpy's randint excludes the upper bound, so pass len(x) rather than
        # len(x) - 1 to let every index appear in the resample
        indices = randint(0, len(x), len(x))
        xboot = x[indices]
        yboot = y[indices]
        boot_rho[i], boot_p[i] = spearmanr(xboot, yboot)
    return scoreatpercentile(boot_rho, per=50), scoreatpercentile(boot_p, per=50)  # ,boot_rho,boot_p
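spearman_boot assumes star-imported numpy and scipy names (zeros, randint, spearmanr, scoreatpercentile) and NumPy-array inputs, since it fancy-indexes x and y. A quick driver on synthetic monotone data, with those imports made explicit:

import numpy as np
from numpy import zeros
from numpy.random import randint
from scipy.stats import spearmanr, scoreatpercentile

rng = np.random.RandomState(0)
x = rng.rand(200)
y = x ** 2 + rng.normal(scale=0.05, size=200)
print(spearman_boot(x, y, N=1000))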
    def get_correlations(self, x, y):
        from scipy.stats.stats import spearmanr, pearsonr
        correlations = []
        rows = self.db.view('results/all').rows
        for row in rows:
            x_values = row['value'].get(x)
            y_values = row['value'].get(y)
            correlations.append((row.key, spearmanr(x_values, y_values), pearsonr(x_values, y_values)))
        return correlations
def final_corr():

	zfile = open("cluster_zscore.txt", "r")
	efile = open("entropy_list_redo.txt", "r")

	clust_zscore = dict()
	clust_entropy = dict()
	for each in efile:
		line = each.split()
		cluster = int(line[0])
		entropy = float(line[1])
		clust_entropy[cluster] = entropy

	for each in zfile:
		line = each.split()
		cluster = int(line[0])
		zscore = float(line[1])
		clust_zscore[cluster] = zscore
	x = []
	y = []
	for i in range(150000):
		if i not in clust_zscore or i not in clust_entropy:
			continue
		x.append(clust_entropy[i])
		y.append(clust_zscore[i])

	correlation, pvalue = spearmanr(x, y)
	print "spearman " + str(correlation)
	print pvalue

	correlation, pvalue = pearsonr(x, y)
	print "pearson " + str(correlation)
	x2 = []
	y2 = []
	for i in range(150000):
		if i not in clust_zscore:
			continue
		x2.append(i)
		y2.append(clust_zscore[i])

	x1 = []
	y1 = []
	for i in range(150000):
		if i not in clust_entropy:
			continue
		x1.append(i)
		y1.append(clust_entropy[i])

	plt.scatter(x,y)
	plt.title('Entropy of a Cluster vs. citing distance (redo)')
	plt.xlabel('Entropy')
	plt.ylabel('Citing Distance z scores')
	plt.savefig('entropy_citingscores_redo.png')
	plt.show()
	efile.close()
	zfile.close()
Example #57
0
def evaluate(representation, data):
    results = []
    seen_num = 0
    for (x, y), sim in data:
        predicted = representation.similarity(x, y)
        if predicted is not None:
            seen_num += 1
            results.append((predicted, sim))
    actual, expected = zip(*results)
    print("seen/total: " + str(seen_num) + "/" + str(len(data)))
    return spearmanr(actual, expected)[0]
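Both evaluate variants expect a representation object exposing similarity(x, y), returning None for unseen pairs in the second variant, and data as ((word1, word2), score) pairs. A toy driver, all names hypothetical:

from scipy.stats import spearmanr

class ToyRepresentation(object):
    def __init__(self, scores):
        self._scores = scores

    def similarity(self, x, y):
        # None for unseen pairs, mirroring the guard in the loop above.
        return self._scores.get((x, y))

data = [(('cat', 'dog'), 0.8), (('cat', 'car'), 0.2), (('dog', 'car'), 0.3)]
rep = ToyRepresentation({('cat', 'dog'): 0.9,
                         ('cat', 'car'): 0.1,
                         ('dog', 'car'): 0.4})
print(evaluate(rep, data))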
def draw_plot(ax, x, y, color, x_axis, y_axis, title):
    scatterplot.draw_actual_plot(ax, x, y, color, x_axis, y_axis, title, size=40)
    coeff, pval = pearsonr(x, y)
    rho, pval = spearmanr(x, y)
    mae = mean_abs_error(x, y)
    conv.add_text_dict(ax, { "PCC" : coeff, "Rho" : rho, "MAE" : mae })
        
    scatterplot.add_x_y_line(ax, min_val=min(x), max_val=max(x))

    return [coeff, rho, mae]
def compute_worker_bam(obj, chr_tbp):
    print chr_tbp
    
    file_a = pysam.Samfile(obj.file_a, 'rb')
    file_b = pysam.Samfile(obj.file_b, 'rb')
    
    feature_data = open(obj.feature + "/" + chr_tbp)
    feature_out_path = obj.tmp_path + "/" + chr_tbp
    feature_out = open(feature_out_path, 'w')
   
    start = 0
    end = 0
    vals1 = 0
    vals2 = 0
    corr = 0
    for line in feature_data:
        line = line.strip()
        sline = line.split()
        #pdb.set_trace()
        start = int(sline[1]) - 1 - obj.flank
        end = int(sline[2]) + obj.flank
        
        vals1 = np.zeros(end - start)
        vals2 = np.zeros(end - start)
      
        for column in file_a.pileup(reference=chr_tbp, start=start, end=end):
            if (column.pos >= start and column.pos < end):
 #              pdb.set_trace()
                try:
                    vals1[(column.pos - start)] = column.n
                except:
                    pdb.set_trace()
        
        for column in file_b.pileup(reference=chr_tbp, start=start, end=end):
            if (column.pos >= start and column.pos < end):
#               pdb.set_trace()
                try:
                    vals2[(column.pos - start)] = column.n
                except:
                    pdb.set_trace()

        if obj.corr_type == "cross" or obj.corr_type == "auto": 
            corr = ss.fftconvolve(vals1, vals2, 'same')
        elif obj.corr_type == "spearmanr":
            corr = [stats.spearmanr(vals1, vals2)[0]]
        elif obj.corr_type == "pearsonr":
            corr = [stats.pearsonr(vals1, vals2)[0]]
        feature_out.write("\t".join(sline[3:5] + map(str,corr)) + "\n")
       
    feature_data.close()
    feature_out.close()
def apply_stats(data,runTTest):
	peakList = getPeakList(data)
	tempList = list()

	colNames = ['Fatty Acid Type',           #1
				'Peak Name',                 #2
				'Pearson Coefficient',       #3
				'Pearson P Value',           #4
				'Spearman Coefficient',      #5
				'Spearman P Value',          #6
				'P Geometric Mean (%)',      #7
				'Q Geometric Mean (ug/ml)',  #8
				'P Mean (%)',                #9
				'P Stdev',                   #10
				'Q Mean (ug/ml)',            #11
				'Q Stdev',                   #12
				'P T-test',                  #13
				'P T-test P value',          #14
				'Q T-test',                  #15
				'Q T-test P value',          #16
				'Common Name']               #17


	for entry in peakList:
		try:
			pearson = pearsonr(entry['p'],entry['q'])
			spearman = spearmanr(entry['p'],entry['q'])
			if runTTest == 'y':
				ttestP = ttest_1samp(entry['p'],0)
				ttestQ = ttest_1samp(entry['q'],0)
			else:
				ttestP = ('-','-')
				ttestQ = ('-','-')
			tempList.append([entry['FAtype'],        #1
						entry['peakName'],           #2
						pearson[0],                  #3
						pearson[1],                  #4
						spearman[0],                 #5
						spearman[1],                 #6
						gmean(entry['p']),           #7
						gmean(entry['q']),           #8
						np.mean(entry['p']),         #9
						np.std(entry['p'],ddof=1),   #10
						np.mean(entry['q']),         #11
						np.std(entry['q'],ddof=1),   #12
						ttestP[0],                   #13
						ttestP[1],                   #14
						ttestQ[0],                   #15
						ttestQ[1],                   #16
						entry['common']])            #17
		except Exception: pass

	return pd.DataFrame(tempList, columns=colNames)
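getPeakList is external; from the field accesses above, each entry appears to be a dict with 'FAtype', 'peakName', 'p', 'q' and 'common' keys, where p and q are parallel measurement lists. A minimal illustration of the statistics applied to one such entry (schema inferred, not confirmed):

import numpy as np
from scipy.stats import pearsonr, spearmanr, gmean

entry = {'FAtype': 'saturated', 'peakName': 'C16:0',
         'p': [12.1, 11.8, 12.5, 12.0], 'q': [3.4, 3.1, 3.6, 3.3],
         'common': 'palmitic acid'}
print(pearsonr(entry['p'], entry['q']))
print(spearmanr(entry['p'], entry['q']))
print(gmean(entry['p']))
print(np.std(entry['q'], ddof=1))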