def predict(self, clf, train=None, test=None, binary=False):
    """Predict on ``test`` with ``clf`` and report ordinal and binary scores.

    Diagnostics are written to ``sys.stderr``; one ``"<target> <prediction>"``
    line per test item is written to ``sys.stdout``.

    Parameters
    ----------
    clf : estimator exposing ``predict`` (and ``predict_proba`` when
        ``binary`` is true).
    train : object with ``X`` and ``y`` attributes. Only read when ``binary``
        is false, where it feeds ``self.rescale_predictions``.
    test : object with ``X`` and ``y`` attributes.
    binary : bool
        When true the targets are binarized with ``self.binarize`` and the
        first ``predict_proba`` column is correlated with the raw targets.

    Returns
    -------
    None
    """
    def report(label, value):
        # Emits exactly what the Python 2 statement
        # ``print >> sys.stderr, label, value`` emitted, but is also valid
        # Python 3 syntax.
        sys.stderr.write('%s %s\n' % (label, value))

    predictions = clf.predict(test.X)
    if not binary:
        predictions = self.rescale_predictions(
            clf, train.y, clf.predict(train.X), predictions)

    test_y = test.y
    if binary:
        prob_predict = clf.predict_proba(test.X)
        test_y = self.binarize(test.y)
        report('accuracy (bin) = ', accuracy_score(predictions, test_y))
        report('tau (bin) = ', kendalltau(predictions, test_y))
        # correlation for ordinal task
        report('r (ord) = ', pearsonr(prob_predict[:, 0], test.y))
        report('tau (ord) = ', kendalltau(prob_predict[:, 0], test.y))
    else:
        report('r (ord) = ', pearsonr(predictions, test_y))
        report('tau (ord) = ', kendalltau(predictions, test_y))
        # score for binary task
        report('accuracy (bin) = ',
               accuracy_score(self.binarize(predictions),
                              self.binarize(test_y)))
        report('tau (bin) = ',
               kendalltau(self.binarize(predictions),
                          self.binarize(test_y)))

    for a, b in zip(test_y, predictions):
        # Equivalent of the Python 2 statement ``print a, b``.
        sys.stdout.write('%s %s\n' % (a, b))
def npccf(x, y, method="spearmanr", min_lag=-10, max_lag=10):
    """
    Nonparametric cross correlation of two equally long time series.

    For every integer lag in [min_lag, max_lag] the rank correlation
    r(lag) = corr(x[t-lag], y[t]) is computed on the overlapping parts of
    the two series.

    Parameters
    ----------
    x: time series (must support slicing)
    y: time series (must support slicing)
    method: "spearmanr" or "kendalltau"
    min_lag : int, default -10
    max_lag : int, default 10

    Returns
    ----------
    a dictionary with keys "corrs" (correlation coefficient for each lag),
    "lags" (the lags, as a range), "lb" and "ub" (arrays filled with
    -1/sqrt(len(x)) and +1/sqrt(len(x)) respectively).

    Raises
    ------
    AssertionError if the series differ in length or min_lag > max_lag.
    ValueError if method is not one of the supported names.
    """
    n_x, n_y = len(x), len(y)
    assert (n_x == n_y
            ), "The length of time series x and time series y must be equal!"
    assert (min_lag <= max_lag), "min_lag must less than or equal to max_lag!"

    lags = range(min_lag, max_lag + 1)
    n_lags = max_lag - min_lag + 1
    corrs = np.empty(n_lags)

    if method == "spearmanr":
        rank_corr = spearmanr
    elif method == "kendalltau":
        rank_corr = kendalltau
    else:
        raise ValueError("The method %s is not supported." % method)

    for pos, lag in enumerate(lags):
        if lag < 0:
            x_part, y_part = x[(-lag):], y[:lag]
        elif lag > 0:
            x_part, y_part = x[:(-lag)], y[lag:]
        else:
            x_part, y_part = x, y
        corrs[pos] = rank_corr(x_part, y_part)[0]

    bound = 1 / np.sqrt(n_x)
    return {
        "corrs": corrs,
        "lags": lags,
        "lb": np.repeat(-bound, n_lags),
        "ub": np.repeat(bound, n_lags),
    }
def update(self, es, **kwargs):
    """Adapt ``es.sigma`` by comparing current and previously stored fitness.

    On the first call (``es.countiter < 2``) the object is initialized via
    ``self.initialize(es)`` and the fitness values are only stored.  On later
    calls a statistic ``s`` is computed from how many entries of
    ``es.fit.fit`` lie below the stored fitness at ``self.index_to_compare``,
    smoothed into ``self.s`` with rate ``self.c``, and ``es.sigma`` is
    multiplied by ``exp(self.s / self.damp)``.

    ``**kwargs`` is accepted but not used in this method.
    """
    if es.countiter < 2:
        self.initialize(es)
        self.fit = es.fit.fit
    else:
        # previous fitness at floor/ceil of the (possibly fractional) index
        ft1, ft2 = self.fit[int(self.index_to_compare)], self.fit[int(np.ceil(self.index_to_compare))]
        # current fitness at floor/ceil of (popsize - 1) / 2
        ftt1, ftt2 = es.fit.fit[(es.popsize - 1) // 2], es.fit.fit[int(np.ceil((es.popsize - 1) / 2))]
        # fractional part of index_to_compare, used as interpolation weight
        pt2 = self.index_to_compare - int(self.index_to_compare)
        # ptt2 = (es.popsize - 1) / 2 - (es.popsize - 1) // 2  # not in use
        s = 0
        # NOTE: ``1 < 3`` is always true, so only the first branch ever runs;
        # the two alternatives below are kept as disabled variants.
        if 1 < 3:
            s += pt2 * sum(es.fit.fit <= self.fit[int(np.ceil(self.index_to_compare))])
            s += (1 - pt2) * sum(es.fit.fit < self.fit[int(self.index_to_compare)])
            s -= es.popsize / 2.
            s *= 2. / es.popsize  # the range was popsize, is 2
        elif 11 < 3:  # compare ft with median of ftt
            s += self.index_to_compare - sum(self.fit <= es.fit.fit[es.popsize // 2])
            s *= 2 / es.popsize  # the range was popsize, is 2
        else:  # compare ftt j-index of ft
            s += (1 - pt2) * np.sign(ft1 - ftt1)
            s += pt2 * np.sign(ft2 - ftt1)
        # exponential smoothing of the statistic, then multiplicative update
        self.s = (1 - self.c) * self.s + self.c * s
        es.sigma *= np.exp(self.s / self.damp)
    # es.more_to_write.append(10**(self.s))

    #es.more_to_write.append(10**((2 / es.popsize) * (sum(es.fit.fit < self.fit[int(self.index_to_compare)]) - (es.popsize + 1) / 2)))
    # # es.more_to_write.append(10**(self.index_to_compare - sum(self.fit <= es.fit.fit[es.popsize // 2])))
    # # es.more_to_write.append(10**(np.sign(self.fit[int(self.index_to_compare)] - es.fit.fit[es.popsize // 2])))
    # NOTE: ``11 < 3`` is always false; this diagnostic block never executes.
    if 11 < 3:
        import scipy.stats.stats as stats
        zkendall = stats.kendalltau(list(es.fit.fit) + list(self.fit),
                                    len(es.fit.fit) * [0] + len(self.fit) * [1])[0]
        es.more_to_write.append(10**zkendall)
    # remember the current fitness values for the next call
    self.fit = es.fit.fit
def get_metrics(self, y, yhat, name):
    """Collect error and correlation metrics for one model at this location.

    Returns a dict with the instance's ``lat`` and ``lon``, the given model
    ``name``, the value of ``self.compute_mse(y, yhat)`` and the Pearson,
    Kendall and Spearman coefficients of ``y`` against ``yhat``.
    """
    mse = self.compute_mse(y, yhat)
    # NOTE(review): the double index implies the Pearson statistic is itself
    # indexable for the inputs used here (e.g. column-shaped arrays) - confirm.
    pearson_stat = pearsonr(y, yhat)[0]
    pearson = pearson_stat[0]
    kendall, spearman = kendalltau(y, yhat)[0], spearmanr(y, yhat)[0]

    metrics = dict(lat=self.lat, lon=self.lon, model=name, mse=mse)
    metrics["pearson"] = pearson
    metrics["kendall"] = kendall
    metrics["spearman"] = spearman
    return metrics
def valid(val_loader, model, args, funcs=None):
    """Evaluate ``model`` on ``val_loader`` with Kendall's tau.

    If ``model`` has no callable ``predict`` it must have a callable
    ``compare``; the evaluation is then delegated to ``pairwise_valid`` once
    per seed in ``args.pairwise_valid_seeds`` (default ``[1, 12, 123]``) and
    the results are averaged.

    Otherwise ``model.predict`` is run on every batch, the
    ``(true_accs, all_scores)`` pair is optionally pickled to
    ``args.save_predict``, and the rank correlation is computed.

    Parameters
    ----------
    val_loader : iterable yielding ``(archs, accs, _)`` triples.
    model : object with ``eval`` and ``predict`` (or ``compare``).
    args : object with a ``save_predict`` attribute (``None`` disables
        saving) and optionally ``pairwise_valid_seeds``.
    funcs : optional list of callables ``func(true_accs, all_scores)``.

    Returns
    -------
    (corr, funcs_res) : the Kendall correlation and the list (or, in the
        pairwise case, seed-averaged array) of ``funcs`` results.
    """
    # A ``[]`` default would be shared between calls; normalise ``None``.
    funcs = [] if funcs is None else funcs

    if not callable(getattr(model, "predict", None)):
        assert callable(getattr(model, "compare", None))
        corrs, funcs_res = zip(*[
            pairwise_valid(val_loader, model, pv_seed, funcs)
            for pv_seed in getattr(args, "pairwise_valid_seeds", [1, 12, 123])
        ])
        funcs_res = np.mean(funcs_res, axis=0)
        logging.info("pairwise: {}".format(corrs))
        return np.mean(corrs), funcs_res

    model.eval()
    all_scores = []
    true_accs = []
    for archs, accs, _ in val_loader:
        all_scores += list(model.predict(archs).cpu().data.numpy())
        true_accs += list(accs)

    if args.save_predict is not None:
        with open(args.save_predict, "wb") as wf:
            pickle.dump((true_accs, all_scores), wf)

    corr = stats.kendalltau(true_accs, all_scores).correlation
    funcs_res = [func(true_accs, all_scores) for func in funcs]
    return corr, funcs_res
def batch_corr(x, y, method="pearsonr", ngap=1):
    """
    Compute correlation on streaming sequence x and y (batch method)

    The correlation of the prefixes x[:i+1], y[:i+1] is computed for
    i = 0, ngap, 2*ngap, ... and stored at position i of the result.

    Parameters
    ----------
    x: time series
    y: time series
    method: "pearsonr", "spearmanr" or "kendalltau"
    ngap: output correlation every ngap observations.

    Returns
    -------
    corrs: array of len(x) values. Positions that are not evaluated
        (because of ngap) and the first position (a single observation has
        no defined correlation) hold NaN.

    Raises
    ------
    AssertionError if x and y differ in length.
    ValueError if method is not supported.
    """
    n1 = len(x)
    n2 = len(y)
    assert (n1 == n2
            ), "The length of time series x and time series y must be equal!"

    # NaN-filled instead of np.empty: with ngap > 1 the skipped positions
    # would otherwise contain arbitrary uninitialised memory.
    corrs = np.full(n1, np.nan)

    if method == "pearsonr":
        estimator = pearsonr
    elif method == "spearmanr":
        estimator = spearmanr
    elif method == "kendalltau":
        estimator = kendalltau
    else:
        raise ValueError(
            ('The method "%s" is not supported. Please specify one of '
             'the following options: "pearsonr", "spearmanr" or "kendalltau"')
            % method)

    for i in range(0, n1, ngap):
        if i == 0:
            # One observation: correlation is undefined; keep the NaN
            # rather than calling the estimator on a length-1 prefix.
            continue
        corrs[i] = estimator(x[:(i + 1)], y[:(i + 1)])[0]
    return corrs
def test_xp(true_scores, predict_scores): true_inds = np.argsort(true_scores)[::-1] true_scores = np.array(true_scores) reorder_true_scores = true_scores[true_inds] predict_scores = np.array(predict_scores) reorder_predict_scores = predict_scores[true_inds] ranks = np.argsort(reorder_predict_scores)[::-1] num_archs = len(ranks) # calculate precision at each point cur_inds = np.zeros(num_archs) passed_set = set() for i_rank, rank in enumerate(ranks): cur_inds[i_rank] = (cur_inds[i_rank - 1] if i_rank > 0 else 0) + \ int(i_rank in passed_set) + int(rank <= i_rank) passed_set.add(rank) patks = cur_inds / (np.arange(num_archs) + 1) THRESH = 100 p_corrs = [] for prec in [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]: k = np.where(patks[THRESH:] >= prec)[0][0] + THRESH arch_inds = ranks[:k][ranks[:k] < k] # stats.kendalltau(arch_inds, np.arange(len(arch_inds))) p_corrs.append( (k, float(k) / num_archs, len(arch_inds), prec, stats.kendalltau(reorder_true_scores[arch_inds], reorder_predict_scores[arch_inds]).correlation)) return p_corrs
def correlations_weighted_unweighted(labels):
    """Correlate weighted against unweighted PageRank and pickle the results.

    Two graphs are loaded with ``load_graph``.  For every label and every
    damping factor in (0.8, 0.85, 0.9) the vertex property
    ``<label>_page_rank_weighted_<damping>`` of the first graph is compared
    with ``page_rank<damping>`` of the second using Pearson, Spearman and
    Kendall correlation.  Each result is printed and stored under
    ``<label><damping>``; the collected dict is written with
    ``write_pickle``.

    The print calls use the single-argument parenthesised form, which
    produces the same output under Python 2 and Python 3.
    """
    #load network
    print('weighted vs unweighted')
    name = '_'.join(labels)
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    #read counts with zeros
    wikipedia_u = load_graph("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")

    measures = (
        ('pearson', pearsonr),
        ('spearmanr', spearmanr),
        ('kendalltau', kendalltau),
    )
    correlations_weighted_pagerank = {}
    for label in labels:
        for damping in [0.8, 0.85, 0.9]:
            key_weighted = label+"_page_rank_weighted_"+str(damping)
            pagerank_weighted = wikipedia.vertex_properties[key_weighted]
            key_unweighted = "page_rank"+str(damping)
            pagerank_unweighted = wikipedia_u.vertex_properties[key_unweighted]

            correlations_values = {}
            for measure_name, measure in measures:
                print(measure_name)
                result = measure(pagerank_weighted.a, pagerank_unweighted.a)
                print(result)
                correlations_values[measure_name] = result
            correlations_weighted_pagerank[label+str(damping)] = correlations_values

    write_pickle(HOME+'output/correlations/correlations_pagerank_weightedvsunweighted'+name+'.obj', correlations_weighted_pagerank)
def kendall_rank_correlation(item_item, user_user, itemknn, wrmf):
    """
    Print the pairwise Kendall rank correlation of four recommendation lists.

    :param item_item: list having top k shows through item-item
    :param user_user: list having top k shows through user-user
    :param itemknn: list having top k shows through MyMediaLite standard library's itemknn method
    :param wrmf: list having top k shows through MyMediaLite standard library's WRMF method
    :return: nothing; a 4x4 table is written to standard output
    """
    table_labels = ["Item_Item", "User_User", "ItemKNN", "WRMF"]
    rankings = [item_item, user_user, itemknn, wrmf]

    # Only the upper triangle (diagonal included) is computed; the lower
    # triangle is its mirror image.
    k_r_correlation = np.zeros((4, 4))
    for row in range(4):
        for col in range(row, 4):
            tau = kendalltau(rankings[row], rankings[col])[0]
            k_r_correlation[row][col] = tau
            k_r_correlation[col][row] = tau

    print("\n*** Kendall Rank correlation coefficient ***")
    print("{0:^11s}{1:^11s}{2:^11s}{3:^11s}{4:^11s}".format("", *table_labels))
    for label, values in zip(table_labels, k_r_correlation):
        print("{0:11s}".format(label), end='')
        for value in values:
            print("{0:^11.5f}".format(value), end='')
        print()
def correlations_ground_truth():
    """Correlate PageRank with article counts and pickle the results.

    Loads a graph with ``load_graph`` and the tab-separated file
    ``TMP + 'article_counts.tsv'``.  For each damping factor in (0.8, 0.9)
    PageRank is computed with ``pagerank`` and stored as the vertex property
    ``page_rank_<damping>``; the PageRank of every ``target_article_id`` is
    then compared with its ``counts`` value using Pearson, Spearman and
    Kendall correlation.  Results are printed and written with
    ``write_pickle``.

    The print calls use the single-argument parenthesised form, which
    produces the same output under Python 2 and Python 3.
    """
    print('ground truth')
    #load network
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering.xml.gz")
    #read counts with zeros
    article_counts = pd.read_csv(TMP+'article_counts.tsv', sep='\t')

    measures = (
        ('pearson', pearsonr),
        ('spearmanr', spearmanr),
        ('kendalltau', kendalltau),
    )
    cor = {}
    for damping in [0.8, 0.9]:
        page_rank = pagerank(wikipedia, damping=damping)
        wikipedia.vertex_properties['page_rank_'+str(damping)] = page_rank

        page_rank_values = list()
        counts = list()
        for index, row in article_counts.iterrows():
            counts.append(float(row['counts']))
            page_rank_values.append(page_rank[wikipedia.vertex(int(row['target_article_id']))])

        correlations_values = {}
        for measure_name, measure in measures:
            print(measure_name)
            result = measure(page_rank_values, counts)
            print(result)
            correlations_values[measure_name] = result
        cor['page_rank_'+str(damping)] = correlations_values

    write_pickle(HOME+'output/correlations/correlations_pagerank.obj', cor)
def test_goodness(model, vocab):
    """Score the model's goodness ranking of exemplars per category.

    The reference rankings come from
    ``dedeyne_etal_goodness.get_goodness_rankings()``.  For every category
    that is also in ``vocab`` the exemplars present in ``vocab`` are ranked
    by descending ``model.similarity(category, exemplar)`` and that ranking
    is compared with the exemplars' positions in the reference list.

    Returns a dict with one entry per reference category (empty for
    categories outside ``vocab``) holding ``"spearman"``, ``"kendall"`` and
    ``"num_items"``, plus an ``"overall"`` entry with the mean absolute
    coefficients ``"avg_spearman"`` and ``"avg_kendall"``.
    """
    reference = dedeyne_etal_goodness.get_goodness_rankings()
    results = {category: dict() for category in reference}
    categories = set(reference.keys()) & vocab

    for category in categories:
        exemplars = set(reference[category]) & vocab
        scored = sorted([(model.similarity(category, ex), ex)
                         for ex in exemplars],
                        reverse=True)
        by_similarity = [exemplar for _, exemplar in scored]

        actual_ranking = [reference[category].index(ex) for ex in exemplars]
        predicted_ranking = [by_similarity.index(ex) for ex in exemplars]

        entry = results[category]
        entry["spearman"] = spearmanr(predicted_ranking, actual_ranking)
        entry["kendall"] = kendalltau(predicted_ranking, actual_ranking)
        entry["num_items"] = len(exemplars)

    def mean_abs(measure):
        # mean of |coefficient| over the evaluated categories
        total = sum(abs(results[cat][measure][0]) for cat in categories)
        return float(total) / len(categories)

    avg_spearman = mean_abs("spearman")
    avg_kendall = mean_abs("kendall")
    results["overall"] = {
        "avg_spearman": avg_spearman,
        "avg_kendall": avg_kendall,
    }
    return results
def batch_corr(x, y, method="pearsonr", ngap=1):
    """
    Compute correlation on streaming sequence x and y (batch method)

    Parameters
    ----------
    x: time series
    y: time series
    method: determines which type of correlation is computed. Accepted
        methods are "pearsonr", "spearmanr" and "kendalltau"
    ngap: output correlation every ngap observations

    Returns
    -------
    corrs: correlations of the prefixes x[:i+1], y[:i+1] at the selected
        time indexes i = ngap-1, 2*ngap-1, ... A prefix holding a single
        observation has no defined correlation and yields NaN.
    t: selected time indexes (a range)

    Raises
    ------
    AssertionError if x and y differ in length.
    ValueError if method is not supported.
    """
    n1 = len(x)
    n2 = len(y)
    assert (n1 == n2),"The length of time series x and time series y must be equal!"

    corrs = np.empty(n1//ngap)

    if method == "pearsonr":
        estimator = pearsonr
    elif method == "spearmanr":
        estimator = spearmanr
    elif method == "kendalltau":
        estimator = kendalltau
    else:
        raise ValueError(('The method "%s" is not supported. Please specify one of '
                          'the following options: "pearsonr", "spearmanr" or "kendalltau"') % method)

    t = range(ngap-1, n1, ngap)
    # enumerate() yields the slot (i+1)//ngap - 1 for each selected index i;
    # the original true division produced a float, which is not a valid
    # array index on Python 3.
    for slot, i in enumerate(t):
        if i < 1:
            corrs[slot] = np.nan
        else:
            corrs[slot] = estimator(x[:(i+1)], y[:(i+1)])[0]
    return corrs, t
def correlations_speed(cur, variable1, variable2, table):
    """
    Show a scatter plot of two table columns and print their correlations.

    Both columns are fetched with ``select(cur, <variable>, table)``.  The
    plot is displayed with ``plt.show()``; afterwards the Pearson, Spearman
    and Kendall results (as returned by the scipy functions) are printed.
    """
    x = select(cur, variable1, table)
    y = select(cur, variable2, table)

    # Scatterplot
    # mpl.style.use('ggplot')
    fig = plt.figure()
    axes = fig.add_subplot(1, 1, 1)
    axes.set_xlabel("Gap")
    axes.set_ylabel("Sentiment magnitude")
    fig.suptitle('Correlation funding gap and sentiment magnitude')
    plt.scatter(x, y)
    plt.show()

    # Each result is the value returned by the scipy function (coefficient
    # and p-value); it is computed and printed in this order.
    for caption, measure in (("Pearson: ", pearsonr),
                             ("Spearman: ", spearmanr),
                             ("Kendall: ", kendalltau)):
        print(caption, measure(x, y))
def KendallsTau(GroundAvgInfs, AlgoAvgInfs, PageRanks, OutDegree):
    """Kendall's tau of three node orderings against the ground-truth one.

    Each argument is a sequence of per-node scores, all of the same length.
    Nodes are ordered by descending score for every sequence; each ordering
    is then expressed through the positions the nodes have in the
    ground-truth ordering and compared with it.

    Returns
    -------
    (AlgoTau, PageTau, OutTau) : Kendall's tau of the ground-truth ordering
        against the orderings by ``AlgoAvgInfs``, ``PageRanks`` and
        ``OutDegree`` respectively.
    """
    from scipy import stats

    def descending_order(values):
        # Ascending stable sort followed by a reversal; kept in this form
        # (rather than ``reverse=True``) so tied scores keep the same
        # relative order as before.
        return sorted(range(len(values)), key=lambda i: values[i])[::-1]

    Ground_rank = descending_order(GroundAvgInfs)
    Algo_rank = descending_order(AlgoAvgInfs)
    Pagerank_rank = descending_order(PageRanks)
    OutDeg_rank = descending_order(OutDegree)

    # Position of each node in the ground-truth ordering.  The size follows
    # the input instead of a fixed 453, so longer inputs no longer fail with
    # a KeyError.
    position_of = dict(zip(Ground_rank, range(len(Ground_rank))))
    Ground_rankings = [position_of[i] for i in Ground_rank]
    Page_rankings = [position_of[i] for i in Pagerank_rank]
    Algo_rankings = [position_of[i] for i in Algo_rank]
    OutDeg_rankings = [position_of[i] for i in OutDeg_rank]

    # Kendall's tau is (concordant - discordant) / (concordant + discordant)
    # over all pairs when there are no ties; scipy computes it below.
    AlgoTau, p_value = stats.kendalltau(Ground_rankings, Algo_rankings)
    PageTau, Pp_value = stats.kendalltau(Ground_rankings, Page_rankings)
    OutTau, Op_value = stats.kendalltau(Ground_rankings, OutDeg_rankings)
    return AlgoTau, PageTau, OutTau
def get_spearman_and_kendalltau_correlations(top_n_aspects: int = 10):
    """Compare manually annotated aspect rankings with RST/PageRank ones.

    For every path in ``settings.BING_LIU_ASPECT_DATASETS_PATHS`` the top
    ``top_n_aspects`` manually assigned aspects are compared with the
    rankings derived from the first entry of ``ASPECTS_GRAPH_PATHS`` whose
    path contains the dataset name.

    Returns a dict keyed by dataset name.  Each value holds the Spearman and
    Kendall coefficients with their p-values, plus Jaccard, recall and
    precision of the manual aspect set against the top RST aspect set.
    Progress information is printed along the way.
    """
    correlations = {}
    for reviews_path in settings.BING_LIU_ASPECT_DATASETS_PATHS:
        dataset_name = basename(reviews_path).split('.')[0]
        print(f'\nDataset to analyze: {dataset_name}')

        # get freq aspects from Bing Liu manually created datasets
        manual_aspects = get_aspect_frequency_ranking(
            reviews_path=reviews_path, top_n=top_n_aspects)
        print(f'TOP{top_n_aspects} Manually extracted aspects: {manual_aspects}')

        # get aspects from RST + PageRank
        graph_path = [
            path for path in ASPECTS_GRAPH_PATHS if dataset_name in path
        ][0]
        rst_aspects = get_aspects_rankings_from_rst(graph_path, manual_aspects)
        rst_top_aspects = get_aspect_ranking_based_on_rst_and_pagerank(
            graph_path, top_n_aspects)
        print(f'Bing Liu aspects: {rst_aspects}')
        print(f'RST aspects: {rst_top_aspects}')

        manual_ranking, rst_ranking = create_rankings(
            manual_aspects, rst_aspects)

        spearman_correlation = stats.spearmanr(manual_ranking, rst_ranking)
        print(f'{dataset_name}, Spearman correlation of ranking: {spearman_correlation}')

        kendalltau_correlation = stats.kendalltau(manual_ranking, rst_ranking)
        print(f'{dataset_name}, KendalTau correlation of ranking: {kendalltau_correlation}')

        aspects_manual = set(manual_aspects)
        aspects_rst = set(rst_top_aspects)
        n_shared = len(aspects_manual.intersection(aspects_rst))

        correlations[dataset_name] = {
            'Spearman Correlation': spearman_correlation[0],
            'Spearman p-value': spearman_correlation[1],
            'Kendall Tau Correlation': kendalltau_correlation[0],
            'Kendall Tau p-value': kendalltau_correlation[1],
            'Jaccard': n_shared / len(aspects_manual.union(aspects_rst)),
            'Recall': n_shared / len(aspects_manual),
            'Precision': n_shared / len(aspects_rst),
        }
    return correlations
def similarity(v1, v2):
    """Return four similarity measures of two vectors.

    Both inputs are converted with ``np.array``.  The result is the tuple
    ``(pearson coefficient, cosine(v1, v2), spearman coefficient,
    kendall tau)``.
    """
    first, second = np.array(v1), np.array(v2)
    return (
        pearsonr(first, second)[0],
        cosine(first, second),
        spearmanr(first, second)[0],
        kendalltau(first, second)[0],
    )
def compute_distance(self, recommendation_list_1, recommendation_list_2):
    """Return the rank statistic selected by ``self.distance`` for two lists.

    ``'kendalltau'`` uses ``stats.kendalltau`` and ``'weighted_kendalltau'``
    uses ``stats.weightedtau``; only the statistic is returned, the p-value
    is discarded.

    Raises
    ------
    ValueError
        If ``self.distance`` is neither of the two supported names
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if self.distance == 'kendalltau':
        distance, _p_value = stats.kendalltau(recommendation_list_1,
                                              recommendation_list_2)
    elif self.distance == 'weighted_kendalltau':
        distance, _p_value = stats.weightedtau(recommendation_list_1,
                                               recommendation_list_2)
    else:
        raise ValueError(
            "Unsupported distance %r; expected 'kendalltau' or "
            "'weighted_kendalltau'" % (self.distance,))
    return distance
def similarity_tf(v1, v2):
    """Return five similarity measures of two vectors.

    The module-level session ``sess`` evaluates ``_pcc``, ``_cos`` and
    ``_SA`` with ``v1``, ``v2`` and ``len(v1)`` fed to ``_x``, ``_y`` and
    ``_len``; the Spearman and Kendall coefficients come from scipy.

    Returns ``(pcc, cos, spearman, kendall, SA)``.
    """
    feeds = {
        _x: v1,
        _y: v2,
        _len: len(v1)
    }
    ret_pcc, ret_cos, ret_SA = sess.run([_pcc, _cos, _SA], feed_dict=feeds)
    rank_results = [measure(v1, v2)[0] for measure in (spearmanr, kendalltau)]
    return ret_pcc, ret_cos, rank_results[0], rank_results[1], ret_SA
def corr(x, y, method="pearsonr"):
    """
    Compute the correlation coefficient of time series x and y.

    method selects the estimator: "pearsonr", "spearmanr" or "kendalltau".
    Any other value makes the function return None.
    """
    if method == "pearsonr":
        estimator = pearsonr
    elif method == "spearmanr":
        estimator = spearmanr
    elif method == "kendalltau":
        estimator = kendalltau
    else:
        return None
    coefficient = estimator(x, y)[0]
    return coefficient
def _correlation_test(self, parents=None, method=0, corr_alpha=0.05): """correlation test of X,Y apply pearson, spearman and kendall's tau-b correlation parameters __________ method -> 0 for pearson correlation 1 for spearman correlation 2 for kendall's tau-b correlation default as spearman correlation cuz non-gaussian distribution of data. corr_alpha -> correlation significant alpha for two-tails test default as 0.05. attribute _________ _corr_parents -> correlation parents. nest dict. """ # inital parents if parents is None: parents, _, _len_node = self._set_default_parents() # initial correlation parents _corr_parents = defaultdict(dict) # loop for all target variable for i in range(self.N): _corr_parents[i] = list() # return X,Y for parents X,Y,_len_node = self._set_parents_matrix(self.data,i,parents) if _len_node == 0: _corr_parents[i] = [] else: # caculate correlation coefficient and p value # for each column in X and Y for j in range(_len_node): # corr and p value for X column and Y for three methods. if method == 0: corr_result = pearsonr(X[:,j],Y) elif method == 1: corr_result = spearmanr(X[:,j],Y) elif method == 2: corr_result = kendalltau(X[:,j],Y) # if significant, append selected link. if corr_result[1] < corr_alpha: _corr_parents[i].append(parents[i][j]) return _corr_parents
def calculate_kendall_correlation(
        score_function,
        aggregation_function,
        path_ground_truth_list: str = PATH_GROUND_TRUTH_LIST,
        path_argument_list: str = PATH_ARGUMENT_LIST):
    """
    Score the arguments, rank them per conclusion and compare each ranking
    with the baseline ranking using Kendall's tau.

    :param score_function: function computing the score values (e.g. a
        Jaccard similarity); passed on to ``calculate_score``
    :param aggregation_function: function reducing the scores of one
        argument to a single value; passed on to
        ``calculate_aggregation_with``
    :param path_ground_truth_list: path handed to ``calculate_score`` and
        ``collect_baseline_ranking``
    :param path_argument_list: path handed to ``calculate_score``
    :return: the mean tau over all conclusions rounded to two decimals, and
        a dictionary with the tau value of every conclusion (NaN taus are
        counted as 0.0)
    """
    # Calculate score values with the given score function
    score_results = calculate_score(
        score_function,
        path_ground_truth_list=path_ground_truth_list,
        path_argument_list=path_argument_list)

    # Aggregate results with the given aggregation function
    aggregated = calculate_aggregation_with(aggregation_function,
                                            score_results)

    # For the random score function every aggregated value is replaced by a
    # draw from a seeded uniform distribution.
    if score_function == calculate_random_score:
        random.seed(114)  # 12 15 17s
        for conclusion_id in aggregated.keys():
            per_argument = aggregated[conclusion_id]
            for argument_id in per_argument.keys():
                per_argument[argument_id] = random.uniform(0, 1)

    score_ranking = generate_ranking_from_aggregation(aggregated)

    # Collect baseline ranking from ground-truth-list.csv
    baseline_by_argument = {}
    for entry in collect_baseline_ranking(path_ground_truth_list):
        baseline_by_argument[entry[0]] = entry[1]

    # Calculate tau values and collect them per conclusion
    all_taus = []
    tau_by_conclusion = {}
    for conclusion_id in score_ranking:
        ranking = score_ranking[conclusion_id]
        argument_ids = list(ranking.keys())
        baseline_list = [baseline_by_argument[arg] for arg in argument_ids]
        scores_list = [ranking[arg] for arg in argument_ids]

        tau, p_value = stats.kendalltau(baseline_list, scores_list)
        if math.isnan(tau):
            tau = 0.0
        all_taus.append(tau)
        tau_by_conclusion[conclusion_id] = tau

    mean_tau = sum(all_taus) / len(all_taus)
    return round(mean_tau, 2), tau_by_conclusion
def get_corelation(X, Y):
    """Return Pearson and Kendall correlation of X and Y with their p-values.

    The result maps ``'pearson'`` and ``'kendall'`` to a dict with the keys
    ``'corr'`` and ``'p-value'``.  X and Y must have the same length and at
    least two elements (checked with assertions).
    """
    assert (len(X) == len(Y)), "X and Y must have same length"
    assert len(X) > 1, "Both X and Y must have at least 2 elements"

    correlation = {}
    for key, measure in (('pearson', ss.pearsonr), ('kendall', ss.kendalltau)):
        coefficient, p_value = measure(X, Y)
        correlation[key] = {'corr': coefficient, 'p-value': p_value}
    return correlation
def valid_epoch(logger, val_loader, model, cfg, funcs=None):
    """Run ``model.predict`` over ``val_loader`` and score it with Kendall's tau.

    Parameters
    ----------
    logger, cfg : accepted for interface compatibility; not used here.
    val_loader : iterable yielding ``(archs, accs)`` pairs.
    model : object with ``eval`` and ``predict``; the prediction must
        support ``.cpu().data.numpy()``.
    funcs : optional list of callables ``func(true_accs, all_scores)``.

    Returns
    -------
    (corr, funcs_res) : the Kendall correlation between the true accuracies
        and the predicted scores, and the list of ``funcs`` results.
    """
    # A ``[]`` default would be shared between calls; normalise ``None``.
    funcs = [] if funcs is None else funcs

    model.eval()
    all_scores = []
    true_accs = []
    for archs, accs in val_loader:
        all_scores += list(model.predict(archs).cpu().data.numpy())
        true_accs += list(accs)

    corr = stats.kendalltau(true_accs, all_scores).correlation
    funcs_res = [func(true_accs, all_scores) for func in funcs]
    return corr, funcs_res
def custom_scatter(x, y, ax):
    """Annotate ``ax`` with the Pearson and Kendall statistics of x and y.

    Both coefficients are rounded to two decimals and shown together with
    their p-values as formatted by ``pformatting``.  If ``kendalltau``
    raises ``OverflowError`` the placeholder value -10 is used for both the
    coefficient and the p-value.
    """
    rho, rho_p = pearsonr(x, y)
    try:
        tau, tau_p = kendalltau(x, y)
    except OverflowError:
        tau, tau_p = -10, -10

    rho_p = pformatting(rho_p)
    tau_p = pformatting(tau_p)

    rho_line = r'$\rho$' + ': p={} ({})'.format(np.round(rho, 2), rho_p)
    tau_line = r'$\tau$' + ': p={} ({})'.format(np.round(tau, 2), tau_p)
    ax.annotate(rho_line + '\n' + tau_line,
                xy=(0.05, 0.8),
                fontsize=11,
                xycoords='axes fraction')
def compute_means(raw_df, quantity_df, quantity_label, axis):
    """Aggregate ``raw_df`` in four ways and correlate each with a quantity.

    Parameters
    ----------
    raw_df : DataFrame of raw values. NOTE: the helper columns ``v``, ``w``,
        ``r`` and ``b`` are added to the passed frame in place.
    quantity_df : DataFrame with a ``"Quantity"`` column.
    quantity_label : column name under which the quantity is stored.
    axis : axis passed to the arithmetic mean, geometric mean and median.
        The Bayesian mean is always computed per row (axis=1).

    Returns
    -------
    (corr_df, mean_df) : ``mean_df`` holds the quantity plus the columns
        "Arithmetic Mean", "Geometric Mean", "Median" and "Bayesian Mean"
        (missing values replaced by 0).  ``corr_df`` is a one-row frame with
        the Pearson ("Rho") and Kendall ("Tau") coefficient of every
        aggregate against the quantity; NaN coefficients are reported as 0.
    """
    mean_df = pd.DataFrame(columns=[quantity_label])
    mean_df[quantity_label] = quantity_df["Quantity"]
    mean_df["Arithmetic Mean"] = raw_df.mean(axis=axis)
    mean_df["Geometric Mean"] = raw_df.apply(geo_mean, axis=axis)
    # The column is labelled "Median", so compute the median (this used to
    # be a second copy of the arithmetic mean).
    mean_df["Median"] = raw_df.median(axis=axis)

    # Bayesian mean b = w * r + (1 - w) * c, where v is the number of
    # non-missing values in a row, w = v / (v + mean(v)), r the row mean and
    # c the mean over all raw values.
    cols = list(raw_df.columns.values)
    raw_df["v"] = raw_df[cols].count(axis=1)
    m = np.mean(raw_df['v'])
    raw_df['w'] = raw_df['v'] / (raw_df['v'] + m)
    raw_df['r'] = np.mean(raw_df[cols], axis=1)
    c = np.mean(raw_df[cols].values.flatten())
    raw_df['b'] = raw_df['w'] * raw_df['r'] + (1 - raw_df['w']) * c
    raw_df = raw_df.drop(['v', 'w', 'r'], axis=1)
    mean_df["Bayesian Mean"] = raw_df["b"]
    mean_df = mean_df.fillna(0)

    corr_columns = [
        "Rho (Arithmetic Mean)",
        "Rho (Geometric Mean)",
        "Rho (Bayesian Mean)",
        "Rho (Median)",
        "Tau (Arithmetic Mean)",
        "Tau (Geometric Mean)",
        "Tau (Bayesian Mean)",
        "Tau (Median)",
    ]
    corr_row = {}
    quantity_column = mean_df[quantity_label]
    for col in mean_df.drop(quantity_label, axis=1).columns:
        mean_column = mean_df[col]
        pearson_corr_val = pearsonr(quantity_column, mean_column)[0]
        if math.isnan(pearson_corr_val):
            pearson_corr_val = 0
        kendall_corr_val = kendalltau(quantity_column, mean_column)[0]
        if math.isnan(kendall_corr_val):
            kendall_corr_val = 0
        corr_row[f'Rho ({col})'] = pearson_corr_val
        corr_row[f'Tau ({col})'] = kendall_corr_val

    # DataFrame.append no longer exists in pandas 2.x; build the single-row
    # frame directly with the same column order.
    corr_df = pd.DataFrame([corr_row], columns=corr_columns)
    return corr_df, mean_df
def test_xk(true_scores, predict_scores): true_inds = np.argsort(true_scores)[::-1] true_scores = np.array(true_scores) reorder_true_scores = true_scores[true_inds] predict_scores = np.array(predict_scores) reorder_predict_scores = predict_scores[true_inds] ranks = np.argsort(reorder_predict_scores)[::-1] num_archs = len(ranks) patks = [] for ratio in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]: k = int(num_archs * ratio) p = len(np.where(ranks[:k] < k)[0]) / float(k) arch_inds = ranks[:k][ranks[:k] < k] patks.append( (k, ratio, len(arch_inds), p, stats.kendalltau(reorder_true_scores[arch_inds], reorder_predict_scores[arch_inds]).correlation)) return patks
def pairwise_valid(val_loader, model, seed=None, funcs=None):
    """Validate a comparison-based model through the ordering it produces.

    All architectures of ``val_loader`` are collected and ordered with
    ``model.argsort_list``; the position of each architecture in that
    ordering serves as its pseudo score, which is compared with the true
    accuracies (last column of each ``accs`` batch) using Kendall's tau.

    Parameters
    ----------
    val_loader : iterable yielding ``(archs, accs)`` with 2-D ``accs``.
    model : object with ``eval`` and ``argsort_list(archs, batch_size=...)``.
    seed : optional int used to seed ``random`` and ``np.random``.
    funcs : optional list of callables ``func(true_accs, pseudo_scores)``.
        The body previously referred to ``funcs`` and ``all_scores`` without
        defining either, so every call ended in a NameError.

    Returns
    -------
    (corr, funcs_res) : the Kendall correlation and the list of ``funcs``
        results.
    """
    funcs = [] if funcs is None else funcs

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    model.eval()
    true_accs = []
    all_archs = []
    for archs, accs in val_loader:
        all_archs += list(archs)
        true_accs += list(accs[:, -1])

    num_valid = len(true_accs)
    pseudo_scores = np.zeros(num_valid)
    indexes = model.argsort_list(all_archs, batch_size=512)
    # the architecture at position p of the ordering gets pseudo score p
    pseudo_scores[indexes] = np.arange(num_valid)

    corr = stats.kendalltau(true_accs, pseudo_scores).correlation
    funcs_res = [func(true_accs, pseudo_scores) for func in funcs]
    return corr, funcs_res
def _ktau_union(orig_run, rep_run, trim_thresh=TRIM_THRESH, pbar=False):
    """
    Generator yielding Kendall's tau Union (KTU) for every topic of rep_run.

    For each topic the first trim_thresh document ids of both runs are
    mapped to their index in the sorted union of the two lists, and the two
    index lists are compared with Kendall's tau.

    @param orig_run: The original run (topic -> mapping keyed by document).
    @param rep_run: The reproduced/replicated run, same layout.
    @param trim_thresh: Number of leading documents compared per topic.
    @param pbar: Boolean value indicating if a progress bar should be shown.
    @return: Generator of (topic, KTU value) pairs.
    """
    topics = rep_run.items()
    if pbar:
        topics = tqdm(topics)

    for topic, _ in topics:
        orig_docs = list(orig_run.get(topic).keys())[:trim_thresh]
        rep_docs = list(rep_run.get(topic).keys())[:trim_thresh]

        # index of every document within the sorted union of both lists
        position = {
            doc: idx
            for idx, doc in enumerate(sorted(set(orig_docs + rep_docs)))
        }
        orig_idx = [position[doc] for doc in orig_docs]
        rep_idx = [position[doc] for doc in rep_docs]
        yield topic, kendalltau(orig_idx, rep_idx).correlation
def corr(x, y, method="pearsonr"):
    """
    Compute the correlation of time series x and y

    Parameters
    ----------
    x: time series
    y: time series
    method: which correlation is computed: "pearsonr", "spearmanr" or
        "kendalltau"

    Returns
    -------
    corr: correlation between x and y

    Raises
    ------
    ValueError: if method is not one of the supported names (this used to
        surface as an UnboundLocalError).
    """
    if method == "pearsonr":
        corr = pearsonr(x, y)[0]
    elif method == "spearmanr":
        corr = spearmanr(x, y)[0]
    elif method == "kendalltau":
        corr = kendalltau(x, y)[0]
    else:
        raise ValueError(
            ('The method "%s" is not supported. Please specify one of '
             'the following options: "pearsonr", "spearmanr" or "kendalltau"')
            % method)
    return corr
def test_xk(true_scores, predict_scores): true_inds = np.argsort(true_scores)[::-1] true_scores = np.array(true_scores) reorder_true_scores = true_scores[true_inds] predict_scores = np.array(predict_scores) reorder_predict_scores = predict_scores[true_inds] ranks = np.argsort(reorder_predict_scores)[::-1] num_archs = len(ranks) patks = [] for ratio in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]: k = int(num_archs * ratio) if k < 1: continue p = len(np.where(ranks[:k] < k)[0]) / float(k) arch_inds = ranks[:k][ranks[:k] < k] # [#samples, #samples/#total_samples, models in top-K, P@K%, Kendall-Tau] patks.append((k, ratio, len(arch_inds), p, stats.kendalltau( reorder_true_scores[arch_inds], reorder_predict_scores[arch_inds]).correlation)) return patks
values = line.split() map = float(values[2]) predictedqrelMap.append(map) retval = p.wait() predictionMapResult = predicted_location_base + str(percentage) + '_bpref.txt' tmp = "" for val in predictedqrelMap: tmp = tmp + str(val) + "," text_file = open(predictionMapResult, "w") text_file.write(tmp) text_file.close() #exit(0) tau, p_value = kendalltau(originalqrelMap, predictedqrelMap) predictedqrelMap = [] # cleaning it for next trains_percenatge list.append(tau) protocol_result[protocol] = list #print len(training_variation) plt.subplot(subplot_loc[var]) '''plt.plot(x_labels_set, protocol_result['SAL'], '-r', label='SAL', linewidth=2.0) #print protocol_result['SAL'] plt.plot(x_labels_set, protocol_result['CAL'], '-b', label='CAL', linewidth=2.0) plt.plot(x_labels_set, protocol_result['SPL'], '-g', label='SPL', linewidth=2.0) ''' plt.plot(x_labels_set, protocol_result['SAL'], '-r', marker='o', label='SAL', linewidth=1.0) plt.plot(x_labels_set, protocol_result['CAL'], '-b', marker='^', label='CAL', linewidth=1.0) plt.plot(x_labels_set, protocol_result['SPL'], '-g', marker='s', label='SPL', linewidth=1.0)
def test_nasbench(nasbench_search_space):
    """Smoke-test the NAS-Bench-101 controllers, evaluator and comparator.

    Samples rollouts from a plain and a compare controller, evaluates them,
    builds two ``PointwiseComparator`` instances with the "nb101-gcn"
    embedder, trains one with ``update_predict`` and the other with
    ``update_compare_rollouts`` for five steps each, and prints the Kendall
    correlation between the rollout rewards and the predicted scores before
    and after training.  Nothing is asserted; the test passes when no call
    raises.  The controllers are created with ``device="cuda"``.
    """
    import numpy as np
    from scipy.stats import stats
    from aw_nas.btcs import nasbench_101
    from aw_nas.evaluator.arch_network import PointwiseComparator
    from aw_nas.rollout.compare import CompareRollout

    ss = nasbench_search_space

    # construct controller
    controller = nasbench_101.NasBench101Controller(ss, device="cuda")
    compare_controller = nasbench_101.NasBench101CompareController(
        ss, device="cuda", rollout_type="compare")

    # construct evaluator
    evaluator = nasbench_101.NasBench101Evaluator(None, None, None)

    # test random sample
    _ = ss.random_sample()

    # test controller.sample
    rollouts = controller.sample(n=20)
    # test genotype
    print(rollouts[0].genotype)

    # test evaluator.evaluate_rollout
    rollouts = evaluator.evaluate_rollouts(rollouts, False)
    print(rollouts)

    # switch the same evaluator to compare rollouts
    evaluator.rollout_type = "compare"
    c_rollouts = compare_controller.sample(n=4)
    print(c_rollouts[0].genotype)

    # test evaluator.evaluate_rollout for compare rollouts
    c_rollouts = evaluator.evaluate_rollouts(c_rollouts, False)
    print(c_rollouts)

    # test nb101-gcn embedder
    comparator = PointwiseComparator(ss,
                                     arch_embedder_type="nb101-gcn",
                                     arch_embedder_cfg={"hid_dim": 96})
    comparator_2 = PointwiseComparator(ss,
                                       arch_embedder_type="nb101-gcn",
                                       arch_embedder_cfg={"hid_dim": 96})
    pred_scores = comparator.predict([r.arch for r in rollouts])
    pred_scores_2 = comparator_2.predict([r.arch for r in rollouts])
    label_scores = [r.perf["reward"] for r in rollouts]
    # correlation of the untrained predictions with the rollout rewards
    corr_init_1 = stats.kendalltau(label_scores,
                                   pred_scores.cpu().data.numpy()).correlation
    corr_init_2 = stats.kendalltau(
        label_scores,
        pred_scores_2.cpu().data.numpy()).correlation

    # compare_scores = comparator.compare([r.rollout_1.arch for r in c_rollouts],
    #                                     [r.rollout_2.arch for r in c_rollouts])
    # try training for several epochs using update_predict
    # NOTE: the regression targets are random numbers, not label_scores
    true_scores = np.random.rand(len(rollouts))
    for i_step in range(5):
        loss = comparator.update_predict([r.arch for r in rollouts],
                                         true_scores)
        print("update predict {}: {:.4f}".format(i_step, loss))

    # try training for several epochs using update_compare
    # construct compare rollouts between every pair in rollouts
    c_rollouts_2 = [
        CompareRollout(rollout_1=rollouts[i], rollout_2=rollouts[j])
        for i in range(len(rollouts)) for j in range(i)
    ]
    # one flag per pair, in the same (i, j) order as c_rollouts_2
    better_lst = [
        label_scores[j] > label_scores[i] for i in range(len(rollouts))
        for j in range(i)
    ]
    for i_step in range(5):
        loss = comparator_2.update_compare_rollouts(c_rollouts_2, better_lst)
        print("update compare {}: {:.4f}".format(i_step, loss))

    # test after training
    pred_scores_after = comparator.predict([r.arch for r in rollouts])
    pred_scores_2_after = comparator_2.predict([r.arch for r in rollouts])
    corr_after_1 = stats.kendalltau(
        label_scores,
        pred_scores_after.cpu().data.numpy()).correlation
    corr_after_2 = stats.kendalltau(
        label_scores,
        pred_scores_2_after.cpu().data.numpy()).correlation
    print("True accs: ", label_scores)
    print(
        "PREDICT: before training: {} (corr {:.3f}); after training: {} (corr {:.3f})"
        .format(pred_scores, corr_init_1, pred_scores_after, corr_after_1))
    print(
        "COMPARE: before training: {} (corr {:.3f}); after training: {} (corr {:.3f})"
        .format(pred_scores_2, corr_init_2, pred_scores_2_after,
                corr_after_2))
for row in results: counts.append(float(row[1])) page_rank_values.append(pagerank[wikipedia.vertex(int(row[0]))]) #for index, row in df.iterrows(): # counts.append(float(row['counts'])) # page_rank_values.append(pagerank[wikipedia.vertex(int(row['target_article_id']))]) print 'pearson' p = pearsonr(page_rank_values, counts) print p correlations['pearson']=p print 'spearmanr' s= spearmanr(page_rank_values, counts) print s correlations['spearmanr']=s print 'kendalltau' k= kendalltau(page_rank_values, counts) print k correlations['kendalltau']=k correlations_sem_sim_weighted_pagerank[key]=correlations cor[kk]=correlations_sem_sim_weighted_pagerank write_pickle(HOME+'output/correlations/correlations_pagerank_without_zeros'+network_name+'.obj', cor) def map_to_hyp_indicies(vocab, l): ids = list() for v in l.values: ids.append(vocab[str(v)]) return ids
def _fetch_internal_link_counts(clickstream_data):
    """Fetch per-article click counts for internal links from MySQL.

    Parameters
    ----------
    clickstream_data : str
        When non-empty the ``clickstream_derived`` table is queried,
        otherwise ``clickstream_derived_en_201501``.

    Returns
    -------
    The rows returned by ``cursor.fetchall()``; each row holds
    ``(curr_id, summed counts)``.

    Raises
    ------
    MySQLdb.Error
        Re-raised after the error has been printed.  Swallowing it would leave
        the caller without any rows to correlate.
    """
    if clickstream_data != '':
        query = 'select c.curr_id, sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s group by c.curr_id;'
    else:
        query = 'select c.curr_id, sum(c.counts) as counts from clickstream_derived_en_201501 c where c.link_type_derived= %s group by c.curr_id;'

    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(query, ("internal-link",))
            return cursor.fetchall()
        except MySQLdb.Error as e:
            print('error retrieving xy coord for all links links %s (%d)' % (e.args[1], e.args[0]))
            raise
        finally:
            cursor.close()
    finally:
        # Release the connection whether or not the query succeeded.
        conn.close()


def _weighted_pagerank_correlations(wikipedia, labels, struct, article_ids, counts):
    """Correlate weighted-PageRank vertex properties with click counts.

    For every label and every damping factor in ``[0.8, 0.85, 0.9]`` the
    vertex property ``<label>_page_rank_weighted_<damping>`` is read from
    ``wikipedia`` for the vertices in ``article_ids`` and correlated with
    ``counts`` (Pearson, Spearman, Kendall).  Every test name and result is
    printed as it is computed.

    Parameters
    ----------
    wikipedia : graph object exposing ``vertex_properties`` and ``vertex()``
    labels : iterable of str
    struct : bool
        When true the first seven characters of each label are dropped before
        the property key is built (presumably a fixed prefix - confirm with
        callers).
    article_ids : list of int
        Vertex indices, parallel to ``counts``.
    counts : list of float

    Returns
    -------
    dict mapping each property key to a dict with the keys ``'pearson'``,
    ``'spearmanr'`` and ``'kendalltau'``.
    """
    tests = (('pearson', pearsonr), ('spearmanr', spearmanr), ('kendalltau', kendalltau))
    correlations_weighted_pagerank = {}
    for label in labels:
        if struct:
            label = label[7:]
        for damping in [0.8, 0.85, 0.9]:
            key = label + "_page_rank_weighted_" + str(damping)
            pagerank = wikipedia.vertex_properties[key]
            page_rank_values = [pagerank[wikipedia.vertex(article_id)]
                                for article_id in article_ids]
            correlations_values = {}
            for test_name, test in tests:
                print(test_name)
                outcome = test(page_rank_values, counts)
                print(outcome)
                correlations_values[test_name] = outcome
            correlations_weighted_pagerank[key] = correlations_values
    return correlations_weighted_pagerank


def correlations_zeros(labels, consider_zeros=True, clickstream_data='', struct=False):
    """Correlate weighted PageRank with article click counts and pickle the result.

    Loads the graph
    ``output/weightedpagerank/wikipedianetwork_hyp_engineering_<labels joined by '_'>.xml.gz``,
    gathers one click count per article, correlates the counts with every
    weighted-PageRank vertex property and writes the resulting dictionary
    with ``write_pickle``.

    Parameters
    ----------
    labels : sequence of str
        Labels used for the graph file name and the vertex property keys.
    consider_zeros : bool, default True
        When true the counts are read from
        ``TMP + clickstream_data + 'article_counts.tsv'`` (columns
        ``target_article_id`` and ``counts``) and the output file is
        ``...correlations_pagerank_<name>.obj``.  Otherwise the counts are
        aggregated from the clickstream table in MySQL and the output file is
        ``...correlations_pagerank_without_zeros<name>.obj``.
    clickstream_data : str, default ''
        Prefix for the input and output file names; also selects the MySQL
        table when ``consider_zeros`` is false.
    struct : bool, default False
        When true the first seven characters of every label are dropped
        before the property key is built.

    Raises
    ------
    MySQLdb.Error
        When the count query fails (``consider_zeros`` false).
    """
    print(struct)
    name = '_'.join(labels)
    # load network
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_" + name + ".xml.gz")

    if consider_zeros:
        # read counts with zeros
        counts_path = TMP + clickstream_data + 'article_counts.tsv'
        article_counts = pd.read_csv(counts_path, sep='\t')
        print(counts_path)
        # The ids and counts do not depend on label or damping, so they are
        # converted once instead of once per vertex property.
        article_ids = [int(value) for value in article_counts['target_article_id']]
        counts = [float(value) for value in article_counts['counts']]
        output_path = HOME + 'output/correlations/' + clickstream_data + 'correlations_pagerank_' + name + '.obj'
    else:
        results = _fetch_internal_link_counts(clickstream_data)
        print('after sql load')
        article_ids = [int(row[0]) for row in results]
        counts = [float(row[1]) for row in results]
        output_path = HOME + 'output/correlations/' + clickstream_data + 'correlations_pagerank_without_zeros' + name + '.obj'

    correlations_weighted_pagerank = _weighted_pagerank_correlations(
        wikipedia, labels, struct, article_ids, counts)
    write_pickle(output_path, correlations_weighted_pagerank)
def _values_for_shared_keys(reference, *others):
    """Collect parallel value lists for keys ``reference`` shares with all ``others``.

    Keys are visited in ``reference`` iteration order.  The first returned
    list holds the ``reference`` values, the following lists hold the
    matching values of each mapping in ``others`` (same order as given).
    """
    columns = [[] for _ in range(len(others) + 1)]
    for key in reference:
        if all(key in other for other in others):
            columns[0].append(reference[key])
            for column, other in zip(columns[1:], others):
                column.append(other[key])
    return columns


def _print_stat(label, value):
    """Print one report line: eight spaces, ``label``, then ``str(value)``."""
    print(" " * 8 + label + str(value))


def _rank_correlations(first, second):
    """Return (Spearman rho, Spearman p, Kendall tau, Kendall p) for paired samples."""
    rho, rho_pvalue = spearmanr(first, second)
    tau, tau_pvalue = kendalltau(first, second)
    return rho, rho_pvalue, tau, tau_pvalue


def _print_rank_correlations(stats):
    """Print the four values produced by ``_rank_correlations``."""
    rho, rho_pvalue, tau, tau_pvalue = stats
    _print_stat("Spearman rho correlation coefficient = ", rho)
    _print_stat("Spearman p-value = ", rho_pvalue)
    _print_stat("Kendall Tau = ", tau)
    _print_stat("Kendall p-value = ", tau_pvalue)


def _print_ks(ks_result):
    """Print the statistic and p-value of a ``kstest`` result."""
    statistic, pvalue = ks_result
    _print_stat("KS Test D = ", statistic)
    _print_stat("KS p-value = ", pvalue)


def _print_ols_summary(y, X, xname):
    """Fit ``sm.OLS(y, X)`` and print its summary with ``issues`` as response name."""
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues", xname=xname))


def main():
    """Report how issue scores relate to API-call scores and class sizes.

    Reads ``finaldata.json`` from the current directory.  Every entry maps an
    application name to three mappings, unpacked as
    ``(cgscore, issuescore, classSize)``.

    For each application with more than three keys shared by ``issuescore``
    and ``cgscore`` the function prints Spearman/Kendall correlations,
    Kolmogorov-Smirnov tests against the normal distribution and an OLS
    summary; other applications are reported as ``FAILURE``.  Afterwards the
    same correlations, two one-way ANOVAs and three OLS models are printed for
    the data pooled over all applications.

    Raises
    ------
    SystemExit
        After printing ``Run analysis`` when ``finaldata.json`` cannot be
        read or parsed.
    """
    finaldatafile = "finaldata.json"
    try:
        with open(finaldatafile) as data_file:
            finalData = json.load(data_file)
    except (IOError, OSError, ValueError):
        # Missing/unreadable file or malformed JSON.  Other exceptions
        # (e.g. KeyboardInterrupt) are deliberately not intercepted.
        print("Run analysis")
        raise SystemExit

    # ---- per-application report -------------------------------------
    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]

        issue_vs_calls, calls = _values_for_shared_keys(issuescore, cgscore)
        issue_vs_size, sizes = _values_for_shared_keys(issuescore, classSize)
        model_issues, model_calls, model_sizes = _values_for_shared_keys(
            issuescore, cgscore, classSize)

        # More than three (issue, API-call) pairs are required for a report.
        if len(issue_vs_calls) <= 3:
            print("FAILURE : " + appliName)
            continue

        call_stats = _rank_correlations(issue_vs_calls, calls)
        issue_ks = kstest(list(issuescore.values()), "norm")
        call_ks = kstest(list(cgscore.values()), "norm")
        size_stats = _rank_correlations(issue_vs_size, sizes)
        size_ks = kstest(list(classSize.values()), "norm")

        print(appliName)
        print("--- API Call <> Issue")
        _print_rank_correlations(call_stats)
        _print_ks(issue_ks)
        _print_ks(call_ks)
        _print_stat("dataset size =", len(issue_vs_calls))
        print("--- Class Size <> Issue")
        _print_rank_correlations(size_stats)
        _print_ks(issue_ks)
        _print_ks(size_ks)

        # NOTE(review): this model is fitted without an intercept, unlike the
        # global models below - confirm that this is intended.
        X = [list(row) for row in np.array([model_calls, model_sizes]).transpose()]
        _print_ols_summary(model_issues, X, ("APIcalls", "ClassSize"))

    print("|" * 80)
    print("-" * 80)
    print("-" * 80)
    print("|" * 80)

    # ---- pooled data over all applications --------------------------
    issueForGlobalModel = []
    callGraphForGlobalModel = []
    classSizeForGlobalModel = []
    issueGlobalCallgraphValueForStats = []
    callGlobalGraphValueForStats = []
    NOissueGlobalCallgraphValueForStats = []
    issueGlobalSizeValueForStats = []
    classGlobalSizeValueForStats = []
    anova1issue = []
    anova2issue = []

    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]

        model_issues, model_calls, model_sizes = _values_for_shared_keys(
            issuescore, cgscore, classSize)
        issueForGlobalModel.extend(model_issues)
        callGraphForGlobalModel.extend(model_calls)
        # The class-size regressor must hold class sizes; it was previously
        # filled with the issue scores, i.e. with the response itself.
        classSizeForGlobalModel.extend(model_sizes)

        issue_vs_calls, calls = _values_for_shared_keys(issuescore, cgscore)
        issueGlobalCallgraphValueForStats.extend(issue_vs_calls)
        callGlobalGraphValueForStats.extend(calls)
        # Issue scores of keys that have no API-call score.
        NOissueGlobalCallgraphValueForStats.extend(
            issuescore[key] for key in issuescore if key not in cgscore)

        # API-call scores split by whether the key also has an issue score.
        anova1issue.extend(cgscore[key] for key in cgscore if key in issuescore)
        anova2issue.extend(cgscore[key] for key in cgscore if key not in issuescore)

        issue_vs_size, sizes = _values_for_shared_keys(issuescore, classSize)
        issueGlobalSizeValueForStats.extend(issue_vs_size)
        classGlobalSizeValueForStats.extend(sizes)

    global_call_stats = _rank_correlations(
        issueGlobalCallgraphValueForStats, callGlobalGraphValueForStats)
    global_size_stats = _rank_correlations(
        issueGlobalSizeValueForStats, classGlobalSizeValueForStats)
    fvalueanova1, pvalueanova1 = f_oneway(
        issueGlobalCallgraphValueForStats, NOissueGlobalCallgraphValueForStats)
    fvalueanova2, pvalueanova2 = f_oneway(anova1issue, anova2issue)

    print(len(NOissueGlobalCallgraphValueForStats))
    print("--- Correlation : API Call <> Issue")
    _print_rank_correlations(global_call_stats)
    _print_stat("ANOVA F-value = ", fvalueanova1)
    _print_stat("ANOVA p-value = ", pvalueanova1)
    _print_stat("ANOVA F-value = ", fvalueanova2)
    _print_stat("ANOVA p-value = ", pvalueanova2)
    print("--- Correlation : Class Size <> Issue")
    _print_rank_correlations(global_size_stats)
    print("_" * 80)
    print("_" * 80)

    # ---- pooled OLS models (constant appended as last column) -------
    print("-- GLOBAL OLS --")
    y = issueForGlobalModel
    X = [list(row) for row in
         np.array([callGraphForGlobalModel, classSizeForGlobalModel]).transpose()]
    X = sm.add_constant(X, prepend=False)
    _print_ols_summary(y, X, ("APIcalls", "ClassSize", "const"))

    print("API CALLS only")
    X = sm.add_constant(callGraphForGlobalModel, prepend=False)
    _print_ols_summary(y, X, ["APIcalls", "const"])

    print("Size only")
    X = sm.add_constant(classSizeForGlobalModel, prepend=False)
    _print_ols_summary(y, X, ["ClassSize", "const"])