def compute_pearson_and_spearman_r(A, B, n_pool, n_test):
    """Correlate matching entries of two (n_pool + n_test) square matrices.

    Two comparisons are made between A and B:
      * the first n_pool diagonal entries (pool self-terms), and
      * the flattened pool-rows x test-columns off-diagonal sub-block.

    Returns an 8-tuple:
    (pearson_diag, pearson_offdiag, spearman_diag, spearman_offdiag,
     pearson_diag_p, pearson_offdiag_p, spearman_diag_p, spearman_offdiag_p)
    """
    assert A.shape[0] == n_pool + n_test

    # pool self-terms sit on the leading diagonal
    diag_a = np.diag(A)[:n_pool].tolist()
    diag_b = np.diag(B)[:n_pool].tolist()

    # pool-vs-test cross terms, flattened into plain lists
    off_a = A[:n_pool][:, n_pool:].reshape(-1).tolist()
    off_b = B[:n_pool][:, n_pool:].reshape(-1).tolist()

    pearson_diag, pearson_diag_p = pr(diag_a, diag_b)
    pearson_off, pearson_off_p = pr(off_a, off_b)
    spearman_diag, spearman_diag_p = spr(diag_a, diag_b)
    spearman_off, spearman_off_p = spr(off_a, off_b)

    # correlations first, then the matching p-values
    return (pearson_diag, pearson_off, spearman_diag, spearman_off,
            pearson_diag_p, pearson_off_p, spearman_diag_p, spearman_off_p)
def correlation(df):
    """Summarise sunspot/flare correlations as Spanish sentences.

    Computes Pearson, Spearman and Kendall correlations for three column
    pairs of *df* and returns the nine formatted messages in the order
    [Pearson x3, Spearman x3, Kendall x3].
    """
    # the three column pairs that are compared throughout
    spots, flares = df['Spots Number'], df['Flares Number']
    spots_mean, flares_mean = df['Spots Month Mean'], df['F Month Mean']
    flares_max = df['F Month Maximum']

    # Pearson (via the 2x2 correlation matrix, off-diagonal entry)
    cp1 = ('La correlación de Pearson entre el número de manchas y el número '
           'de llamaradas es {:.2f}').format(np.corrcoef(spots, flares)[0][1])
    cp2 = ('La correlación de Pearson entre la media mensual de manchas y de '
           'llamaradas es {:.2f}').format(np.corrcoef(spots_mean, flares_mean)[0][1])
    cp3 = ('La correlación de Pearson entre la media mensual de manchas y el '
           'máximo mensual de llamaradas es {:.2f}').format(
               np.corrcoef(spots_mean, flares_max)[0][1])

    # Spearman (rank correlation; element 0 is the statistic)
    cs1 = ('La correlación de Spearman entre el número de manchas y el número '
           'de llamaradas es {:.2f}').format(spr(spots, flares)[0])
    cs2 = ('La correlación de Spearman entre la media mensual de manchas y de '
           'llamaradas es {:.2f}').format(spr(spots_mean, flares_mean)[0])
    cs3 = ('La correlación de Spearman entre la media mensual de manchas y el '
           'máximo mensual de llamaradas es {:.2f}').format(spr(spots_mean, flares_max)[0])

    # Kendall tau (element 0 is the statistic)
    ck1 = ('La correlación de Kendall entre el numero de manchas y el número '
           'de llamaradas es {:.2f}').format(kdl(spots, flares)[0])
    ck2 = ('La correlación de Kendall entre la media mensual de manchas y de '
           'llamaradas es {:.2f}').format(kdl(spots_mean, flares_mean)[0])
    ck3 = ('La correlación de Kendall entre la media mensual de manchas y el '
           'máximo mensual de llamaradas es {:.2f}').format(kdl(spots_mean, flares_max)[0])

    return [cp1, cp2, cp3, cs1, cs2, cs3, ck1, ck2, ck3]
def evaluate_spearman_corr(gt, pred, ref='TotalScore'):
    """Spearman’s ρ rank correlation statistic between features and gt aesthetic score.
    First we get common elements to compare and order them by image file name.
    ref: either 'aesthetic score' if its mlsp method or 'TotalScore' if it's AADB dataset
    """
    # keep only images present in both frames, aligned by file name
    shared = list(set(gt.ImageFile) & set(pred.ImageFile))
    gt = gt[gt['ImageFile'].isin(shared)].sort_values(by=['ImageFile'])
    pred = pred[pred['ImageFile'].isin(shared)].sort_values(by=['ImageFile'])

    # the reference score column is the same for every feature
    reference_scores = gt.loc[:, ref]
    for feature in pred.columns[1:]:
        rho, pval = spr(reference_scores, pred.loc[:, feature])
        print("{}: rho: {} at p value: {}".format(feature, rho, pval))
def plot_corr(gt, pred, name, ref='TotalScore'):
    """Plot datapoints and regression to see correlation between features and gt aesthetic score.
    First we get common elements to compare and order them by image file name.
    ref: either 'aesthetic score' if its mlsp method or 'TotalScore' if it's AADB dataset
    """
    # restrict both frames to the shared images, ordered by file name
    shared = list(set(gt.ImageFile) & set(pred.ImageFile))
    gt = gt[gt['ImageFile'].isin(shared)].sort_values(by=['ImageFile'])
    pred = pred[pred['ImageFile'].isin(shared)].sort_values(by=['ImageFile'])

    for feature in pred.columns[1:]:
        reference_scores = gt.loc[:, ref]
        feature_scores = pred.loc[:, feature]

        # scatter + regression plot for this feature vs the gt score
        plot_df = pd.DataFrame({"AestheticScore": reference_scores,
                                feature: feature_scores})
        joint = sns.jointplot(x="AestheticScore", y=feature,
                              data=plot_df, kind="reg")

        # rho goes into the output file name so plots sort by correlation
        rho, pval = spr(reference_scores, feature_scores)
        joint.savefig("./eval/" + name + "_" + feature + "_"
                      + "{:.4f}".format(rho) + ".png")
        plt.close()
# try: df_dwd2 = transform_to_bools( df_dwd2, percentile_level) df_rea2 = transform_to_bools( df_rea2, percentile_level) # except Exception as msg: # print(msg) cmn_vals1 = df_dwd1.loc[cmn_idx].values.ravel() cmn_vals2 = df_dwd2.loc[cmn_idx].values.ravel() cmn_rea1 = df_rea1.loc[cmn_idx].values.ravel() cmn_rea2 = df_rea2.loc[cmn_idx].values.ravel() # np.nansum(df_dwd1) # df_dwd1.max() try: spr_corr = spr(cmn_vals1, cmn_vals2)[0] prs_corr = prs(cmn_vals1, cmn_vals2)[0] sep_dist = distance_sorted[ix2] spr_corr_rea = spr(cmn_rea1, cmn_rea2)[0] prs_corr_rea = prs(cmn_rea1, cmn_rea2)[0] # sep_dist_rea = distance_sorted[ix2] except Exception as msg: print(msg) if np.isnan(spr_corr): print('corr_is_nan') df_distance_corr.loc[stn_id, 'sep_dist_%s' % _id2] = sep_dist df_distance_corr.loc[stn_id, 'pears_corr_%s' % _id2] = spr_corr
import pandas as pd
from scipy.stats import spearmanr as spr
from sys import argv

# example Usage
# python getCorrelation.py imgListTestNewRegression_
# Fixed: legacy Python 2 `print` statement converted to the print()
# function; dropped the no-op `prefix = str(prefix)` (argv items are str).
script, prefix = argv

# ground truth and predictions share the naming scheme <prefix>.csv /
# <prefix>predict.csv
groundTruth = pd.read_csv(prefix + '.csv')
predictAttr = pd.read_csv(prefix + 'predict.csv')

# both files must describe the same images with the same attribute columns
assert (groundTruth.columns == predictAttr.columns).all()
assert groundTruth.shape == predictAttr.shape
assert pd.Series.equals(groundTruth.ImageFile, predictAttr.ImageFile)

# column 0 is ImageFile; correlate every attribute column
for col in groundTruth.columns[1:]:
    attrGT = groundTruth.loc[:, col]
    attrP = predictAttr.loc[:, col]
    rho, pval = spr(attrGT, attrP)
    print("For {} rho: {} at p value: {}".format(col, rho, pval))
# Fixed: legacy Python 2 `print` statements converted to print(); added the
# missing `import os` / `import sys` (os.system and sys.argv were used
# without a visible import); replaced py2 map/lambda chains with
# comprehensions; removed the redundant fp.close() inside the `with` block.
import os
import sys

from scipy.stats import spearmanr as spr
import numpy as np

# SCIsdatabase = '~/IQA_CNN/SCIs/DistortedImages'

# Grid-search the SVR gamma parameter; for each setting record the Spearman
# correlation between predicted quality scores and the MOS ground truth.
for x in range(10, 100):
    gamma = float(x) / 1000
    os.system('libsvm/svm-scale -l -1 -u 1 -s allrange live_train.txt > train_scale')
    os.system('libsvm/svm-train -s 3 -g {0} -c 2048 -b 1 -q train_scale allmodel'.format(gamma))
    if len(sys.argv) > 1 and sys.argv[1] == 'retest':
        # re-run the quality estimator over every test image listed in
        # live_test.txt (first whitespace field is the image path)
        with open('live_test.txt') as fp:
            image_info = [line.replace('\n', '').split(' ')
                          for line in fp.readlines()]
        os.system('rm live_test_score.txt')
        cmds = ['./brisquequality -im ' + info[0] + ' >> live_test_score.txt'
                for info in image_info]
        for i, cm in enumerate(cmds):
            print(i)
            os.system(cm)
    test_score = np.loadtxt('live_test_score.txt')
    mos_score = np.loadtxt('live_mos.txt')
    sp_v = spr(test_score, mos_score)
    print(sp_v)
    # append this gamma's result to the running log
    with open('live_svm_parms.txt', 'a') as fp:
        fp.write('{0} : spearmanr : {1}\n'.format(gamma, sp_v))
# NOTE(review): fragment of a per-station loop; stn_id, dwd_hdf5_de,
# in_df_rea6, temp_agg, test_for_extremes, percentile_level and
# df_distance_corr come from the enclosing scope (not visible here).
# Fixed: the if/else branches duplicated the identical spr/prs computation;
# the correlation is now computed once after the optional boolean transform.

# station observations and reanalysis series, aligned on common timestamps
dwd_pcp = dwd_hdf5_de.get_pandas_dataframe(stn_id).dropna()
in_df_rea6_stn = in_df_rea6.loc[:, stn_id].dropna()
cmn_idx = dwd_pcp.index.intersection(in_df_rea6_stn.index)
# break

# resample both series to the same temporal aggregation
df_dwd1 = resampleDf(dwd_pcp.loc[cmn_idx, :], temp_agg)
df_rea1 = resampleDf(in_df_rea6_stn.loc[cmn_idx], temp_agg)

if df_dwd1.size > 0:
    if test_for_extremes:
        # indicator correlation: compare exceedance booleans, not raw values
        df_dwd1 = transform_to_bools(df_dwd1, percentile_level)
        df_rea1 = transform_to_bools(df_rea1, percentile_level)

    # Spearman and Pearson correlation between station and reanalysis
    spr_corr_dwd_rea = spr(df_dwd1.values.ravel(),
                           df_rea1.values.ravel())[0]
    prs_corr_dwd_rea = prs(df_dwd1.values.ravel(),
                           df_rea1.values.ravel())[0]

    df_distance_corr.loc[stn_id, 'prs_corr_dwd_rea'] = prs_corr_dwd_rea
    df_distance_corr.loc[stn_id, 'spr_corr_dwd_rea'] = spr_corr_dwd_rea

# all stns
# Fixed: legacy Python 2 `print` statement converted to print(); assert no
# longer wrapped in redundant parentheses. Behavior otherwise unchanged.
_file = '../data/testing/test.pkl'
gt_file = 'imgListTestNewRegression_.csv'
assert exists(_file)

# pickled (x, y_true) test split plus ground-truth attribute CSV
data = joblib.load(_file)
groundTruth = pd.read_csv(gt_file, header=0, delimiter=',')
n = groundTruth.shape[0]
predAtt = pd.DataFrame(index=groundTruth.index, columns=groundTruth.columns)

x = data[0]
y_true = data[1]
model = model2(weights_path=weights_file)
y_predict = model.predict(x, batch_size=batch_size, verbose=1)

# one model output head per attribute, in this fixed order
# ('BalacingElements' spelling kept: it is a dataset column key)
attrs = ['BalacingElements', 'ColorHarmony', 'Content', 'DoF', 'Light',
         'MotionBlur', 'Object', 'RuleOfThirds', 'VividColor', 'Repetition',
         'Symmetry', 'score']
for i, attr in enumerate(attrs):
    attr_true = y_true[attr]
    attr_predict = y_predict[i]
    rho, p_value = spr(attr_true, attr_predict)
    error = mse(attr_true, attr_predict)
    print("for {} the spr correlation: {} with p value {} and error value: {}".format(attr, rho, p_value, error))
    # flatten the (n, 1) prediction column into the output frame
    predAtt[attr] = pd.Series(y_predict[i].reshape(n)).values

predAtt['ImageFile'] = groundTruth['ImageFile']
predAtt.to_csv(gt_file[0:-4] + '_predict.csv', index=False)
def compare_pws_prim_netw_indicator_correlations(args):
    '''
    Find then for the pws station the neighboring prim_netw station
    intersect both stations, for the given probabilistic percentage
    threshold find the corresponding ppt_thr from the CDF of each
    station seperatly, make all values boolean (> 1, < 0) and calculate
    the pearson rank correlation between the two stations

    Add the result to a new dataframe and return it
    '''
    # NOTE(review): despite the docstring, the correlations computed below
    # are Spearman (spr) on boolean indicators, not Pearson.
    # unpack the single worker-argument tuple
    (path_to_prim_netw_data_hdf5,
     in_prim_netw_df_coords_utm32,
     path_pws_ppt_df_hdf5,
     in_pws_df_coords_utm32,
     all_pws_ids,
     prim_netw_points_tree,
     prim_netw_stns_ids,
     df_results_correlations,
     neighbor_to_chose,
     val_thr_percent,
     min_req_ppt_vals) = args

    # get all pws and prim_netw data
    HDF5_pws = HDF5(infile=path_pws_ppt_df_hdf5)
    HDF5_prim_netw = HDF5(infile=path_to_prim_netw_data_hdf5)

    alls_stns_len = len(all_pws_ids)  # to count number of stations

    # iterating through pws ppt stations
    for ppt_stn_id in all_pws_ids:

        print('\n**\n pws stations is %d/%d**\n'
              % (alls_stns_len, len(all_pws_ids)))
        # reduce number of remaining stations
        alls_stns_len -= 1

        try:
            # read first pws station
            try:
                pws_ppt_stn1_orig = HDF5_pws.get_pandas_dataframe(ppt_stn_id)
            except Exception as msg:
                # NOTE(review): on read failure pws_ppt_stn1_orig may be
                # undefined here; the next line would raise NameError and be
                # caught by the outer except — confirm this is intended
                print('error reading pws', msg)

            # max_ppt_thr is a module-level global (not visible in this chunk)
            pws_ppt_stn1_orig = pws_ppt_stn1_orig[
                pws_ppt_stn1_orig < max_ppt_thr]

            # select df with period
            pws_ppt_season = select_df_within_period(pws_ppt_stn1_orig,
                                                     start=start_date,
                                                     end=end_date)

            # drop all index with nan values
            pws_ppt_season.dropna(axis=0, inplace=True)

            if pws_ppt_season.size > min_req_ppt_vals:

                # find distance to all prim_netw stations, sort them, select
                # minimum
                (xpws, ynetamto) = (
                    in_pws_df_coords_utm32.loc[ppt_stn_id, 'X'],
                    in_pws_df_coords_utm32.loc[ppt_stn_id, 'Y'])

                # This finds the index of neighbours
                distances, indices = prim_netw_points_tree.query(np.array(
                    [xpws, ynetamto]), k=2)

                stn_2_prim_netw = prim_netw_stns_ids[
                    indices[neighbor_to_chose]]

                min_dist_ppt_prim_netw = np.round(
                    distances[neighbor_to_chose], 2)

                if min_dist_ppt_prim_netw <= min_dist_thr_ppt:
                    # check if prim_netw station is near, select and read
                    # prim_netw stn
                    try:
                        df_prim_netw_orig = HDF5_prim_netw.get_pandas_dataframe(
                            stn_2_prim_netw)
                    except Exception as msg:
                        print('error reading prim_netw', msg)

                    df_prim_netw_orig.dropna(axis=0, inplace=True)

                    # select only data within same range
                    df_prim_netw_orig = select_df_within_period(
                        df_prim_netw_orig,
                        pws_ppt_season.index[0],
                        pws_ppt_season.index[-1])

                    # ===============================================
                    # Check neighboring prim_netw stations
                    # ===============================================
                    # for the prim_netw station, neighboring the pws
                    # get id, coordinates and distances of prim_netw
                    # neighbor
                    (xprim_netw, yprim_netw) = (
                        in_prim_netw_df_coords_utm32.loc[stn_2_prim_netw, 'X'],
                        in_prim_netw_df_coords_utm32.loc[stn_2_prim_netw, 'Y'])

                    distances_prim_netw, indices_prim_netw = (
                        prim_netw_points_tree.query(np.array(
                            [xprim_netw, yprim_netw]), k=5))

                    # +1 to get neighbor not same stn
                    stn_near_prim_netw = prim_netw_stns_ids[indices_prim_netw[
                        neighbor_to_chose + 1]]

                    min_dist_prim_netw_prim_netw = np.round(
                        distances_prim_netw[neighbor_to_chose + 1], 2)

                    try:
                        # read the neighboring prim_netw station
                        try:
                            df_prim_netw_ngbr = HDF5_prim_netw.get_pandas_dataframe(
                                stn_near_prim_netw)
                        except Exception as msg:
                            print('error reading prim_netw', msg)

                        df_prim_netw_ngbr.dropna(axis=0, inplace=True)

                        # select only data within same range
                        df_prim_netw_ngbr = select_df_within_period(
                            df_prim_netw_ngbr,
                            pws_ppt_season.index[0],
                            pws_ppt_season.index[-1])
                    except Exception:
                        raise Exception

                    # calculate Indicator correlation between
                    # prim_netw-prim_netw
                    if min_dist_prim_netw_prim_netw < min_dist_thr_ppt:

                        # common timestamps of all three stations
                        cmn_idx = pws_ppt_season.index.intersection(
                            df_prim_netw_ngbr.index).intersection(
                            df_prim_netw_orig.index)

                        if cmn_idx.size > min_req_ppt_vals:

                            df_prim_netw_cmn_season = df_prim_netw_orig.loc[
                                cmn_idx, :]

                            df_pws_cmn_season = pws_ppt_season.loc[cmn_idx, :]

                            df_prim_netw_ngbr_season = df_prim_netw_ngbr.loc[
                                cmn_idx, :]

                            # no NaNs may survive the intersection
                            assert (df_prim_netw_cmn_season.isna().sum().
                                    values[0] == 0)
                            assert (
                                df_pws_cmn_season.isna().sum().values[0] == 0)
                            assert (df_prim_netw_ngbr_season.isna().sum().
                                    values[0] == 0)

                            # ======================================
                            # select only upper tail of values of both
                            # dataframes
                            # ======================================
                            val_thr_float = val_thr_percent / 100

                            # this will calculate the EDF of pws
                            # station
                            pws_cdf_x, pws_cdf_y = get_cdf_part_abv_thr(
                                df_pws_cmn_season.values.ravel(), -0.1)

                            # find ppt value corresponding to quantile
                            # threshold
                            pws_ppt_thr_per = pws_cdf_x[np.where(
                                pws_cdf_y >= val_thr_float)][0]

                            # this will calculate the EDF of prim_netw
                            # station
                            prim_netw_cdf_x, prim_netw_cdf_y = get_cdf_part_abv_thr(
                                df_prim_netw_cmn_season.values.ravel(), -0.1)

                            # find ppt value corresponding to quantile
                            # threshold
                            prim_netw_ppt_thr_per = prim_netw_cdf_x[np.where(
                                prim_netw_cdf_y >= val_thr_float)][0]

                            # print('\n****transform values to booleans*****\n')
                            # if Xi > Ppt_thr then 1 else 0
                            df_pws_cmn_Bool = (
                                df_pws_cmn_season > pws_ppt_thr_per).astype(int)
                            df_prim_netw_cmn_Bool = (
                                df_prim_netw_cmn_season > prim_netw_ppt_thr_per).astype(int)

                            # calculate spearman correlations of booleans 1, 0
                            bool_spr_corr = np.round(
                                spr(df_prim_netw_cmn_Bool.values.ravel(),
                                    df_pws_cmn_Bool.values.ravel())[0], 2)

                            # ======================================
                            # select only upper tail both dataframes
                            # =====================================
                            prim_netw2_cdf_x, prim_netw2_cdf_y = (
                                get_cdf_part_abv_thr(
                                    df_prim_netw_ngbr_season.values, -0.1))

                            # get prim_netw2 ppt thr from cdf
                            prim_netw2_ppt_thr_per = prim_netw2_cdf_x[np.where(
                                prim_netw2_cdf_y >= val_thr_float)][0]

                            df_prim_netw2_cmn_Bool = (
                                df_prim_netw_ngbr_season > prim_netw2_ppt_thr_per).astype(int)

                            # calculate spearman correlations of booleans
                            # 1, 0
                            bool_spr_corr_prim_netw = np.round(
                                spr(df_prim_netw_cmn_Bool.values.ravel(),
                                    df_prim_netw2_cmn_Bool.values.ravel())[0], 2)

                            # check if df_prim_netw2_cmn_Bool correlation between
                            # pws and prim_netw is higher than between
                            # prim_netw and prim_netw neighbours, if yes, keep
                            # pws
                            # NOTE(review): the keep/remove filter is disabled
                            # (hard-coded True); every station reaching this
                            # point is kept
                            if True:  # bool_prs_corr >= bool_spr_corr_prim_netw:
                                print('+++keeping pws+++')

                                # ==================================
                                # append the result to df_correlations
                                # ==================================
                                # df_results_correlations.loc[
                                #     ppt_stn_id,
                                #     'lon'] = lon_stn_pws
                                # df_results_correlations.loc[
                                #     ppt_stn_id,
                                #     'lat'] = lat_stn_pws
                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'Distance to neighbor'] = min_dist_ppt_prim_netw
                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'prim_netw neighbor ID'] = stn_2_prim_netw
                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'prim_netw-prim_netw neighbor ID'] = stn_near_prim_netw
                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'Distance prim_netw-prim_netw neighbor'] = min_dist_prim_netw_prim_netw
                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'pws_%s_Per_ppt_thr'
                                    % val_thr_percent] = pws_ppt_thr_per
                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'prim_netw_%s_Per_ppt_thr'
                                    % val_thr_percent] = prim_netw_ppt_thr_per
                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'Bool_Spearman_Correlation_pws_prim_netw'] = bool_spr_corr
                                df_results_correlations.loc[
                                    ppt_stn_id,
                                    'Bool_Spearman_Correlation_prim_netw_prim_netw'] = bool_spr_corr_prim_netw
                            else:
                                pass
                                # print('---Removing pws---')
                                #
                                # df_results_correlations.loc[
                                #     ppt_stn_id,
                                #     'Bool_Pearson_Correlation_pws_prim_netw'
                                # ] = bool_prs_corr
                                # df_results_correlations.loc[
                                #     ppt_stn_id,
                                #     'Bool_Pearson_Correlation_prim_netw_prim_netw'
                                # ] = bool_prs_corr_prim_netw
                        else:
                            print('not enough data')
                            # print('\n********\n ADDED DATA TO DF RESULTS')
                    else:
                        pass
                        # print('After intersecting dataframes not enough data')
                else:
                    pass
                    # print('prim_netw Station is near but not enough data')
            else:
                pass
                # print('\n********\n prim_netw station is not near')
        except Exception as msg:
            print('error while finding neighbours ', msg)
            continue

    # drop stations that never received any result columns
    df_results_correlations.dropna(how='all', inplace=True)
    return df_results_correlations