# Mean row-wise Spearman correlation between predictions and targets.
# The rho/pval unpacking (and the alias used in the other snippets) implies
# sp is scipy.stats.spearmanr.
from scipy.stats import spearmanr as sp


def sp_eval(y_pred, y_true):
    # y_pred, y_true: 2-D arrays of shape (n_rows, n_columns)
    n, m = y_pred.shape
    total = 0
    for i in range(n):
        # spearmanr returns (correlation, p-value); only rho is accumulated
        rho, pval = sp(y_pred[i, :], y_true[i, :])
        total += rho
    return total / n
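# A minimal usage sketch for sp_eval; the random arrays below are invented
# for illustration and are not part of the original snippet.
import numpy as np

rng = np.random.default_rng(0)
preds = rng.random((5, 20))
truth = rng.random((5, 20))
print(sp_eval(preds, truth))  # average Spearman rho over the 5 rows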
def percentile_bucket(vals, bucket_size=10, scale=1.0, shift=0.0):
    """
    Returns percentile scores for each value.

    Parameters
    ----------
    bucket_size (float)
        The size of each bucket, in percentile points 0-100. Actual bucket
        cutoffs are calculated with numpy.arange(), so if 100 isn't divisible
        by bucket_size, your top bucket will be small.
    scale (float)
        All values will be multiplied by this number after bucketing.
    shift (float)
        All values will have this added to them after scaling.
    """
    from scipy.stats import scoreatpercentile as sp
    import numpy as np
    from bisect import bisect_left

    # arange to get the percentiles
    percs = np.concatenate([np.arange(bucket_size, 100, bucket_size), [100]])
    # to get the cutoff score for each percentile
    cuts = [sp(vals, p) for p in percs]
    # turn values into bucket numbers... +1 since we want 1-indexed buckets
    new_list = np.array(
        [bisect_left(cuts, val) + 1 for val in vals]) * scale + shift
    return new_list
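# A quick sketch of percentile_bucket in use; the data is invented for
# illustration.
import numpy as np

data = np.arange(1, 101)  # 100 values, 1..100
buckets = percentile_bucket(data, bucket_size=25)
# With bucket_size=25 each value lands in one of four quartile buckets,
# so the bucket numbers range over 1..4 before scale/shift are applied.
print(buckets.min(), buckets.max())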
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# The printed label below says "Pearsons correlation coefficient",
# so sp is taken to be scipy.stats.pearsonr in this snippet.
from scipy.stats import pearsonr as sp


def data_analysis_and_correlation(df_education, df_gdp):
    """ Analysis and correlation of education data with GDP. """
    print("[Data Analysis and Correlation of Education to GDP data] ==> Begin")
    common_countries = list(set(df_education['Country'].tolist()) &
                            set(df_gdp['Country'].tolist()))
    gdp = []
    total_school_time = []
    men_school_time = []
    women_school_time = []
    for cntry in common_countries:
        df1 = df_education[df_education['Country'] == cntry]
        df2 = df_gdp[df_gdp['Country'] == cntry]
        if df2['GDP_' + df1['Year'].iloc[0]].iloc[0] != '':
            total_school_time.append(int(df1['Total_School_Time'].iloc[0]))
            men_school_time.append(int(df1['Men_School_Time'].iloc[0]))
            women_school_time.append(int(df1['Women_School_Time'].iloc[0]))
            # the '' check above suggests GDP values are stored as strings,
            # so convert before taking the log
            gdp.append(math.log(float(df2['GDP_' + df1['Year'].iloc[0]].iloc[0])))
    df_edu_to_gdp = pd.DataFrame({'Total': total_school_time,
                                  'Men': men_school_time,
                                  'Women': women_school_time,
                                  'GDP': gdp})
    print(df_edu_to_gdp.corr(), "\n")
    gdp_np_array = np.array(df_edu_to_gdp.GDP.tolist())
    for col in ['Women', 'Men', 'Total']:
        r_val, p_val = sp(gdp_np_array, np.array(df_edu_to_gdp[col].tolist()))
        print("Correlation of GDP against {}:".format(col))
        print("Pearsons correlation coefficient: {}".format(r_val))
        print("2-tailed p-values: {}\n".format(p_val))

    # Scatter matrix plot with histogram of data plots in the diagonal
    # (pd.scatter_matrix was removed from pandas; pd.plotting.scatter_matrix
    # is the current equivalent)
    pd.plotting.scatter_matrix(df_edu_to_gdp, alpha=0.05, figsize=(10, 10),
                               diagonal='hist')
    plt.savefig('figures/education_to_gdp/data_education_gdp_analysis.png')
    plt.clf()

    # ==> Conclusion / Summary
    #            GDP       Men     Total     Women
    # GDP   1.000000  0.495794  0.479050  0.497923
    # Men   0.495794  1.000000  0.971663  0.942572
    # Total 0.479050  0.971663  1.000000  0.977217
    # Women 0.497923  0.942572  0.977217  1.000000
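# Hypothetical minimal inputs for data_analysis_and_correlation; the column
# layout below is inferred from the lookups in the function body and is not
# taken from the original code.
import os

import pandas as pd

df_education = pd.DataFrame({
    'Country': ['A', 'B', 'C'],
    'Year': ['2010', '2010', '2010'],
    'Total_School_Time': ['12', '10', '14'],
    'Men_School_Time': ['13', '11', '14'],
    'Women_School_Time': ['11', '9', '14'],
})
df_gdp = pd.DataFrame({
    'Country': ['A', 'B', 'C'],
    'GDP_2010': ['40000', '15000', '55000'],
})
os.makedirs('figures/education_to_gdp', exist_ok=True)  # the function saves a figure here
data_analysis_and_correlation(df_education, df_gdp)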
# Evaluate word2vec similarities against human judgements (WordSim-353)
# using Spearman rank correlation.
import gensim
from scipy.stats import spearmanr as sp
import pandas as pd

model = gensim.models.KeyedVectors.load_word2vec_format(
    '/media/jaya/study stuffs/IITM/2nd_sem/NLP/NLPpa1/google/GoogleNews-vectors-negative300.bin',
    binary=True)
# word = model.similarity(word1, word2)

df = pd.read_csv(
    '/media/jaya/study stuffs/IITM/2nd_sem/NLP/NLPpa1/google/combined.csv')
word1 = df.values[:, 0]
word2 = df.values[:, 1]
score_353 = df.values[:, 2]

scores = []
for k in range(len(df)):
    # cosine similarity between the two word vectors
    tmp = model.similarity(word1[k], word2[k])
    scores.append(tmp)

cor = sp(scores, score_353)
print(cor)
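# model.similarity raises KeyError for words missing from the model's
# vocabulary. A hedged variant that simply skips such pairs (a sketch, not
# part of the original script; `in model` works via KeyedVectors'
# __contains__ check):
scores, human = [], []
for w1, w2, s in zip(word1, word2, score_353):
    if w1 in model and w2 in model:
        scores.append(model.similarity(w1, w2))
        human.append(s)
print(sp(scores, human))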