def spark_pearson(a, b):
    """Compute Pearson and Spearman correlations of two sequences with Spark,
    publish them into ``func``'s global namespace, then invoke ``func``.

    NOTE(review): relies on module-level ``sc`` (SparkContext) and ``func``
    defined elsewhere. ``func_globals`` is the Python 2 spelling of
    ``__globals__`` — this block cannot run on Python 3 as written.
    Mutating another function's globals is fragile; confirm callers read
    the ``pearson``/``rho`` globals this injects.
    """
    rdd_a = sc.parallelize(a)
    rdd_b = sc.parallelize(b)
    g = func.func_globals  # Py2-only attribute (``__globals__`` on Py3)
    g['pearson'] = Statistics.corr(rdd_a, rdd_b, 'pearson')
    g['rho'] = Statistics.corr(rdd_a, rdd_b, 'spearman')  # Spearman's rho
    func(a, b)
def test_R_implementation_equivalence(self):
    """KS-test statistic and p-value should match R's ks.test on this sample."""
    sample = [
        1.1626852897838, -0.585924465893051, 1.78546500331661,
        -1.33259371048501, -0.446566766553219, 0.569606122374976,
        -2.88971761441412, -0.869018343326555, -0.461702683149641,
        -0.555540910137444, -0.0201353678515895, -0.150382224136063,
        -0.628126755843964, 1.32322085193283, -1.52135057001199,
        -0.437427868856691, 0.970577579543399, 0.0282226444247749,
        -0.0857821886527593, 0.389214404984942,
    ]
    data = self.sc.parallelize(sample)
    # The implicit standard-normal fit and the explicit (mu=0, sigma=1)
    # fit must produce identical results.
    for extra_args in ((), (0, 1)):
        model = Statistics.kolmogorovSmirnovTest(data, "norm", *extra_args)
        self.assertAlmostEqual(model.statistic, 0.189, 3)
        self.assertAlmostEqual(model.pValue, 0.422, 3)
def CorrelationFeature(vectors, num_features=6):
    """Select the ``num_features`` most relevant feature indices.

    Heuristic ("new heuristic diogo proposal"): score feature i as
    variance(i) / sum_j |corr(i, j)| and keep the highest-scoring indices.

    :param vectors: RDD of equal-length numeric vectors (pyspark.mllib).
    :param num_features: number of top indices to return; defaults to 6,
        preserving the original hard-coded behavior.
    :return: list of feature indices, best score first.
    """
    # Broadcast the full Pearson correlation matrix to the workers.
    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))
    summary = Statistics.colStats(vectors)
    varianza = summary.variance()

    w = {}
    for i in range(len(matriz.value)):
        # Sum of |corr(i, j)| over all features; NaNs (constant columns)
        # are treated as 0 via nan_to_num.
        aij = sum(abs(j) for j in np.nan_to_num(matriz.value[i]))
        # The original recomputed this division on every inner iteration;
        # only the final value mattered, so it is hoisted out here.
        w[i] = varianza[i] / aij

    # Sort (score, index) pairs so the best-scoring features come first.
    r = sorted(((value, key) for (key, value) in w.items()), reverse=True)
    return [key for (_score, key) in r[:num_features]]
def run4(self): from my_fun import parse_interaction,parse_interaction_with_key,summary_by_label raw_data = self.raw_data vector_data = raw_data.map(parse_interaction) # Compute column summary statistics. summary = Statistics.colStats(vector_data) print "Duration Statistics:" print " Mean: {}".format(round(summary.mean()[0],3)) print " St. deviation: {}".format(round(sqrt(summary.variance()[0]),3)) print " Max value: {}".format(round(summary.max()[0],3)) print " Min value: {}".format(round(summary.min()[0],3)) print " Total value count: {}".format(summary.count()) print " Number of non-zero values: {}".format(summary.numNonzeros()[0]) label_vector_data = raw_data.map(parse_interaction_with_key) normal_label_data = label_vector_data.filter(lambda x: x[0]=="normal.") normal_summary = Statistics.colStats(normal_label_data.values()) print "Duration Statistics for label: {}".format("normal") print " Mean: {}".format(normal_summary.mean()[0],3) print " St. deviation: {}".format(round(sqrt(normal_summary.variance()[0]),3)) print " Max value: {}".format(round(normal_summary.max()[0],3)) print " Min value: {}".format(round(normal_summary.min()[0],3)) print " Total value count: {}".format(normal_summary.count()) print " Number of non-zero values: {}".format(normal_summary.numNonzeros()[0]) normal_sum = summary_by_label(raw_data, "normal.") print "Duration Statistics for label: {}".format("normal") print " Mean: {}".format(normal_sum.mean()[0],3) print " St. 
deviation: {}".format(round(sqrt(normal_sum.variance()[0]),3)) print " Max value: {}".format(round(normal_sum.max()[0],3)) print " Min value: {}".format(round(normal_sum.min()[0],3)) print " Total value count: {}".format(normal_sum.count()) print " Number of non-zero values: {}".format(normal_sum.numNonzeros()[0]) label_list = ["back.","buffer_overflow.","ftp_write.","guess_passwd.", "imap.","ipsweep.","land.","loadmodule.","multihop.", "neptune.","nmap.","normal.","perl.","phf.","pod.","portsweep.", "rootkit.","satan.","smurf.","spy.","teardrop.","warezclient.", "warezmaster."] stats_by_label = [(label, summary_by_label(raw_data, label)) for label in label_list] duration_by_label = [ (stat[0], np.array([float(stat[1].mean()[0]), float(sqrt(stat[1].variance()[0])), float(stat[1].min()[0]), float(stat[1].max()[0]), int(stat[1].count())])) for stat in stats_by_label] pd.set_option('display.max_columns', 50) stats_by_label_df = pd.DataFrame.from_items(duration_by_label, columns=["Mean", "Std Dev", "Min", "Max", "Count"], orient='index') print "Duration statistics, by label" stats_by_label_df
def test_col_norms(self):
    """colStats must expose L1/L2 norms of the right size and value."""
    wide_rdd = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    wide_stats = Statistics.colStats(wide_rdd)
    self.assertEqual(10, len(wide_stats.normL1()))
    self.assertEqual(10, len(wide_stats.normL2()))

    # Single-column case with known norms: values 0..9.
    single_col = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
    col_stats = Statistics.colStats(single_col)
    self.assertEqual(array([45.0]), col_stats.normL1())  # sum(0..9) == 45
    import math
    expected_l2 = math.sqrt(sum(x * x for x in range(10)))
    self.assertTrue(math.fabs(col_stats.normL2()[0] - expected_l2) < 1e-14)
def test_col_with_different_rdds(self):
    """colStats must accept numpy-vector, list and array.array row types."""
    # numpy-backed random vectors
    rdd = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    self.assertEqual(1000, Statistics.colStats(rdd).count())
    # plain python sequences
    rdd = self.sc.parallelize([range(10)] * 10)
    self.assertEqual(10, Statistics.colStats(rdd).count())
    # array.array ("pyarray") rows
    rdd = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
    self.assertEqual(10, Statistics.colStats(rdd).count())
def CorrelationFeature(vectors, num_features=6):
    """Select the ``num_features`` most relevant feature indices.

    Scores feature i as variance(i) / sum_j |corr(i, j)| (the "new
    heuristic diogo proposal") and returns the best-scoring indices.

    Fixes vs. the original: the large commented-out "old heuristic" block
    was dead code and has been removed; the score division, previously
    recomputed on every inner-loop iteration, is hoisted out; the
    hard-coded 6 is now a backward-compatible parameter.

    :param vectors: RDD of equal-length numeric vectors (pyspark.mllib).
    :param num_features: number of top indices to return (default 6).
    :return: list of feature indices, best score first.
    """
    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))
    summary = Statistics.colStats(vectors)
    varianza = summary.variance()

    w = {}
    for i in range(len(matriz.value)):
        # Sum of |corr(i, j)|; NaNs from constant columns count as 0.
        aij = sum(abs(j) for j in np.nan_to_num(matriz.value[i]))
        w[i] = varianza[i] / aij

    # Features sorted by score, best first.
    r = sorted(((value, key) for (key, value) in w.items()), reverse=True)
    return [key for (_score, key) in r[:num_features]]
def test_R_implementation_equivalence(self):
    """KS-test output must match R's ks.test for a standard-normal fit."""
    observations = [
        1.1626852897838, -0.585924465893051, 1.78546500331661,
        -1.33259371048501, -0.446566766553219, 0.569606122374976,
        -2.88971761441412, -0.869018343326555, -0.461702683149641,
        -0.555540910137444, -0.0201353678515895, -0.150382224136063,
        -0.628126755843964, 1.32322085193283, -1.52135057001199,
        -0.437427868856691, 0.970577579543399, 0.0282226444247749,
        -0.0857821886527593, 0.389214404984942,
    ]
    data = self.sc.parallelize(observations)

    # Implicit standard-normal parameters.
    implicit = Statistics.kolmogorovSmirnovTest(data, "norm")
    self.assertAlmostEqual(implicit.statistic, 0.189, 3)
    self.assertAlmostEqual(implicit.pValue, 0.422, 3)

    # Explicit mu=0, sigma=1 must give the same answer.
    explicit = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    self.assertAlmostEqual(explicit.statistic, 0.189, 3)
    self.assertAlmostEqual(explicit.pValue, 0.422, 3)
def calculateStats(years2stats, op):
    """Per-year column statistics for the requested operation.

    :param years2stats: mapping year -> list of numeric rows.
    :param op: one of 'mean', 'variance', 'max', 'min'; any other value
        yields an empty result dict.
    :return: mapping year -> list of stringified column values.
    """
    # Dispatch table instead of the original repeated if-blocks.
    getters = {
        'mean': lambda s: s.mean(),
        'variance': lambda s: s.variance(),
        'max': lambda s: s.max(),
        'min': lambda s: s.min(),
    }
    result = dict()
    for year in years2stats:
        summary = Statistics.colStats(sc.parallelize(years2stats[year]))
        if op in getters:
            result[year] = [str(value).rstrip() for value in getters[op](summary)]
    return result
def generateFeatureClusters(context, geneExp, samples, headers, numClusters):
    """Cluster genes by their summary statistics and relabel the samples.

    Computes per-gene (mean, variance, max, min), trains KMeans on those
    statistics, maps each gene (entrez id) to its cluster, and rewrites
    each sample via ``updateSample`` with the gene->cluster mapping.

    Fix vs. the original: the unused ``featuresToCluster`` assignment was
    removed — its second argument referenced ``point`` outside the lambda,
    which raised NameError at runtime, and its result was never used.
    """
    # Ignore the first item (the diagnosis header)
    headers = headers[1:]
    # 1) Per-gene statistics; row index matches the headers index.
    cStats = Statistics.colStats(geneExp)
    print(len(cStats.mean()))
    data = np.array(
        [cStats.mean(), cStats.variance(), cStats.max(), cStats.min()]).transpose()
    # Same stats with the entrez id as the first column, used later to
    # label each feature with its cluster.
    dataWithIndex = np.array([[e_id for e_id in headers], cStats.mean(),
                              cStats.variance(), cStats.max(),
                              cStats.min()]).transpose()
    print(dataWithIndex.shape)
    # 2) RDDs used to train / apply KMeans.
    df = context.parallelize(data)
    dfWithIndex = context.parallelize(dataWithIndex)
    # 3) Train KMeans on the statistics (without ids).
    model = KMeans.train(df, numClusters, maxIterations=100,
                         initializationMode="random")
    # 4) Persist the model.
    model.save(context, './models/clusters')
    # 5) (prediction, e_id) for each gene; the id is column 0.
    clusterLabeledFeatures = dfWithIndex.map(
        lambda point: (model.predict(point[1:]), point[0]))
    # 6) Group feature ids by their cluster label.
    clusteredFeatures = clusterLabeledFeatures.groupByKey()
    cF = clusteredFeatures.collectAsMap()
    # 7) Rewrite the samples in terms of the clusters.
    samplesWithClusters = samples.map(lambda sample: updateSample(sample, cF))
    return samplesWithClusters
def do_all(f_path, out_name):
    """End-to-end linear-regression run over the text file at ``f_path``.

    Parses the data, standardizes features via the module-level ``means``/
    ``varis`` globals (consumed by ``conv_label_pt``), trains an
    unregularized SGD linear model, writes per-point disparities to
    ``out_loc + out_name``, and returns
    (intercept, weights, std errors, disparity RDD, SSR, N).

    NOTE(review): depends on globals/helpers defined elsewhere
    (``parseKeepD``, ``conv_label_pt``, ``std_errors``, ``out_loc``) and
    mutates module state — confirm they exist before calling.
    """
    sc = SparkContext()
    data = sc.textFile(f_path)
    # Drop rows the parser rejected (label is None).
    data = data.map(parseKeepD).filter(lambda p: p[0] != None)
    # Scale Features
    features = data.map(lambda x: x[0].features)
    summary = Statistics.colStats(features)
    global means
    global varis
    means = summary.mean()
    varis = summary.variance()
    # scale the points (conv_label_pt reads the globals set above)
    data = data.map(lambda y: (conv_label_pt(y[0]), y[1]))
    # train model
    model = LinearRegressionWithSGD().train(data.map(lambda x: x[0]),
                                            intercept=True, regType='none')
    # calculate disparity: (actual label, prediction, carried extra field)
    disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))
    # calculate SSR for later
    ssr = disparity.map(lambda x: (x[0] - x[1])**2).sum()
    # keep N
    N = disparity.count()
    MSE = ssr / float(N)
    se = std_errors(data, MSE, N)
    disparity.saveAsTextFile(out_loc + out_name)
    # shut down SC
    sc.stop()
    return model.intercept, model.weights, se, disparity, ssr, N
def _transform(self, df):
    """Pairwise correlation analysis over the assembled feature column.

    Returns (full correlation matrix as a labelled pandas DataFrame,
    rows of distinct column pairs whose correlation value exceeds
    ``self.correlation_cutoff``).

    NOTE(review): the source was flattened to one line; the loop below is
    assumed to only rebuild ``features_df`` (last metadata group wins) —
    confirm against the original formatting.
    """
    # Feature names live in the vector column's ml_attr metadata.
    for k, v in df.schema[
            self.inputCol].metadata["ml_attr"]["attrs"].items():
        features_df = pd.DataFrame(v)
    column_names = list(features_df['name'])
    df_vector = df.rdd.map(lambda x: x[self.inputCol].toArray())
    # self.correlation_type is class parameter
    matrix = Statistics.corr(df_vector, method=self.correlation_type)
    # apply pandas dataframe operation on the fit output
    corr_df = pd.DataFrame(matrix, columns=column_names, index=column_names)
    final_corr_df = pd.DataFrame(corr_df.abs().unstack().sort_values(
        kind='quicksort')).reset_index()
    final_corr_df.rename(
        {
            'level_0': 'col1',
            'level_1': 'col2',
            0: 'correlation_value'
        }, axis=1, inplace=True)
    # Drop self-correlations (a column against itself).
    final_corr_df = final_corr_df[
        final_corr_df['col1'] != final_corr_df['col2']]
    # shortlisted dataframe based on custom cutoff
    shortlisted_corr_df = final_corr_df[
        final_corr_df['correlation_value'] > self.correlation_cutoff]
    return corr_df, shortlisted_corr_df
def test_matrix_independence(self):
    """Pearson independence test on a 3x4 contingency matrix + error cases."""
    counts = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0,
              31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
    chi = Statistics.chiSqTest(Matrices.dense(3, 4, counts))
    # Results validated against R command
    # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
    self.assertAlmostEqual(chi.statistic, 21.9958, 4)
    self.assertEqual(chi.degreesOfFreedom, 6)
    self.assertAlmostEqual(chi.pValue, 0.001213, 4)

    # Every malformed matrix must be rejected with IllegalArgumentException.
    bad_matrices = [
        Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0]),  # negative count
        Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0]),   # row sum == 0.0
        Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0]),   # column sum == 0.0
    ]
    for bad in bad_matrices:
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, bad)
def compute_correlation_matrix(df, method='pearson'):
    """Correlation matrix of a Spark DataFrame as a labelled pandas frame."""
    row_rdd = df.rdd.map(lambda row: row[0:])
    matrix = Statistics.corr(row_rdd, method=method)
    return pd.DataFrame(matrix, columns=df.columns, index=df.columns)
def test_dimension(self, targetDimension, testDimension):
    """Chi-square independence test between two categorical dimensions.

    Crosstabs the two dimensions, runs Pearson's chi-square test on the
    contingency matrix, attaches the frequency table, computes Cramer's V
    and records the test dimension as significant on the helper.

    :return: a populated ChiSquareResult
    :raises BIException: when the target column is not a string column.
        NOTE(review): the guard checks ``targetDimension`` but reports
        ``testDimension`` — confirm which is intended.
    """
    if not targetDimension in self._dataframe_helper.get_string_columns():
        raise BIException.non_string_column(testDimension)
    chisquare_result = ChiSquareResult()
    pivot_table = self._data_frame.stat.crosstab(
        "{}".format(targetDimension), testDimension)
    # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
    # Flatten the crosstab column-major, skipping the first column (row labels),
    # to match Matrices.dense's column-major layout.
    rdd = list(
        chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
    data_matrix = Matrices.dense(pivot_table.count(),
                                 len(pivot_table.columns) - 1, rdd)
    result = Statistics.chiSqTest(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                     need_sorting=True)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramers V Calculation
    stat_value = result.statistic
    n = freq_table.get_total()
    # NOTE(review): conventional Cramer's V uses min(r-1, c-1); this uses
    # min(r, c) — confirm intended.
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    self._dataframe_helper.add_chisquare_significant_dimension(
        testDimension, v_value)
    return chisquare_result
def summarize(dataset): print "schema: %s" % dataset.schema().json() labels = dataset.map(lambda r: r.label) print "label average: %f" % labels.mean() features = dataset.map(lambda r: r.features) summary = Statistics.colStats(features) print "features average: %r" % summary.mean()
def calculateCorrelation(rdd1, rdd2):
    """Join two keyed RDDs on their keys and correlate the paired values.

    :return: (joined RDD sorted by key, Pearson correlation of the values)
    """
    joined = rdd1.join(rdd2).sortByKey()
    left_values = joined.map(lambda kv: kv[1][0])
    right_values = joined.map(lambda kv: kv[1][1])
    return (joined, Statistics.corr(left_values, right_values))
def compute_correlation_matrix(df, method='pearson'):
    """All-column correlation matrix of ``df`` as a labelled pandas frame.

    Wrapper around the approach described at
    https://forums.databricks.com/questions/3092/how-to-calculate-correlation-matrix-with-all-colum.html
    """
    rows = df.rdd.map(lambda row: row[0:])
    return pd.DataFrame(Statistics.corr(rows, method=method),
                        columns=df.columns, index=df.columns)
def compute_correlation_matrix(df, method='pearson'):
    """Correlation matrix over the float/double columns of ``df``."""
    # Only continuous (float/double) columns participate.
    numeric_cols = [name for (name, dtype) in df.dtypes
                    if dtype.startswith('float') or dtype.startswith('double')]
    numeric_df = df.select(numeric_cols)
    rows = numeric_df.rdd.map(lambda row: row[0:])
    matrix = Statistics.corr(rows, method=method)
    return pd.DataFrame(matrix, columns=numeric_df.columns,
                        index=numeric_df.columns)
def scriptJob(self, limit=None, rowstart=None, rowstop=None):
    """Scan the configured HBase column and return its column means."""
    started = datetime.datetime.now()
    # Scan the HBase table over the requested row range.
    scan_iter = self.table.scan(row_start=rowstart, row_stop=rowstop,
                                limit=limit, columns=self.columns)
    print(type(scan_iter))
    values = self.sc.parallelize(scan_iter).values()
    print(values.count())
    # Cell values are bytes keyed by the (byte-encoded) column name.
    col_key = bytes(self.columns.encode("utf-8"))
    numeric = values.map(lambda value: float(value.get(col_key).decode()))
    # One single-element dense vector per value, as colStats expects rows.
    vector_rdd = self.sc.parallelize(
        ([Vectors.dense(x)] for x in numeric.collect()))
    col_stats = Statistics.colStats(vector_rdd)
    finished = datetime.datetime.now()
    print(finished - started)
    return col_stats.mean()
def correlationTemperatureHardness(df, spark):
    """Correlation between the non-empty temperature and hardness columns."""
    temperature = (df.select('temperature').rdd
                     .map(lambda x: x['temperature'])
                     .filter(lambda x: x is not None)
                     .filter(lambda x: x != ''))
    hardness = (df.select('hardness').rdd
                  .map(lambda x: x['hardness'])
                  .filter(lambda x: x is not None)
                  .filter(lambda x: x != ''))
    # corr() on zipped pairs yields a 2x2 matrix; the off-diagonal entry
    # is the cross-correlation.
    corr_matrix = Statistics.corr(temperature.zip(hardness))
    return corr_matrix[1][0]
def info_paragraphs(df, clm):
    """Column statistics over per-paragraph word counts for column ``clm``.

    Flattens each row's paragraph list, drops null paragraphs, maps each
    paragraph to its whitespace-token count, and returns the colStats
    summary of those counts.

    Fix vs. the original: identity comparison ``is not None`` replaces the
    non-idiomatic ``!= None``.
    """
    df = df.where(col(clm).isNotNull())
    paragraphs = df.rdd.flatMap(lambda x: getattr(x, clm)).filter(
        lambda p: p is not None)
    # One 0-d array per paragraph so colStats sees a single numeric column.
    word_counts = paragraphs.map(lambda p: np.array(len(p.split())))
    summary = Statistics.colStats(word_counts)
    return summary
def compute_correlation_matrix(df, method='spearman'):
    """Spearman (by default) correlation matrix of ``df`` as a pandas frame."""
    row_rdd = df.rdd.map(lambda row: row[0:])
    matrix = Statistics.corr(row_rdd, method=method)
    labelled = pd.DataFrame(matrix, columns=df.columns, index=df.columns)
    return labelled
def test_right_number_of_results(self):
    """chiSqTest on labeled sparse data yields one result per feature."""
    num_cols = 1001
    sparse_data = [
        LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
        LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)])),
    ]
    chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
    self.assertEqual(num_cols, len(chi))
    # Even columns never explicitly set must have a result object.
    self.assertIsNotNone(chi[1000])
def test_goodness_of_fit(self):
    """Goodness-of-fit results validated against R, plus error handling."""
    from numpy import inf

    # Uniform expected distribution:
    # R: `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
    observed = Vectors.dense([4, 6, 5])
    uniform_fit = Statistics.chiSqTest(observed)
    self.assertEqual(uniform_fit.statistic, 0.4)
    self.assertEqual(uniform_fit.degreesOfFreedom, 2)
    self.assertAlmostEqual(uniform_fit.pValue, 0.8187, 4)

    # Observed and expected with different sums:
    # R: `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
    observed1 = Vectors.dense([21, 38, 43, 80])
    expected1 = Vectors.dense([3, 5, 7, 20])
    scaled_fit = Statistics.chiSqTest(observed1, expected1)
    self.assertAlmostEqual(scaled_fit.statistic, 14.1429, 4)
    self.assertEqual(scaled_fit.degreesOfFreedom, 3)
    self.assertAlmostEqual(scaled_fit.pValue, 0.002717, 4)

    # Mismatched vector lengths fail client-side.
    self.assertRaises(ValueError, Statistics.chiSqTest,
                      Vectors.dense([1.0, 2.0, 3.0]),
                      Vectors.dense([1.0, 2.0, 3.0, 4.0]))

    # Negative observed counts are rejected JVM-side.
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest,
                      Vectors.dense([1.0, 2.0, 3.0, -4.0]), expected1)

    # Zero expected count with nonzero observed -> infinite statistic.
    zero_expected = Vectors.dense([1.0, 0.0, 3.0])
    inf_fit = Statistics.chiSqTest(observed, zero_expected)
    self.assertEqual(inf_fit.statistic, inf)
    self.assertEqual(inf_fit.degreesOfFreedom, 2)
    self.assertEqual(inf_fit.pValue, 0.0)

    # Simultaneous zeros in observed and expected are rejected.
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest,
                      Vectors.dense([2.0, 0.0, 1.0]), zero_expected)
def main(): ###Loading data from sources print 'before preprocess' data = [preprocess(input_file)] print 'after preprocess' #get spark context sc = getSparkContext() print 'before parallelize' data = np.hstack((data[0]['train_data'], data[0]['train_labels'].reshape( (data[0]['train_labels'].shape[0], 1)))) data = [ Vectors.dense(list(data[row, :])) for row in range(0, data.shape[0]) ] samples = sc.parallelize(data) #samples.persist() pearsonCorr = Statistics.corr(samples) print str(pearsonCorr).replace('nan', 'NaN') sys.exit() print Statistics.corr(data, method="pearson")
def estimate_correlation_matrix(df, cols, method='pearson', round_decimals=3):
    """Styled (heat-mapped) correlation matrix for the selected columns."""
    rows = df.select(cols).rdd.map(lambda row: row[0:])
    corr = pd.DataFrame(Statistics.corr(rows, method=method),
                        columns=cols, index=cols)
    # Round first, then return a pandas Styler with a diverging palette.
    styled = corr.round(round_decimals).style.background_gradient(cmap='coolwarm')
    return styled
def correlations(sdf, colnames, method='pearson', ax=None, plot=True):
    """Correlation matrix of the selected columns, as a heatmap or frame.

    When ``plot`` is true, draws (and returns) a seaborn heatmap on ``ax``
    (a new axis is created if none is given); otherwise returns the
    labelled pandas DataFrame.
    """
    sdf = sdf.notHandy()
    rows = sdf.select(colnames).dropna().rdd.map(lambda row: row[0:])
    pdf = pd.DataFrame(Statistics.corr(rows, method=method),
                       columns=colnames, index=colnames)
    if not plot:
        return pdf
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    return sns.heatmap(round(pdf, 2), annot=True, cmap="coolwarm",
                       fmt='.2f', linewidths=.05, ax=ax)
def test_goodness_of_fit(self):
    """Chi-square goodness-of-fit vs. R results, plus rejection cases."""
    from numpy import inf

    # R: `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
    observed = Vectors.dense([4, 6, 5])
    fit = Statistics.chiSqTest(observed)
    self.assertEqual(fit.statistic, 0.4)
    self.assertEqual(fit.degreesOfFreedom, 2)
    self.assertAlmostEqual(fit.pValue, 0.8187, 4)

    # Observed/expected with different totals:
    # R: `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
    observed1 = Vectors.dense([21, 38, 43, 80])
    expected1 = Vectors.dense([3, 5, 7, 20])
    fit1 = Statistics.chiSqTest(observed1, expected1)
    self.assertAlmostEqual(fit1.statistic, 14.1429, 4)
    self.assertEqual(fit1.degreesOfFreedom, 3)
    self.assertAlmostEqual(fit1.pValue, 0.002717, 4)

    # Vectors of different sizes are a client-side ValueError.
    self.assertRaises(ValueError, Statistics.chiSqTest,
                      Vectors.dense([1.0, 2.0, 3.0]),
                      Vectors.dense([1.0, 2.0, 3.0, 4.0]))

    # Negative observed counts are rejected.
    self.assertRaises(IllegalArgumentException, Statistics.chiSqTest,
                      Vectors.dense([1.0, 2.0, 3.0, -4.0]), expected1)

    # Zero expected (nonzero observed) -> infinite statistic, p-value 0.
    zero_expected = Vectors.dense([1.0, 0.0, 3.0])
    inf_fit = Statistics.chiSqTest(observed, zero_expected)
    self.assertEqual(inf_fit.statistic, inf)
    self.assertEqual(inf_fit.degreesOfFreedom, 2)
    self.assertEqual(inf_fit.pValue, 0.0)

    # Zeros in observed and expected at the same position are rejected.
    self.assertRaises(IllegalArgumentException, Statistics.chiSqTest,
                      Vectors.dense([2.0, 0.0, 1.0]), zero_expected)
def column_statistics(data: pyspark.rdd.RDD):
    """Compute per-column means and variances of an RDD of numeric vectors.

    :param data: an RDD of equal-length numeric vectors
    :return: tuple of (column means vector, column variances vector)
    """
    logger.info("Computing data statistics")
    stats = Statistics.colStats(data)
    return stats.mean(), stats.variance()
def column_means(data: pyspark.rdd.RDD):
    """Compute the vector of per-column means of an RDD of numeric vectors.

    :param data: an RDD of equal-length numeric vectors
    :return: column means as a vector
    """
    logger.info("Computing data means")
    stats = Statistics.colStats(data)
    return stats.mean()
def average_vector(data):
    """Mean vector of the non-null entries of the "vectors" column."""
    from pyspark.sql.functions import col
    from pyspark.mllib.linalg import Vectors
    from pyspark.mllib.stat import Statistics

    non_null = data.select("vectors").where(col("vectors").isNotNull())
    dense_rows = non_null.map(lambda line: Vectors.dense(line))
    mean = Statistics.colStats(dense_rows).mean()
    logger.info(mean)
    return mean
def DropColsByCor(df, cor_cutoff):
    """Drop numeric columns whose pairwise Pearson correlation exceeds
    ``cor_cutoff``, keeping one representative per correlated group.

    Uses a disjoint-set union (external ``union``/``root`` helpers) to
    group correlated columns; each group's root column is kept, the rest
    are dropped. String columns are set aside and re-joined at the end.

    :return: (dict mapping kept column -> its correlated group, reduced df)
    """
    tdf = df
    dsu_dict = {}
    string_cols = []
    # Separate string columns — Statistics.corr needs numeric data only.
    for (a, b) in df.dtypes:
        if b == 'string':
            string_cols.append(a)
    for cols in string_cols:
        tdf = tdf.drop(cols)
    num_cols = len(tdf.columns)
    # Disjoint-set arrays: parent pointer and component size.
    dsu = [i for i in range(num_cols)]
    size = [1 for i in range(num_cols)]
    features = tdf.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(features, method="pearson")
    # Union every pair correlated above the cutoff (lower triangle only).
    # NOTE(review): only positive correlations trigger a union; strongly
    # negative pairs (< -cutoff) pass through — confirm intended.
    for i in range(num_cols):
        for j in range(i):
            if corr_mat[i][j] > cor_cutoff:
                union(dsu, size, i, j)
    drop_cols = []
    for i in range(num_cols):
        # Non-root columns get dropped; roots seed the output groups.
        if dsu[i] != i:
            drop_cols.append(tdf.columns[i])
        #Setting up dictionary to save up on iterations
        if dsu[i] == i:
            dsu_dict[tdf.columns[i]] = [tdf.columns[i]]
    for i in range(num_cols):
        if dsu[i] != i:
            ri = root(dsu, i)
            dsu_dict[tdf.columns[ri]].append(tdf.columns[i])
    for cols in drop_cols:
        tdf = tdf.drop(cols)
    string_df = df.select(string_cols)
    #Adding index to help merge both string and numeric dataframes
    tdf = tdf.withColumn("RowNoIndex", monotonically_increasing_id())
    string_df = string_df.withColumn("RowNoIndex", monotonically_increasing_id())
    tdf = tdf.join(string_df, ['RowNoIndex'])
    tdf = tdf.drop('RowNoIndex')
    return dsu_dict, tdf
def corrFilter(df, col, excludeCols, target):
    """Shortlist columns correlated with ``target``.

    For each candidate column (not excluded), keeps it when its chi-square
    p-value is below 0.05 and its truncated correlation with ``target``
    falls outside [-0.03, 0.03]; returns a DataFrame of (column, pearson)
    sorted by correlation, descending.

    Fixes vs. the original: operates on the ``df`` parameter (the original
    used a stray global ``train``); tests each candidate column (the
    original hard-coded ``'C2'``); no longer shadows the ``col`` parameter
    with the loop variable; column keys 'colNmae'/'pearson ' (typo and
    trailing space) corrected; sorts by the 'pearson' column that actually
    exists (the original sorted by 'spearman', which always raised
    KeyError).
    """
    useful_cols = []
    corr_scores = []
    for candidate in df.select(col).columns:
        if candidate in excludeCols:
            continue
        # Keep only columns whose distribution is significant per chi-square.
        if Statistics.chiSqTest(df.select(candidate).collect()).pValue < 0.05:
            # Truncate the printed correlation to 5 characters, as before.
            score = float(str(df.stat.corr(candidate, target))[0:5])
            if score > 0.03 or score < -0.03:
                useful_cols.append(candidate)
                corr_scores.append(score)
    pearsonTable = pd.DataFrame({'colName': useful_cols, 'pearson': corr_scores})
    pearsonTable.sort_values(by='pearson', ascending=False, inplace=True)
    return pearsonTable
def recommend2user(self, user_id):
    """Recommend up to 10 unseen pages for ``user_id``.

    Fetches the user's page views from the last 21 days via the remote CQL
    API, labels already-seen pages, builds a mean topic vector from them,
    and ranks all unseen pages by dot-product similarity with that vector.

    Fix vs. the original: the helper is defined as ``QueryXXXXX`` but was
    called as the undefined name ``QueryCoola`` (NameError at runtime);
    the call now matches the definition.
    """
    query = '''select page_id from cooladata where date_range(last 21 days) and user_id = {:d} and page_id is not null group by page_id;'''.format(user_id)

    def SQLtoURL(query):
        # Collapse whitespace so the query survives URL transport.
        # NOTE(review): the space-to-space replaces below look like
        # multi-space collapses flattened by formatting — confirm.
        data = query.replace('\n', ' ').replace('\t', ' ').replace(' ', ' ').replace(' ', ' ')
        return data

    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(
            data={'tq': query, },
            url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
            headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
        return response.content

    # Fixed: call the helper defined above (was the undefined QueryCoola).
    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        # Unwrap the API's {'v': value} cell wrappers.
        rowlist = [d['v'] for d in row]
        return rowlist

    rd = self.sc.parallelize(title_list).map(convert_row)
    historyTitleData = self.spark.createDataFrame(rd, table_cols)
    historyTitleData = historyTitleData.dropna()
    self.model.createOrReplaceTempView("Database")
    historyTitleData.registerTempTable("historyTable")
    # Label each page: 1 if the user already saw it, else 0.
    pageVectorHistory = self.spark.sql('''select d.page_id, d.normTopicDist, case when h.page_id is null then 0 else 1 end as label from Database as d left join historyTable as h on d.page_id = h.page_id''')
    # Mean topic vector over pages the user has already read.
    mainRdd = pageVectorHistory[pageVectorHistory['label'] == 1][['normTopicDist']].rdd.map(lambda x: x['normTopicDist'].toArray())
    mainVec = Statistics.colStats(mainRdd).mean()
    # Score unseen pages by similarity with the user's mean topic vector.
    pageRank = pageVectorHistory[pageVectorHistory['label'] == 0].rdd.map(
        lambda row: (row['page_id'], float(np.dot(mainVec, row['normTopicDist'].toArray()))))
    pager = pageRank.toDF()
    pager.createOrReplaceTempView("pager")
    sortPageR = self.sqlctx.sql('''select _1 as page_id, _2 as similarity from pager order by similarity desc''')
    return sortPageR.take(10)
def create_or_update_week(influencer_tweets, topic_tweets, week): topic_cor = [] influencer_cor = [] for t in topic_tweets: for i in influencer_tweets: if t['time'] == i['time']: topic_cor.append(t['count']) influencer_cor.append(i['count']) if len(topic_cor)<=1: corr = 0 else: sc = SparkContext(appName="CorrelationPerWeek") topic_tweets = sc.parallelize(topic_cor) influencer_tweets = sc.parallelize(influencer_cor) corr = Statistics.corr(topic_tweets, influencer_tweets, "pearson") sc.stop() url = "http://localhost:8000/api/weeks/" today = datetime.fromtimestamp(week/1000.0) payload = '{ "score": %f, "start_date": "%s" }' % ( float(corr), str(today.year) + "-" + str(today.month) + "-" + str(today.day)) headers = { 'authorization': "Basic ZGV2OjEyMzQ=", 'content-type': "application/json", 'cache-control': "no-cache", 'postman-token': "7c8668c0-a4c2-f42d-66a9-95cbfb7806c5" } try: response = requests.request("POST", url, data=payload, headers=headers) return response.json()['id'] except: print "error" return 1
def test_chi_sq_pearson(self):
    """Feature-wise chi-square results must not depend on partitioning."""
    points = [
        LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
        LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
        LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
        LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
        LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
        LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),
    ]
    # The same expected values must hold for every partition count.
    for num_parts in (2, 4, 6, 8):
        chi = Statistics.chiSqTest(self.sc.parallelize(points, num_parts))
        first, second = chi[0], chi[1]
        self.assertEqual(first.statistic, 0.75)
        self.assertEqual(first.degreesOfFreedom, 2)
        self.assertAlmostEqual(first.pValue, 0.6873, 4)
        self.assertEqual(second.statistic, 1.5)
        self.assertEqual(second.degreesOfFreedom, 3)
        self.assertAlmostEqual(second.pValue, 0.6823, 4)
def test_matrix_independence(self):
    """Independence test on a 3x4 matrix; malformed inputs raise Py4JJavaError."""
    cell_values = [40.0, 24.0, 29.0,
                   56.0, 32.0, 42.0,
                   31.0, 10.0, 0.0,
                   30.0, 15.0, 12.0]
    chi = Statistics.chiSqTest(Matrices.dense(3, 4, cell_values))
    # Results validated against R command
    # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
    self.assertAlmostEqual(chi.statistic, 21.9958, 4)
    self.assertEqual(chi.degreesOfFreedom, 6)
    self.assertAlmostEqual(chi.pValue, 0.001213, 4)

    # Negative counts
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest,
                      Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0]))
    # Row sum = 0.0
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest,
                      Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0]))
    # Column sum = 0.0
    self.assertRaises(Py4JJavaError, Statistics.chiSqTest,
                      Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0]))
def get_language_correlation(): """ calculates the correlation between github languages """ #Create Spark Context sc = SparkContext(appName="LanguageCorrelations") #Create SQL Context sqlCtx = SQLContext(sc) #Create a schemaRDD from json datasets stored in HDFS pushes = sqlCtx.jsonFile('git_14_15/git_results') #Register the schemaRDD as a Table pushes.registerTempTable('pushes') #filter the data to get the pushes for the languages from LANG filtered = sqlCtx.sql('select * from pushes where repository_language in ' + str(tuple(LANG))) #perform map transformation to get the rdd in the format (actor, {lang : pushes}) f_pair = filtered.map(lambda s: (s.actor, {s.repository_language:s.pushes})) #group the RDD's based on actor to get the RDD of the format (actor, [{lang1 : pushes},{lang2 : pushes}...]) f_group = f_pair.groupByKey() #merge lang dictionries to get single orderd dict per actor f_merged = f_group.map(lambda s: merge_lang_dict(s[1])) #created rdd of vectors from the pushes values, which is required for the correlation algorithm vectors = f_merged.map(lambda s: Vectors.dense(map(float, s.values()))) #call the correlation function matrix = Statistics.corr(vectors) print matrix plot_graph(matrix) sc.stop()
# Vectorize the training bigrams, then chi-square-select the useful ones.
print "Converting bigrams to sparse vectors in a dataframe for the train set"
t0 = time()
features = dfTrain.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(schema)
features.take(1)  # force evaluation so the timing below is meaningful
tt = time() - t0
print "Done in {} second".format(round(tt, 3))


# In[323]:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

print "Computing the chi vector"
t0 = time()
# One LabeledPoint per row: the label plus its sparse bigram vector.
labeledPoints = features.map(lambda row: LabeledPoint(row.label, row.bigramVectors))
chi = Statistics.chiSqTest(labeledPoints)
tt = time() - t0
print "Done in {} second".format(round(tt, 3))


# In[324]:

print "Starting bigram selection,broadcasting the newly created bigram dictionary"
t0 = time()
# Keep bigrams with chi-square p-value <= 0.3, then re-index them densely
# and broadcast the reduced dictionary to the workers.
biSelect = [revDict_broad.value[i] for i, bigram in enumerate(chi) if bigram.pValue <= 0.3]
dictSelect = {}
for i, bigram in enumerate(biSelect):
    dictSelect[bigram] = i
dictSel_broad = sc.broadcast(dictSelect)
tt = time() - t0
print "Done in {} second".format(round(tt, 3))
""" Testing with Correlation https://spark.apache.org/docs/latest/mllib-statistics.html """ from pyspark.mllib.stat import Statistics from pyspark import SparkContext from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.linalg import SparseVector, Vectors sc = SparkContext("local", "Rubbish") seriesX = sc.parallelize([1.0, 2.0, -2.0], 2) seriesY = sc.parallelize([3.0, 4.0, 5.0], 2) corrXY = Statistics.corr(seriesX, seriesY, method="pearson") # RDD of Vectors data = sc.parallelize([Vectors.dense([2, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]), Vectors.dense([6, 7, 0, 8])]) print "Correlation between x & y: ", corrXY print "Correlation matrix: ", data
from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat import Statistics sc = SparkContext("local", "Rubbish") """ # RDD of Vectors data = sc.parallelize([Vectors.dense([2, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]), Vectors.dense([6, 7, 0, 8])]) """ # Sample vector composing of frequency of events vect = Vectors.dense([4,5,0,3]) # Summary of the test including the p-value, degrees of freedom, goodnessOfFitTestResult = Statistics.chiSqTest(vect) sampleData = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0] matrix = Matrices.dense(3,4, sampleData) # Conduct Pearson's independence test on the input contingency matrix independenceTestResult = Statistics.chiSqTest(matrix) # Test statistic, the method used, and the null hypothesis. print "SINGLE VECTOR FIT: " print goodnessOfFitTestResult ## Summary of the test including the p-value, degrees of freedom. print "INDEPENDENCE TEST RESULT: " print independenceTestResult
import numpy as np
from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="CorrelationsExample")  # SparkContext

    # $example on$
    # Two series; seriesY must have the same number of partitions and
    # cardinality as seriesX.
    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

    # Pearson correlation between the two series. Pass "spearman" for
    # Spearman's method; Pearson is the default when none is specified.
    pearson_xy = Statistics.corr(seriesX, seriesY, method="pearson")
    print("Correlation is: " + str(pearson_xy))

    # An RDD of Vectors, one dense NumPy row per record.
    rows = [
        np.array([1.0, 10.0, 100.0]),
        np.array([2.0, 20.0, 200.0]),
        np.array([5.0, 33.0, 366.0]),
    ]
    data = sc.parallelize(rows)

    # Pairwise correlation matrix of the columns using Pearson's method
    # ("spearman" selects Spearman's; Pearson is the default).
    print(Statistics.corr(data, method="pearson"))
    # $example off$

    sc.stop()
# NOTE(review): this chunk begins mid-function -- the three lines below are the
# tail of a date-parsing helper (its `def` header is not visible here), which
# returns the ISO calendar week number of a "MM?DD?YYYY"-style string.
    day = int(x[3:5])
    year = int(x[6:10])
    return(datetime.date(year,month,day).isocalendar()[1])

# Crime types treated as violent for the flag pair below.
violent = ["ASSAULT","BATTERY","CRIM SEXUAL ASSAULT", "DOMESTIC VIOLENCE", "HOMICIDE", "KIDNAPPING"]

def setFlags(x):
    # Return a (non_violent, violent) indicator pair for one crime type.
    if x in violent:
        return (0,1)
    else:
        return (1,0)

# (beat, year, week, count=1, (non_violent, violent)) per record.
beats = parts.map(lambda p:(p[10],p[2][6:10],getWeek(p[2]),1,setFlags(p[5])))
# Keep only 2015 and key by (beat, week); values are (count, nv, v).
beats2 = beats.filter(lambda x:x[1]=="2015").map(lambda x:((x[0],x[2]),(x[3],x[4][0],x[4][1])))
# Sum counts and flags per (beat, week).
beats3 = beats2.reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1],x[2]+y[2]))

# Dense vectors of (week, total, non_violent, violent) for column statistics.
standard_vars = beats3.map(lambda row: Vectors.dense((row[0][1],row[1][0],row[1][1],row[1][2])))
summary = Statistics.colStats(standard_vars)
# Column 0 is the week number; columns 1-3 are the three counts.
mean_wn = summary.mean()[0]
sd_wn = math.sqrt(summary.variance()[0])
mean_counts = list(summary.mean()[1:4])
sd_counts = list(np.sqrt(summary.variance()[1:4]))

# Z-score-standardize week number and the three counts per beat.
beats_standard = beats3.map(lambda x: (x[0][0],(x[0][1]-mean_wn)/(sd_wn),(x[1][0]-mean_counts[0])/sd_counts[0],(x[1][1]-mean_counts[1])/sd_counts[1], \
    (x[1][2]-mean_counts[2])/sd_counts[2]))
# Distinct beat ids, truncated to the first 50.
beats_list = beats_standard.map(lambda x: ((x[0]),1)).keys().distinct().collect()
beats_list = beats_list[0:50]

def parsePoint(tuple):
    # First field is the label; the rest are features.
    values = [float(x) for x in tuple]
    return LabeledPoint(values[0], values[1:])

def deNorm(val,mean,sd):
    # Invert the z-score standardization applied above.
    return(val*sd + mean)

# Standardized week numbers for the prediction (21) and current (20) weeks.
maxWeek = (21 - mean_wn) / sd_wn
curWeek = (20 - mean_wn) / sd_wn
merged_final = merged.reduceByKey(lambda x,y : int(x) + int(y)) #sort by month-year # Map each year to all beats and their corresponding crime counts for that year, and sort the counts # by beat groupedbeatCountsbymonthyear = merged_final.map( lambda row: ( row[ 0 ][ 1 ], ( row[ 0 ][ 0 ], row[ 1 ] ) ) ) \ .groupByKey( ) \ .mapValues( lambda val: sorted( list( val ), key = lambda t: t[ 0 ] ) ); # Create a list of all beats groupbeats = [ elem[ 0 ] for elem in groupedbeatCountsbymonthyear.values( ).first( ) ]; beatvectorCounts = groupedbeatCountsbymonthyear.values( ) \ .map( lambda row: Vectors.dense( [ elem[ 1 ] for elem in row ] ) ); # Compute correlation between all beats for yearly crime counts corrMatrix = Statistics.corr( beatvectorCounts, method = 'pearson' ); # Fill the diagonal of correlation matrix with 0's corrMatrix.flags[ 'WRITEABLE' ] = True; np.fill_diagonal( corrMatrix, 0.0 ); # Get the 10 largest correlation values from the matrixr The correlation matrix is symmetric so # we take the largest 20 and step by 2. Finally, the index of the corresponding beat pairs for # top 10 correlation values is obtained. sortOrder = corrMatrix.argsort( axis = None ); indices = np.unravel_index( sortOrder[ -20::2 ], corrMatrix.shape ); # The corresponding beats names are obtained for the top 10 correlated beat pairs topBeatPairs = [ ( groupbeats[ i ], groupbeats[ j ] ) for i, j in zip( indices[ 0 ], indices[ 1 ] ) ]; for i, j in topBeatPairs:
from pyspark import SparkContext # $example on$ from pyspark.mllib.linalg import Matrices, Vectors from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat import Statistics # $example off$ if __name__ == "__main__": sc = SparkContext(appName="HypothesisTestingExample") # $example on$ vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events # compute the goodness of fit. If a second vector to test against # is not supplied as a parameter, the test runs against a uniform distribution. goodnessOfFitTestResult = Statistics.chiSqTest(vec) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. print("%s\n" % goodnessOfFitTestResult) mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix # conduct Pearson's independence test on the input contingency matrix independenceTestResult = Statistics.chiSqTest(mat) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. print("%s\n" % independenceTestResult) obs = sc.parallelize(
def dist_corr(v1, v2):
    """
    Return the correlation between two Spark RDDs of numeric values.

    Delegates to ``pyspark.mllib.stat.Statistics.corr`` with its default
    (Pearson) method; the two RDDs must have the same cardinality.
    """
    correlation = Statistics.corr(v1, v2)
    return correlation
# Load input data print("Loading LIBSVM file with UDT from " + input + ".") df = spark.read.format("libsvm").load(input).cache() print("Schema from LIBSVM:") df.printSchema() print("Loaded training data as a DataFrame with " + str(df.count()) + " records.") # Show statistical summary of labels. labelSummary = df.describe("label") labelSummary.show() # Convert features column to an RDD of vectors. features = MLUtils.convertVectorColumnsFromML(df, "features") \ .select("features").rdd.map(lambda r: r.features) summary = Statistics.colStats(features) print("Selected features column with average values:\n" + str(summary.mean())) # Save the records in a parquet file. tempdir = tempfile.NamedTemporaryFile(delete=False).name os.unlink(tempdir) print("Saving to " + tempdir + " as Parquet file.") df.write.parquet(tempdir) # Load the records back. print("Loading Parquet file with UDT from " + tempdir) newDF = spark.read.parquet(tempdir) print("Schema from Parquet:") newDF.printSchema() try:
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from __future__ import print_function from pyspark import SparkContext # $example on$ import numpy as np from pyspark.mllib.stat import Statistics # $example off$ if __name__ == "__main__": sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext # $example on$ mat = sc.parallelize( [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])] ) # an RDD of Vectors # Compute column summary statistics. summary = Statistics.colStats(mat) print(summary.mean()) # a dense vector containing the mean value for each column print(summary.variance()) # column-wise variance print(summary.numNonzeros()) # number of nonzeros in each column # $example off$ sc.stop()
from datetime import datetime

# BUG FIX (imports): SparkContext, numpy and Statistics were used below but
# never imported in this script.
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.stat import Statistics

sc = SparkContext(appName= "Run 1 - Corr-Wide - Data95-08 - AWS")

data_file = "s3://aws-logs-012060642840-us-west-2/elasticmapreduce/cloud_proj/95-08.csv"
raw_data = sc.textFile (data_file).cache ()

#extract header
header = raw_data.first ()
raw_data = raw_data.filter (lambda x:x != header)

def parse_interaction(line):
    """Parse one CSV row into a numeric NumPy array.

    Replaces literal 'NA' tokens with '0' and drops the year and other
    non-numeric columns before casting everything to float.
    """
    #split lines based on the delimeter, and create a list
    line_split = line.split (",")
    #replace NA with zeros
    line_split = [w.replace ('NA', '0') for w in line_split]
    #remove year, and other non-numeric data (index 0 = Year)
    symbolic_indexes = [0, 8, 10, 16, 17, 22]
    clean_line_split = [item for i,item in enumerate (line_split) if i not in symbolic_indexes]
    return np.array ([float (x) for x in clean_line_split])

vector_data = raw_data.map (parse_interaction)

#start timer at this point
startTime = datetime.now()
print (Statistics.corr (vector_data, method="pearson"))
# BUG FIX: the old form `print ('Time consumed = '), (expr)` never printed the
# elapsed time under Python 3 (the timedelta ended up in a discarded tuple).
# String concatenation prints label + duration identically on Py2 and Py3.
print ('Time consumed = ' + str (datetime.now() - startTime))
sc.stop()
##### En trichant ##### # Utilisation de pandas pour résumer les données + afficher la matrice de corrélation df = pd.read_csv("file:/C:/spark-1.6.0-bin-hadoop2.4/"+nomF+".csv", sep = ";",header=0) df.describe() # Matrice de corrélation # print(df.corr()) # ### Mllib Statistics # In[5]: from pyspark.mllib.stat import Statistics # Basics Statistics partsNum = parts.map(lambda line: line[0:8]) summary = Statistics.colStats(partsNum) print(summary.mean()) print(summary.variance()) print(summary.numNonzeros()) Statistics.corr(partsNum, method="pearson") # # Classification supervisée # ## Naive Bayes # In[6]: from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel import utils_mesure nomF_svm = "glass_svm"
def readRankMatrix():
    """Load the transposed Yelp rank matrix as an RDD of numeric rows.

    Relies on the module-level helpers ``removeHeader``/``removeColumns``
    and the active SparkContext ``sc``.
    """
    # (removed an unused local `import numpy as np`; numpy is imported at
    # module level below)
    lines = sc.textFile('../yelp_trans.csv')
    rawData = lines.mapPartitionsWithIndex(removeHeader)
    mydata = rawData.map(removeColumns).cache()
    return mydata

from pyspark.mllib.stat import Statistics
from pandas import Series
import pandas as pd
import numpy as np
import math

mydata = readRankMatrix()
# Pairwise correlation matrix between user columns (Pearson by default).
corr = Statistics.corr(mydata)

# set up the columns names and add a new names called user_id
lines2 = sc.textFile('../yelp.csv')
names = lines2.map(lambda line:line.split(",")).map(lambda a:a[0]).collect()[1:]
# FIX: the original `Series([str for str in names])` shadowed the builtin
# `str` and the comprehension was just a copy -- pass the list directly.
s = Series(names)
pddata = pd.DataFrame(corr, columns=s)
pddata['user_id'] = names

df_corr = sqlContext.createDataFrame(pddata)
# df_corr.cache()
df_corr.registerTempTable("corr")

def getTopReviewUsers(n):
    # n: the nth highest user
    # NOTE(review): chunk ends here; the rest of this function is not visible.
    ord_user = sqlContext.sql("select user_id, count(review_id) as count from reviews_json group by user_id order by count desc")
For dense vectors, MLlib uses either Python lists or the NumPy array type. The
latter is recommended, so you can simply pass NumPy arrays around.

For sparse vectors, users can construct a SparseVector object from MLlib or
pass SciPy scipy.sparse column vectors if SciPy is available in their
environment. The easiest way to create sparse vectors is to use the factory
methods implemented in Vectors.
"""

def parse_interaction (line):
    """Parse one CSV row into a numeric NumPy array (non-numeric columns dropped)."""
    #split lines based on the delimeter, and create a list
    line_split = line.split (",")
    #replace NA with zeros
    line_split = [w.replace ('NA', '0') for w in line_split]
    #line_split = [w.replace ('', '0') for w in line_split]
    #keep all except year, and non-numeric values
    symbolic_indexes = [0, 8, 10,16, 17, 22]
    clean_line_split = [item for i,item in enumerate (line_split) if i not in symbolic_indexes]
    return np.array ([float (x) for x in clean_line_split])

vector_data = raw_data.map (parse_interaction)

#start timer at this point
startTime = datetime.now()
# Single-pass column summary statistics over the whole dataset.
summary = Statistics.colStats(vector_data)
# NOTE(review): the `print (label), value` comma form below prints both parts
# on Python 2 only; on Python 3 the value lands in a discarded tuple and is
# never shown. Confirm which interpreter this script targets.
print ('Time consumed = '), (datetime.now() - startTime)
print ('Mean of columns\n'), summary.mean ()
print ('Variances of columns\n'), summary.variance()
print ('Non zero values\n'), summary.numNonzeros()
print ('Max value\n'), summary.max ()
print ('Min value\n'), summary.min ()
bundle_pearson_dict = {} #dictionary to hold the bundle as key and the coeff as value for bundle_name in actual_bundle_list: final_table_by_bundle = sqlContext.sql("select * from final_table_sorted where bundle = \""+bundle_name+"\"") food_metric_only= final_table_by_bundle.map(lambda p: p.zip_AGI_foodmetric[2]) food_metric_list = food_metric_only.collect() weighted_AGI_only= final_table_by_bundle.map(lambda p: p.zip_AGI_foodmetric[1]) weighted_AGI_list = weighted_AGI_only.collect() if not food_metric_list and not weighted_AGI_list: print 'pass' else: x=sc.parallelize(weighted_AGI_list,2) y=sc.parallelize(food_metric_list,2) correlation_coeff = Statistics.corr(x,y, method="pearson") # -0.128161962745 or is it -0.0965926041863?? bundle_pearson_dict[bundle_name]= correlation_coeff bundle_pearson_dict #to get all coeff values by bundle # In[53]: #Here I have an example scatter plot for bundle_name = 'vegetables' to have an idea of how the plot looks # x is the AGI for every zip code # y is the food metric #an example plot is also available to be viewed in the parent folder final_table_by_bundle = sqlContext.sql("select * from final_table_sorted where bundle = 'vegetables'") food_metric_only= final_table_by_bundle.map(lambda p: p.zip_AGI_foodmetric[2]) food_metric_list = food_metric_only.collect()
from pyspark.sql import HiveContext from pyspark.mllib.stat import Statistics from pyspark import SparkContext sc = SparkContext() sqlContext = HiveContext(sc) initialquery = sqlContext.sql("SELECT A.avg_procedure_score, B.patientsurveyscore FROM (SELECT p.hospitalid, avg(p.score) as avg_procedure_score FROM procedures p GROUP BY p.hospitalid) A JOIN survey_results B ON B.hospitalid = A.hospitalid") survey_score = initialquery.map(lambda x: x.patientsurveyscore) avg_procedure_scores = initialquery.map(lambda x: x.avg_procedure_score) print Statistics.corr(avg_procedure_scores, survey_score, method="pearson")