def spark_pearson(a, b):
    # `func` is assumed to be defined in the enclosing scope (the function
    # under test); its globals receive the computed correlation values.
    rdd_a = sc.parallelize(a)
    rdd_b = sc.parallelize(b)
    g = func.func_globals
    g['pearson'] = Statistics.corr(rdd_a, rdd_b, 'pearson')
    g['rho'] = Statistics.corr(rdd_a, rdd_b, 'spearman')
    func(a, b)
Example #3
    def test_R_implementation_equivalence(self):
        data = self.sc.parallelize(
            [
                1.1626852897838,
                -0.585924465893051,
                1.78546500331661,
                -1.33259371048501,
                -0.446566766553219,
                0.569606122374976,
                -2.88971761441412,
                -0.869018343326555,
                -0.461702683149641,
                -0.555540910137444,
                -0.0201353678515895,
                -0.150382224136063,
                -0.628126755843964,
                1.32322085193283,
                -1.52135057001199,
                -0.437427868856691,
                0.970577579543399,
                0.0282226444247749,
                -0.0857821886527593,
                0.389214404984942,
            ]
        )
        model = Statistics.kolmogorovSmirnovTest(data, "norm")
        self.assertAlmostEqual(model.statistic, 0.189, 3)
        self.assertAlmostEqual(model.pValue, 0.422, 3)

        model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
        self.assertAlmostEqual(model.statistic, 0.189, 3)
        self.assertAlmostEqual(model.pValue, 0.422, 3)
Example #4
def CorrelationFeature(vectors):

    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))

    summary = Statistics.colStats(vectors)

    varianza = summary.variance()

    #########new heuristic diogo proposal
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = varianza[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()],
               reverse=True)  #features sorted

    index = []
    for i in r:
        index.append(i[1])

    index = index[0:6]  #taking the first 6 features

    return index
Example #5
    def run4(self):
        from my_fun import parse_interaction,parse_interaction_with_key,summary_by_label

        raw_data = self.raw_data
        vector_data = raw_data.map(parse_interaction)
        # Compute column summary statistics.
        summary = Statistics.colStats(vector_data)

        print "Duration Statistics:"
        print " Mean: {}".format(round(summary.mean()[0],3))
        print " St. deviation: {}".format(round(sqrt(summary.variance()[0]),3))
        print " Max value: {}".format(round(summary.max()[0],3))
        print " Min value: {}".format(round(summary.min()[0],3))
        print " Total value count: {}".format(summary.count())
        print " Number of non-zero values: {}".format(summary.numNonzeros()[0])

        label_vector_data = raw_data.map(parse_interaction_with_key)
        normal_label_data = label_vector_data.filter(lambda x: x[0]=="normal.")

        normal_summary = Statistics.colStats(normal_label_data.values())

        print "Duration Statistics for label: {}".format("normal")
        print " Mean: {}".format(normal_summary.mean()[0],3)
        print " St. deviation: {}".format(round(sqrt(normal_summary.variance()[0]),3))
        print " Max value: {}".format(round(normal_summary.max()[0],3))
        print " Min value: {}".format(round(normal_summary.min()[0],3))
        print " Total value count: {}".format(normal_summary.count())
        print " Number of non-zero values: {}".format(normal_summary.numNonzeros()[0])

        normal_sum = summary_by_label(raw_data, "normal.")

        print "Duration Statistics for label: {}".format("normal")
        print " Mean: {}".format(normal_sum.mean()[0],3)
        print " St. deviation: {}".format(round(sqrt(normal_sum.variance()[0]),3))
        print " Max value: {}".format(round(normal_sum.max()[0],3))
        print " Min value: {}".format(round(normal_sum.min()[0],3))
        print " Total value count: {}".format(normal_sum.count())
        print " Number of non-zero values: {}".format(normal_sum.numNonzeros()[0])

        label_list = ["back.","buffer_overflow.","ftp_write.","guess_passwd.",
                      "imap.","ipsweep.","land.","loadmodule.","multihop.",
                      "neptune.","nmap.","normal.","perl.","phf.","pod.","portsweep.",
                      "rootkit.","satan.","smurf.","spy.","teardrop.","warezclient.",
                      "warezmaster."]
        stats_by_label = [(label, summary_by_label(raw_data, label)) for label in label_list]

        duration_by_label = [
            (stat[0], np.array([float(stat[1].mean()[0]), float(sqrt(stat[1].variance()[0])), float(stat[1].min()[0]), float(stat[1].max()[0]), int(stat[1].count())]))
            for stat in stats_by_label]

        pd.set_option('display.max_columns', 50)

        stats_by_label_df = pd.DataFrame.from_items(duration_by_label, columns=["Mean", "Std Dev", "Min", "Max", "Count"], orient='index')

        print "Duration statistics, by label"
        print stats_by_label_df
Example #6
    def test_col_norms(self):
        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, len(summary.normL1()))
        self.assertEqual(10, len(summary.normL2()))

        data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
        summary2 = Statistics.colStats(data2)
        self.assertEqual(array([45.0]), summary2.normL1())
        import math
        expectedNormL2 = math.sqrt(sum(map(lambda x: x*x, range(10))))
        self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
Example #8
 def test_col_with_different_rdds(self):
     # numpy
     data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
     summary = Statistics.colStats(data)
     self.assertEqual(1000, summary.count())
     # array
     data = self.sc.parallelize([range(10)] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
     # array
     data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
Example #10
def CorrelationFeature(vectors):

    #	print 'Calculation Correlation'

    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))

    summary = Statistics.colStats(vectors)

    varianza = summary.variance()

    #########new heuristic diogo proposal
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = varianza[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()],
               reverse=True)  #features sorted

    #print r

    #	print 'calculating features selections'

    #Old heuristic
    # # w={}
    # # for i in range(len(matriz)):
    # # 	w[i]=0
    # # 	for j in np.nan_to_num(matriz[i]):
    # # 		k=abs(j)
    # # 		w[i]=w[i]+k

    # r=sorted([(value,key) for (key,value) in w.items()],reverse=True)

    #####""
    #vectors=np.matrix(vectors)
    #beforeMatrix=vectors.map(lambda x: np.matrix(x))

    index = []
    for i in r:
        index.append(i[1])

    index = index[0:6]  #taking the first 6 features

    #MatrixReducer(vectors,index)
    return index
Example #11
    def test_R_implementation_equivalence(self):
        data = self.sc.parallelize([
            1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
            -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
            -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
            -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
            0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
        ])
        model = Statistics.kolmogorovSmirnovTest(data, "norm")
        self.assertAlmostEqual(model.statistic, 0.189, 3)
        self.assertAlmostEqual(model.pValue, 0.422, 3)

        model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
        self.assertAlmostEqual(model.statistic, 0.189, 3)
        self.assertAlmostEqual(model.pValue, 0.422, 3)
Example #12
def calculateStats(years2stats, op):
	result = dict()
	for year in years2stats:
		stats = sc.parallelize(years2stats[year])
		summary = Statistics.colStats(stats)
		if op == 'mean':
			means = summary.mean()
			valuesList = []
			for singleElement in means:
				valuesList.append(str(singleElement).rstrip())
			result[year] = valuesList
		if op == 'variance':
			variances = summary.variance()
			valuesList = []
			for singleElement in variances:
				valuesList.append(str(singleElement).rstrip())
			result[year] = valuesList
		if op == 'max':
			maxValue = summary.max()
			valuesList = []
			for singleElement in maxValue:
				valuesList.append(str(singleElement).rstrip())
			result[year] = valuesList
		if op == 'min':
			minValue = summary.min()
			valuesList = []
			for singleElement in minValue:
				valuesList.append(str(singleElement).rstrip())
			result[year] = valuesList
	return result
Example #13
def generateFeatureClusters(context, geneExp, samples, headers, numClusters):

    # Ignore the first item (the diagnosis header)
    headers = headers[1:]
    # 1) Generate statistic data for each of the genes/entrez ids

    # Retrieve the mean, variance, max and min of each gene
    # The entrez id associate with each gene is the row index (matches to the headers index)
    cStats = Statistics.colStats(geneExp)
    print(len(cStats.mean()))
    data = np.array(
        [cStats.mean(),
         cStats.variance(),
         cStats.max(),
         cStats.min()]).transpose()
    # Create a stats array with the index as first column
    # e_id for e_id in headers
    dataWithIndex = np.array([[e_id for e_id in headers],
                              cStats.mean(),
                              cStats.variance(),
                              cStats.max(),
                              cStats.min()]).transpose()
    print(dataWithIndex.shape)
    # 2) Create dataframes that will be used to train KMeans

    # Create dataframe for the stats data (with no entrez ids)
    df = context.parallelize(data)
    # create dataframe for the stats data (with entrez ids)
    # Will be used to cluster features later
    dfWithIndex = context.parallelize(dataWithIndex)

    # 3) Train KMeans with statistic data
    # use the stats data to discover clusters for the genes
    model = KMeans.train(df,
                         numClusters,
                         maxIterations=100,
                         initializationMode="random")

    # 4) save model
    model.save(context, './models/clusters')

    # 5) Label each feature with their cluster
    # For each gene statistic, map it to (prediction, e_id)
    clusterLabeledFeatures = dfWithIndex.map(
        lambda point: (model.predict(point[1:]), point[0]))

    # For each gene statistic, map it to (e_id, prediction)
    featuresToCluster = dfWithIndex.map(
        lambda point: (point[0], model.predict(point[1:])))

    # 6) Group together the features by their cluster label
    clusteredFeatures = clusterLabeledFeatures.groupByKey()
    #print(clusteredFeatures.count())
    #print(clusteredFeatures.take(2))

    cF = clusteredFeatures.collectAsMap()

    # 7) Transform the sample data to use the clusters
    samplesWithClusters = samples.map(lambda sample: updateSample(sample, cF))

    return samplesWithClusters
Example #14
def do_all(f_path,out_name):
	sc = SparkContext()
	data = sc.textFile(f_path)

	data = data.map(parseKeepD).filter(lambda p: p[0] != None)

	# Scale Features
	features = data.map(lambda x: x[0].features)
	summary = Statistics.colStats(features)
	global means
	global varis
	means = summary.mean()
	varis = summary.variance()

	#scale the points
	data = data.map(lambda y: (conv_label_pt(y[0]),y[1]))

	#train model
	model = LinearRegressionWithSGD().train(data.map(lambda x: x[0]), intercept=True, regType='none')

	#calculate disparity
	disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))  

	#calculate SSR for later
	ssr = disparity.map(lambda x: (x[0] - x[1])**2).sum()

	#keep N
	N = disparity.count()
	#shut down SC
	MSE = ssr/float(N)
	se = std_errors(data,MSE,N)
	disparity.saveAsTextFile(out_loc + out_name)

	sc.stop()
	return model.intercept,model.weights,se,disparity, ssr, N
    def _transform(self, df):

        for k, v in df.schema[
                self.inputCol].metadata["ml_attr"]["attrs"].items():
            features_df = pd.DataFrame(v)

        column_names = list(features_df['name'])
        df_vector = df.rdd.map(lambda x: x[self.inputCol].toArray())

        #self.correlation_type is class parameter
        matrix = Statistics.corr(df_vector, method=self.correlation_type)

        # apply pandas dataframe operation on the fit output
        corr_df = pd.DataFrame(matrix,
                               columns=column_names,
                               index=column_names)
        final_corr_df = pd.DataFrame(corr_df.abs().unstack().sort_values(
            kind='quicksort')).reset_index()
        final_corr_df.rename(
            {
                'level_0': 'col1',
                'level_1': 'col2',
                0: 'correlation_value'
            },
            axis=1,
            inplace=True)
        final_corr_df = final_corr_df[
            final_corr_df['col1'] != final_corr_df['col2']]

        #shortlisted dataframe based on custom cutoff
        shortlisted_corr_df = final_corr_df[
            final_corr_df['correlation_value'] > self.correlation_cutoff]
        return corr_df, shortlisted_corr_df
Example #16
    def test_matrix_independence(self):
        data = [
            40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0,
            12.0
        ]
        chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(chi.statistic, 21.9958, 4)
        self.assertEqual(chi.degreesOfFreedom, 6)
        self.assertAlmostEqual(chi.pValue, 0.001213, 4)

        # Negative counts
        neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest,
                          neg_counts)

        # Row sum = 0.0
        row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest,
                          row_zero)

        # Column sum = 0.0
        col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest,
                          col_zero)
Example #17
def compute_correlation_matrix(df, method='pearson'):
    df_rdd = df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(df_rdd, method=method)
    corr_mat_df = pd.DataFrame(corr_mat,
                    columns=df.columns, 
                    index=df.columns)
    return corr_mat_df
Example #18
 def test_dimension(self, targetDimension, testDimension):
     if not targetDimension in self._dataframe_helper.get_string_columns():
         raise BIException.non_string_column(testDimension)
     chisquare_result = ChiSquareResult()
     pivot_table = self._data_frame.stat.crosstab(
         "{}".format(targetDimension), testDimension)
     # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
     rdd = list(
         chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
     data_matrix = Matrices.dense(pivot_table.count(),
                                  len(pivot_table.columns) - 1, rdd)
     result = Statistics.chiSqTest(data_matrix)
     chisquare_result.set_params(result)
     freq_table = self._get_contingency_table_of_freq(pivot_table,
                                                      need_sorting=True)
     freq_table.set_tables()
     chisquare_result.set_table_result(freq_table)
     # Cramers V Calculation
     stat_value = result.statistic
     n = freq_table.get_total()
     t = min(len(freq_table.column_one_values),
             len(freq_table.column_two_values))
     v_value = math.sqrt(float(stat_value) / (n * float(t)))
     chisquare_result.set_v_value(v_value)
     self._dataframe_helper.add_chisquare_significant_dimension(
         testDimension, v_value)
     return chisquare_result
Example #19
def summarize(dataset):
    print "schema: %s" % dataset.schema().json()
    labels = dataset.map(lambda r: r.label)
    print "label average: %f" % labels.mean()
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print "features average: %r" % summary.mean()
Example #20
def calculateCorrelation(rdd1, rdd2):
    joined_rdd = rdd1.join(rdd2).sortByKey()

    rdd1_values = joined_rdd.map(lambda x:x[1][0])
    rdd2_values = joined_rdd.map(lambda x:x[1][1])
    correlation_value = Statistics.corr(rdd1_values, rdd2_values)
    return (joined_rdd,correlation_value)
Example #21
def compute_correlation_matrix(df, method='pearson'):
    # wrapper around
    # https://forums.databricks.com/questions/3092/how-to-calculate-correlation-matrix-with-all-colum.html
    df_rdd = df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(df_rdd, method=method)
    corr_mat_df = pd.DataFrame(corr_mat, columns=df.columns, index=df.columns)
    return corr_mat_df
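A minimal usage sketch of the wrapper above, assuming an active SparkSession named spark and pandas imported as pd (as in the function body):

df = spark.createDataFrame(
    [(1.0, 2.0, 3.0), (2.0, 4.1, 5.9), (3.0, 6.2, 9.1)], ["x", "y", "z"])
# returns a pandas DataFrame labelled by the Spark column names
print(compute_correlation_matrix(df, method='pearson'))
print(compute_correlation_matrix(df, method='spearman'))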
 def compute_correlation_matrix(df, method='pearson'):
     columns=[item[0] for item in df.dtypes if (item[1].startswith('float') or item[1].startswith('double'))]#need to work according to the datatypes
     df_filter=df.select(columns)
     df_rdd = df_filter.rdd.map(lambda row: row[0:])
     corr_mat = Statistics.corr(df_rdd, method=method)
     corr_mat_df = pd.DataFrame(corr_mat,columns=df_filter.columns,index=df_filter.columns)
     return corr_mat_df
Example #24
    def scriptJob(self, limit=None, rowstart=None, rowstop=None):
        start = datetime.datetime.now()
        # create hbase connection

        row = self.table.scan(row_start=rowstart,
                              row_stop=rowstop,
                              limit=limit,
                              columns=self.columns)
        print(type(row))

        testRdd = self.sc.parallelize(row)
        values = testRdd.values()
        print(values.count())

        col = bytes(self.columns.encode("utf-8"))
        serilizeRdd = values.map(lambda value: float(value.get(col).decode()))

        #
        # def hash_domain(url):
        #     return hash(urlparse.urlparse(url).netloc)

        mlibRDD = self.sc.parallelize(
            (([Vectors.dense(x)]) for x in serilizeRdd.collect()))

        cStats = Statistics.colStats(mlibRDD)
        # print(cStats.mean())

        end = datetime.datetime.now()
        print(end - start)
        return cStats.mean()
def correlationTemperatureHardness(df,spark):
    column1 = df.select('temperature').rdd.map(lambda x: x['temperature']).filter(lambda x: x is not None).filter(lambda x: x != '')
    column2 = df.select('hardness').rdd.map(lambda x: x['hardness']).filter(lambda x: x is not None).filter(lambda x: x != '')
    data = column1.zip(column2)
    corr_matrix = Statistics.corr(data)
    
    return corr_matrix[1][0]
Example #26
def info_paragraphs(df, clm):
    df = df.where(col(clm).isNotNull())
    paragraphs = df.rdd.flatMap(lambda x: getattr(x, clm)).filter(
        lambda p: p != None)
    paragraphs = paragraphs.map(lambda p: np.array(len(p.split())))
    summary = Statistics.colStats(paragraphs)

    return summary
def compute_correlation_matrix(df,method='spearman'):
    
    churn_data3_rdd = df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(churn_data3_rdd, method=method)
    corr_mat_churn_data3 = pd.DataFrame(corr_mat,
                    columns=df.columns, 
                    index=df.columns)
    return corr_mat_churn_data3
Example #28
 def test_right_number_of_results(self):
     num_cols = 1001
     sparse_data = [
         LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
         LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
     ]
     chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
     self.assertEqual(len(chi), num_cols)
     self.assertIsNotNone(chi[1000])
Example #30
    def test_goodness_of_fit(self):
        from numpy import inf

        observed = Vectors.dense([4, 6, 5])
        pearson = Statistics.chiSqTest(observed)

        # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
        self.assertEqual(pearson.statistic, 0.4)
        self.assertEqual(pearson.degreesOfFreedom, 2)
        self.assertAlmostEqual(pearson.pValue, 0.8187, 4)

        # Different expected and observed sum
        observed1 = Vectors.dense([21, 38, 43, 80])
        expected1 = Vectors.dense([3, 5, 7, 20])
        pearson1 = Statistics.chiSqTest(observed1, expected1)

        # Results validated against the R command
        # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
        self.assertAlmostEqual(pearson1.statistic, 14.1429, 4)
        self.assertEqual(pearson1.degreesOfFreedom, 3)
        self.assertAlmostEqual(pearson1.pValue, 0.002717, 4)

        # Vectors with different sizes
        observed3 = Vectors.dense([1.0, 2.0, 3.0])
        expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0])
        self.assertRaises(ValueError, Statistics.chiSqTest, observed3,
                          expected3)

        # Negative counts in observed
        neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_obs,
                          expected1)

        # Count = 0.0 in expected but not observed
        zero_expected = Vectors.dense([1.0, 0.0, 3.0])
        pearson_inf = Statistics.chiSqTest(observed, zero_expected)
        self.assertEqual(pearson_inf.statistic, inf)
        self.assertEqual(pearson_inf.degreesOfFreedom, 2)
        self.assertEqual(pearson_inf.pValue, 0.0)

        # 0.0 in expected and observed simultaneously
        zero_observed = Vectors.dense([2.0, 0.0, 1.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, zero_observed,
                          zero_expected)
Example #31
def main():
    ###Loading data from sources
    print 'before  preprocess'
    data = [preprocess(input_file)]
    print 'after preprocess'
    #get spark context
    sc = getSparkContext()
    print 'before parallelize'
    data = np.hstack((data[0]['train_data'], data[0]['train_labels'].reshape(
        (data[0]['train_labels'].shape[0], 1))))
    data = [
        Vectors.dense(list(data[row, :])) for row in range(0, data.shape[0])
    ]
    samples = sc.parallelize(data)
    #samples.persist()
    pearsonCorr = Statistics.corr(samples)
    print str(pearsonCorr).replace('nan', 'NaN')
    sys.exit()
    print Statistics.corr(data, method="pearson")
Example #32
def estimate_correlation_matrix(df, cols, method='pearson', round_decimals=3):

    features = df.select(cols).rdd.map(lambda row: row[0:])
    corr_mat= pd.DataFrame(
        Statistics.corr(features, method=method), columns=cols, index=cols) \
        .round(round_decimals) \
        .style \
        .background_gradient(cmap='coolwarm')

    return corr_mat
Example #33
def correlations(sdf, colnames, method='pearson', ax=None, plot=True):
    sdf = sdf.notHandy()
    correlations = Statistics.corr(sdf.select(colnames).dropna().rdd.map(lambda row: row[0:]), method=method)
    pdf = pd.DataFrame(correlations, columns=colnames, index=colnames)
    if plot:
        if ax is None:
            fig, ax = plt.subplots(1, 1)
        return sns.heatmap(round(pdf,2), annot=True, cmap="coolwarm", fmt='.2f', linewidths=.05, ax=ax)
    else:
        return pdf
Example #34
    def test_goodness_of_fit(self):
        from numpy import inf

        observed = Vectors.dense([4, 6, 5])
        pearson = Statistics.chiSqTest(observed)

        # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
        self.assertEqual(pearson.statistic, 0.4)
        self.assertEqual(pearson.degreesOfFreedom, 2)
        self.assertAlmostEqual(pearson.pValue, 0.8187, 4)

        # Different expected and observed sum
        observed1 = Vectors.dense([21, 38, 43, 80])
        expected1 = Vectors.dense([3, 5, 7, 20])
        pearson1 = Statistics.chiSqTest(observed1, expected1)

        # Results validated against the R command
        # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
        self.assertAlmostEqual(pearson1.statistic, 14.1429, 4)
        self.assertEqual(pearson1.degreesOfFreedom, 3)
        self.assertAlmostEqual(pearson1.pValue, 0.002717, 4)

        # Vectors with different sizes
        observed3 = Vectors.dense([1.0, 2.0, 3.0])
        expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0])
        self.assertRaises(ValueError, Statistics.chiSqTest, observed3, expected3)

        # Negative counts in observed
        neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])
        self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_obs, expected1)

        # Count = 0.0 in expected but not observed
        zero_expected = Vectors.dense([1.0, 0.0, 3.0])
        pearson_inf = Statistics.chiSqTest(observed, zero_expected)
        self.assertEqual(pearson_inf.statistic, inf)
        self.assertEqual(pearson_inf.degreesOfFreedom, 2)
        self.assertEqual(pearson_inf.pValue, 0.0)

        # 0.0 in expected and observed simultaneously
        zero_observed = Vectors.dense([2.0, 0.0, 1.0])
        self.assertRaises(
            IllegalArgumentException, Statistics.chiSqTest, zero_observed, zero_expected)
Example #35
def column_statistics(data: pyspark.rdd.RDD):
    """
    Compute vectors of column means and variances of a data frame.
`
    :param data: an RDD
    :return: returns column means and variances as vectors
    """

    logger.info("Computing data statistics")
    summary = Statistics.colStats(data)
    return summary.mean(), summary.variance()
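A hypothetical usage sketch, assuming an active SparkContext named sc and the module-level logger already configured:

import numpy as np

rdd = sc.parallelize([np.array([1.0, 2.0]),
                      np.array([3.0, 4.0]),
                      np.array([5.0, 6.0])])
means, variances = column_statistics(rdd)
print(means)      # column means, e.g. [3.0, 4.0]
print(variances)  # unbiased column variances, e.g. [4.0, 4.0]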
Example #36
def column_means(data: pyspark.rdd.RDD):
    """
    Compute vectors of column means.
`
    :param data: an RDD
    :return: returns column means as vector
    """

    logger.info("Computing data means")
    summary = Statistics.colStats(data)
    return summary.mean()
def average_vector(data):
	from pyspark.sql.functions import col
	vectors = data.select("vectors").where(col("vectors").isNotNull())

	from pyspark.mllib.linalg import Vectors
	vectors_v = vectors.map(lambda line: Vectors.dense(line))

	from pyspark.mllib.stat import Statistics
	summary = Statistics.colStats(vectors_v)
	mean = summary.mean()
	logger.info(mean)
	return mean
Example #38
def DropColsByCor(df, cor_cutoff):

    tdf = df
    dsu_dict = {}

    string_cols = []
    for (a, b) in df.dtypes:
        if b == 'string':
            string_cols.append(a)

    for cols in string_cols:
        tdf = tdf.drop(cols)

    num_cols = len(tdf.columns)
    dsu = [i for i in range(num_cols)]
    size = [1 for i in range(num_cols)]

    features = tdf.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(features, method="pearson")

    for i in range(num_cols):
        for j in range(i):
            if corr_mat[i][j] > cor_cutoff:
                union(dsu, size, i, j)

    drop_cols = []
    for i in range(num_cols):
        if dsu[i] != i:
            drop_cols.append(tdf.columns[i])

        #Setting up dictionary to save up on iterations
        if dsu[i] == i:
            dsu_dict[tdf.columns[i]] = [tdf.columns[i]]

    for i in range(num_cols):
        if dsu[i] != i:
            ri = root(dsu, i)
            dsu_dict[tdf.columns[ri]].append(tdf.columns[i])

    for cols in drop_cols:
        tdf = tdf.drop(cols)

    string_df = df.select(string_cols)

    #Adding index to help merge both string and numeric dataframes
    tdf = tdf.withColumn("RowNoIndex", monotonically_increasing_id())
    string_df = string_df.withColumn("RowNoIndex",
                                     monotonically_increasing_id())
    tdf = tdf.join(string_df, ['RowNoIndex'])
    tdf = tdf.drop('RowNoIndex')

    return dsu_dict, tdf
Example #39
def corrFilter(df, cols, excludeCols, target):
    useFulCol = []
    corrScore = []
    for col in df.select(cols).columns:
        if col not in excludeCols:
            if Statistics.chiSqTest(df.select(col).collect()).pValue < 0.05:
                colCorr = float(str(df.stat.corr(col, target))[0:5])
                if colCorr > 0.03 or colCorr < -0.03:
                    useFulCol.append(col)
                    corrScore.append(colCorr)
    pearsonTable = pd.DataFrame({'colName': useFulCol, 'pearson': corrScore})
    pearsonTable.sort_values(by='pearson', ascending=False, inplace=True)
    return pearsonTable
Example #40
	def recommend2user(self,user_id):
		
		query = '''select page_id from cooladata where date_range(last 21 days) and user_id = {:d} and page_id is not null group by page_id;'''.format(user_id)

		def SQLtoURL(query):
			data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
			return data


		def QueryXXXXX(query, file = None):
			session = Session()
			response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
			return response.content
		


		table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)),'utf-8'))['table']
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  

		def convert_row(row):
			rowlist = []
			rowlist = [d['v'] for d in row]
			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		historyTitleData = self.spark.createDataFrame(rd, table_cols)
		historyTitleData = historyTitleData.dropna()
		
		self.model.createOrReplaceTempView("Database")
		historyTitleData.registerTempTable("historyTable")
		
		pageVectorHistory = self.spark.sql('''select d.page_id, d.normTopicDist, case when h.page_id is null then 0 else 1 end as label from Database as d left join historyTable as h on d.page_id = h.page_id''')
		
		mainRdd = pageVectorHistory[pageVectorHistory['label'] == 1][['normTopicDist']].rdd.map(lambda x: x['normTopicDist'].toArray())
		mainVec = Statistics.colStats(mainRdd).mean()

		pageRank = pageVectorHistory[pageVectorHistory['label'] == 0].rdd.map(lambda row: (row['page_id'], float(np.dot(mainVec, row['normTopicDist'].toArray()))))
		pager = pageRank.toDF()
		pager.createOrReplaceTempView("pager")
		sortPageR = self.sqlctx.sql('''select _1 as page_id, _2 as similarity from pager order by similarity desc''')

		return sortPageR.take(10)
Example #41
def create_or_update_week(influencer_tweets, topic_tweets, week):

    topic_cor = []
    influencer_cor = []
    for t in topic_tweets:
        for i in influencer_tweets:
            if t['time'] == i['time']:
                topic_cor.append(t['count'])
                influencer_cor.append(i['count'])

    if len(topic_cor)<=1:
        corr = 0
    else:

        sc = SparkContext(appName="CorrelationPerWeek")

        topic_tweets = sc.parallelize(topic_cor)
        influencer_tweets = sc.parallelize(influencer_cor)

        corr = Statistics.corr(topic_tweets, influencer_tweets, "pearson")

        sc.stop()

    url = "http://localhost:8000/api/weeks/"

    today = datetime.fromtimestamp(week/1000.0)
    payload = '{    "score": %f,    "start_date": "%s"  }' % (
        float(corr), str(today.year) + "-" + str(today.month) + "-" + str(today.day))
    headers = {
        'authorization': "Basic ZGV2OjEyMzQ=",
        'content-type': "application/json",
        'cache-control': "no-cache",
        'postman-token': "7c8668c0-a4c2-f42d-66a9-95cbfb7806c5"
    }

    try:
        response = requests.request("POST", url, data=payload, headers=headers)
        return  response.json()['id']
    except:
        print "error"

    return 1
Example #42
    def test_chi_sq_pearson(self):
        data = [
            LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
            LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
            LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
            LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
        ]

        for numParts in [2, 4, 6, 8]:
            chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
            feature1 = chi[0]
            self.assertEqual(feature1.statistic, 0.75)
            self.assertEqual(feature1.degreesOfFreedom, 2)
            self.assertAlmostEqual(feature1.pValue, 0.6873, 4)

            feature2 = chi[1]
            self.assertEqual(feature2.statistic, 1.5)
            self.assertEqual(feature2.degreesOfFreedom, 3)
            self.assertAlmostEqual(feature2.pValue, 0.6823, 4)
Example #43
    def test_matrix_independence(self):
        data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(chi.statistic, 21.9958, 4)
        self.assertEqual(chi.degreesOfFreedom, 6)
        self.assertAlmostEqual(chi.pValue, 0.001213, 4)

        # Negative counts
        neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, neg_counts)

        # Row sum = 0.0
        row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, row_zero)

        # Column sum = 0.0
        col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])
        self.assertRaises(Py4JJavaError, Statistics.chiSqTest, col_zero)
def get_language_correlation():
    """
        calculates the correlation between github languages
    """
    #Create Spark Context
    sc = SparkContext(appName="LanguageCorrelations")

    #Create SQL Context
    sqlCtx = SQLContext(sc)

    #Create a schemaRDD from json datasets stored in HDFS
    pushes = sqlCtx.jsonFile('git_14_15/git_results')

    #Register the schemaRDD as a Table
    pushes.registerTempTable('pushes')

    #filter the data to get the pushes for the languages from LANG
    filtered = sqlCtx.sql('select * from pushes where repository_language in ' + str(tuple(LANG)))

    #perform map transformation to get the rdd in the format (actor, {lang : pushes})
    f_pair = filtered.map(lambda s: (s.actor, {s.repository_language:s.pushes}))

    #group the RDD's based on actor to get the RDD of the format (actor, [{lang1 : pushes},{lang2 : pushes}...])
    f_group = f_pair.groupByKey()

    #merge lang dictionries to get single orderd dict per actor
    f_merged = f_group.map(lambda s: merge_lang_dict(s[1]))

    #created rdd of vectors from the pushes values, which is required for the correlation algorithm
    vectors = f_merged.map(lambda s: Vectors.dense(map(float, s.values())))  
    
    #call the correlation function
    matrix = Statistics.corr(vectors)
    print matrix
    plot_graph(matrix)
    sc.stop()
Example #45
print "Converting bigrams to sparse vectors in a dataframe for the train set"
t0 = time()
features=dfTrain.map(partial(vectorizeBi,dico=dict_broad.value)).toDF(schema)
features.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[323]:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
print "Computing the chi vector"
t0 = time()
labeledPoints = features.map(lambda row : LabeledPoint(row.label, row.bigramVectors))
chi = Statistics.chiSqTest(labeledPoints)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[324]:

print "Starting bigram selection,broadcasting the newly created bigram dictionary"
t0 = time()
biSelect = [revDict_broad.value[i] for i,bigram in enumerate(chi) if bigram.pValue <=0.3]
dictSelect = {}
for i,bigram in enumerate(biSelect):
    dictSelect[bigram]=i
dictSel_broad = sc.broadcast(dictSelect)
tt = time() - t0
print "Done in {} second".format(round(tt,3))
Example #46
"""

Testing with Correlation
https://spark.apache.org/docs/latest/mllib-statistics.html

"""

from pyspark.mllib.stat import Statistics
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import SparseVector, Vectors


sc = SparkContext("local", "Rubbish")

seriesX = sc.parallelize([1.0, 2.0, -2.0], 2)
seriesY = sc.parallelize([3.0, 4.0, 5.0], 2)
corrXY =  Statistics.corr(seriesX, seriesY, method="pearson")

# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0,  3]),
                       Vectors.dense([6, 7, 0,  8])])

print "Correlation between x & y: ", corrXY
print "Correlation matrix: ", data
from pyspark import SparkContext
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics


sc = SparkContext("local", "Rubbish")

"""
# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0,  3]),
                       Vectors.dense([6, 7, 0,  8])])
"""

# Sample vector composed of the frequencies of events
vect = Vectors.dense([4,5,0,3])

# Summary of the test including the p-value, degrees of freedom,
# test statistic, the method used, and the null hypothesis.
goodnessOfFitTestResult = Statistics.chiSqTest(vect)

sampleData = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
matrix = Matrices.dense(3,4, sampleData)
# Conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(matrix)


# Test statistic, the method used, and the null hypothesis.
print "SINGLE VECTOR FIT: "
print goodnessOfFitTestResult 
## Summary of the test including the p-value, degrees of freedom.
print "INDEPENDENCE TEST RESULT: "
print independenceTestResult
import numpy as np

from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="CorrelationsExample")  # SparkContext

    # $example on$
    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
    # seriesY must have the same number of partitions and cardinality as seriesX
    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))

    data = sc.parallelize(
        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
    )  # an RDD of Vectors

    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print(Statistics.corr(data, method="pearson"))
    # $example off$

    sc.stop()
Example #49
def getWeek(x):
    # reconstructed header: assumes date strings formatted as MM/DD/YYYY
    month = int(x[0:2])
    day = int(x[3:5])
    year = int(x[6:10])
    return(datetime.date(year,month,day).isocalendar()[1])

violent = ["ASSAULT","BATTERY","CRIM SEXUAL ASSAULT", "DOMESTIC VIOLENCE", "HOMICIDE", "KIDNAPPING"]
def setFlags(x):
        if x in violent:
                return (0,1)
        else:
                return (1,0)

beats = parts.map(lambda p:(p[10],p[2][6:10],getWeek(p[2]),1,setFlags(p[5])))
beats2 = beats.filter(lambda x:x[1]=="2015").map(lambda x:((x[0],x[2]),(x[3],x[4][0],x[4][1])))
beats3 = beats2.reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1],x[2]+y[2]))
standard_vars = beats3.map(lambda row: Vectors.dense((row[0][1],row[1][0],row[1][1],row[1][2])))
summary = Statistics.colStats(standard_vars)
mean_wn = summary.mean()[0]
sd_wn = math.sqrt(summary.variance()[0])
mean_counts = list(summary.mean()[1:4])
sd_counts = list(np.sqrt(summary.variance()[1:4]))
beats_standard = beats3.map(lambda x: (x[0][0],(x[0][1]-mean_wn)/(sd_wn),(x[1][0]-mean_counts[0])/sd_counts[0],(x[1][1]-mean_counts[1])/sd_counts[1], \
 (x[1][2]-mean_counts[2])/sd_counts[2]))
beats_list = beats_standard.map(lambda x: ((x[0]),1)).keys().distinct().collect()
beats_list = beats_list[0:50]
def parsePoint(tuple):
        values = [float(x) for x in tuple]
        return LabeledPoint(values[0], values[1:])
def deNorm(val,mean,sd):
        return(val*sd + mean)
maxWeek = (21 - mean_wn) / sd_wn
curWeek = (20 - mean_wn) / sd_wn
Example #50
    merged_final = merged.reduceByKey(lambda x,y : int(x) + int(y))

    #sort by month-year
    # Map each year to all beats and their corresponding crime counts for that year, and sort the counts 
    # by beat
    groupedbeatCountsbymonthyear = merged_final.map( lambda row: ( row[ 0 ][ 1 ], ( row[ 0 ][ 0 ], row[ 1 ] ) ) ) \
                                   .groupByKey( ) \
                                   .mapValues( lambda val: sorted( list( val ), key = lambda t: t[ 0 ] ) );
    # Create a list of all beats
    groupbeats = [ elem[ 0 ] for elem in groupedbeatCountsbymonthyear.values( ).first( ) ];
    
    beatvectorCounts = groupedbeatCountsbymonthyear.values( ) \
                                .map( lambda row: Vectors.dense( [ elem[ 1 ] for elem in row ] ) );
    
    # Compute correlation between all beats for yearly crime counts
    corrMatrix = Statistics.corr( beatvectorCounts, method = 'pearson' );
    
     # Fill the diagonal of correlation matrix with 0's
    corrMatrix.flags[ 'WRITEABLE' ] = True;
    np.fill_diagonal( corrMatrix, 0.0 );

    # Get the 10 largest correlation values from the matrix. The correlation matrix is symmetric so
    # we take the largest 20 and step by 2. Finally, the index of the corresponding beat pairs for
    # top 10 correlation values is obtained.
    sortOrder = corrMatrix.argsort( axis = None );
    indices = np.unravel_index( sortOrder[ -20::2 ], corrMatrix.shape  );

    # The corresponding beats names are obtained for the top 10 correlated beat pairs
    topBeatPairs = [ ( groupbeats[ i ], groupbeats[ j ] ) for i, j in zip( indices[ 0 ], indices[ 1 ] ) ];

    for i, j in topBeatPairs:
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events

    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)

    obs = sc.parallelize(
Example #52
def dist_corr(v1, v2):
    """
    Function to compute correlation between two Spark RDDs
    """

    return Statistics.corr(v1,v2)
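A hypothetical call of the helper above, assuming an active SparkContext named sc; with two RDDs of doubles, Statistics.corr defaults to Pearson:

v1 = sc.parallelize([1.0, 2.0, 3.0, 4.0])
v2 = sc.parallelize([2.0, 4.0, 6.0, 8.0])
print(dist_corr(v1, v2))  # close to 1.0 for these linearly related series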
    # Load input data
    print("Loading LIBSVM file with UDT from " + input + ".")
    df = spark.read.format("libsvm").load(input).cache()
    print("Schema from LIBSVM:")
    df.printSchema()
    print("Loaded training data as a DataFrame with " +
          str(df.count()) + " records.")

    # Show statistical summary of labels.
    labelSummary = df.describe("label")
    labelSummary.show()

    # Convert features column to an RDD of vectors.
    features = MLUtils.convertVectorColumnsFromML(df, "features") \
        .select("features").rdd.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("Selected features column with average values:\n" +
          str(summary.mean()))

    # Save the records in a parquet file.
    tempdir = tempfile.NamedTemporaryFile(delete=False).name
    os.unlink(tempdir)
    print("Saving to " + tempdir + " as Parquet file.")
    df.write.parquet(tempdir)

    # Load the records back.
    print("Loading Parquet file with UDT from " + tempdir)
    newDF = spark.read.parquet(tempdir)
    print("Schema from Parquet:")
    newDF.printSchema()
    try:

from __future__ import print_function

from pyspark import SparkContext
# $example on$
import numpy as np

from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext

    # $example on$
    mat = sc.parallelize(
        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])]
    )  # an RDD of Vectors

    # Compute column summary statistics.
    summary = Statistics.colStats(mat)
    print(summary.mean())  # a dense vector containing the mean value for each column
    print(summary.variance())  # column-wise variance
    print(summary.numNonzeros())  # number of nonzeros in each column
    # $example off$

    sc.stop()
Example #55
from datetime import datetime

sc = SparkContext(appName= "Run 1 - Corr-Wide - Data95-08 - AWS")

data_file = "s3://aws-logs-012060642840-us-west-2/elasticmapreduce/cloud_proj/95-08.csv"
raw_data = sc.textFile (data_file).cache ()
#extract header
header = raw_data.first () 
raw_data = raw_data.filter (lambda x:x != header)

def parse_interaction(line):
	#split lines based on the delimeter, and create a list
	line_split = line.split (",")
	#replace NA with zeros
	line_split = [w.replace ('NA', '0') for w in line_split]
	#remove year, and other non-numeric data
	"""
	0 = Year
	"""
	symbolic_indexes = [0, 8, 10, 16, 17, 22]
	clean_line_split = [item for i,item in enumerate (line_split) if i not in symbolic_indexes]
	return np.array ([float (x) for x in clean_line_split])

vector_data = raw_data.map (parse_interaction)

#start timer at this point
startTime = datetime.now()
print (Statistics.corr (vector_data, method="pearson"))
print ('Time consumed = '), (datetime.now() - startTime)
sc.stop()
Example #56
##### Cheating a bit #####
# Use pandas to summarize the data + display the correlation matrix
df = pd.read_csv("file:/C:/spark-1.6.0-bin-hadoop2.4/"+nomF+".csv", sep = ";",header=0)
df.describe()
# Correlation matrix
# print(df.corr())


# ### Mllib Statistics

# In[5]:

from pyspark.mllib.stat import Statistics
# Basic Statistics
partsNum = parts.map(lambda line: line[0:8])
summary = Statistics.colStats(partsNum)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())
Statistics.corr(partsNum, method="pearson")


# # Supervised classification

# ## Naive Bayes

# In[6]:

from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
import utils_mesure
nomF_svm = "glass_svm"
Example #57
def readRankMatrix():
    import numpy as np
    lines = sc.textFile('../yelp_trans.csv')
    rawData = lines.mapPartitionsWithIndex(removeHeader)
    mydata = rawData.map(removeColumns).cache()
    return mydata


from pyspark.mllib.stat import Statistics
from pandas import Series
import pandas as pd
import numpy as np
import math

mydata = readRankMatrix()
corr = Statistics.corr(mydata)

# set up the columns names and add a new names called user_id
lines2 = sc.textFile('../yelp.csv')
names = lines2.map(lambda line:line.split(",")).map(lambda a:a[0]).collect()[1:]

s = Series([str for str in names])
pddata = pd.DataFrame(corr, columns=s)
pddata['user_id'] = names
df_corr = sqlContext.createDataFrame(pddata)
# df_corr.cache()
df_corr.registerTempTable("corr")

def getTopReviewUsers(n):
    # n: the nth highest user
    ord_user = sqlContext.sql("select user_id, count(review_id) as count from reviews_json group by user_id order by count desc")
Example #58
For dense vectors, MLlib uses either Python lists or the NumPy array type. 
The latter is recommended, so you can simply pass NumPy arrays around.
For sparse vectors, users can construct a SparseVector object from MLlib 
or pass SciPy scipy.sparse column vectors if SciPy is available in their environment. 
The easiest way to create sparse vectors is to use the factory methods implemented in Vectors.
"""

def parse_interaction (line):
	#split lines based on the delimeter, and create a list
	line_split = line.split (",")
	#replace NA with zeros
	line_split = [w.replace ('NA', '0') for w in line_split]
	#line_split = [w.replace ('', '0') for w in line_split]
	#keep all except year, and non-numeric values
	symbolic_indexes = [0, 8, 10,16, 17, 22]
	clean_line_split = [item for i,item in enumerate (line_split) if i not in symbolic_indexes]
	return np.array ([float (x) for x in clean_line_split])

vector_data = raw_data.map (parse_interaction)

#start timer at this point
startTime = datetime.now()
summary = Statistics.colStats(vector_data)
print ('Time consumed = '), (datetime.now() - startTime)

print ('Mean of columns\n'), summary.mean ()
print ('Variances of columns\n'), summary.variance()
print ('Non zero values\n'), summary.numNonzeros()
print ('Max value\n'), summary.max ()
print ('Min value\n'), summary.min ()
bundle_pearson_dict = {} #dictionary to hold the bundle as key and the coeff as value

for bundle_name in actual_bundle_list:
    final_table_by_bundle = sqlContext.sql("select * from final_table_sorted where bundle = \""+bundle_name+"\"")
    food_metric_only= final_table_by_bundle.map(lambda p:  p.zip_AGI_foodmetric[2])
    food_metric_list = food_metric_only.collect()
    weighted_AGI_only= final_table_by_bundle.map(lambda p:  p.zip_AGI_foodmetric[1])
    weighted_AGI_list = weighted_AGI_only.collect()
    if not food_metric_list and not weighted_AGI_list:
        print 'pass'
    else:
        
        x=sc.parallelize(weighted_AGI_list,2)
        y=sc.parallelize(food_metric_list,2)
        
        correlation_coeff =  Statistics.corr(x,y, method="pearson") # -0.128161962745 or is it -0.0965926041863??
        bundle_pearson_dict[bundle_name]= correlation_coeff
    
        
bundle_pearson_dict  #to get all coeff values by bundle

# In[53]:

#Here I have an example scatter plot for bundle_name = 'vegetables' to have an idea of how the plot looks
# x is the AGI for every zip code
# y is the food metric
#an example plot is also available to be viewed in the parent folder

final_table_by_bundle = sqlContext.sql("select * from final_table_sorted where bundle = 'vegetables'")
food_metric_only= final_table_by_bundle.map(lambda p:  p.zip_AGI_foodmetric[2])
food_metric_list = food_metric_only.collect()
from pyspark.sql import HiveContext
from pyspark.mllib.stat import Statistics
from pyspark import SparkContext

sc = SparkContext()

sqlContext = HiveContext(sc)

initialquery = sqlContext.sql("""
    SELECT
        A.avg_procedure_score,
        B.patientsurveyscore
    FROM
        (SELECT
            p.hospitalid,
            avg(p.score) as avg_procedure_score
        FROM
            procedures p
        GROUP BY
            p.hospitalid) A
    JOIN survey_results B
        ON B.hospitalid = A.hospitalid
""")

survey_score = initialquery.map(lambda x: x.patientsurveyscore)
avg_procedure_scores = initialquery.map(lambda x: x.avg_procedure_score)

print Statistics.corr(avg_procedure_scores, survey_score, method="pearson")