# Assumes a SparkSession `spark`, pyspark.ml.stat.Correlation, and
# pyspark.ml.linalg.Vectors are already in scope.
def cal(data):
    df = spark.createDataFrame(data, ["features"])
    df.show()  # show() prints the frame itself and returns None
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
def main():
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])
    df.show()
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:", str(r1[0]))
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:", str(r2[0]))
def learn1():
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.stat import Correlation
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    # `ss`: a SparkSession assumed to exist in the enclosing scope
    df = ss.createDataFrame(data, ["features"])
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
def pearsonTest(self):
    dataset = self.dataset
    labelColm = [self.labelColm]
    features = self.features
    allColms = labelColm + features
    featureAssembler = VectorAssembler(inputCols=allColms,
                                       outputCol="allColmsVectorized",
                                       handleInvalid="skip")
    allColmsVectorizedDataset = featureAssembler.transform(dataset)
    allColmsVectorizedDataset.show()
    r1p = Correlation.corr(allColmsVectorizedDataset, "allColmsVectorized").head()
    print("Pearson correlation matrix:\n" + str(r1p[0]))
    # Round every entry of the matrix to 4 decimal places
    pearsonMatrix = [[round(value, 4) for value in row]
                     for row in r1p[0].toArray().tolist()]
    # First column of the matrix: correlation of each column with the label
    pearson_value_d = [round(row[0], 4) for row in r1p[0].toArray()]
    pearson_value = dict(zip(allColms, pearson_value_d))
    print(pearson_value)
    # r1s = Correlation.corr(allColmsVectorizedDataset, "allColmsVectorized", "spearman").head()
    # print("Spearman correlation matrix:\n" + str(r1s[0]))
    result_pearson = {'pearson_value': pearson_value, 'matrix': pearsonMatrix}
    return result_pearson
def corr(kdf: 'ks.DataFrame', method: str = 'pearson') -> pd.DataFrame:
    """
    The correlation matrix of all the numerical columns of this dataframe.

    Only accepts scalar numerical values for now.

    :param kdf: the Koalas dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame`

    >>> ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    assert method in ('pearson', 'spearman')
    ndf, column_index = to_numeric_df(kdf)
    corr = Correlation.corr(ndf, CORRELATION_OUTPUT_COLUMN, method)
    pcorr = corr.toPandas()
    arr = pcorr.iloc[0, 0].toArray()
    if column_index_level(column_index) > 1:
        idx = pd.MultiIndex.from_tuples(column_index)
    else:
        idx = pd.Index([i[0] for i in column_index])
    return pd.DataFrame(arr, columns=idx, index=idx)
def corr(kdf: 'ks.DataFrame', method: str = 'pearson') -> pd.DataFrame:
    """
    The correlation matrix of all the numerical columns of this dataframe.

    Only accepts scalar numerical values for now.

    :param kdf: the Koalas dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame`

    >>> ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    assert method in ('pearson', 'spearman')
    ndf, fields = to_numeric_df(kdf)
    corr = Correlation.corr(ndf, CORRELATION_OUTPUT_COLUMN, method)
    pcorr = corr.toPandas()
    arr = pcorr.iloc[0, 0].toArray()
    arr = pd.DataFrame(arr)
    arr.columns = fields
    arr = arr.set_index(pd.Index(fields))
    return arr
def corr(psdf: "ps.DataFrame", method: str = "pearson") -> pd.DataFrame:
    """
    The correlation matrix of all the numerical columns of this dataframe.

    Only accepts scalar numerical values for now.

    :param psdf: the pandas-on-Spark dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame`

    >>> ps.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    assert method in ("pearson", "spearman")
    ndf, column_labels = to_numeric_df(psdf)
    corr = Correlation.corr(ndf, CORRELATION_OUTPUT_COLUMN, method)
    pcorr = cast(pd.DataFrame, corr.toPandas())
    arr = pcorr.iloc[0, 0].toArray()
    if column_labels_level(column_labels) > 1:
        idx = pd.MultiIndex.from_tuples(column_labels)
    else:
        idx = pd.Index([label[0] for label in column_labels])
    return pd.DataFrame(arr, columns=idx, index=idx)
def correlacion(Finales, corte=.7):
    # Split columns by type: numeric (double) vs. string
    Cor = [i[0] for i in Finales.dtypes if 'double' in i[1]]
    C = [i[0] for i in Finales.dtypes if 'string' in i[1]]
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=Cor, outputCol=vector_col)
    df_vector = assembler.transform(Finales).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col, "spearman").head()
    Arreglo = matrix[0].toArray()
    CorrArrpd = pd.DataFrame(data=Arreglo)
    CorrArrpd.columns = Cor
    # Drop one column of every pair whose correlation is at or above the
    # cutoff `corte` (honoring the parameter instead of a hard-coded 0.7)
    col_corr = []
    for i in range(len(CorrArrpd)):
        for j in range(i):
            if (CorrArrpd.iloc[i, j] >= corte) and (CorrArrpd.columns[j] not in col_corr):
                col_corr.append(CorrArrpd.columns[i])
    CorrArrpd.drop(col_corr, inplace=True, axis=1)
    sinCorrelacion = CorrArrpd.columns
    joinedlist = list(set().union(sinCorrelacion, C))
    Finales = Finales.select(
        [sinCor for sinCor in Finales.columns if sinCor in joinedlist])
    return Finales
def correlation(request, project_id):
    template = loader.get_template('data/datacorrelation.html')
    project = get_object_or_404(Project, pk=project_id)
    # df = getTransformedData(project_id, 0)
    df = mysite.dataoperation.readfromcassandra(project_id, 0)
    df = mysite.dataoperation.transformdataframe(project, df, 0)
    df = applyfeaturetransition(project, df, project.features.all(),
                                project.target, project.targets.all())

    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.stat import Correlation
    from pyspark.ml.stat import ChiSquareTest

    feat = list(map(lambda x: x.fieldname, project.features.all()))
    targ = list(map(lambda x: x.fieldname, project.targets.all()))
    features = feat + targ
    assembler = VectorAssembler(inputCols=features, outputCol="correlation")
    output = assembler.transform(df)
    corr_mat = Correlation.corr(output, "correlation", method="pearson")
    corr_html = corr_mat.toPandas().iloc[0]['pearson(correlation)']
    # chi_sqr = ChiSquareTest.test(output, "correlation", "label").head()
    context = {
        "project": project,
        "project_id": project_id,
        "menuactive": 3,
        "correlation": corr_html,
    }
    return HttpResponse(template.render(context, request))
def correlation(self, columns, method="pearson", strategy="mean", output="json"):
    """
    Calculate the correlation between columns. It will try to cast a column
    to float where necessary and impute missing values.
    :param self:
    :param columns: Columns to be processed
    :param method: Method used to calculate the correlation
    :param strategy: Imputing strategy
    :param output: array or json
    :return:
    """
    columns = parse_columns(self, columns)

    # Try to cast the selected columns to float
    df = self
    for col_name in columns:
        df = df.cols.cast(col_name, "float")
        logging.info("Casting {col_name} to float...".format(col_name=col_name))

    # Impute missing values
    imputed_cols = [c + "_imputed" for c in columns]
    df = df.cols.impute(columns, imputed_cols, strategy)
    logging.info("Imputing {columns}, using '{strategy}'...".format(
        columns=columns, strategy=strategy))

    # Create the vector needed to calculate the correlation
    df = df.cols.nest(imputed_cols, "features", "vector")
    corr = Correlation.corr(df, "features", method).head()[0].toArray()

    if output == "array":  # `==`, not `is`: identity checks on str literals are unreliable
        result = corr
    elif output == "json":
        # Parse the result to json
        col_pair = []
        for col_name in columns:
            for col_name_2 in columns:
                col_pair.append({"between": col_name, "an": col_name_2})
        # Flatten column-major so the values line up with the pair list
        values = corr.flatten('F').tolist()
        result = []
        for n, v in zip(col_pair, values):
            # Skip the correlation of a column with itself
            if n["between"] != n["an"]:
                n["value"] = v
                result.append(n)
        result = sorted(result, key=lambda k: k['value'], reverse=True)

    return result
def corr_test(self, group_by, pivot_col, agg_dict, method='pearson'):
    matrix_df = self.toWide(group_by=group_by, pivot_col=pivot_col,
                            agg_dict=agg_dict).cache()
    vec_df = CorrMat.vector_assembler(self=matrix_df,
                                      cols_name=matrix_df.columns[1:])
    return SparkCorrelation.corr(vec_df, 'features', method)\
        .select(col(f'{method}(features)').alias(f'{method}_features'))
def correl():
    spark = SparkSession\
        .builder\
        .appName("Test")\
        .getOrCreate()
    rawData = spark.sql("select * from matus_marton.googleplay")
    df_gp = rawData.toPandas()
    df_gp.corr()
    df_gp.describe()
    rawData.show()
    # Correlation.corr expects the name of a single vector column, not a list
    # of column names, so assemble the two (assumed numeric) columns first.
    assembler = VectorAssembler(inputCols=["rating", "reviews"],
                                outputCol="features")
    Correlation.corr(assembler.transform(rawData), "features")
    rawData.describe()
    return 'Correlation computed'
def find_correlation(spark: SparkSession, to_correlate: List[str],
                     features_file=TRACK_FEATURES):
    df = spark.read.json(features_file, multiLine=True)
    # Pack the selected columns into a single dense vector column
    vector_mapper = udf(lambda row: Vectors.dense(row), VectorUDT())
    df = df.withColumn("features", vector_mapper(array(*to_correlate)))
    corr_matrix = Correlation.corr(df, "features").head()[0]
    return df, corr_matrix
def CorrelationMatrix(df):
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=df.columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col)
    cor = matrix.collect()[0]["pearson({})".format(vector_col)].values
    # Reshape the flat values into a square matrix and keep the first 15x15 block
    cor2 = pd.DataFrame(np.array(cor).reshape(len(df.columns),
                                              len(df.columns))).iloc[:15, :15]
    heatmap2d(cor2)
def correlation(self, input_cols, method="pearson", output="json"):
    """
    Calculate the correlation between columns. It will try to cast a column
    to float where necessary and impute missing values.
    :param self:
    :param input_cols: Columns to be processed
    :param method: Method used to calculate the correlation
    :param output: array or json
    :return:
    """
    df = self

    # Values in columns can not be null. Warn user
    input_cols = parse_columns(self, input_cols,
                               filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

    # print(self.cols.count_na(input_cols))

    # If the input is not already a vector, cast to float and assemble one
    output_col = name_col(input_cols, "correlation")
    if len(input_cols) > 1:
        for col_name in input_cols:
            df = df.cols.cast(col_name, "float")
            logger.print("Casting {col_name} to float...".format(col_name=col_name))
        df = df.cols.nest(input_cols, "vector", output_cols=output_col)

    corr = Correlation.corr(df, output_col, method).head()[0].toArray()

    if output == "array":  # `==`, not `is`: identity checks on str literals are unreliable
        result = corr
    elif output == "json":
        # Parse the result to json
        col_pair = []
        for col_name in input_cols:
            for col_name_2 in input_cols:
                col_pair.append({"between": col_name, "an": col_name_2})
        # Flatten column-major so the values line up with the pair list
        values = corr.flatten('F').tolist()
        result = []
        for n, v in zip(col_pair, values):
            # Skip the correlation of a column with itself
            if n["between"] != n["an"]:
                n["value"] = v
                result.append(n)
        result = sorted(result, key=lambda k: k['value'], reverse=True)

    return {"cols": input_cols, "data": result}
def mldemo():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
def correlation_matrix(df, corr_columns, method='pearson'):
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=corr_columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col, method)
    # The result column is named after the method actually used, e.g.
    # "spearman(corr_features)", so build the key from `method`
    result = matrix.collect()[0]["{}({})".format(method, vector_col)].values
    return pd.DataFrame(result.reshape(-1, len(corr_columns)),
                        columns=corr_columns, index=corr_columns).to_string()
def correlation(data):
    dfe = data.drop(*["Artist", "SongID"])
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=dfe.columns, outputCol=vector_col)
    df_vector = assembler.transform(dfe).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col)
    # 14 numeric feature columns remain after the drop
    mat_array = np.reshape(
        matrix.collect()[0]["pearson({})".format(vector_col)].values, [14, 14])
    m = pd.DataFrame(mat_array, columns=dfe.columns, index=dfe.columns)
    # ax = sns.heatmap(m)
    print(dfe.columns)
def Correlation_test_imp(dataset, features, label_col):
    All_colms = [label_col] + features
    featureassembler_correlation = VectorAssembler(
        inputCols=All_colms, outputCol="correlation_colm", handleInvalid="skip")
    output_corr = featureassembler_correlation.transform(dataset)
    output_corr.show()
    finalized_corr = output_corr.select("correlation_colm")
    finalized_corr.show()

    from pyspark.ml.stat import Correlation
    r1p = Correlation.corr(output_corr, "correlation_colm").head()
    print("Pearson correlation matrix:\n" + str(r1p[0]))

    # Round every entry of the matrix to 4 decimal places
    pearsonMatrix = [[round(value, 4) for value in row]
                     for row in r1p[0].toArray().tolist()]
    print(pearsonMatrix)

    # First column of the matrix: correlation of each column with the label
    pearson_value_d = [round(row[0], 4) for row in r1p[0].toArray()]
    print(pearson_value_d)

    pearson_value = dict(zip(All_colms, pearson_value_d))
    print(pearson_value)

    # r1s = Correlation.corr(output_corr, "correlation_colm", "spearman").head()
    # print("Spearman correlation matrix:\n" + str(r1s[0]))

    result_pearson = {'pearson_value': pearson_value, 'matrix': pearsonMatrix}
    return result_pearson
def correlation(self, columns, method="pearson", output="json"):
    """
    Calculate the correlation between columns. It will try to cast a column
    to float where necessary and impute missing values.
    :param self:
    :param columns: Columns to be processed
    :param method: Method used to calculate the correlation
    :param output: array or json
    :return:
    """
    columns = parse_columns(self, columns)

    df = self
    if len(columns) == 1:
        # A single vector column can be used directly
        if is_column_a(df, columns, "vector"):
            output_col = one_list_to_val(columns)
    else:
        # Cast the selected columns to float and assemble the vector needed
        # to calculate the correlation
        output_col = "_correlation_features"
        for col_name in columns:
            df = df.cols.cast(col_name, "float")
            logger.print("Casting {col_name} to float...".format(col_name=col_name))
        df = df.cols.nest(columns, "vector", output_cols=output_col)

    corr = Correlation.corr(df, output_col, method).head()[0].toArray()

    if output == "array":  # `==`, not `is`: identity checks on str literals are unreliable
        result = corr
    elif output == "json":
        # Parse the result to json
        col_pair = []
        for col_name in columns:
            for col_name_2 in columns:
                col_pair.append({"between": col_name, "an": col_name_2})
        # Flatten column-major so the values line up with the pair list
        values = corr.flatten('F').tolist()
        result = []
        for n, v in zip(col_pair, values):
            # Skip the correlation of a column with itself
            if n["between"] != n["an"]:
                n["value"] = v
                result.append(n)
        result = sorted(result, key=lambda k: k['value'], reverse=True)

    return result
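# A minimal, Spark-free sanity check (an added illustration, not part of the
# original API) of why flatten('F') in the functions above lines up with the
# nested between/an loop: the pair list is built row-major while 'F' flattens
# column-major, and the two orders agree only because a correlation matrix is
# symmetric.
import numpy as np

m = np.array([[1.0, -0.5],
              [-0.5, 1.0]])
pairs = [(between, an) for between in "ab" for an in "ab"]
for (between, an), value in zip(pairs, m.flatten('F').tolist()):
    print(between, an, value)  # pair (a,b) reads m[1,0], equal to m[0,1] by symmetry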
def basic_statistics():
    """Basic statistics."""
    df = sql.read.parquet(str(DATA_PARQUET))
    numeric = ['cost', 'call_duration_minutes', 'data_volume_mb']
    assemble = VectorAssembler(inputCols=numeric, outputCol='features')
    features = assemble.transform(df.dropna(subset=numeric + ['target']))

    # summarize
    summarize = Summarizer().metrics('mean', 'variance', 'count', 'numNonZeros',
                                     'max', 'min', 'normL2', 'normL1')
    features.select(summarize.summary(features['features'])).show(truncate=False)

    # correlations; Spearman is costlier, so compute it on a sample
    r1 = Correlation.corr(features, 'features', 'pearson').head()[0]
    small = features.sample(fraction=0.1, seed=100500)
    r2 = Correlation.corr(small, 'features', 'spearman').head()[0]
def correlation_matrix(df: DataFrame, feature_columns: list,
                       output_column: str) -> np.ndarray:
    """Generates the Pearson correlation coefficient matrix for feature ranking."""
    vec_assembler = VectorAssembler(inputCols=feature_columns + [output_column],
                                    outputCol="all_columns")
    df_vectorized = vec_assembler.transform(df)
    cor_matrix = Correlation.corr(df_vectorized,
                                  "all_columns").collect()[0][0].toArray()
    return cor_matrix
def calc_correlation(feature_columns, feature_data):
    """Calculates the Spearman correlation coefficient between all given columns."""
    print("-- Calculating correlation --")
    print("Features: ", feature_columns)
    vector_col = "features"
    assembler = VectorAssembler(inputCols=feature_columns, outputCol=vector_col)
    df_vector = assembler.transform(feature_data).select(vector_col)
    corr_mat = Correlation.corr(df_vector, vector_col, "spearman").head()
    print("-- Done calculating correlation --")
    return corr_mat[0]
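# A hypothetical usage sketch for calc_correlation above; `spark` is assumed
# to be an active SparkSession and the toy columns are monotonically related,
# so Spearman yields +1/-1 off the diagonal.
demo_df = spark.createDataFrame(
    [(1.0, 2.0, 9.0), (2.0, 4.0, 7.0), (3.0, 8.0, 1.0)],
    ["x", "y", "z"])
demo_matrix = calc_correlation(["x", "y", "z"], demo_df)
print(demo_matrix.toArray())  # x and y rank together (+1.0); z runs opposite (-1.0)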
def correlate(uri1, uri2, conf):
    spark = SparkSession.builder \
        .config(conf=conf) \
        .getOrCreate()
    df1 = spark.read.format("csv").options(header=True, inferschema=True).load(uri1)
    df2 = spark.read.format("csv").options(header=True, inferschema=True).load(uri2)
    df1.printSchema()
    df2.printSchema()
    """
    For Spearman, a rank correlation, we need to create an RDD[Double] for
    each column and sort it in order to retrieve the ranks, and then join the
    columns back into an RDD[Vector], which is fairly costly. Cache the input
    Dataset before calling corr with method='spearman' to avoid recomputing
    the common lineage.
    """
    # Join the 2 datasets and ignore the first resolution columns
    joined = df1.join(df2, ["temp_res", "spat_res"], 'inner')
    feature_types = joined.dtypes[2:]
    # print(feature_types)
    # Drop non-numeric features just in case
    num_feature_types = filter(
        lambda t: t[1] == "int" or t[1] == "double" or t[1] == "float",
        feature_types)
    features = [f_t[0] for f_t in num_feature_types]
    # print(features)
    joined = joined.select(features)
    joined.printSchema()

    # Assemble the vectors for Correlation.corr(); np.array is equivalent to
    # dense vectors
    vecAssembler = VectorAssembler(inputCols=features, outputCol="features")
    # Cache before the Spearman computation, as advised in the note above
    joinedVec = vecAssembler.transform(joined).cache()
    spearmanCorr = Correlation.corr(joinedVec, 'features',
                                    method='spearman').collect()[0][0]
    # Convert the DenseMatrix to a NumPy array
    spearmanCorr = spearmanCorr.toArray()
    print(spearmanCorr)

    # Prepare and write the correlation result
    out_dir = spark.conf.get("output")
    out_dir = "correlations/" + out_dir
    print("output directory is: " + out_dir)
    pandasDF = pd.DataFrame(spearmanCorr, index=features, columns=features)
    pandasDF.to_csv(out_dir)
    spark.stop()
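# A hedged aside on the dtype filter used in correlate(): CSV schema
# inference yields 'int' for integer columns, but DataFrames built from
# Python ints report 'bigint', which the int/double/float filter would
# silently drop. Illustration only; `spark` and the column names here are
# assumptions, not part of the original pipeline.
demo = spark.createDataFrame([("a", 1, 2.0)], ["station", "no2", "pm10"])
print(demo.dtypes)  # [('station', 'string'), ('no2', 'bigint'), ('pm10', 'double')]
numeric = [name for name, dtype in demo.dtypes
           if dtype in ("int", "bigint", "double", "float")]
print(numeric)  # ['no2', 'pm10']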
def correlation_scaled_checker(scaled_dataset):
    '''
    Try 1: Attributes artist_latitude and artist_longitude seem to be very
    sparse, so they require skipping a lot of values. However, they seem
    irrelevant to the year prediction of a song, so omit them.
    Try 2: After examining the first results in our subset, the values
    analysis_sample_rate, danceability and energy always contain the same
    value, so they get NaN values in the correlation map, which is expected.
    Omit them as well.
    '''
    vector_col = "scaled_features"
    matrix = Correlation.corr(dataset=scaled_dataset, column=vector_col,
                              method='pearson').collect()[0][0]
    corrmatrix = matrix.toArray().tolist()
    print(corrmatrix)
    # `columns` is assumed to be a module-level list of the feature names
    correlation_heatmap(corrmatrix, columns)
def correlation_checker(parquetFile):
    # feature_selector = parquetFile.select('artist_familiarity', 'artist_hotttnesss', 'artist_latitude',
    #     'artist_longitude', 'song_hotttnesss', 'analysis_sample_rate', 'danceability', 'duration', 'end_of_fade_in',
    #     'energy', 'key_confidence', 'start_of_fade_out', 'tempo', 'time_signature_confidence', 'artist_playmeid',
    #     'artist_7digitalid', 'release_7digitalid', 'track_7digitalid', 'key', 'mode', 'time_signature', 'year', 'label')
    # parquetFile.select("segments_loudness_max").show(10, False)
    '''
    Try 1: Attributes artist_latitude and artist_longitude seem to be very
    sparse, so they require skipping a lot of values. However, they seem
    irrelevant to the year prediction of a song, so omit them.
    Try 2: After examining the first results in our subset, the values
    analysis_sample_rate, danceability and energy always contain the same
    value, so they get NaN values in the correlation map, which is expected.
    Omit them as well.
    '''
    # TODO: check what to do with song_hotttnesss (currently still included)
    feature_selector = parquetFile.select(
        'artist_familiarity', 'artist_hotttnesss', 'song_hotttnesss', 'duration',
        'end_of_fade_in', 'key_confidence', 'start_of_fade_out', 'tempo',
        'time_signature_confidence', 'artist_playmeid', 'artist_7digitalid',
        'release_7digitalid', 'track_7digitalid', 'key', 'loudness', 'mode',
        'mode_confidence', 'time_signature', 'year', 'label')
    feature_selector.describe().show()
    columns = [
        'artist_familiarity', 'artist_hotttnesss', 'song_hotttnesss', 'duration',
        'end_of_fade_in', 'key_confidence', 'start_of_fade_out', 'tempo',
        'time_signature_confidence', 'artist_playmeid', 'artist_7digitalid',
        'release_7digitalid', 'track_7digitalid', 'key', 'loudness', 'mode',
        'mode_confidence', 'time_signature', 'year', 'label'
    ]
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=columns,
                                outputCol=vector_col).setHandleInvalid("skip")
    corr_vector = assembler.transform(feature_selector).select(vector_col)
    # matrix = Correlation.corr(myGraph_vector, vector_col)
    matrix = Correlation.corr(dataset=corr_vector, column=vector_col,
                              method='pearson').collect()[0][0]
    corrmatrix = matrix.toArray().tolist()
    print(corrmatrix)
    # Check whether scaling causes any differences. This heatmap should
    # showcase only the non-highly-correlated elements.
    correlation_heatmap(corrmatrix, columns)
def getCorrelationMatrix(self, sales_df):
    vector_col = "corr_features"
    sales_df.printSchema()  # printSchema() prints directly and returns None
    # Keep only the numeric (int/double) columns
    columns = []
    for c, d in zip(sales_df.columns, sales_df.dtypes):
        if d[1] in ("int", "double"):
            columns.append(c)
    print(columns)
    assembler = VectorAssembler(inputCols=columns, outputCol=vector_col)
    df_vector = assembler.transform(sales_df).select(vector_col)
    # Get the correlation matrix
    matrix = Correlation.corr(df_vector, vector_col)
    print(matrix.collect())
def correlation_matrix(df: sql.DataFrame, corr_columns: list,
                       method: str = 'pearson') -> pd.DataFrame:
    """
    Args:
        df: PySpark dataframe
        corr_columns: columns to correlate
        method: 'pearson' or 'spearman'

    Returns:
        The correlation matrix as a pandas DataFrame.
    """
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=corr_columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col, method)
    # The result column is named after the method actually used, so build the
    # key from `method` rather than hard-coding "pearson"
    result = matrix.collect()[0]["{}({})".format(method, vector_col)].values
    return pd.DataFrame(result.reshape(-1, len(corr_columns)),
                        columns=corr_columns, index=corr_columns)
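# A hypothetical call to correlation_matrix above, assuming an active
# SparkSession `spark`; the data is illustrative only. The two columns move
# in exact opposition, so Pearson returns -1 off the diagonal.
demo_sdf = spark.createDataFrame(
    [(1.0, 10.0), (2.0, 8.0), (3.0, 6.0), (4.0, 4.0)], ["a", "b"])
print(correlation_matrix(demo_sdf, ["a", "b"]))
#      a    b
# a  1.0 -1.0
# b -1.0  1.0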
def Correlation(dataset_add, feature_colm, label_colm):
    # dataset_add = str(dataset_add).replace("10.171.0.181", "dhiraj")
    print("Dataset Name ", dataset_add)
    dataset = spark.read.parquet(dataset_add)
    # dataset = spark.read.parquet("hdfs://dhiraj:9000/dev/dmxdeepinsight/datasets/123_AUTOMILES.parquet")
    dataset.show()
    All_colms = label_colm + feature_colm

    # correlation
    featureassembler_correlation = VectorAssembler(
        inputCols=All_colms, outputCol="correlation_colm")
    output_corr = featureassembler_correlation.transform(dataset)
    output_corr.show()
    finalized_corr = output_corr.select("correlation_colm")
    finalized_corr.show()

    # Imported locally because this function shadows the name `Correlation`
    from pyspark.ml.stat import Correlation
    r1p = Correlation.corr(output_corr, "correlation_colm").head()
    print("Pearson correlation matrix:\n" + str(r1p[0]))
    print("Pearson correlation matrix:\n" + str(r1p[0].toArray()))
    pearson_matrix = r1p[0].toArray().tolist()
    # First column of the matrix: correlation of each column with the label
    pearson_value = [x[0] for x in r1p[0].toArray()]
    print(pearson_value)

    # r1s = Correlation.corr(output_corr, "correlation_colm", "spearman").head()
    # print("Spearman correlation matrix:\n" + str(r1s[0]))

    json_response = {'pearson_value': pearson_value, 'matrix': pearson_matrix}
    print(json_response)
    return json_response

# Correlation(dataset_address, feature_colm, label_colm)
def mahalanobis(sdf, colnames):
    """Computes the Mahalanobis distance from the origin and compares it to
    critical values of the Chi-Squared distribution to identify possible
    outliers.
    """
    check_columns(sdf, colnames)

    # Builds a pipeline to assemble the feature columns and scale them
    assembler = VectorAssembler(inputCols=colnames, outputCol='__features')
    scaler = StandardScaler(inputCol='__features', outputCol='__scaled',
                            withMean=True)
    pipeline = Pipeline(stages=[assembler, scaler])
    features = pipeline.fit(sdf).transform(sdf)

    # Computes the correlation between features and inverts it.
    # Since we scaled the features, we can assume they have unit variance,
    # and therefore correlation and covariance matrices are the same!
    mat = Correlation.corr(features, '__scaled').head()[0].toArray()
    inv_mat = inv(mat)

    # Computes the critical value. The *squared* Mahalanobis distance of
    # Gaussian data follows a Chi-Squared distribution with k degrees of
    # freedom, so the distance itself is compared against the square root
    # of the 99.9% quantile.
    critical_value = np.sqrt(chi2.ppf(0.999, len(colnames)))

    # Builds a (Pandas) UDF to compute the Mahalanobis distance from the
    # origin: sqrt((V - 0)' * inv_M * (V - 0))
    try:
        import pyarrow

        @F.pandas_udf('double')
        def pudf_mult(v):
            return v.apply(lambda x: np.sqrt(np.dot(np.dot(x, inv_mat), x)))
    except ImportError:
        @F.udf('double')
        def pudf_mult(v):
            # A plain UDF receives one value at a time, not a pandas Series
            return float(np.sqrt(np.dot(np.dot(v, inv_mat), v)))

    # Converts the feature vector into an array
    features = dense_to_array(features, '__scaled', '__array_scaled')

    # Computes the Mahalanobis distance and flags as outliers all elements
    # above the critical value
    distance = (features
                .withColumn('__mahalanobis', pudf_mult('__array_scaled'))
                .withColumn('__outlier', F.col('__mahalanobis') > critical_value)
                .drop('__features', '__scaled', '__array_scaled'))
    return distance
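# A small worked check of the cutoff used in mahalanobis() above, runnable
# on its own: with k = 3 scaled features, the 99.9% chi-squared quantile is
# about 16.27, so a point is flagged once its Mahalanobis distance exceeds
# sqrt(16.27), roughly 4.03.
import numpy as np
from scipy.stats import chi2

print(chi2.ppf(0.999, 3))           # ~16.266
print(np.sqrt(chi2.ppf(0.999, 3)))  # ~4.033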
def main(max_num_users: int):
    logger.info('Start generating similar user matrix')
    try:
        listenbrainz_spark.init_spark_session('User Similarity')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        playcounts_df = utils.read_files_from_HDFS(
            path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME)
        users_df = utils.read_files_from_HDFS(
            path.USER_SIMILARITY_USERS_DATAFRAME)
    except PathNotFoundException as err:
        logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        logger.error(str(err), exc_info=True)
        raise

    vectors_df = get_vectors_df(playcounts_df)
    similarity_matrix = Correlation.corr(
        vectors_df, 'vector', 'pearson').first()['pearson(vector)'].toArray()
    similar_users = threshold_similar_users(similarity_matrix, max_num_users)

    # Due to an unresolved bug in Spark (https://issues.apache.org/jira/browse/SPARK-10925),
    # we cannot join twice on the same dataframe. Hence, we create a modified
    # dataframe with the columns renamed.
    other_users_df = users_df\
        .withColumnRenamed('user_id', 'other_user_id')\
        .withColumnRenamed('user_name', 'other_user_name')

    similar_users_df = listenbrainz_spark.session.createDataFrame(
            similar_users,
            ['user_id', 'other_user_id', 'similarity', 'global_similarity'])\
        .join(users_df, 'user_id', 'inner')\
        .join(other_users_df, 'other_user_id', 'inner')\
        .select('user_name',
                struct('other_user_name', 'similarity',
                       'global_similarity').alias('similar_user'))\
        .groupBy('user_name')\
        .agg(collect_list('similar_user').alias('similar_users'))

    logger.info('Finished generating similar user matrix')
    return create_messages(similar_users_df)
""" from __future__ import print_function # $example on$ from pyspark.ml.linalg import Vectors from pyspark.ml.stat import Correlation # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession \ .builder \ .appName("CorrelationExample") \ .getOrCreate() # $example on$ data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),), (Vectors.dense([4.0, 5.0, 0.0, 3.0]),), (Vectors.dense([6.0, 7.0, 0.0, 8.0]),), (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)] df = spark.createDataFrame(data, ["features"]) r1 = Correlation.corr(df, "features").head() print("Pearson correlation matrix:\n" + str(r1[0])) r2 = Correlation.corr(df, "features", "spearman").head() print("Spearman correlation matrix:\n" + str(r2[0])) # $example off$ spark.stop()