Example 1
def cal(data):
    df = spark.createDataFrame(data, ["features"])
    df.show()
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
def main():
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]

    df = spark.createDataFrame(data, ["features"])
    df.show()

    r1 = Correlation.corr(df, "features").head()
    print("Pearson Correlation matrix: ", str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman Correlation matrix:", str(r2[0]))
Example 3
def learn1():
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.stat import Correlation

    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]
    df = ss.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
    def pearsonTest(self):
        dataset=self.dataset
        labelColm=self.labelColm
        features=self.features
        labelColm = [labelColm]

        allColms = labelColm + features

        featureAssembler = VectorAssembler(
            inputCols=allColms, outputCol="allColmsVectorized", handleInvalid="skip")
        allColmsVectorizedDataset = featureAssembler.transform(dataset)
        allColmsVectorizedDataset.show()
        r1p = Correlation.corr(allColmsVectorizedDataset, "allColmsVectorized").head()
        print("pearson correlation matrix : \n : " + str(r1p[0]))
        pearson_matrix = r1p[0].toArray().tolist()
        pearsonMatrix = []
        for everylist in pearson_matrix:
            insideList = []
            for listinlist in everylist:
                insideList.append(round(listinlist, 4))
            pearsonMatrix.append(insideList)
        pearson_value_d = []
        for x in r1p[0].toArray():
            pearson_value_d.append(round(x[0], 4))
            # pearson_value_d.append(x[0])
        pearson_value = {}
        for col, val in zip(allColms, pearson_value_d):
            pearson_value[col] = val
        print(pearson_value)
        #
        # r1s = Correlation.corr(allColmsVectorizedDataset, "allColmsVectorized", "spearman").head()
        # print(" spearman correlation...: \n" + str(r1s[0]))
        result_pearson = {'pearson_value': pearson_value,
                          'matrix': pearsonMatrix}
        return result_pearson
Example 5
def corr(kdf: 'ks.DataFrame', method: str = 'pearson') -> pd.DataFrame:
    """
    The correlation matrix of all the numerical columns of this dataframe.

    Only accepts scalar numerical values for now.

    :param kdf: the koalas dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame`

    >>> ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    assert method in ('pearson', 'spearman')
    ndf, column_index = to_numeric_df(kdf)
    corr = Correlation.corr(ndf, CORRELATION_OUTPUT_COLUMN, method)
    pcorr = corr.toPandas()
    arr = pcorr.iloc[0, 0].toArray()
    if column_index_level(column_index) > 1:
        idx = pd.MultiIndex.from_tuples(column_index)
    else:
        idx = pd.Index([idx[0] for idx in column_index])
    return pd.DataFrame(arr, columns=idx, index=idx)
Example 6
def corr(kdf: 'ks.DataFrame', method: str = 'pearson') -> pd.DataFrame:
    """
    The correlation matrix of all the numerical columns of this dataframe.

    Only accepts scalar numerical values for now.

    :param kdf: the koalas dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame`

    >>> ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    assert method in ('pearson', 'spearman')
    ndf, fields = to_numeric_df(kdf)
    corr = Correlation.corr(ndf, CORRELATION_OUTPUT_COLUMN, method)
    pcorr = corr.toPandas()
    arr = pcorr.iloc[0, 0].toArray()
    arr = pd.DataFrame(arr)
    arr.columns = fields
    arr = arr.set_index(pd.Index(fields))
    return arr
Example 7
def corr(psdf: "ps.DataFrame", method: str = "pearson") -> pd.DataFrame:
    """
    The correlation matrix of all the numerical columns of this dataframe.

    Only accepts scalar numerical values for now.

    :param psdf: the pandas-on-Spark dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame`

    >>> ps.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    assert method in ("pearson", "spearman")
    ndf, column_labels = to_numeric_df(psdf)
    corr = Correlation.corr(ndf, CORRELATION_OUTPUT_COLUMN, method)
    pcorr = cast(pd.DataFrame, corr.toPandas())
    arr = pcorr.iloc[0, 0].toArray()
    if column_labels_level(column_labels) > 1:
        idx = pd.MultiIndex.from_tuples(column_labels)
    else:
        idx = pd.Index([label[0] for label in column_labels])
    return pd.DataFrame(arr, columns=idx, index=idx)
Example 8
def correlacion(Finales, corte=.7):
    Cor = [i[0] for i in Finales.dtypes if 'double' in i[1]]
    C = [i[0] for i in Finales.dtypes if 'string' in i[1]]
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=[c for c in Cor],
                                outputCol=vector_col)
    df_vector = assembler.transform(Finales).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col, "spearman").head()
    a = matrix[0]
    Arreglo = a.toArray()
    CorrArrpd = pd.DataFrame(data=Arreglo[0:, 0:])
    CorrArrpd.columns = [c for c in Cor]
    col_corr = []
    for i in range(len(CorrArrpd)):
        for j in range(i):
            if (CorrArrpd.iloc[i, j] >= corte) and (CorrArrpd.columns[j]
                                                    not in col_corr):
                colname = CorrArrpd.columns[i]
                col_corr.append(colname)

    CorrArrpd.drop(col_corr, inplace=True, axis=1)
    sinCorrelacion = CorrArrpd.columns
    joinedlist = list(set().union(sinCorrelacion, C))
    Finales = Finales.select(
        [sinCor for sinCor in Finales.columns if sinCor in joinedlist])
    return Finales
Example 9
def correlation(request,project_id):
    template = loader.get_template('data/datacorrelation.html')
    project = get_object_or_404(Project, pk=project_id)
    #df = getTransformedData(project_id,0)
    df = mysite.dataoperation.readfromcassandra(project_id,0)
    df = mysite.dataoperation.transformdataframe(project,df,0)
    df = applyfeaturetransition(project,df,project.features.all(),project.target,project.targets.all())

    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.stat import Correlation
    from pyspark.ml.stat import ChiSquareTest
    feat = list(map(lambda x:x.fieldname,project.features.all()))
    targ = list(map(lambda x:x.fieldname,project.targets.all()))

    features = feat+targ
    assembler = VectorAssembler(
        inputCols=features,
        outputCol="correlation")
    
    output = assembler.transform(df)
    corr_mat=Correlation.corr(output,"correlation", method="pearson")
    corr_html = corr_mat.toPandas().iloc[0]['pearson(correlation)']
    #chi_sqr = ChiSquareTest.test(output, "correlation", "label").head()
    context = {
        "project" : project,
        "project_id" : project_id,
        "menuactive":3,
        "correlation":corr_html
        
    }
    return HttpResponse(template.render(context, request))
Example 11
def correlation(self,
                columns,
                method="pearson",
                strategy="mean",
                output="json"):
    """
    Calculate the correlation between columns. It will try to cast a column to float where necessary and impute
    missing values
    :param self:
    :param columns: Columns to be processed
    :param method: Method used to calculate the correlation
    :param strategy: Imputing strategy
    :param output: array or json
    :return:
    """
    columns = parse_columns(self, columns)
    # try to parse the select column to float and create a vector

    df = self
    for col_name in columns:
        df = df.cols.cast(col_name, "float")
        logging.info(
            "Casting {col_name} to float...".format(col_name=col_name))

    # Impute missing values
    imputed_cols = [c + "_imputed" for c in columns]
    df = df.cols.impute(columns, imputed_cols, strategy)
    logging.info("Imputing {columns}, Using '{strategy}'...".format(
        columns=columns, strategy=strategy))

    # Create Vector necessary to calculate the correlation
    df = df.cols.nest(imputed_cols, "features", "vector")

    corr = Correlation.corr(df, "features", method).head()[0].toArray()

    if output == "array":
        result = corr

    elif output == "json":

        # Parse result to json
        col_pair = []
        for col_name in columns:
            for col_name_2 in columns:
                col_pair.append({"between": col_name, "an": col_name_2})

        # flat array
        values = corr.flatten('F').tolist()

        result = []
        for n, v in zip(col_pair, values):
            # Remove correlation between the same column
            if n["between"] is not n["an"]:
                n["value"] = v
                result.append(n)

        result = sorted(result, key=lambda k: k['value'], reverse=True)

    return result
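
The pairing above works because col_pair is built in row-major order while corr.flatten('F') yields the values in column-major order; since a correlation matrix is symmetric, each pair still lines up with its own coefficient. A small self-contained NumPy illustration (toy matrix, not part of the library):

import numpy as np

cols = ["a", "b", "c"]
m = np.array([[1.0, 0.3, -0.2],
              [0.3, 1.0, 0.5],
              [-0.2, 0.5, 1.0]])            # symmetric, like any correlation matrix

pairs = [(x, y) for x in cols for y in cols]   # row-major pair order
values = m.flatten('F').tolist()               # column-major value order

# m[i, j] == m[j, i], so every pair is matched with its own coefficient.
for (x, y), v in zip(pairs, values):
    assert v == m[cols.index(x), cols.index(y)]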
Example 12
def corr_test(self, group_by, pivot_col, agg_dict, method='pearson'):
    matrix_df = self.toWide(group_by=group_by,
                            pivot_col=pivot_col,
                            agg_dict=agg_dict).cache()
    vec_df = CorrMat.vector_assembler(self=matrix_df,
                                      cols_name=matrix_df.columns[1:])
    return SparkCorrelation.corr(vec_df, 'features', method)\
                           .select(col(f'{method}(features)').alias(f'{method}_features'))
Example 13
def correl():
    spark = SparkSession\
            .builder\
            .appName("Test")\
            .getOrCreate()

    rawData = spark.sql("select * from matus_marton.googleplay")
    df_gp = rawData.toPandas()
    print(df_gp.corr())
    print(df_gp.describe())

    rawData.show()
    # Correlation.corr expects a single vector column, so assemble the numeric
    # columns first (assumes VectorAssembler is imported from pyspark.ml.feature).
    assembled = VectorAssembler(inputCols=["rating", "reviews"],
                                outputCol="features").transform(rawData)
    print(Correlation.corr(assembled, "features").head()[0])
    rawData.describe().show()

    return 'Correlation computed'
Example 14
def find_correlation(spark: SparkSession,
                     to_correlate: List[str],
                     features_file=TRACK_FEATURES):
    df = spark.read.json(features_file, multiLine=True)
    vector_mapper = udf(lambda row: Vectors.dense(row), VectorUDT())

    df = df.withColumn("features", vector_mapper(array(*to_correlate)))
    corr_matrix = Correlation.corr(df, "features").head()[0]
    return df, corr_matrix
Example 15
def CorrelationMatrix(df):
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=df.columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col)
    cor = matrix.collect()[0]["pearson({})".format(vector_col)].values
    cor2 = pd.DataFrame(
        np.array(cor).reshape(len(df.columns), len(df.columns))).iloc[:15, :15]
    heatmap2d(cor2)
Example 16
def correlation(self, input_cols, method="pearson", output="json"):
    """
    Calculate the correlation between columns. It will try to cast a column to float where necessary and impute
    missing values
    :param self:
    :param input_cols: Columns to be processed
    :param method: Method used to calculate the correlation
    :param output: array or json
    :return:
    """

    df = self

    # Values in columns can not be null. Warn user
    input_cols = parse_columns(self,
                               input_cols,
                               filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
    # try to parse the select column to float and create a vector

    # print(self.cols.count_na(input_cols))

    # Input is not a vector transform to a vector
    output_col = name_col(input_cols, "correlation")
    if len(input_cols) > 1:
        for col_name in input_cols:
            df = df.cols.cast(col_name, "float")
            logger.print(
                "Casting {col_name} to float...".format(col_name=col_name))

        df = df.cols.nest(input_cols, "vector", output_cols=output_col)

    corr = Correlation.corr(df, output_col, method).head()[0].toArray()

    if output == "array":
        result = corr

    elif output == "json":

        # Parse result to json
        col_pair = []
        for col_name in input_cols:
            for col_name_2 in input_cols:
                col_pair.append({"between": col_name, "an": col_name_2})

        # flat array
        values = corr.flatten('F').tolist()

        result = []
        for n, v in zip(col_pair, values):
            # Remove correlation between the same column
            if n["between"] is not n["an"]:
                n["value"] = v
                result.append(n)

        result = sorted(result, key=lambda k: k['value'], reverse=True)

    return {"cols": input_cols, "data": result}
Example 17
def mldemo():

    spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])
    
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))
    
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
Example 18
def correlation_matrix(df, corr_columns, method='pearson'):
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=corr_columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col, method)

    result = matrix.collect()[0]["{}({})".format(method, vector_col)].values
    return pd.DataFrame(result.reshape(-1, len(corr_columns)),
                        columns=corr_columns,
                        index=corr_columns).to_string()
Example 19
def correlation(data):
    dfe = data.drop(*["Artist", "SongID"])
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=dfe.columns, outputCol=vector_col)
    df_vector = assembler.transform(dfe).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col)
    n = len(dfe.columns)
    mat_array = np.reshape(
        matrix.collect()[0]["pearson({})".format(vector_col)].values, [n, n])
    m = pd.DataFrame(mat_array, columns=dfe.columns, index=dfe.columns)
    # ax = sns.heatmap(m)
    print(dfe.columns)
def Correlation_test_imp(dataset, features, label_col):

    label_col = [label_col]

    All_colms = label_col + features

    featureassembler_correlation = VectorAssembler(
        inputCols=All_colms,
        outputCol="correlation_colm",
        handleInvalid="skip")
    output_corr = featureassembler_correlation.transform(dataset)
    output_corr.show()

    finalized_corr = output_corr.select("correlation_colm")
    finalized_corr.show()
    from pyspark.ml.stat import Correlation

    r1p = Correlation.corr(output_corr, "correlation_colm").head()
    print("pearson correlation matrix : \n : " + str(r1p[0]))
    pearson_matrix = r1p[0].toArray().tolist()

    pearsonMatrix = []
    for everylist in pearson_matrix:
        insideList = []
        for listinlist in everylist:
            insideList.append(round(listinlist, 4))

        pearsonMatrix.append(insideList)

    print(pearsonMatrix)

    pearson_value_d = []

    for x in r1p[0].toArray():
        pearson_value_d.append(round(x[0], 4))
        # pearson_value_d.append(x[0])

    print(pearson_value_d)

    pearson_value = {}
    for col, val in zip(All_colms, pearson_value_d):
        pearson_value[col] = val

    print(pearson_value)

    #
    # r1s = Correlation.corr(output_corr, "correlation_colm", "spearman").head()
    # print(" spearman correlation...: \n" + str(r1s[0]))

    result_pearson = {'pearson_value': pearson_value, 'matrix': pearsonMatrix}
    # print(json_response)

    return result_pearson
Example 21
def correlation(self, columns, method="pearson", output="json"):
    """
    Calculate the correlation between columns. It will try to cast a column to float where necessary and impute
    missing values
    :param self:
    :param columns: Columns to be processed
    :param method: Method used to calculate the correlation
    :param output: array or json
    :return:
    """
    columns = parse_columns(self, columns)
    # try to parse the select column to float and create a vector

    df = self
    if len(columns) == 1:
        if is_column_a(df, columns, "vector"):
            output_col = one_list_to_val(columns)
    else:
        output_col = "_correlation_features"
        for col_name in columns:
            df = df.cols.cast(col_name, "float")
            logger.print(
                "Casting {col_name} to float...".format(col_name=col_name))

        df = df.cols.nest(columns, "vector", output_cols=output_col)

    # Create Vector necessary to calculate the correlation
    corr = Correlation.corr(df, output_col, method).head()[0].toArray()

    if output == "array":
        result = corr

    elif output == "json":

        # Parse result to json
        col_pair = []
        for col_name in columns:
            for col_name_2 in columns:
                col_pair.append({"between": col_name, "an": col_name_2})

        # flat array
        values = corr.flatten('F').tolist()

        result = []
        for n, v in zip(col_pair, values):
            # Remove correlation between the same column
            if n["between"] is not n["an"]:
                n["value"] = v
                result.append(n)

        result = sorted(result, key=lambda k: k['value'], reverse=True)

    return result
Example 22
def basic_statistics():
    """Basic statistics."""

    df = sql.read.parquet(str(DATA_PARQUET))

    numeric = ['cost', 'call_duration_minutes', 'data_volume_mb']
    assemble = VectorAssembler(inputCols=numeric, outputCol='features')
    features = assemble.transform(df.dropna(subset=numeric + ['target']))

    # summarize
    summarize = Summarizer().metrics('mean', 'variance', 'count',
                                     'numNonZeros', 'max', 'min', 'normL2',
                                     'normL1')
    features.select(summarize.summary(
        features['features'])).show(truncate=False)

    # correlations
    r1 = Correlation.corr(features, 'features', 'pearson').head()[0]
    small = features.sample(fraction=0.1, seed=100500)
    r2 = Correlation.corr(small, 'features', 'spearman').head()[0]
Example 23
def correlation_matrix(df: DataFrame, feature_columns: list,
                       output_column: str) -> np.ndarray:
    """
    generates the Pearson correlation coefficients matrix for feature ranking
    """

    vec_assembler = VectorAssembler(inputCols=feature_columns +
                                    [output_column],
                                    outputCol="all_columns")
    df_vectorized = vec_assembler.transform(df)
    cor_matrix = Correlation.corr(df_vectorized,
                                  "all_columns").collect()[0][0].toArray()
    return cor_matrix
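
A hedged usage sketch (df and the column names below are hypothetical): because output_column is appended last in the assembler, the last row of the returned array holds each feature's correlation with the target, which is what a feature ranking would read.

feature_cols = ["age", "income", "tenure"]            # hypothetical numeric columns
mat = correlation_matrix(df, feature_cols, "spend")   # df is an existing Spark DataFrame
target_corr = mat[-1, :-1]                            # correlation of each feature with "spend"
ranking = sorted(zip(feature_cols, np.abs(target_corr)),
                 key=lambda pair: pair[1], reverse=True)
print(ranking)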
Example 24
def calc_correlation(feature_columns, feature_data):
    """
    Calculates the Spearman Correlation Coefficient between all given columns.
    """
    print("-- Calculating correlation --")
    print("Features: ", feature_columns)
    vector_col = "features"
    assembler = VectorAssembler(inputCols=feature_columns,
                                outputCol=vector_col)
    df_vector = assembler.transform(feature_data).select(vector_col)
    corr_mat = Correlation.corr(df_vector, vector_col, "spearman").head()
    print("-- Done calculating correlation -- ")
    return corr_mat[0]
Example 25
def correlate(uri1, uri2, conf):
    spark = SparkSession.builder \
                        .config(conf=conf) \
                        .getOrCreate()

    df1 = spark.read.format("csv").options(header=True, inferschema=True).load(uri1)
    df2 = spark.read.format("csv").options(header=True, inferschema=True).load(uri2)

    df1.printSchema()
    df2.printSchema()

    """
    For Spearman, a rank correlation, we need to create an RDD[Double] for each column and sort it
    in order to retrieve the ranks and then join the columns back into an RDD[Vector], which is fairly costly.
    Cache the input Dataset before calling corr with method = ‘spearman’ to avoid recomputing the common lineage.
    """
    # join 2 datasets and ignore first resolution columns
    joined = df1.join(df2, ["temp_res", "spat_res"], 'inner')

    feature_types = joined.dtypes[2:]
    # print(feature_types)

    # drop non numeric features just in case
    num_feature_types = filter(lambda t: t[1] == "int" or t[1] == "double" or t[1] == "float", feature_types)
    features = [f_t[0] for f_t in num_feature_types]
    # print(features)

    joined = joined.select(features)
    joined.printSchema()

    # assemble the Vectors for Correlation.corr(); np.array is equivalent to dense vectors
    vecAssembler = VectorAssembler(
        inputCols=features,
        outputCol="features"
    )
    joinedVec = vecAssembler.transform(joined)
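    # Per the Spearman note quoted above, caching the assembled vectors avoids
    # recomputing the lineage when corr() ranks each column (optional optimization).
    joinedVec = joinedVec.cache()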
    spearmanCorr = Correlation.corr(joinedVec, 'features', method='spearman').collect()[0][0]

    # turn into pandas dataframe
    spearmanCorr = spearmanCorr.toArray()
    print(spearmanCorr)

    # prepare and write correlation result
    out_dir = spark.conf.get("output")
    out_dir = "correlations/" + out_dir
    print("output directory is: " + out_dir)

    pandasDF = pd.DataFrame(spearmanCorr, index=features, columns=features)
    pandasDF.to_csv(out_dir)

    spark.stop()
Example 26
def correlation_scaled_checker(scaled_dataset):
    '''
    Try 1: Attributes artist_latitude and artist_longitude seem to be very sparse, so they would require skipping a lot of values.
        They also seem irrelevant to predicting the year of a song, so omit them.
    Try 2: After examining the first results on our subset, the columns analysis_sample_rate, danceability and energy always contain the same value,
        so they get NaN values in the correlation map, which is expected. Omit them as well.
    '''
    vector_col = "scaled_features"

    matrix = Correlation.corr(dataset=scaled_dataset, column=vector_col, method='pearson').collect()[0][0]
    corrmatrix = matrix.toArray().tolist()
    print(corrmatrix)

    correlation_heatmap(corrmatrix, columns)
def correlation_checker(parquetFile):
    #
    # feature_selector = parquetFile.select('artist_familiarity', 'artist_hotttnesss', 'artist_latitude',
    # 'artist_longitude', 'song_hotttnesss', 'analysis_sample_rate', 'danceability','duration', 'end_of_fade_in',
    # 'energy', 'key_confidence', 'start_of_fade_out', 'tempo', 'time_signature_confidence', 'artist_playmeid',
    # 'artist_7digitalid', 'release_7digitalid', 'track_7digitalid', 'key', 'mode', 'time_signature', 'year', 'label')

    # parquetFile.select("segments_loudness_max").show(10, False)
    '''
    Try 1: Attributes artist_latitude and artist_longitude seem to be very sparse, so they would require skipping a lot of values.
        They also seem irrelevant to predicting the year of a song, so omit them.
    Try 2: After examining the first results on our subset, the columns analysis_sample_rate, danceability and energy always contain the same value,
        so they get NaN values in the correlation map, which is expected. Omit them as well.
    '''

    #   TODO check what to do with file song_hotttnesss. For the time being omit that
    feature_selector = parquetFile.select(
        'artist_familiarity', 'artist_hotttnesss', 'song_hotttnesss',
        'duration', 'end_of_fade_in', 'key_confidence', 'start_of_fade_out',
        'tempo', 'time_signature_confidence', 'artist_playmeid',
        'artist_7digitalid', 'release_7digitalid', 'track_7digitalid', 'key',
        'loudness', 'mode', 'mode_confidence', 'time_signature', 'year',
        'label')

    feature_selector.describe().show()

    columns = [
        'artist_familiarity', 'artist_hotttnesss', 'song_hotttnesss',
        'duration', 'end_of_fade_in', 'key_confidence', 'start_of_fade_out',
        'tempo', 'time_signature_confidence', 'artist_playmeid',
        'artist_7digitalid', 'release_7digitalid', 'track_7digitalid', 'key',
        'loudness', 'mode', 'mode_confidence', 'time_signature', 'year',
        'label'
    ]

    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=columns,
                                outputCol=vector_col).setHandleInvalid("skip")
    corr_vector = assembler.transform(feature_selector).select(vector_col)
    # matrix = Correlation.corr(myGraph_vector, vector_col)

    matrix = Correlation.corr(dataset=corr_vector,
                              column=vector_col,
                              method='pearson').collect()[0][0]
    corrmatrix = matrix.toArray().tolist()
    print(corrmatrix)

    # Check what if scaling causes any differences. This heatmap should showcase only the non-highly correlated elements
    correlation_heatmap(corrmatrix, columns)
Example 28
    def getCorrelationMatrix(self,sales_df):
        vector_col = "corr_features"
        print(sales_df.printSchema())
        columns=[]
        for c,d in zip(sales_df.columns, sales_df.dtypes) :
            if d[1] in ("int","double"):
                columns.append(c)
        print(columns)

        assembler = VectorAssembler(inputCols=columns, outputCol=vector_col)
        df_vector = assembler.transform(sales_df).select(vector_col)

        # get correlation matrix
        matrix = Correlation.corr(df_vector, vector_col)
        print(matrix.collect())
Example 29
def correlation_matrix(df: sql.DataFrame, corr_columns: list, method: str = 'pearson'):
    """

    Args:
        df: pyspark dataframe,
        corr_columns:
        method:

    Returns:

    """
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=corr_columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col, method)
    result = matrix.collect()[0]["{}({})".format(method, vector_col)].values
    return pd.DataFrame(result.reshape(-1, len(corr_columns)), columns=corr_columns, index=corr_columns)
Example 30
def Correlation(dataset_add, feature_colm, label_colm):

    # dataset_add = str(dataset_add).replace("10.171.0.181", "dhiraj")
    print("Dataset Name  ", dataset_add)
    dataset = spark.read.parquet(dataset_add)
    #dataset = spark.read.parquet("hdfs://dhiraj:9000/dev/dmxdeepinsight/datasets/123_AUTOMILES.parquet")

    dataset.show()

    All_colms =  label_colm + feature_colm

    # correlation

    featureassembler_correlation = VectorAssembler(
        inputCols=All_colms, outputCol="correlation_colm")
    output_corr = featureassembler_correlation.transform(dataset)
    output_corr.show()

    finalized_corr = output_corr.select("correlation_colm")
    finalized_corr.show()
    from pyspark.ml.stat import Correlation

    r1p = Correlation.corr(output_corr, "correlation_colm").head()
    print("pearson correlation matrix \n : " + str(r1p[0]))
    print("pearson correlation matrix \n : " + str(r1p[0].toArray()))
    pearson_matrix = r1p[0].toArray().tolist()
    pearson_value = []

    for x in r1p[0].toArray():
        pearson_value.append(x[0])

    print(pearson_value)

    #
    # r1s = Correlation.corr(output_corr, "correlation_colm", "spearman").head()
    # print(" spearman correlation...: \n" + str(r1s[0]))

    json_response = {'pearson_value' : pearson_value,
                     'matrix': pearson_matrix}
    print(json_response)


    return json_response

# Correlation(dataset_address, feature_colm, label_colm)
Example 31
def mahalanobis(sdf, colnames):
    """Computes Mahalanobis distance from origin and compares to critical values
    using Chi-Squared distribution to identify possible outliers.
    """
    check_columns(sdf, colnames)
    # Builds pipeline to assemble feature columns and scale them
    assembler = VectorAssembler(inputCols=colnames, outputCol='__features')
    scaler = StandardScaler(inputCol='__features',
                            outputCol='__scaled',
                            withMean=True)
    pipeline = Pipeline(stages=[assembler, scaler])
    features = pipeline.fit(sdf).transform(sdf)

    # Computes correlation between features and inverts it
    # Since we scaled the features, we can assume they have unit variance
    # and therefore, correlation and covariance matrices are the same!
    mat = Correlation.corr(features, '__scaled').head()[0].toArray()
    inv_mat = inv(mat)

    # Computes critical value
    critical_value = chi2.ppf(0.999, len(colnames))

    # Builds Pandas UDF to compute Mahalanobis distance from origin
    # sqrt((V - 0) * inv_M * (V - 0))
    try:
        import pyarrow

        @F.pandas_udf('double')
        def pudf_mult(v):
            return v.apply(lambda v: np.sqrt(np.dot(np.dot(v, inv_mat), v)))
    except ImportError:

        @F.udf('double')
        def pudf_mult(v):
            # Plain UDF fallback: v is a single array-valued row, not a pandas Series.
            return float(np.sqrt(np.dot(np.dot(v, inv_mat), v)))

    # Convert feature vector into array
    features = dense_to_array(features, '__scaled', '__array_scaled')
    # Computes Mahalanobis distance and flags as outliers all elements above critical value
    distance = (features.withColumn(
        '__mahalanobis', pudf_mult('__array_scaled')).withColumn(
            '__outlier',
            F.col('__mahalanobis') > critical_value).drop(
                '__features', '__scaled', '__array_scaled'))
    return distance
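
A quick NumPy-only sanity check of the distance formula sqrt(V · inv_M · V) used by the UDF above (toy numbers, unrelated to any dataset):

import numpy as np
from numpy.linalg import inv

corr = np.array([[1.0, 0.8],
                 [0.8, 1.0]])    # correlation of two scaled (unit-variance) features
v = np.array([2.5, -1.0])        # one standardized observation

# Mahalanobis distance from the origin, matching the computation inside pudf_mult.
print(np.sqrt(v @ inv(corr) @ v))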
Example 32
def main(max_num_users: int):

    logger.info('Start generating similar user matrix')
    try:
        listenbrainz_spark.init_spark_session('User Similarity')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        playcounts_df = utils.read_files_from_HDFS(
            path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME)
        users_df = utils.read_files_from_HDFS(
            path.USER_SIMILARITY_USERS_DATAFRAME)
    except PathNotFoundException as err:
        logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        logger.error(str(err), exc_info=True)
        raise

    vectors_df = get_vectors_df(playcounts_df)

    similarity_matrix = Correlation.corr(
        vectors_df, 'vector', 'pearson').first()['pearson(vector)'].toArray()
    similar_users = threshold_similar_users(similarity_matrix, max_num_users)

    # Due to an unresolved bug in Spark (https://issues.apache.org/jira/browse/SPARK-10925), we cannot join twice on
    # the same dataframe. Hence, we create a modified dataframe with the columns renamed.
    other_users_df = users_df\
        .withColumnRenamed('user_id', 'other_user_id')\
        .withColumnRenamed('user_name', 'other_user_name')

    similar_users_df = listenbrainz_spark.session.createDataFrame(similar_users, ['user_id', 'other_user_id',
        'similarity', 'global_similarity'])\
        .join(users_df, 'user_id', 'inner')\
        .join(other_users_df, 'other_user_id', 'inner')\
        .select('user_name', struct('other_user_name', 'similarity', 'global_similarity').alias('similar_user'))\
        .groupBy('user_name')\
        .agg(collect_list('similar_user').alias('similar_users'))

    logger.info('Finishing generating similar user matrix')

    return create_messages(similar_users_df)
Example 33
"""
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("CorrelationExample") \
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
    # $example off$

    spark.stop()