コード例 #1
0
ファイル: describe-spark.py プロジェクト: sabariask/functions
    def describe_float_1d(df, column, current_result, nrows):
        """Compute descriptive statistics for one numeric column.

        Args:
            df: Spark DataFrame that contains ``column``.
            column: Name of the column to describe.
            current_result: Mapping of previously computed statistics; only
                ``current_result["count"]`` is read, as the divisor of the
                mean absolute deviation (presumably the non-null row count --
                confirm against the caller).
            nrows: Row count used as the denominator of ``p_zeros``.

        Returns:
            A pandas Series named ``column`` holding the aggregate moments
            (mean, min, max, variance, kurtosis, std, skewness, sum), one
            approximate percentile per quantile, and the derived entries
            ``range``, ``iqr``, ``cv``, ``mad``, ``type``, ``n_zeros`` and
            ``p_zeros``.
        """
        # Build the null-free projection once; every aggregate below reuses it.
        non_null = df.select(column).na.drop()

        stats_df = non_null.agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum")).toPandas()

        # One approximate-percentile query per quantile, stored under the
        # label that pretty_name() returns for it.
        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (non_null.selectExpr(
                "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
                    col=column, n=x)).toPandas().iloc[:, 0])

        # Work on a copy of the single result row from here on.
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        # Mean absolute deviation: sum of |value - mean| over the given count.
        stats["mad"] = (non_null.select(
            df_abs(col(column) - stats["mean"]).alias("delta")).agg(
                df_sum(col("delta"))).toPandas().iloc[0, 0] /
                        float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        return stats
コード例 #2
0
ファイル: describe-spark.py プロジェクト: sabariask/functions
    def describe_date_1d(df, column):
        """Summarise the minimum and maximum of a date-like column.

        Args:
            df: Spark DataFrame that contains ``column``.
            column: Name of the column to describe.

        Returns:
            A pandas Series named ``column`` with ``min``, ``max`` and
            ``type`` ("DATE"). When the bounds arrive as ``pd.Timestamp``
            they are replaced by their string form; otherwise a ``range``
            entry (``max - min``) is added.
        """
        target = col(column)
        non_null = df.select(column).na.drop()
        extremes = non_null.agg(
            df_min(target).alias("min"),
            df_max(target).alias("max"),
        ).toPandas()

        summary = extremes.iloc[0].copy()
        summary.name = column

        if not isinstance(summary["max"], pd.Timestamp):
            # Bounds are subtracted directly to give the range.
            summary["range"] = summary["max"] - summary["min"]
        else:
            # Switch to object dtype so the bounds can be stored as strings.
            summary = summary.astype(object)
            for bound in ("max", "min"):
                summary[bound] = str(summary[bound].to_pydatetime())

        summary["type"] = "DATE"
        return summary
コード例 #3
0
    def describe_date_1d(df, column):
        """Summarise the minimum and maximum of a date-like column.

        Args:
            df: Spark DataFrame that contains ``column``.
            column: Name of the column to describe.

        Returns:
            A pandas Series named ``column`` with ``min``, ``max`` and
            ``type`` ("DATE"). When the bounds arrive as ``pd.Timestamp``
            they are replaced by their string form; otherwise a ``range``
            entry (``max - min``) is added.
        """
        stats_df = df.select(column).na.drop().agg(df_min(col(column)).alias("min"),
                                                   df_max(col(column)).alias("max")
                                                  ).toPandas()
        # Positional access to the first row; ``.ix`` no longer exists in
        # current pandas releases.
        stats = stats_df.iloc[0].copy()
        stats.name = column

        # Convert pandas Timestamp objects to regular datetimes, stored as text.
        # ``pd.Timestamp`` is the public name; ``pd.tslib`` has been removed.
        if isinstance(stats["max"], pd.Timestamp):
            stats = stats.astype(object)
            stats["max"] = str(stats["max"].to_pydatetime())
            stats["min"] = str(stats["min"].to_pydatetime())
        # The range is only recorded when the bounds are not Timestamps.
        else:
            stats["range"] = stats["max"] - stats["min"]
        stats["type"] = "DATE"
        return stats
コード例 #4
0
    def describe_date_1d(df, column):
        """Return min/max statistics for a date-like column as a pandas Series.

        Args:
            df: Spark DataFrame that contains ``column``.
            column: Name of the column to describe.

        Returns:
            A pandas Series named ``column``. It always carries ``min``,
            ``max`` and ``type`` ("DATE"). If the bounds are ``pd.Timestamp``
            values they are stored as strings; if not, ``range`` is added as
            ``max - min``.
        """
        stats_df = df.select(column).na.drop().agg(
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max")).toPandas()
        # Use the positional indexer: the removed ``.ix`` accessor raises
        # AttributeError on current pandas.
        stats = stats_df.iloc[0].copy()
        stats.name = column

        # Convert pandas Timestamp objects to regular datetimes, stored as text.
        # Checked against the public ``pd.Timestamp`` class because the
        # ``pd.tslib`` module is gone from current pandas.
        if isinstance(stats["max"], pd.Timestamp):
            stats = stats.astype(object)
            stats["max"] = str(stats["max"].to_pydatetime())
            stats["min"] = str(stats["min"].to_pydatetime())
        # The range is only recorded when the bounds are not Timestamps.
        else:
            stats["range"] = stats["max"] - stats["min"]
        stats["type"] = "DATE"
        return stats
コード例 #5
0
ファイル: base.py プロジェクト: yamnihcg/spark-df-profiling
    def describe_float_1d(df, column, current_result, nrows):
        """Compute descriptive statistics and histograms for a numeric column.

        Args:
            df: Spark DataFrame that contains ``column``.
            column: Name of the column to describe.
            current_result: Mapping of previously computed statistics; only
                ``current_result["count"]`` is read. It is the divisor of the
                mean absolute deviation and is forwarded to the custom moment
                helpers on the fallback path (presumably the non-null row
                count -- confirm against the caller).
            nrows: Row count used as the denominator of ``p_zeros``.

        Returns:
            A pandas Series named ``column`` holding mean, min, max, sum,
            variance, std, skewness, kurtosis, ``n_zeros``, one approximate
            percentile per quantile, the derived ``range``, ``iqr``, ``cv``,
            ``mad``, ``type`` and ``p_zeros`` entries, plus ``histogram``
            (a base64 PNG data URI) and ``mini_histogram``.

        Note:
            ``spark_version`` and ``bins`` are read from the enclosing scope.
        """
        # Build the null-free projection once; every aggregate below reuses it.
        non_null = df.select(column).na.drop()

        # count() tallies every non-null value of its argument, and the
        # comparison is non-null for every non-null input, so counting the
        # comparison would return the number of rows rather than the number
        # of zeros. Summing the 0/1 cast of the comparison counts the zeros.
        if spark_version == "1.6+":
            stats_df = non_null.agg(mean(col(column)).alias("mean"),
                                    df_min(col(column)).alias("min"),
                                    df_max(col(column)).alias("max"),
                                    variance(col(column)).alias("variance"),
                                    kurtosis(col(column)).alias("kurtosis"),
                                    stddev(col(column)).alias("std"),
                                    skewness(col(column)).alias("skewness"),
                                    df_sum(col(column)).alias("sum"),
                                    df_sum((col(column) == 0.0).cast("int")).alias('n_zeros')
                                    ).toPandas()
        else:
            # Fallback path: the higher moments come from the *_custom
            # helpers, which take the mean and the count as inputs.
            stats_df = non_null.agg(mean(col(column)).alias("mean"),
                                    df_min(col(column)).alias("min"),
                                    df_max(col(column)).alias("max"),
                                    df_sum(col(column)).alias("sum"),
                                    df_sum((col(column) == 0.0).cast("int")).alias('n_zeros')
                                    ).toPandas()
            mean_value = stats_df["mean"].iloc[0]
            stats_df["variance"] = non_null.agg(
                variance_custom(col(column), mean_value, current_result["count"])
            ).toPandas().iloc[0, 0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = non_null.agg(
                skewness_custom(col(column), mean_value, current_result["count"])
            ).toPandas().iloc[0, 0]
            stats_df["kurtosis"] = non_null.agg(
                kurtosis_custom(col(column), mean_value, current_result["count"])
            ).toPandas().iloc[0, 0]

        # One approximate-percentile query per quantile, stored under the
        # label that pretty_name() returns for it.
        for x in [0.05, 0.25, 0.5, 0.75, 0.95]:
            stats_df[pretty_name(x)] = (non_null
                                        .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(col=column, n=x)).toPandas().iloc[:, 0]
                                        )

        # Work on a copy of the single result row from here on.
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        # Mean absolute deviation: sum of |value - mean| over the given count.
        stats["mad"] = (non_null
                        .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().iloc[0, 0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        try:
            plt.bar(hist_data["left_edge"],
                    hist_data["count"],
                    width=hist_data["width"],
                    facecolor='#337ab7')
            plot.set_ylabel("Frequency")
            plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
            plot.figure.savefig(imgdata)
        finally:
            # Release the figure even if plotting or saving fails.
            plt.close(plot.figure)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        # TODO Think about writing this to disk instead of caching them in strings

        stats['mini_histogram'] = mini_histogram(hist_data)

        return stats
コード例 #6
0
    def describe_float_1d(df, column, current_result, nrows):
        """Compute descriptive statistics and histograms for a numeric column.

        Args:
            df: Spark DataFrame that contains ``column``.
            column: Name of the column to describe.
            current_result: Mapping of previously computed statistics; only
                ``current_result["count"]`` is read. It is the divisor of the
                mean absolute deviation and is forwarded to the custom moment
                helpers on the fallback path (presumably the non-null row
                count -- confirm against the caller).
            nrows: Row count used as the denominator of ``p_zeros``.

        Returns:
            A pandas Series named ``column`` holding mean, min, max, sum,
            variance, std, skewness, kurtosis, one approximate percentile per
            quantile, the derived ``range``, ``iqr``, ``cv``, ``mad``,
            ``type``, ``n_zeros`` and ``p_zeros`` entries, plus ``histogram``
            (a base64 PNG data URI) and ``mini_histogram``.

        Note:
            ``spark_version`` and ``bins`` are read from the enclosing scope.
            All pandas lookups use ``.iloc`` because the ``.ix`` indexer has
            been removed from pandas.
        """
        # Build the null-free projection once; every aggregate below reuses it.
        non_null = df.select(column).na.drop()

        if spark_version == "1.6+":
            stats_df = non_null.agg(mean(col(column)).alias("mean"),
                                    df_min(col(column)).alias("min"),
                                    df_max(col(column)).alias("max"),
                                    variance(col(column)).alias("variance"),
                                    kurtosis(col(column)).alias("kurtosis"),
                                    stddev(col(column)).alias("std"),
                                    skewness(col(column)).alias("skewness"),
                                    df_sum(col(column)).alias("sum")
                                    ).toPandas()
        else:
            # Fallback path: the higher moments come from the *_custom
            # helpers, which take the mean and the count as inputs.
            stats_df = non_null.agg(mean(col(column)).alias("mean"),
                                    df_min(col(column)).alias("min"),
                                    df_max(col(column)).alias("max"),
                                    df_sum(col(column)).alias("sum")
                                    ).toPandas()
            mean_value = stats_df["mean"].iloc[0]
            stats_df["variance"] = non_null.agg(
                variance_custom(col(column), mean_value, current_result["count"])
            ).toPandas().iloc[0, 0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = non_null.agg(
                skewness_custom(col(column), mean_value, current_result["count"])
            ).toPandas().iloc[0, 0]
            stats_df["kurtosis"] = non_null.agg(
                kurtosis_custom(col(column), mean_value, current_result["count"])
            ).toPandas().iloc[0, 0]

        # One approximate-percentile query per quantile, stored under the
        # label that pretty_name() returns for it.
        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (non_null
                                        .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(col=column, n=x)).toPandas().iloc[:, 0]
                                        )

        # Work on a copy of the single result row from here on.
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        # Mean absolute deviation: sum of |value - mean| over the given count.
        stats["mad"] = (non_null
                        .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().iloc[0, 0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        try:
            plt.bar(hist_data["left_edge"],
                    hist_data["count"],
                    width=hist_data["width"],
                    facecolor='#337ab7')
            plot.set_ylabel("Frequency")
            plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
            plot.figure.savefig(imgdata)
        finally:
            # Release the figure even if plotting or saving fails.
            plt.close(plot.figure)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        # TODO Think about writing this to disk instead of caching them in strings

        stats['mini_histogram'] = mini_histogram(hist_data)

        return stats