def filter_snow_monthly(weather_list):
    if os.path.isdir("{}/monthly".format(weather_list["weather_dir"])):
        weather = spark.read.csv(
            "{}/monthly".format(weather_list["weather_dir"]),
            monthly_weather_schema)
        filtered_weather = weather.filter(weather['Date/Time'].like("%-%"))\
            .filter(weather['Year'] >= 2010)\
            .select('Date/Time','Year','Month', "Total Snow (cm)", "Snow Grnd Last Day (cm)")

        # adapted from: https://stackoverflow.com/questions/48229043/python-pyspark-count-null-empty-and-nan?rq=1
        snow_null_count = filtered_weather.filter((filtered_weather["Total Snow (cm)"] == "") |
                                                  filtered_weather["Total Snow (cm)"].isNull() |
                                                  functions.isnan(filtered_weather["Total Snow (cm)"]))\
                                         .count()

        groundsnow_null_count = filtered_weather.filter((filtered_weather["Snow Grnd Last Day (cm)"] == "") |
                                                  filtered_weather["Snow Grnd Last Day (cm)"].isNull() |
                                                  functions.isnan(filtered_weather["Snow Grnd Last Day (cm)"]))\
                                         .count()

        if snow_null_count <= groundsnow_null_count:
            null_count = snow_null_count
        else:
            null_count = groundsnow_null_count

        if null_count == 0 or (
            (null_count / filtered_weather.count()) <= 1 / 2):
            return 0

    return 1
    def test_result(self):
        result = nanProcess(self.dataDF, "a", "Mean_Completer")
        num_null_nan = result.filter(
            result["a"].isNull()).count() + result.filter(isnan("a")).count()
        self.assertEqual(num_null_nan, 0)

        result1 = nanProcess(self.dataDF, "a", "Min_Completer")
        num_null_nan1 = result1.filter(
            result1["a"].isNull()).count() + result1.filter(
                isnan("a")).count()
        self.assertEqual(num_null_nan1, 0)

        result2 = nanProcess(self.dataDF, "a", "Max_Completer")
        num_null_nan2 = result2.filter(
            result2["a"].isNull()).count() + result2.filter(
                isnan("a")).count()
        self.assertEqual(num_null_nan2, 0)

        result3 = nanProcess(self.dataDF, "a", "Mode_Completer")
        num_null_nan3 = result3.filter(
            result3["a"].isNull()).count() + result3.filter(
                isnan("a")).count()
        self.assertEqual(num_null_nan3, 0)

        result4 = nanProcess(self.dataDF, "a", "Filling_Manually", 2.0)
        num_null_nan4 = result4.filter(
            result4["a"].isNull()).count() + result4.filter(
                isnan("a")).count()
        self.assertEqual(num_null_nan4, 0)
Example No. 3
    def mark_missing(cls, df_spark, missing_dict):
        """
        Mark each row that has missing values in the specified columns.

        @input
        df_spark -- dictionary of Spark dataframes, keyed by filename
        missing_dict -- dictionary mapping table name to the columns to check
        @output
        Dictionary of Spark dataframes with an added flag_missing column
        """
        for filename in df_spark:
            df = df_spark[filename]
            if filename in missing_dict:
                tmp = df.select([
                    (SQL.when(SQL.isnan(c) | SQL.col(c).isNull(),
                              1).otherwise(0)).alias(c)
                    for c in missing_dict[filename]
                ])
            else:
                tmp = df.select([
                    (SQL.when(SQL.isnan(c) | SQL.col(c).isNull(),
                              1).otherwise(0)).alias(c) for c in df.columns
                ])
            tmp = tmp.withColumn('total', sum(
                tmp[col] for col in tmp.columns)).select(
                    SQL.when(SQL.col("total") > 0,
                             1).otherwise(0).alias("flag_missing"))
            df = cls.add_column_index(df)
            tmp = cls.add_column_index(tmp)
            df_spark[filename] = df.join(tmp,
                                         on="columnindex").drop("columnindex")
        return df_spark
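
The core of mark_missing is the per-column isnan/isNull indicator plus a row-level OR expressed as a sum compared against zero; the add_column_index helper it joins back with is not shown here. A minimal, self-contained sketch of just that flagging step (toy data and names are my own, not from the source project):

from pyspark.sql import SparkSession, functions as SQL

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 80.5), (2, float("nan")), (3, None)], ["id", "weight"])

check_cols = ["weight"]  # columns to inspect for missing values
flags = df.select([
    SQL.when(SQL.isnan(c) | SQL.col(c).isNull(), 1).otherwise(0).alias(c)
    for c in check_cols
])
flags = flags.withColumn("total", sum(flags[c] for c in flags.columns)) \
             .select(SQL.when(SQL.col("total") > 0, 1).otherwise(0).alias("flag_missing"))
flags.show()  # one 0/1 flag per input row; the full method joins these back via add_column_index
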
def filter_rain_daily(weather_list):
    if os.path.isdir("{}/daily".format(weather_list["weather_dir"])):
        weather = spark.read.csv(
            "{}/daily".format(weather_list["weather_dir"]),
            daily_weather_schema)
        filtered_weather = weather.filter(weather['Date/Time'].like("%-%-%"))\
            .filter(weather['Year'] >= 2010)\
            .select('Date/Time','Year','Month','Day', "Total Rain (mm)", "Total Precip (mm)")

        # adapted from: https://stackoverflow.com/questions/48229043/python-pyspark-count-null-empty-and-nan?rq=1
        precip_null_count = filtered_weather.filter((filtered_weather["Total Precip (mm)"] == "") |
                                                  filtered_weather["Total Precip (mm)"].isNull() |
                                                  functions.isnan(filtered_weather["Total Precip (mm)"]))\
                                         .count()

        rain_null_count = filtered_weather.filter((filtered_weather["Total Rain (mm)"] == "") |
                                                  filtered_weather["Total Rain (mm)"].isNull() |
                                                  functions.isnan(filtered_weather["Total Rain (mm)"]))\
                                         .count()

        if rain_null_count <= precip_null_count:
            null_count = rain_null_count
        else:
            null_count = precip_null_count

        if null_count == 0 or (
            (null_count / filtered_weather.count()) <= 1 / 2):
            return 0

    return 1
Example No. 5
def view_missing_values(df):
    """
    Identify and visualize missing values for a given dataframe (Spark or pandas).
    """
    # create a dataframe with missing values count per column
    if isinstance(df, pyspark.sql.DataFrame):
        nulls_df = df.select([
            count(when(isnan(c) | col(c).isNull(), c)).alias(c)
            for c in df.columns
        ]).toPandas()
        nulls_df = pd.melt(nulls_df, var_name='cols', value_name='values')
        nulls_df['% missing values'] = 100 * nulls_df['values'] / df.count()
    elif isinstance(df, pd.DataFrame):
        nulls_df = pd.DataFrame(data=df.isnull().sum(), columns=['values'])
        nulls_df = nulls_df.reset_index()
        nulls_df.columns = ['cols', 'values']
        nulls_df['% missing values'] = 100 * nulls_df['values'] / df.shape[0]
    else:
        raise TypeError('df must be a Spark or pandas dataframe')

    plt.rcdefaults()
    plt.figure(figsize=(10, 5))
    ax = sns.barplot(x="cols", y="% missing values", data=nulls_df)
    ax.set_ylim(0, 100)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.show()
    return nulls_df
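
A minimal usage sketch for the Spark branch of view_missing_values (toy data of my own; assumes pyspark, pandas, seaborn and matplotlib are imported as the function above expects):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(1.0, None), (float("nan"), 2.0), (3.0, None)], ["a", "b"])
summary = view_missing_values(toy)  # draws the bar plot and returns the per-column counts
print(summary)  # 'a' has 1 missing value (~33%), 'b' has 2 (~67%)
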
def clean_spark(df, dropna_mode, idx):
    """Clean a dataframe by removing columns with many missing values,
    rows that contain missing values only, and rows duplicated on the identifier.
    :param df: spark dataframe
    :param dropna_mode: 'how' argument passed to DataFrame.dropna (e.g. 'all')
    :param idx: list of column names that identify a row
    """

    print(f'df.shape before cleaning ({df.count()},{len(df.columns)})')

    # create a dataframe with missing values count per column
    dfnull = df.select([
        count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns
    ])

    # select cols with <50% missing values
    cols = [
        k for (k, v) in dfnull.collect()[0].asDict().items()
        if v / df.count() < 0.5
    ]
    df = df.select(cols)
    print('len(df.columns) after dropping columns with >50% nan',
          len(df.columns))

    # drop row with missing values ONLY
    df = df.dropna(how=dropna_mode)
    print('df.count after dropping empty rows', df.count())

    # drop duplicated rows
    df = df.dropDuplicates(subset=idx)
    print('df.count after dropping duplicated rows', df.count())

    return df
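
A small usage sketch for clean_spark (toy data of my own; assumes count, when, isnan and col are imported from pyspark.sql.functions as the function requires):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame(
    [(1, 10.0, None), (1, 10.0, None), (2, None, "x"), (3, 5.0, None)],
    ["id", "value", "mostly_empty"])

# 'mostly_empty' is 75% missing and gets dropped; the duplicated id=1 row is collapsed;
# dropna_mode='all' only drops rows that are null in every remaining column
cleaned = clean_spark(raw, dropna_mode="all", idx=["id"])
cleaned.show()
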
Example No. 7
def write_rowsandnulls(spark, data_path, data_dir, year, month, df, logger):
    """Write out the total number of rows plus counts of any nulls, nans, empty strings
        and unknown values to csv.
    """
    totRows = df.select(df.columns[0]).count()
    checknulls_df = (df.select([count(when(isnan(c), c)).alias(c) for c in df.columns])
                            .withColumn('checktype', lit('isnan'))
                            .withColumn('totalrows', lit(totRows))
                    .union(df
                            .select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
                            .withColumn('checktype', lit('isnull'))
                            .withColumn('totalrows', lit(totRows))
                            )
                    .union(df
                            .select([count(when(col(c) == '', c)).alias(c) for c in df.columns])
                            .withColumn('checktype', lit('emptystring'))
                            .withColumn('totalrows', lit(totRows))
                            )
                    .union(df
                            .select([count(when(col(c) == 'unknown', c)).alias(c) for c in df.columns])
                            .withColumn('checktype', lit('unknownstring'))
                            .withColumn('totalrows', lit(totRows))
                            )
                    .union(df
                            .select([count(when(col(c) == -1, c)).alias(c) for c in df.columns])
                            .withColumn('checktype', lit('nan_as_-1'))
                            .withColumn('totalrows', lit(totRows))
                            )
                    )
    checknulls_data = os.path.join(data_path, data_dir,
                                    "{:d}".format(year), "{:02d}".format(month),
                                    'checknulls')
    checknulls_df.coalesce(1).write.csv(checknulls_data, mode='overwrite', header=True)
    logger.info("Wrote data summary for {} - checknulls".format(data_dir))
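
A minimal usage sketch for write_rowsandnulls (the path, logger name and toy dataframe are placeholders of my own, not from the source project):

import logging
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
logger = logging.getLogger("quality_checks")

toy = spark.createDataFrame(
    [(1.0, "unknown"), (float("nan"), ""), (None, "ok")], ["score", "label"])

write_rowsandnulls(spark, data_path="/tmp/quality", data_dir="events",
                   year=2021, month=3, df=toy, logger=logger)
# -> writes /tmp/quality/events/2021/03/checknulls/ with one summary row per check type
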
Example No. 8
def all_columns_null(df):
    for c in df.columns:
        if c == "timestamp":
            continue
        if df.filter(F.col(c).isNull() | F.isnan(c)).count() != df.count():
            return False
    return True
Example No. 9
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
            if index_ops.hasnans:
                raise ValueError(
                    "Cannot convert %s with missing values to integer" % self.pretty_name
                )
        elif is_bool_dtype(dtype) and not isinstance(dtype, extension_dtypes):
            if index_ops.hasnans:
                raise ValueError("Cannot convert %s with missing values to bool" % self.pretty_name)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(
                    index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                    SF.lit(True),
                ).otherwise(index_ops.spark.column.cast(spark_type))
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
    def test_one_iteration_v2(self):
        actual_new_label = [[1.0, 0.0], [0.0, 1.0], [0.73480, 0.26520],
                            [0.25392, 0.74608]]

        new_test_df = self.test_df.withColumn(
            colName='label',
            col=F.when(F.isnan(F.col('label')),
                       None).otherwise(F.col('label')))
        computed_labels = depLabelPropagation.label_propagation(
            self.sc,
            new_test_df,
            'label',
            'id', ['a', 'b', 'c'],
            k=2,
            sigma=0.5,
            max_iters=1,
            standardize=False)
        pandas_comp_labels = computed_labels.toPandas()
        print(pandas_comp_labels)

        for idx, vec in enumerate(actual_new_label):
            computed_value = list(pandas_comp_labels['initial_label'][idx])
            for jdx, val in enumerate(vec):
                self.assertAlmostEqual(val, computed_value[jdx], 4)

        print(computed_labels.toPandas())
Example No. 11
def profile_dataframe(df):

    columns = df.columns

    # get general statistics provided by spark:
    # describe() returns 5 rows (count, mean, stddev, min, max);
    # each row starts with the summary label followed by one value per column,
    # so values are offset by one relative to the column index
    # (assumes every column is numeric or string, so describe() covers them all)
    stats = df.describe().collect()

    ## get combined nan/null counts (one row, one value per column)
    nan_null_columns = [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in columns]
    nan_null_counts = df.select(*nan_null_columns).collect()[0]

    ## get distinct value counts (one row, one value per column)
    distinct_columns = [countDistinct(col(c)).alias(c) for c in columns]
    distinct_counts = df.select(*distinct_columns).collect()[0]

    format_string = "%-30s %12s %12s %12s %12s %12s %12s %12s"

    print(format_string % ("column", "count", "mean", "stddev", "min", "max", "null/nan", "distinct"))

    for i in range(len(columns)):
        print(format_string %
              (columns[i][:30],
               str(stats[0][i + 1])[:13],
               str(stats[1][i + 1])[:13],
               str(stats[2][i + 1])[:13],
               str(stats[3][i + 1])[:13],
               str(stats[4][i + 1])[:13],
               str(nan_null_counts[i]),
               str(distinct_counts[i])))
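
A short usage sketch for profile_dataframe (toy data of my own; assumes count, when, isnan, col and countDistinct are imported from pyspark.sql.functions):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(1.0, "a"), (float("nan"), "b"), (None, "b")], ["x", "category"])
profile_dataframe(toy)  # prints one formatted summary line per column
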
Example No. 12
    def shift(self, periods=1, fill_value=None):
        """
        Shift Series/Index by desired number of periods.

        .. note:: the current implementation of shift uses Spark's Window without
            specifying partition specification. This leads to move all data into
            single partition in single machine and could cause serious
            performance degradation. Avoid this method against very large dataset.

        Parameters
        ----------
        periods : int
            Number of periods to shift. Can be positive or negative.
        fill_value : object, optional
            The scalar value to use for newly introduced missing values.
            The default depends on the dtype of self. For numeric data, np.nan is used.

        Returns
        -------
        Copy of input Series/Index, shifted.

        Examples
        --------
        >>> df = ks.DataFrame({'Col1': [10, 20, 15, 30, 45],
        ...                    'Col2': [13, 23, 18, 33, 48],
        ...                    'Col3': [17, 27, 22, 37, 52]},
        ...                   columns=['Col1', 'Col2', 'Col3'])

        >>> df.Col1.shift(periods=3)
        0     NaN
        1     NaN
        2     NaN
        3    10.0
        4    20.0
        Name: Col1, dtype: float64

        >>> df.Col2.shift(periods=3, fill_value=0)
        0     0
        1     0
        2     0
        3    13
        4    23
        Name: Col2, dtype: int64

        """
        if len(self._internal.index_columns) == 0:
            raise ValueError("Index must be set.")

        if not isinstance(periods, int):
            raise ValueError('periods should be an int; however, got [%s]' % type(periods))

        col = self._scol
        index_columns = self._kdf._internal.index_columns
        window = Window.orderBy(index_columns).rowsBetween(-periods, -periods)
        shifted_col = F.lag(col, periods).over(window)
        col = F.when(
            shifted_col.isNull() | F.isnan(shifted_col), fill_value
        ).otherwise(shifted_col)

        return self._with_new_scol(col).alias(self.name)
    def test_create_nan_labels(self):
        fraction = 0.1
        input_data_frame = self.data_frame.filter(F.col('label').isin([0, 1]))
        output_data_frame = depSemisupervisedMnist.create_nan_labels(
            self.sc,
            dataframe=input_data_frame,
            label_col='label',
            fraction=fraction)

        # TEST 1: Does it contain missing_*label_name*?
        self.assertIn(member='missing_label',
                      container=output_data_frame.columns)

        # TEST 2: Does the missing_factor correspond to the actual amount of missings?

        computed_fractions = (
            output_data_frame.filter(~F.isnan('missing_label')).groupBy(
                'missing_label').count().rdd.collectAsMap())

        desired_frac = input_data_frame.groupBy('label').count().collect()
        desired_fractions = dict(
            map(lambda x: (x['label'], fraction * x['count']), desired_frac))

        for key, val in computed_fractions.items():
            self.assertAlmostEqual(val,
                                   desired_fractions[key],
                                   delta=input_data_frame.count() *
                                   0.01)  # 1 percent deviation
Example No. 14
def main(spark):
    events = spark.read.json(READ_PATH)

    events.cache()

    null_keys = events.select([
        f.count(f.when(f.isnan(c), c)).alias(c) for c in events.columns
    ]).collect()

    total_count = events.count()

    count_multiple_group_keys = (events.groupBy("anonymous_id").agg(
        f.countDistinct("browser_family").alias("browser_family_uniques"),
        f.countDistinct("device_family").alias("device_family_uniques"),
        f.countDistinct("os_family").alias("os_family_uniques")).filter(
            (f.col("browser_family_uniques") > 1)
            | (f.col("device_family_uniques") > 1)
            | (f.col("os_family_uniques") > 1)).count())
    print("Quantidade de linhas com valores nulos por coluna:")
    print(null_keys)

    print("Quantidade total de eventos:")
    print(total_count)

    print(
        "Quantidade de anonymous_id's com mais de um valor possível de browser_family, device_family ou os_family:"
    )
    print(count_multiple_group_keys)
Example No. 15
def spark_count_nulls(spark, schema_name, table_name, query_args=''):
    select_query = f"""SELECT * FROM {schema_name}.{table_name} """

    if len(query_args) > 0:
        select_query = select_query + f""" WHERE {query_args}"""

    spark_df = spark.sql(select_query)

    pd_df_nulls = spark_df.select([
        count(when(isnan(c) | col(c).isNull(), c)).alias(c)
        for c in spark_df.columns
    ]).toPandas().T

    pd_df_nulls.reset_index(inplace=True)

    pd_df_nulls.rename(columns={
        'index': 'COLUMN_NAME',
        0: 'NULL_COUNT'
    },
                       inplace=True)

    df_null_columns = pd_df_nulls[pd_df_nulls.NULL_COUNT != 0]

    if len(df_null_columns) > 0:
        raise Exception(
            f'NULLs exist in {schema_name}.{table_name}\n{df_null_columns.to_string()}'
        )
    else:
        logging.info(pd_df_nulls.to_string())
def visualize_missing_values_spark(df):
    """Visualize missing values in a spark dataframe
    
    :param df: spark dataframe
    """
    # create a dataframe with missing values count per column
    nan_count_df = df.select([
        count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns
    ]).toPandas()

    # convert dataframe from wide format to long format
    nan_count_df = pd.melt(nan_count_df, var_name='cols', value_name='values')

    # count total records in df
    total = df.count()

    # now lets add % missing values column
    nan_count_df['% missing values'] = 100 * nan_count_df['values'] / total

    plt.rcdefaults()
    plt.figure(figsize=(10, 5))
    ax = sns.barplot(x="cols", y="% missing values", data=nan_count_df)
    ax.set_ylim(0, 100)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.show()
Example No. 17
    def isnull(self):
        """
        Detect existing (non-missing) values.

        Return a boolean same-sized object indicating if the values are NA.
        NA values, such as None or numpy.NaN, gets mapped to True values.
        Everything else gets mapped to False values. Characters such as empty strings '' or
        numpy.inf are not considered NA values
        (unless you set pandas.options.mode.use_inf_as_na = True).

        Returns
        -------
        Series : Mask of bool values for each element in Series
            that indicates whether an element is not an NA value.

        Examples
        --------
        >>> ser = ks.Series([5, 6, np.NaN])
        >>> ser.isna()  # doctest: +NORMALIZE_WHITESPACE
        0    False
        1    False
        2     True
        Name: 0, dtype: bool

        >>> ser.rename("a").to_frame().set_index("a").index.isna()
        Index([False, False, True], dtype='object', name='a')
        """
        from databricks.koalas.indexes import MultiIndex
        if isinstance(self, MultiIndex):
            raise NotImplementedError("isna is not defined for MultiIndex")
        if isinstance(self.spark_type, (FloatType, DoubleType)):
            return self._with_new_scol(self._scol.isNull() | F.isnan(self._scol)).rename(self.name)
        else:
            return self._with_new_scol(self._scol.isNull()).rename(self.name)
Example No. 18
    def isnull(self):
        """
        Detect existing (non-missing) values.

        Return a boolean same-sized object indicating if the values are NA.
        NA values, such as None or numpy.NaN, gets mapped to True values.
        Everything else gets mapped to False values. Characters such as empty strings '' or
        numpy.inf are not considered NA values
        (unless you set pandas.options.mode.use_inf_as_na = True).

        Returns
        -------
        Series : Mask of bool values for each element in Series
            that indicates whether an element is not an NA value.

        Examples
        --------
        >>> ser = ks.Series([5, 6, np.NaN])
        >>> ser.isna()  # doctest: +NORMALIZE_WHITESPACE
        0    False
        1    False
        2     True
        Name: 0, dtype: bool
        """
        if isinstance(self.spark_type, (FloatType, DoubleType)):
            return self._with_new_scol(self._scol.isNull() | F.isnan(self._scol)).alias(self.name)
        else:
            return self._with_new_scol(self._scol.isNull()).alias(self.name)
Example No. 19
    def astype(self, index_ops: T_IndexOps, dtype: Union[str, type,
                                                         Dtype]) -> T_IndexOps:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                if isinstance(index_ops.spark.data_type,
                              (FloatType, DoubleType)):
                    scol = F.when(
                        index_ops.spark.column.isNull()
                        | F.isnan(index_ops.spark.column),
                        F.lit(True),
                    ).otherwise(index_ops.spark.column.cast(spark_type))
                else:  # DecimalType
                    scol = F.when(index_ops.spark.column.isNull(),
                                  F.lit(False)).otherwise(
                                      index_ops.spark.column.cast(spark_type))
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=InternalField(dtype=dtype),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example No. 20
    def count_na(columns):
        """
        Return the NaN and null count for each column
        :param columns: '*', a list of column names or a single column name.
        :return:
        """

        columns = parse_columns(self, columns)
        df = self
        expr = []

        for col_name in columns:
            # If type column is Struct parse to String. isnan/isNull can not handle Structure/Boolean
            if is_(df.cols.schema_dtype(col_name), (StructType, BooleanType)):
                df = df.cols.cast(col_name, "string")

            if is_(df.cols.schema_dtype(col_name), (float, int)):
                expr.append(F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name)).alias(col_name))

            elif is_(df.cols.schema_dtype(col_name), (NullType)):
                expr.append(F.count(col_name).alias(col_name))

            else:
                expr.append(F.count(F.when(F.col(col_name).isNull(), col_name)).alias(col_name))

        result = format_dict(df.select(*expr).to_json())
        return result
def count_not_null(c, nan_as_null=True):
    """Count non-null (and, optionally, non-NaN) values in column c,
    using the boolean-to-integer conversion:
    - False -> 0
    - True  -> 1
    """
    pred = col(c).isNotNull() & (~isnan(c) if nan_as_null else lit(True))
    return sum(pred.cast("integer")).alias(c)
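
A short sketch showing how count_not_null plugs into an aggregation (toy data of my own; assumes col, isnan, lit and sum are imported from pyspark.sql.functions, which the bare sum call above relies on):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(1.0, 4.0), (float("nan"), 5.0), (None, None)], ["a", "b"])

toy.agg(*[count_not_null(c) for c in toy.columns]).show()
# with nan_as_null=True: a -> 1 (NaN and null excluded), b -> 2
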
Example No. 22
    def transform(self, X):
        '''
        Transforms a given Spark dataframe containing playtime into a z-score based on a previously defined fit.

        Parameters: self (which contains a table per the previous fit that is used to transform a dataframe X) and
                    X, a dataframe to be transformed by subtracting the average from self.table and dividing the
                    difference by the standard deviation from self.table.

        Output: a transformed dataframe with z-scores in the playtime_scaled column.

        '''

        X = X.alias('X')
        self.table = self.table.alias('self')
        X2 = (X.join(self.table,
                     on=X['appid'] == self.table['appid'],
                     how='left').select('X.*', 'self.avg', 'self.std_dev'))

        X2 = X2.withColumn('playtime_scaled',
                           (X2['playtime_forever'] - X2['avg']) /
                           X2['std_dev'])
        X2 = X2.drop('avg')
        X2 = X2.drop('std_dev')
        X2 = X2.filter(~func.isnan('playtime_scaled'))

        return X2
Example No. 23
def print_unique_and_missing(df, name_df='imigration'):
    len_df = df.count()
    cols_names = df.columns

    nuniques = df.agg(
        *(countDistinct(col(c)).alias(c)
          for c in df.columns)).rdd.flatMap(lambda x: x).collect()

    isnan_or_isnull = df.select([
        count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns
    ]).rdd.flatMap(lambda x: x).collect()

    for col_, uniq, nan in zip(cols_names, nuniques, isnan_or_isnull):
        if name_df == 'imigration':
            print(
                f'Column {col_:<8} has {uniq:>7} unique values and {nan/len_df*100:<8.3}% NaN values'
            )
        if name_df == 'airport':
            print(
                f'Column {col_:<12} has {uniq:>7} unique values and {nan/len_df*100:<4.3}% NaN values'
            )
        if name_df == 'demographics':
            print(
                f'Column {col_:<22} has {uniq:>4} unique values and {nan/len_df*100:<5.3}% NaN values'
            )
        if name_df == 'temperature':
            print(
                f'Column {col_:<29} has {uniq:>6} unique values and {nan/len_df*100:<4.3}% NaN values'
            )
Example No. 24
    def count_na(columns):
        """
        Return the NaN and null count for each column
        :param columns: '*', a list of column names or a single column name.
        :return:
        """

        columns = parse_columns(self, columns)

        df = self
        expr = []
        for col_name in columns:
            # If type column is Struct parse to String. isnan/isNull can not handle Structure

            if is_(df.cols.schema_dtypes(col_name), (StructType, BooleanType)):
                df = df.cols.cast(col_name, "string")
            expr.append(
                F.count(
                    F.when(
                        F.isnan(col_name) | F.col(col_name).isNull(),
                        col_name)).alias(col_name))

        result = format_dict(collect_as_dict(df.select(*expr).collect()))

        return result
Example No. 25
    def isnull(self, index_ops: IndexOpsLike) -> IndexOpsLike:
        return index_ops._with_new_scol(
            index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
            field=index_ops._internal.data_fields[0].copy(
                dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False
            ),
        )
Example No. 26
def recommend_n_comics(top_n, new_comics_ids, account_id, als_model, comics_df,
                       spark_instance):
    """
    Given a list of new comics (to the user) and requested number N
    Return list of N comics, ordered descending by recommendation score
    """

    # Create spark Df of new rows
    comics_to_predict = (spark_instance.createDataFrame([
        (account_id, 1, comic_id) for comic_id in new_comics_ids
    ]).select(
        col('_1').alias('account_id'),
        col('_2').alias('bought'),
        col('_3').alias('comic_id')))

    # Get predictions
    test_preds = als_model.transform(comics_to_predict)
    test_preds.persist()

    # Alias
    cdf = comics_df.alias('cdf')
    tp = test_preds.alias('tp')

    # Query results
    results = (tp.join(
        cdf,
        tp.comic_id == cdf.comic_id).filter(~isnan(col('prediction'))).orderBy(
            'prediction',
            ascending=False).select('comic_title',
                                    'img_url').limit(top_n)).toPandas()

    return results
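
A minimal end-to-end sketch for recommend_n_comics (toy purchases and titles of my own, not from the source project; assumes col and isnan are imported from pyspark.sql.functions as the function above expects):

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS

spark = SparkSession.builder.getOrCreate()

purchases = spark.createDataFrame(
    [(1, 1, 10), (1, 1, 11), (2, 1, 10), (2, 1, 12)],
    ["account_id", "bought", "comic_id"])
comics_df = spark.createDataFrame(
    [(10, "Saga", "img/10.jpg"), (11, "Paper Girls", "img/11.jpg"),
     (12, "Monstress", "img/12.jpg")],
    ["comic_id", "comic_title", "img_url"])

als_model = ALS(userCol="account_id", itemCol="comic_id", ratingCol="bought",
                implicitPrefs=True, rank=2, seed=42).fit(purchases)

# comics 11 and 12 are new to account 2; ask for the single best recommendation
top = recommend_n_comics(top_n=1, new_comics_ids=[11, 12], account_id=2,
                         als_model=als_model, comics_df=comics_df,
                         spark_instance=spark)
print(top)
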
Example No. 27
    def hasnans(self):
        """
        Return True if it has any missing values. Otherwise, it returns False.

        >>> ks.DataFrame({}, index=list('abc')).index.hasnans
        False

        >>> ks.Series(['a', None]).hasnans
        True

        >>> ks.Series([1.0, 2.0, np.nan]).hasnans
        True

        >>> ks.Series([1, 2, 3]).hasnans
        False

        >>> (ks.Series([1.0, 2.0, np.nan]) + 1).hasnans
        True

        >>> ks.Series([1, 2, 3]).rename("a").to_frame().set_index("a").index.hasnans
        False
        """
        sdf = self._internal.spark_frame
        scol = self.spark.column

        if isinstance(self.spark.data_type, (DoubleType, FloatType)):
            return sdf.select(F.max(scol.isNull()
                                    | F.isnan(scol))).collect()[0][0]
        else:
            return sdf.select(F.max(scol.isNull())).collect()[0][0]
Example No. 28
    def isnull(self):
        if isinstance(self.schema[self.name].dataType,
                      (FloatType, DoubleType)):
            return Series(self._scol.isNull() | F.isnan(self._scol), self._kdf,
                          self._index_info)
        else:
            return Series(self._scol.isNull(), self._kdf, self._index_info)
Example No. 29
def count_not_null(c, nan_as_null=False):
    """Use conversion between boolean and integer
    - False -> 0
    - True ->  1
    """
    pred = col(c).isNotNull() & (~isnan(c) if nan_as_null else lit(True))
    return sum(pred.cast("integer")).alias(c)
    def execute(self, context):
        self.log.info('Getting the movie details')
        spark = SparkSession.builder.appName('moviedb-etl')\
                .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
                .getOrCreate()

        #path
        s3_path = "s3://{}".format(self.s3_bucket)
        s3_path = s3_path + '/' + self.s3_key
        #read the dataset
        df = spark.read.csv(s3_path, header=True)

        ## Prepare movie and director tables
        # extract columns to create movie table
        movie_fields = [
            "movie_title as title", "imdb_score as rating",
            "title_year as year", "duration", "director_name as director",
            "gross", "genres", "num_user_for_reviews as votes",
            "content_rating as content", "budget"
        ]
        movie_table = df.selectExpr(movie_fields).dropDuplicates()
        movie_table.show(5)

        # extract columns to create director table
        director_fields = [
            "director_name", "gross", "genres", "movie_title",
            "content_rating", "budget", "imdb_score as rating"
        ]
        director_table = df.selectExpr(director_fields).dropDuplicates()
        director_table.show(5)

        # null value check
        director_table.select([
            count(when(isnan(c), c)).alias(c) for c in director_table.columns
        ]).show()
        movie_table.select([
            count(when(isnan(c), c)).alias(c) for c in movie_table.columns
        ]).show()

        # write the generated dataframe back to s3
        s3_processed = "s3://{}".format(self.s3_bucket)
        s3_processed = s3_processed + '/' + 'processed'
        s3_movies = s3_processed + '/' + 'movies.csv'
        s3_direcor = s3_processed + '/' + 'director.csv'
        movie_table.write.csv(s3_movies, mode="overwrite")
        director_table.write.csv(s3_direcor, mode="overwrite")