def filter_snow_monthly(weather_list):
    if os.path.isdir("{}/monthly".format(weather_list["weather_dir"])):
        weather = spark.read.csv(
            "{}/monthly".format(weather_list["weather_dir"]),
            monthly_weather_schema)
        filtered_weather = weather.filter(weather['Date/Time'].like("%-%"))\
            .filter(weather['Year'] >= 2010)\
            .select('Date/Time', 'Year', 'Month',
                    "Total Snow (cm)", "Snow Grnd Last Day (cm)")
        # adapted from: https://stackoverflow.com/questions/48229043/python-pyspark-count-null-empty-and-nan?rq=1
        snow_null_count = filtered_weather.filter(
            (filtered_weather["Total Snow (cm)"] == "")
            | filtered_weather["Total Snow (cm)"].isNull()
            | functions.isnan(filtered_weather["Total Snow (cm)"]))\
            .count()
        groundsnow_null_count = filtered_weather.filter(
            (filtered_weather["Snow Grnd Last Day (cm)"] == "")
            | filtered_weather["Snow Grnd Last Day (cm)"].isNull()
            | functions.isnan(filtered_weather["Snow Grnd Last Day (cm)"]))\
            .count()
        # keep the column with fewer missing values
        if snow_null_count <= groundsnow_null_count:
            null_count = snow_null_count
        else:
            null_count = groundsnow_null_count
        # 0 = usable (at most half of the rows are missing), 1 = too sparse
        if null_count == 0 or ((null_count / filtered_weather.count()) <= 1 / 2):
            return 0
    return 1
def test_result(self):
    result = nanProcess(self.dataDF, "a", "Mean_Completer")
    num_null_nan = result.filter(
        result["a"].isNull()).count() + result.filter(isnan("a")).count()
    self.assertEqual(num_null_nan, 0)

    result1 = nanProcess(self.dataDF, "a", "Min_Completer")
    num_null_nan1 = result1.filter(
        result1["a"].isNull()).count() + result1.filter(
            isnan("a")).count()
    self.assertEqual(num_null_nan1, 0)

    result2 = nanProcess(self.dataDF, "a", "Max_Completer")
    num_null_nan2 = result2.filter(
        result2["a"].isNull()).count() + result2.filter(
            isnan("a")).count()
    self.assertEqual(num_null_nan2, 0)

    result3 = nanProcess(self.dataDF, "a", "Mode_Completer")
    num_null_nan3 = result3.filter(
        result3["a"].isNull()).count() + result3.filter(
            isnan("a")).count()
    self.assertEqual(num_null_nan3, 0)

    result4 = nanProcess(self.dataDF, "a", "Filling_Manually", 2.0)
    num_null_nan4 = result4.filter(
        result4["a"].isNull()).count() + result4.filter(
            isnan("a")).count()
    self.assertEqual(num_null_nan4, 0)
def mark_missing(cls, df_spark, missing_dict):
    """
    Mark each row that has missing values in the specified columns.
    @input
        df_spark -- dictionary of spark dataframes
        missing_dict -- dictionary of table -> columns pairs
    @output
        Dictionary of Spark Dataframes
    """
    for filename in df_spark:
        df = df_spark[filename]
        if filename in missing_dict:
            tmp = df.select([
                (SQL.when(SQL.isnan(c) | SQL.col(c).isNull(),
                          1).otherwise(0)).alias(c)
                for c in missing_dict[filename]
            ])
        else:
            tmp = df.select([
                (SQL.when(SQL.isnan(c) | SQL.col(c).isNull(),
                          1).otherwise(0)).alias(c)
                for c in df.columns
            ])
        tmp = tmp.withColumn('total', sum(
            tmp[col] for col in tmp.columns)).select(
                SQL.when(SQL.col("total") > 0,
                         1).otherwise(0).alias("flag_missing"))
        df = cls.add_column_index(df)
        tmp = cls.add_column_index(tmp)
        df_spark[filename] = df.join(tmp, on="columnindex").drop("columnindex")
    return df_spark
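# Hypothetical helper (assumption: add_column_index is used by mark_missing
# above but its implementation is not shown in this corpus). A common way to
# build such a row index is a row_number over a monotonically increasing id,
# so the flag frame can be joined back to the original row-for-row. Sketch
# only; the real classmethod on the owning class may differ.
def add_column_index(df):
    from pyspark.sql import Window
    from pyspark.sql import functions as SQL

    # Attach a dense 1..N index named 'columnindex' for a positional join.
    return df.withColumn(
        "columnindex",
        SQL.row_number().over(Window.orderBy(SQL.monotonically_increasing_id())),
    )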
def filter_rain_daily(weather_list):
    if os.path.isdir("{}/daily".format(weather_list["weather_dir"])):
        weather = spark.read.csv(
            "{}/daily".format(weather_list["weather_dir"]),
            daily_weather_schema)
        filtered_weather = weather.filter(weather['Date/Time'].like("%-%-%"))\
            .filter(weather['Year'] >= 2010)\
            .select('Date/Time', 'Year', 'Month', 'Day',
                    "Total Rain (mm)", "Total Precip (mm)")
        # adapted from: https://stackoverflow.com/questions/48229043/python-pyspark-count-null-empty-and-nan?rq=1
        precip_null_count = filtered_weather.filter(
            (filtered_weather["Total Precip (mm)"] == "")
            | filtered_weather["Total Precip (mm)"].isNull()
            | functions.isnan(filtered_weather["Total Precip (mm)"]))\
            .count()
        rain_null_count = filtered_weather.filter(
            (filtered_weather["Total Rain (mm)"] == "")
            | filtered_weather["Total Rain (mm)"].isNull()
            | functions.isnan(filtered_weather["Total Rain (mm)"]))\
            .count()
        # keep the column with fewer missing values
        if rain_null_count <= precip_null_count:
            null_count = rain_null_count
        else:
            null_count = precip_null_count
        # 0 = usable (at most half of the rows are missing), 1 = too sparse
        if null_count == 0 or ((null_count / filtered_weather.count()) <= 1 / 2):
            return 0
    return 1
def view_missing_values(df):
    """ Identify and visualize missing values for a given dataframe (Spark or pandas). """
    # create a dataframe with missing values count per column
    if type(df) == pyspark.sql.dataframe.DataFrame:
        nulls_df = df.select([
            count(when(isnan(c) | col(c).isNull(), c)).alias(c)
            for c in df.columns
        ]).toPandas()
        nulls_df = pd.melt(nulls_df, var_name='cols', value_name='values')
        nulls_df['% missing values'] = 100 * nulls_df['values'] / df.count()
    elif type(df) == pd.core.frame.DataFrame:
        nulls_df = pd.DataFrame(data=df.isnull().sum(), columns=['values'])
        nulls_df = nulls_df.reset_index()
        nulls_df.columns = ['cols', 'values']
        nulls_df['% missing values'] = 100 * nulls_df['values'] / df.shape[0]

    plt.rcdefaults()
    plt.figure(figsize=(10, 5))
    ax = sns.barplot(x="cols", y="% missing values", data=nulls_df)
    ax.set_ylim(0, 100)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.show()

    return nulls_df
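# Hedged usage sketch (illustrative, not part of the original source) for
# view_missing_values above. It assumes the helper's module-level imports are
# in place (pyspark, pandas as pd, matplotlib.pyplot as plt, seaborn as sns,
# and count/when/isnan/col from pyspark.sql.functions).
def demo_view_missing_values():
    import pandas as pd
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("missing-values-demo").getOrCreate()
    # Small frame with deliberate gaps in both numeric columns.
    pdf = pd.DataFrame({"a": [1.0, None, 3.0], "b": [None, 2.0, None]})

    # The helper accepts either the pandas frame or its Spark equivalent and
    # returns (and plots) the per-column missing-value summary.
    summary = view_missing_values(spark.createDataFrame(pdf))
    print(summary)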
def clean_spark(df, dropna_mode, idx):
    """clean a dataframe by removing cols with many missing values,
    rows which contain missing values only, and duplicated index rows

    :param df: spark dataframe
    :param dropna_mode: 'all' or 'any', passed to DataFrame.dropna
    :param idx: a list of string, identifier
    """
    print(f'df.shape before cleaning ({df.count()},{len(df.columns)})')

    # create a dataframe with missing values count per column
    dfnull = df.select([
        count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns
    ])

    # select cols with <50% missing values
    cols = [
        k for (k, v) in dfnull.collect()[0].asDict().items()
        if v / df.count() < 0.5
    ]
    df = df.select(cols)
    print('len(df.columns) after dropping columns with >50% nan',
          len(df.columns))

    # drop rows with missing values ONLY
    df = df.dropna(how=dropna_mode)
    print('df.count after dropping empty rows', df.count())

    # drop duplicated rows
    df = df.dropDuplicates(subset=idx)
    print('df.count after dropping duplicated rows', df.count())
    return df
def write_rowsandnulls(spark, data_path, data_dir, year, month, df, logger):
    """Write out the total number of rows plus counts of any nulls, nans,
    empty strings and unknown values to csv.
    """
    totRows = df.select(df.columns[0]).count()
    checknulls_df = (df.select([count(when(isnan(c), c)).alias(c)
                                for c in df.columns])
                     .withColumn('checktype', lit('isnan'))
                     .withColumn('totalrows', lit(totRows))
                     .union(df
                            .select([count(when(col(c).isNull(), c)).alias(c)
                                     for c in df.columns])
                            .withColumn('checktype', lit('isnull'))
                            .withColumn('totalrows', lit(totRows)))
                     .union(df
                            .select([count(when(col(c) == '', c)).alias(c)
                                     for c in df.columns])
                            .withColumn('checktype', lit('emptystring'))
                            .withColumn('totalrows', lit(totRows)))
                     .union(df
                            .select([count(when(col(c) == 'unknown', c)).alias(c)
                                     for c in df.columns])
                            .withColumn('checktype', lit('unknownstring'))
                            .withColumn('totalrows', lit(totRows)))
                     .union(df
                            .select([count(when(col(c) == -1, c)).alias(c)
                                     for c in df.columns])
                            .withColumn('checktype', lit('nan_as_-1'))
                            .withColumn('totalrows', lit(totRows))))
    checknulls_data = os.path.join(data_path, data_dir, "{:d}".format(year),
                                   "{:02d}".format(month), 'checknulls')
    checknulls_df.coalesce(1).write.csv(checknulls_data, mode='overwrite',
                                        header=True)
    logger.info("Wrote data summary for {} - checknulls".format(data_dir))
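# Hedged usage sketch (not part of the original source) for write_rowsandnulls
# above. The paths, dataset name and logger are illustrative; the helper itself
# expects count/when/isnan/col/lit from pyspark.sql.functions and `os` to be
# imported at module level.
def demo_write_rowsandnulls():
    import logging
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("checknulls-demo").getOrCreate()
    df = spark.createDataFrame(
        [(1.0, ""), (float("nan"), "unknown"), (None, "ok")],
        "value double, label string",
    )
    logger = logging.getLogger("checknulls-demo")
    # Writes /tmp/demo_dataset/2024/01/checknulls as a single csv file with one
    # row per check type (isnan, isnull, emptystring, unknownstring, nan_as_-1).
    write_rowsandnulls(spark, "/tmp", "demo_dataset", 2024, 1, df, logger)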
def all_columns_null(df):
    for c in df.columns:
        if c == "timestamp":
            continue
        if df.filter(F.col(c).isNull() | F.isnan(c)).count() != df.count():
            return False
    return True
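# Hedged sketch (added for illustration, not from the original source): a tiny
# driver for all_columns_null above. It assumes `F` is pyspark.sql.functions,
# as in the snippet itself.
def demo_all_columns_null():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("all-columns-null-demo").getOrCreate()
    # Every non-timestamp value is null, so the check should return True.
    df = spark.createDataFrame(
        [("2024-01-01", None), ("2024-01-02", None)],
        "timestamp string, value double",
    )
    return all_columns_null(df)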
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
        if index_ops.hasnans:
            raise ValueError(
                "Cannot convert %s with missing values to integer" % self.pretty_name
            )
    elif is_bool_dtype(dtype) and not isinstance(dtype, extension_dtypes):
        if index_ops.hasnans:
            raise ValueError("Cannot convert %s with missing values to bool" % self.pretty_name)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            scol = F.when(
                index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                SF.lit(True),
            ).otherwise(index_ops.spark.column.cast(spark_type))
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype, null_str=str(np.nan))
    else:
        return _as_other_type(index_ops, dtype, spark_type)
def test_one_iteration_v2(self):
    actual_new_label = [[1.0, 0.0], [0.0, 1.0], [0.73480, 0.26520],
                        [0.25392, 0.74608]]
    new_test_df = self.test_df.withColumn(
        colName='label',
        col=F.when(F.isnan(F.col('label')), None).otherwise(F.col('label')))
    computed_labels = depLabelPropagation.label_propagation(
        self.sc,
        new_test_df,
        'label',
        'id', ['a', 'b', 'c'],
        k=2,
        sigma=0.5,
        max_iters=1,
        standardize=False)
    pandas_comp_labels = computed_labels.toPandas()
    print(pandas_comp_labels)
    for idx, vec in enumerate(actual_new_label):
        computed_value = list(pandas_comp_labels['initial_label'][idx])
        for jdx, val in enumerate(vec):
            self.assertAlmostEqual(val, computed_value[jdx], 4)
    print(computed_labels.toPandas())
def profile_dataframe(df):
    columns = df.columns

    # get general statistics provided by spark
    # describe() returns 5 rows: count, mean, stddev, min, max;
    # each row starts with the 'summary' label, followed by one value per column
    stats = df.describe().collect()

    ## get either nan or null counts
    nan_null_columns = [
        count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in columns
    ]
    nan_null_counts = df.select(*nan_null_columns).collect()[0]

    ## get distinct value counts
    distinct_columns = [countDistinct(col(c)).alias(c) for c in columns]
    distinct_counts = df.select(*distinct_columns).collect()[0]

    format_string = "%-30s %12s %12s %12s %12s %12s %12s %12s"
    print(format_string % ("column", "count", "mean", "stddev", "min", "max",
                           "null", "distinct count"))
    for i in range(len(columns)):
        print(format_string % (columns[i][:30],
                               str(stats[0][i + 1])[:12],   # count
                               str(stats[1][i + 1])[:12],   # mean
                               str(stats[2][i + 1])[:12],   # stddev
                               str(stats[3][i + 1])[:12],   # min
                               str(stats[4][i + 1])[:12],   # max
                               str(nan_null_counts[i]),
                               str(distinct_counts[i])))
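# Hedged usage sketch (illustrative, not from the original source) for
# profile_dataframe above. It assumes count, when, isnan, col and countDistinct
# are imported from pyspark.sql.functions at module level, as the helper does.
def demo_profile_dataframe():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("profile-demo").getOrCreate()
    df = spark.createDataFrame(
        [(1.0, "a"), (float("nan"), "b"), (None, "a")],
        "x double, y string",
    )
    # Prints one formatted row per column: count, mean, stddev, min, max,
    # null/NaN count and distinct count.
    profile_dataframe(df)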
def shift(self, periods=1, fill_value=None):
    """
    Shift Series/Index by desired number of periods.

    .. note:: the current implementation of shift uses Spark's Window without
        specifying partition specification. This leads to move all data into
        single partition in single machine and could cause serious
        performance degradation. Avoid this method against very large dataset.

    Parameters
    ----------
    periods : int
        Number of periods to shift. Can be positive or negative.
    fill_value : object, optional
        The scalar value to use for newly introduced missing values.
        The default depends on the dtype of self. For numeric data, np.nan is used.

    Returns
    -------
    Copy of input Series/Index, shifted.

    Examples
    --------
    >>> df = ks.DataFrame({'Col1': [10, 20, 15, 30, 45],
    ...                    'Col2': [13, 23, 18, 33, 48],
    ...                    'Col3': [17, 27, 22, 37, 52]},
    ...                   columns=['Col1', 'Col2', 'Col3'])

    >>> df.Col1.shift(periods=3)
    0     NaN
    1     NaN
    2     NaN
    3    10.0
    4    20.0
    Name: Col1, dtype: float64

    >>> df.Col2.shift(periods=3, fill_value=0)
    0     0
    1     0
    2     0
    3    13
    4    23
    Name: Col2, dtype: int64
    """
    if len(self._internal.index_columns) == 0:
        raise ValueError("Index must be set.")
    if not isinstance(periods, int):
        raise ValueError('periods should be an int; however, got [%s]' % type(periods))

    col = self._scol
    index_columns = self._kdf._internal.index_columns
    window = Window.orderBy(index_columns).rowsBetween(-periods, -periods)
    shifted_col = F.lag(col, periods).over(window)
    col = F.when(
        shifted_col.isNull() | F.isnan(shifted_col), fill_value
    ).otherwise(shifted_col)

    return self._with_new_scol(col).alias(self.name)
def test_create_nan_labels(self):
    fraction = 0.1
    input_data_frame = self.data_frame.filter(F.col('label').isin([0, 1]))
    output_data_frame = depSemisupervisedMnist.create_nan_labels(
        self.sc,
        dataframe=input_data_frame,
        label_col='label',
        fraction=fraction)

    # TEST 1: Does it contain missing_*label_name*?
    self.assertIn(member='missing_label',
                  container=output_data_frame.columns)

    # TEST 2: Does the missing_factor correspond to the actual amount of missings?
    computed_fractions = (
        output_data_frame.filter(~F.isnan('missing_label')).groupBy(
            'missing_label').count().rdd.collectAsMap())
    desired_frac = input_data_frame.groupBy('label').count().collect()
    desired_fractions = dict(
        map(lambda x: (x['label'], fraction * x['count']), desired_frac))
    for key, val in computed_fractions.items():
        self.assertAlmostEqual(
            val,
            desired_fractions[key],
            delta=input_data_frame.count() * 0.01)  # 1 percent deviation
def main(spark):
    events = spark.read.json(READ_PATH)
    events.cache()

    null_keys = events.select([
        f.count(f.when(f.isnan(c), c)).alias(c) for c in events.columns
    ]).collect()

    total_count = events.count()

    count_multiple_group_keys = (events.groupBy("anonymous_id").agg(
        f.countDistinct("browser_family").alias("browser_family_uniques"),
        f.countDistinct("device_family").alias("device_family_uniques"),
        f.countDistinct("os_family").alias("os_family_uniques")).filter(
            (f.col("browser_family_uniques") > 1)
            | (f.col("device_family_uniques") > 1)
            | (f.col("os_family_uniques") > 1)).count())

    print("Number of rows with null values per column:")
    print(null_keys)
    print("Total number of events:")
    print(total_count)
    print(
        "Number of anonymous_id's with more than one possible value of "
        "browser_family, device_family or os_family:"
    )
    print(count_multiple_group_keys)
def spark_count_nulls(spark, schema_name, table_name, query_args=''):
    select_query = f"""SELECT * FROM {schema_name}.{table_name} """
    if len(query_args) > 0:
        select_query = select_query + f""" WHERE {query_args}"""
    spark_df = spark.sql(select_query)

    pd_df_nulls = spark_df.select([
        count(when(isnan(c) | col(c).isNull(), c)).alias(c)
        for c in spark_df.columns
    ]).toPandas().T
    pd_df_nulls.reset_index(inplace=True)
    pd_df_nulls.rename(columns={
        'index': 'COLUMN_NAME',
        0: 'NULL_COUNT'
    }, inplace=True)

    df_null_columns = pd_df_nulls[pd_df_nulls.NULL_COUNT != 0]
    if len(df_null_columns) > 0:
        raise Exception(
            f'{len(df_null_columns)} columns with NULLs exist in '
            f'{schema_name}.{table_name}\n{df_null_columns.to_string()}'
        )
    else:
        logging.info(pd_df_nulls.to_string())
def visualize_missing_values_spark(df):
    """Visualize missing values in a spark dataframe

    :param df: spark dataframe
    """
    # create a dataframe with missing values count per column
    nan_count_df = df.select([
        count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns
    ]).toPandas()

    # convert dataframe from wide format to long format
    nan_count_df = pd.melt(nan_count_df, var_name='cols', value_name='values')

    # count total records in df
    total = df.count()

    # now lets add % missing values column
    nan_count_df['% missing values'] = 100 * nan_count_df['values'] / total

    plt.rcdefaults()
    plt.figure(figsize=(10, 5))
    ax = sns.barplot(x="cols", y="% missing values", data=nan_count_df)
    ax.set_ylim(0, 100)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.show()
def isnull(self):
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as None or numpy.NaN, get mapped to True values.
    Everything else gets mapped to False values. Characters such as empty
    strings '' or numpy.inf are not considered NA values
    (unless you set pandas.options.mode.use_inf_as_na = True).

    Returns
    -------
    Series : Mask of bool values for each element in Series
        that indicates whether an element is an NA value.

    Examples
    --------
    >>> ser = ks.Series([5, 6, np.NaN])
    >>> ser.isna()  # doctest: +NORMALIZE_WHITESPACE
    0    False
    1    False
    2     True
    Name: 0, dtype: bool

    >>> ser.rename("a").to_frame().set_index("a").index.isna()
    Index([False, False, True], dtype='object', name='a')
    """
    from databricks.koalas.indexes import MultiIndex
    if isinstance(self, MultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")
    if isinstance(self.spark_type, (FloatType, DoubleType)):
        return self._with_new_scol(self._scol.isNull() | F.isnan(self._scol)).rename(self.name)
    else:
        return self._with_new_scol(self._scol.isNull()).rename(self.name)
def isnull(self):
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as None or numpy.NaN, get mapped to True values.
    Everything else gets mapped to False values. Characters such as empty
    strings '' or numpy.inf are not considered NA values
    (unless you set pandas.options.mode.use_inf_as_na = True).

    Returns
    -------
    Series : Mask of bool values for each element in Series
        that indicates whether an element is an NA value.

    Examples
    --------
    >>> ser = ks.Series([5, 6, np.NaN])
    >>> ser.isna()  # doctest: +NORMALIZE_WHITESPACE
    0    False
    1    False
    2     True
    Name: 0, dtype: bool
    """
    if isinstance(self.spark_type, (FloatType, DoubleType)):
        return self._with_new_scol(self._scol.isNull() | F.isnan(self._scol)).alias(self.name)
    else:
        return self._with_new_scol(self._scol.isNull()).alias(self.name)
def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            if isinstance(index_ops.spark.data_type, (FloatType, DoubleType)):
                scol = F.when(
                    index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                    F.lit(True),
                ).otherwise(index_ops.spark.column.cast(spark_type))
            else:  # DecimalType
                scol = F.when(index_ops.spark.column.isNull(), F.lit(False)).otherwise(
                    index_ops.spark.column.cast(spark_type))
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=InternalField(dtype=dtype),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype, null_str=str(np.nan))
    else:
        return _as_other_type(index_ops, dtype, spark_type)
def count_na(columns):
    """
    Return the NAN and Null count in a Column
    :param columns: '*', list of columns names or a single column name.
    :return:
    """
    columns = parse_columns(self, columns)
    df = self

    expr = []
    for col_name in columns:
        # If type column is Struct parse to String. isnan/isNull can not handle Structure/Boolean
        if is_(df.cols.schema_dtype(col_name), (StructType, BooleanType)):
            df = df.cols.cast(col_name, "string")

        if is_(df.cols.schema_dtype(col_name), (float, int)):
            expr.append(
                F.count(
                    F.when(
                        F.isnan(col_name) | F.col(col_name).isNull(),
                        col_name)).alias(col_name))
        elif is_(df.cols.schema_dtype(col_name), NullType):
            expr.append(F.count(col_name).alias(col_name))
        else:
            expr.append(
                F.count(F.when(F.col(col_name).isNull(),
                               col_name)).alias(col_name))

    result = format_dict(df.select(*expr).to_json())
    return result
def count_not_null(c, nan_as_null=True):
    """Use conversion between boolean and integer:
    - False -> 0
    - True  -> 1
    """
    pred = col(c).isNotNull() & (~isnan(c) if nan_as_null else lit(True))
    return sum(pred.cast("integer")).alias(c)
def transform(self, X):
    '''
    Transforms a given Spark dataframe containing playtime into a z-score
    based on a previously defined fit.

    Parameters: self (which contains a table per the previous fit that is
    used to transform a dataframe X) and X, a dataframe to be transformed
    by subtracting the average from self.table and dividing the difference
    by the standard deviation from self.table.

    Output: a transformed dataframe with z-scores in the playtime_scaled column.
    '''
    X = X.alias('X')
    self.table = self.table.alias('self')
    X2 = (X.join(self.table,
                 on=X['appid'] == self.table['appid'],
                 how='left').select('X.*', 'self.avg', 'self.std_dev'))
    X2 = X2.withColumn('playtime_scaled',
                       (X2['playtime_forever'] - X2['avg']) / X2['std_dev'])
    X2 = X2.drop('avg')
    X2 = X2.drop('std_dev')
    X2 = X2.filter(func.isnan('playtime_scaled') == False)
    return X2
def print_unique_and_missing(df, name_df='imigration'):
    len_df = df.count()
    cols_names = df.columns
    nuniques = df.agg(*(countDistinct(col(c)).alias(c)
                        for c in df.columns)).rdd.flatMap(lambda x: x).collect()
    isnan_or_isnull = df.select([
        count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns
    ]).rdd.flatMap(lambda x: x).collect()

    for col_, uniq, nan in zip(cols_names, nuniques, isnan_or_isnull):
        if name_df == 'imigration':
            print(
                f'Column {col_:<8} has {uniq:>7} unique values and {nan/len_df*100:<8.3}% NaN values'
            )
        if name_df == 'airport':
            print(
                f'Column {col_:<12} has {uniq:>7} unique values and {nan/len_df*100:<4.3}% NaN values'
            )
        if name_df == 'demographics':
            print(
                f'Column {col_:<22} has {uniq:>4} unique values and {nan/len_df*100:<5.3}% NaN values'
            )
        if name_df == 'temperature':
            print(
                f'Column {col_:<29} has {uniq:>6} unique values and {nan/len_df*100:<4.3}% NaN values'
            )
def count_na(columns):
    """
    Return the NAN and Null count in a Column
    :param columns: '*', list of columns names or a single column name.
    :return:
    """
    columns = parse_columns(self, columns)
    df = self

    expr = []
    for col_name in columns:
        # If type column is Struct parse to String. isnan/isNull can not handle Structure
        if is_(df.cols.schema_dtypes(col_name), (StructType, BooleanType)):
            df = df.cols.cast(col_name, "string")
        expr.append(
            F.count(
                F.when(
                    F.isnan(col_name) | F.col(col_name).isNull(),
                    col_name)).alias(col_name))

    result = format_dict(collect_as_dict(df.select(*expr).collect()))
    return result
def isnull(self, index_ops: IndexOpsLike) -> IndexOpsLike:
    return index_ops._with_new_scol(
        index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
        field=index_ops._internal.data_fields[0].copy(
            dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False
        ),
    )
def recommend_n_comics(top_n, new_comics_ids, account_id, als_model, comics_df,
                       spark_instance):
    """
    Given a list of new comics (to the user) and requested number N
    Return list of N comics, ordered descending by recommendation score
    """
    # Create spark Df of new rows
    comics_to_predict = (spark_instance.createDataFrame([
        (account_id, 1, comic_id) for comic_id in new_comics_ids
    ]).select(
        col('_1').alias('account_id'),
        col('_2').alias('bought'),
        col('_3').alias('comic_id')))

    # Get predictions
    test_preds = als_model.transform(comics_to_predict)
    test_preds.persist()

    # Alias
    cdf = comics_df.alias('cdf')
    tp = test_preds.alias('tp')

    # Query results
    results = (tp.join(cdf, tp.comic_id == cdf.comic_id)
               .filter(~isnan(col('prediction')))
               .orderBy('prediction', ascending=False)
               .select('comic_title', 'img_url')
               .limit(top_n)).toPandas()
    return results
def hasnans(self):
    """
    Return True if it has any missing values. Otherwise, it returns False.

    >>> ks.DataFrame({}, index=list('abc')).index.hasnans
    False

    >>> ks.Series(['a', None]).hasnans
    True

    >>> ks.Series([1.0, 2.0, np.nan]).hasnans
    True

    >>> ks.Series([1, 2, 3]).hasnans
    False

    >>> (ks.Series([1.0, 2.0, np.nan]) + 1).hasnans
    True

    >>> ks.Series([1, 2, 3]).rename("a").to_frame().set_index("a").index.hasnans
    False
    """
    sdf = self._internal.spark_frame
    scol = self.spark.column

    if isinstance(self.spark.data_type, (DoubleType, FloatType)):
        return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0]
    else:
        return sdf.select(F.max(scol.isNull())).collect()[0][0]
def isnull(self):
    if isinstance(self.schema[self.name].dataType, (FloatType, DoubleType)):
        return Series(self._scol.isNull() | F.isnan(self._scol), self._kdf,
                      self._index_info)
    else:
        return Series(self._scol.isNull(), self._kdf, self._index_info)
def count_not_null(c, nan_as_null=False):
    """Use conversion between boolean and integer:
    - False -> 0
    - True  -> 1
    """
    pred = col(c).isNotNull() & (~isnan(c) if nan_as_null else lit(True))
    return sum(pred.cast("integer")).alias(c)
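# Hedged usage sketch (illustrative, not from the original source) for
# count_not_null above. It assumes col, isnan, lit and sum are the
# pyspark.sql.functions versions (e.g. `from pyspark.sql.functions import
# col, isnan, lit, sum`), as the helper itself requires; nan_as_null=True only
# makes sense for float/double columns.
def demo_count_not_null():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("count-not-null-demo").getOrCreate()
    df = spark.createDataFrame(
        [(1.0, 2.0), (float("nan"), None), (3.0, 4.0)],
        "a double, b double",
    )
    # One aggregate expression per column; expect a=2 (NaN excluded) and b=2.
    df.agg(*[count_not_null(c, nan_as_null=True) for c in df.columns]).show()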
def execute(self, context):
    self.log.info('Getting the movie details')
    spark = SparkSession.builder.appName('moviedb-etl')\
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

    # path
    s3_path = "s3://{}".format(self.s3_bucket)
    s3_path = s3_path + '/' + self.s3_key

    # read the dataset
    df = spark.read.csv(s3_path, header=True)

    ## Prepare movie and director tables
    # extract columns to create movie table
    movie_fields = [
        "movie_title as title", "imdb_score as rating", "title_year as year",
        "duration", "director_name as director", "gross", "genres",
        "num_user_for_reviews as votes", "content_rating as content", "budget"
    ]
    movie_table = df.selectExpr(movie_fields).dropDuplicates()
    movie_table.show(5)

    # extract columns to create director table
    director_fields = [
        "director_name", "gross", "genres", "movie_title", "content_rating",
        "budget", "imdb_score as rating"
    ]
    director_table = df.selectExpr(director_fields).dropDuplicates()
    director_table.show(5)

    # null value check
    director_table.select([
        count(when(isnan(c), c)).alias(c) for c in director_table.columns
    ]).show()
    movie_table.select([
        count(when(isnan(c), c)).alias(c) for c in movie_table.columns
    ]).show()

    # write the generated dataframes back to s3
    s3_processed = "s3://{}".format(self.s3_bucket)
    s3_processed = s3_processed + '/' + 'processed'
    s3_movies = s3_processed + '/' + 'movies.csv'
    s3_director = s3_processed + '/' + 'director.csv'
    movie_table.write.csv(s3_movies, mode="overwrite")
    director_table.write.csv(s3_director, mode="overwrite")