def remove_outliers(df, column, train=True, train_mean=None, train_std=None):
    '''Remove outliers from a given column of the dataframe.

    Rows are kept only when the column value lies strictly within 10
    standard deviations of the mean. For the training set the mean and
    standard deviation are computed from ``df`` itself; for the testing set
    the corresponding training set statistics must be passed in, in order to
    prevent data leakage.

    Args:
        df (DataFrame): dataframe of either the train or test data
        column (str): column name
        train (bool): whether df is the train set (True) or the test set
        train_mean (float): mean of the train set column (required when
            ``train`` is False)
        train_std (float): standard deviation of the train set column
            (required when ``train`` is False)

    Returns:
        ``(clean_df, mean, std)`` when ``train`` is True, otherwise only
        ``clean_df``.

    Raises:
        ValueError: if ``train`` is False and ``train_mean`` or
            ``train_std`` is missing.
    '''
    if not train:
        # Without the training statistics the comparison would be made
        # against NULL and every row would silently be filtered out.
        if train_mean is None or train_std is None:
            raise ValueError(
                "train_mean and train_std are required when train is False")
        return df.filter(abs(df[column] - train_mean) < 10 * train_std)

    # Each aggregate yields a single row with a single column; read it by
    # position so the lookup does not depend on the column name Spark
    # generates for the aggregate.
    samp_mean = df.agg({column: 'mean'}).collect()[0][0]
    samp_std = df.agg({column: 'std'}).collect()[0][0]
    clean_df = df.filter(abs(df[column] - samp_mean) < 10 * samp_std)
    return clean_df, samp_mean, samp_std
def clean_data(input_df, threshold=15000):
    """Clean the entry/exit counts, fill the gaps and add a traffic column.

    :param input_df: DataFrame with ``entries_count``, ``exits_count``,
        ``time_diff`` and ``time_rounded`` columns.
    :param threshold: counts strictly above this value are treated as
        missing.
    :return: DataFrame with cleaned counts plus ``hour``, ``wkdy`` and
        ``traffic`` columns.
    """
    count_columns = ("entries_count", "exits_count")

    # Counts recorded with time_diff >= 8 (hours, per the original note)
    # become NULL; every other count is replaced by its absolute value.
    cleaned = input_df
    for name in count_columns:
        cleaned = cleaned.withColumn(
            name,
            F.when(F.col("time_diff") >= 8, None).otherwise(
                F.abs(F.col(name))))

    # Absolute counts above the threshold become NULL as well.
    for name in count_columns:
        cleaned = cleaned.withColumn(
            name,
            F.when(F.col(name) > threshold, None).otherwise(F.col(name)))

    # Hour and day of week are added before imputation; impute_nans is
    # expected to average over the same turnstile, hour and day of week.
    cleaned = cleaned.withColumn("hour", F.hour("time_rounded"))
    cleaned = cleaned.withColumn("wkdy", F.dayofweek("time_rounded"))
    for name in count_columns:
        cleaned = impute_nans(cleaned, name)

    # Traffic is the sum of entries and exits.
    return cleaned.withColumn(
        "traffic", F.col("entries_count") + F.col("exits_count"))
def addErrorCols(transformedFull, col_target, col_predict, verbose, logger):
    """Append three error columns comparing target and prediction.

    The new columns are named ``<col_target>_APE`` (absolute percentage
    error), ``<col_target>_AE`` (absolute error) and ``<col_target>_SE``
    (squared error), appended in that order.

    :param transformedFull: Spark DataFrame holding both columns.
    :param col_target: name of the target column.
    :param col_predict: name of the prediction column.
    :param verbose: when truthy, log the start and end of the step.
    :param logger: logger used for progress and error messages.
    :return: the DataFrame with the three extra columns.
    :raises Exception: any failure is logged and re-raised unchanged.
    """
    # (column suffix, expression builder) in the order they are appended.
    error_builders = (
        ('_APE', lambda target, predicted: abs((target - predicted) / target * 100)),
        ('_AE', lambda target, predicted: abs((target - predicted))),
        ('_SE', lambda target, predicted: pow(target - predicted, 2)),
    )
    try:
        if verbose:
            logger.info('Add error columns to spark df start, function add_error_cols()')
        for suffix, build in error_builders:
            error_column = build(transformedFull[col_target],
                                 transformedFull[col_predict])
            transformedFull = transformedFull.select(
                '*', error_column.alias(col_target + suffix))
        if verbose:
            logger.info('Add error columns to spark df end')
    except Exception:
        logger.exception("Fatal error in add_error_cols()")
        raise
    return transformedFull
def getPrecisionAtOneRecallFromPRCurve(curve, recall):
    """Return the precision of the curve point whose recall is closest to ``recall``.

    :param curve: DataFrame with ``recall`` and ``precision`` columns.
    :param recall: recall value to look up.
    :return: precision of the closest point; among points tied on distance
        (within 1e-9) the lowest recall wins, then the highest precision.
    """
    with_distance = curve.withColumn(
        "recall_diff", F.abs(F.col("recall") - recall))

    # Smallest distance between any curve point and the requested recall.
    distance_row = with_distance.agg(
        F.min("recall_diff").alias("min_recall_diff")).collect()[0]
    smallest_distance = distance_row.asDict()["min_recall_diff"]

    # Compare with a tolerance rather than testing floats for equality.
    nearest_points = with_distance.filter(
        F.abs(F.col("recall_diff") - smallest_distance) < 1e-9)
    best_point = nearest_points.sort("recall", F.desc("precision")).first()
    return best_point.asDict()["precision"]
def interests(filename, n, s, c):
    '''
    Compute the interest of FP-Growth association rules.

    interest = |confidence - frequency(consequent)| (note the absolute
    value), where the frequency is the consequent's itemset count divided by
    the number of input rows. The model is fitted with min support <s> and
    min confidence <c>; the first <n> rules are returned, sorted by
    (1) descending antecedent size and (2) descending interest.

    Each input line is comma separated: the first field is stored as
    'plant', the remaining fields as 'items'.

    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    raw_lines = spark.sparkContext.textFile(filename)
    indexed_rows = (
        raw_lines
        .map(lambda line: line.split(","))
        .zipWithIndex()
        .map(lambda pair: (pair[1], pair[0][0], pair[0][1:]))
    )
    df = spark.createDataFrame(indexed_rows, ['id', 'plant', 'items'])

    model = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c).fit(df)
    rules = model.associationRules
    itemsets = model.freqItemsets

    # Attach to every rule the frequent itemset equal to its consequent.
    rules_with_freq = itemsets.join(
        rules, itemsets['items'] == rules["consequent"])
    total = df.count()
    scored = rules_with_freq.withColumn(
        "interest",
        abs(rules_with_freq["confidence"] - rules_with_freq["freq"] / total))

    output_columns = ['antecedent', 'consequent', 'confidence',
                      "items", "freq", "interest"]
    # 'tam' (antecedent size) is only needed for sorting and is dropped
    # again by the final select.
    ranked = (
        scored
        .select(size("antecedent").alias('tam'), *output_columns)
        .sort(desc('tam'), desc('interest'))
        .limit(n)
    )
    return toCSVLine(ranked.select(*output_columns))
def metrics(self, predictions):
    """
    Evaluate the model predictions and print RMSE, MAE and R2.

    :param predictions: DataFrame with ``ArrDelay`` (label) and
        ``prediction`` columns.
    :return: tuple ``(R2, mae, rmse)``.
    """
    # Absolute percentage error per row, stored under the name 'Accuracy'.
    percentage_error = ((predictions['ArrDelay'] - predictions['prediction'])
                        / predictions['ArrDelay']) * 100
    predictions = predictions.withColumn('Accuracy', abs(percentage_error))

    def make_evaluator(metric_name):
        # All three evaluators differ only in the metric they compute.
        return RegressionEvaluator(labelCol="ArrDelay",
                                   predictionCol="prediction",
                                   metricName=metric_name)

    rmse_evaluator = make_evaluator("rmse")
    mae_evaluator = make_evaluator("mae")
    R2_evaluator = make_evaluator("r2")

    R2 = R2_evaluator.evaluate(predictions)
    mae = mae_evaluator.evaluate(predictions)
    rmse = rmse_evaluator.evaluate(predictions)

    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
    print("Mean Absolute Error (MAE) on test data = %g" % mae)
    print("R Squared (R2) on test data = %g" % R2)
    return R2, mae, rmse
def filter_outliers(dataframe, exclude_columns):
    """
    For every feature, except those in exclude_columns, set all outliers
    to NULL.

    A value is an outlier when its absolute z-score is 3 or more, using the
    mean and standard deviation of its own column. Boolean columns are
    skipped, and so are columns with no usable statistics (mean or standard
    deviation missing, NaN or zero); previously such columns had every value
    replaced by NULL because the z-score could not be computed.

    :param dataframe: Spark DataFrame to clean.
    :param exclude_columns: collection of column names to leave untouched.
    :return: DataFrame with outliers replaced by NULL.
    """
    for column in dataframe.columns:
        if column in exclude_columns:
            continue
        # Exclude boolean types.
        if isinstance(dataframe.schema[column].dataType, BooleanType):
            continue

        stats = dataframe \
            .select(_mean(col(column)).alias('mean'),
                    stddev(col(column)).alias('std')) \
            .collect()
        mean = stats[0]['mean']
        std = stats[0]['std']
        print("mean: %s; std: %s" % (str(mean), str(std)))

        # `std != std` is true only for NaN. A zero or undefined standard
        # deviation makes the z-score NULL for every row, which would wipe
        # the whole column, so such columns are left as they are.
        if mean is None or std is None or std != std or std == 0:
            print("Skipped z-score filtering for %s: no usable mean/std."
                  % column)
            continue

        count_before = dataframe.filter(col(column).isNull()).count()
        dataframe = dataframe.withColumn(
            column,
            when(abs((col(column) - mean) / std) < 3,
                 col(column)).otherwise(None))
        print("Deleted %s entries because of z-score (3) for %s." % (
            str(dataframe.filter(col(column).isNull()).count() - count_before),
            column))
    return dataframe
def mad(columns, more=None):
    """
    Return the Median Absolute Deviation, i.e. the median of
    ``|value - median(column)|``, for each numeric column.

    :param columns: Column(s) to be processed
    :param more: When truthy, also return the median of each column.
    :return: ``format_dict`` applied to a dict mapping each column name to
        ``{"mad": ...}`` (plus ``"median"`` when ``more`` is truthy).
    """
    columns = parse_columns(self, columns,
                            filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
    result = {}
    for col_name in columns:
        median_value = self.cols.median(col_name)
        # Replace each value by its absolute distance to the median, then
        # take the median of those distances.
        deviations = self.select(col_name).withColumn(
            col_name, F.abs(F.col(col_name) - median_value))
        mad_value = deviations.cols.median(col_name)

        entry = {"mad": mad_value}
        if more:
            entry["median"] = median_value
        result[col_name] = entry
    return format_dict(result)
def absolute_difference(primary_col: str, secondary_col: str, output_col: str,
                        df: DataFrame):
    """Add ``output_col`` holding ``|primary_col - secondary_col|``.

    The result is rounded to 10 decimal places because, per the original
    note, the absolute function sometimes produces rounding errors.
    """
    signed_gap = F.col(primary_col) - F.col(secondary_col)
    rounded_gap = F.round(F.abs(signed_gap), 10)
    return df.withColumn(output_col, rounded_gap)
def main(inputs, output):
    """Write, for every date, the station(s) with the largest temperature range.

    Reads the observation CSV files from ``inputs``, keeps TMIN and TMAX
    observations whose quality flag is null, computes
    ``range = |TMIN - TMAX| / 10`` per date and station, and writes the
    ``date, station, range`` rows having the maximum range of their date to
    ``output`` as CSV (overwriting existing output).

    :param inputs: path(s) of the input CSV files.
    :param output: destination path for the CSV result.
    """
    observation_schema = types.StructType([
        types.StructField('station', types.StringType(), False),
        types.StructField('date', types.StringType(), False),
        types.StructField('observation', types.StringType(), False),
        types.StructField('value', types.IntegerType(), False),
        types.StructField('mflag', types.StringType(), False),
        types.StructField('qflag', types.StringType(), False),
        types.StructField('sflag', types.StringType(), False),
        types.StructField('obstime', types.StringType(), False),
    ])
    # Read the input files into a DataFrame.
    weather = spark.read.csv(inputs, schema=observation_schema)

    # Keep only rows whose qflag (quality flag) is null, split by
    # observation type.
    t_min = weather.filter(
        (weather.qflag.isNull()) & (weather.observation == 'TMIN'))
    t_max = weather.filter(
        (weather.qflag.isNull()) & (weather.observation == 'TMAX'))
    t_min_selected = t_min.select('date', 'station', 'value')
    t_max_selected = t_max.select('date', 'station', 'value')

    # Name the aggregates explicitly. Renaming the generated "min(value)" /
    # "max(value)" columns through upper-case names only works while Spark
    # resolves column names case-insensitively.
    t_min_group = t_min_selected.groupby('date', 'station').agg(
        functions.min(t_min_selected['value']).alias('min_count'))
    t_max_group = t_max_selected.groupby('date', 'station').agg(
        functions.max(t_max_selected['value']).alias('max_count'))

    # Pair TMIN with TMAX per (date, station); the duplicated key columns
    # coming from the TMIN side are dropped.
    weather_joined = t_min_group.join(
        broadcast(t_max_group),
        ((t_max_group['date'] == t_min_group['date'])
         & ((t_max_group['station'] == t_min_group['station'])))
    ).drop(t_min_group.date).drop(t_min_group.station)

    # Divide by 10 so the range is actually in degrees Celsius (per the
    # original note); the resulting column is called "range".
    weather_joined_newcol = weather_joined.withColumn(
        "range",
        abs((weather_joined.min_count - weather_joined.max_count) / 10))

    # Largest range per date, joined back to find the station(s) reaching it.
    weather_joined_group = weather_joined_newcol.select('date', 'range') \
        .groupby('date').agg(functions.max(weather_joined_newcol['range']))
    weather_joined_two = weather_joined_newcol.join(
        broadcast(weather_joined_group),
        (weather_joined_group['date'] == weather_joined_newcol['date'])
    ).drop(weather_joined_newcol.date)
    weather_joined_filter = weather_joined_two.filter(
        weather_joined_two['range'] == weather_joined_two['max(range)']
    ).sort('date', 'station', ascending=True)

    final_output = weather_joined_filter.select('date', 'station', 'range')
    final_output.write.csv(output, mode='overwrite')
def TAES(spark, df, geolevels, queries, schema, u):
    """Sum absolute differences between normalised 'priv' and 'orig' answers.

    The answers returned by ``sdftools.getAnswers`` are summed per
    (geolevel, run_id) and joined onto ``u``. Each row's ``priv`` and
    ``orig`` values are divided by those sums, and the absolute difference
    of the two shares is summed per (geolevel, run_id). Intermediate frames
    are printed with ``show(10)`` along the way.

    :return: ``(q, z)`` where ``q`` holds the per (geolevel, run_id) sums
        and ``z`` the per-geolevel averages of ``q``.
    """
    totals = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    totals = totals.groupby(['geolevel', 'run_id']).sum()
    u.show(10)
    print("this is z")
    totals.show(10)

    shares = u.join(totals, on=['geolevel', 'run_id'])
    shares = shares.drop('plb', 'budget_group')
    # 'priv' means "protected via the differential privacy routines in this
    # code base"; variable to be renamed after P.L.94-171 production.
    shares = (
        shares
        .withColumn('MDF/sum', sf.col('priv') / sf.col('sum(priv)'))
        .withColumn('CEF/sum', sf.col('orig') / sf.col('sum(orig)'))
        .withColumn('difference', sf.col('MDF/sum') - sf.col('CEF/sum'))
        .withColumn('abs', sf.abs(sf.col('difference')))
    )
    print("This is q")
    shares.show(10)

    summed = shares.groupby(['geolevel', 'run_id']).sum()
    # Summed helper columns that are not needed in the result.
    unwanted = ['sum(diff)', 'sum(sum(orig))', 'sum(sum(priv))',
                'sum(MDF/sum)', 'sum(CEF/sum)', 'sum(difference)']
    print("this is q2")
    summed = summed.drop(*unwanted)
    summed.show(10)

    averaged = summed.groupby(['geolevel']).avg()
    print("this is z")
    averaged.show(10)
    return summed, averaged
def assert_df_matches_expected(df: pyspark.sql.DataFrame,
                               column_name: str,
                               column_type: str = "float",
                               precision: float = 1e-15):
    """Assert that ``column_name`` matches ``expected.<column_name>`` on every row.

    :param df: DataFrame exposing both the actual column and the same
        column under the ``expected`` alias.
    :param column_name: name of the column to compare.
    :param column_type: ``"float"`` compares with an absolute tolerance;
        any other value compares for exact equality.
    :param precision: largest absolute difference (exclusive) accepted for
        float columns.
    :raises AssertionError: if at least one row differs. Rows where the
        comparison evaluates to NULL count as different. The differing rows
        are shown before the assertion fails.
    """
    # Put the expected and actual values side by side.
    df_compare = df.select(
        "*",
        F.col(f"expected.{column_name}").alias("expected_output"),
        F.col(column_name).alias("actual_output"))

    if column_type == "float":
        # Floats: identical when the absolute difference is below precision.
        df_compare = df_compare.withColumn(
            "diff", F.abs(F.col("expected_output") - F.col("actual_output")))
        df_compare = df_compare.withColumn(
            "identical", F.col("diff") < F.lit(precision))
    else:
        # Everything else: exact equality, with a readable diff string.
        df_compare = df_compare.withColumn(
            "diff",
            F.concat(F.lit("expected:"), F.col("expected_output"),
                     F.lit(" vs actual:"), F.col("actual_output")))
        df_compare = df_compare.withColumn(
            "identical", F.col("expected_output") == F.col("actual_output"))

    # A NULL comparison result is treated as a mismatch.
    df_diff = df_compare.filter(
        ~F.coalesce(F.col("identical"), F.lit(False)))

    # Count once: every count() launches a separate Spark job.
    mismatch_count = df_diff.count()
    if mismatch_count > 0:
        df_diff.show()
    assert mismatch_count == 0, (
        f"{mismatch_count} row(s) do not match the expected values "
        f"for column {column_name}")
def getRddWithAbsDiff(spark, df, geolevels, queries, schema):
    """Return the query answers with 'diff' and 'abs diff' columns added.

    'diff' is ``priv - orig`` and 'abs diff' is its absolute value.
    """
    answers = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    # 'priv' means "protected via the differential privacy routines in this
    # code base"; variable to be renamed after P.L.94-171 production.
    signed_error = sf.col('priv') - sf.col('orig')
    return (
        answers
        .withColumn('diff', signed_error)
        .withColumn('abs diff', sf.abs(sf.col('diff')))
    )
def _generate_select_expression_for_extended_string_to_timestamp(
        source_column, name):
    """
    More robust conversion from StringType to TimestampType.

    It is assumed that the timezone is already set to UTC in spark / java
    to avoid implicit timezone conversions.

    The input is trimmed first. Unix timestamps are handled in seconds and
    in milliseconds:
        - Values in the range [-MAX_TIMESTAMP_SEC, MAX_TIMESTAMP_SEC] are
          treated as seconds
        - Values below -MAX_TIMESTAMP_SEC or above MAX_TIMESTAMP_SEC are
          treated as milliseconds
        - Per the original documentation there is a time interval
          (1970-01-01 +- ~2.5 months) where seconds and milliseconds can
          not be distinguished correctly (e.g. 3974400000 would be treated
          as seconds (2095-12-11T00:00:00) as the value is smaller than
          the maximum, but it could also be a valid date in milliseconds
          (1970-02-16T00:00:00))

    Anything that can not be read as a whole number falls back to a plain
    cast of the trimmed string to timestamp.

    Is able to additionally handle (compared to implicit Spark conversion):
        * Preceding whitespace
        * Trailing whitespace
        * Preceding and trailing whitespace

    Hint
    ----
    Please have a look at the tests to get a better feeling how it behaves
    under tests/unit/transformer/test_mapper_custom_data_types.py::TestExtendedStringConversions
    and tests/data/test_fixtures/mapper_custom_data_types_fixtures.py

    Example
    -------
    >>> from spooq.transformer import Mapper
    >>>
    >>> input_df.head(3)
    [Row(input_string="2020-08-12T12:43:14+0000"),
     Row(input_string="1597069446"),
     Row(input_string="2020-08-12")]
    >>> mapping = [("output_value", "input_string", "extended_string_to_timestamp")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(input_string=datetime.datetime(2020, 8, 12, 12, 43, 14)),
     Row(input_string=datetime.datetime(2020, 8, 10, 14, 24, 6)),
     Row(input_string=datetime.datetime(2020, 8, 12, 0, 0, 0))]
    """
    trimmed = F.trim(source_column)
    as_long = trimmed.cast(T.LongType())
    magnitude = F.abs(as_long)

    seconds_branch = as_long.cast(T.TimestampType())
    millis_branch = (trimmed / 1000).cast(T.TimestampType())
    fallback_branch = trimmed.cast(T.TimestampType())

    return (
        F.when(magnitude.between(0, MAX_TIMESTAMP_SEC), seconds_branch)
        .when(magnitude > MAX_TIMESTAMP_SEC, millis_branch)
        .otherwise(fallback_branch)
        .alias(name)
    )
def step_03_join(self):
    """Join the per-source aggregates and compare their shared metrics.

    Steps:
      - Rename every metric column of each source to "<source>_<metric>".
      - Full-outer-join all aggregates on the group-by attributes.
      - When exactly two sources are present, add for every metric they
        share a "delta_<metric>" column (absolute difference) and a
        "match_<metric>" column (test result).

    For float/double metrics a match means a relative difference below
    0.1 percent; other types must be exactly equal. A match is NULL when
    either side is NULL, which happens whenever a group exists in only one
    source.

    :return: the joined DataFrame (also stored on ``self.joined``).
    """
    group_by = self.config["group_by"]

    # Rename every metric with prefix as source_metricname
    for source, agg in self.agg.items():
        metric_cols = [name for name in agg.columns if name not in group_by]
        self.agg[source] = reduce(
            lambda df, metric: df.withColumnRenamed(
                metric, source + "_" + metric),
            metric_cols, agg)

    # Join
    joined = reduce(lambda x, y: x.join(y, how="full", on=group_by),
                    self.agg.values())

    # Calculate differences if there are only two sources
    if len(self.agg) == 2:
        source1, source2 = tuple(self.config["data"].keys())
        source1_metrics = list(self.config["data"][source1]["metrics"].keys())
        source2_metrics = list(self.config["data"][source2]["metrics"].keys())
        # Look for same metrics in both sources
        shared_metrics = sorted(set(source1_metrics) & set(source2_metrics))

        def difference(number1, number2, error=1e-3):
            # The full outer join yields NULLs for groups missing on one
            # side; report "unknown" instead of failing inside the UDF.
            if number1 is None or number2 is None:
                return None
            # A relative difference is undefined for a zero reference
            # value, so fall back to exact equality.
            if number2 == 0:
                return number1 == 0
            return abs((number1 - number2) / number2) < error

        match_udf = F.udf(difference, T.BooleanType())
        # The source columns keep their types while delta_/match_ columns
        # are appended, so the lookup table can be built once.
        column_types = dict(joined.dtypes)

        for metric in shared_metrics:
            column1 = source1 + "_" + metric
            column2 = source2 + "_" + metric
            try:
                joined = joined.withColumn(
                    "delta_" + metric,
                    F.abs(F.col(column1) - F.col(column2)))
            except Exception:
                # Cannot calculate difference, eg in case the metric is
                # string; the delta column is deliberately left out.
                pass

            # For float and double type, the acceptance rate is 0.1 percent
            if column_types[column1] in ("float", "double") \
                    or column_types[column2] in ("float", "double"):
                joined = joined.withColumn(
                    "match_" + metric,
                    match_udf(F.col(column1), F.col(column2)))
            else:
                joined = joined.withColumn(
                    "match_" + metric,
                    F.col(column1) == F.col(column2))

    self.joined = joined
    return joined
def _m_z_score(self):
    """Return ``self.df`` with a modified z-score column for ``self.col_name``.

    The score is ``|0.6745 * (value - median) / mad|``, where the median and
    the median absolute deviation come from ``df.cols.mad``.
    """
    source_df = self.df
    column = self.col_name
    stats = source_df.cols.mad(column, self.relative_error, True)
    score_column = name_col(column, "modified_z_score")
    score = F.abs(0.6745 * (F.col(column) - stats["median"]) / stats["mad"])
    return source_df.withColumn(score_column, score)
def fixed_effects_p(df, grouping_columns):
    """Combine per-study p-values into one weighted Z score per group.

    For each row, ``Z_i = |norm.ppf(P)| * sign(BETA)`` and
    ``w_i = sqrt(n)``. Per group, ``Z = sum(Z_i * w_i) / sqrt(sum(w_i^2))``.

    :param df: DataFrame with ``P``, ``BETA`` and ``n`` columns.
    :param grouping_columns: list of column names to group by.
    :return: DataFrame with the grouping columns plus ``sum(n)``,
        ``sum(studies)`` and ``Z``.
    """
    inverse_normal_udf = f.pandas_udf(lambda x: x.apply(norm.ppf), 'float')

    # BETA / sqrt(BETA^2) is +1 or -1, i.e. the direction of the effect.
    signed = df.withColumn('inverse_normal', inverse_normal_udf(df.P))
    signed = signed.withColumn('sign', df.BETA / f.sqrt(df.BETA * df.BETA))

    weighted = signed.withColumn(
        'Z_i', f.abs(signed['inverse_normal']) * signed['sign'])
    weighted = weighted.withColumn('w_i', f.sqrt(signed['n']))

    products = weighted.withColumn(
        'Z_i_w_i', weighted['Z_i'] * weighted['w_i'])
    products = products.withColumn(
        'w_i_sq', weighted['w_i'] * weighted['w_i'])

    # Every row counts as one study.
    grouped = (
        products
        .withColumn('studies', f.lit(1))
        .groupBy(grouping_columns)
        .agg(f.sum('n'), f.sum('Z_i_w_i'), f.sum('w_i_sq'), f.sum('studies'))
        .withColumnRenamed('sum(Z_i_w_i)', 'sum_Z_i_w_i')
        .withColumnRenamed('sum(w_i_sq)', 'sum_w_i_sq')
    )
    combined = grouped.withColumn(
        'Z', grouped['sum_Z_i_w_i'] / f.sqrt(grouped['sum_w_i_sq']))
    return combined.select(grouping_columns + ['sum(n)', 'sum(studies)', 'Z'])
def get_last_month(col):
    """Return a non-positive column expression derived from a hash of ``col``.

    Five bytes of ``abs(xxhash64(col))`` are each reduced modulo
    ``MAX_MONTH // d`` for d = 2, 3, 5, 7, 11; the result is the negated sum
    of those five terms.
    """
    h = F.abs(F.xxhash64(col))

    # The lowest byte needs no shift.
    total = h.bitwiseAND(0xff) % (MAX_MONTH // 2)
    # (bit offset of the byte, divisor applied to MAX_MONTH)
    for shift, divisor in ((8, 3), (16, 5), (24, 7), (32, 11)):
        byte_value = F.shiftRight(h, shift).bitwiseAND(0xff)
        total = total + byte_value % (MAX_MONTH // divisor)
    return -total
def MAE(spark, df, geolevels, queries, schema):
    """Compute absolute errors of the query answers and their averages.

    :return: ``(u, y, z)`` where ``u`` holds the answers with 'diff'
        (``priv - orig``) and 'abs diff' columns, ``y`` the averages per
        (geocode, geolevel, level) and ``z`` the averages per geolevel.
    """
    answers = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    # 'priv' means "protected via the differential privacy routines in this
    # code base"; variable to be renamed after P.L.94-171 production.
    answers = (
        answers
        .withColumn('diff', sf.col('priv') - sf.col('orig'))
        .withColumn('abs diff', sf.abs(sf.col('diff')))
    )
    per_geounit = answers.groupby(['geocode', 'geolevel', 'level']).avg()
    per_geolevel = answers.groupby(['geolevel']).avg()
    return answers, per_geounit, per_geolevel
def find_otp_bus_legs_actual_end_time(otp_legs_st, clean_bus_trips):
    """Match bus legs with observed bus trips at the leg's destination stop.

    The legs are joined with ``clean_bus_trips`` on date, route, bus code,
    trip number and destination stop. Rows without a ``timestamp`` are
    dropped and ``timediff`` is added as the absolute difference, in
    seconds, between ``timestamp`` and ``otp_end_time``.

    :param otp_legs_st: legs DataFrame with a ``to_stop_id`` column.
    :param clean_bus_trips: trips DataFrame with ``stopPointId`` and
        ``timestamp`` columns.
    :return: joined DataFrame with ``timestamp`` renamed to
        ``to_timestamp``, ordered by date, route, destination stop and
        ``timediff``.
    """
    join_keys = ['date', 'route', 'busCode', 'tripNum', 'stopPointId']
    time_gap = F.abs(F.unix_timestamp(F.col('timestamp'))
                     - F.unix_timestamp(F.col('otp_end_time')))
    return (
        otp_legs_st
        # Temporarily use the trips' column name so the join can key on it.
        .withColumnRenamed('to_stop_id', 'stopPointId')
        .join(clean_bus_trips, join_keys, how='inner')
        .na.drop(subset=['timestamp'])
        .withColumn('timediff', time_gap)
        .withColumnRenamed('timestamp', 'to_timestamp')
        .withColumnRenamed('stopPointId', 'to_stop_id')
        # Order by the column under its current name: 'stopPointId' no
        # longer exists in the result after the rename above.
        .orderBy(['date', 'route', 'to_stop_id', 'timediff'])
    )
def remove_too_fast_objects(self):
    """
    Some data entries are surely erroneous, so some objects appear to move
    up to 10-20 km per second. Remove them.

    For each object (``id``) and calendar day, consecutive points are
    compared: the change in ``lat`` and in ``lon`` is divided by the elapsed
    seconds, and a row is kept only when both absolute rates are below
    ``speed``.

    Side effects: ``self.df`` is replaced by the filtered DataFrame, which
    also gains the helper columns ``delta_lat``, ``delta_lon``,
    ``delta_ts``, ``speed1`` and ``speed2``. ``dropna()`` is applied to all
    columns, so rows containing any NULL are removed, including the first
    point of every (id, day) group, which has no predecessor.

    NOTE(review): ``speed`` is not defined in this method; it is presumably
    a module-level threshold - confirm.

    :return: filtered `self.df`
    """
    window = Window.partitionBy(['id', F.to_date('ts')]).orderBy('ts')
    self.df = (
        self.df
        .withColumn('delta_lat', (F.lag('lat').over(window) - F.col('lat')))
        .withColumn('delta_lon', (F.lag('lon').over(window) - F.col('lon')))
        .withColumn('delta_ts', (F.col('ts').cast('long')
                                 - F.lag('ts').over(window).cast('long')))
        .withColumn('speed1', F.col('delta_lat') / F.col('delta_ts'))
        .withColumn('speed2', F.col('delta_lon') / F.col('delta_ts'))
        .dropna()
        .filter((F.abs(F.col('speed1')) < speed)
                & (F.abs(F.col('speed2')) < speed))
    )
    # The docstring has always promised the filtered frame; return it.
    return self.df
def _evaluate(self, dataset):
    """Return the mean absolute percentage error (MAPE) in percent.

    MAPE is the mean of ``|label - prediction| / |label|`` multiplied by
    100. Rows whose label is 0 use 1 as denominator to avoid dividing by
    zero.

    :param dataset: DataFrame containing ``self.labelCol`` and
        ``self.predictionCol``.
    :return: the MAPE as a float percentage.
    """
    label = F.col(self.labelCol)
    prediction = F.col(self.predictionCol)
    # The error is measured relative to the actual value (the label), not
    # the prediction. The guard is built as an expression so no temporary
    # column can clobber an existing column of the dataset.
    safe_label = F.when(label == 0, 1).otherwise(label)
    mape = dataset.select(
        F.mean(F.abs((label - prediction) / safe_label)).alias('mape')
    ).collect()[0][0]
    return mape * float(100)
def clean_choke(self, method="99"):
    """
    Method to clean WH_choke variables values from the well_df Spark data
    frame attribute

    Parameters
    ----------
    method : str (optional)
        Method to clean out WH_choke values (default is '99').
        "99" keeps only the data rows where WH_choke is larger than 99.
        "no_choke" sets every column to None on the rows where WH_choke is
        lower than 10 or is not constant, i.e. where it differs from the
        previous row by more than 1 or from the row two steps back by more
        than 3. Rows are ordered by "ts" and a missing previous value
        counts as 0. The helper columns WH_choke_lag, WH_choke_diff,
        WH_choke_lag2 and WH_choke_diff2 are added to well_df.
        Any other value only prints a message.
    """
    assert ("WH_choke" in self.well_df.schema.names), \
        'In order to clean out WH choke data, WH choke column ' \
        'in well_df must exist'

    if method == "99":
        # Select well_df only where WH choke is larger than 99%
        self.well_df = self.well_df.where("WH_choke > 99")
    elif method == "no_choke":
        # Window ordering by time
        window = Window.orderBy("ts")
        # Differences against the previous row and the row two steps back.
        self.well_df = self.well_df.withColumn(
            "WH_choke_lag", F.lag("WH_choke", 1, 0).over(window))
        self.well_df = self.well_df.withColumn(
            "WH_choke_diff", F.abs(F.col("WH_choke") - F.col("WH_choke_lag")))
        self.well_df = self.well_df.withColumn(
            "WH_choke_lag2", F.lag("WH_choke_lag", 1, 0).over(window))
        self.well_df = self.well_df.withColumn(
            "WH_choke_diff2",
            F.abs(F.col("WH_choke") - F.col("WH_choke_lag2")))

        invalid_row = ((F.col("WH_choke") < 10)
                       | (F.col("WH_choke_diff") > 1)
                       | (F.col("WH_choke_diff2") > 3))
        # Mask every column in one projection so the condition is always
        # evaluated on the untouched choke values. Masking column by column
        # overwrote WH_choke and the diff columns partway through, so the
        # columns processed afterwards were tested against already-nulled
        # values and kept their data.
        self.well_df = self.well_df.select([
            F.when(invalid_row, None).otherwise(F.col(name)).alias(name)
            for name in self.well_df.schema.names
        ])
    else:
        print("Clean choke method inputted is not know. Try 99 or no_choke")
    return
def modified_z_score(df, col_name, threshold):
    """
    Drop outlier rows from a DataFrame using the modified z score.

    The score of a value is ``|0.6745 * (value - median) / MAD|``, where MAD
    is the median absolute deviation of the column. The score is stored in
    an 'm_z_score' column.

    Reference: http://colingorrie.github.io/outlier-detection.html#modified-z-score-method
    :param df: DataFrame to filter
    :param col_name: column the score is computed on
    :param threshold: rows scoring above this value are dropped
    :return: the filtered DataFrame, including the 'm_z_score' column
    """
    center = df.cols.median(col_name)

    # Median of the absolute deviations from the median.
    deviations = df.select(
        F.abs(F.col(col_name) - center).alias(col_name))
    spread = deviations.cols.median(col_name)

    score = F.abs(0.6745 * (F.col(col_name) - center) / spread)
    scored = df.withColumn('m_z_score', score)
    return scored.rows.drop(F.col("m_z_score") > threshold)
def _add_time_diff(self, df):
    """Deduplicate ``df`` and add the gap in seconds to the next row.

    Rows are deduplicated on the group-by columns plus the time column.
    ``DIFF_COL`` then receives the absolute difference between the time of
    the following row (within ``self.merge_window``) and the row's own time.
    """
    unique_key = self.group_by_columns + [self.time_column]
    deduplicated = df.dropDuplicates(unique_key)

    next_time = F.unix_timestamp(
        F.lead(self.time_column).over(self.merge_window))
    own_time = F.unix_timestamp(F.col(self.time_column))
    return deduplicated.withColumn(DIFF_COL, F.abs(next_time - own_time))
def looping_funct(flow):
    """
    Convert the RDD into a dataframe, drop duplicate tickers and join it
    with the stored user criteria. Users whose price move stays below both
    their 'buy' and 'sell' limits get their volume set to 0. Rows with a
    volume below 1 are written to cassandra; rows with a non-zero volume are
    passed into the calculation function.

    @type flow: RDD
    @param flow: RDD stream from kafka
    """
    ticks = flow.toDF(['time_new', 'ticker', 'volume', 'price'])
    ticks = ticks.dropDuplicates(['ticker'])
    criteria = cass_data.join(ticks, ['ticker'], 'inner')

    # drop users whose condition doesn't need to be calculated
    price_move = abs(col('previous_price') - col('price'))
    no_action = (price_move < col('buy')) & (price_move < col('sell'))
    criteria = criteria.withColumn(
        'volume', when(no_action, 0).otherwise(col('volume')))

    inactive = criteria.filter(criteria.volume < 1)
    writeToCassandra(inactive, 'users', 'graph_data')

    active = criteria.filter(criteria.volume != 0)
    calculation(active)
def calculate_end_of_loan(df):
    """Record overpayments and clamp negative remaining principal to zero.

    ``Overpay_end_of_loan`` is the absolute value of a negative
    ``RemainingPrincipal`` when ``InterestDue`` is not -1, and 0.0
    otherwise. Afterwards a negative ``RemainingPrincipal`` is set to 0.0.
    """
    remaining = col("RemainingPrincipal")
    overpaid = (remaining < 0) & (col('InterestDue') != -1)

    with_overpay = df.withColumn(
        "Overpay_end_of_loan",
        F.when(overpaid, F.abs(remaining)).otherwise(0.0))
    return with_overpay.withColumn(
        "RemainingPrincipal",
        F.when(remaining < 0, 0.0).otherwise(remaining))
def get_fliers(self, outliers):
    """Return the largest absolute outlier values of ``self.colname``.

    Rows are kept when the boolean column ``__<colname>_outlier`` is true;
    at most 1001 values with the highest absolute value are returned as an
    array taken from a pandas DataFrame.
    """
    # Filters only the outliers, should "showfliers" be True
    flag_column = '__{}_outlier'.format(self.colname)
    flagged = outliers.filter(flag_column)

    # Takes the rows with the highest absolute values, capped at 1001
    magnitudes = flagged.select(
        F.abs(F.col(self.colname)).alias(self.colname))
    largest = magnitudes.orderBy(F.desc(self.colname)).limit(1001)
    return largest.toPandas()[self.colname].values
def abs(columns):
    """
    Replace the values of the given columns by their absolute value.

    :param columns: column(s) to process, as accepted by parse_columns
    :return: DataFrame with the selected columns overwritten
    """
    result = self
    for name in parse_columns(self, columns):
        absolute = F.abs(F.col(name))
        result = result.withColumn(name, absolute)
    return result
def abs(columns):
    """
    Replace the values of the given numeric columns by their absolute value.

    :param columns: column(s) to process; only those with a numeric dtype
        are kept by parse_columns
    :return: DataFrame with the selected columns overwritten
    """
    numeric_columns = parse_columns(
        self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
    result = self
    for name in numeric_columns:
        absolute = F.abs(F.col(name))
        result = result.withColumn(name, absolute)
    return result
F.stddev(status_joined_df.precipIntensity).alias("stddevPrecipitation"), F.stddev(status_joined_df.windSpeed).alias("stddevWindSpeed"))) stats_df.write.mode('overwrite').parquet("hdfs://hadoop:9000/models/weather-stats") stats = stats_df.collect()[0] print "Statistics: %s" % (stats,) day_of_week = F.udf( lambda d: datetime.datetime.strptime(d, "%Y-%m-%d").weekday(), IntegerType()) status_normalized_df = (status_joined_df .withColumn( "zTemperature", F.abs(status_joined_df.temperature - stats.avgTemp) / stats.stddevTemp) .withColumn( "zHumidity", F.abs(status_joined_df.humidity - stats.avgHumidity) / stats.stddevHumidity) .withColumn( "zPressure", F.abs(status_joined_df.pressure - stats.avgPressure) / stats.stddevPressure) .withColumn( "zVisibility", F.abs(10 - status_joined_df.visibility) / stats.stddevVisibility) .withColumn( "zPrecipitation", F.abs(status_joined_df.precipIntensity) / stats.stddevPrecipitation) .withColumn( "zWindSpeed", F.abs(status_joined_df.windSpeed) / stats.stddevWindSpeed) .withColumn( "dayOfWeek", day_of_week(status_joined_df.date)) ) status_normalized_df.show()