Example #1
def remove_outliers(df, column, train=True, train_mean=None, train_std=None):
    '''Remove outliers from a given column of the dataframe. 

    The function uses the training set mean and standard deviation to remove all 
    points not within 10 standard deviations. It is applied to the testing set with the 
    corresponding training set's statistics in order to prevent data leakage.

    Args:
        df (DataFrame): dataframe of either the train or test data
        column (str): column name
        train (bool): whether df is the train or test set
        train_mean (float): mean of the train set column
        train_std (float): standard deviation of the train set column 

    Returns:
        DataFrame (plus the training-set mean and standard deviation when train=True)
    '''
    if train:
        samp_mean = df.agg({column: 'mean'}).collect()[0]['avg(' + column + ')']
        samp_std = df.agg({column: 'std'}).collect()[0]['stddev(' + column + ')']
        clean_df = df.filter(abs(df[column] - samp_mean) < 10 * samp_std)
        return clean_df, samp_mean, samp_std
    else:
        clean_df = df.filter(abs(df[column] - train_mean) < 10 * train_std)
        return clean_df
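A minimal usage sketch of the pattern above (the DataFrame and column names here are hypothetical): fit the bounds on the training data, then reuse the returned statistics on the test data so no test-set information leaks into the filter.

train_clean, price_mean, price_std = remove_outliers(train_df, 'price', train=True)
test_clean = remove_outliers(test_df, 'price', train=False,
                             train_mean=price_mean, train_std=price_std)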
Example #2
def clean_data(input_df, threshold=15000):
    """Clean data"""
    # Set counts that correspond to time_diff >= 8h to NaNs
    # Take absolute values of entries_count and exits_count
    tmp_df = (
        input_df
        .withColumn("entries_count", F.when(F.col("time_diff") >= 8, None).otherwise(
            F.abs(F.col("entries_count"))))
        .withColumn("exits_count", F.when(F.col("time_diff") >= 8, None).otherwise(
            F.abs(F.col("exits_count"))))
    )

    # Replace entries_count and exits_count values above `threshold` (default 15,000) with NaNs
    tmp_df = (
        tmp_df
        .withColumn("entries_count", F.when(F.col("entries_count") > threshold, None).otherwise(
            F.col("entries_count")))
        .withColumn("exits_count", F.when(F.col("exits_count") > threshold, None).otherwise(
            F.col("exits_count")))
    )

    # Impute NaNs with average counts of the same turnstile, hour & day of week
    tmp_df = (
        tmp_df
        .withColumn("hour", F.hour("time_rounded"))
        .withColumn("wkdy", F.dayofweek("time_rounded"))
    )
    tmp_df = impute_nans(tmp_df, "entries_count")
    tmp_df = impute_nans(tmp_df, "exits_count")

    # Compute traffic
    output_df = tmp_df.withColumn("traffic", F.col("entries_count") + F.col("exits_count"))
    return output_df
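clean_data calls an impute_nans helper that is not shown above. A rough sketch of what such a helper could look like, assuming the intent stated in the comment (fill nulls with the average count for the same turnstile, hour and day of week) and a hypothetical "turnstile_id" column as the turnstile key:

from pyspark.sql import functions as F, Window

def impute_nans(df, count_col):
    # Average over rows sharing the same (hypothetical) turnstile id, hour and weekday;
    # F.avg ignores nulls, so the window mean is computed from the valid rows only.
    w = Window.partitionBy("turnstile_id", "hour", "wkdy")
    return df.withColumn(
        count_col,
        F.coalesce(F.col(count_col), F.avg(F.col(count_col)).over(w)))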
Example #3
def addErrorCols(transformedFull,
                 col_target,
                 col_predict,
                 verbose,
                 logger):
    try:
        if verbose:
            logger.info('Add error columns to spark df start, function addErrorCols()')
        transformedFull = transformedFull\
                         .select('*', abs((transformedFull[col_target] - transformedFull[col_predict])
                                         /transformedFull[col_target]*100)\
                         .alias(col_target+'_APE'))
        transformedFull = transformedFull\
                         .select('*', abs((transformedFull[col_target] - transformedFull[col_predict]))\
                         .alias(col_target+'_AE'))
        transformedFull = transformedFull\
                         .select('*', pow(transformedFull[col_target] - transformedFull[col_predict],2)\
                         .alias(col_target+'_SE'))

        if verbose:
            logger.info('Add error columns to spark df end')
    except Exception:
        logger.exception("Fatal error in add_error_cols()")
        raise
    return transformedFull
Example #4
def getPrecisionAtOneRecallFromPRCurve(curve, recall):
    pr_curve_with_recall_diff = curve\
        .withColumn("recall_diff", F.abs(F.col("recall") - recall))
    min_recall_diff = pr_curve_with_recall_diff\
        .agg(F.min("recall_diff")\
        .alias("min_recall_diff"))\
        .collect()[0].asDict()["min_recall_diff"]
    precision = pr_curve_with_recall_diff\
        .filter(F.abs(F.col("recall_diff") - min_recall_diff) < 1e-9)\
        .sort("recall", F.desc("precision"))\
        .first().asDict()["precision"]

    return precision
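A small usage sketch, assuming an active SparkSession named spark and a toy precision-recall curve with one row per threshold (in practice the curve would come from a model's evaluation output):

from pyspark.sql import functions as F

pr_curve = spark.createDataFrame(
    [(0.0, 1.0), (0.25, 0.9), (0.5, 0.8), (0.75, 0.6), (1.0, 0.4)],
    ["recall", "precision"])

# Precision at (approximately) 50% recall.
p_at_50 = getPrecisionAtOneRecallFromPRCurve(pr_curve, 0.5)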
Example #5
def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes 
    the interest of association rules (interest = |confidence - 
    frequency(consequent)|; note the absolute value)  obtained using 
    min support <s> and min confidence <c> (parameters of the FP-Growth 
    model), and prints the first <n> rules sorted by (1) descending 
    antecedent size in association rule, and (2) descending interest.

    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename).map(lambda l: l.split(",")).zipWithIndex().map(
        lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant', 'items'])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result = model.associationRules
    modelResult = model.freqItemsets
    result = modelResult.join(result, modelResult['items'] == result["consequent"])
    total = df.count()

    result = result.withColumn("interest", abs(result["confidence"] - result["freq"] / total))
    result = result.select(size("antecedent").alias('tam'), 'antecedent', 'consequent', 'confidence', "items", "freq", "interest")
    result = result.sort(desc('tam'), desc('interest')).limit(n)
    result = result.select('antecedent', 'consequent', 'confidence', "items", "freq", "interest")

    return toCSVLine(result)
Example #6
    def metrics(self, predictions):
        """
        Evaluates the results of the model

        """
        # Absolute percentage error per row (stored under the name 'Accuracy').
        x = ((predictions['ArrDelay'] - predictions['prediction']) / predictions['ArrDelay']) * 100
        predictions = predictions.withColumn('Accuracy', abs(x))

        rmse_evaluator = RegressionEvaluator(
            labelCol="ArrDelay",
            predictionCol="prediction",
            metricName="rmse")

        mae_evaluator = RegressionEvaluator(labelCol='ArrDelay',
                                            predictionCol="prediction",
                                            metricName="mae")

        R2_evaluator = RegressionEvaluator(predictionCol="prediction",
                                           labelCol="ArrDelay",
                                           metricName="r2")

        R2 = R2_evaluator.evaluate(predictions)
        mae = mae_evaluator.evaluate(predictions)
        rmse = rmse_evaluator.evaluate(predictions)

        print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
        print("Mean Absolute Error (MAE) on test data = %g" % mae)
        print("R Squared (R2) on test data = %g" % R2)

        return R2, mae, rmse
Example #7
def filter_outliers(dataframe, exclude_columns):
    """
    For every feature, except those in exclude_columns, set all outliers to NULL.
    """
    for column in dataframe.columns:
        if column in exclude_columns:
            continue
        # Exclude boolean types.
        if dataframe.schema[column].dataType == BooleanType():
            continue
        stats = dataframe \
            .select(_mean(col(column)).alias('mean'), stddev(col(column)).alias('std')) \
            .collect()
        mean = stats[0]['mean']
        std = stats[0]['std']
        print("mean: %s; std: %s" % (str(mean), str(std)))
        count_before = dataframe.filter(col(column).isNull()).count()
        dataframe = dataframe.withColumn(
            column,
            when(abs((col(column) - mean) / std) < 3,
                 col(column)).otherwise(None))
        print("Deleted %s entries because of z-score (3) for %s." % (
            str(dataframe.filter(col(column).isNull()).count() - count_before),
            column))
    return dataframe
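The snippet above uses col, _mean, stddev, when, abs and BooleanType without showing its imports; it presumably relies on something along these lines (pyspark's abs must shadow the builtin for the z-score expression to work on Columns):

from pyspark.sql.functions import abs, col, mean as _mean, stddev, when
from pyspark.sql.types import BooleanType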
Example #8
    def mad(columns, more=None):
        """
        Return the Median Absolute Deviation
        :param columns: Column to be processed
        :param more: Return some extra computed values (Median).
        :return:
        """
        columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
        result = {}
        for col_name in columns:

            _mad = {}

            # MAD = median(|x - median(x)|)
            median_value = self.cols.median(col_name)

            mad_value = self.select(col_name) \
                .withColumn(col_name, F.abs(F.col(col_name) - median_value)) \
                .cols.median(col_name)

            if more:
                _mad = {"mad": mad_value, "median": median_value}
            else:
                _mad = {"mad": mad_value}

            result[col_name] = _mad

        return format_dict(result)
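The implementation above depends on Optimus helpers (parse_columns, self.cols.median, format_dict). As a rough illustration of the same computation in plain PySpark, assuming a single numeric column and accepting approxQuantile as the median:

from pyspark.sql import functions as F

def mad_plain(df, col_name, relative_error=0.01):
    # MAD = median(|x - median(x)|), with both medians taken approximately.
    median = df.approxQuantile(col_name, [0.5], relative_error)[0]
    abs_dev = df.select(F.abs(F.col(col_name) - median).alias("abs_dev"))
    return abs_dev.approxQuantile("abs_dev", [0.5], relative_error)[0]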
Example #9
def absolute_difference(primary_col: str, secondary_col: str, output_col: str,
                        df: DataFrame):
    """Return the absolute difference between 2 columns"""
    # note that sometimes the absolute function produces rounding errors
    return df.withColumn(
        output_col,
        F.round(F.abs(F.col(primary_col) - F.col(secondary_col)), 10))
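A hypothetical usage sketch (the column and DataFrame names are made up): add a price_gap column holding |list_price - sale_price|, rounded to 10 decimal places to mask floating-point noise.

with_gap = absolute_difference("list_price", "sale_price", "price_gap", deals_df)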
Example #10
def main(inputs, output):
    observation_schema = types.StructType([
        types.StructField('station', types.StringType(), False),
        types.StructField('date', types.StringType(), False),
        types.StructField('observation', types.StringType(), False),
        types.StructField('value', types.IntegerType(), False),
        types.StructField('mflag', types.StringType(), False),
        types.StructField('qflag', types.StringType(), False),
        types.StructField('sflag', types.StringType(), False),
        types.StructField('obstime', types.StringType(), False),
    ])

    weather = spark.read.csv(inputs, schema=observation_schema) #Read the input files into a DataFrame
    t_min = weather.filter((weather.qflag.isNull())&(weather.observation=='TMIN')) # keep rows where qflag (quality flag) is null and the observation is 'TMIN'
    t_max = weather.filter((weather.qflag.isNull())&(weather.observation=='TMAX'))
    t_min_selected = t_min.select('date','station','value')
    t_max_selected = t_max.select('date','station','value')
    t_min_group = t_min_selected.groupby('date','station').agg(functions.min(t_min_selected['value'])).withColumnRenamed("MIN(value)", "min_count")
    t_max_group = t_max_selected.groupby('date','station').agg(functions.max(t_max_selected['value'])).withColumnRenamed("MAX(value)", "max_count")
    weather_joined = t_min_group.join(broadcast(t_max_group), ((t_max_group['date'] == t_min_group['date'])&((t_max_group['station'] == t_min_group['station'])))).drop(t_min_group.date).drop(t_min_group.station)
    weather_joined_newcol = weather_joined.withColumn("range", abs((weather_joined.min_count - weather_joined.max_count)/10)) # Divide the temperature difference by 10 so it's actually in °C, and call the resulting column 'range'.
    weather_joined_group = weather_joined_newcol.select('date', 'range').groupby('date').agg(functions.max(weather_joined_newcol['range']))
    weather_joined_two = weather_joined_newcol.join(broadcast(weather_joined_group), (weather_joined_group['date'] == weather_joined_newcol['date'])).drop(weather_joined_newcol.date)
    weather_joined_filter = weather_joined_two.filter(weather_joined_two['range'] == weather_joined_two['max(range)']).sort('date','station', ascending=True)
    final_output = weather_joined_filter.select('date','station','range')
    final_output.write.csv(output, mode='overwrite')
Example #11
def TAES(spark,df,geolevels,queries,schema,u):
    z=sdftools.getAnswers(spark,df,geolevels,schema,queries)
    z=z.groupby(['geolevel','run_id']).sum()
    u.show(10)
    print("this is z")
    z.show(10)
    q=u.join(z, on=['geolevel','run_id'])
    columnstodrop=['plb','budget_group']
    q=q.drop(*columnstodrop)
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    q=q.withColumn('MDF/sum',sf.col('priv')/sf.col('sum(priv)'))
    q=q.withColumn('CEF/sum',sf.col('orig')/sf.col('sum(orig)'))
    q=q.withColumn('difference',sf.col('MDF/sum')-sf.col('CEF/sum'))
    q=q.withColumn('abs',sf.abs(sf.col('difference')))
    print("This is q")
    q.show(10)
    q=q.groupby(['geolevel','run_id']).sum()
    columnstodrop=['sum(diff)','sum(sum(orig))','sum(sum(priv))','sum(MDF/sum)','sum(CEF/sum)','sum(difference)']
    print("this is q2")
    q=q.drop(*columnstodrop)
    q.show(10)
    z=q.groupby(['geolevel']).avg()
    print("this is z")
    z.show(10)
    return q,z
Example #12
def assert_df_matches_expected(df: pyspark.sql.DataFrame,
                               column_name: str,
                               column_type: str = "float",
                               precision: float = 1e-15):
    # compare the column requested to the same one with alias of "expected" and see that all are under certain precision
    df_compare = df.select(
        "*",
        F.col(f"expected.{column_name}").alias("expected_output"),
        F.col(column_name).alias("actual_output"))
    # subtract and compare diff
    if (column_type == "float"):
        df_compare = df_compare.withColumn(
            "diff", F.abs(F.col("expected_output") - F.col("actual_output")))
        df_compare = df_compare.withColumn("identical",
                                           F.col("diff") < F.lit(precision))
    else:
        df_compare = df_compare.withColumn(
            "diff",
            F.concat(F.lit("expected:"), F.col("expected_output"),
                     F.lit(" vs actual:"), F.col("actual_output")))
        df_compare = df_compare.withColumn(
            "identical",
            F.col("expected_output") == F.col("actual_output"))

    # find ones that are different
    df_diff = df_compare.filter(
        F.coalesce(F.col("identical"), F.lit(False)) != True)
    diff_count = df_diff.count()
    if diff_count > 0:
        df_diff.show()
    assert diff_count == 0, \
        f"Found {diff_count} mismatching values between actual and expected for column {column_name}"
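A hypothetical usage sketch, assuming an active SparkSession named spark; the function expects the reference values to live in a struct column named expected, so that expected.<column_name> resolves:

from pyspark.sql import functions as F

df = spark.createDataFrame([(1.0,), (2.0,)], ["value"])
# In a real test the struct would hold independently computed reference values.
df = df.withColumn("expected", F.struct(F.col("value").alias("value")))
assert_df_matches_expected(df, "value", column_type="float", precision=1e-9)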
Example #13
def getRddWithAbsDiff(spark, df, geolevels, queries, schema):
    rddWithAnswers = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    rddWithDiff = rddWithAnswers.withColumn('diff',
                                            sf.col('priv') - sf.col('orig'))
    rddWithAbsDiff = rddWithDiff.withColumn('abs diff', sf.abs(sf.col('diff')))
    return rddWithAbsDiff
Example #14
def _generate_select_expression_for_extended_string_to_timestamp(
        source_column, name):
    """
    More robust conversion from StringType to TimestampType. It is assumed that the
    timezone is already set to UTC in spark / java to avoid implicit timezone conversions.

    The conversion can handle unix timestamps in seconds and in milliseconds:
        - Timestamps in the range [-MAX_TIMESTAMP_S, MAX_TIMESTAMP_S] are treated as seconds
        - Timestamps in the range [-inf, -MAX_TIMESTAMP_S) and (MAX_TIMESTAMP_S, inf] are treated as milliseconds
        - There is a time interval (1970-01-01 +- ~2.5 months) where we cannot distinguish correctly between s and ms
          (e.g. 3974400000 would be treated as seconds (2095-12-11T00:00:00) as the value is smaller than
          MAX_TIMESTAMP_S, but it could also be a valid date in milliseconds (1970-02-16T00:00:00))

    Is able to additionally handle (compared to implicit Spark conversion):
    * Preceding whitespace
    * Trailing whitespace
    * Preceding and trailing whitespace

    Hint
    ----
    Please have a look at the tests to get a better feeling how it behaves under
    tests/unit/transformer/test_mapper_custom_data_types.py::TestExtendedStringConversions and
    tests/data/test_fixtures/mapper_custom_data_types_fixtures.py

    Example
    -------
    >>> from spooq.transformer import Mapper
    >>>
    >>> input_df.head(3)
    [Row(input_string="2020-08-12T12:43:14+0000"),
     Row(input_string="1597069446"),
     Row(input_string="2020-08-12")]
    >>> mapping = [("output_value", "input_string", "extended_string_to_timestamp")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(output_value=datetime.datetime(2020, 8, 12, 12, 43, 14)),
     Row(output_value=datetime.datetime(2020, 8, 10, 14, 24, 6)),
     Row(output_value=datetime.datetime(2020, 8, 12, 0, 0, 0))]
    """
    return (F.when(
        F.abs(F.trim(source_column).cast(T.LongType())).between(
            0, MAX_TIMESTAMP_SEC),
        F.trim(source_column).cast(T.LongType()).cast(T.TimestampType()),
    ).when(
        F.abs(F.trim(source_column).cast(T.LongType())) > MAX_TIMESTAMP_SEC,
        (F.trim(source_column) / 1000).cast(T.TimestampType()),
    ).otherwise(F.trim(source_column).cast(T.TimestampType())).alias(name))
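Outside of a Mapper, the generated expression can also be used directly in a select. A minimal sketch, assuming an active SparkSession named spark and that MAX_TIMESTAMP_SEC is already defined at module level as the seconds/milliseconds cutoff:

from pyspark.sql import functions as F

raw_df = spark.createDataFrame(
    [("2020-08-12T12:43:14+0000",), ("1597069446",), ("  2020-08-12 ",)],
    ["input_string"])

parsed_df = raw_df.select(
    _generate_select_expression_for_extended_string_to_timestamp(
        F.col("input_string"), "parsed_ts"))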
Example #15
    def step_03_join(self):
        # TODO:
        # - Join all results of step_02 based on the group-by attributes.
        # - Rename each metric to "datasource: metric_name".
        # - For each combination of data sources, calculate a data difference column.
        # - Calculate a test_result column indicating whether every related metric matches (only if 2 input sources are provided).
        group_by = self.config["group_by"]

        # Rename every metric with prefix as source_metricname
        for source, agg in self.agg.items():
            metric_cols = list(filter(lambda x: x not in group_by,
                                      agg.columns))
            self.agg[source] = reduce(
                lambda df, metric: df.withColumnRenamed(
                    metric, source + "_" + metric), metric_cols, agg)

        # Join
        joined = reduce(lambda x, y: x.join(y, how="full", on=group_by),
                        self.agg.values())

        # Calculate differences if there are only two sources
        if len(self.agg) == 2:
            source1, source2 = tuple(self.config["data"].keys())
            source1_metrics = list(
                self.config["data"][source1]["metrics"].keys())
            source2_metrics = list(
                self.config["data"][source2]["metrics"].keys())
            # Look for same metrics in both sources
            # I know that it could be done in O(n), this is more readable
            shared_metrics = sorted(
                set(source1_metrics) & set(source2_metrics))
            for metric in shared_metrics:
                try:
                    joined = joined.withColumn(
                        "delta_" + metric,
                        F.abs(
                            F.col(source1 + "_" + metric) -
                            F.col(source2 + "_" + metric)))
                except Exception:  # Cannot calculate difference, e.g. when the metric is a string
                    pass
                # For float and double type, the acceptance rate is 0.1 percent
                if dict(joined.dtypes)[source1 + "_" + metric] in ("float", "double") \
                    or dict(joined.dtypes)[source2 + "_" + metric] in ("float", "double"):

                    def difference(number1, number2, error=1e-3):
                        return abs((number1 - number2) / number2) < error

                    joined = joined.withColumn(
                        "match_" + metric,
                        F.udf(difference,
                              T.BooleanType())(F.col(source1 + "_" + metric),
                                               F.col(source2 + "_" + metric)))
                else:
                    joined = joined.withColumn(
                        "match_" + metric,
                        F.col(source1 + "_" + metric) == F.col(source2 + "_" +
                                                               metric))
        self.joined = joined
        return joined
Example #16
    def _m_z_score(self):
        df = self.df
        col_name = self.col_name

        mad = df.cols.mad(col_name, self.relative_error, True)
        m_z_col_name = name_col(col_name, "modified_z_score")

        return df.withColumn(m_z_col_name, F.abs(0.6745 * (F.col(col_name) - mad["median"]) / mad["mad"]))
Example #17
def fixed_effects_p (df, grouping_columns):
  inverse_normal_udf = f.pandas_udf (lambda x: x.apply(norm.ppf), 'float')
  temp = df.withColumn('inverse_normal', inverse_normal_udf (df.P)).withColumn('sign', df.BETA / f.sqrt(df.BETA*df.BETA))
  temp1 = temp.withColumn('Z_i', f.abs(temp.inverse_normal) * temp.sign).withColumn('w_i', f.sqrt(temp.n))
  temp2 = temp1.withColumn('Z_i_w_i', temp1.Z_i * temp1.w_i).withColumn('w_i_sq', temp1.w_i * temp1.w_i)
  grouped = temp2.withColumn('studies', f.lit(1)).groupBy(grouping_columns).agg(f.sum('n'), f.sum('Z_i_w_i'), f.sum('w_i_sq'), f.sum('studies')).withColumnRenamed('sum(Z_i_w_i)','sum_Z_i_w_i').withColumnRenamed('sum(w_i_sq)', 'sum_w_i_sq')
  final = grouped.withColumn('Z', grouped.sum_Z_i_w_i / f.sqrt(grouped.sum_w_i_sq))
  return(final.select(grouping_columns + ['sum(n)','sum(studies)','Z']))
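The combined Z from this weighted (Stouffer-style) fixed-effects step can be converted back into a two-sided p-value, p = 2 * (1 - Phi(|Z|)). A minimal sketch reusing the same pandas_udf pattern; gwas_df and the 'SNP' grouping column are hypothetical:

from pyspark.sql import functions as f
from scipy.stats import norm

# Two-sided p-value from the combined Z statistic.
p_from_z_udf = f.pandas_udf(lambda z: z.abs().apply(lambda v: 2 * norm.sf(v)), 'double')

results = fixed_effects_p(gwas_df, ['SNP'])
results = results.withColumn('P_fixed', p_from_z_udf(f.col('Z')))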
Example #18
 def get_last_month(col):
     h = F.abs(F.xxhash64(col))
     h1 = (h.bitwiseAND(0xff)) % (MAX_MONTH // 2)
     h2 = (F.shiftRight(h, 8).bitwiseAND(0xff)) % (MAX_MONTH // 3)
     h3 = (F.shiftRight(h, 16).bitwiseAND(0xff)) % (MAX_MONTH // 5)
     h4 = (F.shiftRight(h, 24).bitwiseAND(0xff)) % (MAX_MONTH // 7)
     h5 = (F.shiftRight(h, 32).bitwiseAND(0xff)) % (MAX_MONTH // 11)
     return -(h1 + h2 + h3 + h4 + h5)
Example #19
 def MAE(spark, df, geolevels, queries, schema):
     u = sdftools.getAnswers(spark, df, geolevels, schema, queries)
     # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
     u = u.withColumn('diff', sf.col('priv') - sf.col('orig'))
     u = u.withColumn('abs diff', sf.abs(sf.col('diff')))
     y = u.groupby(['geocode', 'geolevel', 'level']).avg()
     z = u.groupby(['geolevel']).avg()
     return u, y, z
Example #20
def find_otp_bus_legs_actual_end_time(otp_legs_st, clean_bus_trips):
    return otp_legs_st \
                .withColumnRenamed('to_stop_id','stopPointId') \
                .join(clean_bus_trips, ['date','route','busCode','tripNum','stopPointId'], how='inner') \
                .na.drop(subset=['timestamp']) \
                .withColumn('timediff',F.abs(F.unix_timestamp(F.col('timestamp')) - F.unix_timestamp(F.col('otp_end_time')))) \
                .withColumnRenamed('timestamp', 'to_timestamp') \
                .withColumnRenamed('stopPointId','to_stop_id') \
                .orderBy(['date','route','to_stop_id','timediff'])
Example #21
    def remove_too_fast_objects(self):
        """
        Some data entries are surely erroneous, making some objects appear to move 10-20 km per second.
        We should remove them.

        :return:  filtered `self.df`
        """

        window = Window.partitionBy(['id', F.to_date('ts')]).orderBy('ts')

        self.df = self.df \
            .withColumn('delta_lat', (F.lag('lat').over(window) - F.col('lat'))) \
            .withColumn('delta_lon', (F.lag('lon').over(window) - F.col('lon'))) \
            .withColumn('delta_ts', (F.col('ts').cast('long') - F.lag('ts').over(window).cast('long'))) \
            .withColumn('speed1', F.col('delta_lat') / F.col('delta_ts')) \
            .withColumn('speed2', F.col('delta_lon') / F.col('delta_ts')) \
            .dropna() \
            .filter((F.abs(F.col('speed1')) < speed) & (F.abs(F.col('speed2')) < speed))
Example #22
 def _evaluate(self, dataset):
     dataset = dataset.withColumn(
         'non_zero',
         F.when(F.col(self.predictionCol) == 0,
                1).otherwise(F.col(self.predictionCol)))
     return (dataset.select(F.mean(
             F.abs(
                 (F.col(self.labelCol) - F.col(self.predictionCol)) / F.col('non_zero'))).alias('mape')) \
             .collect()[0][0]) * float(100)
Example #23
    def clean_choke(self, method="99"):
        """
        Method to clean WH_choke variables values from the well_df Spark data frame attribute

        Parameters
        ----------
        method : str (optional)
            Method to clean out WH_choke values. "99" entails suppressing all the data rows where the choke is lower
            than 99%. "no_choke" entails setting to None all the rows where the WH_choke value is 0 or where it is non
            constant i.e. differential is larger than 1 or second differential is larger than 3 (default is '99').
        """

        assert ("WH_choke" in self.well_df.schema.names), 'In order to clean out WH choke data, WH choke column' \
                                                          'in well_df must exist'

        if method == "99":
            self.well_df = self.well_df.where("WH_choke > 99")  # Select well_df only where WH is larger than 99%

        elif method == "no_choke":

            # Select well_df only where WH choke is constant
            window = Window.orderBy("ts")  # Window ordering by time

            # Create differential and second differential columns for WH choke
            self.well_df = self.well_df.withColumn("WH_choke_lag", F.lag("WH_choke", 1, 0).over(window))
            self.well_df = self.well_df.withColumn("WH_choke_diff", F.abs(F.col("WH_choke") - F.col("WH_choke_lag")))
            self.well_df = self.well_df.withColumn("WH_choke_lag2", F.lag("WH_choke_lag", 1, 0).over(window))
            self.well_df = self.well_df.withColumn("WH_choke_diff2", F.abs(F.col("WH_choke") - F.col("WH_choke_lag2")))

            for col in self.well_df.schema.names:
                # Set all rows with WH choke less than 10 to 0
                self.well_df = self.well_df.withColumn(col, F.when(F.col("WH_choke") < 10, None).
                                                       otherwise(F.col(col)))
                # Select well_df where WH choke gradient is less than 1, set rows with high gradient to None
                self.well_df = self.well_df.withColumn(col,
                                                       F.when(F.col("WH_choke_diff") > 1, None).
                                                       otherwise(F.col(col)))
                # Select well_df where WH choke curvature is less than 3, set rows with higher values to None
                self.well_df = self.well_df.withColumn(col,
                                                       F.when(F.col("WH_choke_diff2") > 3, None).
                                                       otherwise(F.col(col)))
        else:
            print("Clean choke method inputted is not know. Try 99 or no_choke")
        return
Example #24
 def modified_z_score(df, col_name, threshold):
     """
     Delete outliers from a DataFrame using modified z score
     Reference: http://colingorrie.github.io/outlier-detection.html#modified-z-score-method
     :param df:
     :param col_name:
     :param threshold:
     :return:
     """
     median = df.cols.median(col_name)
     median_absolute_deviation = df.select(
         F.abs(F.col(col_name) -
               median).alias(col_name)).cols.median(col_name)
     df = df.withColumn(
         'm_z_score',
         F.abs(0.6745 * (F.col(col_name) - median) /
               median_absolute_deviation))
     df = df.rows.drop(F.col("m_z_score") > threshold)
     return df
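A hedged usage note: the cutoff usually paired with the modified z-score (and the one suggested in the referenced article) is 3.5; the DataFrame and column names below are hypothetical.

clean_df = modified_z_score(fares_df, "fare_amount", threshold=3.5)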
Example #25
 def _add_time_diff(self, df):
     dedup_cols = self.group_by_columns + [self.time_column]
     df = df.dropDuplicates(dedup_cols)
     df = df.withColumn(
         DIFF_COL,
         F.abs(
             F.unix_timestamp(
                 F.lead(self.time_column).over(self.merge_window)) -
             F.unix_timestamp(F.col(self.time_column))))
     return df
Example #26
def looping_funct(flow):
    """
    This function first converts the RDD into a DataFrame, drops all duplicates, and
    then filters out users who won't take an action, writing them to Cassandra. The
    users who would take an action are passed into the calculation function.
    @type  flow: RDD
    @param flow: RDD stream from Kafka
    """
    df = flow.toDF(['time_new', 'ticker', 'volume', 'price'])
    df = df.dropDuplicates(['ticker'])
    criteria = cass_data.join(df, ['ticker'], 'inner')
    # drop users whose condition don't need to be calculated
    criteria = criteria.withColumn('volume',
        when((abs(col('previous_price') - col('price')) < col('buy')) & (
        abs(col('previous_price')-col('price'))<col('sell')),
        0).otherwise(col('volume')))
    writepart = criteria.filter(criteria.volume < 1)
    writeToCassandra(writepart, 'users', 'graph_data')
    criteria = criteria.filter(criteria.volume != 0)
    calculation(criteria)
Example #27
def calculate_end_of_loan(df):

    overpay_df = (df.withColumn(
        "Overpay_end_of_loan",
        F.when((col("RemainingPrincipal") < 0) & (col('InterestDue') != -1),
               F.abs(col("RemainingPrincipal"))).otherwise(0.0)).withColumn(
                   "RemainingPrincipal",
                   F.when(col("RemainingPrincipal") < 0,
                          0.0).otherwise(col("RemainingPrincipal"))))

    return overpay_df
Example #28
    def get_fliers(self, outliers):
        # Filters only the outliers, should "showfliers" be True
        fliers_df = outliers.filter('__{}_outlier'.format(self.colname))

        # If fliers are shown, take up to 1,001 rows with the highest absolute values
        fliers = (fliers_df.select(
            F.abs(F.col(self.colname)).alias(self.colname)).orderBy(
                F.desc(
                    self.colname)).limit(1001).toPandas()[self.colname].values)

        return fliers
Example #29
 def abs(columns):
     """
     Apply abs to the values in a column
     :param columns:
     :return:
     """
     columns = parse_columns(self, columns)
     df = self
     for col_name in columns:
         df = df.withColumn(col_name, F.abs(F.col(col_name)))
     return df
Example #30
 def abs(columns):
     """
     Apply abs to the values in a column
     :param columns:
     :return:
     """
     columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
     df = self
     for col_name in columns:
         df = df.withColumn(col_name, F.abs(F.col(col_name)))
     return df
Example #31
        F.stddev(status_joined_df.precipIntensity).alias("stddevPrecipitation"),
        F.stddev(status_joined_df.windSpeed).alias("stddevWindSpeed")))

stats_df.write.mode('overwrite').parquet("hdfs://hadoop:9000/models/weather-stats")

stats = stats_df.collect()[0]

print "Statistics: %s" % (stats,)

day_of_week = F.udf(
    lambda d: datetime.datetime.strptime(d, "%Y-%m-%d").weekday(),
    IntegerType())

status_normalized_df = (status_joined_df
    .withColumn(
        "zTemperature", F.abs(status_joined_df.temperature - stats.avgTemp) / stats.stddevTemp)
    .withColumn(
        "zHumidity", F.abs(status_joined_df.humidity - stats.avgHumidity) / stats.stddevHumidity)
    .withColumn(
        "zPressure", F.abs(status_joined_df.pressure - stats.avgPressure) / stats.stddevPressure)
    .withColumn(
        "zVisibility", F.abs(10 - status_joined_df.visibility) / stats.stddevVisibility)
    .withColumn(
        "zPrecipitation", F.abs(status_joined_df.precipIntensity) / stats.stddevPrecipitation)
    .withColumn(
        "zWindSpeed", F.abs(status_joined_df.windSpeed) / stats.stddevWindSpeed)
    .withColumn(
        "dayOfWeek", day_of_week(status_joined_df.date))
    )

status_normalized_df.show()