Example #1
 def inner(df):
     window = Window.partitionBy("Estacao").orderBy("Data").rowsBetween(
         -2, 2)
     df = (df.withColumn("interp", fun.avg(df[col]).over(window)))
     df = df.withColumn("interp", fun.round(df["interp"], precision))
     df = (df.withColumn(col, fun.coalesce(df[col],
                                           df["interp"])).drop("interp"))
     return df
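The snippet above relies on `col`, `precision`, and the `fun` alias being supplied by an enclosing scope. A minimal sketch of such a wrapper, assuming those names and the `Estacao`/`Data` columns shown above (the wrapper name is hypothetical):

from pyspark.sql import functions as fun
from pyspark.sql.window import Window

def interpolate_column(col, precision=2):
    # Hypothetical outer function supplying the closure variables used by inner().
    def inner(df):
        window = Window.partitionBy("Estacao").orderBy("Data").rowsBetween(-2, 2)
        # Rolling average of the two neighbours on each side, rounded, used only where the value is null.
        df = df.withColumn("interp", fun.round(fun.avg(df[col]).over(window), precision))
        return df.withColumn(col, fun.coalesce(df[col], df["interp"])).drop("interp")
    return inner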
Example #2
 def get_log_of_grades(self, df):
     # type: (dataframe) -> dataframe
     for col in self.grade_cols:
         df = df.withColumn(
             col,
             F.coalesce(F.log(F.lit(1) - F.col(col)),
                        F.lit(self.log_value_for_ones)))
     return df
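`F.log(F.lit(1) - F.col(col))` yields null whenever the grade equals 1 (log of zero), which is what the coalesce guards against. A small sketch, assuming an active SparkSession `spark` and an invented sentinel value:

from pyspark.sql import functions as F

df = spark.createDataFrame([(0.5,), (1.0,)], ["grade"])
df.select(
    # grade 0.5 -> log(0.5) ~ -0.693; grade 1.0 -> log(0) is null, so the sentinel is used
    F.coalesce(F.log(F.lit(1) - F.col("grade")), F.lit(-20.0)).alias("log_grade")
).show()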
Example #3
 def get_column_spec(self, source_df: Optional[DataFrame],
                     current_column: Optional[Column]) -> Column:
     column_spec = coalesce(*[
         col.get_column_spec(source_df=source_df,
                             current_column=current_column)
         for col in self.value
     ])
     return column_spec
 def apply(self, data, on, to=None):
     """Apply entity map"""
     applied = (data.join(self._map, data[on] == self._map[self._from],
                          "left_outer").withColumn(
                              to or on,
                              coalesce(self._map[self._to], data[on])).drop(
                                  self._from).drop(self._to))
     return applied
 def dataB_gross_price(df: DataFrame) -> DataFrame:
     data_frame = df.withColumn(
         "gross_price",
         F.coalesce(
             F.when((df.gen_ledg == 41000),
                    df.adj_extended_amount).otherwise(0),
             F.lit(MISSING_NUMBER)))
     return data_frame
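In the pattern above, `otherwise(0)` already handles rows whose `gen_ledg` is not 41000, so the outer coalesce only fires when `adj_extended_amount` itself is null. A toy check (the values and the `MISSING_NUMBER` sentinel are invented, and `spark` is assumed to be an active session):

from pyspark.sql import functions as F

MISSING_NUMBER = -1.0  # assumed sentinel
toy = spark.createDataFrame(
    [(41000, None), (41000, 9.5), (99999, 9.5)],
    "gen_ledg: int, adj_extended_amount: double")
toy.withColumn(
    "gross_price",
    F.coalesce(F.when(toy.gen_ledg == 41000, toy.adj_extended_amount).otherwise(0),
               F.lit(MISSING_NUMBER))).show()
# (41000, null) -> -1.0, (41000, 9.5) -> 9.5, (99999, 9.5) -> 0.0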
def transform(inc_df: DataFrame, prev_df: DataFrame) -> DataFrame:
    # calculating the metrics
    inc_df: DataFrame = inc_df.groupBy('email').count(). \
        select(['email',
                col('count').alias('page_view'),
                lit(config['process_date']).alias('last_active')
                ])

    # merging the data with historical records
    df_transformed: DataFrame = inc_df.join(prev_df, inc_df.email == prev_df.email, 'full'). \
        select([coalesce(prev_df.email, inc_df.email).alias('email'),
                (coalesce(prev_df.page_view, lit(0)) + coalesce(inc_df.page_view, lit(0))).alias('page_view'),
                coalesce(prev_df.created_date, inc_df.last_active).cast('date').alias('created_date'),
                coalesce(inc_df.last_active, prev_df.last_active).cast('date').alias('last_active')
                ])

    return df_transformed
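A quick way to exercise `transform`; the `config['process_date']` entry and the column layout of `prev_df` are assumptions inferred from the code above:

# Hypothetical smoke test; assumes config = {'process_date': '2024-01-02'} and an active `spark` session.
inc = spark.createDataFrame([("a@x.com",), ("a@x.com",), ("b@x.com",)], ["email"])
prev = spark.createDataFrame(
    [("a@x.com", 5, "2023-12-01", "2023-12-31")],
    ["email", "page_view", "created_date", "last_active"])
transform(inc, prev).show()
# a@x.com keeps its created_date and accumulates page views; b@x.com is created from scratch.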
Example #7
    def weighted_average(c, window, offsets, weights):
        def value(i):
            return lag(c, -i).over(window)

        values = [coalesce(value(i) * w, lit(0))
                  for i, w in zip(offsets, weights)]

        return sum(values, lit(0))
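Usage is easiest to see with a concrete window. The sketch below assumes `weighted_average` is reachable as a plain function and uses invented column names:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Centered 3-point weighted moving average per sensor, ordered by timestamp.
w = Window.partitionBy("sensor").orderBy("ts")
df = df.withColumn(
    "smoothed",
    weighted_average(F.col("value"), w, offsets=[-1, 0, 1], weights=[0.25, 0.5, 0.25]))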
Example #8
    def any(self, axis: Union[int, str] = 0) -> bool:
        """
        Return whether any element is True.

        Returns False unless there is at least one element within the series that is
        True or equivalent (e.g. non-zero or non-empty).

        Parameters
        ----------
        axis : {0 or 'index'}, default 0
            Indicate which axis or axes should be reduced.

            * 0 / 'index' : reduce the index, return a Series whose index is the
              original column labels.

        Examples
        --------
        >>> ks.Series([False, False]).any()
        False

        >>> ks.Series([True, False]).any()
        True

        >>> ks.Series([0, 0]).any()
        False

        >>> ks.Series([0, 1, 2]).any()
        True

        >>> ks.Series([False, False, None]).any()
        False

        >>> ks.Series([True, False, None]).any()
        True

        >>> ks.Series([]).any()
        False

        >>> ks.Series([np.nan]).any()
        False
        """

        if axis not in [0, 'index']:
            raise ValueError('axis should be either 0 or "index" currently.')

        sdf = self._kdf._sdf.select(self._scol)
        col = self._scol

        # Note that we're ignoring `None`s here for now.
        # any and every were added as of Spark 3.0
        # ret = sdf.select(F.expr("any(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0]
        # Here we use max as its alternative:
        ret = sdf.select(F.max(F.coalesce(col.cast('boolean'),
                                          F.lit(False)))).collect()[0][0]
        if ret is None:
            return False
        else:
            return ret
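The `max(coalesce(cast as boolean), lit(False))` trick works on any Spark DataFrame, not just inside Koalas. A standalone sketch, assuming an active `spark` session:

from pyspark.sql import functions as F

sdf = spark.createDataFrame([(0,), (0,), (None,)], "v: int")
ret = sdf.select(F.max(F.coalesce(F.col("v").cast("boolean"), F.lit(False)))).collect()[0][0]
print(bool(ret))  # False: there is no non-zero, non-null value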
Example #9
def user_preprocessing(spark: SparkSession, save_path="users"):
    """
    This method will generate user list and its feature.
    Generated dataframe will contains user_id, birth_year, gender, category_subscribe, subscribe column.
    :param spark: spark Session
    """
    arena_user_data = load_mysql(spark, "arena_user_data")
    arena_category_subscribers = load_mysql(spark,
                                            "arena_category_subscribers")
    arena_categories = load_mysql(spark, "arena_categories")
    arena_user_subscribers = load_mysql(spark, "arena_user_subscribers")
    arena_user_data = arena_user_data.select("id", "birth_year", "gender")
    arena_category_subscribers = arena_category_subscribers.select(
        "user_id", "cat_id").where(col("status") == 1)
    arena_categories = arena_categories.select("id", "cat_name")
    arena_user_subscribers = arena_user_subscribers.select(
        "subscriber_id", "user_id").where(col("status") == 1)
    cat_name_subs = arena_category_subscribers.alias("a").join(
        arena_categories.alias("b"),
        on=col("a.cat_id") == col("b.id"),
        how="left").select("a.user_id", "b.cat_name")
    cat_name_subs = cat_name_subs.groupBy("user_id").agg(
        collect_list("cat_name").cast(
            StringType()).alias("category_subscribe"))
    cat_name_subs = cat_name_subs.withColumn(
        "category_subscribe",
        regexp_replace(col("category_subscribe"), r"' ", r"'"))
    cat_name_subs = cat_name_subs.withColumn(
        "category_subscribe",
        regexp_replace(col("category_subscribe"), r"[\[\]\']", r""))
    cat_name_subs = cat_name_subs.withColumn(
        "category_subscribe",
        regexp_replace(col("category_subscribe"), r", ", r","))
    subscriber_list = arena_user_subscribers.groupBy("subscriber_id").agg(
        collect_list("user_id").cast(StringType()).alias("subscribe"))
    subscriber_list = subscriber_list.withColumn(
        "subscribe", regexp_replace(col("subscribe"), r"[\[\]\'\s]", r""))
    users = arena_user_data.alias("a").join(cat_name_subs.alias("b"), on=col("a.id") == col("b.user_id"), how="left")\
        .join(subscriber_list.alias("c"), on=col("a.id") == col("c.subscriber_id"), how="left")\
        .select("a.*", "b.category_subscribe", "c.subscribe").orderBy("id")
    users = users.withColumnRenamed("id", "user_id")
    users = users.withColumn("category_subscribe",
                             coalesce("category_subscribe")).withColumn(
                                 "subscribe", coalesce("subscribe"))
    save_parquet(users, save_path)
Example #10
def pyspark():
    conf = SparkConf().setAppName("PySparkApp").setMaster("local")
    #conf = SparkConf()
    sc = SparkContext(conf=conf)

    #spark = SparkSession.builder.appName("WordCount").master("local").config(conf = conf).getOrCreate()
    sqlCtx = SQLContext(sc)

    df1 = get_features()
    sdf = sqlCtx.createDataFrame(df1)

    ops1 = "(price_from + price_to)/2"
    data = sdf.withColumn("MedianPrice", expr(ops1))

    tmp = data.withColumn('final_price',
                          coalesce(data['Price123'], data['MedianPrice']))

    finaldata = tmp.drop("price", "disFeature")

    state = {
        "VIC": "Victoria",
        "WA": "Western Australia",
        "ACT": "Australian Capital Territory",
        "NT": "Northern Territory",
        "NSW": "New South Wales",
        "TAS": "Tasmania"
    }

    stateDataP = pd.DataFrame(list(state.items()),
                              columns=["State", "StateName"])

    stateDataD = sqlCtx.createDataFrame(stateDataP)

    data1 = finaldata.join(stateDataD, on=['State'], how='inner')

    finaldataPD = data1.toPandas()
    #dataPD["StateName"].unique()

    sc.stop()

    finaldataPD['price_to'] = finaldataPD['price_to'].astype(str).astype(float)

    finaldataPD['Price123'] = finaldataPD['Price123'].astype(str).astype(float)

    finaldataPD['beds'] = finaldataPD['beds'].astype(str).astype(int)

    finaldataPD['baths'] = finaldataPD['baths'].astype(str).astype(int)

    finaldataPD['parking'] = finaldataPD['parking'].astype(str).astype(int)

    df123 = finaldataPD.copy()

    df123 = df123.replace({pd.np.nan: None})

    #print(df123)

    return df123
Example #11
def my_concat(*cols):
    """Generate a format that allows import a Spark df as a one column txt

    Parameters
    ----------
    *cols : list
        columns
    Returns
    -------
    Spark data_license
        Data in the format needed to save as a txt
    """
    concat_columns = []
    for column in cols[:-1]:
        concat_columns.append(F.coalesce(column, F.lit("*")))
        concat_columns.append(F.lit(" "))
    concat_columns.append(F.coalesce(cols[-1], F.lit("*")))
    return F.concat(*concat_columns)
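An illustrative call, assuming an active `spark` session: nulls are rendered as `*` and columns are separated by single spaces.

from pyspark.sql import functions as F

df = spark.createDataFrame([("a", None, "c")], "c1: string, c2: string, c3: string")
df.select(my_concat(F.col("c1"), F.col("c2"), F.col("c3")).alias("line")).show()
# the single row becomes the string "a * c"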
 def dataB_claims(df: DataFrame) -> DataFrame:
     # Identify the claims value based on the gen_ledg field value
     data_frame = df.withColumn(
         "claims",
         F.coalesce(
             F.when(df.gen_ledg.isin([46000, 46400]),
                    df.adj_extended_amount).otherwise(0),
             F.lit(MISSING_NUMBER)))
     return data_frame
def cleaning_stages(df, **kwargs):

    # cleaning_stages applies the stages used to clean up the dataset
    # (filter nulls, impute missing data, wrangle data, etc.)

    #TODO: Check for types of df - has to be spark DF

    #to_dos = ['drop_cols', 'cast_cols_dtype', 'fill_na', 'impute_cols']
    if 'drop_cols' in kwargs.keys():
        df = df.drop(*kwargs['drop_cols'])

    if 'cast_cols_dtype' in kwargs.keys():
        df = cast_col_to_types(df,
                               kwargs['cast_cols_dtype'][0],
                               to_type=kwargs['cast_cols_dtype'][1])

    if 'fill_na' in kwargs.keys():
        df = df.fillna(kwargs['fill_na'][1], subset=kwargs['fill_na'][0])

    if 'impute_cols' in kwargs.keys():
        for val in kwargs['impute_cols']:

            to_be_imputed_col = val[0]  # val[0] is the name of the column to be imputed
            expr_sentence = val[1]  # val[1] is the expression string used to fill nulls
            col_new_name = "new_" + to_be_imputed_col
            df = df.withColumn(col_new_name, expr(expr_sentence))
            filled = "filled_" + to_be_imputed_col
            df = df.withColumn(
                filled, coalesce(df[to_be_imputed_col], df[col_new_name]))
            # Drop the two intermediate columns, then rename the imputed column back to its original name
            df = df.drop(col_new_name, to_be_imputed_col)
            df = df.withColumnRenamed(filled, to_be_imputed_col)

    if 'rank_cols' in kwargs.keys():
        for val in kwargs['rank_cols']:

            to_be_ranked_col = val[0]  # val[0] is the name of the column to be ranked
            expr_sentence = val[1]  # val[1] is the ranking expression string
            col_new_name = "ranked_" + to_be_ranked_col
            df = df.withColumn(col_new_name, expr(expr_sentence))

    if 'convert_cols' in kwargs.keys():
        for val in kwargs['convert_cols']:
            to_be_converted_col = val[0]  # name of col to store converted cols
            expr_sentence = val[1]  # expression string to convert cols
            col_new_name = "converted_" + to_be_converted_col
            df = df.withColumn(col_new_name, expr(expr_sentence))

    return df
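The kwargs contract is implicit; one plausible invocation, with column names and expressions that are purely illustrative:

# Hypothetical call showing the expected shape of each kwarg.
cleaned = cleaning_stages(
    df,
    drop_cols=["raw_payload"],
    cast_cols_dtype=(["price"], "double"),
    fill_na=(["country"], "unknown"),
    impute_cols=[("price", "avg(price) over (partition by category)")])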
def test_auto_mapper_coalesce(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", None),
            (2, None, "Michael", "1970-02-02"),
            (3, None, "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        my_column=A.coalesce(
            A.column("last_name"), A.column("date_of_birth"), A.text("last_resort")
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["my_column"],
        coalesce(
            col("b.last_name"),
            col("b.date_of_birth"),
            lit("last_resort").cast(StringType()),
        ).alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0]
        == "Qureshi"
    )
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0]
        == "1970-02-02"
    )
    assert (
        result_df.where("member_id == 3").select("my_column").collect()[0][0]
        == "last_resort"
    )
def test_auto_mapper_array_multiple_items_with_null(
    spark_session: SparkSession, ) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList(["address1", "address2", None]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["dst2"],
        when(
            array(lit("address1"), lit("address2"), lit(None)).isNotNull(),
            filter(
                coalesce(array(lit("address1"), lit("address2"), lit(None)),
                         array()),
                lambda x: x.isNotNull(),
            ),
        ).alias("dst2"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][1]
            == "address2")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][1]
            == "address2")
Example #16
def multijoin(dfs, on=None, how=None, coalesce=None):
    """Join multiple dataframes.

    Args:
        dfs (list[pyspark.sql.DataFrame]).
        on: same as ``pyspark.sql.DataFrame.join``.
        how: same as ``pyspark.sql.DataFrame.join``.
        coalesce (list[str]): column names to disambiguate by coalescing
            across the input dataframes. A column must be of the same type
            across all dataframes that define it; if different types appear
            coalesce will do a best-effort attempt in merging them. The
            selected value is the first non-null one in order of appearance
            of the dataframes in the input list. Default is None - don't
            coalesce any ambiguous columns.

    Returns:
        pyspark.sql.DataFrame or None if provided dataframe list is empty.

    Example:
        Assume we have two DataFrames, the first is
        ``first = [{'id': 1, 'value': None}, {'id': 2, 'value': 2}]``
        and the second is
        ``second = [{'id': 1, 'value': 1}, {'id': 2, 'value': 22}]``

        Then collecting the ``DataFrame`` produced by

        ``multijoin([first, second], on='id', how='inner', coalesce=['value'])``

        yields ``[{'id': 1, 'value': 1}, {'id': 2, 'value': 2}]``.
    """
    if not dfs:
        return None

    # Go over the input dataframes and rename each to-be-resolved
    # column to ensure name uniqueness
    coalesce = set(coalesce or [])
    renamed_columns = defaultdict(list)
    for idx, df in enumerate(dfs):
        for col in df.columns:
            if col in coalesce:
                disambiguation = '__{}_{}'.format(idx, col)
                df = df.withColumnRenamed(col, disambiguation)
                renamed_columns[col].append(disambiguation)
        dfs[idx] = df

    # Join the dataframes
    joined_df = reduce(lambda x, y: x.join(y, on=on, how=how), dfs)

    # And coalesce the would-have-been-ambiguities
    for col, disambiguations in renamed_columns.items():
        joined_df = joined_df.withColumn(col, F.coalesce(*disambiguations))
        for disambiguation in disambiguations:
            joined_df = joined_df.drop(disambiguation)

    return joined_df
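Reproducing the docstring example end to end, assuming an active `spark` session:

first = spark.createDataFrame([(1, None), (2, 2)], "id: int, value: int")
second = spark.createDataFrame([(1, 1), (2, 22)], "id: int, value: int")
multijoin([first, second], on="id", how="inner", coalesce=["value"]).show()
# id=1 -> value 1 (first non-null comes from `second`), id=2 -> value 2 (comes from `first`)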
Example #17
def metricSumDimensionOverWeekPerProfileDay(data,
                                            needed_dimension_variables,
                                            feature_col,
                                            sampling_multiplier,
                                            days=7,
                                            include_day_of_week=False):
    all_user_days = data.select("id").distinct().crossJoin(
        data.select("date").distinct())
    intermediate_table1 = data.filter(col(feature_col) > 0).select(
        ["id", "date", feature_col]).distinct()

    intermediate_table1 = intermediate_table1.alias("intermediate_table")
    all_user_days = all_user_days.alias("all_user_days")

    # Augment activity table to include non-active days
    intermediate_table2 = intermediate_table1.join(
        all_user_days, ['id', 'date'], 'outer').withColumn(
            "n_", F.coalesce("intermediate_table." + feature_col,
                             lit(0))).drop(feature_col).withColumnRenamed(
                                 "n_", feature_col)

    if include_day_of_week:
        intermediate_table2 = intermediate_table2.withColumn(
            "weekday_" + feature_col,
            F.when(
                F.date_format('date', 'u').cast(IntegerType()) <= 5,
                col(feature_col)).otherwise(0)).withColumn(
                    "weekend_" + feature_col,
                    F.when(
                        F.date_format('date', 'u').cast(IntegerType()) >= 6,
                        col(feature_col)).otherwise(0))

    # Calculate active hours for each profile-day
    windowSpec = Window.partitionBy([intermediate_table2.id]).orderBy(
        intermediate_table2.date).rowsBetween(1 - days, 0)

    active_hours_table = intermediate_table2.withColumn(
        "_temp",
        F.sum(intermediate_table2[feature_col]).over(windowSpec))
    if include_day_of_week:
        active_hours_table = active_hours_table.withColumn(
            "_temp_weekday",
            F.sum(intermediate_table2["weekday_" + feature_col]).over(
                windowSpec)).withColumn(
                    "_temp_weekend",
                    F.sum(intermediate_table2["weekend_" +
                                              feature_col]).over(windowSpec))
    return active_hours_table.drop(
        feature_col,
        "weekday_" + feature_col,
        "weekend_" + feature_col,
    ).withColumnRenamed("_temp", feature_col).withColumnRenamed(
        "_temp_weekday",
        "weekday_" + feature_col).withColumnRenamed("_temp_weekend",
                                                    "weekend_" + feature_col)
 def dataB_freight_upcharge(df: DataFrame) -> DataFrame:
     data_frame = df.withColumn(
         "freight_upcharge",
         F.coalesce(
             F.when(
                 df.charge_desc1.isin({
                     "ADDED FREIGHT", "FREIGHT ALLOWANCE", "FREIGHT CHARGE",
                     "FREIGHT SURCHARGE"
                 }), df.adj_extended_amount).otherwise(0.0),
             F.lit(MISSING_NUMBER)))
     return data_frame
Example #19
def replace(dataframe: DataFrame, column: str,
            replace_dict: Dict[str, str]) -> DataFrame:
    """Replace values of a string column in the dataframe using a dict.

    Example:

    >>> from butterfree.extract.pre_processing import replace
    ... from butterfree.testing.dataframe import (
    ...     assert_dataframe_equality,
    ...     create_df_from_collection,
    ... )
    >>> from pyspark import SparkContext
    >>> from pyspark.sql import session
    >>> spark_context = SparkContext.getOrCreate()
    >>> spark_session = session.SparkSession(spark_context)
    >>> input_data = [
    ...     {"id":1, "type": "a"}, {"id":2, "type": "b"}, {"id":3, "type": "c"}
    ... ]
    >>> input_df = create_df_from_collection(input_data, spark_context, spark_session)
    >>> input_df.collect()

    [Row(id=1, type='a'), Row(id=2, type='b'), Row(id=3, type='c')]

    >>> replace_dict = {"a": "type_a", "b": "type_b"}
    >>> replace(input_df, "type", replace_dict).collect()

    [Row(id=1, type='type_a'), Row(id=2, type='type_b'), Row(id=3, type='c')]

    Args:
        dataframe: data to be transformed.
        column: string column on the dataframe where to apply the replace.
        replace_dict: dict with values to be replaced.
            All mapped values must be string.

    Returns:
        Dataframe with column values replaced.

    """
    if not isinstance(dataframe, DataFrame):
        raise ValueError("dataframe needs to be a Pyspark DataFrame type")
    if (column not in dict(
            dataframe.dtypes)) or (dict(dataframe.dtypes)[column] != "string"):
        raise ValueError(
            "column needs to be the name of an string column in dataframe")
    if (not isinstance(replace_dict, dict)) or (not all(
            isinstance(value, str) for value in chain(*replace_dict.items()))):
        raise ValueError("replace_dict needs to be a Python dict with "
                         "all keys and values as string values")

    mapping = create_map(
        [lit(value) for value in chain(*replace_dict.items())]  # type: ignore
    )
    return dataframe.withColumn(column,
                                coalesce(mapping[col(column)], col(column)))
Example #20
def add_fiscal_year_and_month_abbr(
        df,
        date_fmt: str = 'yyyy/MM/dd',
        filter_column_year: str = 'voucher_creation_date',
        filter_column_month: str = 'shipment_pickup_date') -> DataFrame:
    expr_mapping = {
        '_fiscal_year': (F.coalesce(
            F.year(F.add_months(F.to_date(filter_column_year, date_fmt), 3)),
            F.year(F.add_months(F.to_date(filter_column_month, date_fmt),
                                3)))),
        '_month_abbr': (F.coalesce(
            F.upper(
                F.date_format(F.to_date(filter_column_year, date_fmt), 'MMM')),
            F.upper(
                F.date_format(F.to_date(filter_column_month, date_fmt),
                              'MMM'))))
    }
    select_expr = build_col_expr(expr_mapping)
    transformed = df.select(F.expr('*'), *select_expr)
    return transformed
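`build_col_expr` is not shown here. Given that its result is spliced directly into `select`, a plausible reading is that it simply aliases each mapped expression with its key; the helper below is an assumption, not the original implementation:

from typing import Dict, List
from pyspark.sql import Column

def build_col_expr(expr_mapping: Dict[str, Column]) -> List[Column]:
    # Assumed behaviour: one aliased column expression per mapping entry.
    return [expression.alias(name) for name, expression in expr_mapping.items()]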
Example #21
 def combine_key_tables(
         left: DataFrame,
         right: DataFrame
 ) -> DataFrame:
     return (
         left.join(
             right,
             left[keys[0]] == right[keys[0]],
             how='full'
         ).select(
             *[
                 f.coalesce(left[key], right[key]).alias(key)
                 for key in keys
             ],
             f.concat(
                 f.coalesce(left.key_source, f.array()),
                 f.coalesce(right.key_source, f.array())
             ).alias('key_source')
         )
     )
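`keys` is captured from an enclosing scope; below is an illustrative setup with invented names, assuming `combine_key_tables` is accessible and `spark` is an active session:

import pyspark.sql.functions as f

keys = ["patient_id"]
left = spark.createDataFrame([("p1", ["src_a"])], ["patient_id", "key_source"])
right = spark.createDataFrame([("p1", ["src_b"]), ("p2", ["src_b"])], ["patient_id", "key_source"])
combine_key_tables(left, right).show()
# p1 -> key_source [src_a, src_b]; p2 -> [src_b] (the missing left side coalesces to an empty array)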
Example #22
def to_date_(col):
    '''Convert multiple date formats from string'''

    formats = [
        'yyyy-M-d',
        'yyyy M d',
        'M/dd/yyyy',
        'yyyy MMM d',
        'M-d-yyyy',
    ]
    return F.coalesce(*[F.to_date(col, f) for f in formats])
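Each `to_date` call returns null unless its pattern matches, so the coalesce picks the first successful parse. A quick illustration, assuming an active `spark` session:

from pyspark.sql import functions as F

df = spark.createDataFrame([("2021-3-7",), ("3/15/2021",)], ["raw"])
df.select(to_date_(F.col("raw")).alias("parsed")).show()
# rows parse to 2021-03-07 and 2021-03-15 despite the different input formats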
Example #23
 def _get_telemetry_sanity_check_metrics(self, enrollments, df):
     """Return aggregations that check for problems with a client."""
     return [
         # Check to see whether the client_id is also enrolled in other branches
         # E.g. indicates cloned profiles. Fraction of such users should be
         # small, and similar between branches.
         F.max(
             F.coalesce((df.experiments[self.experiment_slug] !=
                         enrollments.branch).astype('int'),
                        F.lit(0))).alias('has_contradictory_branch'),
         # Check to see whether the client_id was sending data in the conversion
         # window that wasn't tagged as being part of the experiment. Indicates
         # either a client_id clash, or the client unenrolling. Fraction of such
         # users should be small, and similar between branches.
         F.max(
             F.coalesce(
                 (~F.isnull(df.experiments)
                  & F.isnull(
                      df.experiments[self.experiment_slug])).astype('int'),
                 F.lit(0))).alias('has_non_enrolled_data'),
     ]
 def dataB_msf(df: DataFrame) -> DataFrame:
     df = df.withColumn(
         "msf",
         F.when(((df.sq_ft.isNotNull()) & (df.sq_ft > 0)),
                (df.sq_ft / 1000.0)).otherwise(
                    F.when(((df.lbs.isNotNull()) &
                            (df.dmat_nominal_basis_weight.isNotNull() |
                             (df.dmat_nominal_basis_weight != 0))),
                           df.lbs / df.dmat_nominal_basis_weight).otherwise(
                               F.lit(MISSING_NUMBER))))
     df = df.withColumn('msf', F.coalesce(df.msf, F.lit(MISSING_NUMBER)))
     return df
Example #25
    def run(self, df):
        """ 
        Process and join the data as desired. 

        Key functionalities: 
            - Check for erroneous data
            - Support Update/Insert
            - Restartable if the job fails

        Arguments:
            df {Spark.DataFrame} -- Dataframe that is the result of a left join 
                of the three disparate data sources for the ticket sales. 
        """
        self.logger.info('Data Processing Start // {}'.format(self.etl_id))

        try:

            # Check for erroneous data: dropping nulls should not change the row count
            test_df = df.na.drop()
            assert test_df.count() == df.count()

            jdbc_options = self._inst_jdbc_params()
            jdbc_options['dbtable'] = 'final_data'

            target_df = self.sqlcontext.read.format('jdbc').options(
                **jdbc_options).load()

            coalesce_cols = [
                column for column in target_df.columns
                if column not in ['transaction_id', 'process_date']
            ]

            param_df = df
            df = df.alias('a').join(
                target_df.alias('b'), ['transaction_id'], how='outer').select(
                    'transaction_id',
                    *(F.coalesce('b.' + col, 'a.' + col).alias(col)
                      for col in coalesce_cols)).distinct()

            insert_row_count = df.count() - param_df.count()
            if insert_row_count > 0:
                self.logger.info(
                    'Inserting {} new rows in target dataframe // {}'.format(
                        insert_row_count, self.etl_id))

            self.logger.info('Data Processing Complete // {}'.format(
                self.etl_id))

            test_df.unpersist()

            return df

        except Exception as e:
            self.logger.error('{} // {}'.format(e, self.etl_id))
def hdfs_join_cms(cms_df):
    """ 解析app_open与game_open日志内容,并与cms数据库连接 """
    ##hdfs_df
    sql = """ 
    select t0.custom_uuid,t0.date,t0.package_id,t0.title,t0.site,t0.source from sharp.app_open t0 where t0.dt="{date_0}" 
    union all
    select t1.custom_uuid,t1.date,t1.package_id,t1.title,t1.site,t1.source from sharp.app_open t1 where t1.dt="{date_1}"
    union all
    select t7.custom_uuid,t7.date,t7.package_id,t7.title,t7.site,t7.source from sharp.app_open t7 where t7.dt="{date_7}" """.format(
        date_0=str_dt_0, date_1=str_dt_1, date_7=str_dt_7)
    spark.sql("show databases")
    spark.sql("use sharp")
    hdfs_df = spark.sql(sql)
    ## Join with the CMS application & game data
    condition_0_1 = (F.coalesce(F.col("t_0.package_id"),
                                F.lit("123")) == F.coalesce(
                                    F.col("t_1.fsk_pid"), F.lit("123")))
    df = hdfs_df.alias("t_0").join(cms_df.alias("t_1"),condition_0_1,"left_outer") \
                                 .select(F.col("t_0.custom_uuid").alias("custom_uuid"),F.col("t_0.date").alias("date"),F.col("t_0.site").alias("site"),F.col("t_0.package_id").alias("package_id"), \
                                 F.col("t_0.title").alias("title"),F.col("t_0.source").alias("source"),F.col("t_1.fsk_cid").alias("fsk_cid"))
    return df
 def get_column_spec(self, source_df: Optional[DataFrame],
                     current_column: Optional[Column]) -> Column:
     # if column is not of type date then convert it to date
     formats_column_specs: List[Column] = [
         to_timestamp(self.value.get_column_spec(
             source_df=source_df, current_column=current_column),
                      format=format_) for format_ in self.formats
     ] if self.formats else [
         to_timestamp(
             self.value.get_column_spec(source_df=source_df,
                                        current_column=current_column))
     ]
     if source_df is not None and isinstance(self.value, AutoMapperDataTypeColumn) \
             and not dict(source_df.dtypes)[self.value.value] == "timestamp":
         return coalesce(*formats_column_specs)
     elif isinstance(self.value, AutoMapperDataTypeLiteral):
         return coalesce(*formats_column_specs)
     else:
         column_spec = self.value.get_column_spec(
             source_df=source_df, current_column=current_column)
         return column_spec
Example #28
def test_coalesce(data_gen):
    num_cols = 20
    s1 = gen_scalar(data_gen, force_no_nulls=not isinstance(data_gen, NullGen))
    # we want lots of nulls
    gen = StructGen([('_c' + str(x), data_gen.copy_special_case(None, weight=1000.0)) 
        for x in range(0, num_cols)], nullable=False)
    command_args = [f.col('_c' + str(x)) for x in range(0, num_cols)]
    command_args.append(s1)
    data_type = data_gen.data_type
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : gen_df(spark, gen).select(
                f.coalesce(*command_args)))
    def lsg_sales(self, prod_list, coupons):
        start_date, end_date = date_period(self.period, self.start_date)
        # Check bound date
        table_name = 'cdwds.lsg_f_sls_invc'
        dt_col_name = 'invc_dt_key'
        _, bound_end_date = date_period(-1, end_date)
        bound_date_check(table_name, dt_col_name, start_date, bound_end_date,
                         self.env, 'YYYYMMDD', 'LSG')

        query = 'SELECT '\
                'UPPER(prod_prc_ref_sku) AS prod_id, sum(ext_net_sls_pmar_amt) AS sales ' \
                'FROM cdwds.lsg_f_sls_invc I ' \
                'LEFT JOIN cdwds.lsg_prod_v P ON P.sku = prod_prc_ref_sku ' \
                f'WHERE invc_dt_key<{start_date} AND invc_dt_key>={end_date} ' \
                'AND UPPER(prod_prc_ref_sku) IS NOT NULL ' \
                "AND P.stk_type_cd <> 'D' " \
                f'GROUP BY UPPER(prod_prc_ref_sku)'

        sales = redshift_cdw_read(query,
                                  db_type='RS',
                                  database='CDWDS',
                                  env=self.env)
        if prod_list:
            print(f'There are {prod_list.count()} products.')
            sales = sales.\
                join(broadcast(prod_list), ['prod_id'], how='inner')
        else:
            print('Product list is not defined for pulling sales.')

        if coupons:
            coupons_count = coupons.select("coupon_key").distinct().count()
            print(f'There are {coupons_count} rows in coupons table.')
            sales = sales. \
                join(broadcast(coupons), ['prod_id'], how = 'left'). \
                withColumn('coupon', coalesce('coupon', 'prod_id'))
        else:
            print('Coupons is not defined for pulling sales.')

        coupon_sales = sales.groupby('coupon', 'coupon_key').agg({'sales': 'sum'}). \
            withColumnRenamed('sum(sales)', 'coupon_sales'). \
            filter(col('coupon_sales') > 0)

        if sales.count() == 0:
            raise OutputOutOfBoundError(
                'Sales count is 0. Check the data validity of cdwds.lsg_f_sls_invc.'
            )

        if self.debug:
            print(f'Total rows in SKU sales count: {sales.count()}')
            print(
                f'Total number of coupons with sales: {coupon_sales.count()}')

        return sales, coupon_sales
def test_auto_mapper_date_format(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01 12:30"),
            (2, "Vidal", "Michael", "1970-02-02 06:30"),
        ],
        ["member_id", "last_name", "first_name", "opening_time"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    source_df = source_df.withColumn(
        "opening_time", to_timestamp("opening_time",
                                     format="yyyy-MM-dd hh:mm"))

    assert dict(source_df.dtypes)["opening_time"] == "timestamp"

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients",
        keys=["member_id"]).columns(openingTime=A.datetime(
            A.column("opening_time")).to_date_format("hh:mm:ss"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["openingTime"],
        date_format(coalesce(to_timestamp(col("b.opening_time"))),
                    "hh:mm:ss").alias("openingTime"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("openingTime").collect()
            [0][0] == "12:30:00")
    assert (result_df.where("member_id == 2").select("openingTime").collect()
            [0][0] == "06:30:00")

    # check type
    assert dict(result_df.dtypes)["openingTime"] == "string"
cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))
cleanDateDF.createOrReplaceTempView("dateTable2")


# COMMAND ----------

from pyspark.sql.functions import to_timestamp
cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()


# COMMAND ----------

from pyspark.sql.functions import coalesce
df.select(coalesce(col("Description"), col("CustomerId"))).show()


# COMMAND ----------

df.na.drop("all", subset=["StockCode", "InvoiceNo"])


# COMMAND ----------

df.na.fill("all", subset=["StockCode", "InvoiceNo"])


# COMMAND ----------

fill_cols_vals = {"StockCode": 5, "Description" : "No Value"}
    res['name'] = book
    res['text'] = "\n".join(['<div class="page-break" page="%d">%s</div>' % (r.seq, r.text) for r in pp]) + ('<archiveid tokenizetagcontent="false">%s</archiveid>' % book)
    return Row(**res)

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: pretty-cluster.py <input> <page-out> <book-out>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Proteus Pages")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.load(sys.argv[1])
    cols = set(raw.columns)
    idcols = [col(x) for x in ['identifier', 'issue', 'book'] if x in cols]

    df = raw.withColumn('identifier', regexp_replace(coalesce(*idcols), '[^A-Za-z0-9]+', ''))

    counts = df.groupBy('identifier').count().select(col('identifier'), col('count').alias('imagecount'))

    appendID = udf(lambda book, text: '%s <archiveid tokenizetagcontent="false">%s</archiveid>' % (text, book))

    renamed = df.join(counts, 'identifier')\
                .drop('regions')\
                .withColumn('pageNumber', col('seq'))\
                .withColumn('name', concat_ws('_', col('identifier'), col('seq')))\
                .withColumn('text', regexp_replace(col('text'), '\\n', '<br>\\\n'))

    renamed.withColumn('text', appendID(col('identifier'), col('text')))\
           .write.format('json').save(sys.argv[2])

    renamed.rdd.groupBy(lambda r: r.identifier).map(pageCat).toDF()\
           .write.format('json').save(sys.argv[3])  # assumed <book-out> target from the usage string
Example #33
 def merge(self, df):
     """
     Combines the Data Grid with a given DataFrame.
     The result is similar to a full outer join, 
     except that when there is no match for a given row, 
     the row is created anyway with all other columns set to NULL.
     
     Assuming these schemas:
     
         DataGrid:  dg[a,b,c]
         DataFrame: df[b,c,d,e]
         
     This is equivalent of executing the following query after adding 
     new columns to the original DataGrid:
     
         SELECT a, b, e, COALESCE(dg.b,df.b) as b, COALESCE(dg.c,df.c) as c
         FROM df
         FULL OUTER JOIN df ON 1=1
             AND dg.b = df.b
             AND dg.c = df.c
     """
     dg = self.dataframe
     
     # Get information about columns and computes common 
     # and different column sets between the DataFrame and DataGrid.
     dg_columns = dg.columns
     df_columns = df.columns
     common_columns = list(set(dg_columns) & set(df_columns)) # intersect
     
     # Merge the new DataFrame with the current DataGrid
     if not dg_columns:
         # Use the given DataFrame as default DataGrid
         #self.dataframe = df
         #self.setDataFrame(df)
         dg = df
         #self.index()
     else:
         all_columns = list(set(dg_columns + df_columns)) # union
         new_columns = list(set(df_columns) - set(dg_columns)) # diff
         diff_columns = list(set(all_columns) - set(common_columns)) # diff
         
         # Merge Columns
         if not common_columns:
             # Add all columns from the given DataFrame that do not exist yet 
             # in the DataGrid and initialize them with NULL values.
             common_columns = df_columns
             diff_columns = dg_columns
             for c in new_columns:
                 dg = dg.withColumn(c, lit(None).cast(NullType()))
         
         # Rename DataFrame's columns that are shared with the DataGrid
         condition = []
         for c in common_columns:
             df = df.withColumnRenamed(c, 'df_'+c)
             condition.append(dg[c] == df['df_'+c])
         
         # Join DataFrames
         dg = dg.join(df, condition, 'outer')
         
         for c in common_columns:
             dg = dg.withColumn(c, coalesce(c, "df_"+c))
             dg = dg.drop("df_"+c)
             
         #self.dataframe = dg
         #self.setDataFrame(dg)
         #self.index()
         
         if config.DEBUG:
             # Debugging: Print out the equivalent SQL Query
             select_stmt = ' SELECT '
             from_stmt = ' FROM dg'
             join_stmt = ' FULL OUTER JOIN df ON 1=1'
             
             # Append non-common columns to select statement
             for i in range(0,len(diff_columns)):
                 if i!=0: select_stmt += ', '
                 select_stmt += diff_columns[i]
             
             # Append common columns with COALESCE function to select statement
             # and add conditions to the join predicate 
             for i in range(0,len(common_columns)):
                 if i!=0 or len(diff_columns)>0: select_stmt += ', '
                 c = common_columns[i]
                 select_stmt += 'COALESCE(dg.'+c+', df.'+c+') AS '+c
                 join_stmt += ' AND dg.'+c+'=df.'+c
              
             query = select_stmt + from_stmt + join_stmt
             print(query)
             
             # Column information
             print('dg_columns: ' + repr(dg_columns))
             print('df_columns: ' + repr(df_columns))
             print('all_columns: ' + repr(all_columns))
             print('new_columns: ' + repr(new_columns))
             print('common_columns: ' + repr(common_columns))
             print('diff_columns: ' + repr(diff_columns))
     
     #self.dataframe.cache()
     return dg