Example #1
def format_output(df):
    df = df.withColumn("uniqueKey",
                       f.upper(f.concat(f.lit("RY"),
                                        f.substring(f.col('year'), 3, 2),
                                        f.lit("_"),
                                        f.col("channel"),
                                        f.lit("_"),
                                        f.col("division"),
                                        f.lit("_"),
                                        f.col("gender"),
                                        f.lit("_"),
                                        f.col("category"),
                                        ))) \
        .withColumn("channel", f.upper(f.col("channel"))) \
        .withColumn("year", f.concat(f.lit("RY"), f.substring(f.col('year'), 3, 2))) \
        .withColumn("week_1", f.concat(f.lit("W"), f.col("week")))

    output = df.orderBy("week").groupBy('uniqueKey', 'division', 'gender', 'category', 'channel', 'year').agg(
        f.to_json(
            f.collect_list(
                f.create_map('week_1', 'netSales')
            )
        ).alias('Net Sales'),
        f.to_json(
            f.collect_list(
                f.create_map('week_1', 'salesUnits')
            )
        ).alias('Sales Units')
    )

    return output
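A minimal, hypothetical smoke test for format_output (not part of the original snippet); the column names come from the function body above, the row values are invented:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [("2021", "online", "apparel", "women", "shoes", "1", 100.0, 10)],
    ["year", "channel", "division", "gender", "category", "week", "netSales", "salesUnits"],
)
# Expect one row with a uniqueKey like RY21_ONLINE_APPAREL_WOMEN_SHOES and
# JSON-encoded lists of {"W1": ...} maps in the Net Sales / Sales Units columns.
format_output(sample).show(truncate=False)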
def task_a_3_step_3_final(spark):
    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "popular-topics-by-country_step-2").parse_json(a3_struct_common) \
        .withWatermark("timetamp_start", "1 minute").groupBy(
        "timetamp_start",
        "timetamp_end"
    ).agg(
        F.collect_list(
            F.create_map(
                [
                    "country_name",
                    F.create_map(
                        [
                            "topic_name_exp",
                            "topic_sum"
                        ]
                    )
                ]
            )
        ).alias("statistics")
    ).select(
        F.struct(
            F.concat(F.hour('timetamp_start'), lit(":"), F.minute('timetamp_start')).alias("time_start"),
            F.concat(F.hour('timetamp_end'), lit(":"), F.minute('timetamp_end')).alias("time_end"),
            col('statistics')
        ).alias("res")
    ).send_to_kafka(config.BOOTSTRAP_SERVERS, "popular-topics-by-country", config.LOG_PREFIX)

    return result
Example #3
    def feature_convert_id(self):
        user_data = self.spark.sql(UserSql)
        item_data = self.spark.sql(ItemSql)
        uid_id = self.spark.sql(uid2id)
        user_data = user_data.join(uid_id, ['uid'], "inner").withColumnRenamed(
            "id", "uidIndex")
        user_data = uid_id.join(user_data, uid_id.uid == user_data.subeventid,
                                "inner").drop(uid_id.uid).withColumnRenamed(
                                    "id", "subeventidIndex")
        item_data = item_data.join(uid_id, uid_id.uid == item_data.tuid,
                                   "inner").drop(uid_id.uid).withColumnRenamed(
                                       "id", "tuidIndex")
        # for feature in ["uid", "age", "workid", "height", "sex"]:
        #     user_data, item_data = labelEncoderExample(user_data, item_data, feature)
        # user_data, item_data = labelEncoderExample(user_data, item_data, "uid")
        # uid feature hashing
        # user_data = user_data.withColumn("uidIndex", F.col("uid").cast(IntegerType()) % 500000)
        # user_data = user_data.withColumn("subeventidIndex", F.col("subeventid").cast(IntegerType()) % 500000)
        # item_data = item_data.withColumn("tuidIndex", F.col("tuid").cast(IntegerType()) % 500000)
        # Age conversion
        age_mapping_expr = F.create_map(
            [F.lit(x) for x in chain(*self.age_dict.items())])
        user_data = user_data.withColumn(
            "ageIndex", age_mapping_expr.getItem(F.col("age")))
        item_data = item_data.withColumn(
            "tageIndex", age_mapping_expr.getItem(F.col("tage")))
        # Height conversion
        height_mapping_expr = F.create_map(
            [F.lit(x) for x in chain(*self.height_dict.items())])
        user_data = user_data.withColumn(
            "heightIndex", height_mapping_expr.getItem(F.col("height")))
        item_data = item_data.withColumn(
            "theightIndex", height_mapping_expr.getItem(F.col("theight")))
        # Work location conversion
        workid_mapping_expr = F.create_map(
            [F.lit(x) for x in chain(*self.workid_dict.items())])
        user_data = user_data.withColumn(
            "workidIndex", workid_mapping_expr.getItem(F.col("workid")))
        item_data = item_data.withColumn(
            "tworkidIndex", workid_mapping_expr.getItem(F.col("tworkid")))
        # Gender conversion
        sex_mapping_expr = F.create_map(
            [F.lit(x) for x in chain(*self.sex_dict.items())])
        user_data = user_data.withColumn(
            "sexIndex", sex_mapping_expr.getItem(F.col("sex")))
        item_data = item_data.withColumn(
            "tsexIndex", sex_mapping_expr.getItem(F.col("tsex")))

        user_data, item_data = someColumnRenamed(user_data, item_data)
        return user_data, item_data
Example #4
def _mark_as_lit(data, data_type):
    # To support nested types, 'data_type' is required.
    assert data_type is not None

    if data is None:
        return f.lit(data).cast(data_type)

    if isinstance(data_type, ArrayType):
        assert isinstance(data, list)
        # Sadly you cannot create a literal from just an array in pyspark
        return f.array([_mark_as_lit(x, data_type.elementType) for x in data])
    elif isinstance(data_type, StructType):
        assert isinstance(data, tuple) and len(data) == len(data_type.fields)
        # Sadly you cannot create a literal from just a dict/tuple in pyspark
        children = zip(data, data_type.fields)
        return f.struct([
            _mark_as_lit(x, fd.dataType).alias(fd.name) for x, fd in children
        ])
    elif isinstance(data_type, DateType):
        # Due to https://bugs.python.org/issue13305 we need to zero pad for years prior to 1000,
        # but this works for all of them
        dateString = data.strftime("%Y-%m-%d").zfill(10)
        return f.lit(dateString).cast(data_type)
    elif isinstance(data_type, MapType):
        assert isinstance(data, dict)
        # Sadly you cannot create a literal from just a dict/tuple in pyspark
        col_array = []
        for k in data:
            col_array.append(_mark_as_lit(k, data_type.keyType))
            col_array.append(_mark_as_lit(data[k], data_type.valueType))
        return f.create_map(*col_array)
    else:
        # lit does not take a data type so we might have to cast it
        return f.lit(data).cast(data_type)
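A hypothetical usage sketch, assuming a live SparkSession named spark and the same lowercase f alias for pyspark.sql.functions used by the helper above:

from pyspark.sql.types import MapType, StringType, IntegerType

# Builds a create_map() literal equivalent to {'a': 1, 'b': 2}; with the MapType
# branch above, keys and values are cast to the declared key/value types.
map_literal = _mark_as_lit({"a": 1, "b": 2}, MapType(StringType(), IntegerType()))
spark.range(1).select(map_literal.alias("m")).show()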
Example #5
def map_col(spark, df, datafolder, map_col_name, df_col_name, new_col_name):
    """
    Map the key/value pairs of a simple CSV file to a new column in the dataframe,
    matching on the keys.

    Parameters
    ----------
    spark : SparkSession
    df : spark dataframe
        The dataframe containing df_col_name, whose values are used as lookup keys.
    datafolder : str
        Folder location of the CSV file to be used for mapping.
    map_col_name : str
        The column name of the mapping file (also determines the CSV file name).
    df_col_name : str
        The column name in the Spark dataframe to be used as the lookup key.
    new_col_name : str
        Name of the new column holding the mapping results.
    """
    df_map = spark_read_csv(spark, datafolder, f'{map_col_name}.csv')
    df_map = df_map.toPandas()
    id_col = f'{map_col_name}_id'
    dic_map = dict(zip(df_map[id_col], df_map[map_col_name]))
    mapping_expr = F.create_map([F.lit(x) for x in chain(*dic_map.items())])
    return df.withColumn(new_col_name, mapping_expr[F.col(df_col_name)])
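A hypothetical call; the folder, file and column names below are made up, and data/maps/country.csv is assumed to contain the columns country_id and country, the names the function derives from map_col_name:

df = map_col(spark, df, "data/maps", "country", "country_code", "country_name")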
Example #6
def correct_country_names(
    df: DataFrame,
    country_col: str,
    country_mapping_path: str,
) -> DataFrame:
    """
    Replace corrupted country values with true ones.

    :param df: dataframe including country_name column
    :param country_col: Column name of country
    :param country_mapping_path: Path of mapping config
    :return: dataframe including country_name columns
    """
    column = country_col
    replace_dict = provide_config(country_mapping_path)
    corrupted_values = list(replace_dict.keys())
    map_col = create_map([lit(x) for x in chain(*replace_dict.items())])
    df = df.withColumn(column, F.regexp_replace(column, '"', ''))
    df = df.withColumn(
        column,
        F.when(F.col(column).isin(corrupted_values),
               map_col[df[column]]).otherwise(F.col(column)))
    df = df.filter(F.col(column).isNotNull())
    df = df.drop_duplicates()
    logging.info("Corrupted country columns are replaced with true values")
    return df
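The same isin/when fallback pattern written out with an inline dict instead of provide_config(); the country values here are invented for illustration only:

from itertools import chain
from pyspark.sql import functions as F
from pyspark.sql.functions import create_map, lit

replace_dict = {"Untied States": "United States"}
map_col = create_map([lit(x) for x in chain(*replace_dict.items())])
df = df.withColumn(
    "country_name",
    F.when(F.col("country_name").isin(list(replace_dict)),
           map_col[F.col("country_name")]).otherwise(F.col("country_name")))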
    def compute_score(self, aux, df_records, tol=15):
        """
        Compute scoreboard of auxiliary information aux inside record df_records.
        Both must be spark dataframes.
        Returns a spark dataframe.
        """
        s = aux.groupby('custId').count().take(1)[0][1]
        #mapping = {n:binom_cdf(p=tol/self.nb_combination, s=s)(n) for n in range(0,self.max_nb_review_per_cust+1)}
        mapping_2 = {
            n: proba_2(p=tol / self.nb_combination, s=s)(n)
            for n in range(0, self.max_nb_review_per_cust + 1)
        }
        #mapping_expr = create_map([lit(x) for x in chain(*mapping.items())])
        mapping_expr_2 = create_map(
            [lit(x) for x in chain(*mapping_2.items())])

        merged = broadcast(prepare_join(aux, '_1', True)).crossJoin(
            prepare_join(df_records, '_2', True))

        merged = merged.withColumn('similarity', self.similarity_func(merged))
        #merged = merged.withColumn('value', 1/F.log(F.log(merged.nbCustReviews_2+100)) * merged.similarity)
        #merged = merged.withColumn('value', 1/F.log(merged.nbMovieReviews_2) * merged.value)
        #merged = merged.withColumn('value', binom_cdf_udf(merged.nbCustReviews_2) * merged.similarity)
        merged = merged.withColumn(
            'value',
            mapping_expr_2.getItem(col('nbCustReviews_2')) * merged.similarity)
        #merged = merged.withColumn('value', merged.similarity)
        merged = merged.groupBy('custId_1', 'custId_2',
                                'movieId_1').max('value')
        merged = merged.withColumnRenamed('max(value)', 'value')
        merged = merged.groupBy('custId_1', 'custId_2').sum('value')
        merged = merged.withColumnRenamed('sum(value)', 'value')
        return merged
Example #8
def test_create_glue_table_parquet(session, bucket, database, compression, partition_by):
    path = "data_samples/nano.csv"
    schema = "id INTEGER, name STRING, value DOUBLE, date DATE, time TIMESTAMP"
    timestamp_format = "yyyy-MM-dd"
    dataframe = session.spark.read_csv(path=path,
                                       schema=schema,
                                       timestampFormat=timestamp_format,
                                       dateFormat=timestamp_format,
                                       header=True)
    dataframe = dataframe \
        .withColumn("my_array", array(lit(0), lit(1))) \
        .withColumn("my_struct", struct(lit("text").alias("a"), lit(1).alias("b"))) \
        .withColumn("my_map", create_map(lit("k0"), lit(1.0), lit("k1"), lit(2.0)))
    s3_path = f"s3://{bucket}/test"
    dataframe.write \
        .mode("overwrite") \
        .format("parquet") \
        .partitionBy(partition_by) \
        .save(compression=compression, path=s3_path)
    session.spark.create_glue_table(dataframe=dataframe,
                                    file_format="parquet",
                                    partition_by=partition_by,
                                    path=s3_path,
                                    compression=compression,
                                    database=database,
                                    table="test",
                                    replace_if_exists=True)
    query = "select count(*) as counter from test"
    pandas_df = session.pandas.read_sql_athena(sql=query, database=database)
    assert pandas_df.iloc[0]["counter"] == 5
    query = "select my_array[1] as foo, my_struct.a as boo, my_map['k0'] as bar from test limit 1"
    pandas_df = session.pandas.read_sql_athena(sql=query, database=database)
    assert pandas_df.iloc[0]["foo"] == 0
    assert pandas_df.iloc[0]["boo"] == "text"
    assert pandas_df.iloc[0]["bar"] == 1.0
def convert_endpoint_to_site(dataset, src_col, dst_col):
    """
    Convert src/dst hostname to the respective site names.

    :return: dataset
    """
    import requests  # , re
    from pyspark.sql.functions import col, create_map, lit
    from itertools import chain

    # retrieve mapping
    cric_url = "http://wlcg-cric.cern.ch/api/core/service/query/?json&type=SE"
    r = requests.get(url=cric_url).json()
    site_protocols = {}
    for site, info in r.items():
        if "protocols" in info:
            # print(se, type(se), info, type(info))
            for name, prot in info.get('protocols', {}).items():
                site_protocols.setdefault(get_hostname(prot['endpoint']), site)

    # apply mapping
    mapping_expr = create_map([lit(x) for x in chain(*site_protocols.items())])
    out_cols = dataset.columns
    dataset = dataset.withColumnRenamed(src_col, "src")
    dataset = dataset.withColumnRenamed(dst_col, "dst")
    dataset = dataset.withColumn(src_col, mapping_expr[dataset["src"]]) \
        .withColumn(dst_col, mapping_expr[dataset["dst"]])
    return (dataset.select(out_cols))
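A hypothetical call; the column names are made up, and the function is assumed to receive a dataset whose src/dst hostname columns should be rewritten to the CRIC site names:

dataset = convert_endpoint_to_site(dataset, "src_hostname", "dst_hostname")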
Example #10
def svod(default_schema, snapshot_date):

    print(
        "Selecting rows from source table {}.svod_segmentation_master".format(
            default_schema))

    seg_source = spark.sql(
        "SELECT * FROM {}.svod_segmentation_master where snapshot_date='{}'".
        format(default_schema, snapshot_date))

    read_count = seg_source.count()
    print("Read count is  {} ".format(read_count))
    print("Generating the map ".format(seg_source.count()))
    svod_df = seg_source.select(
        "userid", "subscription_id", "segment_name",
        create_map(lit('For the Family'), col('prob_segment_0'),
                   lit('Drama Watchers'), col('prob_segment_1'),
                   lit('Anime Fans'), col('prob_segment_2'),
                   lit('Broadcast Generalists'), col('prob_segment_3'),
                   lit('Reality Watchers'), col('prob_segment_4'),
                   lit('Comedy Watchers'), col('prob_segment_5'),
                   lit('Exclusive / Prestige'), col('prob_segment_6'),
                   lit('Content Miners / Film Buffs'),
                   col('prob_segment_7')).alias("prob"), "snapshot_date")

    return svod_df
def get_weighted_dataframe(df, doGen, resonance, era, subEra, shift=None):
    '''
    Produces a dataframe with a weight and weight2 column
    with weight corresponding to:
        1 for data
    or
        pileup for mc
    The optional shift parameter allows for a different
    systematic shift to the weights
    '''
    # TODO: implement systematic shifts in the weight such as PDF, pileup, etc.
    # get the pileup
    pileup_ratio, pileup_edges = get_pileup(resonance, era, subEra)

    # build the weights (pileup for MC)
    # TODO: if there is a weight column (ie, gen weight) get that first
    if doGen:
        pileupMap = {e: r for e, r in zip(pileup_edges[:-1], pileup_ratio)}
        mapping_expr = F.create_map(
            [F.lit(x) for x in itertools.chain(*pileupMap.items())])
        weightedDF = df.withColumn(
            'weight', mapping_expr.getItem(F.col('tag_nVertices')))
    else:
        weightedDF = df.withColumn('weight', F.lit(1.0))
    weightedDF = weightedDF.withColumn(
        'weight2', F.col('weight') * F.col('weight'))

    return weightedDF
Example #12
    def create_exprs(_input_col, _buckets, _func):
        def count_exprs(_exprs):
            return F.sum(F.when(_exprs, 1).otherwise(0))

        _exprs = []
        for i, b in enumerate(_buckets):
            lower = b["lower"]
            upper = b["upper"]

            if is_numeric(lower):
                lower = round(lower, 2)

            if is_numeric(upper):
                upper = round(upper, 2)

            if len(_buckets) == 1:
                count = count_exprs((_func(_input_col) == lower))
            else:
                # the last bucket must include its upper bound
                if i == len(_buckets) - 1:
                    count = count_exprs((_func(_input_col) > lower)
                                        & (_func(_input_col) <= upper))
                else:
                    count = count_exprs((_func(_input_col) >= lower)
                                        & (_func(_input_col) < upper))

            info = F.create_map(F.lit("count"), count.cast("int"),
                                F.lit("lower"), F.lit(lower), F.lit("upper"),
                                F.lit(upper)).alias("hist_agg" + "_" +
                                                    _input_col + "_" +
                                                    str(b["bucket"]))
            _exprs.append(info)
        _exprs = F.array(*_exprs).alias("hist" + _input_col)
        return _exprs
Example #13
def percentile_agg(col_name, df, values, relative_error):
    """
    Return the percentile of a dataframe
    :param col_name:  '*', list of columns names or a single column name.
    :param df:
    :param values: list of percentiles to be calculated
    :param relative_error:  If set to zero, the exact percentiles are computed, which could be very expensive. 0 to 1 accepted
    :return: percentiles per columns
    """

    # Make sure values are double

    if values is None:
        values = [0.05, 0.25, 0.5, 0.75, 0.95]

    values = val_to_list(values)
    values = list(map(str, values))

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        # Get percentiles

        p = F.expr(
            "percentile_approx(`{COLUMN}`, array({VALUES}), {ERROR})".format(
                COLUMN=col_name,
                VALUES=" , ".join(values),
                ERROR=relative_error))

        # Zip the arrays
        expr = [[F.lit(v), p.getItem(i)] for i, v in enumerate(values)]
        expr = F.create_map(*list(itertools.chain(*expr)))

    else:
        expr = None
    # print(expr)
    return expr
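A hypothetical usage sketch; price is an assumed numeric column, and since the function returns an aggregate expression (or None for non-numeric columns) it is consumed through agg():

percentiles = df.agg(
    percentile_agg("price", df, [0.25, 0.5, 0.75], 0.01).alias("price_percentiles"))
percentiles.show(truncate=False)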
Example #14
def rank_preselection_by_popularity(path_train, behaviors_df, preselection_df):
    _, behaviors_train_df, preselection_train_df = read_data(path_train)
    items_popularity_df = behaviors_train_df.union(behaviors_df).groupby(
        'item_id').agg(F.count('item_id').alias('popularity')).sort(
            F.desc('popularity'))
    # add the items in preselection_df that were not in behaviors_df with a 0 popularity
    items_popularity_df = items_popularity_df.join(
        preselection_df.select('item_id').distinct(), 'item_id',
        how='full').fillna(0)

    preselection_pop_df = preselection_df.join(items_popularity_df, 'item_id')

    preselection_pop_df = preselection_pop_df.withColumn(
        'rank',
        F.row_number().over(
            Window.partitionBy('user', 'index').orderBy(F.desc('popularity'))))
    preselection_pop_df = preselection_pop_df.withColumn('dic', F.create_map(['item', 'rank']))\
                                             .drop('item_id')\
                                             .drop('rank')\
                                             .drop('popularity')\
                                             .drop('success')
    preselection_pop_df = preselection_pop_df.groupby('user', 'index').agg(
        F.collect_list('dic').alias('dic_list'))
    preselection_pop_df = preselection_pop_df.orderBy(['index', 'user'],
                                                      ascending=[1, 1])
    return items_popularity_df, preselection_pop_df
    def _get_base_data(self, startdate, enddate, part_start, part_end):
        sql = """
            select trade_id,prd_ind,trd_type,busi_date,
                    sum(pre_mkt_val) pre_mkt_val,
                    sum(now_mkt_val) now_mkt_val,
                    sum(pos_cash_flow) pos_cash_flow,
                    sum(neg_cash_flow) neg_cash_flow,
                    max(exception_label) exception_label,
                    sum(return) return
            from {2}.{3}
            where busi_date>='{0}' and busi_date<='{1}' and part>='{4}' and part<='{5}'
            and prd_no!='0.0'
            GROUP  by trade_id,prd_ind,trd_type,busi_date
        """.format(startdate, enddate, self.adata,
                   self.stock_cust_daily_holding, part_start, part_end)
        df = self.sparkSession.sql(sql).withColumn("detail_item",
                                                   F.create_map(F.lit("pre_mkt_val"), "pre_mkt_val",
                                                                F.lit("now_mkt_val"), "now_mkt_val",
                                                                F.lit("pos_cash_flow"),
                                                                "pos_cash_flow",
                                                                F.lit("neg_cash_flow"),
                                                                "neg_cash_flow",
                                                                F.lit("exception_label"),
                                                                "exception_label",
                                                                F.lit("trd_type"), "trd_type",
                                                                F.lit("return"), "return",
                                                                F.lit("busi_date"),
                                                                "busi_date"))\
            .groupBy("trade_id", "prd_ind")\
            .agg(F.collect_list("detail_item").alias("detail_list"))
        df.persist(StorageLevel.DISK_ONLY).count()
        return df
Example #16
def apply_overwrite_dict_to_df(df, lookup_col, overwrite_dict):
    """
    df : A spark dataframe
    lookup_col : The column name that should be used to apply fixes (e.g. col1)
    overwrite_dict : should be a dictionary where each key is the value of lookup_col you want to fix. (e.g. {'a' : {'col2' : 2}} to fix the value in col2 when col1 is equal to 'a')
    """
    # Split overwrite_dict into a dictionary of single key value pairs dictionaries
    skvp = {}
    for k in overwrite_dict:
        for kk in overwrite_dict[k]:
            if kk not in skvp:
                skvp[kk] = {}

            skvp[kk][k] = overwrite_dict[k][kk]

    # for each col that is going to be overwritten apply the single key value pairs
    for k in skvp:
        mapping_expr = F.create_map(
            [F.lit(x) for x in chain(*skvp[k].items())])
        df = df.withColumn(
            k,
            F.when(mapping_expr.getItem(df[lookup_col]).isNull(),
                   df[k]).otherwise(mapping_expr.getItem(df[lookup_col])),
        )

    return df
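A short sketch of the docstring example: wherever col1 equals 'a', the value in col2 is overwritten with 2, and all other rows keep their original value:

fixed_df = apply_overwrite_dict_to_df(df, "col1", {"a": {"col2": 2}})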
Example #17
def Binning(df, num_col, no_of_buckets):

	for (a, b) in df.dtypes:
		if a == num_col:
			o_dtype = b


	tdf = df.withColumn(num_col, col(num_col).cast('double'))
	qds = QuantileDiscretizer(numBuckets=no_of_buckets, inputCol=num_col, outputCol="bucket_no")
	bucketizer = qds.fit(tdf)
	splits = bucketizer.getSplits()
	tdf = bucketizer.transform(tdf)

	bucket_dict = dict()

	for i in range(no_of_buckets):
		bucket_dict[float(i)] = str(splits[i]) + ' to ' + str(splits[i+1])

	#tdf = tdf.withColumn('bucket_no', col(num_col).cast('string'))

	mapping_expr=create_map([lit(x) for x in chain(*bucket_dict.items())])
	tdf = tdf.withColumn(num_col + '_bucket_range', mapping_expr.getItem(col('bucket_no')))

	tdf = tdf.withColumn(num_col, col(num_col).cast(o_dtype))
	
	return tdf, bucket_dict
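A hypothetical call, assuming df has a numeric price column: it returns the frame with an added price_bucket_range column plus the dict of bucket boundaries:

binned_df, bucket_dict = Binning(df, "price", 5)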
Example #18
def _as_categorical_type(index_ops: IndexOpsLike, dtype: CategoricalDtype,
                         spark_type: DataType) -> IndexOpsLike:
    """Cast `index_ops` to categorical dtype, given `dtype` and `spark_type`."""
    assert isinstance(dtype, CategoricalDtype)
    if dtype.categories is None:
        codes, uniques = index_ops.factorize()
        return codes._with_new_scol(
            codes.spark.column,
            field=codes._internal.data_fields[0].copy(dtype=CategoricalDtype(
                categories=uniques)),
        )
    else:
        categories = dtype.categories
        if len(categories) == 0:
            scol = SF.lit(-1)
        else:
            kvs = chain(*[(SF.lit(category), SF.lit(code))
                          for code, category in enumerate(categories)])
            map_scol = F.create_map(*kvs)

            scol = F.coalesce(map_scol[index_ops.spark.column], SF.lit(-1))
        return index_ops._with_new_scol(
            scol.cast(spark_type),
            field=index_ops._internal.data_fields[0].copy(
                dtype=dtype, spark_type=spark_type, nullable=False),
        )
Example #19
def test_flatten_schema_no_changes(spark):
    """Check non-struct columns are not affected"""
    df = spark.createDataFrame(df_p)
    df = df.withColumn("array", sf.array(sf.lit("a"), sf.lit("-")))
    df = df.withColumn("map", sf.create_map(sf.lit("b"), sf.lit("_")))
    result = flatten_df(df)
    assert df.columns == result.columns
    assert df.count() == result.count()
    def test_column_getitem(self):
        from pyspark.sql.functions import col, create_map, lit

        map_col = create_map(lit(0), lit(100), lit(1), lit(200))
        self.assertRaisesRegexp(
            Py4JJavaError,
            "Unsupported literal type class org.apache.spark.sql.Column id",
            lambda: map_col.getItem(col('id')))
Example #21
def source_map(df, alias, extra_filter=""):
    m = F.create_map(
        list(
            chain(*((F.lit(name.split("_")[0]), F.col(name))
                    for name in df.columns
                    if name != "addon_id" and extra_filter in name
                    )))).alias(alias)
    return m
def generate_map_with_empty_validity(spark, path):
    gen_data = StructGen(
        [['number', IntegerGen()], ['word', LongGen()]], nullable=False)
    df = gen_df(spark, gen_data)
    df_noNulls = df.filter("number is not null")
    df_map = df_noNulls.withColumn("map", f.create_map(
        ["number", "word"])).drop("number").drop("word")
    df_map.coalesce(1).write.parquet(path)
Example #23
def _to_cat(index_ops: IndexOpsLike) -> IndexOpsLike:
    categories = cast(CategoricalDtype, index_ops.dtype).categories
    if len(categories) == 0:
        scol = SF.lit(None)
    else:
        kvs = chain(*[(SF.lit(code), SF.lit(category)) for code, category in enumerate(categories)])
        map_scol = F.create_map(*kvs)
        scol = map_scol[index_ops.spark.column]
    return index_ops._with_new_scol(scol)
Example #24
def MakeDict(df, keycol, valcol):
	mymap = df.select(create_map(keycol, valcol).alias('map'))
	mylist = mymap.select(collect_list(mymap.map).alias('dict')).head()['dict']
	d = {}
	for elem in mylist:
		for key in elem:
			d[key] = elem[key]

	return d
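A hypothetical usage, assuming df has id and name columns; note that if the key column contains duplicates, whichever collected map is processed last wins:

lookup = MakeDict(df, "id", "name")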
Example #25
def childMap(df, id, parentid, parentLookupCol, lookupCol):
    top = "top" + parentLookupCol
    rec = df.where((F.col(lookupCol).isNotNull())
                   & (F.col("parent_flag") == "false"))
    rec = rec.select(F.col(id),
                     F.lit("").alias(parentid),
                     F.col(lookupCol).alias(top))
    child = rec.select(F.create_map([F.col(id), F.col(top)]).alias("childMap"))
    return child
Example #26
def recode(col_name, map_dict, default=None):
    if not isinstance(col_name, Column):
        col_name = col(col_name)
    mapping_expr = create_map([lit(x) for x in chain(*map_dict.items())])
    if default is None:
        return mapping_expr.getItem(col_name)
    else:
        return when(~isnull(mapping_expr.getItem(col_name)),
                    mapping_expr.getItem(col_name)).otherwise(default)
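A hypothetical usage, assuming a status column and the imports the snippet relies on (col, Column, create_map, lit, when, isnull, chain): values missing from the mapping fall back to "unknown":

df = df.withColumn(
    "status_label",
    recode("status", {1: "active", 0: "inactive"}, default="unknown"))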
Example #27
    def change_dimension_level_name(self,
                                    targetCol,
                                    defaltName="Others",
                                    topnLevel=None,
                                    newLevelNameDict=None):
        """
        used to change level names of a particular dimension columns
        Parameters
        ----------
        self : Object
            An Object of class DataFrameTransformer
        targetCol : list/tuple of strings
            column on which to apply this transformation.
        topnLevel : int or None
            Top levels to keep(by level count). all other levels will be clubbed as "Others"
        defaltName : basestring
            default Name given to all the Other levels
        newLevelNameDict : dict
            mapping for changing Level Name {"existingName1":"newName1","existingName2":"newName2"}

        Notes
        ----------
        If both topnLevel and newLevelNameDict are provided then topnLevel will take precedence

        """
        if topnLevel is None:
            topnLevel = GLOBALSETTINGS.DTREE_TARGET_DIMENSION_MAX_LEVEL - 1
        print(targetCol)

        for colName in targetCol:
            levelCountDict = self._metaParser.get_unique_level_dict(colName)
            levelCountArray = sorted(levelCountDict.items(),
                                     key=lambda x: x[1],
                                     reverse=True)
            countArr = [x[1] for x in levelCountArray]
            totalCount = sum(countArr)
            existinCount = sum(countArr[:topnLevel])
            newLevelCount = levelCountArray[:topnLevel]
            newLevelCount.append((defaltName, totalCount - existinCount))
            mappingDict = dict([(tup[0], tup[0]) if idx <= topnLevel - 1 else
                                (tup[0], defaltName)
                                for idx, tup in enumerate(levelCountArray)])
            mapping_expr = create_map(
                [lit(x) for x in chain(*mappingDict.items())])
            existingCols = self._data_frame.columns
            self._data_frame = self._data_frame.withColumnRenamed(
                colName,
                str(colName) + "JJJLLLLKJJ")
            self._data_frame = self._data_frame.withColumn(
                colName,
                mapping_expr.getItem(col(str(colName) + "JJJLLLLKJJ")))
            self._data_frame = self._data_frame.select(existingCols)
            self._dataframe_helper.set_dataframe(self._data_frame)
            self._metaParser.update_level_counts(colName, dict(newLevelCount))
Example #28
def parentMap(df, id, parentid, parentLookupCol, lookupCol):
    top = "top" + lookupCol
    rec = df.withColumn(
        "parent_flag",
        F.when(F.col(parentid) == "", "true").otherwise("false"))
    rec = rec.withColumn(
        top, F.when(F.col("parent_flag") == "true", F.col(parentLookupCol)))
    parent = rec.where(F.col("parent_flag") == "true") \
        .select(F.create_map([F.col(id),
                              F.col(top)]).alias("parentMap"))
    return parent
Example #29
def replace(dataframe: DataFrame, column: str,
            replace_dict: Dict[str, str]) -> DataFrame:
    """Replace values of a string column in the dataframe using a dict.

    Example:

    >>> from butterfree.extract.pre_processing import replace
    ... from butterfree.testing.dataframe import (
    ...     assert_dataframe_equality,
    ...     create_df_from_collection,
    ... )
    >>> from pyspark import SparkContext
    >>> from pyspark.sql import session
    >>> spark_context = SparkContext.getOrCreate()
    >>> spark_session = session.SparkSession(spark_context)
    >>> input_data = [
    ...     {"id":1, "type": "a"}, {"id":2, "type": "b"}, {"id":3, "type": "c"}
    ... ]
    >>> input_df = create_df_from_collection(input_data, spark_context, spark_session)
    >>> input_df.collect()

    [Row(id=1, type='a'), Row(id=2, type='b'), Row(id=3, type='c')]

    >>> replace_dict = {"a": "type_a", "b": "type_b"}
    >>> replace(input_df, "type", replace_dict).collect()

    [Row(id=1, type='type_a'), Row(id=2, type='type_b'), Row(id=3, type='c')]

    Args:
        dataframe: data to be transformed.
        column: string column on the dataframe where to apply the replace.
        replace_dict: dict with values to be replaced.
            All mapped values must be string.

    Returns:
        Dataframe with column values replaced.

    """
    if not isinstance(dataframe, DataFrame):
        raise ValueError("dataframe needs to be a Pyspark DataFrame type")
    if (column not in dict(
            dataframe.dtypes)) or (dict(dataframe.dtypes)[column] != "string"):
        raise ValueError(
            "column needs to be the name of an string column in dataframe")
    if (not isinstance(replace_dict, dict)) or (not all(
            isinstance(value, str) for value in chain(*replace_dict.items()))):
        raise ValueError("replace_dict needs to be a Python dict with "
                         "all keys and values as string values")

    mapping = create_map(
        [lit(value) for value in chain(*replace_dict.items())]  # type: ignore
    )
    return dataframe.withColumn(column,
                                coalesce(mapping[col(column)], col(column)))
Example #30
    def get_documents_df(self, data_files_path, redirects_files_path):
        """
        Return a DataFrame containing the entities to be indexed.
        Redirects are filtered out, if given.
        :param data_files_path: path to .ttl file(s) (e.g., /dbpedia/all_data/*.ttl)
        :param redirects_files_path:  path to .ttl file(s) (e.g., /dbpedia/redirects/*.ttl).
        :return:
        """
        # DF schema: subj, pred, obj
        df = self._ttl_as_df(data_files_path)

        # Filter redirected entities, if any
        redirects = self._get_redirects(redirects_files_path)
        df = df.join(redirects, df.subj == redirects.subj, 'left_anti')

        # Replace RDF properties with index fields names
        mapping = F.create_map(
            [F.lit(x) for x in chain(*self._predicate2field.items())])
        df = df \
            .withColumn('pred', mapping[df.pred]) \
            .dropna()  # remove unknown properties

        # Swap subj and obj when pred = redirect (store the relation as subj hasRedirect obj)
        # Make subj the uri col
        uri_col = ElasticConfig.Fields.URI.value
        df = df \
            .withColumn(uri_col, F.when(df.pred != ElasticConfig.Fields.REDIRECT.value, df.subj).otherwise(df.obj)) \
            .withColumn('obj_new', F.when(df.pred != ElasticConfig.Fields.REDIRECT.value, df.obj).otherwise(df.subj)) \
            .drop('subj', 'obj') \
            .select(F.col(uri_col), F.col('pred'), F.col('obj_new').alias('obj'))

        # Pivot table grouping by uri; collect objects into lists
        df = df.groupBy(uri_col).pivot("pred").agg(F.collect_list('obj'))

        # Add a column with extra surface forms
        extra_surface_forms = F.udf(self._surface_forms_from_uri,
                                    ArrayType(StringType()))
        df = df.withColumn("extra_surface_forms", extra_surface_forms(uri_col))

        # If the surface forms column already exists, merge it with the new one
        if ElasticConfig.Fields.SURFACE_FORM_KEYWORD.value in df.columns:
            merge_surface_forms = F.udf(lambda sf1, sf2: list({*sf1 + sf2}),
                                        ArrayType(StringType()))
            df = df \
                .withColumn(ElasticConfig.Fields.SURFACE_FORM_KEYWORD.value,
                            merge_surface_forms(ElasticConfig.Fields.SURFACE_FORM_KEYWORD.value,
                                                'extra_surface_forms')) \
                .drop('extra_surface_forms')
        else:  # else just rename the new one
            df = df.withColumnRenamed(
                'extra_surface_forms',
                ElasticConfig.Fields.SURFACE_FORM_KEYWORD.value)

        return df
from pyspark.sql.functions import array_contains, split, col

df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)


# COMMAND ----------

from pyspark.sql.functions import split, explode

df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "InvoiceNo", "exploded").show(2)


# COMMAND ----------

from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .show(2)


# COMMAND ----------

df.select(map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("complex_map['WHITE METAL LANTERN']").show(2)


# COMMAND ----------

df.select(map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("explode(complex_map)").show(2)