Example #1
 def test_capture_illegalargument_exception(self):
     self.assertRaisesRegexp(IllegalArgumentException, "Setting negative mapred.reduce.tasks",
                             lambda: self.spark.sql("SET mapred.reduce.tasks=-1"))
     df = self.spark.createDataFrame([(1, 2)], ["a", "b"])
     self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values",
                             lambda: df.select(sha2(df.a, 1024)).collect())
     try:
         df.select(sha2(df.a, 1024)).collect()
     except IllegalArgumentException as e:
         self.assertRegexpMatches(e.desc, "1024 is not in the permitted values")
         self.assertRegexpMatches(e.stackTrace,
                                  "org.apache.spark.sql.functions")
Example #3
def remove_duplicated_schedules(schedules):
    # Compute the hash of each stop item
    schedules = schedules.withColumn(
        "hash",
        F.sha2(
            F.concat_ws(
                "||",
                schedules.trip_id,
                schedules.times,
                schedules.start_time,
                schedules.end_time,
                schedules.locations,
                schedules.headsigns,
            ),
            256,
        ),
    )

    # Drop duplicated schedules
    unique_schedules = schedules.dropDuplicates(["hash"])

    # Remove the hash column
    unique_schedules = unique_schedules.drop("hash")

    return unique_schedules
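A hedged usage sketch (the session and sample rows below are illustrative, and the function assumes pyspark.sql.functions is imported as F):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F  # alias assumed by remove_duplicated_schedules

spark = SparkSession.builder.getOrCreate()
schedules = spark.createDataFrame(
    [("t1", "08:00|08:10", "08:00", "09:00", "A|B", "Downtown"),
     ("t1", "08:00|08:10", "08:00", "09:00", "A|B", "Downtown")],   # exact duplicate
    ["trip_id", "times", "start_time", "end_time", "locations", "headsigns"])
print(remove_duplicated_schedules(schedules).count())  # 1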
Example #4
    def load_subtable(self, csv_filepath, uid_name, uid_col_list, csv_bq, passenger_bq=None):
        """
        Function to load a supporting table to passengers from GCS and save in BigQuery.
        :param csv_filepath: str input filename
        :param uid_name: str name to give the UID column
        :param uid_col_list: list of str column names to combine into UID
        :param csv_bq: str output project.dataset.table where the data will be saved
        :param passenger_bq: str, optional BigQuery passengers table; if omitted, the already-loaded passengers_df is used
        """
        csv_path = 'gs://{}/{}'.format(self.bucket, csv_filepath)
        logger.info(f"Loading address info from {csv_path}")
        csv_df = self.sparkql.read.csv(csv_path, header=True)

        csv_df = csv_df.withColumn(uid_name,
                                       sha2(concat_ws("",
                                                      *uid_col_list
                                                      ),
                                            256
                                            ))
        if passenger_bq:
            passengers_df = self.sparkql.read.format('bigquery') \
                                 .option('table', passenger_bq) \
                                 .load() \
                                 .withColumnRenamed('uid', 'passenger_uid')
        else:
            passengers_df = self.passengers_df.withColumnRenamed('uid', 'passenger_uid')

        csv_df = csv_df.join(passengers_df.select('email', 'passenger_uid'),
                                 on='email',
                                 how='left')
        logger.info(f"writing card data to {csv_bq}")
        csv_df.write.format('bigquery') \
          .option('table', csv_bq) \
          .save()
Example #5
def create_dimensional_partitions(data, parquet_loc, execution_date):
    """Creates the dimensional objects (Dimensions, Facts) from a PySpark RDD and outputs them to the Parquet format
       to be processed by Redshift.

    Args:
        data (pyspark.rdd.RDD): the base PySpark RDD
        parquet_loc (str): the output path where the Parquet files are to be saved
        execution_date (str): the execution date
    """

    broker_staging = data.select(['broker']).distinct()

    # calculate hash using SHA2
    broker_staging = broker_staging.withColumn(
        "hash", sha2(concat_ws("||", *broker_staging.columns), 256))

    # geography dim
    geography_staging = data.select(['country', 'county', 'parish']).distinct()
    geography_staging = geography_staging.withColumn(
        "hash", sha2(concat_ws("||", *geography_staging.columns), 256))

    asset_staging = data.select([
        'contract_number', 'country', 'county', 'parish', 'title',
        'description', 'price', 'property_type', 'bathrooms', 'bedrooms',
        'area_net', 'latitude', 'longitude'
    ]).distinct()

    # calculate hash using SHA2
    asset_staging = asset_staging.withColumn(
        "hash", sha2(concat_ws("||", *asset_staging.columns), 256))

    # weekly stock base
    asset_stock = data.select([
        'broker', 'contract_number', 'country', 'county', 'parish', 'price'
    ]).withColumn("quantity", lit(1)).withColumn("stock_date",
                                                 lit(execution_date))

    # save the data onto parquet to be consumed by Redshift
    broker_staging_loc = parquet_loc + "broker_staging.parquet"
    asset_staging_loc = parquet_loc + "asset_staging.parquet"
    geography_staging_loc = parquet_loc + "geography.parquet"
    stock_staging_loc = parquet_loc + "asset_stock.parquet"

    to_parquet(broker_staging, broker_staging_loc)
    to_parquet(asset_staging, asset_staging_loc)
    to_parquet(geography_staging, geography_staging_loc)
    to_parquet(asset_stock, stock_staging_loc)
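to_parquet is not shown in this example; a minimal sketch of what it presumably does, assuming it simply writes each staging DataFrame to its target path:

def to_parquet(df, path):
    # Assumed helper: write the staging DataFrame as Parquet (overwriting any previous run)
    # so it can be loaded into Redshift afterwards.
    df.write.mode("overwrite").parquet(path)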
Example #6
def benchmark2():
    print("===Benchmark 2===")
    print(
        "Comparing JDBC writes to InnoDB and API writes to ColumnStore with larger datasets"
    )
    print("")

    emptyDatabase()

    print("creating dataframe 1: two random generated doubles")
    randDF = sqlContext.range(0, 7000000).withColumn(
        'uniform', rand(seed=23)).withColumn('normal', randn(seed=42)).cache()
    randDFRows = randDF.count()
    randDFItems = randDFRows * len(randDF.columns)
    randDF.printSchema()
    print("bemchmarking dataframe 1")
    rand_benchmark = benchmark2execution(
        "rand", randDF, "id BIGINT, uniform DOUBLE, normal DOUBLE")
    randDF.unpersist()

    print(
        "creating dataframe 2: sha1, sha256, sha512 and md5 hashes of integers"
    )
    tmpDF = sqlContext.createDataFrame(
        sc.parallelize(range(
            0, 3000000)).map(lambda i: Row(number=i, string=str(i))))
    hashDF = tmpDF.select(tmpDF.number,
                          sha1(tmpDF.string).alias("sha1"),
                          sha2(tmpDF.string, 256).alias("sha256"),
                          sha2(tmpDF.string, 512).alias("sha512"),
                          md5(tmpDF.string).alias("md5")).cache()
    hashDFRows = hashDF.count()
    hashDFItems = hashDFRows * len(hashDF.columns)
    hashDF.printSchema()
    print("bemchmarking dataframe 2")
    hash_benchmark = benchmark2execution(
        "hash", hashDF,
        "number BIGINT, sha1 VARCHAR(40), sha256 VARCHAR(64), sha512 VARCHAR(128), md5 VARCHAR(32)"
    )
    hashDF.unpersist()

    print("jdbc_innodb\tapi_columnstore\t\trows\t\titems")
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (rand_benchmark[0], rand_benchmark[1], randDFRows, randDFItems))
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (hash_benchmark[0], hash_benchmark[1], hashDFRows, hashDFItems))
Example #7
 def test_capture_illegalargument_exception(self):
     self.assertRaisesRegexp(
         IllegalArgumentException, "Setting negative mapred.reduce.tasks",
         lambda: self.sqlCtx.sql("SET mapred.reduce.tasks=-1"))
     df = self.sqlCtx.createDataFrame([(1, 2)], ["a", "b"])
     self.assertRaisesRegexp(IllegalArgumentException,
                             "1024 is not in the permitted values",
                             lambda: df.select(sha2(df.a, 1024)).collect())
Example #8
def add_row_hash(df):
    """Adds a row hash to detect record changes.
    :param df: input dataframe
    :return df: dataframe with a "row_hash" column added
    """

    hash_columns = [x for x in df.columns if x not in ["snapshot_date"]]
    df = df \
        .withColumn("row_hash", F.sha2(F.concat_ws("||", *hash_columns), 256))

    return df
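A hedged usage sketch (sample data and column names are illustrative); because snapshot_date is excluded from the hash, re-loaded rows whose business columns have not changed produce the same row_hash:

import pyspark.sql.functions as F  # alias assumed by add_row_hash
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a1", "10", "2024-01-01"), ("a1", "10", "2024-01-02")],
    ["account_id", "balance", "snapshot_date"])
print(add_row_hash(df).select("row_hash").distinct().count())  # 1: only snapshot_date differs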
Example #9
def benchmarkSHA256(df, jobLogger):
    jobLogger.info(
        '****************************************************************')
    jobLogger.info('Starting benchmark test calculating SHA-512 hashes')
    start_time = timer()
    hashed_df = (df.withColumn('hashed_value', F.sha2(F.col('value'), 512)))

    # now trigger the computations by fetching a count at the RDD level
    count_value = hashed_df.rdd.count()
    end_time = timer()
    return (end_time - start_time), count_value
Example #10
 def test_capture_illegalargument_exception(self):
     self.assertRaisesRegexp(
         IllegalArgumentException,
         "Setting negative mapred.reduce.tasks",
         lambda: self.sqlCtx.sql("SET mapred.reduce.tasks=-1"),
     )
     df = self.sqlCtx.createDataFrame([(1, 2)], ["a", "b"])
     self.assertRaisesRegexp(
         IllegalArgumentException,
         "1024 is not in the permitted values",
         lambda: df.select(sha2(df.a, 1024)).collect(),
     )
Example #11
    def launch(self):
        self.logger.info("Launching databricks_jobs job")

        df, repartition = self.prepare_dataframe()

        image_df = df. \
            repartition(repartition, sha2("image_path", 224)).\
            rdd.\
            flatMap(lambda x: extract_face_emb(x.image_path)). \
            map(lambda x: ';'.join(map(str, x))).\
            saveAsTextFile(self.output_path)

        self.logger.info("Sample job finished!")
Example #12
    def pseudonymize(self, df, schema):  # schema: list[list[str]]
        """ Performs pseudonymization of the given dataframe based on the provided schema.
            For example, if the given df is for an entity called person, 
            2 dataframes will be returned, one called person that has hashed ids and masked fields, 
            and one called person_lookup that contains the original person_id, person_id_pseudo,
            and the non-masked values for columns marked to be masked."""

        df_pseudo = df_lookup = df

        for col_name, dtype, op in schema:
            if op == "hash-no-lookup" or op == "hnl":
                # This means that the lookup can be performed against a different table so no lookup is needed.
                df_pseudo = df_pseudo.withColumn(
                    col_name,
                    F.sha2(F.concat(F.col(col_name), F.lit(self.salt)),
                           256)).withColumnRenamed(col_name,
                                                   col_name + "_pseudonym")
                df_lookup = df_lookup.drop(col_name)
            elif op == "hash" or op == 'h':
                df_pseudo = df_pseudo.withColumn(
                    col_name,
                    F.sha2(F.concat(F.col(col_name), F.lit(self.salt)),
                           256)).withColumnRenamed(col_name,
                                                   col_name + "_pseudonym")
                df_lookup = df_lookup.withColumn(
                    col_name + "_pseudonym",
                    F.sha2(F.concat(F.col(col_name), F.lit(self.salt)), 256))
            elif op == "mask" or op == 'm':
                df_pseudo = df_pseudo.withColumn(col_name, F.lit('*'))
            elif op == "partition-by":
                pass  # make no changes for this column so that it will be in both dataframes and can be used for partitioning
            elif op == "no-op" or op == 'x':
                df_lookup = df_lookup.drop(col_name)

        df_pseudo = self.fix_column_names(df_pseudo)
        df_lookup = self.fix_column_names(df_lookup)

        return (df_pseudo, df_lookup)
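A hedged usage sketch: the schema is a list of (column_name, dtype, operation) triples; the pseudonymizer instance, the person_df DataFrame, and the column names below are hypothetical:

schema = [
    ("person_id", "string", "hash"),          # hashed in both frames; lookup keeps original + pseudonym
    ("name",      "string", "mask"),          # replaced by '*' in the pseudonymized frame
    ("load_date", "string", "partition-by"),  # kept unchanged in both frames for partitioning
]
person_pseudo, person_lookup = pseudonymizer.pseudonymize(person_df, schema)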
Example #13
def transformation(logger, spark, source_df, processing_dt,
                   initial_spark_schemas, config, collection):
    try:
        if config["module_name"] == "vacancy":
            new_df = source_df.select([
                F.col(col).alias(
                    re.sub("[^0-9a-zA-Z$]+", " ",
                           col).strip().replace(" ", "_").lower())
                for col in source_df.columns
            ])
            new_df = new_df.withColumn("date_uploaded", F.lit(datetime.strftime(processing_dt, "%Y-%m-%d"))) \
                           .withColumn("error_desc", F.lit(None).cast(ArrayType(StringType())))

            schema = get_old_schema(
                logger,
                spark,
                schema=initial_spark_schemas[collection],
                database_name=config["published_database_name"],
                table_name=collection)
            old_df = spark.createDataFrame([], schema)
            evolved_df = get_evolved_schema(logger, old_df, new_df)

        elif config["module_name"] in ("application", "payment"):
            new_df = source_df.select([
                F.col(col).alias(col[0].lower() +
                                 re.sub(r'(?!^)[A-Z]', lambda x: '_' + x.group(
                                     0).lower(), col[1:]))
                for col in source_df.columns
            ])
            new_df = new_df.withColumn("date_uploaded", F.lit(datetime.strftime(processing_dt, "%Y-%m-%d"))) \
                           .withColumn("error_desc", F.lit(None).cast(ArrayType(StringType())))\
                           .withColumn("row_hash_id", F.sha2(F.concat_ws("||", *new_df.columns), 256))

            old_schema = get_old_schema(
                logger,
                spark,
                schema=initial_spark_schemas[collection],
                database_name=config["published_database_name"],
                table_name=collection)

            old_df = spark.createDataFrame([], old_schema)
            evolved_df = get_evolved_schema(logger, old_df, new_df)

    except BaseException as ex:
        logger.error(
            "Failed to transformation the source dataframe because of error: %s",
            str(ex))
        sys.exit(-1)

    return evolved_df
Example #14
    def universal_identifier_generator(data_set, key_field, key_name):
        """
        Universal Identifier Generator generates UUIDs based on data fields from the data set.  This is the equivalent
        of a validation hash, based on business key(s).
        :param data_set: The data set the hash is being built from and added to.
        :param key_field:  Business key field(s) to be hashed.
        :type key_field: string or list
        :param key_name: Name of the uuid field
        :type key_name: String
        :return uuid_key:
        """

        if type(key_field) is not list:
            key_field = [key_field]

        data_set = data_set.withColumn(
            key_name, F.sha2(F.concat_ws('||', *key_field), 512))

        return data_set
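A hedged usage sketch (the orders DataFrame and key columns are hypothetical, and the method is assumed to be callable as a static helper):

orders = universal_identifier_generator(orders, ["customer_id", "order_number"], "order_uuid")
orders.select("customer_id", "order_number", "order_uuid").show(3, truncate=False)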
Example #15
def main():
    spark = SparkSession.builder.appName("Anonymize PySpark").getOrCreate()
    args_iter = iter(sys.argv[1:])
    args = dict(zip(args_iter, args_iter))

    #sample args for interactive testing
    #args = {'project_bucket': 'project1-lz', 'input_table': 'upload', 'output_table': 'raw', 'database': 'default', 'file_name': 'Tweets.csv'}

    project_bucket = args['project_bucket']
    input_table = args['input_table']
    output_table = args['output_table']
    database = args['database']
    input_s3_uri = 's3://' + project_bucket + '/' + input_table + '/' + args[
        'file_name']
    output_s3_uri = 's3://' + project_bucket + '/' + output_table + '/' + args[
        'file_name'].split('.')[0] + '-anon/'

    # Interactive pyspark from a glue development endpoint allows reading from glue crawlers
    # from awsglue.context import GlueContext
    #glueContext = GlueContext(SparkContext.getOrCreate())

    # Create a dataframe from glue catalog
    #df = glueContext.create_data_frame.from_catalog(database=database, table_name=input_table)

    #Print out information about this data
    #print("Count:  ", df.count())
    #df.printSchema()

    df = spark.read.csv(input_s3_uri, header=True)

    # replace each tweeter's name with a SHA-256 hash
    dfAnnocrc = df.withColumn("annonym",
                              sha2("name",
                                   256)).select("annonym", "tweet_id",
                                                "airline", "airline_sentiment",
                                                "text")

    # write back to s3 as parquet
    dfAnnocrc.write.mode("append").parquet(output_s3_uri)
Example #16
    def add_hashed_id(df, columns=[], hashed_col='Hashed_ID', hash_type='md5'):
        """
            This method will add a hashed id column to each record in the dataframe.

            Returns
                --------
                Dataframe with hashed Id as a column
                ------
            Parameters
                --------
                df : spark dataframe
                    dataframe to create hashed id on
                columns : list of strings
                    columns to hash; defaults to an empty list, which uses all columns of df
                hashed_col : string
                    column name for the hashed id
                hash_type : string
                    'md5' (default) or any other value to use SHA-256
                --------
        """
        if len(columns) == 0:
            columns = df.columns
        else:
            illegal_columns = []
            for column in columns:
                if column not in df.columns:
                    illegal_columns.append(column)
            if len(illegal_columns) > 0:
                raise IllegalArgumentException(
                    'Column {} does not exist in dataframe'.format(', '.join(illegal_columns)))
        
        if hashed_col is None or hashed_col == '':
            hashed_col = 'Hashed_ID'
        
        if hash_type == 'md5':
            df = df.withColumn(hashed_col, F.md5(F.concat(*columns)))
        else:
            df = df.withColumn(hashed_col, F.sha2(F.concat(*columns), 256))
        return df
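A hedged usage sketch (hypothetical DataFrame and columns, with the method assumed callable as a static helper). Note that F.concat returns NULL if any input column is NULL, so a concat_ws-based variant may be preferable when key columns can be missing:

customers = add_hashed_id(customers, columns=["first_name", "email"],
                          hashed_col="Hashed_ID", hash_type="sha2")   # any non-'md5' value selects SHA-256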
Example #17
    def load_passengers(self, passenger_filename, passenger_output):
        """
        Function to load the passenger data from csv in GCS, clean, add UID,
        and upload to BigQuery
        :param passenger_filename: str input file name
        :param passenger_output: str of project.dataset.table to save passenger data
        """
        self.passenger_filename = passenger_filename
        self.passenger_output = passenger_output
        people_path = 'gs://{}/{}'.format(self.bucket, passenger_filename)

        logger.info(f"Loading passenger info from {self.bucket}.{passenger_filename}")
        passengers_df = self.sparkql.read.csv(people_path, header=True)

        # Use withColumn and initcap to standardize the names
        passengers_df = passengers_df.withColumn('first_name',
                                                 initcap(col('first_name')))\
                                     .withColumn('middle_name',
                                                 initcap(col('middle_name')))\
                                     .withColumn('last_name',
                                                 initcap(col('last_name')))

        # Create full_name column
        passengers_df = passengers_df.withColumn('full_name',
                                                 concat_ws(" ",
                                                           col('first_name'),
                                                           col('middle_name'),
                                                           col('last_name')))
        passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256))

        # Write to BigQuery
        logger.info(f"Writing file to {passenger_output}")
        passengers_df.write.format('bigquery') \
          .option('table', passenger_output) \
          .save()
        self.passengers_df = passengers_df
Example #18
def run():
    # Build session
    sparkql = SparkSession.builder.master('local[1]').getOrCreate()

    # Load config information
    people_path = config['defaults']['ch3']['ep1']['passenger_input'].get(str)
    save_path = config['defaults']['ch3']['ep1']['passenger_output'].get(str)
    bq_table = config['defaults']['ch3']['ep1']['passenger_table'].get(str)
    logger.info(f"Loading passenger info from {people_path}")

    # read csv file into spark dataframe
    passengers_df = sparkql.read.csv(people_path, header=True)
    logger.info(f"There are {passengers_df.count()} rows")

    # Load the passenger data and make sure the names have initial capitalization
    logger.info("Cleaning names and creating full name")
    passengers_df = passengers_df.withColumn('first_name', initcap(col('first_name')))\
                                 .withColumn('middle_name', initcap(col('middle_name')))\
                                 .withColumn('last_name', initcap(col('last_name')))

    # Create full_name column
    passengers_df = passengers_df.withColumn(
        'full_name',
        concat_ws(" ", col('first_name'), col('middle_name'),
                  col('last_name')))
    logger.info("Creating sha2 uid from email")
    # Create a sha2 uid based on the email
    passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256))

    logger.info(f"Saving file to {save_path}")
    # Save dataframe as a parquet file
    passengers_df.write.parquet(save_path)

    logger.info("Uploading file to BigQuery")
    # Upload the file as an external table in BigQuery
    gbq_load(bq_table, save_path)
Example #19
def add_hashed_column(dataframe, column_name):
    dataframe = dataframe.withColumn(
        column_name + "_hashed",
        sha2(dataframe[column_name].cast(StringType()), 512))
    return dataframe
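A hedged usage sketch (the events DataFrame and user_id column are hypothetical); the cast to StringType means numeric columns can be hashed directly:

from pyspark.sql.types import StringType  # required by add_hashed_column

events = add_hashed_column(events, "user_id")   # adds a "user_id_hashed" column with a SHA-512 hex digest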
Example #20
    nestedwindowSpec = Window.partitionBy("ROW_ID").orderBy(
        monotonically_increasing_id())
    nested_parquet_filepath = (mount + basepath + zone + "/" + contry_name +
                               "/" + source_name + "/" + object_name + "_" +
                               nested_column + "/" + year + "/" + month + "/" +
                               day + "/" + object_name + "_" + nested_column +
                               "_" + filename_timestamp + ".parquet")
    if max_records_on_array > 0:
        result = denormalizer(newDf, nested_column, identity_columns)
        result = result.withColumn("ITEM_ID",
                                   sqlfn.row_number().over(nestedwindowSpec))
        # added to remove any structypes
        result = flattenDataframe(result)
        result.write.format("parquet").save(nested_parquet_filepath)
        newDf = newDf.withColumn(
            nested_column, sha2(newDf[nested_column].cast(StringType()), 512))
    else:
        print("No nested records found, writing empty file")
        result = newDf.select("contactPoints", *identity_columns).limit(0)

        result = result.withColumn("ITEM_ID",
                                   sqlfn.row_number().over(nestedwindowSpec))
        result.write.format("parquet").save(nested_parquet_filepath)
        newDf = newDf.withColumn(nested_column, lit(None).cast(StringType()))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Total Count of the Records

# COMMAND ----------
Example #21
def transform_hcp_trans_data():
    df_hcp_trans_data = spark \
        .read \
        .option('mergeSchema', 'true') \
        .parquet(config.get(config_set, 'hcp.txns.base.raw.path'))

    df_hcp_trans_data.createOrReplaceTempView(config.get(config_set, 'hcp.transactions.data.table'))

    df_hcp_txns_base = spark.sql("""
        select 
        BOOKING_STATUS
        ,HOTELHUB_BOOKING_REF
        ,CONFIRMATION_REF
        ,CANCELLATION_REF
        ,PNR
        ,PNR_Type
        ,HOTELHUB_MODE
        ,MARKET
        ,CLIENT_CLIENT_TOP_NAME
        ,CLIENT_SUB_UNIT_CLIENT_NAME
        ,CUSTOMER_AGENCY_NAME
        ,HOTEL_NAME
        ,CITY
        ,COUNTRY
        ,STAR_RATING
        ,cast(concat_ws
        ('-', concat(cast('20' as string), substr(cast(date_in as string),1,2)), 
        substr(cast(date_in as string),3,2), 
        substr(cast(date_in as string),5,2) ) as timestamp)
        as DATE_IN
        ,cast(concat_ws
        ('-', concat(cast('20' as string), substr(cast(date_out as string),1,2)), 
        substr(cast(date_out as string),3,2), 
        substr(cast(date_out as string),5,2) ) as timestamp)
        as DATE_OUT
        ,NIGHTS
        ,NUM_OF_ROOMS
        ,NUM_OF_GUEST
        ,OUT_POLICY_REASON
        ,BOOKING_SOURCE
        ,RATE_DESCRIPTION
        ,CANCELLATION_POLICY
        ,cast(RATEPERDAY_AMOUNT as double) as RATEPERDAY_AMOUNT
        ,RATEPERDAY_CURRCODE
        ,AGENCY_PRIORITY
        ,CUSTOMER_PRIORITY
        ,PAYMENT_MODE
        ,cast(RATEPERDAY_EUR as double) as RATEPERDAY_EUR
        ,cast(RATEPERDAY_GBP as double) as RATEPERDAY_GBP
        ,cast(RATEPERDAY_USD as double) as RATEPERDAY_USD
        ,TOTALAMOUNT_BOOKED_CURRCODE
        ,TOTALAMOUNT_BOOKED
        ,LOCAL_CURRENCY_CODE
        ,cast(RATEPERDAY_LCC as double) as RATEPERDAY_LCC
        ,cast(TOTALAMOUNT_EUR as double) as TOTALAMOUNT_EUR
        ,cast(TOTALAMOUNT_GBP as double) as TOTALAMOUNT_GBP
        ,cast(TOTALAMOUNT_USD as double) as TOTALAMOUNT_USD
        ,cast(TOTALAMOUNT_LCC as double) as TOTALAMOUNT_LCC
        ,BOOKED_RATE_TYPE_CODE
        ,CONTENT_SOURCE
        ,GDS_CHAIN_NAME
        ,cast(concat_ws
        ('-', concat(concat(cast('20' as string)), substr(cast(created_date as string),1,2)), 
        substr(cast(created_date as string),3,2), 
        substr(cast(created_date as string),5,2) ) as timestamp)
        as CREATED_DATE
        ,CREATEDBY_USER
        ,CONFIRMEDBY_USER
        ,case when length(cancel_datetime)=0 then '' 
        else 
        cast(concat_ws
        ('-', concat(concat(cast('20' as string)), substr(cancel_datetime,1,2)), 
        substr(cancel_datetime,3,2), 
        substr(cancel_datetime,5,11) ) as timestamp)
        end as cancel_datetime
        ,ABANDON_BY_USER
        ,OBT_PNR
        ,CLIENT_BOOKING_CHANNEL
        ,RATE_ACCESS_CODE_BOOKED
        ,COMMISSION_TYPE
        ,COMMISSION_CURRENCY
        ,COMMISSION_AMOUNT
        ,ESTIMATED_INCOME_DUE
        ,RATE_ACCESS_CODE_SHOPPED
        ,HOTELHUB_PROPERTY_ID
        ,HARP_PROPERTY_ID_NO
        ,CONTENT_SOURCE_PROPERTY_ID
        ,AGGREGATOR_BOOKING_COMMISSION
        ,AGGREGATOR_REVENUE_VALUE
        ,AGGREGATOR_REVENUE_SHARE
        ,AGGREGATOR_CURRENCY
        ,RATE_CHANGE
        ,RATE_ACCESS_CODE_RETURNED
        ,BACK_OFFICE_ACCOUNT_NUMBER
        ,case 
            when traveller_portrait_guid='' then 'UNKNOWN' 
            when traveller_portrait_guid like '%-%' then regexp_replace(traveller_portrait_guid, '-',':')
            else traveller_portrait_guid 
            end as TRAVELLER_PORTRAIT_GUID
        ,case when length(booking_start_dttm)=0 then '' 
        else 
        cast(concat_ws
        ('-', concat(concat(cast('20' as string)), substr(booking_start_dttm,1,2)), 
        substr(booking_start_dttm,3,2), 
        substr(booking_start_dttm,5,11) ) as timestamp)
        end as booking_st_tm
        ,case when length(booking_end_dttm)=0 then '' 
        else 
        cast(concat_ws
        ('-', concat(concat(cast('20' as string)), substr(booking_end_dttm,1,2)), 
        substr(booking_end_dttm,3,2), 
        substr(booking_end_dttm,5,11) ) as timestamp)
        end as booking_en_tm
        ,cast(STEP0_TIME as int) as STEP0_TIME
        ,cast(STEP1_TIME as int) as STEP1_TIME
        ,cast(STEP2_TIME as int) as STEP2_TIME
        ,cast(STEP3_TIME as int) as STEP3_TIME
        ,cast(STEP4_TIME as int) as STEP4_TIME
        ,GDS_SHOPPED_FOR_RATE
        ,cast(CHEAP_CLIENT_NEG_RATE as int) as CHEAP_CLIENT_NEG_RATE
        ,CHEAP_CLIENT_NEG_RATE_CURRCODE
        ,CHEAP_CLIENT_NEG_RATE_DESCRIPTION
        ,cast(CHEAP_CLIENT_OR_CWV_RATE as int) as CHEAP_CLIENT_OR_CWV_RATE
        ,CHEAP_CLIENT_OR_CWV_NEG_RATE_CURRCODE
        ,CHEAP_CLIENT_OR_CWV_RATE_DESCRIPTION
        ,cast(CHEAP_GDS_PUBLISHED_RATE as int) as CHEAP_GDS_PUBLISHED_RATE
        ,CHEAP_GDS_NEG_RATE_CURRCODE
        ,CHEAP_GDS_PUBLISHED_RATE_DESCRIPTION
        ,cast(CHEAP_BOOKINGCOM_RATE as int) as CHEAP_BOOKINGCOM_RATE 
        ,CHEAP_BOOKINGCOM_RATE_CURRCODE
        ,CHEAP_BOOKINGCOM_RATE_DESCRIPTION
        ,BRANCH_IATA
        ,ON_REQUEST_INDICATOR
        ,case when length(lastmodified_datetime)=0 then ''
        else
        cast(concat_ws('-', concat(concat(cast('20' as string)), substr(lastmodified_datetime,1,2)),
        substr(lastmodified_datetime,3,2),substr(lastmodified_datetime,5,11) ) as timestamp)
        end as lastmodified_datetime
        ,modified_count
        ,booking_time_duration
        ,rate_bucket
        ,back_office
        ,client_sub_unit_client_id
        ,cast (MISSED_SAVING as int) as missed_saving
        ,cast(REALISED_SAVING as int) as realised_saving
        ,POPULAR_HOTEL
        ,HOTEL_RANK
        ,cast(HOTEL_TOTAL as int) as hotel_total
        ,agency_source_name
        ,client_client_top_id
        ,err_desc
        ,aggregator_property_type
        ,hotel_bucket_simplified
        ,gds_commission_text
        ,avlb_htl_count 
        ,offer 
        ,aaa_rate 
        ,error_code 
        ,concat(hotelhub_property_id,rate_access_code_booked,booked_rate_type_code,case 
                when content_source like 'BOOKING%' then 'BC'
                when content_source like 'EAN%' then 'EH'
                when content_source like 'DESIYA%' then 'DH'
                when content_source like 'PREMIER%' then 'PI'
                when content_source like 'CHL%' then 'CM'
                when content_source like 'SABR%' then 'S'
                when content_source like 'AMAD%' then 'A'
                when content_source like 'GALI%' then 'G'
                when content_source like 'APO%' then '1V'
                when content_source like 'HOTELH%' and gds_shopped_for_rate like 'SABR%' then 'S'
                when content_source like 'HOTELH%' and gds_shopped_for_rate like 'APO%' then '1V'
                when content_source like 'HOTELH%' and gds_shopped_for_rate like 'AMAD%' then 'A'
                when content_source like 'HOTELH%' and gds_shopped_for_rate like 'GALI%' then 'G'
                else 'XX' end
        ) as rate_id
        ,case
         when content_source in('BOOKING.COM','EAN HOTEL COLLECT', 'BOOKING.COM CASHONLY','DESIYA HOTELS','LOCAL AGGREGATOR') then 'AGG RATE'
         when content_source ='PREMIER INN - PI' then 'PUB - DIRECT CONNECT'
         when (rate_bucket like '%CLIENT%' or rate_bucket like '%ROOMIT%') and rate_access_code_booked='CWV' then 'ROOMIT (CWV)'
         when (rate_bucket like '%CLIENT%' or rate_bucket like '%ROOMIT%') and ((instr(lower(rate_description),'client value')!=0) or (instr(lower(rate_description),'cwv')!=0)) then 'ROOMIT (CWV)'
         when (rate_access_code_booked='CWV' or rate_access_code_booked is null or rate_access_code_booked='') and 
              ((instr(lower(rate_description),'client value')!=0) or (instr(lower(rate_description),'cwv')!=0)) then 'ROOMIT (CWV)'
         when (instr(lower(rate_description),'client')!=0 and instr(lower(rate_description),'value')!=0)or 
              (instr(lower(rate_description),'carlson')!=0 and instr(lower(rate_description),'value')!=0) or
              instr(lower(rate_description),'roomit')!=0 or
              instr(lower(rate_description),'room it')!=0 then 'ROOMIT (CWV)'
         when rate_description not like'CWV%' and instr(lower(rate_description),'crs')!=0 then 'CLIENT'
         when (rate_bucket like '%CLIENT%' or rate_bucket like '%ROOMIT%') and rate_access_code_booked ='CLIENT' then 'ROOMIT (CLIENT)'
         when (rate_access_code_booked='CLIENT' or rate_access_code_booked is null or rate_access_code_booked='') and 
              ((instr(lower(rate_description),'client')!=0) or (instr(lower(rate_description),'carlson')!=0)) then 'ROOMIT (CLIENT)'
         when ((instr(lower(rate_description),'client')!=0) and (instr(lower(rate_description),'value')=0)) or 
              ((instr(lower(rate_description),'carlson')!=0) and (instr(lower(rate_description),'value')=0)) or
              instr(lower(rate_description), 'consortia')>0 then 'ROOMIT (CLIENT)'
         when rate_bucket like '%PUBLIC%' and instr(lower(rate_description), 'worldwide')>0 then 'PUB'
         when instr(lower(rate_description), 'room rac')>0 or
              instr(lower(rate_description), 'room pro')>0 or
              instr(lower(rate_description), 'bed flexible rate')>0 or
              instr(lower(rate_description), 'beds flexible rate')>0 then 'PUB'
         when  (rate_bucket like '%CLIENT%' or rate_bucket like '%KUNDEN%') then 'CLIENT'
         when  ((rate_bucket like '%REQUEST%' or rate_bucket like '%ANFRAGE%' or rate_bucket like '%DEMANDE%' or rate_bucket like '%PETICI%' or rate_bucket like 'PUBLIC%'  or rate_bucket is null or rate_bucket='') and 
              (gds_commission_text like 'NO%' or gds_commission_text is null or gds_commission_text ='')) then 'CLIENT'
         when  (rate_bucket like '%PUBLIC%' or rate_bucket='U') and 
              (gds_commission_text like '%NON%' or gds_commission_text like '%NOT%' or gds_commission_text like '%NO C%' or gds_commission_text like '% 0.00%') then 'CLIENT'
         when  ((gds_commission_text like '%NON%' or gds_commission_text like '%NOT%' or gds_commission_text like '%NO C%' or gds_commission_text like '% 0.00%' or gds_commission_text like '%UNK%') and 
              (rate_description like '%COR%' or rate_description like '%CLT' or rate_description like '%GOV%' or rate_description like '%NEG%')) and
              (instr(lower(rate_description),'corn')=0 and instr(lower(rate_description),'decor')=0 and instr(lower(rate_description),'corri')=0) then 'CLIENT'
         when ((gds_commission_text is null or gds_commission_text ='')and 
              (rate_bucket like '%REQUEST%' or rate_bucket like '%ANFRAGE%' or rate_bucket like '%DEMANDE%' or rate_bucket like '%PETICI%' or rate_bucket like 'PUBLIC%'  or rate_bucket is null or rate_bucket='') and 
              (rate_description like '%CLT' or rate_description like'%COR%' or rate_description like'%NEG%')) and
              (instr(lower(rate_description),'corn')=0 and instr(lower(rate_description),'decor')=0 and instr(lower(rate_description),'corri')=0) then 'CLIENT'
         when  ((gds_commission_text is null or gds_commission_text ='')and rate_description like '%COR') then 'CLIENT'
         when  (gds_commission_text like '%NON%' or gds_commission_text like '%NOT%' or gds_commission_text like '%NO C%' or gds_commission_text like '% 0.00%') then 'CLIENT'
         when  instr(lower(rate_description),'corporate')!=0 or instr(lower(rate_description),'government')!=0 then 'CLIENT'
         when  (rate_access_code_booked is null or rate_access_code_booked='' or rate_access_code_booked='SC' or rate_access_code_booked='COR') and 
            (instr(lower(rate_description),' cor')>0 or instr(lower(rate_description),' clt')>0) then 'CLIENT'
         when instr(lower(rate_description),lower(client_client_top_name))>0 then 'CLIENT'
         when instr(lower(rate_description), 'aaa')>0 or instr(lower(rate_description), 'caa')>0 or instr(lower(rate_description), 'aarp')>0 or instr(lower(rate_description), 'spg member')>0 then 'CLIENT'
         else 'PUB' end as new_rate_bucket,
         payment_type_used,
         case 
        when content_source like 'BOOKING%' then 'BC'
        when content_source like 'EAN%' then 'EH'
        when content_source like 'DESIYA%' then 'DH'
        when content_source like 'PREMIER%' then 'PI'
        when content_source like 'CHL%' then 'CM'
        when content_source like 'SABR%' then 'S'
        when content_source like 'AMAD%' then 'A'
        when content_source like 'GALI%' then 'G'
        when content_source like 'APO%' then '1V'
        when content_source like 'HOTELH%' and gds_shopped_for_rate like 'SABR%' then 'S'
        when content_source like 'HOTELH%' and gds_shopped_for_rate like 'APO%' then '1V'
        when content_source like 'HOTELH%' and gds_shopped_for_rate like 'AMAD%' then 'A'
        when content_source like 'HOTELH%' and gds_shopped_for_rate like 'GALI%' then 'G'
        else 'XX' end as channel_type,
        SESSIONID,
        date_created_year,
        date_created_month,
        date_created_day
    from 	{}""".format(config.get(config_set, 'hcp.transactions.data.table')))

    df_hcp_txns_base_deduped = df_hcp_txns_base \
        .withColumn('rownum',
                    F.row_number().over(Window
                                        .partitionBy('HOTELHUB_BOOKING_REF')
                                        .orderBy(F.col('LASTMODIFIED_DATETIME').desc()))) \
        .filter(F.col('rownum') == 1) \
        .drop('rownum')

    # Export for general analytical use as sbx_dst.hcp_txns_base
    dump_partitioned_dataframe(df_hcp_txns_base_deduped,
                               ['date_created_year', 'date_created_month', 'date_created_day'],
                               config.get(config_set, 'hcp.txns.base.path'),
                               config.get(config_set, 'hcp.txns.base.table'))

    # Generate match keys
    channels = ['CYTRIC', 'TRVDOO', 'KDSS', 'GETTHERE', 'CONCUR', 'BOOK2GO', 'SERKO', 'ZILLIOUS']
    regexp_pattern = '[^a-zA-Z0-9]+'

    df_hcp_txns_new = spark.read.parquet(config.get(config_set, 'hcp.txns.base.path')) \
        .filter(F.col('booking_status').isin(['CFD', 'CNX'])) \
        .filter('pnr is not null or obt_pnr is not null') \
        .withColumn('concat_base_OBT_PNR', F.concat('date_in', 'date_out', 'OBT_PNR')) \
        .withColumn('concat_base_PNR', F.concat('date_in', 'date_out', 'PNR')) \
        .withColumn('full_mk',
                    F.when(F.col('CLIENT_BOOKING_CHANNEL').isin(channels),
                           F.regexp_replace(F.concat('concat_base_OBT_PNR',
                                                     'HARP_PROPERTY_ID_NO',
                                                     'TRAVELLER_PORTRAIT_GUID'),
                                            regexp_pattern, ''))
                    .otherwise(F.regexp_replace(F.concat('concat_base_PNR',
                                                         'HARP_PROPERTY_ID_NO',
                                                         'TRAVELLER_PORTRAIT_GUID'),
                                                regexp_pattern, ''))) \
        .withColumn('prop_mk',
                    F.when(F.col('CLIENT_BOOKING_CHANNEL').isin(channels),
                           F.regexp_replace(F.concat('concat_base_OBT_PNR', 'HARP_PROPERTY_ID_NO'),
                                            regexp_pattern, ''))
                    .otherwise(F.regexp_replace(F.concat('concat_base_PNR', 'HARP_PROPERTY_ID_NO'),
                                                regexp_pattern, ''))) \
        .withColumn('pnr_mk',
                    F.when(F.col('CLIENT_BOOKING_CHANNEL').isin(channels),
                           F.regexp_replace('concat_base_OBT_PNR', regexp_pattern, ''))
                    .otherwise(F.regexp_replace('concat_base_PNR', regexp_pattern, ''))) \
        .withColumn('dedupe_key',
                    F.when(F.col('CLIENT_BOOKING_CHANNEL').isin(channels),
                           F.sha2(F.regexp_replace('concat_base_OBT_PNR', regexp_pattern, ''), 256))
                    .otherwise(F.sha2(F.regexp_replace('concat_base_PNR', regexp_pattern, ''), 256))) \
        .withColumn('rk', F.rank().over(Window.partitionBy('dedupe_key').orderBy(F.col('lastmodified_datetime').desc()))) \
        .filter('rk = 1') \
        .withColumn('row_num', F.lit(9999))

    dump_partitioned_dataframe(df_hcp_txns_new,
                               ['date_created_year', 'date_created_month', 'date_created_day'],
                               config.get(config_set, 'hcp.txns.new.path'),
                               config.get(config_set, 'hcp.txns.new.table'))
Example #22
def hash_and_register_data_tables(data_df, table_name):
    data_df = data_df.withColumn("hash_value", F.sha2(F.concat_ws("||", *data_df.columns), 512))
    data_df.createOrReplaceTempView(table_name)
    
    print(f'row count for {table_name}: ' + str(data_df.count()))
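A hedged usage sketch (orders_df and the duplicate check below are illustrative):

hash_and_register_data_tables(orders_df, "orders_hashed")
spark.sql("""
    select hash_value, count(*) as n
    from orders_hashed
    group by hash_value
    having count(*) > 1
""").show()   # any rows here are exact duplicates across all columns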
Example #23
 spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
 for row in guestTable:
     tableName = row
     if tableName == "thirdparty":
         sql = guestTable[tableName][
             0] + " thirdparty_etl.thirdparty_t_task " + guestTable[
                 tableName][1]
         guestPath = "s3://rupiahplus-data-warehouse/etl/banda/guest/" + tableName
         spark.sql(sql).write.mode("overwrite").orc(guestPath)
     else:
         # First add today's partitions to banda
         tempDataBase = " `banda-etl-s3`"
         guestPath = "s3://rupiahplus-data-warehouse/etl/banda/guest/" + tableName
         if tableName == "t_customer":
             sql = "select * from  " + tempDataBase + "." + tableName
             spark.sql(sql).withColumn('mobile', F.sha2(
                 F.col('mobile'), 256)).drop('imei').drop('password').drop(
                     'etldate').write.mode("overwrite").orc(guestPath)
         elif tableName == "t_loan_app":
             sql = "select * from  " + tempDataBase + "." + tableName
             spark.sql(sql).drop('credential_no').drop(
                 'etldate').write.mode("overwrite").orc(guestPath)
         elif tableName == "t_personal_info":
             sql = guestTable[tableName][0] + tempDataBase + "." + tableName
             spark.sql(sql).drop('credential_no').drop(
                 'etldate').write.mode("overwrite").orc(guestPath)
         elif tableName == "t_auto_review_loan":
             sql = "select * from  " + tempDataBase + "." + tableName
             spark.sql(sql).drop('name').drop('etldate').write.mode(
                 "overwrite").orc(guestPath)
         elif tableName == "t_lpay_deposit":
             sql = "select *  from  " + tempDataBase + "." + tableName
Example #24
 df2_1=df2.select("entitydata").rdd.map(lambda p1:is_json(p1["entitydata"])).map(lambda g2: dict((k.lower(), unicode(v)) if not isinstance(v, unicode) else (k.lower(), v) for k, v in g2.iteritems()))
 l2=df2_1.collect()
 #l2=df2.select("entitydata").rdd.map(lambda r: json.loads(r["entitydata"])).map(lambda g2: dict((k.lower(), unicode(v)) if type(v) != "unicode" else ((k.lower(), v)) for k, v in g2.iteritems())).collect()
 df3_2=spark.createDataFrame(l2)
 #df3=df2.select("entitydata").rdd.map(lambda s:re.sub(r'(:)(null)',r'\1"NULL"',s["entitydata"])).map(lambda s2:re.sub(r'(:)(true)',r'\1"True"',s2)).map(lambda s3:re.sub(r'(:)(false)',r'\1"False"',s3)).map(lambda k: re.sub(r'(:)([0-9a-zA-Z.:]+)(,)', r'\1"\2"\3', k)).map(lambda k: re.sub(r'(:)([0-9a-zA-Z,:]+)(})', r'\1"\2"\3', k)).map(lambda d21: re.sub('[a-zA-Z"]+:', lambda m: m.group(0).lower(), d21)).map(lambda p: ast.literal_eval(p))
 #.map(lambda g1: str(g1).lower())
 #df3_2=df3.map(lambda v:Row(**v)).toDF()
 #df3_2=spark.createDataFrame(df3.collect())
 df2=df2.withColumn("columnindex",  row_number().over(w))
 df3_2=df3_2.withColumn("columnindex", row_number().over(w))
 final=df2.join(df3_2, df2.columnindex == df3_2.columnindex, 'inner').drop(df3_2.columnindex)
 final=final.drop('columnindex')
 sha_columns=df3_2.columns
 sha_columns.append("eventtype")
 sha_columns.remove("columnindex")
 final=final.withColumn("sha_key", sha2(concat_ws("||", *sha_columns), 256))
 if "_corrupt_record_data" in final.columns:
     print("_corrupt_JSON_data found")
     bad_record_new=final.filter("_corrupt_record_data is not null").select("entitystring","process__id","gdia_load_date")
     bad_record_new=bad_record_new.withColumn("reason", lit("JSON parsing error in entity data"))
     bad_record=bad_record.union(bad_record_new)
     final=final.filter("_corrupt_record_data is null")
     final=final.drop("_corrupt_record_data")
 else:
     print(" json data is clean")
 final=final.drop('entitydata')
 final=final.drop('entitystring')
 if i344.lower() in tables_in_db:
     print("Table already existing: "+i344)
     col_in_new_data=[j28.lower() for j28 in final.columns]
     col_in_table=sqlContext.table(i344).columns
Example #25
    tokenizedit=tokenized.withColumn('match_deviceid_3_tokens',match_deviceid_3_tokens_udf(col('words')))
    #
    new_expand_match=tokenizedit.join(tokens_to_match, tokenizedit.match_deviceid_3_tokens == tokens_to_match.match_deviceid_3_tokens , 'left_outer').select(tokenizedit.metadata, tokenizedit.logzio_id, tokenizedit.beat, tokenizedit.host, tokenizedit.it, tokenizedit.logzio_codec, tokenizedit.message, tokenizedit.offset, tokenizedit.source, tokenizedit.tags, tokenizedit.type, tokenizedit.messagecut , tokenizedit.words )
    tokenized_validated = new_expand_match.orderBy(rand()).limit(95000)
    tokenized_validated.printSchema()
#
tokenized_validated.coalesce(1).write.json(output_file2)
# Tokenize NON-Fraud-LABEL
# hash the message and de-duplicate those records
notfraud_file=sqlContext.read.json(input_file3).repartition(50)
notfraud_file.printSchema()
#
notfraud_df=notfraud_file\
.filter("message IS NOT NULL").filter("words IS NOT NULL")\
.withColumn('fraud_label',lit(0).cast('int'))\
.withColumn('hash_message',F.sha2(col('message'),512)).groupby(col('hash_message'))\
.agg(F.first(col('fraud_label')).alias('fraud_label'),F.first(col('words')).alias('words'),F.first(col('message')).alias('message'))\
.persist(pyspark.StorageLevel.MEMORY_AND_DISK_2)
notfraud_df.printSchema()
# Only the Not-Fraud are randomly sorted
#
from pyspark.sql.functions import rand
#
df_notfraud_words = notfraud_df.filter("message IS NOT NULL").select(col('fraud_label'),col('hash_message'),col('words'))\
.persist(pyspark.StorageLevel.MEMORY_AND_DISK_2)
df_notfraud_words.printSchema()
#
# FILTER FRAUD AND LABEL 
# Join with Internal Curation Data in urltopredict staged folder
# hash the message and de-duplicate those records
fraud_file=sqlContext.read.json(input_file1_playback_fraud).repartition(50)
Example #26
def process_log_data(spark, input_data, output_data):
    """Process user log data creating the tables user, time and songplays

    Args:
        spark (SparkSession): The spark session object
        input_data (str): The input files path
        output_data (str): The output files path
    """
    # read log data file
    LOGGER.info('read log data file')
    log_df = spark.read.json(input_data)

    # filter by actions for song plays
    LOGGER.info('filter by actions for song plays')
    log_df = log_df.where(F.col('page') == 'NextSong')

    # extract columns for users table
    LOGGER.info('extract columns for users table')
    user_table = log_df.select(
        ['userId', 'firstName', 'lastName', 'gender', 'level'])

    # write users table to parquet files
    LOGGER.info('write users table to parquet files')
    user_path = os.path.join(output_data, 'user')
    user_table.coalesce(1).write.mode('overwrite').parquet(user_path)

    # create datetime column from original timestamp column
    LOGGER.info('create datetime column from original timestamp column')
    get_timestamp = F.udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000),
                          TimestampType())
    log_df = log_df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    LOGGER.info('extract columns to create time table')
    time_table = log_df.select(
        'start_time',
        F.hour('start_time').alias('hour'),
        F.dayofmonth('start_time').alias('day'),
        F.weekofyear('start_time').alias('weekofyear'),
        F.month('start_time').alias('month'),
        F.year('start_time').alias('year'),
        F.dayofweek('start_time').alias('weekday')).drop_duplicates(
            ['start_time'])

    # write time table to parquet partitioned by year and month
    LOGGER.info('write time table to parquet partitioned by year and month')
    time_table.coalesce(1).write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet(os.path.join(output_data, 'time'))

    # read in song data to use for songplays table
    LOGGER.info('read in song data to use for songplays table')
    song_df = spark.read.parquet(os.path.join(output_data, 'song'))
    artist_df = spark.read.parquet(os.path.join(output_data, 'artist'))

    # join artist and song data
    LOGGER.info('join artist and song data')
    song_df = artist_df.select(['artist_name', 'artist_id'])\
        .join(song_df, on='artist_id', how='inner')

    # extract columns from joined song and log datasets to create songplays
    LOGGER.info('extract columns from joined song and log datasets to create '
                'songplays')
    on_clause = \
        (song_df.title == log_df.song) \
        & (song_df.artist_name == log_df.artist) \
        & (song_df.duration == log_df.length)
    songplays_table = log_df.join(song_df, on_clause, how='inner')

    # select columns and create year and month columns
    LOGGER.info('select columns and create year and month columns')
    songplays_table = songplays_table.select(
        'start_time',
        F.col('userId').alias('user_id'), 'level', 'song_id', 'artist_id',
        F.col('itemInSession').alias('session_id'), 'location',
        F.col('userAgent').alias('user_agent'),
        F.month('start_time').alias('month'),
        F.year('start_time').alias('year'))

    # create songplay_id and drop duplicates by this column
    LOGGER.info('create songplay_id and drop duplicates by this column')
    key_columns = [
        'start_time', 'user_id', 'song_id', 'artist_id', 'session_id'
    ]
    songplays_table = songplays_table.withColumn(
        'songplay_id', F.sha2(F.concat_ws("||", *key_columns),
                              256)).drop_duplicates(['songplay_id'])

    # write songplays table to parquet files partitioned by year and month
    LOGGER.info('write songplays table to parquet partitioned by year/month')
    songplays_table.coalesce(1).write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet(os.path.join(output_data, 'songplays'))
Example #27
def test_functions():
    df = spark.createDataFrame([{'col': 'foo'}], ['col'])
    rows = df.select(F.sha2(df.col, 256).alias('hashed')).collect()

    assert rows[0].hashed == '2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae'
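The expected digest can be cross-checked outside Spark; it is simply the SHA-256 of the UTF-8 bytes of 'foo':

import hashlib

assert (hashlib.sha256(b'foo').hexdigest()
        == '2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae')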
Example #28
# COMMAND ----------

from pyspark.sql import functions as F
df = df.withColumn('swap', F.rand(2586) > 0.45)

df = df.withColumn(
    '_first_name',
    F.when(F.col('swap'),
           F.rpad(F.substring(F.col('last_name'), 1, 1), 6, '*')).otherwise(
               F.rpad(F.substring(F.col('first_name'), 1, 1), 6, '*')))
df = df.withColumn(
    '_last_name',
    F.when(F.col('swap'),
           F.rpad(F.substring(F.col('first_name'), 1, 1), 6, '*')).otherwise(
               F.rpad(F.substring(F.col('last_name'), 1, 1), 6, '*')))
df = df.withColumn('_address', F.sha2(F.col('address'), 256))

# COMMAND ----------

display(df)

# COMMAND ----------

# Create a view or table

df.select(df._first_name, df._last_name, df._address, df.date_of_birth) \
  .coalesce(1) \
  .write \
  .format('csv') \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
Example #29
# |20150722-13-3145-119-81-61-166|3323           |
# |20150722-12-1630-52-74-219-71 |2967           |
# |20150722-14-0015-52-74-219-71 |2907           |
# |20150722-17-0015-119-81-61-166|2841           |
# +------------------------------+---------------+

# 4. Find the most engaged users, i.e. the IPs with the longest session times
# I am appending the IP and a hash of the user agent, thereby assuming each user agent within the same IP
# corresponds to a different user
# I am also assuming that we are trying to find the most engaged users based on session times
# (not across all the session_ids, which would instead give the most engaged user of a day)
# As I am copying my output here, I am sorting the output by duration_min
mostEngaugedBaseDF = baseDF.select("timestamp", "session_id", "ip",
                                   "user_agent")
mostEngaugedBaseDF = mostEngaugedBaseDF.withColumn(
    "user", f.concat("ip", f.lit('_'), f.sha2("user_agent", 256)))
mostEngaugedDF = mostEngaugedBaseDF.groupby('user', 'session_id')\
    .agg((f.max('timestamp').cast('long') - f.min('timestamp').cast('long')) / 60)\
    .toDF("user", "session_id", "duration_min")\
    .orderBy("duration_min", ascending=False)

mostEngaugedDF.show(10, truncate=False)
# +--------------------------------------------------------------------------------+--------------------------------+------------------+
# |user                                                                            |session_id                      |duration_min      |
# +--------------------------------------------------------------------------------+--------------------------------+------------------+
# |111.119.199.22_f54af9f03ea52c6a4f3d0873010fa93778a1e387399baf0c331558235b47d37b |20150722-06-3145-111-119-199-22 |13.983333333333333|
# |117.220.186.227_3a5a319663e42275d264c0d49636fe3673c4ace35a759d89f400715744532cbd|20150722-06-3145-117-220-186-227|13.4              |
# |15.211.153.75_180050cb76309ecd4e9e895a18ed06b490500b93ab309126d91a9719e69097b7  |20150722-06-3145-15-211-153-75  |9.933333333333334 |
# |119.235.53.134_3a5a319663e42275d264c0d49636fe3673c4ace35a759d89f400715744532cbd |20150722-06-3145-119-235-53-134 |9.9               |
# |116.50.79.74_3a5a319663e42275d264c0d49636fe3673c4ace35a759d89f400715744532cbd   |20150722-06-3145-116-50-79-74   |9.65              |
# |52.74.219.71_3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112   |20150722-06-3145-52-74-219-71   |9.316666666666666 |
Example #30

# Build session
sparkql = SparkSession.builder.master('yarn').getOrCreate()

# Load passenger data
bucket = <your bucket>
sparkql.conf.set('temporaryGcsBucket', bucket)  # this gives our job a temporary bucket to use when writing

bucket_path = 'gs://{}/'.format(bucket)
people_path = bucket_path + 'passengers_1k.csv'
passengers_df = sparkql.read.csv(people_path, header=True)

# Use withColumn and initcap to standardize the names
passengers_df = passengers_df.withColumn('first_name', initcap(col('first_name')))\
                             .withColumn('middle_name', initcap(col('middle_name')))\
                             .withColumn('last_name', initcap(col('last_name')))

# Create full_name column
passengers_df = passengers_df.withColumn('full_name',
                                         concat_ws(" ",
                                                   col('first_name'),
                                                   col('middle_name'),
                                                   col('last_name')))
passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256))

bq_dataset = <your dataset>
bq_table = 'passengers'
passengers_df.write.format('bigquery') \
  .option('table', '{}.{}'.format(bq_dataset, bq_table)) \
  .save()
Example #31
sparkql.conf.set('temporaryGcsBucket', bucket)

bucket_path = 'gs://{}/'.format(bucket)
addr_path = bucket_path + 'passengers_addrs_1k.csv'
addr_df = sparkql.read.csv(addr_path, header=True)

card_path = bucket_path + 'passengers_cards_1k.csv'
card_df = sparkql.read.csv(card_path, header=True)

# Create uid for each
addr_df = addr_df.withColumn('addr_uid',
                             sha2(concat_ws("",
                                            col("street_address"),
                                            col("city"),
                                            col("state_code"),
                                            col("from_date"),
                                            col("to_date")
                                            ),
                                  256
                                  ))

card_df = card_df.withColumn('card_uid',
                             sha2(concat_ws("",
                                            col("provider"),
                                            col("card_number"),
                                            col("expiration_date"),
                                            col("security_code")
                                            ),
                                  256
                                  ))
Example #32
df     = spark.read.format("com.databricks.spark.csv") \
                      .option("header", "true") \
                      .option("multiline","true") \
                      .option("escape", "\"") \
                      .schema(SCHEMA_INPUT_FILE) \
                      .load("/user/root/bd-platform/input/rs_aligned_json_metadata.csv")

metadata_schema =  StructType([ StructField("src_filename", StringType(), True),
                           StructField("tgt_filename", StringType(), True),
                          StructField("label", StringType(), True)
                        ])

df = df.withColumn("metadata_json_parsed", F.from_json(df["metadata_json"], metadata_schema))

df = df.select(F.sha2(F.col("src"), 256).alias("src_hash"), "src", "tgt", "src_lang", "tgt_lang", \
              "metadata_json_parsed.src_filename", \
              "metadata_json_parsed.tgt_filename", \
              "metadata_json_parsed.label")

df.write.parquet("/user/root/hive/proto/proto.parquet")


######################################################
# Exposing in spark-sql
######################################################
#  > create database test_db;
#  > use test_db;
#  > create table sentences_parquet(
#      src_hash string,
#      src string,