Example #1
def assign_vids_uid_time(df_impressions, vid_assignment_table, demo_cols):
    select_cols = ["vid", "timestamp", "h1", "p1", "p2", "user_id"]
    if demo_cols is not None:
        df = df_impressions.join(F.broadcast(vid_assignment_table), demo_cols)
        select_cols = demo_cols + select_cols
    else:
        df = df_impressions.join(F.broadcast(vid_assignment_table))

    df_vid_impressions = (
        df
        # deterministic 32-bit hash of (user_id, timestamp)
        .withColumn(
            "h1",
            F.hash(
                F.concat(
                    F.col("user_id").astype("string"),
                    F.col("timestamp").astype("string"))))
        #.withColumn("h1", F.hash("user_id"))
        # map the signed 32-bit hash into [0, 1)
        .withColumn("p1", F.col("h1") / (2**32) + 0.5)
        .where(F.col("p1") >= F.col("prob_>="))
        .where(F.col("p1") < F.col("prob_<"))
        # second deterministic draw, seeded by (h1, timestamp)
        .withColumn(
            "p2",
            F.hash(
                F.concat(
                    F.col("h1").astype("string"),
                    F.col("timestamp").astype("string"))) / (2**32) + 0.5)
        # pick a VID inside the assigned [start_VID, start_VID + total_VID) block
        .withColumn(
            "vid",
            (F.col("p2") * F.col("total_VID")).astype("int") + F.col("start_VID"))
        .select(*select_cols))
    return df_vid_impressions
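The core trick in Example #1 is turning F.hash, which returns a signed 32-bit integer, into a deterministic pseudo-uniform value in [0, 1) by dividing by 2**32 and adding 0.5. A minimal, self-contained sketch of just that mapping, assuming a local SparkSession and invented column names:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [("u1", 1000), ("u2", 1000), ("u1", 2000)], ["user_id", "timestamp"])

# hash(user_id || timestamp) is a signed 32-bit int; /2**32 + 0.5 maps it into [0, 1)
probs = df.withColumn(
    "p1",
    F.hash(F.concat(F.col("user_id").cast("string"),
                    F.col("timestamp").cast("string"))) / (2**32) + 0.5)
probs.show()  # the same (user_id, timestamp) always gets the same p1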
Example #2
def build_inference_df(data):
  
  inference_df = (data.select("customerID", F.col("churnString").alias("Churn"))
                  .withColumn("Churn", F.col("Churn") == "Yes")
                  .withColumn("LastCallEscalated",
                              F.when(F.col("Churn"), F.hash(F.col("customerID")) % 100 < 35)
                              .otherwise(F.hash(F.col("customerID")) % 100 < 15)))

  
  return inference_df
Example #3
def assign_vids_weighted(df_impressions, vid_assignment_table, demo_cols=None):
    select_cols = ["vid", "timestamp", "h1", "p1"]
    if demo_cols is not None:
        df = df_impressions.join(F.broadcast(vid_assignment_table), demo_cols)
        select_cols = demo_cols + select_cols
    else:
        df = df_impressions.crossJoin(F.broadcast(vid_assignment_table))

    df_vid_impressions = (
        df
        # deterministic 32-bit hash of the user id
        .withColumn("h1", F.hash(F.col("user_id").astype("string")))
        #.withColumn("h1", F.hash("user_id"))
        # map the signed 32-bit hash into [0, 1)
        .withColumn("p1", F.col("h1") / (2**32) + 0.5)
        # number of VIDs to draw for this impression
        .withColumn("n", F.round("weight").cast("integer"))
        .where(F.col("p1") >= F.col("prob_>="))
        .where(F.col("p1") < F.col("prob_<"))
        .withColumn(
            "vids",
            udf_find_n_vids(
                F.col("n"), F.col("start_VID"), F.col("total_VID"), F.col("h1")))
        .withColumn("vid", F.explode(F.col("vids")))
        .select(*select_cols))
    return df_vid_impressions
Example #4
 def get_column_spec(self, source_df: Optional[DataFrame],
                     current_column: Optional[Column]) -> Column:
     column_spec = hash(*[
         col.get_column_spec(source_df=source_df,
                             current_column=current_column)
         for col in self.value
     ]).cast("string")
     return column_spec
Example #5
def test_hash_repartition_exact(gen, num_parts):
    data_gen = gen[0]
    part_on = gen[1]
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : gen_df(spark, data_gen, length=1024)\
                    .repartition(num_parts, *part_on)\
                    .withColumn('id', f.spark_partition_id())\
                    .withColumn('hashed', f.hash(*part_on))\
                    .selectExpr('*', 'pmod(hashed, {})'.format(num_parts)))
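The selectExpr above works because DataFrame hash-repartitioning and f.hash both use Murmur3, so pmod(hash(partition columns), num_parts) is expected to line up with spark_partition_id(). A small illustration of that relationship, assuming a local session (this is not part of the test suite above):

from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.range(100).repartition(8, "id")

check = (df.withColumn("pid", f.spark_partition_id())
           .withColumn("expected", f.expr("pmod(hash(id), 8)")))
# should print 0 if the partitioner used the same hash/pmod scheme
print(check.where(f.col("pid") != f.col("expected")).count())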
Example #6
def add_hash_column(obj, cols=True, hash_colname='_hash', exclude_cols=[]):
    # if cols is True (the default), hash every column of the DataFrame
    if isinstance(cols, bool) and cols:
        cols = obj.columns

    colnames = (set(obj.columns) & set(cols)) - set(exclude_cols)
    cols = [x for x in cols if x in colnames]

    obj = obj.withColumn(hash_colname, F.hash(*cols))
    return obj
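A hedged usage sketch of add_hash_column with an invented DataFrame, assuming F is pyspark.sql.functions as in the snippet above; the hash column acts as a row fingerprint, the same idea Example #13 below uses for change detection:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, "a", "2020-01-01"), (2, "b", "2021-01-01")],
                           ["id", "payload", "_updated"])

# fingerprint every column except the audit column; rows with identical
# business data get identical _hash values
fingerprinted = add_hash_column(df, cols=True, exclude_cols=["_updated"])
fingerprinted.show()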
Example #7
def test_auto_mapper_hash(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "54"),
            (2, "Vidal", "67"),
            (3, "Vidal", None),
            (4, None, None),
        ],
        ["member_id", "last_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    source_df = source_df.withColumn("my_age", col("my_age").cast("int"))

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients",
        keys=["member_id"
              ]).columns(age=A.hash(A.column("my_age"), A.column("last_name")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        hash(col("b.my_age"), col("b.last_name")).cast("string").alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("age").collect()[0][0] ==
            "-543157534")
    assert (result_df.where("member_id == 2").select("age").collect()[0][0] ==
            "2048196121")
    assert (result_df.where("member_id == 3").select("age").collect()[0][0] ==
            "-80001407")
    assert result_df.where("member_id == 4").select(
        "age").collect()[0][0] == "42"

    assert dict(result_df.dtypes)["age"] == "string"
Example #8
    def _expode_field_arrays(self) -> DataFrame:

        timestamp_conditionals = [
            F.col("field.field") == "timestamp",
            F.col("field.field") == "time_created",
        ]

        ts_case = F.when(
            self.bit_any(*timestamp_conditionals),
            (F.col("field.value").cast("long") + 631065600).cast("timestamp"),
        ).otherwise(None)

        ts_col = (F.first(ts_case, ignorenulls=True).over(
            Window.partitionBy("message_id")).alias("timestamp"))

        field_col = (F.when(F.col("field") == "type",
                            F.col("message")).otherwise(
                                F.col("field")).alias("field"))
        self._spark.conf.set("spark.sql.shuffle.partitions", "8")
        return (self.csv_df.select(
            "*",
            F.explode("fields").alias("field"),
            F.hash("fields").alias("message_id"),
        ).drop("fields").select(
            "record_type",
            "local number",
            "message",
            "message_id",
            ts_col,
            "field.*",
        ).replace("file_id", "file", "message").select(
            "record_type",
            "local number",
            "message",
            "message_id",
            "timestamp",
            field_col,
            "value",
            "units",
        ).join(PROFILE, on=["field", "value"], how="left").select(
            "record_type",
            "local number",
            "message",
            "message_id",
            "timestamp",
            "field",
            F.coalesce("enum_value", "value").alias("value"),
            "units",
        ))
Example #9
def customer_meta(df):
    SENIOR_CUTOFF = 65
    ADULT_CUTOFF = 18
    DAYS_IN_YEAR = 365.25
    EXPONENTIAL_DIST_SCALE = 6.3

    augmented_original = replicate_df(df, options["dup_times"] or 1)

    customerMetaRaw = augmented_original.select(
        "customerID",
        F.lit(now).alias("now"),
        (F.abs(F.hash(augmented_original.customerID)) % 4096 /
         4096).alias("choice"),
        "SeniorCitizen",
        "gender",
        "Partner",
        "Dependents",
        F.col("MonthlyCharges").cast(
            get_currency_type()).alias("MonthlyCharges"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "ageInDays",
        F.floor(
            F.when(
                customerMetaRaw.SeniorCitizen == 0,
                (customerMetaRaw.choice *
                 ((SENIOR_CUTOFF - ADULT_CUTOFF - 1) * DAYS_IN_YEAR)) +
                (ADULT_CUTOFF * DAYS_IN_YEAR),
            ).otherwise((SENIOR_CUTOFF * DAYS_IN_YEAR) +
                        (DAYS_IN_YEAR *
                         (-F.log1p(-customerMetaRaw.choice) *
                          EXPONENTIAL_DIST_SCALE)))).cast("int"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "dateOfBirth", F.expr("date_sub(now, ageInDays)"))

    return customerMetaRaw.select(
        "customerID",
        "dateOfBirth",
        "gender",
        "SeniorCitizen",
        "Partner",
        "Dependents",
        "MonthlyCharges",
        "now",
    ).orderBy("customerID")
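Two hash tricks carry Example #9: abs(hash(customerID)) % 4096 / 4096 gives each customer a reproducible "choice" in [0, 1), and -log1p(-choice) * scale applies the inverse CDF of an exponential distribution to that value. A minimal sketch of those two steps in isolation, with toy ids and the same 6.3 scale constant:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("c1",), ("c2",), ("c3",)], ["customerID"])

SCALE = 6.3  # mirrors EXPONENTIAL_DIST_SCALE above
df = (df
      .withColumn("choice", F.abs(F.hash("customerID")) % 4096 / 4096)
      .withColumn("years", -F.log1p(-F.col("choice")) * SCALE))
df.show()  # the same customerID always maps to the same choice and years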
Example #10
    def _expand_message_records(self, message: str) -> DataFrame:
        message_df = (self.csv_df.where(F.col("message") == message).select(
            "record_type",
            "message",
            F.hash("fields").alias("record_id"),
            F.explode("fields").alias("field_struct"),
        ).select("*", "field_struct.*").drop("field_struct").groupBy(
            "record_type", "message", "record_id").pivot("field").agg(
                F.max(F.concat_ws("-", F.col("value"), F.col("units")))))

        if "type" in message_df.columns:
            message_type = message.replace("_id", "")
            message_df = message_df.withColumnRenamed("type", message_type)

        enum_cols = set(message_df.columns).intersection(set(PROFILE.keys()))

        for col in enum_cols:
            message_df = message_df.replace(**PROFILE[col], subset=col)
        return message_df
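Example #10 hashes the fields array before exploding it, so every exploded row keeps a record_id that identifies the record it came from. A minimal sketch of that pattern with a toy array column, assuming a local session:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(["a", "b"],), (["c"],)], ["fields"])

exploded = df.select(
    F.hash("fields").alias("record_id"),  # one stable id per original row
    F.explode("fields").alias("field"))
exploded.show()  # rows coming from the same array share a record_id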
Example #11
def input_data(path, filename, schemaString):
    fields = [
        StructField(field_name, StringType(), True)
        for field_name in schemaString.split()
    ]
    schema = StructType(fields)
    dataType = StructType([
        StructField("Class", IntegerType(), True),
        StructField("Timeseries", ArrayType(FloatType()), True)
    ])
    '''
    # when separator is space ' '
    df = spark.read.csv(path + filename, schema=schema)
    def split_cols(array):
        Class  = int(float(array[1]))
        timeseries = array[2:len(array)]
        timeseries = [float(value) for value in timeseries]
        return (Class, timeseries)
    split_cols = udf(split_cols, dataType)
    df = df.withColumn('text', split_cols(split('class_timeseries', '\\s+'))).select(hash(col('text.Timeseries')).alias('id'), col('text.*'))
    return df
    '''
    #when separator is ','
    df = spark.read.csv(path + filename, sep=';', schema=schema)

    def split_cols(array):
        Class = int(float(array[0]))
        timeseries = array[1:len(array)]
        timeseries = [float(value) for value in timeseries]
        return (Class, timeseries)

    split_cols = udf(split_cols, dataType)
    df = df.withColumn('text',
                       split_cols(split('class_timeseries', ','))).select(
                           hash(col('text.Timeseries')).alias('id'),
                           col('text.*'))
    '''df_csv = df.select('id','Class', ArrayToString(col('Timeseries')))
    return df, df_csv'''
    print("df.first() is {}".format(df.first()))
    return df
Example #12
    "reply_timestamp", "retweet_timestamp", "retweet_with_comment_timestamp",
    "like_timestamp"
]

from pyspark.sql import functions as f
#for feature in text_features:
#    text_feature_split = f.split(df[feature], '\t')
#    df = df.withColumn(feature, f.when(f.col(feature).isNotNull(), text_feature_split).otherwise(f.array().cast("array<string>")))

from pyspark.sql.types import IntegerType
for feature in numeric_features:
    df = df.withColumn(feature, f.col(feature).cast(IntegerType()))

for feature in id_features:
    output_col = feature + "_hashed"
    df = df.withColumn(output_col, (f.hash(f.col(feature))))
    df = df.withColumn(
        output_col,
        f.when(f.col(output_col) < 0,
               f.col(output_col) * -1 % 50).otherwise(f.col(output_col) % 50))

for col in label_columns:
    df = df.withColumn(col, f.when(f.col(col).isNotNull(), 1).otherwise(0))

##### Same preprocessing for validation (without label_columns transformation)
#for feature in text_features:
#    text_feature_split = f.split(df_val[feature], '\t')
#    df_val = df_val.withColumn(feature, f.when(f.col(feature).isNotNull(), text_feature_split).otherwise(f.array().cast("array<string>")))

for feature in numeric_features:
    df_val = df_val.withColumn(feature, f.col(feature).cast(IntegerType()))
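Example #12 folds f.hash into 50 buckets and then patches up negative hash values with a when/otherwise. A minimal alternative sketch using the SQL pmod function (the same function Example #5 calls in selectExpr), which always returns a non-negative remainder; the tweet_id column name is invented:

from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("a",), ("b",), ("c",)], ["tweet_id"])

# pmod(hash(x), 50) lands in [0, 50) even when hash(x) is negative
bucketed = df.withColumn("tweet_id_hashed", f.expr("pmod(hash(tweet_id), 50)"))
bucketed.show()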
Example #13
    def loadRefine(self):

        s3 = boto3.resource('s3')
        discoveryBucketNode = s3.Bucket(name=self.discoveryBucket)

        lastUpdatedDealerFile = self.findLastModifiedFile(
            discoveryBucketNode, self.prefixDealerDiscoveryPath,
            self.discoveryBucket)
        dfDealerCode = self.sparkSession.read.parquet(lastUpdatedDealerFile)

        dfDealerCode.withColumnRenamed("Company", "CompanyCode").\
            withColumnRenamed("DF", "AssociationType").withColumnRenamed("DCstatus", "AssociationStatus")\
            .registerTempTable("Dealer")

        dfStoreAss = self.sparkSession.sql(
            "select a.DealerCode,a.CompanyCode,a.AssociationType, a.AssociationStatus,"
            "a.TBLoc,a.SMFMapping,a.RankDescription from Dealer a where "
            "a.RankDescription='Open Standard' or a.RankDescription='SMF Only' or "
            "a.RankDescription='Bulk' or a.RankDescription Like '%Close%' or "
            "a.RankDescription Like 'Close%'")

        dfStoreAss = dfStoreAss.where(col('CompanyCode').like("Spring%"))
        dfStoreAsstemp = dfStoreAss
        dfOpenFil = dfStoreAss.filter(dfStoreAss.RankDescription != 'SMF Only')

        #########################################################################################################
        # Transformation starts #
        #########################################################################################################

        dfStoreAssOpen = dfStoreAsstemp.\
            withColumn('StoreNumber', when((dfStoreAsstemp.RankDescription != 'SMF Only'), dfStoreAsstemp.TBLoc).
                       otherwise(dfStoreAsstemp.SMFMapping)).withColumn('dealercode1', dfStoreAsstemp.DealerCode).\
            withColumn('AssociationType1', when((dfStoreAsstemp.RankDescription != 'SMF Only'), 'Retail').
                       otherwise('SMF')).\
            withColumn('AssociationStatus1', when((dfStoreAsstemp.RankDescription == 'SMF Only') |
                                                  (dfStoreAsstemp.RankDescription == 'Open Standard') |
                                                  (dfStoreAsstemp.RankDescription == 'Bulk'), 'Active').
                       otherwise('Closed')).drop(dfStoreAsstemp.DealerCode).\
            drop(dfStoreAsstemp.AssociationType).drop(dfStoreAsstemp.AssociationStatus).\
            select(col('StoreNumber'), col('dealercode1').alias('DealerCode'),
                   col('AssociationType1').alias('AssociationType'),
                   col('AssociationStatus1').alias('AssociationStatus'), col('TBLoc'), col('SMFMapping'),
                   col('RankDescription'), col('CompanyCode'))

        dfStoreAssOpen.registerTempTable("storeAss1")

        #########################################################################################################
        # Code for new entry fields for Open Standard #
        #########################################################################################################

        dfOpenFilNewField = dfOpenFil.withColumn('StoreNumber', dfOpenFil.SMFMapping). \
            withColumn('dealercode1', dfOpenFil.DealerCode). \
            withColumn('AssociationType1', lit('SMF')). \
            withColumn('AssociationStatus1', lit('Active')). \
            drop(dfOpenFil.DealerCode). \
            drop(dfOpenFil.AssociationType). \
            drop(dfOpenFil.AssociationStatus). \
            select(col('StoreNumber'), col('dealercode1').alias('DealerCode'), col('AssociationType1').
                   alias('AssociationType'),
                   col('AssociationStatus1').alias('AssociationStatus'),
                   col('TBLoc'), col('SMFMapping'), col('RankDescription'), col('CompanyCode'))

        dfOpenFilNewField.registerTempTable("storeAss2")

        #########################################################################################################
        # Code for union of two dataframes#
        #########################################################################################################

        joined_DF = self.sparkSession.sql(
            "select cast(StoreNumber as integer),DealerCode, 4 as CompanyCode,"
            "AssociationType,AssociationStatus from storeAss1 union select StoreNumber"
            ",DealerCode,4 as CompanyCode,AssociationType,AssociationStatus "
            "from storeAss2")

        joined_DF.registerTempTable("store_assoc_source")
        self.sparkSession.sql("select StoreNumber, DealerCode, CompanyCode, AssociationType, AssociationStatus from"
                              " store_assoc_source ").withColumn("Hash_Column", hash("StoreNumber", "DealerCode",
                                                                                     "CompanyCode", "AssociationType",
                                                                                     "AssociationStatus")).\
            registerTempTable("store_assoc_curr")

        refinedBucketNode = s3.Bucket(name=self.refinedBucket)
        storeAssocPrevRefinedPath = self.findLastModifiedFile(
            refinedBucketNode, self.prefixStoreAssocPath, self.refinedBucket)

        if storeAssocPrevRefinedPath != '':
            self.sparkSession.read.parquet(storeAssocPrevRefinedPath).\
                withColumn("Hash_Column", hash("StoreNumber", "DealerCode", "CompanyCode", "AssociationType",
                                               "AssociationStatus")).\
                registerTempTable("store_assoc_prev")

            self.sparkSession.sql("select a.StoreNumber, a.DealerCode, a.CompanyCode, a.AssociationType, "
                                  "a.AssociationStatus from store_assoc_prev a left join store_assoc_curr b on "
                                  "a.StoreNumber = b.StoreNumber where a.Hash_Column = b.Hash_Column").\
                registerTempTable("store_assoc_no_change_data")

            dfStoreUpdated = self.sparkSession.sql(
                "select a.StoreNumber, a.DealerCode, a.CompanyCode, "
                "a.AssociationType, a.AssociationStatus from store_assoc_curr a"
                " left join store_assoc_prev b on a.StoreNumber = b.StoreNumber"
                " where a.Hash_Column <> b.Hash_Column")
            updateRowsCount = dfStoreUpdated.count()
            dfStoreUpdated.registerTempTable("store_assoc_updated_data")

            dfStoreNew = self.sparkSession.sql(
                "select a.StoreNumber, a.DealerCode, a.CompanyCode, a.AssociationType,"
                " a.AssociationStatus from store_assoc_curr a left join "
                "store_assoc_prev b on a.StoreNumber = b.StoreNumber where "
                "b.StoreNumber = null")
            newRowsCount = dfStoreNew.count()
            dfStoreNew.registerTempTable("store_assoc_new_data")

            if updateRowsCount > 0 or newRowsCount > 0:
                dfStoreWithCDC = self.sparkSession.sql(
                    "select StoreNumber, DealerCode, CompanyCode, AssociationType, "
                    "AssociationStatus from store_assoc_no_change_data union "
                    "select StoreNumber, DealerCode, CompanyCode, AssociationType,"
                    " AssociationStatus from store_assoc_updated_data union "
                    "select StoreNumber, DealerCode, CompanyCode, AssociationType,"
                    " AssociationStatus from store_assoc_new_data")
                self.log.info("Updated file has arrived..")
                dfStoreWithCDC.coalesce(1).write.mode("overwrite").parquet(
                    self.storeAssociationWorkingPath)
                dfStoreWithCDC.coalesce(1).withColumn("year", year(from_unixtime(unix_timestamp()))).\
                    withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).\
                    write.mode("append").partitionBy('year', 'month').format('parquet').\
                    save(self.storeAssociationPartitonPath)
            else:
                self.log.info(
                    " The prev and current files are same. So no file will be generated in refined bucket."
                )
        else:
            self.log.info(
                " This is the first transaformation call, So keeping the file in refined bucket."
            )
            joined_DF.coalesce(1).write.mode("overwrite").parquet(
                self.storeAssociationWorkingPath)
            joined_DF.coalesce(1).withColumn("year", year(from_unixtime(unix_timestamp()))).\
                withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).\
                write.mode('append').partitionBy('year', 'month').format('parquet').\
                save(self.storeAssociationPartitonPath)
            self.sparkSession.stop()
Example #14
    def loadRefined(self):

        self.sparkSession.read.parquet(
            self.dealerCodeIn).registerTempTable("AttDealerCode")

        dfDealerCode = self.sparkSession.sql(
            "select a.dealercode,'4' as companycode,a.dcorigin as dealercodeorigin,"
            "a.dfcode,a.dcstatus,case when a.df = 'No' then '0' when a.df = 'Yes' then '1'"
            " when a.df = 'Off' then '0' when a.df = 'False' then '0' when a.df = 'On' then '1' "
            "when a.df = 'True' then '1' when a.df = 'DF' then '1' end as dfindicator,a.candc,a.opendate,a.closedate,"
            "case when a.ws = 'No' then '0' when a.ws = 'Yes' then '1'"
            " when a.ws = 'Off' then '0' when a.ws = 'False' then '0' when a.ws = 'On' then '1' when"
            " a.ws = 'True' then '1' end as whitestoreindicator,a.wsexpires as whitestoreexpirationdate,"
            "a.sortingrank as sortrank,a.rankdescription,a.storeorigin,"
            "a.acquisitionorigin as origin,a.businessexpert,a.footprintlevel,"
            "a.location as attlocationextendedname,a.attlocationid,a.attlocationname,a.disputemkt as attdisputemarket,"
            "a.attmktabbrev as attmarketcode, a.attmarketname as attmarket,"
            "a.oldcode as olddealercode, "
            "a.oldcode2 as olddealercode2,"
            "a.attregion, a.notes, a.notes2 from AttDealerCode a")
        # to change company code to -1 ##

        FinalDF1 = dfDealerCode.na.fill({
            'companycode': -1,
        })

        # to filter null value records for column dealercode ##

        FinalDF2 = FinalDF1.where(FinalDF1.dealercode != '')

        # dropping duplicates ##
        FinalDF = FinalDF2.dropDuplicates(['dealercode'])
        previousAttDealerCodeFile = self.findLastModifiedFile(
            self.refinedBucketNode, self.prefixAttDealerCodeRefinePath,
            self.refinedBucket)
        if previousAttDealerCodeFile != '':
            FinalDF.withColumn(
                "Hash_Column",
                hash("dealercode", "companycode", "dealercodeorigin", "dfcode",
                     "dcstatus", "dfindicator", "candc", "opendate",
                     "closedate", "whitestoreindicator",
                     "whitestoreexpirationdate", "sortrank", "rankdescription",
                     "storeorigin", "origin", "businessexpert",
                     "footprintlevel", "attlocationextendedname",
                     "attlocationid", "attlocationname", "attdisputemarket",
                     "attmarketcode", "attmarket", "olddealercode",
                     "olddealercode2", "attregion", "notes",
                     "notes2")).registerTempTable("att_dealer_code_curr")

            self.sparkSession.read.parquet(
                previousAttDealerCodeFile).withColumn(
                    "Hash_Column",
                    hash("dealercode", "companycode", "dealercodeorigin",
                         "dfcode", "dcstatus", "dfindicator", "candc",
                         "opendate", "closedate", "whitestoreindicator",
                         "whitestoreexpirationdate", "sortrank",
                         "rankdescription", "storeorigin", "origin",
                         "businessexpert", "footprintlevel",
                         "attlocationextendedname", "attlocationid",
                         "attlocationname", "attdisputemarket",
                         "attmarketcode", "attmarket", "olddealercode",
                         "olddealercode2", "attregion", "notes",
                         "notes2")).registerTempTable("att_dealer_code_prev")

            self.sparkSession.sql("select a.dealercode, a.companycode, a.dealercodeorigin, a.dfcode, a.dcstatus, "
                                  "a.dfindicator, a.candc, a.opendate, a.closedate, a.whitestoreindicator, "
                                  "a.whitestoreexpirationdate, a.sortrank, a.rankdescription, a.storeorigin, a.origin, "
                                  "a.businessexpert, a.footprintlevel, a.attlocationextendedname, a.attlocationid, "
                                  "a.attlocationname, a.attdisputemarket, a.attmarketcode, a.attmarket, "
                                  "a.olddealercode, a.olddealercode2, a.attregion, a.notes, a.notes2 from "
                                  "att_dealer_code_prev a left join att_dealer_code_curr b on "
                                  "a.dealercode = b.dealercode where a.Hash_Column = b.Hash_Column").\
                registerTempTable("att_dealer_no_change_data")

            dfAttDealerCodeUpdated = self.sparkSession.sql(
                "select a.dealercode, a.companycode, a.dealercodeorigin, a.dfcode, a.dcstatus, a.dfindicator, a.candc,"
                "a.opendate, a.closedate, a.whitestoreindicator, a.whitestoreexpirationdate, a.sortrank, "
                "a.rankdescription, a.storeorigin, a.origin, a.businessexpert, a.footprintlevel, "
                "a.attlocationextendedname, a.attlocationid, a.attlocationname, a.attdisputemarket, a.attmarketcode,"
                "a.attmarket, a.olddealercode, a.olddealercode2, a.attregion, a.notes, a.notes2 from "
                "att_dealer_code_curr a left join att_dealer_code_prev b on a.dealercode = b.dealercode "
                "where a.Hash_Column <> b.Hash_Column")
            updateRowsCount = dfAttDealerCodeUpdated.count()
            dfAttDealerCodeUpdated.registerTempTable("att_dealer_updated_data")

            dfAttDealerCodeNew = self.sparkSession.sql(
                "select a.dealercode, a.companycode, a.dealercodeorigin, a.dfcode, a.dcstatus, a.dfindicator, a.candc,"
                "a.opendate, a.closedate, a.whitestoreindicator, a.whitestoreexpirationdate, a.sortrank, "
                "a.rankdescription, a.storeorigin, a.origin, a.businessexpert, a.footprintlevel, "
                "a.attlocationextendedname, a.attlocationid, a.attlocationname, a.attdisputemarket, a.attmarketcode,"
                "a.attmarket, a.olddealercode, a.olddealercode2, a.attregion, a.notes, a.notes2 from "
                "att_dealer_code_curr a left join att_dealer_code_prev b on a.dealercode = b.dealercode "
                "where b.dealercode = null")
            newRowsCount = dfAttDealerCodeNew.count()
            dfAttDealerCodeNew.registerTempTable("att_dealer_new_data")

            if updateRowsCount > 0 or newRowsCount > 0:
                self.sparkSession.sql(
                    "select * from att_dealer_no_change_data union "
                    "select * from att_dealer_updated_data union "
                    "select * from att_dealer_new_data").registerTempTable(
                        "att_dealer_cdc")
                self.log.info("Updated file has arrived..")
                FinalDF = self.sparkSession.sql(
                    "select dealercode, companycode, dealercodeorigin, dfcode, dcstatus,"
                    " dfindicator, candc, opendate, closedate, whitestoreindicator, "
                    "whitestoreexpirationdate, sortrank, rankdescription, storeorigin, "
                    "origin, businessexpert, footprintlevel, attlocationextendedname, "
                    "attlocationid, attlocationname, attdisputemarket, attmarketcode, "
                    "attmarket, olddealercode, olddealercode2, attregion, notes, notes2"
                    " from att_dealer_cdc")
                FinalDF.coalesce(1).select("*").write.mode(
                    "overwrite").parquet(self.dealerCodeOutput + '/' +
                                         'Working')
                FinalDF.coalesce(1).select("*").write.mode("overwrite").csv(
                    self.dealerCodeOutput + '/csv', header=True)

                FinalDF.coalesce(1).withColumn("year", year(from_unixtime(unix_timestamp()))). \
                    withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).select("*").write.mode(
                    "append").partitionBy('year', 'month').format('parquet').save(self.dealerCodeOutput)
            else:
                self.sparkSession.createDataFrame(self.sparkSession.sparkContext.emptyRDD()).write.mode("overwrite")\
                    .csv(self.dealerCodeOutput + '/' + 'Working')
                self.log.info(
                    "The prev and current files same.So zero size delta file generated in delivery bucket."
                )

        else:
            #########################################################################################################
            # write output in parquet file #
            #########################################################################################################
            self.log.info(
                " This is the first transaformation call, So keeping the file in refined bucket."
            )
            FinalDF.coalesce(1).select("*").write.mode("overwrite").parquet(
                self.dealerCodeOutput + '/' + 'Working')
            FinalDF.coalesce(1).select("*").write.mode("overwrite").csv(
                self.dealerCodeOutput + '/csv', header=True)
            FinalDF.coalesce(1).withColumn("year", year(from_unixtime(unix_timestamp()))).\
                withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).select("*").write.mode("append").\
                partitionBy('year', 'month').format('parquet').save(
                self.dealerCodeOutput)

        self.sparkSession.stop()
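Examples #13 and #14 both build a Hash_Column over every business column of the current and previous snapshots and compare the fingerprints to classify rows as unchanged, updated, or new. A condensed, hedged sketch of that change-detection idea with invented tables, using DataFrame joins instead of temp views and a left_anti join for the new-row case:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
prev = spark.createDataFrame([(1, "Active"), (2, "Closed")],
                             ["StoreNumber", "AssociationStatus"])
curr = spark.createDataFrame([(1, "Active"), (2, "Active"), (3, "Active")],
                             ["StoreNumber", "AssociationStatus"])

p = prev.withColumn("Hash_Column", F.hash("StoreNumber", "AssociationStatus")).alias("p")
c = curr.withColumn("Hash_Column", F.hash("StoreNumber", "AssociationStatus")).alias("c")

# unchanged: same key, same fingerprint; updated: same key, different fingerprint
unchanged = c.join(p, "StoreNumber").where(F.col("c.Hash_Column") == F.col("p.Hash_Column"))
updated = c.join(p, "StoreNumber").where(F.col("c.Hash_Column") != F.col("p.Hash_Column"))
# new: keys absent from the previous snapshot
new_rows = c.join(p, "StoreNumber", "left_anti")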
Example #15
 def _assert_df_equal(df_1, df_2):
     df_1 = df_1.select(F.hash(*df_1.columns).alias("hash")).orderBy("hash")
     df_2 = df_2.select(F.hash(*df_2.columns).alias("hash")).orderBy("hash")
     assert df_1.subtract(df_2).count() == 0
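A hedged usage sketch of _assert_df_equal with two toy DataFrames, assuming a local session and F as pyspark.sql.functions; each row is reduced to a single hash, so the comparison ignores row order:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
left = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])
right = spark.createDataFrame([(2, "b"), (1, "a")], ["id", "val"])

_assert_df_equal(left, right)  # passes: same rows, different order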
Example #16
def main():
    spark = SparkSession \
        .builder \
        .appName("RedditRecommender") \
        .getOrCreate()

    output_file = open('als-output.txt', 'w')

    data = spark.read.json('./sample_data.json')

    cols_to_keep = [data.author, data.id, data.subreddit]
    data = data.select(*cols_to_keep)
    data = data.filter(data.author != "[deleted]")

    @udf("boolean")
    def isNotDefault(x):
        defaultSubs = [
            "Art", "AskReddit", "DIY", "Documentaries", "EarthPorn",
            "Futurology", "GetMotivated", "IAmA", "InternetIsBeautiful",
            "Jokes", "LifeProTips", "Music", "OldSchoolCool", "Showerthoughts",
            "UpliftingNews", "announcements", "askscience", "aww", "blog",
            "books", "creepy", "dataisbeautiful", "explainlikeimfive", "food",
            "funny", "gadgets", "gaming", "gifs", "history", "listentothis",
            "mildlyinteresting", "movies", "news", "nosleep", "nottheonion",
            "personalfinance", "philosophy", "photoshopbattles", "pics",
            "science", "space", "sports", "television", "tifu",
            "todayilearned", "videos", "worldnews"
        ]
        return x not in defaultSubs

    data = data.filter(isNotDefault(data.subreddit))

    data = data.groupBy([data.author,
                         data.subreddit]).count().orderBy(data.author)
    data = data.withColumn('author_id', hash(data.author))
    data = data.withColumn('subreddit_id', hash(data.subreddit))

    (training, test) = data.randomSplit([0.8, 0.2])

    als = ALS(maxIter=5,
              rank=70,
              regParam=0.01,
              userCol="author_id",
              itemCol="subreddit_id",
              ratingCol="count",
              coldStartStrategy="drop",
              implicitPrefs=True)
    model = als.fit(training)

    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="count",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    output_file.write('Root mean squared error: ' + str(rmse) + '\n\n')

    users = data.select(als.getUserCol()).distinct().limit(30)
    user_subset_recs = model.recommendForUserSubset(users, 10)

    subreddit_recs = {}

    for row in user_subset_recs.collect():
        author = get_author_from_id(data, row['author_id'])
        subreddit_recs[author] = []
        for rec in row['recommendations']:
            subreddit_recs[author].append(get_subreddit_from_id(data, rec[0]))

    for author in subreddit_recs.keys():
        output_file.write('Top 10 recommendations for user ' + author + ':\n')
        for rec in subreddit_recs[author]:
            output_file.write(rec)
            output_file.write('\n')
        output_file.write('\n')
import os

common_path = '/data/MobodMovieLens/train'

ratings = spark.read.csv(os.path.join(common_path, 'ratings.csv'), header=True).cache()
movies = spark.read.csv(os.path.join(common_path, 'movies.csv'), header=True).cache()

ratings_tmp = (
    ratings
    .withColumn(
        'order',
        f.row_number().over(Window.partitionBy('userId').orderBy('timestamp')) /
        f.count('*').over(Window.partitionBy('userId'))
    )
    .withColumn('hash', f.abs(f.hash('userId')) % 211)
)
ratings_train_A = (
    ratings_tmp
    .filter((f.col('hash') > 0) & (f.col('hash') <= 105) & (f.col('order') < 0.905))
    .drop('order', 'hash')
    .cache()
)
ratings_train_B = (
    ratings_tmp
    .filter((f.col('hash') > 106) & (f.col('order') < 0.905))
    .drop('order', 'hash')
    .cache()
)
ratings_dev = (
    ratings_tmp