コード例 #1
0
def apply_reference_action_type(df):
    """Split the ``reference`` column into one column per action type.

    For each known ``action_type`` a new column
    ``reference_<clean_filename(action_type)>`` is added, holding the
    row's ``reference`` when that row's action type matches and the
    sentinel string ``"<none>"`` otherwise.

    :param df: pandas DataFrame with ``reference`` and ``action_type`` columns.
    :return: the same DataFrame, mutated in place with the new columns.
    """
    action_types = [
        "interaction item image",
        "search for poi",
        "interaction item rating",
        "clickout item",
        "interaction item deals",
        "change of sort order",
        "search for item",
        "search for destination",
        "filter selection",
        "interaction item info",
    ]
    for action_type in action_types:
        # Vectorized equivalent of the original per-row df.apply(axis=1):
        # keep `reference` where the action type matches, else "<none>".
        df["reference_" + clean_filename(action_type)] = df["reference"].where(
            df["action_type"] == action_type, "<none>"
        )

    return df
コード例 #2
0
 def output(self):
     """Targets for the indexed train set and indexed item metadata CSVs."""
     base_dir = os.path.join(DATASET_DIR, clean_filename(self.filter_city))
     train_path = os.path.join(
         base_dir, "train_indexed__size=%d.csv" % self.sample_size)
     meta_path = os.path.join(
         base_dir, "item_metadata_indexed__size=%d.csv" % self.sample_size)
     return (luigi.LocalTarget(train_path), luigi.LocalTarget(meta_path))
コード例 #3
0
def apply_reference_action_type(df):
    """
    Split click type in columns.

    Adds one column per known ``action_type``: each new column
    ``reference_<clean_filename(action_type)>`` carries the row's
    ``reference`` value when that row's action type matches, and the
    sentinel ``"<none>"`` otherwise.

    :param df: pandas DataFrame with ``reference`` and ``action_type`` columns.
    :return: the same DataFrame, mutated in place with the new columns.
    """
    for action_type in (
            "interaction item image",
            "search for poi",
            "interaction item rating",
            "clickout item",
            "interaction item deals",
            "change of sort order",
            "search for item",
            "search for destination",
            "filter selection",
            "interaction item info",
    ):
        # Vectorized replacement for the ten repeated row-wise df.apply
        # calls: identical per-row result, one pass per action type.
        df['reference_' + clean_filename(action_type)] = df['reference'].where(
            df['action_type'] == action_type, "<none>")

    return df
コード例 #4
0
 def output(self):
     """Target for the aggregated+indexed train CSV for this city/size/window."""
     filename = "train__agg_indexed__size=%d_window=%d.csv" % (
         self.sample_size, self.window_hist)
     city_dir = os.path.join(DATASET_DIR, clean_filename(self.filter_city))
     return luigi.LocalTarget(os.path.join(city_dir, filename))
コード例 #5
0
 def output(self):
     """Targets for every index CSV (item, user, session, action type,
     platform, city, device), all under ``DATASET_DIR/<city>/``.

     Returns a tuple of ``luigi.LocalTarget``s, in the same order as the
     original backslash-continued return.
     """
     base_dir = os.path.join(DATASET_DIR, clean_filename(self.filter_city))
     names = [
         "item_indices", "user_indices", "session_indices",
         "action_type_indices", "platform_indices", "city_indices",
         "device_indices",
     ]
     # NOTE(review): the original wrote "city_indices__size=%d__csv"
     # ("__csv" instead of ".csv") — fixed so every index file shares the
     # same ".csv" extension.
     return tuple(
         luigi.LocalTarget(
             os.path.join(base_dir, "%s__size=%d.csv" % (name, self.sample_size)))
         for name in names)
コード例 #6
0
 def output(self):
     """Targets: transformed train set, text vocabulary, filter sessions."""
     base_dir = os.path.join(DATASET_DIR, clean_filename(self.filter_city))
     filenames = [
         "train_transform__size=%d.csv" % self.sample_size,
         "text_vocabulary__size=%d.csv" % self.sample_size,
         "filter_session_size=%d.csv" % self.sample_size,
     ]
     return tuple(
         luigi.LocalTarget(os.path.join(base_dir, name))
         for name in filenames)
コード例 #7
0
    def main(self, sc: SparkContext, *args):
        """Filter the raw interactions/metadata by city and sample the result.

        Reads the train and item-metadata CSVs from ``self.input()``,
        optionally restricts the train set to ``self.filter_city`` (or to
        the RecSys city list when it is 'recsys'), keeps only metadata rows
        whose item appears as a reference or inside an impression list,
        then keeps the ``self.sample_size`` most recent rows and writes
        both frames as CSV to ``self.output()``.
        """
        os.makedirs(os.path.join(DATASET_DIR,
                                 clean_filename(self.filter_city)),
                    exist_ok=True)

        spark = SparkSession(sc)
        train_df = spark.read.csv(self.input()[0].path,
                                  header=True,
                                  inferSchema=True)
        # "impressions" is a "|"-separated string; r"\|" is the same regex
        # as the original "\|" without the invalid-escape warning.
        train_df = train_df.withColumn("impressions_array",
                                       F.split(train_df.impressions, r"\|"))

        meta_df = spark.read.csv(self.input()[1].path,
                                 header=True,
                                 inferSchema=True)

        # Filter dataset by city (or by the RecSys challenge city list)
        if self.filter_city != 'all':
            if self.filter_city == 'recsys':
                train_df = train_df.filter(train_df.city.isin(RECSYS_CITIES))
            else:
                train_df = train_df.filter(train_df.city == self.filter_city)

            # Distinct references seen in the filtered interactions
            reference_df = train_df.select("reference").distinct()

            # Distinct items seen inside impression lists
            item_id_df = train_df.select(
                posexplode("impressions_array").alias(
                    "pos_item_idx",
                    "reference")).select("reference").distinct()
            item_id_df = item_id_df.union(reference_df).select(
                "reference").distinct()

            # Keep metadata only for items that actually occur above
            meta_df = meta_df.join(
                item_id_df, meta_df.item_id == item_id_df.reference).select(
                    "item_id", "properties")

        if self.sample_size > 0:
            # Most recent `sample_size` interactions
            train_df = train_df.sort("timestamp",
                                     ascending=False).limit(self.sample_size)

        # Save
        train_df.toPandas().to_csv(self.output()[0].path, index=False)
        meta_df.toPandas().to_csv(self.output()[1].path, index=False)
コード例 #8
0
 def dataset_dir(self) -> str:
     """Directory under DATASET_DIR holding this task's city-specific files."""
     city_dir = clean_filename(self.filter_city)
     return os.path.join(DATASET_DIR, city_dir)
コード例 #9
0
    def main(self, sc: SparkContext, *args):
        """Transform the interactions dataset and tokenize its text columns.

        Steps:
          1. load the train CSV and add a monotonically increasing ``idx``
             row id, used later to re-join the tokenized columns;
          2. parse the "|"-separated ``impressions``/``prices`` strings
             into int/float arrays;
          3. expand ``reference`` into one column per action type
             (``"<none>"`` when the row's action type does not match);
          4. split out the filter-session data via
             ``self.split_columns_filter``;
          5. fit a ``StaticTokenizerEncoder`` on the free-text reference
             columns and replace them with token-id lists.

        Writes three outputs: transformed train set, tokenizer vocabulary,
        and the filter-session data.
        """
        os.makedirs(DATASET_DIR, exist_ok=True)

        spark = SparkSession(sc)
        print("Load Data...")

        # Load
        df = spark.read.csv(self.input()[0].path,
                            header=True,
                            inferSchema=True)  # .limit(500000)
        df = df.withColumn("idx", F.monotonically_increasing_id())

        print("Transform Interactions data...")

        def to_int_array(x):
            # "|"-separated ints; empty string or None becomes [].
            # (`or 0` only fires when int(i) == 0, so it is a no-op kept
            # for byte-compatible behavior with the original.)
            return [] if x == "" or x is None else [
                int(i) or 0 for i in x.split("|")
            ]

        to_int_array_udf = udf(to_int_array, ArrayType(IntegerType()))

        def to_float_array(x):
            # "|"-separated floats; empty string or None becomes [].
            return [] if x == "" or x is None else [
                float(i) or 0 for i in x.split("|")
            ]

        to_float_array_udf = udf(to_float_array, ArrayType(FloatType()))

        df = df.\
            withColumn("impressions", to_int_array_udf(col('impressions'))).\
            withColumn("prices", to_float_array_udf(col('prices')))

        def to_reference_action(action_type, text, reference):
            # Keep the reference only for the matching action type.
            return reference if action_type == text else "<none>"

        to_reference_action_udf = udf(to_reference_action, StringType())

        for ref in [
                "interaction item image", "search for poi",
                "interaction item rating", "clickout item",
                "interaction item deals", "change of sort order",
                "search for item", "search for destination",
                "filter selection", "interaction item info"
        ]:
            df = df.\
                withColumn('reference_'+clean_filename(ref),
                           to_reference_action_udf(col('action_type'), lit(ref), col('reference')))

        print("Split filter session data...")
        df, df_filters = self.split_columns_filter(df)

        print(df.columns)
        print("Tokenizer reference search...")
        # Transform columns with text
        columns_with_string = [
            "reference_search_for_poi", "reference_change_of_sort_order",
            "reference_search_for_destination", "reference_filter_selection"
        ]

        df_text = df.select(["idx"] + columns_with_string).toPandas()

        # Vocabulary: seed with the sentinel so it is always encodable.
        vocab = ["<none>"]
        for c in columns_with_string:
            df_text[c] = df_text[c].fillna("<none>")
            vocab += df_text[c].tolist()

        # Tokenizer (r'\W+' is the same regex as the original '\W+'
        # without the invalid-escape warning).
        tokenizer = StaticTokenizerEncoder(
            vocab,
            tokenize=lambda x: re.split(r'\W+', x),
            min_occurrences=10,
            reserved_tokens=[])
        df_vocabulary = pd.DataFrame(tokenizer.vocab, columns=['vocabulary'])

        # Apply tokenizer
        for text_column in columns_with_string:
            df_text[text_column] = tokenizer.batch_encode(
                df_text[text_column])[0].cpu().detach().numpy().tolist()
            # NOTE(review): assumes batch_encode pads all rows to equal
            # length — _max_words is taken from row 0; confirm upstream.
            df_text[text_column + '_max_words'] = len(df_text[text_column][0])

        df_text = spark.createDataFrame(df_text)
        df = df.drop(*columns_with_string).join(df_text, ['idx'])

        # Save
        df.toPandas().to_csv(self.output()[0].path, index=False)
        df_vocabulary.to_csv(self.output()[1].path)
        df_filters.toPandas().to_csv(self.output()[2].path)
        return
コード例 #10
0
 def output(self):
     """Target for the transformed item-metadata CSV."""
     filename = "item_metadata_transform__size=%d.csv" % self.sample_size
     city_dir = os.path.join(DATASET_DIR, clean_filename(self.filter_city))
     return luigi.LocalTarget(os.path.join(city_dir, filename))