import logging
import os

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import utils

logger = logging.getLogger(__name__)


def prepare_datasets(data_dir):
    deviceinfo = utils.prepare_device_related_datasets(data_dir)

    # Count the number of events per hour for each device (ephpd)
    ephpd = utils.prepare_events_per_hour_per_device_dataset(data_dir)

    # Events spread over 6 windows/splits through the day
    esd = utils.prepare_events_spread_dataset(data_dir)

    # Read the training & test datasets
    train = utils.read_gz(data_dir, "gender_age_train.csv.gz")
    test = utils.read_gz(data_dir, "gender_age_test.csv.gz")

    # Merge train and test with the events per hour per device dataset
    # (ephpd); devices without events get 0 instead of NaN
    train = pd.merge(train, ephpd, how="left")
    test = pd.merge(test, ephpd, how="left")
    for col in ephpd.columns.values:
        train[col] = train[col].fillna(0)
        test[col] = test[col].fillna(0)

    # Merge train and test with the events spread dataset (esd)
    train = pd.merge(train, esd, how="left")
    test = pd.merge(test, esd, how="left")
    for col in esd.columns.values:
        train[col] = train[col].fillna(0)
        test[col] = test[col].fillna(0)

    # Merge train and test with a subset of columns of the device info dataset
    df2 = deviceinfo[["device_id", "phone_brand_id",
                      "is_foreign_brand", "device_model_id"]].copy()
    df2 = df2.drop_duplicates(subset=["device_id"], keep="last")
    train = pd.merge(train, df2, how="left", on="device_id")
    test = pd.merge(test, df2, how="left", on="device_id")

    # Drop the raw hour-of-day columns (h0..h23); the label columns
    # ("gender", "age") exist only in train
    hour_of_day_cols = ["h" + str(x) for x in range(24)]
    cols_to_drop = list(hour_of_day_cols)
    test.drop(cols_to_drop, axis=1, inplace=True)
    test.fillna(-1, inplace=True)
    cols_to_drop.extend(["gender", "age"])
    train.drop(cols_to_drop, axis=1, inplace=True)

    # Extract the target and fill the remaining missing values
    target = train["group"].values
    train = train.drop(["group"], axis=1)
    train.fillna(-1, inplace=True)

    logger.info("train.columns : {}".format(list(train.columns.values)))
    logger.info(train.head())
    return train, test, target
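
# For reference, a minimal sketch of the utils.read_gz helper both functions
# assume. This is a hypothetical reconstruction, not the project's actual
# implementation; the real utils module may differ (e.g. in dtype handling).
def _read_gz_sketch(data_dir, filename):
    # pandas infers gzip decompression from the ".gz" suffix
    return pd.read_csv(os.path.join(data_dir, filename))
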
def prepare_datasets(data_dir):
    # Bag-of-Apps features, based on
    # https://www.kaggle.com/xiaoml/talkingdata-mobile-user-demographics/
    # bag-of-app-id-python-2-27392/code

    # Read app events and collapse each event into a space-separated
    # bag of "app_id:<id>" tokens
    app_events = utils.read_gz(data_dir, "app_events.csv.gz")
    app_events = app_events.groupby("event_id")["app_id"].apply(
        lambda x: " ".join(set("app_id:" + str(s) for s in x)))

    # Read events and attach the bag of app ids to each event
    events = pd.read_csv(os.path.join(data_dir, "events.csv.gz"),
                         dtype={"device_id": str})
    events["app_id"] = events["event_id"].map(app_events)
    events = events.dropna()
    del app_events

    # Deduplicate the app-id tokens per device
    events = events[["device_id", "app_id"]]
    events = events.groupby("device_id")["app_id"]\
        .apply(lambda x: " ".join(set(" ".join(str(s) for s in x).split(" "))))
    events = events.reset_index(name="app_id")

    # Expand to one row per (device_id, app_id) pair
    events = pd.concat([pd.Series(row["device_id"], row["app_id"].split(" "))
                        for _, row in events.iterrows()]).reset_index()
    events.columns = ["app_id", "device_id"]

    # Read phone brand / device model, keeping one row per device
    pbd = pd.read_csv(os.path.join(data_dir,
                                   "phone_brand_device_model.csv.gz"),
                      dtype={"device_id": str})
    pbd.drop_duplicates("device_id", keep="first", inplace=True)

    # Read train and test
    train = pd.read_csv(os.path.join(data_dir, "gender_age_train.csv.gz"),
                        dtype={"device_id": str})
    train.drop(["age", "gender"], axis=1, inplace=True)
    test = pd.read_csv(os.path.join(data_dir, "gender_age_test.csv.gz"),
                       dtype={"device_id": str})
    test["group"] = np.nan

    # Encode the target labels
    Y = train["group"]
    label_group = LabelEncoder()
    Y = label_group.fit_transform(Y)

    # Concat train and test before concatenating the features
    # (phone_brand, device_model and app_id)
    df_all = pd.concat((train, test), axis=0, ignore_index=True)
    df_all = pd.merge(df_all, pbd, how="left", on="device_id")
    df_all["phone_brand"] = df_all["phone_brand"]\
        .apply(lambda x: "phone_brand:" + str(x))
    df_all["device_model"] = df_all["device_model"]\
        .apply(lambda x: "device_model:" + str(x))

    # Collect (device_id, feature) pairs from each feature source
    f1 = df_all[["device_id", "phone_brand"]]\
        .rename(columns={"phone_brand": "feature"})
    f2 = df_all[["device_id", "device_model"]]\
        .rename(columns={"device_model": "feature"})
    f3 = events[["device_id", "app_id"]]\
        .rename(columns={"app_id": "feature"})
    del df_all

    FLS = pd.concat((f1, f2, f3), axis=0, ignore_index=True)
    FLS = FLS.reset_index()

    # Build the sparse user-item (device x feature) matrix, then drop
    # all-zero columns
    device_ids = FLS["device_id"].unique()
    feature_cs = FLS["feature"].unique()
    data = np.ones(len(FLS))
    device_id_enc = LabelEncoder().fit(FLS["device_id"])
    row = device_id_enc.transform(FLS["device_id"])
    col = LabelEncoder().fit_transform(FLS["feature"])
    sparse_matrix = sparse.csr_matrix(
        (data, (row, col)), shape=(len(device_ids), len(feature_cs)))
    sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]
    logger.info("sparse_matrix {}".format(sparse_matrix.shape))

    # Slice out the rows belonging to the train and test devices
    train_row = device_id_enc.transform(train["device_id"])
    train_sp = sparse_matrix[train_row, :]
    test_row = device_id_enc.transform(test["device_id"])
    test_sp = sparse_matrix[test_row, :]

    # cfg is assumed to be a module-level configuration dict
    random_state = cfg["common"]["seed"]
    X_train, X_val, y_train, y_val = train_test_split(
        train_sp, Y, train_size=0.80, random_state=random_state)

    # Feature selection: keep the 23% most informative columns (ANOVA F-test)
    selector = SelectPercentile(f_classif, percentile=23)
    selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    X_val = selector.transform(X_val)
    train_sp = selector.transform(train_sp)
    test_sp = selector.transform(test_sp)
    logger.info("# Num of Features: {}".format(X_train.shape[1]))

    return X_train, X_val, y_train, y_val, test_sp
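
# Hypothetical usage sketch (not part of the source): feed the sparse splits
# into a simple multinomial baseline and check the validation log loss, the
# competition metric. The data_dir path and the hyperparameters below are
# illustrative assumptions only.
if __name__ == "__main__":
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import log_loss

    logging.basicConfig(level=logging.INFO)
    data_dir = "data"  # assumed location of the competition files
    X_train, X_val, y_train, y_val, test_sp = prepare_datasets(data_dir)

    clf = LogisticRegression(C=0.02, max_iter=200)
    clf.fit(X_train, y_train)
    logger.info("validation log loss: {:.4f}".format(
        log_loss(y_val, clf.predict_proba(X_val))))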