def data_preprocess(csv_file):
    """Read a ratings CSV and build the inputs for a DeepCTR model.

    Label-encodes the categorical columns in place, then derives one
    SparseFeat spec per column (vocabulary size = number of distinct values).

    :param csv_file: path to the ratings *.csv file
    :return: (model-input dict of column name -> pandas Series,
              linear feature columns, dnn feature columns)
    """
    frame = pd.read_csv(csv_file)
    categorical = ["movie_id", "gender", "age"]
    label_cols = ['rating']  # unused here; kept for parity with the training pipeline

    # 1. Map raw categorical values onto contiguous integer ids.
    for col in categorical:
        frame[col] = LabelEncoder().fit_transform(frame[col])

    # 2. One SparseFeat per field, sized by its cardinality, e.g.
    #    SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4,
    #               use_hash=False, dtype='int32', embedding_name='movie_id',
    #               group_name='default_group')
    specs = [SparseFeat(col, frame[col].nunique()) for col in categorical]
    linear_cols = specs
    dnn_cols = specs

    # Dict of movie_id / gender / age values keyed by declared feature name.
    model_input = {nm: frame[nm] for nm in get_feature_names(linear_cols + dnn_cols)}
    return model_input, linear_cols, dnn_cols
def data_preprocess(csv_file):
    """Read a ratings CSV (with one-hot genre columns) and build model inputs.

    Label-encodes movie_id/gender/age, then treats the 18 genre indicator
    columns as additional sparse fields (their values are consumed as-is,
    without re-encoding).

    :param csv_file: path to the ratings *.csv file
    :return: (model-input dict of column name -> pandas Series,
              linear feature columns, dnn feature columns)
    """
    frame = pd.read_csv(csv_file)
    categorical = ["movie_id", "gender", "age"]
    genre_cols = [
        'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western'
    ]
    label_cols = ['rating']  # unused here; kept for parity with the training pipeline

    # 1. Map raw categorical values onto contiguous integer ids.
    for col in categorical:
        frame[col] = LabelEncoder().fit_transform(frame[col])

    # Genre columns join the sparse set only AFTER encoding — they are
    # deliberately not label-encoded.
    categorical.extend(genre_cols)

    # 2. One SparseFeat per field, sized by its cardinality, e.g.
    #    SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4,
    #               use_hash=False, dtype='int32', embedding_name='movie_id',
    #               group_name='default_group')
    specs = [SparseFeat(col, frame[col].nunique()) for col in categorical]
    linear_cols = specs
    dnn_cols = specs

    model_input = {nm: frame[nm] for nm in get_feature_names(linear_cols + dnn_cols)}
    return model_input, linear_cols, dnn_cols
def fit_test(self, train_X, train_Y, val_X, val_Y, test_X, test_Y, cat_cols):
    """Train an xDeepFM binary classifier and print the test-set log-loss.

    :param train_X/val_X/test_X: 2-D feature arrays; columns are indexed
        0..n_features-1 and addressed by their stringified index.
    :param train_Y/val_Y/test_Y: label vectors (binary task).
    :param cat_cols: column indices to treat as categorical (embedded);
        every other column is a dense feature.
    """
    sparse_features = cat_cols
    dense_features = [
        idx for idx in range(train_X.shape[1]) if idx not in cat_cols
    ]
    # +1 leaves room for one extra id. NOTE(review): vocabulary is sized from
    # the TRAIN split only — ids seen only in val/test could overflow the
    # embedding table; confirm columns are encoded over the full dataset.
    sparse_feature_columns = [
        SparseFeat(str(feat),
                   vocabulary_size=len(set(train_X[:, feat])) + 1,
                   embedding_dim=4)
        for feat in sparse_features  # fix: dropped unused enumerate() index
    ]
    dense_feature_columns = [
        DenseFeat(str(feat), 1, ) for feat in dense_features
    ]
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns
    linear_feature_columns = sparse_feature_columns + dense_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    # Feature names are stringified column indices, so int(name) recovers
    # the original column.
    train_model_input = {name: train_X[:, int(name)] for name in feature_names}
    val_model_input = {name: val_X[:, int(name)] for name in feature_names}
    test_model_input = {name: test_X[:, int(name)] for name in feature_names}
    # Bug fix: default to CPU so self.device is always defined — previously it
    # was only assigned inside the CUDA branch, raising AttributeError on
    # CPU-only machines.
    self.device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        self.device = 'cuda:0'
    self.model = xDeepFM(linear_feature_columns, dnn_feature_columns,
                         task='binary', device=self.device)
    self.model.compile(
        Adam(self.model.parameters(), 0.0001),
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )
    es = EarlyStopping(monitor='val_binary_crossentropy', min_delta=0,
                       verbose=1, patience=30, mode='min')
    # Labels are label-encoded once on train and reused for val so both
    # splits share the same mapping.
    lbe = LabelEncoder()
    self.model.fit(train_model_input, lbe.fit_transform(train_Y),
                   batch_size=512, epochs=21, verbose=2,
                   validation_data=(val_model_input, lbe.transform(val_Y)),
                   # Bug fix: the EarlyStopping callback was constructed but
                   # never passed to fit(), so it never fired.
                   callbacks=[es])
    pred_ans = self.model.predict(test_model_input, batch_size=256)
    print(f'{log_loss(test_Y, pred_ans):.5f}')
def get_xy_fd(hash_flag=False):
    """Construct a four-sample toy dataset for behavior-sequence models.

    :param hash_flag: forwarded to every base SparseFeat's use_hash.
    :return: (x, y, feature_columns, behavior_feature_list) where x maps each
        declared feature name to its numpy array and the hist_* features carry
        zero-padded behavior sequences whose true lengths sit in "seq_length".
    """
    base_columns = [
        SparseFeat('user', 4, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1),
    ]
    # History sequences share embedding tables with their base features via
    # embedding_name; maxlen=4 matches the padded arrays below.
    history_columns = [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                                    embedding_dim=8, embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', vocabulary_size=2 + 1,
                                    embedding_dim=4, embedding_name='cate_id'),
                         maxlen=4, length_name="seq_length"),
    ]
    feature_columns = base_columns + history_columns
    behavior_feature_list = ["item_id", "cate_id"]

    feature_dict = {
        'user': np.array([0, 1, 2, 3]),
        'gender': np.array([0, 1, 0, 1]),
        'item_id': np.array([1, 2, 3, 2]),  # 0 is mask value
        'cate_id': np.array([1, 2, 1, 2]),  # 0 is mask value
        'hist_item_id': np.array([[1, 2, 3, 0], [1, 2, 3, 0],
                                  [1, 2, 0, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 1, 2, 0], [2, 1, 1, 0],
                                  [2, 1, 0, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3, 0.2]),
        "seq_length": np.array([3, 3, 2, 2]),  # unpadded history lengths
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1, 0])
    return x, y, feature_columns, behavior_feature_list
def __init__(self, stage, action):
    """
    Configure a Wide&Deep estimator wrapper for one pipeline stage and action.

    :param stage: String. Including "online_train"/"offline_train"/"evaluate"/"submit"
    :param action: String. Including "read_comment"/"like"/"click_avatar"/"favorite"/"forward"/"comment"/"follow"
    """
    super(WideAndDeep, self).__init__()
    # One training epoch per action; tune per action if needed.
    self.num_epochs_dict = {"read_comment": 1, "like": 1, "click_avatar": 1, "favorite": 1,
                            "forward": 1, "comment": 1, "follow": 1}
    # Built lazily elsewhere; None until then.
    self.estimator = None
    self.stage = stage
    self.action = action
    # Feature columns come from the class's own get_feature_columns();
    # the name list feeds input-fn construction.
    self.dnn_feature_columns, self.linear_feature_columns = self.get_feature_columns()
    self.feature_names = get_feature_names(self.dnn_feature_columns + self.linear_feature_columns)
    # NOTE(review): tf.logging is the TF1.x API — confirm the project pins TF1
    # or uses tf.compat.v1.
    tf.logging.set_verbosity(tf.logging.INFO)
def get_xy_fd():
    """Build a three-sample toy dataset for DIN-style attention models.

    :return: (x, y, feature_columns, behavior_feature_list) where x maps each
        declared feature name to its numpy array; hist_* features are
        zero-padded to maxlen 4 with true lengths in "seq_length".
    """
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=8),
        SparseFeat('gender', 2, embedding_dim=8),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=8),
        DenseFeat('score', 1),
        VarLenSparseFeat(SparseFeat('hist_item', 3 + 1, embedding_dim=8), 4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1, embedding_dim=8), 4,
                         length_name="seq_length"),
    ]
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
        "seq_length": np.array([3, 3, 2]),  # unpadded history lengths
    }
    x = {key: feature_dict[key] for key in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
def data_preprocess(data_df, train_csv='./recommend_data/data.csv'):
    """Encode inference-time features with the training-set encoders.

    Rebuilds the LabelEncoders and feature-column specs from the training CSV
    so that integer ids and vocabulary sizes match the saved model, then
    encodes a copy of `data_df` for prediction.

    :param data_df: DataFrame holding at least movie_id/gender/age columns.
    :param train_csv: path to the training CSV used to rebuild the encoders
        and feature columns (default preserves the previous hard-coded path,
        resolving the two TODOs).
    :return: (model-input dict of column name -> pandas Series,
              linear feature columns, dnn feature columns)
    """
    data = data_df.copy(deep=True)
    sparse_features = ["movie_id", "gender", "age"]
    target = ['rating']  # unused here; kept for parity with the training pipeline

    # 1. Rebuild the training-time LabelEncoders so ids match the saved model.
    encoder_list = get_train_LabelEncoder_info(train_csv)
    for feat in sparse_features:
        # Override encoders whose training data may not cover the whole
        # inference domain: age spans 1-100, gender is M/F.
        if feat == 'age':
            age_encoder = np.array([str(i) for i in range(1, 101)])
            encoder_list[feat].classes_ = age_encoder
        elif feat == 'gender':
            gender_encoder = np.array(['M', 'F'])
            encoder_list[feat].classes_ = gender_encoder
        data[feat] = encoder_list[feat].transform(data[feat])

    # 2. Feature-column specs must mirror training, e.g.
    #    SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4,
    #               use_hash=False, dtype='int32', embedding_name='movie_id',
    #               group_name='default_group')
    fixlen_feature_columns = get_train_fixlen_feature_columns(train_csv)
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)  # movie_id, gender, age

    # Dict of movie_id / gender / age values keyed by declared feature name.
    test_model_input = {name: data[name] for name in feature_names}
    return test_model_input, linear_feature_columns, dnn_feature_columns
# NOTE(review): fragment — the opening of the sparse_features list and the
# `glowpick` DataFrame / FILE_PATH constant are defined above this chunk;
# verify upstream before relying on this reconstruction.
"age",
"skin_type",
"idThirdCategory",
]
target = ['rating']  # label column; unused in this chunk

# 2.count #unique features for each sparse field and generate feature config for sequence feature
fixlen_feature_columns = [
    SparseFeat(feat, glowpick[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Load the linear_feature_columns list and the dnn_feature_columns list
# from pickles. NOTE(review): these loads overwrite the columns computed just
# above, making that computation dead work — confirm whether it is still needed.
with open(FILE_PATH + 'linear_feature_columns_list.pickle', 'rb') as fp:
    linear_feature_columns = pickle.load(fp)
with open(FILE_PATH + 'dnn_feature_columns_list.pickle', 'rb') as fp:
    dnn_feature_columns = pickle.load(fp)
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
train, test = train_test_split(glowpick, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}
def train_recommend_movies(csv_file, DEVICE):
    """Train an xDeepFM rating regressor and persist its weights.

    Model: "xDeepFM"; target: "rating"; inputs: ["movie_id", "gender", "age"].
    Weights are saved to "save_model/xDeepFM_MSE{test mse}.h5".

    :param csv_file: path to the ratings *.csv file
    :param DEVICE: torch device string used when CUDA is available, e.g. "cuda:0"
    """
    frame = pd.read_csv(csv_file)
    categorical = ["movie_id", "gender", "age"]
    target = ['rating']

    # 1. Map raw categorical values onto contiguous integer ids.
    for col in categorical:
        frame[col] = LabelEncoder().fit_transform(frame[col])

    # 2. One SparseFeat per field, sized by its cardinality, e.g.
    #    SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4,
    #               use_hash=False, dtype='int32', embedding_name='movie_id',
    #               group_name='default_group')
    specs = [SparseFeat(col, frame[col].nunique()) for col in categorical]
    linear_feature_columns = specs
    dnn_feature_columns = specs
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. 80/20 train/test split; model input is a name -> pandas Series dict.
    train, test = train_test_split(frame, test_size=0.2)
    train_input = {nm: train[nm] for nm in feature_names}
    test_input = {nm: test[nm] for nm in feature_names}

    # 4. Train on the requested device when CUDA is up, otherwise on CPU.
    device = 'cpu'
    if torch.cuda.is_available():
        print('cuda ready...')
        device = DEVICE
    model = xDeepFM(linear_feature_columns, dnn_feature_columns,
                    task='regression', device=device)
    model.compile("adam", "mse", metrics=['mse'], )
    model.fit(train_input, train[target].values, batch_size=256,
              epochs=10, verbose=2, validation_split=0.2, )

    preds = model.predict(test_input, batch_size=256)
    mse = round(mean_squared_error(test[target].values, preds), 4)
    print("test MSE", mse)
    torch.save(model.state_dict(), 'save_model/xDeepFM_MSE{}.h5'.format(mse))
def task(action):
    """Train MyDeepFM for one engagement `action` and return test-set predictions.

    :param action: target column name, e.g. "like" or "read_comment".
    :return: numpy array of predicted probabilities for the test file.
    """
    print('-----------action-----------', action)
    USE_FEAT = [action] + SELECT_FRTS
    train = pd.read_csv(ROOT_PATH + f'/train_data_for_{action}.csv')[USE_FEAT]
    # Shuffle once with a fixed seed so runs are reproducible.
    train = train.sample(frac=1, random_state=42).reset_index(drop=True)
    print("posi prop:")
    print(sum((train[action] == 1) * 1) / train.shape[0])
    test = pd.read_csv(ROOT_PATH + '/test_data.csv')[SELECT_FRTS]
    target = [action]
    # Test rows get a dummy label so train/test share one column layout.
    test[target[0]] = 0
    test = test[USE_FEAT]
    # Concatenate so encoders/scalers see the full value range of both splits.
    data = pd.concat((train, test)).reset_index(drop=True)
    print(train.shape, test.shape, data.shape)
    dense_features = DENSE_FEATURE
    sparse_features = [
        i for i in USE_FEAT if i not in dense_features and i not in target
    ]
    data[sparse_features] = data[sparse_features].fillna(0)
    data[dense_features] = data[dense_features].fillna(0)
    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])
    # 2.count #unique features for each sparse field,and record dense feature field name
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]
    # Linear and DNN towers share the same feature-column specs.
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    # 3.generate input data for model
    # RHS is evaluated before assignment, so train.shape[0] is the original
    # train length — this splits `data` back into its two halves.
    train, test = data.iloc[:train.shape[0]].reset_index(
        drop=True), data.iloc[train.shape[0]:].reset_index(drop=True)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    #-------
    # With eval_ratio == 0 the slice below is empty and no hold-out is used.
    eval_ratio = 0.
    eval_df = train[int((1 - eval_ratio) * train.shape[0]):].reset_index(drop=True)
    userid_list = eval_df['userid'].astype(str).tolist()
    print('val len:', len(userid_list))
    # 4.Define Model,train,predict and evaluate
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'
    model = MyDeepFM(linear_feature_columns=linear_feature_columns,
                     dnn_feature_columns=dnn_feature_columns,
                     use_fm=True,
                     dnn_hidden_units=(256, 128),
                     l2_reg_linear=1e-1,
                     l2_reg_embedding=0.00001,
                     l2_reg_dnn=0,
                     init_std=0.0001,
                     seed=1024,
                     dnn_dropout=0.,
                     dnn_activation='relu',
                     dnn_use_bn=False,
                     task='binary',
                     device=device)
    model.compile("adagrad", "binary_crossentropy", metrics=["auc"])
    history = model.fit(train_model_input,
                        train[target].values,
                        batch_size=1024,
                        epochs=NUM_EPOCH_DICT[action],
                        verbose=1,
                        validation_split=eval_ratio,
                        userid_list=userid_list)
    pred_ans = model.predict(test_model_input, 128)
    # Free GPU memory before the next action's model is built.
    torch.cuda.empty_cache()
    return pred_ans
def train_recommend_movies(csv_file, DEVICE):
    """
    Description:
        Train recommend system on:
            Model: "FiBiNET",
            Target: "rating",
            Input features: ["movie_id", "gender", "age"] plus the 18 binary
                movie-genre columns,
            Save model to: "./recommend_system/save_model/FiBiNET_MSE{}.h5"
    Parameters:
        csv_file: "path to *.csv"
        DEVICE: "cuda:0"
    """
    data = pd.read_csv(csv_file)
    sparse_features = ["movie_id", "gender", "age"]
    movie_genres = [
        'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western'
    ]
    target = ['rating']
    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    # Genre columns join the sparse set only AFTER encoding — they are
    # deliberately not label-encoded.
    sparse_features.extend(movie_genres)
    # 2.count #unique features for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    # Example of the resulting spec:
    # SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4,
    #            use_hash=False, dtype='int32', embedding_name='movie_id',
    #            group_name='default_group')
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns
    )  # movie_id, gender, age plus the genre columns
    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {
        name: test[name] for name in feature_names
    }  # dict of feature-name -> column values
    # 4.Define Model,train,predict and evaluate
    # NOTE(review): DEVICE is used directly without a cuda-availability check
    # (unlike the sibling xDeepFM trainer) — confirm callers pass 'cpu' when
    # no GPU is present.
    model = FiBiNET(linear_feature_columns,
                    dnn_feature_columns,
                    task='regression',
                    device=DEVICE)
    model.compile(
        "adam",
        "mse",
        metrics=['mse'],
    )
    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test MSE", round(mean_squared_error(test[target].values, pred_ans), 4))
    print("test MAE", round(mean_absolute_error(test[target].values, pred_ans), 4))
    # Persist weights; the test MSE is baked into the filename.
    torch.save(
        model.state_dict(),
        './recommend_system/save_model/FiBiNET_MSE{}.h5'.format(
            round(mean_squared_error(test[target].values, pred_ans), 4)))
# NOTE(review): fragment — `uid`, `feature_columns` and `behavior_feature_list`
# are defined above this chunk; verify upstream before relying on this text.
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])
# Behavior histories, zero-padded to length 4.
hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
feature_dict = {
    'user': uid,
    'gender': ugender,
    'item': iid,
    'item_gender': igender,
    'hist_item': hist_iid,
    'hist_item_gender': hist_igender,
    'score': score
}
# Model input keyed by every declared feature name.
x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
y = np.array([[1], [0], [1]])
# DIN with Dice activations in both the DNN and the attention unit.
model = DIN([], feature_columns, behavior_feature_list, hist_len_max=4,
            device='cpu', dnn_activation='dice', att_activation='dice')
model.compile('adagrad', 'binary_crossentropy',
              metrics=['binary_crossentropy'])
model.fit(x, y, batch_size=32, epochs=3, validation_split=0.0, verbose=1)