def test_DIN_sum():
    """Smoke-test DIN built with ``use_din=False``.

    Trains on the toy data from ``get_xy_fd``, then round-trips the weights
    and the whole model through ``.h5`` files written to the current
    working directory. Progress is reported with ``print``.
    """
    model_name = "DIN_sum"
    weights_file = model_name + '_weights.h5'
    model_file = model_name + '.h5'

    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()
    model = DIN(
        feature_dim_dict,
        behavior_feature_list,
        hist_len_max=4,
        embedding_size=8,
        use_din=False,
        hidden_size=[4, 4, 4],
        keep_prob=0.6,
        activation="sigmoid",
    )
    model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )

    # Step 1: a short training run with half of the samples held out.
    model.fit(x, y, verbose=1, validation_split=0.5)
    print(model_name + " test train valid pass!")

    # Step 2: weights-only round trip.
    model.save_weights(weights_file)
    model.load_weights(weights_file)
    print(model_name + " test save load weight pass!")

    # Step 3: full-model round trip; custom_objects is passed to load_model
    # exactly as in the original call.
    save_model(model, model_file)
    model = load_model(model_file, custom_objects)
    print(model_name + " test save load model pass!")

    print(model_name + " test pass!")
def test_DIN_att():
    """Smoke-test DIN built with ``use_din=True``.

    Trains on the toy data from ``get_xy_fd`` and round-trips the weights
    through an ``.h5`` file in the current working directory.
    """
    model_name = "DIN_att"
    weights_file = model_name + '_weights.h5'

    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()
    model = DIN(
        feature_dim_dict,
        behavior_feature_list,
        hist_len_max=4,
        embedding_size=8,
        use_din=True,
        hidden_size=[4, 4, 4],
        keep_prob=0.6,
    )
    model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )

    model.fit(x, y, verbose=1, validation_split=0.5)
    print(model_name + " test train valid pass!")

    model.save_weights(weights_file)
    model.load_weights(weights_file)
    print(model_name + " test save load weight pass!")

    # NOTE: a full save_model/load_model round trip was left disabled here by
    # the original author, whose note says saving the model fails when Dice
    # is used.
    print(model_name + " test pass!")
def test_DIN_att():
    """Train a ``use_din=True`` DIN model and verify a weights round trip.

    Uses the toy data from ``get_xy_fd``; the weights file is written to the
    current working directory.
    """
    model_name = "DIN_att"
    weights_path = model_name + '_weights.h5'

    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()
    din_options = dict(
        hist_len_max=4,
        embedding_size=8,
        use_din=True,
        hidden_size=[4, 4, 4],
        keep_prob=0.6,
    )
    model = DIN(feature_dim_dict, behavior_feature_list, **din_options)
    model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )

    # Short training run with half of the samples used for validation.
    model.fit(x, y, verbose=1, validation_split=0.5)
    print(model_name + " test train valid pass!")

    # Weights-only persistence check.
    model.save_weights(weights_path)
    model.load_weights(weights_path)
    print(model_name + " test save load weight pass!")

    # NOTE: the original carried a disabled try/except around
    # save_model/load_model with a remark that saving fails when Dice is
    # used; the full-model round trip is therefore not exercised here.
    print(model_name + " test pass!")
def test_DIN_model_io():
    """Check that a compiled DIN model using ``Dice`` can be saved and loaded.

    The model is never fitted; only the save_model/load_model round trip
    through an ``.h5`` file in the current working directory is exercised.
    """
    model_name = "DIN_att"
    model_file = model_name + '.h5'

    # Only the feature description is needed; the samples are discarded.
    _, _, feature_dim_dict, behavior_feature_list = get_xy_fd()

    model = DIN(
        feature_dim_dict,
        behavior_feature_list,
        hist_len_max=4,
        embedding_size=8,
        att_activation=Dice,
        use_din=True,
        hidden_size=[4, 4, 4],
        keep_prob=0.6,
    )
    model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )

    # Training is deliberately skipped (the fit call was disabled upstream).
    save_model(model, model_file)
    model = load_model(model_file, custom_objects)
    print(model_name + " test save load model pass!")
def test_DIN_model_io():
    """Check that a compiled, unfitted DIN model survives save_model/load_model.

    The model file is written to the current working directory.
    """
    name = "DIN_att"
    model_file = name + '.h5'

    # The samples themselves are unused because no training happens here.
    _, _, feature_dim_dict, behavior_feature_list = get_xy_fd()

    din_options = dict(
        hist_len_max=4,
        embedding_size=8,
        use_din=True,
        hidden_size=[4, 4, 4],
        keep_prob=0.6,
    )
    model = DIN(feature_dim_dict, behavior_feature_list, **din_options)
    model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )

    # Training is deliberately skipped (the fit call was disabled upstream).
    save_model(model, model_file)
    model = load_model(model_file, custom_objects)
    print(name + " test save load model pass!")
def example_din():
    """Fit a DIN model on the data returned by ``get_xy_from_txt``.

    Workflow notes carried over from the original author (translated):

    1. Generate training data as txt, with comma-separated fields.
    2. Convert it to tfrecord.
    3. Read the data, distinguishing dense, sparse, VarLenSparse and
       user-behavior-sequence features.
    4. Feed each of them to the model and see what happens.

    Only the final fit is performed here; the result of ``model.fit`` is
    printed and nothing is returned.
    """
    # behavior_feature_list names the columns of the current item that attend
    # over the historical behavior. The data can be read as before; the names
    # of the columns that need attention just have to be collected in a list.
    # (get_xy_fd() was the data source before it was swapped out here.)
    x, y, feature_columns, behavior_feature_list = get_xy_from_txt()

    model = DIN(feature_columns, behavior_feature_list)

    # NOTE(review): categorical_accuracy is paired with a binary
    # cross-entropy loss -- confirm this metric is the intended one.
    tracked_metrics = [keras.metrics.AUC(), keras.metrics.categorical_accuracy]
    model.compile(
        'adam',
        keras.losses.binary_crossentropy,
        metrics=tracked_metrics,
    )

    # The original also experimented with feeding a tf.data.Dataset built
    # from x.values / y.values; those variants were left disabled.
    history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
    print("history: ", history)
def test_DIN_sum():
    """Build a DIN model with ``use_din=False`` and hand it to ``check_model``."""
    model_name = "DIN_sum"
    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()

    din_options = dict(
        hist_len_max=4,
        embedding_size=8,
        use_din=False,
        hidden_size=[4, 4, 4],
        keep_prob=0.6,
        activation="sigmoid",
    )
    model = DIN(feature_dim_dict, behavior_feature_list, **din_options)

    # check_model receives the model, its name and the toy samples.
    check_model(model, model_name, x, y)
def get_xy_fd():
    """Build a three-sample toy dataset for the DIN example.

    Returns:
        tuple: ``(x, y, feature_dim_dict, behavior_feature_list)`` where

        * ``x`` is a list of NumPy arrays ordered as: sparse features,
          dense features, then one ``hist_<name>`` array per behavior
          feature;
        * ``y`` is a NumPy array holding the three binary labels;
        * ``feature_dim_dict`` maps ``"sparse"`` / ``"dense"`` to lists of
          ``SingleFeat`` descriptors;
        * ``behavior_feature_list`` lists the sparse features that have a
          matching ``hist_`` sequence.
    """
    feature_dim_dict = {
        "sparse": [
            SingleFeat('user', 3),
            SingleFeat('gender', 2),
            # +1 because id 0 is the mask value in the sequences below.
            SingleFeat('item', 3 + 1),
            SingleFeat('item_gender', 2 + 1),
        ],
        "dense": [SingleFeat('score', 0)],
    }
    behavior_feature_list = ["item", "item_gender"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    # Behavior sequences, right-padded with the mask value 0.
    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score,
    }

    # Input order: sparse features, dense features, behavior sequences.
    x = (
        [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]]
        + [feature_dict[feat.name] for feat in feature_dim_dict["dense"]]
        + [feature_dict['hist_' + feat] for feat in behavior_feature_list]
    )
    # Labels are an ndarray like every feature: Keras documents array/tensor
    # targets for fit(), and a plain list is not a supported target type.
    y = np.array([1, 0, 1])
    return x, y, feature_dim_dict, behavior_feature_list


if __name__ == "__main__":
    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()
    model = DIN(feature_dim_dict, behavior_feature_list, hist_len_max=4)
    model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )
    history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]]) seq_length = np.array([3, 3, 2]) # the actual length of the behavior sequence feature_dict = { 'user': uid, 'gender': ugender, 'item_id': iid, 'cate_id': cate_id, 'hist_item_id': hist_iid, 'hist_cate_id': hist_cate_id, 'pay_score': pay_score, 'seq_length': seq_length } x = { name: feature_dict[name] for name in get_feature_names(feature_columns) } y = np.array([1, 0, 1]) return x, y, feature_columns, behavior_feature_list if __name__ == "__main__": x, y, feature_columns, behavior_feature_list = get_xy_fd() model = DIN(feature_columns, behavior_feature_list) # model = BST(feature_columns, behavior_feature_list,att_head_num=4) model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy']) history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
elif column == 'action_type': feature_columns += [SparseFeat(column, 4 + 1, embedding_dim=dim)] else: feature_columns += [DenseFeat(column, 1)] # maxlen为历史信息的长度,vocabulary_size为onehot的长度 feature_columns += [ VarLenSparseFeat(sparsefeat=SparseFeat('hist_merchant_id', vocabulary_size=1993, embedding_dim=8, embedding_name='merchant_id'), maxlen=M), VarLenSparseFeat(sparsefeat=SparseFeat('hist_action_type', vocabulary_size=4, embedding_dim=4, embedding_name='action_type'), maxlen=M)] history_features = ['merchant_id', 'action_type'] print(len(feature_columns)) # 使用DIN模型 model = DIN(feature_columns, history_features) # 使用Adam优化器,二分类的交叉熵 model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy']) # model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"]) # 组装train_model_input,得到feature names,将train_X转换为字典格式 feature_names = list(train_X.columns) train_model_input = {name: train_X[name].values for name in get_feature_names(feature_columns)} print("########################################") # histroy输入必须是二维数组 from tqdm import tqdm for fea in ['hist_merchant_id', 'hist_action_type']: list = [] for i in tqdm(train_model_input[fea]):
sess_len_max = SESS_MAX_LEN BATCH_SIZE = 1024 sess_feature = ['item_id'] # def auc(y_true,y_pred): # return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double) EMBEDDING_SIZE = int(ebs) if EMBEDDING_SIZE == 0: EMBEDDING_SIZE = "auto" model = DIN(fd, sess_feature, embedding_size=EMBEDDING_SIZE, dnn_dropout=float(dnn_dropout), att_activation='dice', att_weight_normalization=False, hist_len_max=sess_len_max, dnn_hidden_units=(200, 80), att_hidden_size=( 64, 16, )) model.compile('adagrad', 'binary_crossentropy', metrics=['binary_crossentropy']) model_dir = "../model_dir_" + str(EMBEDDING_SIZE) if not os.path.exists(model_dir): os.mkdir(model_dir) if os.path.exists(model_dir + '/ckpt.h5'): model.load_weights(model_dir + '/ckpt.h5') """ test_input_pos = pd.read_pickle(
user_age = np.array([1, 2, 3]) user_gender = np.array([0, 1, 0]) item_id = np.array([0, 1, 2]) item_gender = np.array([0, 1, 0]) # multi-value feature input hist_item_id = np.array([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 0]]) hist_item_gender = np.array([[0, 1, 0, 1], [0, 1, 1, 1], [0, 0, 1, 0]]) # valid length of behavior sequence of every sample hist_length = np.array([4, 4, 3]) feature_dict = {'user_age': user_age, 'user_gender': user_gender, 'item_id': item_id, 'item_gender': item_gender, 'hist_item_id': hist_item_id, 'hist_item_gender': hist_item_gender, } x = [feature_dict[feat] for feat in feature_dim_dict["sparse"]] + \ [feature_dict['hist_'+feat] for feat in behavior_feature_list] + [hist_length] # Notice the concatenation order: single feature + multi-value feature + length # Since the length of the historical sequences of different features in DIN are the same(they are all extended from item_id),only one length vector is enough. y = [1, 0, 1] return x, y, feature_dim_dict, behavior_feature_list if __name__ == "__main__": x, y, feature_dim_dict, behavior_feature_list = get_xy_fd() model = DIN(feature_dim_dict, behavior_feature_list, hist_len_max=4,) model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy']) history = model.fit(x, y, verbose=1, validation_split=0.5)
elif sys.argv[1] == 'PNN_UDG': model = PNN_UDG(dnn_feature_columns, untrainable_features_columns, (200, 80), uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5])) elif sys.argv[1] == 'PNN': model = PNN(dnn_feature_columns, untrainable_features_columns, (200, 80)) elif sys.argv[1] == 'WDL': model = WDL(linear_feature_columns, dnn_feature_columns, [], (200, 80)) elif sys.argv[1] == 'WDL_UDG': model = WDL_UDG(linear_feature_columns, dnn_feature_columns, untrainable_features_columns, (200, 80), uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5])) elif sys.argv[1] == 'DIEN': model = DIEN(fixlen_feature_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0, gru_type="AUGRU", use_negsampling=True) elif sys.argv[1] == 'DIEN_UDG': model = DIEN_UDG(fixlen_feature_columns, untrainable_features_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0, gru_type="AUGRU", use_negsampling=True, uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5])) elif sys.argv[1] == 'DIN': model = DIN(fixlen_feature_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0) elif sys.argv[1] == 'DIN_UDG': model = DIN_UDG(fixlen_feature_columns, untrainable_features_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0, uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5])) if sys.argv[4] == 'focal': model.compile("adam", loss=focal_loss, metrics=['binary_crossentropy'], ) else: model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], ) init_lr = float(tf.keras.backend.get_value(model.optimizer.learning_rate)) lr = [init_lr, init_lr/2, init_lr/4] history_all = {} max_auc, min_log, min_rmse, max_rig = 0, 0, 0, 0 for x in range(epoch): tf.keras.backend.set_value(model.optimizer.lr, lr[x]) history = CustomCallback() model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=1,
test_label = label[test_idx] sess_len_max = SESS_MAX_LEN BATCH_SIZE = 4096 sess_feature = ['cate_id', 'brand'] TEST_BATCH_SIZE = 2**17 REG = 1e-6 model = DIN(fd, sess_feature, embedding_size=4, att_activation='dice', att_weight_normalization=False, hist_len_max=sess_len_max, dnn_hidden_units=(200, 80), att_hidden_size=( 64, 16, ), l2_reg_embedding=REG, seed=2019) model.compile('adagrad', 'binary_crossentropy', metrics=[ 'binary_crossentropy', ]) hist_ = model.fit( train_input[:],
def buildModel(self):
    """Create the DIN model from the encoder's columns and compile it.

    Reads ``self.encoder.getFeatureColumns()`` and
    ``self.encoder.behavior_list``; stores the model on ``self.model``.
    """
    encoder = self.encoder
    columns = encoder.getFeatureColumns()
    self.model = DIN(columns, encoder.behavior_list)
    # Adam optimizer, binary cross-entropy as both loss and tracked metric.
    self.model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )
class Trainer:
    """Load click data, build negative samples, train a DIN model, persist it.

    The attributes each step reads imply this call order: ``loadData`` ->
    ``preProcessData`` -> ``buildModel`` -> ``train`` -> ``dump_model`` /
    ``dump_encoder``.
    """

    # Label column: 1 for rows loaded from the CSV, 0 for generated negatives.
    LABEL_COLUMN = 'result'

    def __init__(self):
        self.data = None
        self.encoder = None
        self.model = None
        # number of positive samples
        self.num_pos = None
        self.recipeDomain = None

    def loadData(self, url: str) -> None:
        """Read the CSV at ``url`` and create and train a fresh encoder.

        Both ``self.data`` and ``self.recipeDomain`` start from the same
        CSV content; every loaded row counts as a positive sample.
        """
        self.data = pd.read_csv(url)
        # Same content as a second read_csv(url), without parsing it twice.
        self.recipeDomain = self.data.copy()
        self.num_pos = self.data.shape[0]
        self.encoder = ModelEncoder()
        self.encoder.train()

    def preProcessData(self) -> None:
        """Encode both frames, label the positives and add negative samples."""
        self.data = self.encoder.encode(self.data)
        self.data[self.LABEL_COLUMN] = [1] * self.num_pos
        self.recipeDomain = self.encoder.encode(self.recipeDomain)
        self.build_negative_data()

    def buildModel(self) -> None:
        """Create the DIN model from the encoder's columns and compile it."""
        feature_columns = self.encoder.getFeatureColumns()
        self.model = DIN(feature_columns, self.encoder.behavior_list)
        self.model.compile(
            'adam',
            'binary_crossentropy',
            metrics=['binary_crossentropy'],
        )

    def train(self):
        """Fit the model on ``self.data`` and return what ``fit`` returns.

        The label is read from the ``'result'`` column, the only label this
        class creates (in ``preProcessData`` / ``build_negative_data``). The
        original read a ``'gt'`` column that nothing here ever populates.
        """
        feature_names = [
            feat
            for feat, _ in (self.encoder.fixed_sparse_dict
                            + self.encoder.var_sparse_dict)
        ]
        model_input = {feat: self.data[feat] for feat in feature_names}
        return self.model.fit(
            model_input,
            self.data[self.LABEL_COLUMN].values,
            batch_size=256,
            epochs=10,
            verbose=2,
            validation_split=0.2,
            shuffle=True,
        )

    def dump_model(self, path: str) -> None:
        """Save the trained model to ``path`` via ``save_model``."""
        save_model(self.model, path)

    def dump_encoder(self, path: str) -> None:
        """Pickle the encoder to the file at ``path`` (protocol 4).

        The original passed ``self`` as the file object and ignored
        ``path``, which can never work; the file is now opened explicitly.
        """
        with open(path, 'wb') as handle:
            pickle.dump(self.encoder, handle, protocol=4)

    def update(self, url: str = '') -> None:
        """Reload the recipe domain from ``url`` and re-encode it.

        ``url`` defaults to the empty placeholder the original hard-coded;
        callers are expected to pass a real location. The encoded frame is
        now stored, matching ``preProcessData``; it used to be discarded.
        """
        self.recipeDomain = self.encoder.encode(pd.read_csv(url))

    @staticmethod
    def _clicked_recipes(record) -> set:
        """Return the recipe ids of one sample: its history plus its own recipe."""
        history = record['hist_recipe']
        # Assumes hist_recipe is an iterable of encoded recipe ids; a scalar
        # or string is treated as a single id -- TODO confirm against the
        # output of ModelEncoder.encode.
        if isinstance(history, (str, bytes)) or not hasattr(history, '__iter__'):
            clicked = {history}
        else:
            clicked = set(history)
        clicked.add(record['recipe'])
        return clicked

    def build_negative_data(self) -> None:
        """Append one negative row per (positive sample, unclicked recipe) pair.

        A negative row is a copy of the positive sample in which every
        column that also exists in ``self.recipeDomain`` is taken from the
        unclicked recipe and ``'result'`` is set to 0. The combined frame
        replaces ``self.data`` with a fresh 0..n-1 index.
        """
        # to_dict('records') keeps each column's own value type, which
        # row-wise Series access would coerce to a common dtype.
        positives = self.data.iloc[:self.num_pos].to_dict('records')
        candidates = self.recipeDomain.to_dict('records')
        shared = [
            col for col in self.data.columns
            if col in self.recipeDomain.columns
        ]

        negatives = []
        for record in positives:
            clicked = self._clicked_recipes(record)
            for candidate in candidates:
                if candidate['recipe'] in clicked:
                    continue
                # Copy so the positive row and earlier negatives stay intact.
                negative = dict(record)
                for feat in shared:
                    negative[feat] = candidate[feat]
                negative[self.LABEL_COLUMN] = 0
                negatives.append(negative)

        if negatives:
            # One concat instead of per-row DataFrame.append, which returned
            # a new frame (silently dropped here) and is gone in pandas 2.x.
            self.data = pd.concat(
                [self.data, pd.DataFrame(negatives)],
                ignore_index=True,
            )