def get_xy_fd(hash_flag=False):
    """Assemble a small hand-written dataset for behavior-sequence models.

    Returns a tuple ``(x, y, feature_columns, behavior_feature_list)`` where
    ``x`` maps feature names to numpy arrays, ``y`` holds the binary labels,
    ``feature_columns`` declares every model input, and
    ``behavior_feature_list`` names the features that carry a history.
    """
    # Fixed-length columns: three categorical ids plus one dense score.
    feature_columns = [
        SparseFeat('user', 4, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1),
    ]
    # Behavior-history columns share the embedding tables of their base
    # features (via embedding_name) and one length input named "seq_length".
    feature_columns += [
        VarLenSparseFeat(
            SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item_id'),
            maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(
            SparseFeat('hist_cate_id', vocabulary_size=2 + 1, embedding_dim=4,
                       embedding_name='cate_id'),
            maxlen=4, length_name="seq_length"),
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    # Raw input arrays, one row per sample; 0 is the padding/mask value in
    # the history matrices.
    feature_dict = {
        'user': np.array([0, 1, 2, 3]),
        'gender': np.array([0, 1, 0, 1]),
        'item_id': np.array([1, 2, 3, 2]),  # 0 is mask value
        'cate_id': np.array([1, 2, 1, 2]),  # 0 is mask value
        'hist_item_id': np.array([[1, 2, 3, 0], [1, 2, 3, 0],
                                  [1, 2, 0, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 1, 2, 0], [2, 1, 1, 0],
                                  [2, 1, 0, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3, 0.2]),
        "seq_length": np.array([3, 3, 2, 2]),  # true (unpadded) history lengths
    }

    # Keep exactly the inputs that the declared feature columns require.
    x = {}
    for feature_name in get_feature_names(feature_columns):
        x[feature_name] = feature_dict[feature_name]
    y = np.array([1, 0, 1, 0])
    return x, y, feature_columns, behavior_feature_list
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1,
                  dense_feature_num=1, sequence_feature=('sum', 'mean', 'max'),
                  classification=True, include_length=False, hash_flag=False,
                  prefix=''):
    """Generate random model inputs and matching feature columns for tests.

    Args:
        sample_size: number of rows to generate.
        embedding_size: embedding dimension for every sparse/varlen feature.
        sparse_feature_num: number of plain sparse features.
        dense_feature_num: number of dense features.
        sequence_feature: combiner modes for variable-length features; the
            special entry 'weight' adds a weighted sequence feature instead.
        classification: if True labels are 0/1 ints, else uniform floats.
        include_length: if True, also emit an explicit sequence-length input
            per variable-length feature.
        hash_flag: unused here; kept for signature compatibility.
        prefix: string prepended to every generated feature name.

    Returns:
        (model_input dict, label array, feature_columns list)
    """
    # Work on a private copy. The original signature used a mutable default
    # list and popped 'weight' out of it in place, which corrupted the default
    # for every subsequent call and mutated caller-owned lists.
    sequence_feature = list(sequence_feature)

    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        # Weighted sequence feature gets its own value, weight and length inputs.
        feature_columns.append(
            VarLenSparseFeat(
                SparseFeat(prefix + "weighted_seq", vocabulary_size=2,
                           embedding_dim=embedding_size),
                maxlen=3,
                length_name=prefix + "weighted_seq" + "_seq_length",
                weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)
        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))

    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)  # random vocabulary size per feature
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim,
                       embedding_size, dtype=torch.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=torch.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(
                SparseFeat(prefix + 'sequence_' + mode, vocabulary_size=dim,
                           embedding_dim=embedding_size),
                maxlen=maxlen, combiner=mode))

    # Fill model_input with random data matching each declared column.
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.vocabulary_size, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length:
                # NOTE(review): `i` here is the stale index left over from the
                # enumerate() loop above, so with several sequence features all
                # length inputs collapse onto one key, and any weighted_seq
                # column is re-generated by this branch. Preserved as-is —
                # confirm intent before using include_length with more than
                # one sequence feature.
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) + '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
def get_test_data(sample_size=1000, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'), classification=True,
                  include_length=False, hash_flag=False, prefix=''):
    """Generate random test inputs and feature columns (legacy API variant).

    Uses the legacy positional SparseFeat/VarLenSparseFeat constructors and
    returns inputs as a positional list rather than a name-keyed dict.

    Args:
        sample_size: number of rows to generate.
        sparse_feature_num: number of plain sparse features.
        dense_feature_num: number of dense features.
        sequence_feature: combiner modes for variable-length features.
        classification: if True labels are 0/1 ints (with at least 30%
            positives), else uniform floats.
        include_length: if True, append per-sequence length inputs and their
            length feature columns.
        hash_flag: forwarded as the sparse features' hash flag.
        prefix: string prepended to every generated feature name.

    Returns:
        (x input list, label array, feature_columns list)
    """
    feature_columns = []
    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)  # random vocabulary size per feature
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, hash_flag, torch.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, torch.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(prefix + 'sequence_' + str(i), dim, maxlen, mode))

    # Inputs are kept positional: fixed-length first, then sequences.
    model_input = []
    sequence_input = []
    sequence_len_input = []
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input.append(np.random.randint(0, fc.dimension, sample_size))
        elif isinstance(fc, DenseFeat):
            model_input.append(np.random.random(sample_size))
        else:
            s_input, s_len_input = gen_sequence(fc.dimension, fc.maxlen, sample_size)
            sequence_input.append(s_input)
            sequence_len_input.append(s_len_input)

    if classification:
        y = np.random.randint(0, 2, sample_size)
        # Resample until at least 30% of the labels are positive, so metrics
        # like AUC are well-defined in tests.
        while sum(y) < 0.3 * sample_size:
            y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    x = model_input + sequence_input

    if include_length:
        for i, mode in enumerate(sequence_feature):
            # Fixed: the original drew two unused random ints (dim, maxlen)
            # here on every iteration — dead code consuming RNG draws.
            feature_columns.append(
                SparseFeat(prefix + 'sequence_' + str(i) + '_seq_length', 1,
                           embedding=False))
        x += sequence_len_input

    return x, y, feature_columns
def get_xy_fd():
    """Build a tiny hand-crafted DIN-style dataset.

    Returns ``(x, y, feature_columns, behavior_feature_list)``: model inputs
    keyed by feature name, binary labels, the feature-column definitions, and
    the names of the features that carry a behavior history.
    """
    # Raw inputs, one row per sample; 0 is the padding/mask value in the
    # history matrices.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),          # 0 is mask value
        'item_gender': np.array([1, 2, 1]),   # 0 is mask value
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
        "seq_length": np.array([3, 3, 2]),    # true (unpadded) history lengths
    }

    # Fixed-length columns followed by the behavior-history columns; the two
    # histories share one length input named "seq_length".
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=8),
        SparseFeat('gender', 2, embedding_dim=8),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=8),
        DenseFeat('score', 1),
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item', 3 + 1, embedding_dim=8), 4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1, embedding_dim=8), 4,
                         length_name="seq_length"),
    ]
    behavior_feature_list = ["item", "item_gender"]

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
def build_model(
    self,
    embedding_dim=4,
    task='binary',
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=None,
    device='cpu',
):
    """Construct and compile a DeepFM model from the recorded feature metadata.

    Args:
        embedding_dim: embedding dimension shared by all sparse features.
        task: DeepFM task type, e.g. 'binary'.
        optimizer: optimizer name passed to ``model.compile``.
        loss: loss name passed to ``model.compile``.
        metrics: metric names for compilation; defaults to ``['accuracy']``.
        device: torch device string, e.g. 'cpu' or 'cuda:0'.

    Returns:
        The compiled DeepFM model.
    """
    # Resolve the default here instead of using a mutable default argument
    # (the original `metrics=['accuracy']` list was shared across calls).
    if metrics is None:
        metrics = ['accuracy']

    # One SparseFeat per plain categorical feature.
    fixlen_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=self.vocabulary_size_dict[feat],
            embedding_dim=embedding_dim,
        ) for feat in self.sparse_features
    ]
    # Variable-length features are mean-pooled sequences, padded to their
    # recorded maximum length.
    if self.variable_length_features:
        varlen_feature_columns = [
            VarLenSparseFeat(
                SparseFeat(
                    feat,
                    vocabulary_size=self.vocabulary_size_dict[feat],
                    embedding_dim=embedding_dim,
                ),
                maxlen=self.variable_length_features_max_len[feat],
                combiner='mean',
            ) for feat in self.variable_length_features
        ]
    else:
        varlen_feature_columns = []

    # DeepFM uses the same columns for its linear and DNN parts.
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    model = DeepFM(linear_feature_columns, dnn_feature_columns, task=task,
                   device=device)
    model.compile(optimizer, loss, metrics)
    return model
genres_list, maxlen=max_len, padding='post', ) # 2.count #unique features for each sparse field and generate feature config for sequence feature fixlen_feature_columns = [ SparseFeat(feat, data[feat].nunique(), embedding_dim=4) for feat in sparse_features ] varlen_feature_columns = [ VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean', weight_name=None) ] # Notice : value 0 is for padding for sequence input feature linear_feature_columns = fixlen_feature_columns + varlen_feature_columns dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model model_input = {name: data[name] for name in sparse_features} # model_input["genres"] = genres_list # 4.Define Model,compile and train
# Compute each sample's genre count, then right-pad all genre id lists to the
# longest one so they form a dense integer matrix.
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

# 2.count #unique features for each sparse field and generate feature config for sequence feature
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
]
# NOTE(review): positional VarLenSparseFeat(name, dims, maxlen, combiner) is
# the legacy API — newer releases wrap a SparseFeat instead; confirm the
# installed library version before reusing this snippet.
varlen_feature_columns = [
    VarLenSparseFeat('genres', len(key2index) + 1, max_len, 'mean')
]  # Notice : value 0 is for padding for sequence input feature

# DeepFM takes the same columns for its linear and DNN components.
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
model_input = {name: data[name] for name in feature_names}
# Overwrite the raw 'genres' column with the padded integer sequence matrix.
model_input['genres'] = genres_list

# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
# NOTE(review): chunk ends mid-call — this compile() invocation is completed
# after this excerpt.
model.compile(
    "adam",
data[feat] = lbe.fit_transform(data[feat]) # preprocess the sequence feature key2index = {} genres_list = list(map(split, data['Genres'].values)) genres_length = np.array(list(map(len, genres_list))) max_len = max(genres_length) # Notice : padding=`post` genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', ) # 2.count #unique features for each sparse field and generate feature config for sequence feature fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4) for feat in sparse_features] varlen_feature_columns = [VarLenSparseFeat(SparseFeat('Genres', vocabulary_size=len( key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean')] # Notice : value 0 is for padding for sequence input feature linear_feature_columns = fixlen_feature_columns + varlen_feature_columns dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model model_input = {name: data[name] for name in sparse_features} # model_input["Genres"] = genres_list # 4.Define Model,compile and train device = 'cpu' use_cuda = True if use_cuda and torch.cuda.is_available():
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
import torch
import torch.nn.functional as F

# NOTE(review): `np` is used below but numpy is not imported in this excerpt —
# presumably imported earlier in the file; verify.
# Fixed-length feature columns: three categorical ids plus one dense score.
feature_columns = [
    SparseFeat('user', 3),
    SparseFeat('gender', 2),
    SparseFeat('item', 3 + 1),
    SparseFeat('item_gender', 2 + 1),
    DenseFeat('score', 1)
]
# Behavior-history columns (legacy positional VarLenSparseFeat API); they
# share embedding tables with their base features via embedding_name.
# NOTE(review): 'hist_item_gender' declares vocabulary 3 + 1 while the shared
# 'item_gender' table is declared with 2 + 1 — looks like a copy-paste slip;
# confirm against the intended example.
feature_columns += [
    VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
    VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender')
]
behavior_feature_list = ["item", "item_gender"]

# Hand-written toy inputs, one row per sample; 0 is the padding/mask value in
# the history matrices.
uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])
hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])