def load_idlist(id_list_nm='id_list_dict_150_normal', zero_pre_post='pre'):
    """
    zero_pre_post: "pre" means the sequences are zero-padded at the start,
                   "post" means they are zero-padded at the end
    """
    # id_list_dict: dict of padded sequence features plus their vocabularies
    id_list_dict = Cache.reload_cache(file_nm=id_list_nm,
                                      base_dir=INPUT_DATA_BASE_DIR,
                                      pure_nm=True)
    # # patch time with '-1'
    # id_list_dict['time_list']['key2index']['-1'] = 92
    # truncate:
    if USE_SEQ_LENGTH < 150:
        if zero_pre_post == 'pre':
            # zero-padded at the front, so truncate from the end: -USE_SEQ_LENGTH:
            for col in EMB_keys2do:
                id_list_dict[col + "_list"]['id_list'] = id_list_dict[
                    col + "_list"]['id_list'][:, -USE_SEQ_LENGTH:]
        elif zero_pre_post == 'post':
            # zero-padded at the back, so truncate from the front: 0:USE_SEQ_LENGTH
            for col in EMB_keys2do:
                id_list_dict[col + "_list"]['id_list'] = id_list_dict[
                    col + "_list"]['id_list'][:, 0:USE_SEQ_LENGTH]
        else:
            raise NotImplementedError

    KEY2INDEX_DICT = {}  # vocabulary (key2index) of each sequence feature
    SEQ_LENTH_DICT = {}  # truncated length of each sequence feature; usually all the same, e.g. 150 here
    for key in EMB_keys2do:
        KEY2INDEX_DICT[key] = id_list_dict[f'{key}_list']['key2index']
        SEQ_LENTH_DICT[key] = id_list_dict[f'{key}_list']['id_list'].shape[-1]
    if len(set(SEQ_LENTH_DICT.values())) == 1:
        print("GlobalSeqLength:", SEQ_LENTH_DICT[key])
    else:
        print(
            "GlobalSeqLength is not unique! If this is intended, comment out the raise below."
        )
        raise ValueError("sequence lengths differ across features")

    # build the mask (1 at padding positions) and store it as click_times_list
    array_new = id_list_dict['industry_list']['id_list'].copy()
    array_new = (array_new == 0).astype(np.int32)
    id_list_dict['click_times_list'] = {}
    id_list_dict['click_times_list']['id_list'] = array_new  # mask
    del array_new
    gc.collect()

    input_dict_all = {}
    for col in EMB_keys2do:
        input_dict_all[col] = id_list_dict[col + '_list']['id_list']
    # add time (the click_times mask)
    input_dict_all['click_times'] = id_list_dict['click_times_list']['id_list']
    return input_dict_all, KEY2INDEX_DICT
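# Illustrative aside (toy data, not part of the pipeline): a small NumPy check of the
# mask construction used above, where padding positions (id == 0) become 1 and real
# tokens become 0.
import numpy as np

_demo_ids = np.array([[0, 0, 7, 3],   # a 'pre'-padded sequence
                      [5, 2, 9, 1]])  # a full-length sequence
_demo_mask = (_demo_ids == 0).astype(np.int32)
# _demo_mask -> [[1, 1, 0, 0],
#                [0, 0, 0, 0]]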
def write(feature1_feature2):
    # dump every user's sequence for this (feature1, feature2) pair to a text file,
    # one space-separated sequence per line; -2 marks an empty sequence
    list_df = Cache.reload_cache('CACHE_list_df_adv_userseq_' +
                                 feature1_feature2 + '.pkl')[0].values.tolist()
    with open('adv_userseq_' + feature1_feature2 + '.txt', 'w') as f:
        for i in list_df:
            if i:
                for j in i:
                    f.write(str(j))
                    f.write(' ')
                f.write('\n')
            else:
                f.write(str(-2))
                f.write(' ')
                f.write('\n')
def get_embedding(f1_f2, f1):
    # build an embedding matrix aligned with the Tokenizer indices from the
    # averaged per-token vectors cached for this feature pair
    avg_f1 = Cache.reload_cache('CACHE_list_df_avg_' + f1_f2 + '.pkl')
    feature_tokens = avg_f1[[1]].values.flatten().astype(str).tolist()
    tokenizer = Tokenizer(num_words=len(feature_tokens) + 1)
    tokenizer.fit_on_texts(feature_tokens)
    embedding_dim = 64
    embedding_matrix = np.random.randn(len(feature_tokens) + 1, embedding_dim)
    avg_f1_copy = avg_f1.copy()
    avg_f1_copy = avg_f1_copy.set_index(1)
    for feature in feature_tokens:
        embedding_vector = np.array(avg_f1_copy.loc[int(feature), :].values[0])
        if embedding_vector is not None:
            # the matrix row must match the tokenizer's index for this token
            index = tokenizer.texts_to_sequences([feature])[0][0]
            embedding_matrix[index] = embedding_vector
    return embedding_matrix
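# Illustrative aside (assumes tensorflow.keras is available, as elsewhere in this repo):
# why the matrix above has len(feature_tokens) + 1 rows. Tokenizer assigns word indices
# starting at 1, so row 0 never receives a real token and stays as the random/padding row.
from tensorflow.keras.preprocessing.text import Tokenizer as _DemoTokenizer

_demo_tok = _DemoTokenizer()
_demo_tok.fit_on_texts(['101', '202', '303'])
print(_demo_tok.word_index)  # every token maps to an index >= 1; index 0 is reserved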
def input_w2v(f1_f2, all_data, f2):
    feature_seq = all_data[[f2]].values.flatten().astype(str).tolist()
    avg_f1 = Cache.reload_cache('CACHE_list_df_avg_' + f1_f2 + '.pkl')
    feature_tokens = avg_f1[[1]].values.flatten().astype(str).tolist()
    tokenizer = Tokenizer(num_words=len(feature_tokens) + 1)
    tokenizer.fit_on_texts(feature_tokens)
    npy_path = f1_f2
    # the first 41907133 rows are the training set, the rest are the test set
    sequences = tokenizer.texts_to_sequences(feature_seq[:41907133])
    x_train = pad_sequences(sequences, maxlen=1, padding='post')
    print(x_train.shape)
    np.save(npy_path + '_f2_train.npy', x_train)
    sequences = tokenizer.texts_to_sequences(feature_seq[41907133:])
    x_test = pad_sequences(sequences, maxlen=1, padding='post')
    print(x_test.shape)
    np.save(npy_path + '_f2_test.npy', x_test)
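# Illustrative aside (toy data): the padding step above with maxlen=1 keeps exactly one
# index per sample, so the saved arrays have shape (n_samples, 1); an empty sequence
# becomes the padding index 0.
from tensorflow.keras.preprocessing.sequence import pad_sequences as _demo_pad

print(_demo_pad([[5], [7], []], maxlen=1, padding='post'))  # [[5] [7] [0]]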
def get_emb_matrix(col):
    """
    inputs: col -- the column to turn into a pretrained emb_matrix
    loads:
        emb            pretrained word vectors
        word_emb_dict  vocabulary
        id_list_dict   index sequences
    returns the id_list plus the matching emb_matrix
    """
    id_list_dict_all = Cache.reload_cache(
        f'CACHE_EMB_INPUTSEQ_stage2_{col}.pkl')
    # id_list_dict = id_list_dict_all['id_list']
    # key2index = id_list_dict_all['key2index']
    # emb = id_list_dict_all['emb']
    key_to_represent_rare = '-1'
    words = list(id_list_dict_all['emb'].keys())
    emb_size = id_list_dict_all['emb'][words[0]].shape[0]
    voc_size = len(words)
    emb_matrix = np.zeros((voc_size + 1, emb_size))
    # the emb must contain '-1' to serve as index 0
    if '-1' not in id_list_dict_all['key2index'].keys():
        # no '-1' in the emb means it was trained on the full vocabulary; compute the
        # mean vector ourselves and add it as the embedding for rare/unseen words
        vector_low_frequency_words = np.zeros((emb_size, ))
        for w in words:
            vector_low_frequency_words += id_list_dict_all['emb'][w]
        vector_low_frequency_words = vector_low_frequency_words / voc_size
        # add the key/value to the emb
        id_list_dict_all['emb'][
            key_to_represent_rare] = vector_low_frequency_words
        # print(f'{col} file has no key_to_represent_rare add low frequency words and fill vector as:', vector_low_frequency_words)
    for k, idx in id_list_dict_all['key2index'].items():
        try:
            emb_matrix[idx, :] = id_list_dict_all['emb'][k]
        except KeyError:
            # if k is not in word_emb_dict, fall back to the key_to_represent_rare vector
            # print('find oov:',(k, idx))
            emb_matrix[idx, :] = id_list_dict_all['emb'][key_to_represent_rare]
    emb_matrix = np.float32(emb_matrix)
    return {col: [id_list_dict_all['id_list'], emb_matrix]}
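# Illustrative aside (toy numbers, not part of the pipeline): the fallback '-1' vector
# built above is simply the mean of all known word vectors, and it is reused for any
# out-of-vocabulary key hit in the loop.
import numpy as np

_toy_emb = {'3': np.array([1.0, 0.0]), '7': np.array([0.0, 1.0])}
_rare_vec = sum(_toy_emb.values()) / len(_toy_emb)  # -> array([0.5, 0.5])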
def get_embedding(f1_f2, f1):
    path = 'adv_userseq_' + f1_f2 + '_word2vec.kv'
    wv = KeyedVectors.load(path, mmap='r')
    list_df = Cache.reload_cache('CACHE_list_df_adv_userseq_' + f1_f2 + '.pkl')
    list_df.columns = ['list', f1]
    ind = 0
    buf = []
    with open('adv_userseq_' + f1_f2 + '.txt', 'r') as f:
        for i in f:
            tokens = i.strip().split(' ')
            buf_ = np.zeros(64)
            for j in tokens:
                buf_ = buf_ + wv[j]
            buf_ = buf_ / len(tokens)  # average over the number of tokens
            buf_f1 = list_df.at[ind, f1]
            buf__ = []
            buf_ = buf_.tolist()
            buf__.append(buf_)
            buf__.append(buf_f1)
            buf.append(buf__)
            ind = ind + 1
    df_f1_list = pd.DataFrame(buf)
    Cache.cache_data(df_f1_list, nm_marker='list_df_avg_adv_userseq_' + f1_f2)
    return 0
                list_feature2_.append(i)
            list_feature2.append(list_feature2_)
        list_df = pd.DataFrame(list_feature2)
        Cache.cache_data(list_df,
                         nm_marker='list_df_adv_userseq_' + feature1 + '_' +
                         feature2)
        del list_df, data_group, feature2_name_list, list_feature2_, index_get_group, list_feature2
        gc.collect()
        return True
    except:
        return False


# In[4]:

train = Cache.reload_cache('CACHE_train_raw.pkl').drop(
    columns=['communication_onlinerate']).astype(int)
train = reduce_mem(train, use_float16=True)
test = Cache.reload_cache('CACHE_test_B_raw.pkl').drop(
    columns=['id', 'communication_onlinerate']).astype(int)
test = reduce_mem(test, use_float16=True)
data = pd.concat([train, test], axis=0, ignore_index=True)
data = reduce_mem(data, use_float16=True)
del train, test
gc.collect()

poc_feature1_list = [['task_id', 'age'], ['task_id', 'city'],
                     ['task_id', 'city_rank'], ['task_id', 'device_name'],
                     ['task_id', 'career'], ['task_id', 'gender'],
                     ['task_id', 'residence'], ['adv_id', 'age'],
                     ['adv_id', 'city'], ['adv_id', 'city_rank'],
                     ['adv_id', 'device_name'], ['adv_id', 'career'],
                     ['adv_id', 'gender'], ['adv_id', 'residence'],
                     ['creat_type_cd', 'age'], ['creat_type_cd', 'city'],
                     ['creat_type_cd', 'city_rank'],
                     ['creat_type_cd', 'device_name'],
                     ['creat_type_cd', 'career'], ['creat_type_cd', 'gender'],
                     ['creat_type_cd', 'residence'], ['indu_name', 'age'],
                     ['indu_name', 'city'], ['indu_name', 'city_rank'],
                     ['indu_name', 'device_name'], ['indu_name', 'career'],
                     ['indu_name', 'gender'], ['indu_name', 'residence'],
                     ['adv_prim_id', 'age'], ['adv_prim_id', 'city'],
                     ['adv_prim_id', 'city_rank'],
                     ['adv_prim_id', 'device_name'], ['adv_prim_id', 'career'],
                     ['adv_prim_id', 'gender'], ['adv_prim_id', 'residence']]
train = pd.read_csv(
    r'train_data.csv', sep='|',
    dtype=str).drop(columns=['communication_onlinerate']).astype(int)
train = reduce_mem(train, use_float16=True)
test = pd.read_csv(
    r'test_data_A.csv', sep='|',
    dtype=str).drop(columns=['id', 'communication_onlinerate']).astype(int)
test.insert(0, 'label', np.ones([1000000]))
test['label'] = 2
test = reduce_mem(test, use_float16=True)
data = pd.concat([train, test], axis=0, ignore_index=True)
data = reduce_mem(data, use_float16=True)
data_uid_ptd_feature = data[['uid', 'pt_d', feature]]
list_data = Cache.reload_cache('CACHE_list_df_adv_id.pkl')
list_data.columns = ['list', 'pt_d', 'uid']
list_data = pd.merge(data_uid_ptd_feature,
                     list_data,
                     how='left',
                     on=('uid', 'pt_d'))
list_data = list_data['list'].values.tolist()
index = 0
list_data_ = []
for i in list_data:
    i.append(data_uid_ptd_feature.at[index, feature])
    list_data_.append(i)
    index = index + 1
f = open(f_path + '.txt', 'w')
for i in tqdm(list_data_):
def gen_list_df(feature):
    print(f'{feature} start!')
    data = Cache.reload_cache(
        'CACHE_data_sampling_pos1_neg5.pkl')  # build sequences directly on the sampled data
    if feature == 'label':
        data.loc[data['pt_d'] >= 8, 'label'] = -1  # mask the test labels
        data['label'] = data['label'].astype(np.int8)
        data['label'] = data['label'] + 1  # shift by 1 because 0 is reserved for padding
    data = data[['uid', feature, 'pt_d']]
    gc.collect()
    print(data.shape)
    data_group = data.groupby(['uid'])
    gc.collect()
    index_list = []
    feature_list = []
    print('index_list start')
    for name, group in tqdm(data_group):
        index_list.append(name)
    print('feature_list start')
    for i in tqdm(index_list):
        index_get_group = data_group.get_group(i)
        ptd_set = set(index_get_group['pt_d'].values.flatten().tolist())
        for j in ptd_set:
            feature_list_ = []
            buf_list = []
            buf_list = index_get_group.query('pt_d < @j')[
                feature].values.flatten().tolist()  # the user's click sequence before this row
            buf_list.append(0)  # padding 0
            feature_list_.append(buf_list)  # behaviour sequence
            feature_list_.append(j)  # pt_d
            feature_list_.append(i)  # uid
            feature_list.append(feature_list_)
    list_df = pd.DataFrame(feature_list)
    del index_list, feature_list, feature_list_, data_group, index_get_group, ptd_set
    gc.collect()
    list_df.columns = ['list', 'pt_d', 'uid']
    list_df['list'] = list_df['list'].map(lambda x: [str(i) for i in x])  # cast to str
    list_df = list_df.drop_duplicates(subset=['pt_d', 'uid'])
    list_df = data.merge(list_df, how='left', on=('uid', 'pt_d'))  # keep the row order of data
    # prepend the current sample's own value -- not for label
    if feature != 'label':
        list_df['list'] = list_df[feature].map(
            lambda x: [str(x)]) + list_df['list']
    print('w2v start!')
    emb_size = 32  # pretrained embedding dim
    model = Word2Vec(list_df['list'].values.tolist(),
                     size=emb_size,
                     window=5,
                     workers=5,
                     min_count=1,
                     sg=0,
                     hs=0,
                     negative=5,
                     iter=5,
                     seed=0)
    # 1. get the sequences
    id_list, key2index = get_sequence(list_df, 'list', max_len=40)
    # 2. get key2index
    emb_dict = {}
    for word_i in list(model.wv.vocab.keys()):
        if word_i in model.wv:
            emb_dict[word_i] = model.wv[word_i]
        else:
            emb_dict[word_i] = np.zeros(emb_size)
    # 3. save
    id_list_dict = {}
    id_list_dict['id_list'] = id_list
    id_list_dict['key2index'] = key2index
    id_list_dict['emb'] = emb_dict
    Cache.cache_data(id_list_dict, nm_marker=f'EMB_INPUTSEQ_stage2_{feature}')
    print(f'{feature} done!')
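# Illustrative aside (toy frame, not part of the pipeline): the per-day prefix logic
# above, i.e. for each (uid, pt_d) the sequence holds that user's values from strictly
# earlier days, followed by a trailing padding 0.
import pandas as pd

_toy = pd.DataFrame({'uid': [1, 1, 1], 'pt_d': [1, 2, 3], 'adv_id': [10, 20, 30]})
_grp = _toy.groupby('uid').get_group(1)
print(_grp.query('pt_d < 3')['adv_id'].tolist() + [0])  # -> [10, 20, 0]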
pd.set_option('max_colwidth', 200)
pd.set_option('display.width', 5000)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
SEED = 999
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"] = '4'
gpus = tf.config.list_physical_devices("GPU")
tf.config.experimental.set_memory_growth(gpus[0], True)

# %%
data = Cache.reload_cache('CACHE_data_step_1_feature_0924_r5.pkl')
data.drop(columns=['communication_onlinerate'], inplace=True)
sparse_features = [
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id',
    'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
    'app_second_class', 'city', 'device_name', 'career', 'gender', 'net_type',
    'residence', 'emui_dev', 'indu_name', 'cmr_0', 'cmr_1', 'cmr_2', 'cmr_3',
    'cmr_4', 'cmr_5', 'cmr_6', 'cmr_7', 'cmr_8', 'cmr_9', 'cmr_10', 'cmr_11',
    'cmr_12', 'cmr_13', 'cmr_14', 'cmr_15', 'cmr_16', 'cmr_17', 'cmr_18',
    'cmr_19', 'cmr_20', 'cmr_21', 'cmr_22', 'age', 'city_rank'
]  # cmr_23 is dropped
dense_features = [
    i for i in data.columns if i not in sparse_features +
    ['index', 'id', 'uid', 'level_0', 'pt_d', 'label']
    period=1)
earlystop_callback = EarlyStopping(
    monitor="val_AUC",
    min_delta=0.00001,
    patience=3,
    verbose=1,
    mode="max",
    baseline=None,
    restore_best_weights=True,
)
reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_AUC', factor=0.5, patience=1, min_lr=0.0000001)
#!################################################################################################################
deepfm_data = Cache.reload_cache('CACHE_data_deepfm.pkl')
label = Cache.reload_cache('CACHE_train_NONcmr.pkl')['label'].values
emb1 = np.load('adv_idembedding_matrix.npy', allow_pickle=True)
emb2 = np.load('adv_prim_idembedding_matrix.npy', allow_pickle=True)
emb3 = np.load('creat_type_cdembedding_matrix.npy', allow_pickle=True)
emb4 = np.load('indu_nameembedding_matrix.npy', allow_pickle=True)
emb5 = np.load('task_idembedding_matrix.npy', allow_pickle=True)
emb_label = np.load('labelembedding_matrix.npy', allow_pickle=True)
trans_1_train = np.load('adv_idx_train.npy', allow_pickle=True)
trans_2_train = np.load('adv_prim_idx_train.npy', allow_pickle=True)
trans_3_train = np.load('creat_type_cdx_train.npy', allow_pickle=True)
trans_4_train = np.load('indu_namex_train.npy', allow_pickle=True)
trans_5_train = np.load('task_idx_train.npy', allow_pickle=True)
trans_label_train = np.load('labelx_train.npy', allow_pickle=True)
        if use_float16 and c_min > np.finfo(
                np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


print('start!')
data = Cache.reload_cache('CACHE_dataall_stage2_0924.pkl')
print(data.dtypes)
data['communication_onlinerate'] = data['communication_onlinerate'].map(
    lambda x: x.replace('^', ' '))
route = Cache.reload_cache('CACHE_cmr_stage2_0924.pkl')
route_columns = [i for i in route.columns]
data = pd.concat([data, route], axis=1)  # no index column yet
data = data.reset_index(drop=True).reset_index()  # add an index column
cols = [i for i in data.columns if i not in ['id', 'index']]
data1 = data.query('pt_d<8').drop_duplicates(
    subset=cols)  # drop duplicated samples; unclear whether test_b is pt_d == 8
data2 = data.query('pt_d>=8')


def get_sample(df, day, rate=5):
test_B['label'] = 2
Cache.cache_data(test_B, nm_marker='test_B_raw')

# # cmr-onehot

# In[5]:

tokenizer = Tokenizer(num_words=24, filters='^')
communication_onlinerate_dict = [
    '0^1^2^3^4^5^6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23'
]
tokenizer.fit_on_texts(communication_onlinerate_dict)

# In[6]:

data = Cache.reload_cache('CACHE_train_raw.pkl')
communication_onlinerate_raw = data['communication_onlinerate'].tolist()
communication_onlinerate_sequences = tokenizer.texts_to_sequences(
    communication_onlinerate_raw)
communication_onlinerate_sequences = pad_sequences(
    communication_onlinerate_sequences, maxlen=24, padding='post')
communication_onlinerate_onehot = []
with tqdm(total=communication_onlinerate_sequences.shape[0]) as pbar:
    for i in communication_onlinerate_sequences:
        communication_onlinerate_onehot.append(
            np.delete(np.eye(25)[i], 0, axis=1).sum(axis=0))
        pbar.update(1)
communication_onlinerate_onehot = pd.DataFrame(
    communication_onlinerate_onehot).astype(int)
communication_onlinerate_onehot = reduce_mem(communication_onlinerate_onehot,
                                             use_float16=True)
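# Illustrative aside (toy row, not part of the pipeline): how the multi-hot encoding
# above works. A padded index sequence is expanded with np.eye(25), the padding column 0
# is dropped, and the rows are summed into a 24-dimensional vector.
import numpy as np

_toy_seq = np.array([1, 3, 0, 0])  # token indices, 0 = padding
_toy_onehot = np.delete(np.eye(25)[_toy_seq], 0, axis=1).sum(axis=0)
print(_toy_onehot.shape, _toy_onehot[0], _toy_onehot[2])  # (24,) 1.0 1.0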
print("EPOCHS: ", EPOCHS) print("NUM_WORKERS: ", NUM_WORKERS) print("Cards to use:", os.environ["CUDA_VISIBLE_DEVICES"]) print("BATCH_SIZE: ", BATCH_SIZE) print("EMB_keys2do: ", EMB_keys2do) print("NUM_CLASSES: ", NUM_CLASSES) print("USE_SEQ_LENGTH: ", USE_SEQ_LENGTH) print("###" * 35) ############################################################################## print("###" * 35) print("@@@Load id_list_dict...") print("###" * 35) id_list_dict = Cache.reload_cache( file_nm= '/home/tione/notebook/cached_data/CACHE_id_list_dict_150_normal.pkl', pure_nm=False) gc.collect() # id_list_dict 包含padding后的序列特征字典以及词表 # truncate: if USE_SEQ_LENGTH < 150: for col in EMB_keys2do: id_list_dict[col + "_list"]['id_list'] = id_list_dict[ col + "_list"]['id_list'][:, -USE_SEQ_LENGTH:] SEQ_LENTH_DICT = {} # 存放每个序列截断长度的字典 一般都是一样的,比如这里是 150 for key in EMB_keys2do: SEQ_LENTH_DICT[key] = id_list_dict[f'{key}_list']['id_list'].shape[-1] if len(set(SEQ_LENTH_DICT.values())) == 1:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


##############################################################################################################
# This part is quite inefficient, but it is left as is; sep='|' should work just as well
datatrain = pd.read_csv('./data/train_data.csv')
datatraintestA = Cache.reload_cache('CACHE_dataall0816.pkl')
datatest = pd.read_csv('./data/test_data_B.csv')
columns_str = datatest.columns[0]
dflisttst = []
for i in tqdm(range(datatest.shape[0])):
    dflisttst.append([
        int(j) if index != 32 else j
        for index, j in enumerate(datatest[columns_str].iloc[i].split('|'))
    ])
del datatest
gc.collect()
dflisttst = pd.DataFrame(dflisttst, columns=columns_str.split('|'))
dataall = pd.concat([datatraintestA, dflisttst], ignore_index=True)
dataall = reduce_mem(dataall, use_float16=False)
Cache.cache_data(
    dataall,
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


train = pd.read_csv(
    r'train_data.csv', sep='|',
    dtype=str).drop(columns=['communication_onlinerate']).astype(int)
train = reduce_mem(train, use_float16=True)
test = pd.read_csv(
    r'test_data_A.csv', sep='|',
    dtype=str).drop(columns=['id', 'communication_onlinerate']).astype(int)
test.insert(0, 'label', np.ones([1000000]))
test['label'] = 2
test = reduce_mem(test, use_float16=True)
data = pd.concat([train, test], axis=0, ignore_index=True)
data = reduce_mem(data, use_float16=True)
data_uid_ptd = data[['uid', 'pt_d']]
list_data = Cache.reload_cache('CACHE_list_df_2label.pkl')
list_data.columns = ['list', 'pt_d', 'uid']
list_data = pd.merge(data_uid_ptd, list_data, how='left', on=('uid', 'pt_d'))
list_data = list_data['list'].values.tolist()
f = open(f_path + '.txt', 'w')
for i in tqdm(list_data):
    for j in i:
        f.write(str(j))
        f.write(' ')
    f.write('\n')
f.close()
        if use_float16 and c_min > np.finfo(
                np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


# data = reduce_mem(data, use_float16=False)
data = Cache.reload_cache('CACHE_zlh_nn_feature_stage_2.pkl')
data = reduce_mem(data, use_float16=True)
# rebuild a unique index
del data['raw_index']
# del data['communication_onlinerate']
gc.collect()
data = data.reset_index(drop=True).reset_index()
# load the cross embeddings
dense_feature_size = 128
# m_user_0 = np.load('./cached_data/m0_user_stage2.npy').astype(np.float16)
m_user_1 = np.load('./cached_data/m1_user_stage2.npy').astype(np.float16)
# m_item_0 = np.load('./cached_data/m0_item_stage2.npy').astype(np.float16)
m_item_1 = np.load('./cached_data/m1_item_stage2.npy').astype(np.float16)
dataindex_base = np.load('./cached_data/dataindex_stage2.npy')
# pull the sampled rows out of the matrices
# m_user_0 = np.hstack([dataindex_base.reshape(-1,1),m_user_0])
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


# In[2]:

data = Cache.reload_cache('CACHE_data_sampling_pos1_neg5.pkl')

# ## count encode

# In[3]:

cate_cols = [
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id',
    'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
    'app_second_class', 'city', 'device_name', 'career', 'gender', 'age',
    'net_type', 'residence', 'emui_dev', 'indu_name',
    'communication_onlinerate_1', 'communication_onlinerate_2',
    'communication_onlinerate_3', 'communication_onlinerate_4',
    'communication_onlinerate_5', 'communication_onlinerate_6',
    'communication_onlinerate_7', 'communication_onlinerate_8',
    'communication_onlinerate_9', 'communication_onlinerate_10',
    'communication_onlinerate_11', 'communication_onlinerate_12',
    'communication_onlinerate_13', 'communication_onlinerate_14',
    'communication_onlinerate_15',
    m1_user = []
    for i in tqdm(list(data['user_f'])):
        try:
            m1_user.append(model.wv[i])
        except:
            m1_user.append([0] * 128)
    m1_user = np.array(m1_user, dtype=np.float32)
    print(m1_item.shape)
    np.save('./cached_data/m0_item_stage2.npy', m1_item)
    print(m1_user.shape)
    np.save('./cached_data/m0_user_stage2.npy', m1_user)


if __name__ == '__main__':
    print('start!')
    data = Cache.reload_cache('CACHE_sampling_pro_feature.pkl')
    print(data.shape)
    data['label'] = data['label'].fillna(2).astype(int)  # mask
    gc.collect()
    print('w2v start!')
    # build one emb matrix
    user_fe_list = ['age', 'city_rank', 'gender', 'slot_id', 'net_type']  # 'city_rank'
    item_fe_list = [
        'task_id', 'adv_id', 'creat_type_cd', 'dev_id', 'inter_type_cd',
        'indu_name', 'adv_prim_id', 'tags', 'spread_app_id',
        'app_first_class', 'his_on_shelf_time'
    ]
    print('join!')
    # simplified pretraining: concatenate the user attributes and the ad attributes,
    # then learn their co-occurrence distribution with a large window
    data['user_f'] = ''
    for i, vari in enumerate(user_fe_list):
    sequences = tokenizer.texts_to_sequences(feature_seq[41907133:])
    x_test = pad_sequences(sequences, maxlen=1, padding='post')
    print(x_test.shape)
    np.save(npy_path + '_f2_test.npy', x_test)


if __name__ == '__main__':
    f1_f2_list = [['task_id', 'age'], ['task_id', 'city'],
                  ['task_id', 'city_rank'], ['task_id', 'device_name'],
                  ['task_id', 'career'], ['task_id', 'gender'],
                  ['task_id', 'residence'], ['adv_id', 'age'],
                  ['adv_id', 'city'], ['adv_id', 'city_rank'],
                  ['adv_id', 'device_name'], ['adv_id', 'career'],
                  ['adv_id', 'gender'], ['adv_id', 'residence'],
                  ['creat_type_cd', 'age'], ['creat_type_cd', 'city'],
                  ['creat_type_cd', 'city_rank'],
                  ['creat_type_cd', 'device_name'],
                  ['creat_type_cd', 'career'], ['creat_type_cd', 'gender'],
                  ['creat_type_cd', 'residence'], ['indu_name', 'age'],
                  ['indu_name', 'city'], ['indu_name', 'city_rank'],
                  ['indu_name', 'device_name'], ['indu_name', 'career'],
                  ['indu_name', 'gender'], ['indu_name', 'residence'],
                  ['adv_prim_id', 'age'], ['adv_prim_id', 'city'],
                  ['adv_prim_id', 'city_rank'], ['adv_prim_id', 'device_name'],
                  ['adv_prim_id', 'career'], ['adv_prim_id', 'gender'],
                  ['adv_prim_id', 'residence']]
    all_data = Cache.reload_cache('CACHE_data_deepfm.pkl')
    for i in tqdm(f1_f2_list):
        input_w2v(str(i[0]) + '_' + str(i[1]), all_data, str(i[1]))
# window features + 2k
last_seq_list = [
    'creat_type_cd', 'tags', 'spread_app_id', 'task_id', 'adv_id', 'label'
]
user_fe_list = [
    'age', 'city_rank', 'career', 'gender', 'city', 'device_name',
    'residence', 'emui_dev'
]
item_fe_list = ['task_id', 'adv_id', 'adv_prim_id', 'tags', 'spread_app_id']
cross_emb_dict = {}
# concatenate pairwise + slot/net_type
for i, vari in enumerate(user_fe_list):
    for j, varj in enumerate(item_fe_list):
        if j > i:
            # concatenate the two embeddings
            df1 = Cache.reload_cache(
                f'CACHE_EMB_TARGET_DICT_{vari}__{varj}_w2v.pkl')
            df2 = Cache.reload_cache(
                f'CACHE_EMB_TARGET_DICT_{varj}__{vari}_w2v.pkl')
            embvari = {}
            # cast all keys to int, then concatenate
            for key, value in df1['key'].items():
                embvari[key] = np.hstack([value, df2['value'][str(key)]])
            embvarj = {}
            # cast all keys to int, then concatenate
            for key, value in df2['key'].items():
                embvarj[key] = np.hstack([value, df1['value'][str(key)]])
            cross_emb_dict[vari + '__' + varj] = (embvari, embvarj)
print('load data finish!')

# ## build the columns used for cross-similarity computation and generate their indices
def get_sample(df, day, neg_rate=5):
    # for one day: keep all positives and down-sample negatives to neg_rate x positives
    set1 = df.query('pt_d=={}'.format(day))
    set1_pos = set1.query('label==1')
    nums_pos = set1_pos.shape[0]
    nums_neg = nums_pos * neg_rate
    set1_neg = set1.query('label==0')
    set1_neg = set1_neg.sample(nums_neg, random_state=0)
    df_sample = pd.concat([set1_pos, set1_neg])
    print(df_sample['label'].value_counts(), df_sample['label'].mean())
    return df_sample


# In[4]:

train = Cache.reload_cache('CACHE_train.pkl')
train = train.reset_index()
train.rename(columns={'index': 'raw_index'}, inplace=True)
test_B = Cache.reload_cache('CACHE_test_B.pkl').drop(columns=['id'])
test_B = test_B.reset_index()
test_B.rename(columns={'index': 'raw_index'}, inplace=True)
test_B['raw_index'] = test_B['raw_index'] + 41907133
train_ptd_1 = get_sample(train, 1)
train_ptd_2 = get_sample(train, 2)
train_ptd_3 = get_sample(train, 3)
train_ptd_4 = get_sample(train, 4)
train_ptd_5 = get_sample(train, 5)
train_ptd_6 = get_sample(train, 6)
train_ptd_7 = get_sample(train, 7)
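# Illustrative aside (toy frame, not part of the pipeline): get_sample defined above
# keeps every positive for the given day and samples neg_rate negatives per positive.
import pandas as pd

_toy = pd.DataFrame({'pt_d': [1] * 12, 'label': [1, 1] + [0] * 10})
print(get_sample(_toy, day=1, neg_rate=5)['label'].value_counts())  # 0 -> 10, 1 -> 2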