def load_idlist(id_list_nm='id_list_dict_max_len_200_all', zero_pre_post='pre'):
    """Load the cached padded id sequences and assemble the model inputs.

    The cache entry is expected to hold, for every ``key`` in
    ``EMB_keys2do``, a sub-dict ``id_list_dict[f"{key}_list"]`` with the
    items ``'id_list'`` (the padded sequence array) and ``'key2index'``
    (the vocabulary of that feature).

    Args:
        id_list_nm: Cache name handed to ``Cache.reload_cache``.
        zero_pre_post: ``"pre"`` means the sequences were zero-padded at the
            front, ``"post"`` means they were zero-padded at the tail. It is
            only consulted when truncation happens (``USE_SEQ_LENGTH < 200``).

    Returns:
        A tuple ``(input_dict_all, KEY2INDEX_DICT)``. ``input_dict_all`` maps
        every key of ``EMB_keys2do`` to its (possibly truncated) ``id_list``
        and ``'click_times'`` to the padding mask; ``KEY2INDEX_DICT`` maps
        every key to its ``key2index`` vocabulary.

    Raises:
        NotImplementedError: If truncation is required and ``zero_pre_post``
            is neither ``"pre"`` nor ``"post"``.
        RuntimeError: If the features do not all share one sequence length
            (this includes an empty ``EMB_keys2do``).
    """
    # id_list_dict: padded sequence features plus their vocabularies.
    id_list_dict = Cache.reload_cache(file_nm=id_list_nm,
                                      base_dir=INPUT_DATA_BASE_DIR,
                                      pure_nm=True)

    # Truncate along the second axis when a shorter length is requested.
    if USE_SEQ_LENGTH < 200:
        if zero_pre_post == 'pre':
            # Zeros are at the front, so keep the last USE_SEQ_LENGTH steps.
            window = slice(-USE_SEQ_LENGTH, None)
        elif zero_pre_post == 'post':
            # Zeros are at the tail, so keep the first USE_SEQ_LENGTH steps.
            window = slice(0, USE_SEQ_LENGTH)
        else:
            raise NotImplementedError(
                f"zero_pre_post must be 'pre' or 'post', got {zero_pre_post!r}")
        for col in EMB_keys2do:
            entry = id_list_dict[col + "_list"]
            entry['id_list'] = entry['id_list'][:, window]

    KEY2INDEX_DICT = {}  # vocabulary of every sequence feature
    SEQ_LENTH_DICT = {}  # length of the last axis of every sequence feature
    for key in EMB_keys2do:
        entry = id_list_dict[f'{key}_list']
        KEY2INDEX_DICT[key] = entry['key2index']
        SEQ_LENTH_DICT[key] = entry['id_list'].shape[-1]

    distinct_lengths = set(SEQ_LENTH_DICT.values())
    if len(distinct_lengths) == 1:
        print("GlobalSeqLength:", next(iter(distinct_lengths)))
    else:
        print(
            "GlobalSeqLength is Not Unique!!! If you are sure, comment the line after to avoid exception."
        )
        # The original bare ``raise`` had no active exception and therefore
        # surfaced as an uninformative RuntimeError; keep the type, add a
        # message that names the offending lengths.
        raise RuntimeError(
            f"GlobalSeqLength is not unique: {SEQ_LENTH_DICT}")

    # Build the mask (1 where the industry id is 0, else 0) and store it
    # under click_times_list. The comparison already allocates a new array,
    # so no explicit copy of the source array is needed.
    padding_mask = (id_list_dict['industry_list']['id_list'] == 0).astype(
        np.int32)
    id_list_dict['click_times_list'] = {'id_list': padding_mask}
    del padding_mask
    gc.collect()

    input_dict_all = {}
    for col in EMB_keys2do:
        input_dict_all[col] = id_list_dict[col + '_list']['id_list']
    # The mask is exposed to the model under the 'click_times' input name.
    input_dict_all['click_times'] = id_list_dict['click_times_list']['id_list']
    return input_dict_all, KEY2INDEX_DICT
def load_datalabel():
    '''
    Load the cached label table, split off the labelled rows and allocate
    the array used to store model probabilities.

    Rows whose ``age`` is NaN are treated as test rows; all other rows are
    training rows. ``age`` and ``gender`` are shifted to start at 0 when
    their minimum is 1. Unless ``isTEST`` is set, the user ids of all rows
    and of the training rows are written to CSV under ``../../05_RESULT``.

    :return: ``(traindata, model_prob)`` where ``traindata`` holds
        ``user_id``, ``gender``, ``age`` and the combined ``age_gender``
        label, and ``model_prob`` is a float32 zero array of shape
        ``(n_train + n_test, NUM_CLASSES, N_FOLDS)``
    '''
    label_df = Cache.reload_cache(file_nm='datalabel_with_seq_length',
                                  base_dir=INPUT_DATA_BASE_DIR,
                                  pure_nm=True)
    target_cols = ('age', 'gender')

    # Turn 1-based labels into 0-based ones, then verify both targets.
    for target in target_cols:
        if label_df[target].min() == 1:
            label_df[target] = label_df[target] - 1
    for target in target_cols:
        assert label_df[target].min() == 0

    label_df = label_df[['user_id', 'gender', 'age']]

    # Rows without an age label form the test set.
    age_missing = label_df['age'].isna()
    train_df = label_df.loc[~age_missing].reset_index(drop=True)
    test_df = label_df.loc[age_missing].copy().reset_index(drop=True)

    for target in target_cols:
        train_df[target] = train_df[target].astype(np.int8)
    # Combined label: gender * 10 + age
    # (gender 0 -> age value itself, gender 1 -> age value + 10).
    train_df['age_gender'] = train_df['gender'] * 10 + train_df['age']
    print(
        f"traindata['age_gender'].unique(): {sorted(train_df['age_gender'].unique())}"
    )
    print(train_df.shape, test_df.shape)

    # Zero-initialised store for the out-of-fold and model probabilities.
    n_rows = train_df.shape[0] + test_df.shape[0]
    model_prob = np.zeros((n_rows, NUM_CLASSES, N_FOLDS), dtype='float32')

    all_uid_df = label_df[['user_id']].copy()  # ids matching model_prob
    train_uid_df = train_df[['user_id']].copy()  # ids matching the oof prob
    if not isTEST:
        os.makedirs(f"../../05_RESULT/META/{TRAIN_MARKER}", exist_ok=True)
        os.makedirs("../../05_RESULT/SUB", exist_ok=True)
        all_uid_df.to_csv(
            f"../../05_RESULT/META/{TRAIN_MARKER}/SAVE_all_uid_df.csv",
            index=False)
        train_uid_df.to_csv(
            f"../../05_RESULT/META/{TRAIN_MARKER}/SAVE_train_uid_df.csv",
            index=False)
    return train_df, model_prob
def random_get_embedding_fun(self, id_list_dict):
    """Pick embedding files per column and build their embedding matrices.

    For every column in ``self.use_cols`` the files listed in
    ``self.spec_emb_dict[col]`` are always used. Further files whose name
    contains ``'user_id_' + col`` are collected from every directory in
    ``self.path_list``, shuffled, and appended until ``self.max_nums[col]``
    files are selected in total. Each selected file is loaded through
    ``Cache.reload_cache`` and its ``'word_emb_dict'`` entry is kept; files
    that fail to load are reported and skipped.

    Args:
        id_list_dict: Passed through unchanged to
            ``self.get_batch_emb_matrix``.

    Returns:
        dict mapping each column name to the value returned by
        ``self.get_batch_emb_matrix`` for that column (per the original
        author's note: a list of embedding matrices, each vocab x emb_size).
    """
    emb_matrix_dict = {}
    for col in self.use_cols:
        spec_embs = self.spec_emb_dict[col]  # mandatory embeddings
        # File names of interest contain "user_id_<col>".
        marker = 'user_id_' + col
        candidates = [
            pathi + filei
            for pathi in self.path_list
            for filei in os.listdir(pathi)
            if marker in filei
        ]
        if len(spec_embs) > 0:
            # Do not pick a mandatory embedding a second time.
            candidates = list(set(candidates).difference(set(spec_embs)))
        random.shuffle(candidates)
        # Number of extra files to add. Clamp at 0: when the mandatory list
        # already exceeds max_nums the difference is negative, and a negative
        # slice stop would select almost every candidate instead of none.
        select_nums = max(
            0, min(len(candidates), self.max_nums[col] - len(spec_embs)))
        file_to_load = list(spec_embs) + candidates[:select_nums]

        # Keep file names and embeddings aligned: only files that actually
        # loaded are handed on together with their embedding.
        loaded_files = []
        emblist = []
        for filei in file_to_load:
            try:
                emb_i = Cache.reload_cache(file_nm=filei,
                                           base_dir='',
                                           pure_nm=False)['word_emb_dict']
            except Exception as err:  # best effort: report and skip the file
                print('missing! ', filei, err)
                continue
            loaded_files.append(filei)
            emblist.append(emb_i)
        print('processing {} shape {}'.format(col, len(emblist)))
        print(file_to_load)  # the selected files
        emb_matrix_dict[col] = self.get_batch_emb_matrix(
            loaded_files,
            emblist,
            id_list_dict,
            col + '_list',
            max_embs=self.max_embs[col])
        del emblist
        gc.collect()
    return emb_matrix_dict
BATCH_SIZE = 512 SEQ_LENGTH = 150 DROPOUT = 0.3 NUM_CLASS = 20 EPOCHS = 30 LR = 1e-3 device = torch.device("cuda:0") ############################## ######## 获取emb ############# ############################## seq_length_creative_id = 150 # 序列都padding到了150 id_list_dict = Cache.reload_cache( file_nm='../../cached_data/CACHE_id_list_dict_150_normal.pkl', base_dir='', pure_nm=False) # 定义需要的输入 cols_to_emb = [ 'creative_id', 'ad_id', 'advertiser_id', 'product_id', 'product_category', 'industry', 'time' ] # 定义emb 文件路径 path_list = ['../../cached_data/'] # 定义最大emb_size max_embs = { 'creative_id': 2000, 'ad_id': 2000, 'advertiser_id': 2000, 'product_id': 2000,
def _load_merged_emb(emb_lst): all_emb_dict = {} for i, nm in enumerate(emb_lst, 1): all_emb_dict[f"emb_{i}"] = Cache.reload_cache(nm) return all_emb_dict
return id_list, key2index id_list_dict = {} for col in tqdm(sequence_features): id_list, key2index = get_sequence(datalabel, col, max_len=150) # dict ,id_list as key index sequence key2index as words -> key index id_list_dict[col] = {'id_list': id_list, 'key2index': key2index} Cache.cache_data(id_list_dict, nm_marker='id_list_dict_150_normal') # ################################################################################################################## # get time embedding import datetime # during 2019-09-01 to 2019-11-30 id_list_dict = Cache.reload_cache(file_nm=data_path + 'CACHE_id_list_dict_150_normal.pkl', base_dir='', pure_nm=False) class strTimeEmb(object): ''' # time 中一些特征做onehot encoding 周x 是否是周末 月 月第x周 教师节 中秋节 16日 9.29调休 10.1假期 10.7重阳节 10.12调休 10.28寒衣节 11.8立冬 11.17学生日 11.28感恩节 ''' def __init__(self, daynow): self.daynow = int(daynow) self.month = 0 self.day = 0