def run_w2v(sentence_id, word_id, emb_size=256):
    '''
    :param sentence_id: sentence groupby key
    :param word_id: col as word
    :param emb_size: output embedding size used in w2v
    :return: None, the embedding dict is written to the cache
    '''
    # large window embedding
    window = 150
    res_dict = w2v_pro_normal(datalog,
                              sentence_id=sentence_id,
                              word_id=word_id,
                              window=window,
                              emb_size=emb_size,
                              dropna=False,
                              n_jobs=12,
                              epoch=5)
    # NOTE: training above uses epoch=5, while the cache marker below is tagged 10EPOCH
    epoch = 10
    method = 'CBOW'
    author = 'AZ'
    marker = 'TXBASE'
    Cache.cache_data(
        res_dict,
        nm_marker=
        f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id}'
    )
    del res_dict
    gc.collect()
def run_d2v(sentence_id, word_id, marker, epoch=10, window=30, emb_size=128):
    emb_name = f'EMB_DICT_ZQ_D2V_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id}'
    print(emb_name)
    if word_id == 'industry':
        # fewer epochs for the industry column
        # NOTE: emb_name was built with the default epoch, so the cached file
        # name still carries the original epoch tag for this column
        epoch = 8
    res_dict = d2v_pro(datalog,
                       sentence_id=sentence_id,
                       word_id=word_id,
                       emb_size=emb_size,
                       dropna=False,
                       n_jobs=48,
                       hs=1,
                       window=window,
                       negative=10,
                       epoch=epoch,
                       return_model=False)
    Cache.cache_data(res_dict, nm_marker=emb_name)
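# A minimal driver sketch for the two builders above. The column list and the
# 'user_id' sentence key mirror the inputs used later in this pipeline, but the
# exact loop is an assumption for illustration, not the original driver script.
if __name__ == '__main__':
    for col in ['creative_id', 'ad_id', 'advertiser_id', 'product_id', 'industry']:
        run_w2v(sentence_id='user_id', word_id=col, emb_size=256)
        run_d2v(sentence_id='user_id', word_id=col, marker='TXBASE',
                epoch=10, window=30, emb_size=128)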
def load_idlist(id_list_nm='id_list_dict_max_len_200_all', zero_pre_post='pre'):
    """
    zero_pre_post: "pre" pads zeros at the start of each sequence,
                   "post" pads zeros at the end
    """
    # id_list_dict: dict of padded sequence features plus their vocabularies
    id_list_dict = Cache.reload_cache(file_nm=id_list_nm,
                                      base_dir=INPUT_DATA_BASE_DIR,
                                      pure_nm=True)
    # truncate:
    if USE_SEQ_LENGTH < 200:
        if zero_pre_post == 'pre':
            # zeros padded at the front, so keep the tail: -USE_SEQ_LENGTH:
            for col in EMB_keys2do:
                id_list_dict[col + "_list"]['id_list'] = id_list_dict[
                    col + "_list"]['id_list'][:, -USE_SEQ_LENGTH:]
        elif zero_pre_post == 'post':
            # zeros padded at the back, so keep the head: 0:USE_SEQ_LENGTH
            for col in EMB_keys2do:
                id_list_dict[col + "_list"]['id_list'] = id_list_dict[
                    col + "_list"]['id_list'][:, 0:USE_SEQ_LENGTH]
        else:
            raise NotImplementedError

    KEY2INDEX_DICT = {}  # vocabulary (key2index) of each sequence feature
    SEQ_LENTH_DICT = {}  # truncated length of each sequence feature; usually identical, e.g. 150
    for key in EMB_keys2do:
        KEY2INDEX_DICT[key] = id_list_dict[f'{key}_list']['key2index']
        SEQ_LENTH_DICT[key] = id_list_dict[f'{key}_list']['id_list'].shape[-1]
    if len(set(SEQ_LENTH_DICT.values())) == 1:
        print("GlobalSeqLength:", SEQ_LENTH_DICT[key])
    else:
        print(
            "GlobalSeqLength is not unique! If this is intended, comment out the raise below."
        )
        raise ValueError("sequence features have different lengths")

    # build the padding mask and store it as click_times_list
    array_new = id_list_dict['industry_list']['id_list'].copy()
    array_new = (array_new == 0).astype(np.int32)  # 1 where the position is padding
    id_list_dict['click_times_list'] = {}
    id_list_dict['click_times_list']['id_list'] = array_new  # mask
    del array_new
    gc.collect()

    input_dict_all = {}
    for col in EMB_keys2do:
        input_dict_all[col] = id_list_dict[col + '_list']['id_list']
    input_dict_all['click_times'] = id_list_dict['click_times_list'][
        'id_list']  # add the mask as the click_times input
    return input_dict_all, KEY2INDEX_DICT
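# Usage sketch for load_idlist: the cache name is the function's default, and
# the shape printout is only for illustration.
input_dict_all, KEY2INDEX_DICT = load_idlist(
    id_list_nm='id_list_dict_max_len_200_all', zero_pre_post='pre')
for name, arr in input_dict_all.items():
    print(name, arr.shape, 'vocab size:', len(KEY2INDEX_DICT.get(name, {})))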
def run_w2v(sentence_id, word_id, emb_size=256):
    window = 60
    res_dict = w2v_pro_normal(datalog,
                              sentence_id=sentence_id,
                              word_id=word_id,
                              window=window,
                              emb_size=emb_size,
                              dropna=False,
                              n_jobs=24,
                              epoch=10)
    epoch = 10
    method = 'CBOW'
    author = 'AZ'
    marker = 'CLICK_TIMES_INCREASED'
    Cache.cache_data(
        res_dict,
        nm_marker=
        f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id}'
    )
    del res_dict
    gc.collect()
def run_w2v(sentence_id, word_id, emb_size=256, epoch=10):
    window = 60
    res_dict0, res_dict1 = w2v_pro_item(datalog,
                                        sentence_id=sentence_id,
                                        word_id=word_id,
                                        window=window,
                                        emb_size=emb_size,
                                        dropna=False,
                                        n_jobs=12,
                                        epoch=epoch)
    method = 'cbow'
    author = 'AZ'
    marker = 'CONCAT_' + word_id[1]
    Cache.cache_data(
        res_dict0,
        nm_marker=
        f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id[0]}'
    )
    del res_dict0, res_dict1
    # the category embedding (res_dict1) is not used:
    # Cache.cache_data(res_dict1,
    #                  nm_marker=f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id[1]}')
    gc.collect()
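# This variant expects word_id to be a pair of columns: word_id[0] is the item
# column whose embedding is cached, word_id[1] only contributes to the marker.
# The concrete pair below is an assumption for illustration.
run_w2v(sentence_id='user_id',
        word_id=['creative_id', 'product_category'],
        emb_size=256,
        epoch=10)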
def load_datalabel():
    '''
    :return: traindata with labels, and an array (model_prob) to store
             per-fold class probabilities for train + test users
    '''
    datalabel = Cache.reload_cache(file_nm='datalabel_with_seq_length',
                                   base_dir=INPUT_DATA_BASE_DIR,
                                   pure_nm=True)
    # shift labels so that they start from 0
    if datalabel['age'].min() == 1:
        datalabel['age'] = datalabel['age'] - 1
    if datalabel['gender'].min() == 1:
        datalabel['gender'] = datalabel['gender'] - 1
    assert datalabel['age'].min() == 0
    assert datalabel['gender'].min() == 0
    datalabel = datalabel[['user_id', 'gender', 'age']]
    traindata = datalabel.loc[~datalabel['age'].isna()].reset_index(drop=True)
    testdata = datalabel.loc[datalabel['age'].isna()].copy().reset_index(
        drop=True)
    traindata['age'] = traindata['age'].astype(np.int8)
    traindata['gender'] = traindata['gender'].astype(np.int8)
    traindata['age_gender'] = traindata['gender'] * 10 + traindata['age']
    # gender = 0: age      => 0~9
    # gender = 1: age + 10 => 10~19
    print(
        f"traindata['age_gender'].unique(): {sorted(traindata['age_gender'].unique())}"
    )
    print(traindata.shape, testdata.shape)

    # init array to store oof and test probabilities for every fold
    train_shape = traindata.shape[0]
    test_shape = testdata.shape[0]
    model_prob = np.zeros((train_shape + test_shape, NUM_CLASSES, N_FOLDS),
                          dtype='float32')
    all_uid_df = datalabel[['user_id']].copy()  # to save the model_prob
    train_uid_df = traindata[['user_id']].copy()  # to save the oof_prob
    if not isTEST:
        os.makedirs(f"../../05_RESULT/META/{TRAIN_MARKER}", exist_ok=True)
        os.makedirs("../../05_RESULT/SUB", exist_ok=True)
        all_uid_df.to_csv(
            f"../../05_RESULT/META/{TRAIN_MARKER}/SAVE_all_uid_df.csv",
            index=False)
        train_uid_df.to_csv(
            f"../../05_RESULT/META/{TRAIN_MARKER}/SAVE_train_uid_df.csv",
            index=False)
    return traindata, model_prob
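# Sketch of how the model_prob buffer returned above might be reduced once every
# fold has written its class probabilities into model_prob[:, :, fold]; the
# decoding follows the gender * 10 + age encoding defined in load_datalabel.
# The fold-training loop itself is assumed and not shown here.
traindata, model_prob = load_datalabel()
final_prob = model_prob.mean(axis=-1)        # (n_train + n_test, NUM_CLASSES)
age_gender_pred = final_prob.argmax(axis=1)  # class index in 0..19
gender_pred = age_gender_pred // 10
age_pred = age_gender_pred % 10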
def random_get_embedding_fun(self, id_list_dict):
    emb_matrix_dict = {}
    for col in self.use_cols:
        col_file_names = []
        sepc_embs = self.spec_emb_dict[col]  # embeddings that must be included
        # randomly sample embedding files for this column: the required ones are
        # taken first, then extra files are appended until max_nums is reached.
        # File names are expected to contain user_id_<col>.
        for indexpath, pathi in enumerate(self.path_list):
            for filei in os.listdir(pathi):
                if filei.find('user_id_' + col) > -1:
                    col_file_names.append(pathi + filei)
        if len(sepc_embs) > 0:
            # exclude the required files from the random pool
            col_file_names = list(
                set(col_file_names).difference(set(sepc_embs)))
        random.shuffle(col_file_names)
        select_nums = min(
            [len(col_file_names),
             self.max_nums[col] - len(sepc_embs)])  # number of extra files to pick
        file_to_load = col_file_names[:select_nums]  # the extra embeddings
        file_to_load = sepc_embs + file_to_load
        emblist = []
        for filei in file_to_load:
            try:
                emb_i = Cache.reload_cache(file_nm=filei,
                                           base_dir='',
                                           pure_nm=False)['word_emb_dict']
                emblist.append(emb_i)
            except Exception:
                print('missing! ', filei)
        print('processing {} shape {}'.format(col, len(emblist)))
        print(file_to_load)  # the selected files
        emb_matrix_all = self.get_batch_emb_matrix(
            file_to_load,
            emblist,
            id_list_dict,
            col + '_list',
            max_embs=self.max_embs[col])  # id_list_dict is passed in from outside
        emb_matrix_dict[col] = emb_matrix_all  # a list of matrices
        del emb_matrix_all, emblist
        gc.collect()
    # keys are column names; each value is a list of embedding matrices for
    # that column, each of shape vocab_size x emb_size
    return emb_matrix_dict
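# get_batch_emb_matrix is not shown in this snippet. A minimal sketch of the
# alignment step it implies (mapping a {word: vector} dict onto the key2index
# vocabulary, with row 0 left for padding) could look like the following; the
# function name and the zero-row convention are assumptions, not the project's
# exact implementation.
import numpy as np


def build_emb_matrix(word_emb_dict, key2index, emb_size):
    """Return a (vocab_size + 1) x emb_size matrix aligned to key2index."""
    emb_matrix = np.zeros((len(key2index) + 1, emb_size), dtype=np.float32)
    for word, idx in key2index.items():
        vec = word_emb_dict.get(word)
        if vec is not None:
            emb_matrix[idx] = vec  # unseen words and the padding row stay zero
    return emb_matrix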
BATCH_SIZE = 512
SEQ_LENGTH = 150
DROPOUT = 0.3
NUM_CLASS = 20
EPOCHS = 30
LR = 1e-3
device = torch.device("cuda:0")

##############################
###### load embeddings #######
##############################
seq_length_creative_id = 150  # all sequences are padded to length 150
id_list_dict = Cache.reload_cache(
    file_nm='../../cached_data/CACHE_id_list_dict_150_normal.pkl',
    base_dir='',
    pure_nm=False)
# sequence features to embed
cols_to_emb = [
    'creative_id', 'ad_id', 'advertiser_id', 'product_id', 'product_category',
    'industry', 'time'
]
# directories holding the cached embedding files
path_list = ['../../cached_data/']
# maximum embedding size per feature
max_embs = {
    'creative_id': 2000,
    'ad_id': 2000,
    'advertiser_id': 2000,
    'product_id': 2000,
def _load_merged_emb(emb_lst):
    """Load each cached embedding file and key it as emb_1, emb_2, ..."""
    all_emb_dict = {}
    for i, nm in enumerate(emb_lst, 1):
        all_emb_dict[f"emb_{i}"] = Cache.reload_cache(nm)
    return all_emb_dict
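# Hypothetical usage of _load_merged_emb: the cache names below are placeholders
# following the EMB_DICT_* naming pattern used earlier, not guaranteed file names.
emb_files = [
    'EMB_DICT_AZ_CBOW_TXBASE_150WINDOW_10EPOCH_user_id_creative_id',
    'EMB_DICT_ZQ_D2V_TXBASE_30WINDOW_10EPOCH_user_id_creative_id',
]
all_emb_dict = _load_merged_emb(emb_files)
print(list(all_emb_dict))  # ['emb_1', 'emb_2']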
    if max_len is None:
        max_len = int(np.percentile(id_list_length, 99))
    # pre padding: zeros are added before the sequence
    id_list = pad_sequences(id_list,
                            maxlen=max_len,
                            padding='pre',
                            truncating='pre')
    return id_list, key2index


id_list_dict = {}
for col in tqdm(sequence_features):
    id_list, key2index = get_sequence(datalabel, col, max_len=150)
    # id_list: padded index sequences; key2index: word -> index vocabulary
    id_list_dict[col] = {'id_list': id_list, 'key2index': key2index}
Cache.cache_data(id_list_dict, nm_marker='id_list_dict_150_normal')

# ##################################################################################################################
# get time embedding
import datetime

# the log covers 2019-09-01 to 2019-11-30
id_list_dict = Cache.reload_cache(file_nm=data_path +
                                  'CACHE_id_list_dict_150_normal.pkl',
                                  base_dir='',
                                  pure_nm=False)


class strTimeEmb(object):
    '''
    one-hot encode some features of time: day of week, whether it is a weekend