def run_w2v(sentence_id, word_id, emb_size=256):
    '''
    Train a large-window Word2Vec embedding and cache the resulting dict.

    :param sentence_id: sentence groupby key
    :param word_id: col used as word
    :param emb_size: output embedding size used in w2v
    :return: None; the embedding dict is cached to disk
    '''
    # large window embedding
    window = 150
    epoch = 5  # keep the cache name consistent with the epochs actually trained
    res_dict = w2v_pro_normal(datalog,
                              sentence_id=sentence_id,
                              word_id=word_id,
                              window=window,
                              emb_size=emb_size,
                              dropna=False,
                              n_jobs=12,
                              epoch=epoch)
    method = 'CBOW'
    author = 'AZ'
    marker = 'TXBASE'
    Cache.cache_data(
        res_dict,
        nm_marker=
        f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id}'
    )
    del res_dict
    gc.collect()
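# For reference, a minimal sketch of what a helper like w2v_pro_normal is assumed to
# do: collect each sentence_id group's word_id sequence as one sentence, train a CBOW
# Word2Vec model, and return a {token: vector} dict. The helper's real implementation
# is not shown in this file; _w2v_sketch and the gensim 4.x parameter names
# (vector_size/epochs, formerly size/iter) are assumptions.
def _w2v_sketch(df, sentence_id, word_id, window=150, emb_size=256, epoch=5, n_jobs=12):
    from gensim.models import Word2Vec
    # one sentence per sentence_id: the ordered list of word_id tokens
    sentences = df.groupby(sentence_id)[word_id].apply(
        lambda s: [str(x) for x in s]).tolist()
    model = Word2Vec(sentences,
                     vector_size=emb_size,
                     window=window,
                     min_count=1,
                     sg=0,  # CBOW, matching the 'CBOW' marker above
                     workers=n_jobs,
                     epochs=epoch)
    return {w: model.wv[w] for w in model.wv.index_to_key}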
def run_d2v(sentence_id, word_id, marker, epoch=10, window=30, emb_size=128):
    # use fewer epochs for the industry column
    if word_id == 'industry':
        epoch = 8
    # build the cache name after the epoch override so the name matches training
    emb_name = f'EMB_DICT_ZQ_D2V_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id}'
    print(emb_name)
    res_dict = d2v_pro(datalog,
                       sentence_id=sentence_id,
                       word_id=word_id,
                       emb_size=emb_size,
                       dropna=False,
                       n_jobs=48,
                       hs=1,
                       window=window,
                       negative=10,
                       epoch=epoch,
                       return_model=False)
    Cache.cache_data(res_dict, nm_marker=emb_name)
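# A hedged sketch of what d2v_pro is assumed to do: tag each sentence_id group as one
# document, train gensim Doc2Vec with hierarchical softmax plus negative sampling
# (hs=1, negative=10, matching the call above), and return a {sentence_id: doc vector}
# dict. _d2v_sketch and the gensim 4.x names (vector_size/epochs, model.dv) are
# assumptions about the unshown helper.
def _d2v_sketch(df, sentence_id, word_id, emb_size=128, window=30, epoch=10, n_jobs=48):
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    docs = [TaggedDocument([str(x) for x in words], [str(sid)])
            for sid, words in df.groupby(sentence_id)[word_id].apply(list).items()]
    model = Doc2Vec(docs,
                    vector_size=emb_size,
                    window=window,
                    hs=1,
                    negative=10,
                    workers=n_jobs,
                    epochs=epoch)
    return {tag: model.dv[tag] for tag in model.dv.index_to_key}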
def run_w2v(sentence_id, word_id, emb_size=256):
    # variant of run_w2v for the click-times-increased log; this redefinition
    # replaces the TXBASE version above
    window = 60
    epoch = 10
    res_dict = w2v_pro_normal(datalog,
                              sentence_id=sentence_id,
                              word_id=word_id,
                              window=window,
                              emb_size=emb_size,
                              dropna=False,
                              n_jobs=24,
                              epoch=epoch)
    method = 'CBOW'
    author = 'AZ'
    marker = 'CLICK_TIMES_INCREASED'
    Cache.cache_data(
        res_dict,
        nm_marker=
        f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id}'
    )
    del res_dict
    gc.collect()
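# Cache is a project utility rather than a library import; the minimal pickle-backed
# stand-in below is inferred purely from how cache_data / reload_cache are called in
# this file. The 'CACHE_' filename prefix and the base_dir / pure_nm handling are
# assumptions, so this is a sketch, not the repo's actual class.
import os
import pickle

class _CacheSketch:
    base_dir = './cached_data/'

    @staticmethod
    def cache_data(obj, nm_marker):
        # persist any object under a marker-derived filename
        os.makedirs(_CacheSketch.base_dir, exist_ok=True)
        with open(os.path.join(_CacheSketch.base_dir, f'CACHE_{nm_marker}.pkl'), 'wb') as f:
            pickle.dump(obj, f)

    @staticmethod
    def reload_cache(file_nm, base_dir=None, pure_nm=True):
        # pure_nm=True treats file_nm as a marker; otherwise it is a full path
        if pure_nm:
            file_nm = os.path.join(base_dir or _CacheSketch.base_dir, f'CACHE_{file_nm}.pkl')
        elif base_dir:
            file_nm = base_dir + file_nm
        with open(file_nm, 'rb') as f:
            return pickle.load(f)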
def run_w2v(sentence_id, word_id, emb_size=256, epoch=10):
    # variant that trains on two id columns jointly; word_id is an (item, category) pair
    window = 60
    res_dict0, res_dict1 = w2v_pro_item(datalog,
                                        sentence_id=sentence_id,
                                        word_id=word_id,
                                        window=window,
                                        emb_size=emb_size,
                                        dropna=False,
                                        n_jobs=12,
                                        epoch=epoch)
    method = 'cbow'
    author = 'AZ'
    marker = 'CONCAT_' + word_id[1]
    Cache.cache_data(
        res_dict0,
        nm_marker=
        f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id[0]}'
    )
    # do not use the category embedding
    # Cache.cache_data(res_dict1,
    #                  nm_marker=f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id[1]}')
    del res_dict0, res_dict1
    gc.collect()
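# The pair variant above presumably interleaves the two id columns in one sentence so
# both vocabularies share a context window, then splits the learned vectors into one
# dict per column (only the first is cached). A sketch under that assumption;
# w2v_pro_item's actual logic is not shown in this file, and the 'a_'/'b_' token
# prefixes are illustrative.
def _w2v_pair_sketch(df, sentence_id, word_pair, window=60, emb_size=256, epoch=10):
    from gensim.models import Word2Vec
    col0, col1 = word_pair
    # prefix tokens so ids from the two columns cannot collide in one vocabulary
    sentences = df.groupby(sentence_id).apply(
        lambda g: [tok for a, b in zip(g[col0], g[col1])
                   for tok in (f'a_{a}', f'b_{b}')]).tolist()
    model = Word2Vec(sentences, vector_size=emb_size, window=window,
                     min_count=1, sg=0, epochs=epoch)
    vocab = model.wv.index_to_key
    res_dict0 = {w[2:]: model.wv[w] for w in vocab if w.startswith('a_')}
    res_dict1 = {w[2:]: model.wv[w] for w in vocab if w.startswith('b_')}
    return res_dict0, res_dict1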
    if max_len is None:
        # cap sequence length at the 99th percentile of observed lengths
        max_len = int(np.percentile(id_list_length, 99))
    # pre-padding: zeros are added before the sequence
    id_list = pad_sequences(id_list, maxlen=max_len, padding='pre', truncating='pre')
    return id_list, key2index


id_list_dict = {}
for col in tqdm(sequence_features):
    id_list, key2index = get_sequence(datalabel, col, max_len=150)
    # per column: 'id_list' is the padded key-index sequence,
    # 'key2index' maps each word to its key index
    id_list_dict[col] = {'id_list': id_list, 'key2index': key2index}
Cache.cache_data(id_list_dict, nm_marker='id_list_dict_150_normal')

# ##################################################################################################################
# get time embedding
import datetime

# the log spans 2019-09-01 to 2019-11-30
id_list_dict = Cache.reload_cache(file_nm=data_path + 'CACHE_id_list_dict_150_normal.pkl',
                                  base_dir='',
                                  pure_nm=False)


class strTimeEmb(object):
    '''
    One-hot encode several time-derived features:
    day of week, whether the day is a weekend