# Example 1 (score: 0)
    def run_w2v(sentence_id, word_id, emb_size=256):
        '''Train a large-window word2vec embedding over ``datalog`` and cache it.

        :param sentence_id: sentence groupby key
        :param word_id: col as word
        :param emb_size: output embedding size used in w2v
        :return: None — the result dict is persisted via ``Cache.cache_data``
        '''
        # large window embedding
        window = 150
        # Single source of truth for epochs: used both for training and in the
        # cache name.  BUG FIX: previously the model trained with epoch=5 but
        # the cache name hard-coded 10EPOCH, mislabeling the artifact.
        epoch = 5
        res_dict = w2v_pro_normal(datalog,
                                  sentence_id=sentence_id,
                                  word_id=word_id,
                                  window=window,
                                  emb_size=emb_size,
                                  dropna=False,
                                  n_jobs=12,
                                  epoch=epoch)
        method = 'CBOW'
        author = 'AZ'
        marker = 'TXBASE'
        # NOTE(review): this changes the persisted file name for this embedding
        # (5EPOCH instead of the previously wrong 10EPOCH) — update any reader
        # that referenced the old name.
        Cache.cache_data(
            res_dict,
            nm_marker=
            f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id}'
        )
        # Free the (potentially large) embedding dict eagerly.
        del res_dict
        gc.collect()
def run_d2v(sentence_id, word_id, marker, epoch=10, window=30, emb_size=128):
    """Train a doc2vec embedding over ``datalog`` and cache the result.

    :param sentence_id: sentence groupby key
    :param word_id: column used as the word/document token
    :param marker: tag embedded into the cache name
    :param epoch: training epochs (overridden to 8 for ``word_id == 'industry'``)
    :param window: d2v context window size
    :param emb_size: output embedding size
    :return: None — the result dict is persisted via ``Cache.cache_data``
    """
    # BUG FIX: apply the epoch override BEFORE building the cache name.
    # Previously the name was constructed first, so for 'industry' the cached
    # artifact was labeled 10EPOCH while training actually ran 8 epochs.
    if word_id == 'industry':
        epoch = 8
    emb_name = f'EMB_DICT_ZQ_D2V_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id}'
    print(emb_name)
    res_dict = d2v_pro(datalog,
                       sentence_id=sentence_id,
                       word_id=word_id,
                       emb_size=emb_size,
                       dropna=False,
                       n_jobs=48,
                       hs=1,  # hierarchical softmax on (with negative sampling)
                       window=window,
                       negative=10,
                       epoch=epoch,
                       return_model=False)

    Cache.cache_data(res_dict, nm_marker=emb_name)
# Example 3 (score: 0)
    def run_w2v(sentence_id, word_id, emb_size=256):
        '''Train a word2vec embedding over ``datalog`` and cache it.

        :param sentence_id: sentence groupby key
        :param word_id: col as word
        :param emb_size: output embedding size used in w2v
        :return: None — the result dict is persisted via ``Cache.cache_data``
        '''
        # Consistency fix: window and epoch are defined once and passed through,
        # so the training call and the cache name can never drift apart
        # (previously the call hard-coded the literals 60 and 10).
        window = 60
        epoch = 10
        res_dict = w2v_pro_normal(datalog,
                                  sentence_id=sentence_id,
                                  word_id=word_id,
                                  window=window,
                                  emb_size=emb_size,
                                  dropna=False,
                                  n_jobs=24,
                                  epoch=epoch)
        method = 'CBOW'
        author = 'AZ'
        marker = 'CLICK_TIMES_INCREASED'

        Cache.cache_data(
            res_dict,
            nm_marker=
            f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id}'
        )
        # Free the (potentially large) embedding dict eagerly.
        del res_dict
        gc.collect()
# Example 4 (score: 0)
 def run_w2v(sentence_id, word_id, emb_size=256, epoch=10):
     '''Train paired word2vec embeddings over ``datalog`` and cache the first.

     :param sentence_id: sentence groupby key
     :param word_id: pair of columns — assumes word_id[0] is the item column
         and word_id[1] the category column (TODO confirm against callers)
     :param emb_size: output embedding size used in w2v
     :param epoch: training epochs, also embedded in the cache name
     :return: None — res_dict0 is persisted via ``Cache.cache_data``
     '''
     window = 60
     res_dict0, res_dict1 = w2v_pro_item(datalog,
                                         sentence_id=sentence_id,
                                         word_id=word_id,
                                         window=window,
                                         emb_size=emb_size,
                                         dropna=False,
                                         n_jobs=12,
                                         epoch=epoch)
     # (removed dead no-op `epoch = epoch`)
     method = 'cbow'
     author = 'AZ'
     marker = 'CONCAT_' + word_id[1]
     Cache.cache_data(
         res_dict0,
         nm_marker=
         f'EMB_DICT_{author}_{method}_{marker}_{window}WINDOW_{epoch}EPOCH_{sentence_id}_{word_id[0]}'
     )
     # res_dict1 (the category embedding) is deliberately NOT cached or used.
     del res_dict0, res_dict1
     gc.collect()
# Example 5 (score: 0)
        if max_len is None:
            max_len = int(np.percentile(id_list_length, 99))
        # pre padding , 0 before sequence
        id_list = pad_sequences(id_list,
                                maxlen=max_len,
                                padding='pre',
                                truncating='pre')
        return id_list, key2index

    # Build, for every sequence feature column, a padded index sequence plus
    # its vocabulary mapping, then persist the whole dict in one cache entry.
    id_list_dict = {}
    for col in tqdm(sequence_features):
        # max_len=150: every sequence is padded/truncated to 150 tokens
        # (matches the '150' in the cache marker below).
        id_list, key2index = get_sequence(datalabel, col, max_len=150)
        # id_list: padded key-index sequences; key2index: word -> key index
        id_list_dict[col] = {'id_list': id_list, 'key2index': key2index}

    Cache.cache_data(id_list_dict, nm_marker='id_list_dict_150_normal')

    # ##################################################################################################################
    # get time embedding
    # NOTE(review): mid-scope import — presumably fine here, but conventionally
    # this belongs at the top of the file.
    import datetime
    # during 2019-09-01 to 2019-11-30
    # Reload the cache written just above, but by explicit file path
    # (pure_nm=False) instead of by marker name — assumes data_path points at
    # the cache directory; TODO confirm it matches Cache.cache_data's target.
    id_list_dict = Cache.reload_cache(file_nm=data_path +
                                      'CACHE_id_list_dict_150_normal.pkl',
                                      base_dir='',
                                      pure_nm=False)

    class strTimeEmb(object):
        '''
    	# time 中一些特征做onehot encoding
        周x
        是否是周末