Example #1
def load_idlist(id_list_nm='id_list_dict_150_normal', zero_pre_post='pre'):
    """
    zero_pre_post: "pre"表示序列开头填充0,"post"表示序列尾部填充0
    """
    # id_list_dict: 包含padding后的序列特征字典以及词表
    id_list_dict = Cache.reload_cache(file_nm=id_list_nm,
                                      base_dir=INPUT_DATA_BASE_DIR,
                                      pure_nm=True)
    #     # time补上'-1'
    #     id_list_dict['time_list']['key2index']['-1'] = 92
    # truncate:
    if USE_SEQ_LENGTH < 150:
        if zero_pre_post == 'pre':  # zeros padded at the front, so truncate from the end: -USE_SEQ_LENGTH:
            for col in EMB_keys2do:
                id_list_dict[col + "_list"]['id_list'] = id_list_dict[
                    col + "_list"]['id_list'][:, -USE_SEQ_LENGTH:]

        elif zero_pre_post == 'post':  # zeros padded at the back, so truncate from the front: 0:USE_SEQ_LENGTH
            for col in EMB_keys2do:
                id_list_dict[col + "_list"]['id_list'] = id_list_dict[
                    col + "_list"]['id_list'][:, 0:USE_SEQ_LENGTH]
        else:
            raise NotImplementedError

    KEY2INDEX_DICT = {}  # vocabulary (key -> index) of each sequence feature
    SEQ_LENTH_DICT = {}  # truncated length of each sequence; usually identical, e.g. 150 here

    for key in EMB_keys2do:
        KEY2INDEX_DICT[key] = id_list_dict[f'{key}_list']['key2index']
        SEQ_LENTH_DICT[key] = id_list_dict[f'{key}_list']['id_list'].shape[-1]

    if len(set(SEQ_LENTH_DICT.values())) == 1:
        print("GlobalSeqLength:", SEQ_LENTH_DICT[key])
    else:
        raise ValueError(
            "GlobalSeqLength is not unique! If this is intended, remove this check."
        )

    # build a mask and store it under click_times_list
    array_new = id_list_dict['industry_list']['id_list'].copy()
    array_new = (array_new == 0).astype(np.int32)
    id_list_dict['click_times_list'] = {}
    id_list_dict['click_times_list']['id_list'] = array_new  # mask
    del array_new
    gc.collect()

    input_dict_all = {}
    for col in EMB_keys2do:
        input_dict_all[col] = id_list_dict[col + '_list']['id_list']
    # add the click_times mask
    input_dict_all['click_times'] = id_list_dict['click_times_list']['id_list']
    return input_dict_all, KEY2INDEX_DICT
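
# A minimal sketch (not part of the original) illustrating the two truncation
# modes handled above on a toy padded array: 'pre'-padded sequences keep the
# last USE_SEQ_LENGTH steps, 'post'-padded sequences keep the first ones.
import numpy as np

toy = np.array([[0, 0, 5, 7, 9],    # 'pre'-padded: zeros at the front
                [3, 4, 6, 0, 0]])   # 'post'-padded: zeros at the back
demo_len = 3                        # hypothetical USE_SEQ_LENGTH
print(toy[:, -demo_len:])           # keep the tail -> [[5 7 9] [6 0 0]]
print(toy[:, :demo_len])            # keep the head -> [[0 0 5] [3 4 6]]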
Example #2
def write(feature1_feature2):
    # write each sequence as one space-separated line; empty sequences become '-2'
    list_df = Cache.reload_cache('CACHE_list_df_adv_userseq_' + feature1_feature2 + '.pkl')[0].values.tolist()
    with open('adv_userseq_' + feature1_feature2 + '.txt', 'w') as f:
        for i in list_df:
            if i:
                for j in i:
                    f.write(str(j))
                    f.write(' ')
            else:
                f.write(str(-2))
                f.write(' ')
            f.write('\n')
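
# A follow-up sketch (assumption, not shown in the original): the space-separated
# token file written above is a natural input for gensim's LineSentence, e.g. to
# train the word2vec model whose .kv file Example #6 later loads. The file names
# below are hypothetical.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('adv_userseq_task_id_age.txt')
w2v = Word2Vec(sentences, size=64, window=5, min_count=1, workers=4, seed=0)
w2v.wv.save('adv_userseq_task_id_age_word2vec.kv')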
Example #3
def get_embedding(f1_f2, f1):
    avg_f1 = Cache.reload_cache('CACHE_list_df_avg_' + f1_f2 + '.pkl')
    feature_tokens = avg_f1[[1]].values.flatten().astype(str).tolist()
    tokenizer = Tokenizer(num_words=len(feature_tokens) + 1)
    tokenizer.fit_on_texts(feature_tokens)
    embedding_dim = 64
    embedding_matrix = np.random.randn(len(feature_tokens) + 1, embedding_dim)
    avg_f1_copy = avg_f1.copy()
    avg_f1_copy = avg_f1_copy.set_index(1)

    for feature in feature_tokens:
        embedding_vector = np.array(avg_f1_copy.loc[int(feature), :].values[0])
        if embedding_vector is not None:
            index = tokenizer.texts_to_sequences([feature])[0][0]
            embedding_matrix[index] = embedding_vector
    return embedding_matrix
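
# A minimal sketch (assumption) of how such a matrix is typically consumed:
# a frozen tf.keras Embedding layer whose rows are aligned with the tokenizer
# indices used above. The sizes below are placeholders.
import numpy as np
import tensorflow as tf

vocab_size, embedding_dim = 1000 + 1, 64
embedding_matrix = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
emb_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)  # keep the pre-trained vectors fixed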
Example #4
def input_w2v(f1_f2, all_data, f2):
    feature_seq = all_data[[f2]].values.flatten().astype(str).tolist()

    avg_f1 = Cache.reload_cache('CACHE_list_df_avg_' + f1_f2 + '.pkl')
    feature_tokens = avg_f1[[1]].values.flatten().astype(str).tolist()
    tokenizer = Tokenizer(num_words=len(feature_tokens) + 1)
    tokenizer.fit_on_texts(feature_tokens)

    npy_path = f1_f2
    sequences = tokenizer.texts_to_sequences(feature_seq[:41907133])
    x_train = pad_sequences(sequences, maxlen=1, padding='post')
    print(x_train.shape)
    np.save(npy_path + '_f2_train.npy', x_train)

    sequences = tokenizer.texts_to_sequences(feature_seq[41907133:])
    x_test = pad_sequences(sequences, maxlen=1, padding='post')
    print(x_test.shape)
    np.save(npy_path + '_f2_test.npy', x_test)
def get_emb_matrix(col):
    """
    inputs:    
    col 需要做成预训练emb_matrix的列
    
    加载:
    emb_dict 预训练的词向量
    word_emb_dict 字典
    id_list_dict 字典索引序列
    
    得出id_list_dict+emb_matrix
    """
    id_list_dict_all = Cache.reload_cache(
        f'CACHE_EMB_INPUTSEQ_stage2_{col}.pkl')
    #     id_list_dict = id_list_dict_all['id_list']
    #     key2index = id_list_dict_all['key2index']
    #     emb = id_list_dict_all['emb']
    key_to_represent_rare = '-1'
    words = list(id_list_dict_all['emb'].keys())
    emb_size = id_list_dict_all['emb'][words[0]].shape[0]
    voc_size = len(words)
    emb_matrix = np.zeros((voc_size + 1, emb_size))
    # emb must contain '-1' to serve as index 0
    if '-1' not in id_list_dict_all['key2index'].keys():
        # '-1' is missing, so emb covers the full vocabulary; compute a mean vector ourselves
        # and add it as the embedding for rare/unknown keys
        vector_low_frequency_words = np.zeros((emb_size, ))
        for w in words:
            vector_low_frequency_words += id_list_dict_all['emb'][w]
        vector_low_frequency_words = vector_low_frequency_words / voc_size
        # add the key/value pair to emb
        id_list_dict_all['emb'][
            key_to_represent_rare] = vector_low_frequency_words
        # print(f'{col} has no key_to_represent_rare; added the mean low-frequency vector:', vector_low_frequency_words)
    for k, idx in id_list_dict_all['key2index'].items():
        try:
            emb_matrix[idx, :] = id_list_dict_all['emb'][k]
        except KeyError:  # if k is not in the embedding dict, fall back to key_to_represent_rare
            # print('found oov:', (k, idx))
            emb_matrix[idx, :] = id_list_dict_all['emb'][key_to_represent_rare]
    emb_matrix = np.float32(emb_matrix)
    return {col: [id_list_dict_all['id_list'], emb_matrix]}
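
# A consumption sketch (assumption): get_emb_matrix returns, per column, the
# padded index sequences (n_samples, max_len) and an aligned matrix of shape
# (voc_size + 1, emb_size); a per-column frozen Embedding layer can be built
# from them. 'adv_id' is a hypothetical column name.
import tensorflow as tf

res = get_emb_matrix('adv_id')
id_list, emb_matrix = res['adv_id']
seq_input = tf.keras.Input(shape=(id_list.shape[1],), dtype='int32', name='adv_id')
seq_emb = tf.keras.layers.Embedding(
    emb_matrix.shape[0], emb_matrix.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(emb_matrix),
    trainable=False)(seq_input)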
Example #6
def get_embedding(f1_f2,f1):
    path = 'adv_userseq_'+f1_f2+'_word2vec.kv'
    wv = KeyedVectors.load(path, mmap='r')
    list_df = Cache.reload_cache('CACHE_list_df_adv_userseq_'+f1_f2+'.pkl')
    list_df.columns=['list',f1] 
    f = open('adv_userseq_'+f1_f2+'.txt','r')
    ind = 0
    buf = []
    for i in f:
        tokens = i.strip().split(' ')
        buf_ = np.zeros(64)
        for j in tokens:
            buf_ = buf_ + wv[j]
        buf_ = buf_ / len(tokens)  # average over the tokens
        buf_f1 = list_df.at[ind, f1]
        buf__ = []
        buf_ = buf_.tolist()
        buf__.append(buf_)
        buf__.append(buf_f1)
        buf.append(buf__)
        ind = ind + 1
    f.close()
    df_f1_list = pd.DataFrame(buf)
    Cache.cache_data(df_f1_list, nm_marker='list_df_avg_adv_userseq_'+f1_f2)
    return 0
Example #7
            list_feature2_.append(i)
            list_feature2.append(list_feature2_)
        list_df = pd.DataFrame(list_feature2)
        Cache.cache_data(list_df, nm_marker='list_df_adv_userseq_'+feature1+'_'+feature2)
        del list_df,data_group,feature2_name_list,list_feature2_,index_get_group,list_feature2
        gc.collect()
        return True
    except:
        return False


# In[4]:



train = Cache.reload_cache('CACHE_train_raw.pkl').drop(columns = ['communication_onlinerate']).astype(int)
train = reduce_mem(train, use_float16=True)
test = Cache.reload_cache('CACHE_test_B_raw.pkl').drop(columns = ['id','communication_onlinerate']).astype(int)
test = reduce_mem(test, use_float16=True)
data = pd.concat([train,test],axis=0,ignore_index=True)
data = reduce_mem(data, use_float16=True)
del train,test
gc.collect()
poc_feature1_list = [['task_id','age'],['task_id','city'],['task_id','city_rank'],['task_id','device_name'],['task_id','career'],
                  ['task_id','gender'],['task_id','residence'],['adv_id','age'],['adv_id','city'],['adv_id','city_rank'],
                  ['adv_id','device_name'],['adv_id','career'],['adv_id','gender'],['adv_id','residence'],['creat_type_cd','age'],
                  ['creat_type_cd','city'],['creat_type_cd','city_rank'],['creat_type_cd','device_name'],['creat_type_cd','career'],
                  ['creat_type_cd','gender'],['creat_type_cd','residence'],['indu_name','age'],['indu_name','city'],['indu_name','city_rank'],
                  ['indu_name','device_name'],['indu_name','career'],['indu_name','gender'],['indu_name','residence'],['adv_prim_id','age'],
                  ['adv_prim_id','city'],['adv_prim_id','city_rank'],['adv_prim_id','device_name'],['adv_prim_id','career'],['adv_prim_id','gender'],
                  ['adv_prim_id','residence']]
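
# A usage sketch (assumption): iterate the (feature1, feature2) pairs above and
# call the write() helper from Example #2, mirroring the loop Example #20 uses
# for input_w2v.
for f1, f2 in poc_feature1_list:
    write(str(f1) + '_' + str(f2))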
Example #8
train = pd.read_csv(
    r'train_data.csv', sep='|',
    dtype=str).drop(columns=['communication_onlinerate']).astype(int)
train = reduce_mem(train, use_float16=True)
test = pd.read_csv(
    r'test_data_A.csv', sep='|',
    dtype=str).drop(columns=['id', 'communication_onlinerate']).astype(int)
test.insert(0, 'label', np.ones([1000000]))
test['label'] = 2
test = reduce_mem(test, use_float16=True)
data = pd.concat([train, test], axis=0, ignore_index=True)
data = reduce_mem(data, use_float16=True)
data_uid_ptd_feature = data[['uid', 'pt_d', feature]]

list_data = Cache.reload_cache('CACHE_list_df_adv_id.pkl')
list_data.columns = ['list', 'pt_d', 'uid']
list_data = pd.merge(data_uid_ptd_feature,
                     list_data,
                     how='left',
                     on=('uid', 'pt_d'))
list_data = list_data['list'].values.tolist()
index = 0
list_data_ = []
for i in list_data:
    i.append(data_uid_ptd_feature.at[index, feature])
    list_data_.append(i)
    index = index + 1

f = open(f_path + '.txt', 'w')
for i in tqdm(list_data_):
Example #9
def gen_list_df(feature):
    print(f'{feature} start!')
    data = Cache.reload_cache(
        'CACHE_data_sampling_pos1_neg5.pkl')  # build sequences directly on the sampled data
    if feature == 'label':
        data.loc[data['pt_d'] >= 8, 'label'] = -1  # mask the labels of the test rows
        data['label'] = data['label'].astype(np.int8)
        data['label'] = data['label'] + 1  # shift by 1 because 0 is reserved for padding
    data = data[['uid', feature, 'pt_d']]
    gc.collect()
    print(data.shape)
    data_group = data.groupby(['uid'])
    gc.collect()
    index_list = []
    feature_list = []
    print('index_list start')
    for name, group in tqdm(data_group):
        index_list.append(name)
    print('feature_list start')
    for i in tqdm(index_list):
        index_get_group = data_group.get_group(i)
        ptd_set = set(index_get_group['pt_d'].values.flatten().tolist())
        for j in ptd_set:
            feature_list_ = []
            buf_list = []
            buf_list = index_get_group.query('pt_d < @j')[
                feature].values.flatten().tolist()  # click sequence before this row's day
            buf_list.append(0)  # padding 0
            feature_list_.append(buf_list)  # behaviour sequence
            feature_list_.append(j)  # pt_d
            feature_list_.append(i)  # uid
            feature_list.append(feature_list_)

    list_df = pd.DataFrame(feature_list)
    del index_list, feature_list, feature_list_, data_group, index_get_group, ptd_set
    gc.collect()
    list_df.columns = ['list', 'pt_d', 'uid']
    list_df['list'] = list_df['list'].map(lambda x: [str(i)
                                                     for i in x])  # convert to str
    list_df = list_df.drop_duplicates(subset=['pt_d', 'uid'])
    list_df = data.merge(list_df, how='left',
                         on=('uid', 'pt_d'))  # keep the row order of data
    # prepend the current sample's own value for that day; not done for label
    if feature != 'label':
        list_df['list'] = list_df[feature].map(
            lambda x: [str(x)]) + list_df['list']
    print('w2v start!')
    emb_size = 32  # pre-trained embedding dim
    model = Word2Vec(list_df['list'].values.tolist(),
                     size=emb_size,
                     window=5,
                     workers=5,
                     min_count=1,
                     sg=0,
                     hs=0,
                     negative=5,
                     iter=5,
                     seed=0)
    # 1. build the padded index sequences and the key2index vocabulary
    id_list, key2index = get_sequence(list_df, 'list', max_len=40)
    # 2. build the embedding dict from the trained word2vec model
    emb_dict = {}
    for word_i in list(model.wv.vocab.keys()):
        if word_i in model.wv:
            emb_dict[word_i] = model.wv[word_i]
        else:
            emb_dict[word_i] = np.zeros(emb_size)
    # 3. cache the result
    id_list_dict = {}
    id_list_dict['id_list'] = id_list
    id_list_dict['key2index'] = key2index
    id_list_dict['emb'] = emb_dict
    Cache.cache_data(id_list_dict, nm_marker=f'EMB_INPUTSEQ_stage2_{feature}')
    print(f'{feature} done!')
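
# get_sequence() is not defined in this excerpt. A plausible minimal sketch
# (assumption, based on how its outputs are used above): map each token list to
# integer ids, pad/truncate to max_len, and return the id matrix plus the
# key -> index vocabulary (index 0 reserved for padding).
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_sequence_sketch(df, col, max_len=40):
    key2index = {}
    def encode(tokens):
        ids = []
        for tok in tokens:
            if tok not in key2index:
                key2index[tok] = len(key2index) + 1
            ids.append(key2index[tok])
        return ids
    id_list = pad_sequences(df[col].map(encode).tolist(), maxlen=max_len)
    return id_list, key2index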
Example #10
pd.set_option('max_colwidth', 200)
pd.set_option('display.width', 5000)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

SEED = 999
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"] = '4'
gpus = tf.config.list_physical_devices("GPU")
tf.config.experimental.set_memory_growth(gpus[0], True)

# %%
data = Cache.reload_cache('CACHE_data_step_1_feature_0924_r5.pkl')
data.drop(columns=['communication_onlinerate'], inplace=True)

sparse_features = [
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id',
    'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
    'app_second_class', 'city', 'device_name', 'career', 'gender', 'net_type',
    'residence', 'emui_dev', 'indu_name', 'cmr_0', 'cmr_1', 'cmr_2', 'cmr_3',
    'cmr_4', 'cmr_5', 'cmr_6', 'cmr_7', 'cmr_8', 'cmr_9', 'cmr_10', 'cmr_11',
    'cmr_12', 'cmr_13', 'cmr_14', 'cmr_15', 'cmr_16', 'cmr_17', 'cmr_18',
    'cmr_19', 'cmr_20', 'cmr_21', 'cmr_22', 'age', 'city_rank'
]
# cmr_23 is dropped
dense_features = [
    i for i in data.columns if i not in sparse_features +
    ['index', 'id', 'uid', 'level_0', 'pt_d', 'label']
Example #11
                                 period=1)
    earlystop_callback = EarlyStopping(
        monitor="val_AUC",
        min_delta=0.00001,
        patience=3,
        verbose=1,
        mode="max",
        baseline=None,
        restore_best_weights=True,
    )
    reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_AUC', factor=0.5, patience=1, min_lr=0.0000001)

    #!################################################################################################################

    deepfm_data = Cache.reload_cache('CACHE_data_deepfm.pkl')
    label = Cache.reload_cache('CACHE_train_NONcmr.pkl')['label'].values

    emb1 = np.load('adv_idembedding_matrix.npy', allow_pickle=True)
    emb2 = np.load('adv_prim_idembedding_matrix.npy', allow_pickle=True)
    emb3 = np.load('creat_type_cdembedding_matrix.npy', allow_pickle=True)
    emb4 = np.load('indu_nameembedding_matrix.npy', allow_pickle=True)
    emb5 = np.load('task_idembedding_matrix.npy', allow_pickle=True)
    emb_label = np.load('labelembedding_matrix.npy', allow_pickle=True)

    trans_1_train = np.load('adv_idx_train.npy', allow_pickle=True)
    trans_2_train = np.load('adv_prim_idx_train.npy', allow_pickle=True)
    trans_3_train = np.load('creat_type_cdx_train.npy', allow_pickle=True)
    trans_4_train = np.load('indu_namex_train.npy', allow_pickle=True)
    trans_5_train = np.load('task_idx_train.npy', allow_pickle=True)
    trans_label_train = np.load('labelx_train.npy', allow_pickle=True)
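
    # A self-contained sketch (assumption) of how the callbacks defined above
    # are passed to training; the toy model and data below are placeholders and
    # not part of the original pipeline. 'val_AUC' is produced because the AUC
    # metric is named 'AUC' and validation data is provided.
    import numpy as np
    import tensorflow as tf

    toy_model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
    toy_model.compile(optimizer='adam', loss='binary_crossentropy',
                      metrics=[tf.keras.metrics.AUC(name='AUC')])
    toy_x = np.random.rand(256, 8).astype('float32')
    toy_y = np.random.randint(0, 2, size=256)
    toy_model.fit(toy_x, toy_y, validation_split=0.2, epochs=2, verbose=0,
                  callbacks=[earlystop_callback, reduce_lr_callback])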
Example #12
                if use_float16 and c_min > np.finfo(
                        np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
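
# The top of reduce_mem is truncated in this excerpt. A self-contained sketch
# (assumption; it may not match the original exactly) of a typical reduce_mem:
# downcast each numeric column to the smallest dtype that can hold its range.
import numpy as np

def reduce_mem_sketch(df, use_float16=False):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min, c_max = df[col].min(), df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df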


print('start!')
data = Cache.reload_cache('CACHE_dataall_stage2_0924.pkl')
print(data.dtypes)
data['communication_onlinerate'] = data['communication_onlinerate'].map(
    lambda x: x.replace('^', ' '))
route = Cache.reload_cache('CACHE_cmr_stage2_0924.pkl')
route_columns = [i for i in route.columns]
data = pd.concat([data, route], axis=1)  # no index column yet
data = data.reset_index(drop=True).reset_index()  # add an index column

cols = [i for i in data.columns if i not in ['id', 'index']]
data1 = data.query('pt_d<8').drop_duplicates(
    subset=cols)  # drop duplicated samples; unclear whether test_b is pt_d==8
data2 = data.query('pt_d>=8')


def get_sample(df, day, rate=5):
test_B['label'] = 2
Cache.cache_data(test_B, nm_marker='test_B_raw')

# # cmr-onehot

# In[5]:

tokenizer = Tokenizer(num_words=24, filters='^')
communication_onlinerate_dict = [
    '0^1^2^3^4^5^6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23'
]
tokenizer.fit_on_texts(communication_onlinerate_dict)

# In[6]:

data = Cache.reload_cache('CACHE_train_raw.pkl')
communication_onlinerate_raw = data['communication_onlinerate'].tolist()
communication_onlinerate_sequences = tokenizer.texts_to_sequences(
    communication_onlinerate_raw)
communication_onlinerate_sequences = pad_sequences(
    communication_onlinerate_sequences, maxlen=24, padding='post')
communication_onlinerate_onehot = []
with tqdm(total=communication_onlinerate_sequences.shape[0]) as pbar:
    for i in communication_onlinerate_sequences:
        communication_onlinerate_onehot.append(
            np.delete(np.eye(25)[i], 0, axis=1).sum(axis=0))
        pbar.update(1)
communication_onlinerate_onehot = pd.DataFrame(
    communication_onlinerate_onehot).astype(int)
communication_onlinerate_onehot = reduce_mem(communication_onlinerate_onehot,
                                             use_float16=True)
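
# A small equivalence check (assumption, not in the original): the
# np.eye / np.delete / sum construction above turns each padded index sequence
# into a 24-dim multi-hot row; the same row can be built directly with fancy
# indexing, skipping the padding index 0.
import numpy as np

seq = np.array([1, 3, 24, 0, 0])  # toy padded sequence, 0 = padding
row_eye = np.delete(np.eye(25)[seq], 0, axis=1).sum(axis=0)
row_direct = np.zeros(24)
np.add.at(row_direct, seq[seq > 0] - 1, 1)  # drop padding, shift to 0-based columns
assert np.array_equal(row_eye, row_direct)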
print("EPOCHS: ", EPOCHS)
print("NUM_WORKERS: ", NUM_WORKERS)
print("Cards to use:", os.environ["CUDA_VISIBLE_DEVICES"])
print("BATCH_SIZE: ", BATCH_SIZE)
print("EMB_keys2do: ", EMB_keys2do)
print("NUM_CLASSES: ", NUM_CLASSES)
print("USE_SEQ_LENGTH: ", USE_SEQ_LENGTH)
print("###" * 35)

##############################################################################

print("###" * 35)
print("@@@Load id_list_dict...")
print("###" * 35)
id_list_dict = Cache.reload_cache(
    file_nm=
    '/home/tione/notebook/cached_data/CACHE_id_list_dict_150_normal.pkl',
    pure_nm=False)
gc.collect()
# id_list_dict: dict of padded sequence features plus their vocabularies
# truncate:
if USE_SEQ_LENGTH < 150:
    for col in EMB_keys2do:
        id_list_dict[col + "_list"]['id_list'] = id_list_dict[
            col + "_list"]['id_list'][:, -USE_SEQ_LENGTH:]

SEQ_LENTH_DICT = {}  # truncated length of each sequence; usually identical, e.g. 150 here

for key in EMB_keys2do:
    SEQ_LENTH_DICT[key] = id_list_dict[f'{key}_list']['id_list'].shape[-1]

if len(set(SEQ_LENTH_DICT.values())) == 1:
Example #15
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


##############################################################################################################
# this part is quite inefficient... not going to change it; sep='|' should work fine
datatrain = pd.read_csv('./data/train_data.csv')
datatraintestA = Cache.reload_cache('CACHE_dataall0816.pkl')
datatest = pd.read_csv('./data/test_data_B.csv')
columns_str = datatest.columns[0]
dflisttst = []
for i in tqdm(range(datatest.shape[0])):
    dflisttst.append([
        int(j) if index != 32 else j
        for index, j in enumerate(datatest[columns_str].iloc[i].split('|'))
    ])
del datatest
gc.collect()
dflisttst = pd.DataFrame(dflisttst, columns=columns_str.split('|'))
dataall = pd.concat([datatraintestA, dflisttst], ignore_index=True)
dataall = reduce_mem(dataall, use_float16=False)
Cache.cache_data(
    dataall,
Example #16
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


train = pd.read_csv(
    r'train_data.csv', sep='|',
    dtype=str).drop(columns=['communication_onlinerate']).astype(int)
train = reduce_mem(train, use_float16=True)
test = pd.read_csv(
    r'test_data_A.csv', sep='|',
    dtype=str).drop(columns=['id', 'communication_onlinerate']).astype(int)
test.insert(0, 'label', np.ones([1000000]))
test['label'] = 2
test = reduce_mem(test, use_float16=True)
data = pd.concat([train, test], axis=0, ignore_index=True)
data = reduce_mem(data, use_float16=True)
data_uid_ptd = data[['uid', 'pt_d']]

list_data = Cache.reload_cache('CACHE_list_df_2label.pkl')
list_data.columns = ['list', 'pt_d', 'uid']
list_data = pd.merge(data_uid_ptd, list_data, how='left', on=('uid', 'pt_d'))
list_data = list_data['list'].values.tolist()

f = open(f_path + '.txt', 'w')
for i in tqdm(list_data):
    for j in i:
        f.write(str(j))
        f.write(' ')
    f.write('\n')
f.close()
                if use_float16 and c_min > np.finfo(
                        np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


# data = reduce_mem(data, use_float16=False)
data = Cache.reload_cache('CACHE_zlh_nn_feature_stage_2.pkl')
data = reduce_mem(data, use_float16=True)
# reset the index to unique values
del data['raw_index']
# del data['communication_onlinerate']
gc.collect()
data = data.reset_index(drop=True).reset_index()
# load the cross embeddings
dense_feature_size = 128
# m_user_0 = np.load('./cached_data/m0_user_stage2.npy').astype(np.float16)
m_user_1 = np.load('./cached_data/m1_user_stage2.npy').astype(np.float16)
# m_item_0 = np.load('./cached_data/m0_item_stage2.npy').astype(np.float16)
m_item_1 = np.load('./cached_data/m1_item_stage2.npy').astype(np.float16)
dataindex_base = np.load('./cached_data/dataindex_stage2.npy')
# take the sampled inputs out of the matrices
# m_user_0 = np.hstack([dataindex_base.reshape(-1,1),m_user_0])
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


# In[2]:


data = Cache.reload_cache('CACHE_data_sampling_pos1_neg5.pkl')


# ## count encode

# In[3]:


cate_cols = ['task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id',
             'tags', 'app_first_class', 'app_second_class', 'city', 'device_name', 'career', 'gender', 'age', 'net_type',
             'residence', 'emui_dev', 'indu_name',
             'communication_onlinerate_1', 'communication_onlinerate_2', 'communication_onlinerate_3',
             'communication_onlinerate_4', 'communication_onlinerate_5', 'communication_onlinerate_6',
             'communication_onlinerate_7', 'communication_onlinerate_8', 'communication_onlinerate_9',
             'communication_onlinerate_10', 'communication_onlinerate_11', 'communication_onlinerate_12',
             'communication_onlinerate_13', 'communication_onlinerate_14', 'communication_onlinerate_15',
    m1_user = []
    for i in tqdm(list(data['user_f'])):
        try:
            m1_user.append(model.wv[i])
        except:
            m1_user.append([0] * 128)
    m1_user = np.array(m1_user, dtype=np.float32)
    print(m1_item.shape)
    np.save('./cached_data/m0_item_stage2.npy', m1_item)
    print(m1_user.shape)
    np.save('./cached_data/m0_user_stage2.npy', m1_user)


if __name__ == '__main__':
    print('start!')
    data = Cache.reload_cache('CACHE_sampling_pro_feature.pkl')
    print(data.shape)
    data['label'] = data['label'].fillna(2).astype(int)  # mask
    gc.collect()
    print('w2v start!')
    # build an embedding matrix
    user_fe_list = ['age', 'city_rank', 'gender', 'slot_id',
                    'net_type']  # 'city_rank'
    item_fe_list = [
        'task_id', 'adv_id', 'creat_type_cd', 'dev_id', 'inter_type_cd',
        'indu_name', 'adv_prim_id', 'tags', 'spread_app_id', 'app_first_class',
        'his_on_shelf_time'
    ]
    print('join!')  # simplified pre-training: concatenate user and ad attributes and learn co-occurrence with a large window
    data['user_f'] = ''
    for i, vari in enumerate(user_fe_list):
Example #20
    sequences = tokenizer.texts_to_sequences(feature_seq[41907133:])
    x_test = pad_sequences(sequences, maxlen=1, padding='post')
    print(x_test.shape)
    np.save(npy_path + '_f2_test.npy', x_test)


if __name__ == '__main__':
    f1_f2_list = [['task_id', 'age'], ['task_id', 'city'],
                  ['task_id', 'city_rank'], ['task_id', 'device_name'],
                  ['task_id', 'career'], ['task_id', 'gender'],
                  ['task_id', 'residence'], ['adv_id', 'age'],
                  ['adv_id', 'city'], ['adv_id', 'city_rank'],
                  ['adv_id', 'device_name'], ['adv_id', 'career'],
                  ['adv_id', 'gender'], ['adv_id', 'residence'],
                  ['creat_type_cd', 'age'], ['creat_type_cd', 'city'],
                  ['creat_type_cd', 'city_rank'],
                  ['creat_type_cd', 'device_name'],
                  ['creat_type_cd', 'career'], ['creat_type_cd', 'gender'],
                  ['creat_type_cd', 'residence'], ['indu_name', 'age'],
                  ['indu_name', 'city'], ['indu_name', 'city_rank'],
                  ['indu_name', 'device_name'], ['indu_name', 'career'],
                  ['indu_name', 'gender'], ['indu_name', 'residence'],
                  ['adv_prim_id', 'age'], ['adv_prim_id', 'city'],
                  ['adv_prim_id', 'city_rank'], ['adv_prim_id', 'device_name'],
                  ['adv_prim_id', 'career'], ['adv_prim_id', 'gender'],
                  ['adv_prim_id', 'residence']]
    all_data = Cache.reload_cache('CACHE_data_deepfm.pkl')
    for i in tqdm(f1_f2_list):
        input_w2v(str(i[0]) + '_' + str(i[1]), all_data, str(i[1]))
Example #21
# window features + 2k

last_seq_list = [
    'creat_type_cd', 'tags', 'spread_app_id', 'task_id', 'adv_id', 'label'
]
user_fe_list = [
    'age', 'city_rank', 'career', 'gender', 'city', 'device_name', 'residence',
    'emui_dev'
]
item_fe_list = ['task_id', 'adv_id', 'adv_prim_id', 'tags', 'spread_app_id']
cross_emb_dict = {}  # concatenate in pairs + slot/net_type
for i, vari in enumerate(user_fe_list):
    for j, varj in enumerate(item_fe_list):
        if j > i:
            # concatenate the embeddings
            df1 = Cache.reload_cache(
                f'CACHE_EMB_TARGET_DICT_{vari}__{varj}_w2v.pkl')
            df2 = Cache.reload_cache(
                f'CACHE_EMB_TARGET_DICT_{varj}__{vari}_w2v.pkl')
            embvari = {}
            # all converted to int keys, then concatenated
            for key, value in df1['key'].items():
                embvari[key] = np.hstack([value, df2['value'][str(key)]])
            embvarj = {}
            # all converted to int keys, then concatenated
            for key, value in df2['key'].items():
                embvarj[key] = np.hstack([value, df1['value'][str(key)]])
            cross_emb_dict[vari + '__' + varj] = (embvari, embvarj)
print('load data finish!')
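
# A minimal sketch (assumption; the actual computation is not shown in this
# excerpt) of turning the paired embeddings in cross_emb_dict into a
# cross-similarity feature: cosine similarity between a user-side value's
# vector and an item-side value's vector. 'age__adv_id' is one of the keys
# built above; the value arguments must match the key types stored in the dicts.
import numpy as np

def cross_cosine(pair_key, user_value, item_value):
    embvari, embvarj = cross_emb_dict[pair_key]  # e.g. pair_key = 'age__adv_id'
    u, v = embvari[user_value], embvarj[item_value]
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-12))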

# ## process the columns used for the cross-similarity computation and build indices
def get_sample(df, day, neg_rate=5):
    set1 = df.query('pt_d=={}'.format(day))
    set1_pos = set1.query('label==1')
    nums_pos = set1_pos.shape[0]
    nums_neg = nums_pos * neg_rate
    set1_neg = set1.query('label==0')
    set1_neg = set1_neg.sample(nums_neg, random_state=0)
    df_sample = pd.concat([set1_pos, set1_neg])
    print(df_sample['label'].value_counts(), df_sample['label'].mean())
    return df_sample


# In[4]:

train = Cache.reload_cache('CACHE_train.pkl')
train = train.reset_index()
train.rename(columns={'index': 'raw_index'}, inplace=True)

test_B = Cache.reload_cache('CACHE_test_B.pkl').drop(columns=['id'])
test_B = test_B.reset_index()
test_B.rename(columns={'index': 'raw_index'}, inplace=True)
test_B['raw_index'] = test_B['raw_index'] + 41907133

train_ptd_1 = get_sample(train, 1)
train_ptd_2 = get_sample(train, 2)
train_ptd_3 = get_sample(train, 3)
train_ptd_4 = get_sample(train, 4)
train_ptd_5 = get_sample(train, 5)
train_ptd_6 = get_sample(train, 6)
train_ptd_7 = get_sample(train, 7)