Ejemplo n.º 1
0
def SeqEncoder(df, pri_id, col, timesteps=5, encode_dim=5):
    """Encode each entity's ordered sequence of ``col`` values as a dense vector.

    Steps:
      1. Sort the rows by ``day`` (descending) then ``time`` (ascending) and
         collect the ``col`` values of every ``pri_id`` into a list.
      2. Pad / truncate every list to a common length: 3/5 of the longest
         sequence, rounded up to a multiple of ``timesteps``.
      3. Label-encode the values (the padding token gets its own code).
      4. Feed the resulting integer matrix to ``_F.AutoEncoderLSTM`` and
         store the first ``encode_dim`` output columns.

    Relies on the module-level names ``np``, ``pd``, ``_F`` and ``data_path``.

    Args:
        df: DataFrame containing at least the columns ``pri_id``, ``col``,
            ``'day'`` and ``'time'``. It is not modified.
        pri_id: Name of the grouping (entity id) column.
        col: Name of the column whose per-entity sequence is encoded.
        timesteps: Window length passed to the LSTM auto-encoder; the padded
            sequence length is made a multiple of it. Expected to be a
            positive integer.
        encode_dim: Number of encoded features to produce per entity.

    Returns:
        DataFrame with the ``pri_id`` column plus ``encode_dim`` columns named
        ``"<col>_encode_<i>"``. The same frame is also written to
        ``data_path + "data/_F_seq_encode_<col>.csv"``.
    """
    from sklearn.preprocessing import LabelEncoder

    seq_col = '%s_seq' % col
    len_col = '%s_seq_len' % col
    pad_token = 'Nan'

    # Non-inplace sort returns a fresh frame, so the caller's ``df`` is never
    # touched and pandas has no reason to emit SettingWithCopyWarning.
    # Order: day descending, time ascending.
    mode = df[[pri_id, col, 'day', 'time']].sort_values(
        by=['day', 'time'], ascending=[False, True])

    # One row per entity holding its ordered list of values.
    temp = (mode.groupby(pri_id)[col].apply(list).reset_index()
            .rename(columns={col: seq_col}))
    temp[len_col] = temp[seq_col].apply(len)

    # Target length: 3/5 of the longest sequence, rounded up to a multiple of
    # ``timesteps``.
    ob_len = int(temp[len_col].max() * 3.0 / 5.0)
    remainder = ob_len % timesteps
    if remainder:
        ob_len += timesteps - remainder
    # When the longest sequence is very short the 3/5 rule yields 0, which
    # would truncate every sequence to nothing; keep at least one window.
    ob_len = max(ob_len, timesteps)

    def _pad_or_truncate(seq):
        # Truncate to ob_len, then right-pad with the padding token
        # (a non-positive repeat count produces an empty list).
        return seq[:ob_len] + [pad_token] * (ob_len - len(seq))

    # Label-encode the observed values; ``classes_`` is ordered so that its
    # index is exactly the code ``fit_transform`` would assign.
    le = LabelEncoder()
    le.fit(mode[col])
    seq_dic = {value: code for code, value in enumerate(le.classes_)}
    # The padding token gets the next free code (largest real code + 1).
    seq_dic[pad_token] = len(le.classes_)

    # Integer matrix: one row per entity, ``ob_len`` columns.
    matrix = np.asarray([
        [seq_dic[item] for item in _pad_or_truncate(seq)]
        for seq in temp[seq_col]
    ])

    # Fully-connected alternative kept for reference:
    # res = _F.AutoEncoder(matrix, encode_dim)

    # LSTM auto-encoder. The encoding width now follows ``encode_dim`` rather
    # than a hard-coded 5, so it matches the columns read below.
    # NOTE(review): assumes the second positional argument of
    # ``AutoEncoderLSTM`` is the encoding dimension -- confirm against _F.
    res = _F.AutoEncoderLSTM(matrix, encode_dim, timesteps)

    # Assemble and persist the result.
    results = pd.DataFrame()
    results[pri_id] = temp[pri_id]
    for i in range(encode_dim):
        results[col + "_encode_" + str(i)] = res[:, i]
    results.to_csv(data_path + "data/_F_seq_encode_%s.csv" % col, index=False)
    return results