Example #1
def get_xy_fd():

    feature_dim_dict = {
        "sparse": [
            SingleFeat('user_age', 4),
            SingleFeat('user_gender', 2),
            SingleFeat('item_id', 4),
            SingleFeat('item_gender', 4)
        ]
    }  # raw features: single-value features

    # history behavior features: multi-value features
    behavior_feature_list = ["item_id", "item_gender"]
    # single value feature input
    user_age = np.array([1, 2, 3])
    user_gender = np.array([0, 1, 0])
    item_id = np.array([0, 1, 2])
    item_gender = np.array([0, 1, 0])

    # multi-value feature input
    hist_item_id = np.array([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 0]])
    hist_item_gender = np.array([[0, 1, 0, 1], [0, 1, 1, 1], [0, 0, 1, 0]])
    # valid length of behavior sequence of every sample
    hist_length = np.array([4, 4, 3])

    feature_dict = {
        'user_age': user_age,
        'user_gender': user_gender,
        'item_id': item_id,
        'item_gender': item_gender,
        'hist_item_id': hist_item_id,
        'hist_item_gender': hist_item_gender,
    }

    x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]] + \
        [feature_dict['hist_'+feat]
            for feat in behavior_feature_list] + [hist_length]
    # Notice the concatenation order: single-value features + multi-value features + length
    # Since the historical sequences of the different behavior features in DIN share the same length (they are all expanded from item_id), only one length vector is needed.
    y = [1, 0, 1]

    return x, y, feature_dim_dict, behavior_feature_list
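
A minimal sketch of feeding get_xy_fd's output into a model, assuming the pre-0.6 deepctr DIN API this helper targets (the hist_len_max argument and the training settings below are assumptions, not part of the original snippet):

from deepctr.models import DIN

x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()
# hist_len_max must cover the padded behavior-sequence length (4 above).
model = DIN(feature_dim_dict, behavior_feature_list, hist_len_max=4)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, np.array(y), epochs=1, verbose=1)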
Example #2
def test_DCN_invalid(embedding_size=8, cross_num=0, hidden_size=()):
    feature_dim_dict = {
        'sparse': [
            SingleFeat('sparse_1', 2),
            SingleFeat('sparse_2', 5),
            SingleFeat('sparse_3', 10)
        ],
        'dense': [
            SingleFeat('dense_1', 1),
            SingleFeat('dense_2', 1),
            SingleFeat('dense_3', 1)
        ]
    }
    with pytest.raises(ValueError):
        _ = DCN(
            feature_dim_dict,
            embedding_size=embedding_size,
            cross_num=cross_num,
            hidden_size=hidden_size,
            keep_prob=0.5,
        )
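
For contrast, a configuration that should build without raising: a sketch reusing the same pre-0.6 deepctr keyword arguments as the failing call above (the feature and layer sizes are arbitrary):

feature_dim_dict = {
    'sparse': [SingleFeat('sparse_1', 2), SingleFeat('sparse_2', 5)],
    'dense': [SingleFeat('dense_1', 1)]
}
# A positive cross_num and/or a non-empty hidden_size avoids the ValueError.
model = DCN(feature_dim_dict, embedding_size=8, cross_num=2,
            hidden_size=(32, 32), keep_prob=0.5)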
Example #3
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1. Label-encode sparse features and apply a simple transformation to dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count unique values for each sparse field and record dense feature field names

    sparse_feature_list = [
        SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
        [train[feat.name].values for feat in dense_feature_list]
    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
        [test[feat.name].values for feat in dense_feature_list]

    # 4. Define the model, train, predict and evaluate
    model = DeepFM({
        "sparse": sparse_feature_list,
        "dense": dense_feature_list
    }, final_activation='sigmoid')
Example #4
def get_input(use_count=False,
              use_unique=False,
              use_video=False,
              use_audio=False,
              use_title=False,
              ONLINE_FLAG=False,
              SAMPLE_FLAG=True,
              VALIDATION_FRAC=0.2,
              target='finish'):
    train_file = 'track2/sample_train.txt' if SAMPLE_FLAG else 'track2/final_track2_train.txt'
    test_file = 'track2/sample_test_no_answer.txt' if SAMPLE_FLAG else 'track2/final_track2_test_no_anwser.txt'
    video_file = 'track2/sample_video_features.csv' if SAMPLE_FLAG else 'track2/track2_video_features_mms.csv'
    face_file = 'track2/sample_face2.csv' if SAMPLE_FLAG else 'track2/face_df2.csv'
    audio_file = 'track2/sample_audio_features.csv' if SAMPLE_FLAG else 'track2/track2_audio_features.csv'
    title_file = 'track2/sample_title.txt' if SAMPLE_FLAG else 'track2/track2_title.txt'

    data = pd.read_csv(train_file,
                       sep='\t',
                       names=[
                           'uid', 'user_city', 'item_id', 'author_id',
                           'item_city', 'channel', 'finish', 'like',
                           'music_id', 'did', 'creat_time', 'video_duration'
                       ])
    print('training set read completed.')
    if ONLINE_FLAG:
        test_data = pd.read_csv(test_file,
                                sep='\t',
                                names=[
                                    'uid', 'user_city', 'item_id', 'author_id',
                                    'item_city', 'channel', 'finish', 'like',
                                    'music_id', 'did', 'creat_time',
                                    'video_duration'
                                ])
        train_size = data.shape[0]
        data = data.append(test_data).reset_index(drop=True)
    else:
        train_size = int(data.shape[0] * (1 - VALIDATION_FRAC))
    print('test set read completed.')

    sparse_features = [
        'uid',
        'user_city',
        'item_id',
        'author_id',
        'item_city',
        'channel',
        'music_id',
        'did',
    ]
    dense_features = []

    data['video_duration'] = pd.qcut(data['video_duration'],
                                     q=10,
                                     labels=False,
                                     duplicates='drop')
    sparse_features.append('video_duration')

    data['creat_time'] = data['creat_time'] % (24 * 3600) / 3600
    data['creat_time'] = pd.qcut(data['creat_time'],
                                 q=24,
                                 labels=False,
                                 duplicates='drop')
    sparse_features.append('creat_time')

    if use_count:
        data['uid-author_id'] = data['uid'].astype(
            str) + '-' + data['author_id'].astype(str)
        data['uid-did'] = data['uid'].astype(str) + '-' + data['did'].astype(
            str)
        data['did-channel'] = data['did'].astype(
            str) + '-' + data['channel'].astype(str)

        # count (frequency) features
        cols = ['uid', 'did', 'item_id', 'author_id', 'uid-author_id']
        for c in cols:
            data[c + '_cnt'] = data[c].map(data[c].value_counts())
            data[c + '_cnt'] = pd.qcut(data[c + '_cnt'],
                                       q=10,
                                       labels=False,
                                       duplicates='drop')
            sparse_features.append(c + '_cnt')

        # mean features (expanding mean of the target)
        df = get_expanding_mean(data[:train_size], data[train_size:], [
            'uid-author_id', 'uid-did', 'did-channel', 'uid', 'did', 'item_id'
        ], 'finish')
        dense_features += list(df.columns)
        data = pd.concat([data, df], axis=1)

    if use_unique:
        data['uid_icity_nunique'] = data['uid'].map(
            data.groupby('uid')['item_city'].nunique())
        data['uid_icity_nunique'] = pd.qcut(data['uid_icity_nunique'],
                                            q=10,
                                            labels=False,
                                            duplicates='drop')
        sparse_features.append('uid_icity_nunique')

        data['uid_item_nunique'] = data['uid'].map(
            data.groupby('uid')['item_id'].nunique())
        data['uid_item_nunique'] = pd.qcut(data['uid_item_nunique'],
                                           q=10,
                                           labels=False,
                                           duplicates='drop')
        sparse_features.append('uid_item_nunique')

        data['uid_author_nunique'] = data['uid'].map(
            data.groupby('uid')['author_id'].nunique())
        data['uid_author_nunique'] = pd.qcut(data['uid_author_nunique'],
                                             q=10,
                                             labels=False,
                                             duplicates='drop')
        sparse_features.append('uid_author_nunique')

        data['uid_music_nunique'] = data['uid'].map(
            data.groupby('uid')['music_id'].nunique())
        data['uid_music_nunique'] = pd.qcut(data['uid_music_nunique'],
                                            q=10,
                                            labels=False,
                                            duplicates='drop')
        sparse_features.append('uid_music_nunique')

        data['item_ucity_nunique'] = data['item_id'].map(
            data.groupby('item_id')['user_city'].nunique())
        data['item_ucity_nunique'] = pd.qcut(data['item_ucity_nunique'],
                                             q=10,
                                             labels=False,
                                             duplicates='drop')
        sparse_features.append('item_ucity_nunique')

        data['item_uid_nunique'] = data['item_id'].map(
            data.groupby('item_id')['uid'].nunique())
        data['item_uid_nunique'] = pd.qcut(data['item_uid_nunique'],
                                           q=30,
                                           labels=False,
                                           duplicates='drop')
        sparse_features.append('item_uid_nunique')

        data['author_uid_nunique'] = data['author_id'].map(
            data.groupby('author_id')['uid'].nunique())
        data['author_uid_nunique'] = pd.qcut(data['author_uid_nunique'],
                                             q=20,
                                             labels=False,
                                             duplicates='drop')
        sparse_features.append('author_uid_nunique')

    print('generate stats feats completed.')

    if use_video:
        video_feats = pd.read_csv(video_file)
        print('video feats read completed.')

        data = pd.merge(data, video_feats, how='left', on='item_id')
        for i in range(128):
            col = 'vd' + str(i)
            data[col].fillna(0, inplace=True)
        print('merge video feats completed.')

    if use_audio:
        audio_feats = pd.read_csv(audio_file)
        print('audio feats read completed.')

        data = pd.merge(data, audio_feats, how='left', on='item_id')
        for i in range(128):
            col = 'ad' + str(i)
            data[col].fillna(0, inplace=True)
        print('merge audio feats completed.')

    if use_title:
        max_len = 47
        title_feats = pd.read_json(title_file, lines=True)
        print('title feats read completed')

        def get_title_len(d):
            return sum(d.values())

        title_feats['title_len'] = title_feats['title_features'].apply(
            get_title_len)
        prior = title_feats['title_len'].mean()

        dense_features.append('title_len')
        title_feats['title_features'] = title_feats['title_features'].apply(
            lambda x: list(x.keys()))

        data = pd.merge(data, title_feats, how='left', on='item_id')
        for row in data.loc[data.title_features.isna(),
                            'title_features'].index:
            data.at[row, 'title_features'] = []
        data['title_len'].fillna(prior, inplace=True)
        print('merge title feats completed')

    data[sparse_features] = data[sparse_features].fillna('-1', )

    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    if len(dense_features) > 0:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = mms.fit_transform(data[dense_features])

    sparse_feature_list = [
        SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]
    sequence_feature_list = []

    if use_title:
        sequence_feature_list.append(
            VarLenFeat('title', 134545, max_len, 'sum'))

    print('data preprocess completed.')

    train = data.iloc[:train_size]
    test = data.iloc[train_size:]

    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
        [train[feat.name].values for feat in dense_feature_list]

    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
        [test[feat.name].values for feat in dense_feature_list]

    if use_title:
        train_model_input += [
            pad_sequences(train['title_features'],
                          maxlen=max_len,
                          padding='post')
        ]
        test_model_input += [
            pad_sequences(test['title_features'],
                          maxlen=max_len,
                          padding='post')
        ]

    if use_video:
        vd_cols = ['vd' + str(i) for i in range(128)]
        video_input = data[vd_cols].values
        train_model_input += [video_input[:train_size]]
        test_model_input += [video_input[train_size:]]

    if use_audio:
        ad_cols = ['ad' + str(i) for i in range(128)]
        audio_input = data[ad_cols].values
        train_model_input += [audio_input[:train_size]]
        test_model_input += [audio_input[train_size:]]

    print('input process completed.')
    print(f'use sparse feats: [{",".join(sparse_features)}]')
    print(f'use dense feats: [{",".join(dense_features)}]')

    train_labels, test_labels = train[target].values, test[target].values
    feature_dim_dict = {
        "sparse": sparse_feature_list,
        "dense": dense_feature_list,
        "sequence": sequence_feature_list
    }

    if ONLINE_FLAG:
        return feature_dim_dict, train_model_input, train_labels, test_model_input, test_labels, test_data
    return feature_dim_dict, train_model_input, train_labels, test_model_input, test_labels
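
A sketch of how get_input's return values might be wired into a model, following the pattern of the other examples on this page (the model choice and the training settings are assumptions):

feature_dim_dict, train_input, train_labels, test_input, test_labels = get_input(
    use_count=True, use_title=True, target='finish')
model = DeepFM(feature_dim_dict, final_activation='sigmoid')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])
model.fit(train_input, train_labels, batch_size=4096, epochs=1, verbose=1,
          validation_data=(test_input, test_labels))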
Example #5
def model_pool(defaultfilename='./input/final_track1_train.txt', defaulttestfile='./input/final_track1_test_no_anwser.txt',
                defaultcolumnname=['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'finish', 'like', 'music_id', 'did', 'creat_time', 'video_duration'],
                defaulttarget=['finish', 'like'], defaultmodel="AFM", PERCENT=100):
        
    sparse_features=[]
    dense_features=[]
    target=defaulttarget    
     
    #1 train file
    data = pd.read_csv(defaultfilename, sep='\t', names=defaultcolumnname, iterator=True)
    #1 train file concats
    take=[]
    loop = True
    while loop:
        try:
            chunk=data.get_chunk(10000000)
            chunk=chunk.sample(frac=PERCENT/100., replace=True, random_state=1)
            take.append(chunk)
            gc.collect()
        except StopIteration:
            loop=False
            print('stop iteration')
            
    data = pd.concat(take, ignore_index=True, copy=False) 
    train_size = data.shape[0]
    print(train_size)
    take.clear()
    del [chunk,take]
        
    for column in data.columns:
        if column in defaulttarget:
            continue
        if data[column].dtype in  [numpy.float_ , numpy.float64]:
            dense_features.append(column)
        if data[column].dtype in [numpy.int_, numpy.int64]:
            sparse_features.append(column)
            
#     sparse_features=list(set(sparse_features))
#     dense_features=list(set(dense_features))
    #***************normal
    #3. Remove na values
    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)
    
    #4. Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    #5. Dense normalize
    if dense_features:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = mms.fit_transform(data[dense_features])
    #*****************normal
    #6. generate input data for model
    sparse_feature_list = [SingleFeat(feat, data[feat].nunique())
                           for feat in sparse_features]
    dense_feature_list = [SingleFeat(feat, 0)
                          for feat in dense_features]
    
    #****************model
    # 6.choose a model
    import pkgutil
    import mdeepctr.models
#     modelnames = [name for _, name, _ in pkgutil.iter_modules(mdeepctr.__path__)]
#     modelname = input("choose a model: "+",".join(modelnames)+"\n")
#     if not modelname:
    modelname=defaultmodel
    # 7.build a model
    model = getattr(mdeepctr.models, modelname)({"sparse": sparse_feature_list,
                    "dense": dense_feature_list}, final_activation='sigmoid', output_dim=len(defaulttarget))
    # 8. eval predict
    def auc(y_true, y_pred):
        return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)
    
    model.compile("adam", loss="binary_crossentropy", loss_weights=loss_weights, metrics=[auc])


    train_model_input = [data[feat.name].values for feat in sparse_feature_list] + \
                        [data[feat.name].values for feat in dense_feature_list]
    train_labels = [data[target].values for target in defaulttarget]

    my_callbacks = [EarlyStopping(monitor='loss', min_delta=1e-2, patience=1, verbose=1, mode='min')]

    history = model.fit(train_model_input, train_labels,
                batch_size=2**14, epochs=3, verbose=1, callbacks=my_callbacks)

    del [train_model_input, train_labels, data]
#     import objgraph
#     objgraph.show_refs([data], filename='data-graph.png')
    
    #2 test file       
    test_data = pd.read_csv(defaulttestfile, sep='\t', names=defaultcolumnname, )
    raw_test_data=test_data.copy()
    #data = data.append(test_data)
    test_size=test_data.shape[0]
    print(test_size)
    #***************normal
    #3. Remove na values
    test_data[sparse_features] = test_data[sparse_features].fillna('-1')
    test_data[dense_features] = test_data[dense_features].fillna(0)
    #4. Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        test_data[feat] = lbe.fit_transform(test_data[feat])
    #5. Dense normalize
    if dense_features:
        mms = MinMaxScaler(feature_range=(0, 1))
        test_data[dense_features] = mms.fit_transform(test_data[dense_features])
    #*****************normal
    #test = test_data
    test_model_input = [test_data[feat.name].values for feat in sparse_feature_list] + \
        [test_data[feat.name].values for feat in dense_feature_list]
       
    pred_ans = model.predict(test_model_input, batch_size=2**14)
        
    result = raw_test_data[['uid', 'item_id', 'finish', 'like']].copy()
    result.rename(columns={'finish': 'finish_probability',
                           'like': 'like_probability'}, inplace=True)
    result['finish_probability'] = pred_ans[0]
    result['like_probability'] = pred_ans[1]
    output = "%s-result.csv" % (modelname)
    result[['uid', 'item_id', 'finish_probability', 'like_probability']].to_csv(
        output, index=None, float_format='%.6f')
    
    return history
Example #6
    sparse_features = ['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel',
                       'music_id', 'did', ]
    dense_features = ['video_duration']  # 'creat_time',

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0,)

    target = ['finish', 'like']

    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    sparse_feature_list = [SingleFeat(feat, data[feat].nunique())
                           for feat in sparse_features]
    dense_feature_list = [SingleFeat(feat, 0)
                          for feat in dense_features]

    train = data.iloc[:train_size]
    test = data.iloc[train_size:]

    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
        [train[feat.name].values for feat in dense_feature_list]
    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
        [test[feat.name].values for feat in dense_feature_list]

    train_labels = [train[target[0]].values, train[target[1]].values]
    test_labels = [test[target[0]].values, test[target[1]].values]
Example #7
key2index = {}
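# The snippet relies on a `split` helper that is not shown. A plausible definition
# (an assumption, modelled on a '|'-separated multi-value string such as the
# MovieLens genres field) that fills key2index as a side effect:
def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # index 0 is reserved for padding, so ids start at 1
            key2index[key] = len(key2index) + 1
    return list(map(lambda k: key2index[k], key_ans))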
genres_list = list(map(split, data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice: padding='post'
genres_list = pad_sequences(
    genres_list,
    maxlen=max_len,
    padding='post',
)

# 2.count #unique features for each sparse field and generate feature config for sequence feature

sparse_feat_list = [
    SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
]
sequence_feature = [VarLenFeat(
    'genres',
    len(key2index) + 1, max_len,
    'mean')]  # Notice: value 0 is reserved for padding of the sequence input feature

# 3.generate input data for model
sparse_input = [data[feat.name].values for feat in sparse_feat_list]
dense_input = []
sequence_input = [genres_list]
model_input = sparse_input + dense_input + \
    sequence_input  # make sure the order is right

# 4. Define the model, compile and train
model = DeepFM({
    "sparse": sparse_feat_list,
    "sequence": sequence_feature
})
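# The feature dict passed to DeepFM above follows the lists built in step 2; the
# training step below is an illustrative assumption (`target` is assumed to name
# the label column).
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])
history = model.fit(model_input, data[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2)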
Example #8
def model_pool(defaultfilename='./input/final_track2_train.txt',
               defaulttestfile='./input/final_track2_test_no_anwser.txt',
               defaultcolumnname=[
                   'uid', 'user_city', 'item_id', 'author_id', 'item_city',
                   'channel', 'finish', 'like', 'music_id', 'did',
                   'creat_time', 'video_duration'
               ],
               defaulttarget=['finish', 'like'],
               defaultmodel="AFM",
               PERCENT=1):
    data = pd.read_csv(defaultfilename,
                       sep='\t',
                       names=defaultcolumnname,
                       iterator=True)
    #1 train file
    take = []
    loop = True
    while loop:
        try:
            chunk = data.get_chunk(10000)
            chunk = chunk.take(list(range(min(chunk.shape[0], PERCENT * 100))),
                               axis=0)
            take.append(chunk)
        except StopIteration:
            loop = False
            print('stop iteration')

    data = pd.concat(take, ignore_index=True)
    train_size = data.shape[0]
    print(train_size)

    #2 test file
    test_data = pd.read_csv(
        defaulttestfile,
        sep='\t',
        names=defaultcolumnname,
    )
    data = data.append(test_data)
    test_size = test_data.shape[0]
    print(test_size)

    sparse_features = []
    dense_features = []
    target = defaulttarget
    for column in data.columns:
        if column in defaulttarget:
            continue
        if data[column].dtype in [numpy.float_, numpy.float64]:
            dense_features.append(column)
        if data[column].dtype in [numpy.int_, numpy.int64]:
            sparse_features.append(column)

    #3. Remove na values
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    #4. Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    #5. Dense normalize
    if dense_features:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = mms.fit_transform(data[dense_features])
    #6. generate input data for model
    sparse_feature_list = [
        SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]
    #7. split back into train and test
    train = data.iloc[:train_size]
    test = data.iloc[train_size:]

    #8.generate data
    print(train.columns)
    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
        [train[feat.name].values for feat in dense_feature_list]
    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
        [test[feat.name].values for feat in dense_feature_list]

    train_labels = [train[target].values for target in defaulttarget]
    test_labels = test[target]

    # 6.choose a model
    import pkgutil
    import mdeepctr.models
    #     modelnames = [name for _, name, _ in pkgutil.iter_modules(mdeepctr.__path__)]
    #     modelname = input("choose a model: "+",".join(modelnames)+"\n")
    #     if not modelname:
    modelname = defaultmodel
    # 7.build a model
    model = getattr(mdeepctr.models, modelname)({
        "sparse": sparse_feature_list,
        "dense": dense_feature_list
    },
                                                final_activation='sigmoid',
                                                output_dim=len(defaulttarget))

    # 8. eval predict
    def auc(y_true, y_pred):
        return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)


#     model.compile("adagrad", loss="binary_crossentropy", metrics=[auc])

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=[auc])
    my_callbacks = [
        EarlyStopping(monitor='loss',
                      min_delta=1e-2,
                      patience=3,
                      verbose=1,
                      mode='min')
    ]

    history = model.fit(train_model_input,
                        train_labels,
                        batch_size=4096,
                        epochs=100,
                        verbose=1,
                        callbacks=my_callbacks)
    pred_ans = model.predict(test_model_input, batch_size=2**14)

    #     nsamples, nx, ny = numpy.asarray(pred_ans).shape
    #     pred_ans = numpy.asarray(pred_ans).reshape((nx*ny, nsamples))
    #     print(test_labels.shape)
    #     print(pred_ans.shape)
    #
    #     logloss = round(log_loss(test_labels, pred_ans), 4)
    #     try:
    #         roc_auc = round(roc_auc_score(test_labels, pred_ans), 4)
    #     except:
    #         roc_auc=0

    result = test_data[['uid', 'item_id', 'finish', 'like']].copy()
    result.rename(columns={
        'finish': 'finish_probability',
        'like': 'like_probability'
    },
                  inplace=True)
    result['finish_probability'] = pred_ans[0]
    result['like_probability'] = pred_ans[1]
    output = "%s-result.csv" % (modelname)
    result[['uid', 'item_id', 'finish_probability',
            'like_probability']].to_csv(output,
                                        index=None,
                                        float_format='%.6f')

    return history
Example #9
def deepctr_cv(X_train,
               y_train,
               folds,
               logger,
               cv_path,
               X_test=None,
               optional_data=None,
               prep=True,
               split_conf=None):

    scores = []
    preds = []

    meta = np.zeros_like(y_train).astype("float64")
    if split_conf is None:
        X_tr, X_te, main_conf, _ = prep_for_embedding(X_train,
                                                      X_test,
                                                      conf,
                                                      prep=prep)
        X_train, X_test = X_tr, X_te
    else:
        main_conf = split_conf

    cat_cols = [c for c, _, _ in main_conf[0]]
    cat_fs = [SingleFeat(c, d) for c, d, _ in main_conf[0]]
    num_fs = [SingleFeat(c, 0) for c in conf.num_cols]

    X_test = split_df(X_test, cat_cols, conf.num_cols)

    for num_fold, (tr_ind, tes_ind) in enumerate(folds):
        if num_fold > 0:
            break
        logger.info(f"fold_{num_fold}")

        fold_path = cv_path / f"fold{num_fold}"
        seed_path = fold_path
        Path(fold_path).mkdir(exist_ok=True, parents=True)

        callbacks = [CSVLogger(str(fold_path / 'epochs.csv'))]

        X_cv_train, X_cv_test = X_train.iloc[tr_ind], X_train.iloc[tes_ind]
        y_cv_train, y_cv_test = y_train.iloc[tr_ind], y_train.iloc[tes_ind]
        X_cv_train = split_df(X_cv_train, cat_cols, conf.num_cols)
        X_cv_test = split_df(X_cv_test, cat_cols, conf.num_cols)

        model = DeepFM({
            'sparse': cat_fs,
            'dense': num_fs
        },
                       final_activation='sigmoid')
        model.compile("adam", "binary_crossentropy", metrics=['accuracy'])
        model.fit(X_cv_train,
                  y_cv_train,
                  callbacks=callbacks,
                  batch_size=2048,
                  epochs=10,
                  verbose=1,
                  validation_data=(X_cv_test, y_cv_test))
        model.save_weights(str(seed_path / 'weights.h5'), save_format='hdf5')
        gc.collect()

        if X_test is not None:
            pred = model.predict(X_test, batch_size=2048)
            pred = pred[:, 0]
            np.save(seed_path / f"pred.npy", pred)

        train_oof = model.predict(X_cv_test, batch_size=2048)
        train_oof = train_oof[:, 0]
        auc = roc_auc_score(y_cv_test.values, train_oof)
        logger.info(f"{num_fold}: auc {auc}")
        np.save(seed_path / f"train_oof.npy", train_oof)

        # auc = roc_auc_score(y_cv_test, train_oof)
        # logger.info(f"seed_average: auc {auc}")
        scores.append(auc)
        np.save(fold_path / f"tes_ind.npy", tes_ind)
        meta[tes_ind] += train_oof
        del X_cv_train, y_cv_train, X_cv_test, y_cv_test

        if X_test is not None:
            preds.append(pred)

    scores = np.array(scores)
    preds = np.array(preds)
    pred = rank_average(preds)
    logger.info(f"{scores.mean()}, {scores.std()}")
    return scores, pred, meta
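
A hypothetical invocation (the data objects and the output path below are assumptions): build the fold indices with scikit-learn and collect the scores and out-of-fold predictions.

from pathlib import Path
from sklearn.model_selection import StratifiedKFold

folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
             .split(X_train, y_train))
scores, pred, meta = deepctr_cv(X_train, y_train, folds, logger,
                                Path("cv/deepfm"), X_test=X_test)

Note that the fold loop above breaks after the first iteration, so only fold 0 is actually trained.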