Exemple #1
0
def test_only_with_number_features():
    """Smoke-test WideDeep when only numeric features are declared.

    The category and sequence feature lists are empty, both numeric columns
    are listed as wide features, and the two remaining positional name lists
    are passed empty. There are no assertions: the test succeeds when one
    forward call on the first batch raises nothing.
    """
    numeric_columns = [
        Number(column, StandardScaler()) for column in ('userAge', 'rating')
    ]

    features = Features(number_features=numeric_columns,
                        category_features=[],
                        sequence_features=[])

    wide_names = ['rating', 'userAge']

    # Only the first element returned by the helper is needed here.
    loader, _ = prepare_dataloader(features)

    hyper_params = dict(num_classes=2,
                        embedding_size=4,
                        hidden_layers=(8, 4),
                        final_activation='sigmoid',
                        dropout=0.3)
    model = WideDeep(features, wide_names, [], [], **hyper_params)

    first_batch = next(iter(loader))
    model(first_batch)
def test_normal():
    """Smoke-test DeepFM with numeric, category and sequence features.

    Builds two scaled numeric columns, three category columns and four
    '|'-separated sequence columns (the two click-history columns are capped
    at ``max_len=5``), then runs a single forward call on the first batch.
    Nothing is asserted about the output; the test passes if no exception
    is raised.
    """
    numeric_columns = [
        Number(column, StandardScaler()) for column in ('userAge', 'rating')
    ]

    category_columns = [
        Category(column, CategoryEncoder(min_cnt=1))
        for column in ('userId', 'movieId', 'topGenre')
    ]

    # Free-length sequences first, then the length-capped click histories.
    sequence_columns = [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1))
        for column in ('title', 'genres')
    ] + [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for column in ('clickedMovieIds', 'clickedMovieTopGenres')
    ]

    features = Features(number_features=numeric_columns,
                        category_features=category_columns,
                        sequence_features=sequence_columns)

    loader, _ = prepare_dataloader(features)

    hyper_params = dict(num_classes=2,
                        embedding_size=4,
                        hidden_layers=(8, 4),
                        final_activation='sigmoid',
                        dropout=0.3)
    deep_fm = DeepFM(features, **hyper_params)

    first_batch = next(iter(loader))
    deep_fm(first_batch)
Exemple #3
0
def test_without_category_feature():
    """Smoke-test WideDeep when only sequence features are declared.

    Numeric and category feature lists are empty. 'title' and 'genres' are
    named as wide features, the two click-history sequences as deep
    features, and the last positional name list is empty. The test passes
    if one forward call on the first batch raises nothing.
    """
    sequence_columns = [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1))
        for column in ('title', 'genres')
    ] + [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for column in ('clickedMovieIds', 'clickedMovieTopGenres')
    ]

    features = Features(number_features=[],
                        category_features=[],
                        sequence_features=sequence_columns)

    wide_names = ['title', 'genres']
    deep_names = ['clickedMovieIds', 'clickedMovieTopGenres']

    loader, _ = prepare_dataloader(features)

    hyper_params = dict(num_classes=2,
                        embedding_size=4,
                        hidden_layers=(8, 4),
                        final_activation='sigmoid',
                        dropout=0.3)
    model = WideDeep(features, wide_names, deep_names, [], **hyper_params)

    first_batch = next(iter(loader))
    model(first_batch)
def test_without_number_feature():
    """Smoke-test DNN when no numeric features are declared.

    Uses three category columns and four '|'-separated sequence columns
    with an empty numeric feature list, then performs one forward call on
    the first batch. Nothing is asserted; the test passes if no exception
    is raised.
    """
    category_columns = [
        Category(column, CategoryEncoder(min_cnt=1))
        for column in ('userId', 'movieId', 'topGenre')
    ]

    sequence_columns = [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1))
        for column in ('title', 'genres')
    ] + [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for column in ('clickedMovieIds', 'clickedMovieTopGenres')
    ]

    features = Features(number_features=[],
                        category_features=category_columns,
                        sequence_features=sequence_columns)

    # The helper's return value is iterated directly (no tuple unpacking).
    # NOTE(review): confirm this matches the helper defined for this module.
    loader = prepare_dataloader(features)

    hyper_params = dict(num_classes=2,
                        embedding_size=4,
                        hidden_layers=(8, 4),
                        final_activation='sigmoid',
                        dropout=0.3)
    model = DNN(features, **hyper_params)

    first_batch = next(iter(loader))
    model(first_batch)
def test_column_flow_define():
    """Check that Features accepts ``None`` and ``ColumnFlow`` transformers.

    'age' is declared with no transformer, while 'height', 'gender' and
    'likes' each wrap a single transformer in a ``ColumnFlow``. After
    fitting and transforming the module-level test data, the test verifies
    the number of entries in the result, the feature-name accessors, and
    the transformed values of 'age', 'gender' and 'height'.
    """
    numeric_columns = [
        Number('age', None),
        Number('height', ColumnFlow([StandardScaler()])),
    ]
    category_columns = [
        Category('gender', ColumnFlow([CategoryEncoder(min_cnt=1)])),
    ]
    sequence_columns = [
        Sequence('likes', ColumnFlow([SequenceEncoder(sep=' ', min_cnt=1)])),
    ]

    features = Features(numeric_columns, category_columns, sequence_columns)
    features.fit(__TEST_DATA)
    transformed = features.transform(__TEST_DATA)

    want_age = np.array([23, 43, 35, 41, 16, 32, 26, 76])
    want_gender = np.array([2, 2, 1, 2, 1, 1, 2, 2])
    want_height = np.array([
        0.1159659, 0.85814767, -0.99730676, -0.06957954, -1.73948853,
        -0.34789771, 0.48705679, 1.69310217
    ])

    # Structural checks first, then per-column values.
    assert len(transformed) == 4
    assert features.number_feature_names() == ['age', 'height']
    assert features.category_feature_names() == ['gender']
    assert features.sequence_feature_names() == ['likes']

    np.testing.assert_array_equal(transformed['age'], want_age)
    np.testing.assert_array_equal(transformed['gender'], want_gender)
    # 'height' is scaled to floats, so it is compared approximately.
    np.testing.assert_array_almost_equal(transformed['height'], want_height)
Exemple #6
0
def create_dataloader_fn(
        number_features, category_features, sequence_features, batch_size,
        train_df, label_col='label', test_df=None, num_workers=0):
    """Fit ``Features`` on the training frame and build data loaders.

    Args:
        number_features: forwarded to ``Features`` as ``number_features``.
        category_features: forwarded to ``Features`` as ``category_features``.
        sequence_features: forwarded to ``Features`` as ``sequence_features``.
        batch_size: batch size used for both loaders.
        train_df: frame used to fit the features and to build the training
            loader; it is indexed with ``label_col``.
        label_col: name of the label column.
        test_df: optional frame for a second, unshuffled loader. Labels are
            attached only when ``label_col`` is among its columns.
        num_workers: forwarded to both ``DataLoader`` instances.

    Returns:
        A ``(features, train_loader, test_loader)`` tuple; ``test_loader``
        is ``None`` when ``test_df`` is ``None``.
    """
    # The object returned by ``fit`` is the one used from here on.
    features = Features(
        number_features=number_features,
        category_features=category_features,
        sequence_features=sequence_features).fit(train_df)

    train_inputs = features.transform(train_df)
    train_labels = train_df[label_col].values
    train_loader = data.DataLoader(
        Dataset(features, train_inputs, train_labels),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers)

    if test_df is None:
        return features, train_loader, None

    test_inputs = features.transform(test_df)
    # An unlabeled test frame is allowed; the dataset then receives None.
    has_labels = label_col in set(test_df.columns)
    test_labels = test_df[label_col].values if has_labels else None
    test_loader = data.DataLoader(
        Dataset(features, test_inputs, test_labels),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers)

    return features, train_loader, test_loader
Exemple #7
0
def test_normal():
    """Smoke-test DIN with one attention group over two ad/history pairs.

    The attention group pairs 'movieId' with 'clickedMovieIds' and
    'topGenre' with 'clickedMovieTopGenres'. The model is built on numeric,
    category and sequence features and called once on the first batch; the
    test passes if no exception is raised.
    """
    numeric_columns = [
        Number(column, StandardScaler()) for column in ('userAge', 'rating')
    ]

    category_columns = [
        Category(column, CategoryEncoder(min_cnt=1))
        for column in ('userId', 'movieId', 'topGenre')
    ]

    sequence_columns = [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1))
        for column in ('title', 'genres')
    ] + [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for column in ('clickedMovieIds', 'clickedMovieTopGenres')
    ]

    # Each pair links a candidate ("ad") column to its positive history.
    ad_history_pairs = [
        {'ad': ad_column, 'pos_hist': history_column}
        for ad_column, history_column in (
            ('movieId', 'clickedMovieIds'),
            ('topGenre', 'clickedMovieTopGenres'),
        )
    ]
    attention_groups = [
        AttentionGroup(name='group1',
                       pairs=ad_history_pairs,
                       hidden_layers=[8, 4])
    ]

    features = Features(number_features=numeric_columns,
                        category_features=category_columns,
                        sequence_features=sequence_columns)

    # The helper's return value is iterated directly (no tuple unpacking).
    loader = prepare_dataloader(features)

    hyper_params = dict(num_classes=2,
                        embedding_size=4,
                        hidden_layers=(16, 8),
                        final_activation='sigmoid',
                        dropout=0.3)
    model = DIN(features, attention_groups=attention_groups, **hyper_params)

    first_batch = next(iter(loader))
    model(first_batch)
def create_test_data():
    """Build the loader, features and attention groups shared by tests.

    The feature set has two scaled numeric columns, three category columns
    and six '|'-separated sequence columns; the four click / no-click
    history columns are capped at ``max_len=5``. The single attention group
    pairs each ad column with a positive and a negative history column.

    Returns:
        A ``(dataloader, features, attention_groups)`` tuple.
    """
    numeric_columns = [
        Number(column, StandardScaler()) for column in ('userAge', 'rating')
    ]

    category_columns = [
        Category(column, CategoryEncoder(min_cnt=1))
        for column in ('userId', 'movieId', 'topGenre')
    ]

    capped_history_names = (
        'clickedMovieIds',
        'clickedMovieTopGenres',
        'noClickedMovieIds',
        'noClickedMovieTopGenres',
    )
    sequence_columns = [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1))
        for column in ('title', 'genres')
    ] + [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for column in capped_history_names
    ]

    # (ad column, positive history, negative history)
    triples = (
        ('movieId', 'clickedMovieIds', 'noClickedMovieIds'),
        ('topGenre', 'clickedMovieTopGenres', 'noClickedMovieTopGenres'),
    )
    attention_groups = [
        AttentionGroup(
            name='group1',
            pairs=[
                {'ad': ad_column, 'pos_hist': clicked, 'neg_hist': skipped}
                for ad_column, clicked, skipped in triples
            ],
            hidden_layers=[8, 4])
    ]

    features = Features(number_features=numeric_columns,
                        category_features=category_columns,
                        sequence_features=sequence_columns)

    # The helper's return value is handed back unchanged.
    return prepare_dataloader(features), features, attention_groups
Exemple #9
0
def test_normal():
    """Check the first item produced by ``Dataset`` for the sample frame.

    Features are fitted on and applied to the module-level sample frame,
    wrapped in a ``Dataset`` together with the frame's ``label`` values, and
    the first item's 'userId', 'movieId', 'genres', '__genres_length' and
    'label' entries are compared with fixed expected values.
    """
    numeric_columns = [
        Number(column, StandardScaler()) for column in ('userAge', 'rating')
    ]

    category_columns = [
        Category(column, CategoryEncoder(min_cnt=1))
        for column in ('userId', 'movieId', 'topGenre')
    ]

    sequence_columns = [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1))
        for column in ('title', 'genres')
    ] + [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for column in ('clickedMovieIds', 'clickedMovieTopGenres')
    ]

    features = Features(number_features=numeric_columns,
                        category_features=category_columns,
                        sequence_features=sequence_columns)
    features.fit(__SAMPLE_DF)

    encoded = features.transform(__SAMPLE_DF)
    dataset = Dataset(features, encoded, __SAMPLE_DF.label.values)

    assert dataset[0]['userId'] == 1
    assert dataset[0]['movieId'] == 1
    # The expected 'genres' entry is two ids followed by two zeros, and the
    # companion '__genres_length' entry is expected to be 2.
    assert dataset[0]['genres'].tolist() == [8, 9, 0, 0]
    assert dataset[0]['__genres_length'] == 2
    assert dataset[0]['label'] == 1
def create_test_data_with_sharing_emb():
    """Build test fixtures whose movie-id columns use one fixed vocabulary.

    A hand-written word/index mapping is given to the 'movieId' category
    encoder and to the 'clickedMovieIds' / 'noClickedMovieIds' sequence
    encoders, and an ``EmbeddingRef`` maps both history columns to
    'movieId'.

    Returns:
        A ``(dataloader, features, attention_groups, embedding_ref)`` tuple.
    """
    numeric_columns = [
        Number(column, StandardScaler()) for column in ('userAge', 'rating')
    ]

    # Explicit vocabulary: a token's position in this tuple is its index.
    movie_tokens = (
        '__PAD__', '4226', '5971', '6291', '7153', '30707', '3242', '42',
        '32', '34', '233', '291', '324', '325', '3542', '322', '33', '45',
        '__UNKNOWN__',
    )
    movie_idx2word = dict(enumerate(movie_tokens))
    movie_word2idx = {word: index for index, word in movie_idx2word.items()}

    category_columns = [
        Category(
            'movieId',
            CategoryEncoder(word2idx=movie_word2idx, idx2word=movie_idx2word)),
        Category('topGenre', CategoryEncoder(min_cnt=1)),
    ]

    sequence_columns = [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1))
        for column in ('title', 'genres')
    ] + [
        # Both history columns receive the same two mapping objects.
        Sequence(
            column,
            SequenceEncoder(sep='|',
                            max_len=5,
                            word2idx=movie_word2idx,
                            idx2word=movie_idx2word))
        for column in ('clickedMovieIds', 'noClickedMovieIds')
    ]

    attention_groups = [
        AttentionGroup(name='group1',
                       pairs=[{
                           'ad': 'movieId',
                           'pos_hist': 'clickedMovieIds',
                           'neg_hist': 'noClickedMovieIds'
                       }],
                       hidden_layers=[8, 4])
    ]

    embedding_ref = EmbeddingRef({
        history_column: 'movieId'
        for history_column in ('clickedMovieIds', 'noClickedMovieIds')
    })

    features = Features(number_features=numeric_columns,
                        category_features=category_columns,
                        sequence_features=sequence_columns)

    loader = prepare_dataloader(features)

    return loader, features, attention_groups, embedding_ref
Exemple #11
0
def test_shared_embedding():
    """Smoke-test DNN when columns share an embedding via ``embedding_name``.

    Two sequence encoders are fitted up front on the concatenation of a
    history column and its matching id column. Their vocabularies are then
    handed to both the category and the sequence encoder of each pair, and
    each pair is tagged with a common ``embedding_name``. The 'topGenre'
    pair also requests ``embedding_size=8`` while the model is built with
    ``embedding_size=16``. The test passes if one forward call on the first
    batch raises nothing.
    """
    movie_enc = SequenceEncoder(sep='|', min_cnt=1, max_len=5)
    genre_enc = SequenceEncoder(sep='|', min_cnt=1, max_len=5)

    # axis=None flattens both inputs into a single 1-D array before fitting.
    movie_corpus = np.concatenate(
        (_SAMPLE_DF.clickedMovieIds.values, _SAMPLE_DF.movieId.values),
        axis=None)
    movie_enc.fit(movie_corpus)

    genre_corpus = np.concatenate(
        (_SAMPLE_DF.clickedMovieTopGenres.values, _SAMPLE_DF.topGenre.values),
        axis=None)
    genre_enc.fit(genre_corpus)

    user_column = Category('userId', CategoryEncoder(min_cnt=1))
    movie_column = Category(
        'movieId',
        CategoryEncoder(min_cnt=1,
                        word2idx=movie_enc.word2idx,
                        idx2word=movie_enc.idx2word),
        embedding_name='movieId')
    genre_column = Category(
        'topGenre',
        CategoryEncoder(min_cnt=1,
                        word2idx=genre_enc.word2idx,
                        idx2word=genre_enc.idx2word),
        embedding_name='topGenre',
        embedding_size=8)
    category_columns = [user_column, movie_column, genre_column]

    title_column = Sequence('title', SequenceEncoder(sep='|', min_cnt=1))
    genres_column = Sequence('genres', SequenceEncoder(sep='|', min_cnt=1))
    clicked_movies_column = Sequence(
        'clickedMovieIds',
        SequenceEncoder(sep='|',
                        min_cnt=1,
                        max_len=5,
                        word2idx=movie_enc.word2idx,
                        idx2word=movie_enc.idx2word),
        embedding_name='movieId')
    clicked_genres_column = Sequence(
        'clickedMovieTopGenres',
        SequenceEncoder(sep='|',
                        min_cnt=1,
                        max_len=5,
                        word2idx=genre_enc.word2idx,
                        idx2word=genre_enc.idx2word),
        embedding_name='topGenre',
        embedding_size=8)
    sequence_columns = [
        title_column,
        genres_column,
        clicked_movies_column,
        clicked_genres_column,
    ]

    features = Features(number_features=[],
                        category_features=category_columns,
                        sequence_features=sequence_columns)

    loader, _ = prepare_dataloader(features)

    model = DNN(features,
                num_classes=2,
                embedding_size=16,
                hidden_layers=(8, 4),
                final_activation='sigmoid',
                dropout=0.3)

    first_batch = next(iter(loader))
    model(first_batch)