Ejemplo n.º 1
0
def create_dataloader_fn(
        number_features, category_features, sequence_features, batch_size,
        train_df, label_col='label', test_df=None, num_workers=0):
    """Fit a ``Features`` object and wrap the transformed data in loaders.

    The feature definitions are fitted on ``train_df`` only; the fitted
    object is then used to transform both the training frame and, when
    given, the test frame.

    Args:
        number_features: Forwarded to ``Features(number_features=...)``.
        category_features: Forwarded to ``Features(category_features=...)``.
        sequence_features: Forwarded to ``Features(sequence_features=...)``.
        batch_size: Batch size handed to every ``data.DataLoader``.
        train_df: Frame used for fitting and for the training loader.
            Must contain ``label_col``.
        label_col: Name of the column holding the labels.
        test_df: Optional frame for the test loader. Its labels are used
            only if ``label_col`` is among its columns; otherwise the test
            dataset is built with ``None`` as labels.
        num_workers: Worker count handed to every ``data.DataLoader``.

    Returns:
        A ``(features, train_loader, test_loader)`` tuple, where
        ``test_loader`` is ``None`` when ``test_df`` is ``None``.
    """
    fitted = Features(
        number_features=number_features,
        category_features=category_features,
        sequence_features=sequence_features,
    ).fit(train_df)

    def _build_loader(x_map, targets, shuffle):
        # Both loaders share batch size and worker count; only the
        # shuffling policy differs between train and test.
        return data.DataLoader(
            Dataset(fitted, x_map, targets), batch_size=batch_size,
            shuffle=shuffle, num_workers=num_workers)

    train_loader = _build_loader(
        fitted.transform(train_df), train_df[label_col].values, True)

    # Guard clause: without a test frame there is nothing more to build.
    if test_df is None:
        return fitted, train_loader, None

    test_x_map = fitted.transform(test_df)
    # The test frame may be unlabeled (e.g. pure inference input).
    has_labels = label_col in set(test_df.columns)
    test_targets = test_df[label_col].values if has_labels else None

    return fitted, train_loader, _build_loader(test_x_map, test_targets, False)
Ejemplo n.º 2
0
def test_column_flow_define():
    """Fit/transform ``__TEST_DATA`` with transformers wrapped in ColumnFlow.

    Covers one number feature declared with ``None`` instead of a flow
    ('age'), one scaled number feature ('height'), one encoded category
    ('gender') and one encoded sequence ('likes').
    """
    features = Features(
        [
            Number('age', None),
            Number('height', ColumnFlow([StandardScaler()])),
        ],
        [
            Category('gender', ColumnFlow([CategoryEncoder(min_cnt=1)])),
        ],
        [
            Sequence(
                'likes', ColumnFlow([SequenceEncoder(sep=' ', min_cnt=1)])),
        ],
    )

    features.fit(__TEST_DATA)
    transformed = features.transform(__TEST_DATA)

    want_age = np.array([23, 43, 35, 41, 16, 32, 26, 76])
    want_gender = np.array([2, 2, 1, 2, 1, 1, 2, 2])
    want_height = np.array([
        0.1159659, 0.85814767, -0.99730676, -0.06957954,
        -1.73948853, -0.34789771, 0.48705679, 1.69310217,
    ])

    # The transform result is expected to hold exactly four entries.
    assert len(transformed) == 4

    # Feature names are reported per kind, in declaration order.
    assert features.number_feature_names() == ['age', 'height']
    assert features.category_feature_names() == ['gender']
    assert features.sequence_feature_names() == ['likes']

    # Exact match for the integer-valued columns, approximate match for
    # the floating-point scaled column.
    np.testing.assert_array_equal(transformed['age'], want_age)
    np.testing.assert_array_equal(transformed['gender'], want_gender)
    np.testing.assert_array_almost_equal(transformed['height'], want_height)
Ejemplo n.º 3
0
def test_normal():
    """Build a ``Dataset`` from ``__SAMPLE_DF`` and inspect its first item.

    Transformers are passed to the feature declarations directly. The
    assertions check the encoded ids, the encoded 'genres' sequence, its
    recorded length and the label of item 0.
    """
    # Each comprehension iteration creates its own transformer instance,
    # so no scaler or encoder is shared between columns.
    numbers = [
        Number(column, StandardScaler())
        for column in ('userAge', 'rating')
    ]
    categories = [
        Category(column, CategoryEncoder(min_cnt=1))
        for column in ('userId', 'movieId', 'topGenre')
    ]
    sequences = [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1))
        for column in ('title', 'genres')
    ]
    # The click-history columns additionally pass max_len=5.
    sequences += [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for column in ('clickedMovieIds', 'clickedMovieTopGenres')
    ]

    features = Features(
        number_features=numbers,
        category_features=categories,
        sequence_features=sequences,
    )
    features.fit(__SAMPLE_DF)

    encoded = features.transform(__SAMPLE_DF)
    dataset = Dataset(features, encoded, __SAMPLE_DF.label.values)

    assert dataset[0]['userId'] == 1
    assert dataset[0]['movieId'] == 1
    assert dataset[0]['genres'].tolist() == [8, 9, 0, 0]
    # The dataset exposes the sequence length under a '__<name>_length' key.
    assert dataset[0]['__genres_length'] == 2
    assert dataset[0]['label'] == 1