def test_column_flow_define():
    """Check fit/transform of ``Features`` whose transformers are ``ColumnFlow`` pipelines.

    Builds two number features (``age`` with no transformer, ``height``
    with a ``StandardScaler`` flow), one category feature (``gender``) and
    one sequence feature (``likes``), fits them on ``__TEST_DATA`` and
    verifies the feature-name accessors and the transformed columns.
    """
    features = Features(
        [
            Number('age', None),
            Number('height', ColumnFlow([StandardScaler()])),
        ],
        [
            Category('gender', ColumnFlow([CategoryEncoder(min_cnt=1)])),
        ],
        [
            Sequence('likes', ColumnFlow([SequenceEncoder(sep=' ', min_cnt=1)])),
        ],
    )
    features.fit(__TEST_DATA)
    transformed = features.transform(__TEST_DATA)

    # Columns that must match the expected values exactly.
    exact_columns = {
        'age': np.array([23, 43, 35, 41, 16, 32, 26, 76]),
        'gender': np.array([2, 2, 1, 2, 1, 1, 2, 2]),
    }
    # Scaled output is floating point, so it is compared approximately.
    height_scaled = np.array([
        0.1159659, 0.85814767, -0.99730676, -0.06957954,
        -1.73948853, -0.34789771, 0.48705679, 1.69310217,
    ])

    assert len(transformed) == 4
    assert features.number_feature_names() == ['age', 'height']
    assert features.category_feature_names() == ['gender']
    assert features.sequence_feature_names() == ['likes']

    for column, wanted in exact_columns.items():
        np.testing.assert_array_equal(transformed[column], wanted)
    np.testing.assert_array_almost_equal(transformed['height'], height_scaled)
def create_dataloader_fn(
        number_features,
        category_features,
        sequence_features,
        batch_size,
        train_df,
        label_col='label',
        test_df=None,
        num_workers=0):
    """Fit ``Features`` on the training frame and wrap the data in loaders.

    Args:
        number_features: Passed to ``Features`` as ``number_features``.
        category_features: Passed to ``Features`` as ``category_features``.
        sequence_features: Passed to ``Features`` as ``sequence_features``.
        batch_size: Batch size used for both loaders.
        train_df: Frame the features are fitted on; must contain
            ``label_col``.
        label_col: Name of the label column.
        test_df: Optional frame for a second, unshuffled loader. Its
            labels are used only when ``label_col`` is among its columns.
        num_workers: Forwarded to ``data.DataLoader`` for both loaders.

    Returns:
        A tuple ``(features, train_loader, test_loader)`` where
        ``test_loader`` is ``None`` when ``test_df`` is ``None``.
    """
    # The object returned by ``fit`` is the one used from here on.
    features = Features(
        number_features=number_features,
        category_features=category_features,
        sequence_features=sequence_features,
    ).fit(train_df)

    loader_kwargs = {'batch_size': batch_size, 'num_workers': num_workers}

    train_dataset = Dataset(
        features,
        features.transform(train_df),
        train_df[label_col].values,
    )
    # Only the training loader shuffles.
    train_loader = data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)

    if test_df is None:
        return features, train_loader, None

    test_inputs = features.transform(test_df)
    if label_col in set(test_df.columns):
        test_targets = test_df[label_col].values
    else:
        # Test frame carries no labels.
        test_targets = None

    test_dataset = Dataset(features, test_inputs, test_targets)
    test_loader = data.DataLoader(test_dataset, shuffle=False, **loader_kwargs)
    return features, train_loader, test_loader
def test_normal():
    """Check that ``Dataset`` exposes transformed features and the label per item.

    Fits number, category and sequence features on ``__SAMPLE_DF``,
    wraps the transformed data together with the ``label`` column in a
    ``Dataset`` and inspects the entries of the first item.
    """
    scaled_columns = [
        Number(column, StandardScaler())
        for column in ('userAge', 'rating')
    ]
    encoded_columns = [
        Category(column, CategoryEncoder(min_cnt=1))
        for column in ('userId', 'movieId', 'topGenre')
    ]
    # The click-history sequences additionally pass max_len=5.
    sequence_specs = (
        ('title', {}),
        ('genres', {}),
        ('clickedMovieIds', {'max_len': 5}),
        ('clickedMovieTopGenres', {'max_len': 5}),
    )
    sequence_columns = [
        Sequence(column, SequenceEncoder(sep='|', min_cnt=1, **extra))
        for column, extra in sequence_specs
    ]

    features = Features(
        number_features=scaled_columns,
        category_features=encoded_columns,
        sequence_features=sequence_columns,
    )
    features.fit(__SAMPLE_DF)
    dataset = Dataset(
        features,
        features.transform(__SAMPLE_DF),
        __SAMPLE_DF.label.values,
    )

    assert dataset[0]['userId'] == 1
    assert dataset[0]['movieId'] == 1
    # The sequence entry is compared as a plain list; its length is
    # exposed under a separate '__genres_length' key.
    assert dataset[0]['genres'].tolist() == [8, 9, 0, 0]
    assert dataset[0]['__genres_length'] == 2
    assert dataset[0]['label'] == 1