Example No. 1
def test_processor(feature_size, chunksize):
    """Tests FileProcessor end to end: train + process on generated data.

    Writes a small generated dataset to a temp TSV, runs the full
    transformer pipeline, then verifies each produced column family
    against values recomputed directly from the raw feature matrix.

    :param feature_size: int, size of feature vector.
    :param chunksize: int, FileProcessor chunk size.
    """
    sep = '\t'
    n_rows = 50
    feature = 3  # index of the feature whose output columns are checked

    # `tmp_dir` (not `dir`) so the builtin dir() is not shadowed.
    with TemporaryDirectory() as tmp_dir:
        input_path = os.path.join(tmp_dir, 'data.tsv')
        output_path = os.path.join(tmp_dir, 'data_proc.tsv')

        data, feature_values = generate_data(n_rows=n_rows,
                                             feature=feature,
                                             feature_size=feature_size,
                                             seed=42)

        data.to_csv(input_path, sep=sep, index=False)

        reader_params = {
            'chunksize': chunksize,
            'sep': sep,
        }

        transformers = (
            Standardizer,
            MaxIndex,
            MaxFeatureAbsMeanDiff,
        )
        processor = FileProcessor(transformers, reader_params=reader_params)
        processor.train(input_path)
        processor.process(input_path, output_path)

        # Read back before the temp dir (and output file) are deleted.
        processed = pd.read_csv(output_path, sep=sep)

    # check feature_{i}_stand_{index}: column-wise z-score with sample std
    # (ddof=1), matching what Standardizer is expected to emit.
    expected_stand = (feature_values -
                      feature_values.mean(axis=0)) / feature_values.std(axis=0,
                                                                        ddof=1)
    stand = processed.filter(regex=f'feature_{feature}_stand_[0-9]+')
    assert np.allclose(expected_stand, stand)
    assert np.allclose(stand.mean(axis=0), 0)
    assert np.allclose(stand.std(axis=0, ddof=1), 1)

    # check max_feature_{i}_index: the reported index must point at the
    # row-wise maximum. NOTE(review): the boolean-mask extraction assumes
    # exactly one True per row (no duplicated maxima) — generated data is
    # presumably tie-free; confirm against generate_data.
    expected_max = feature_values.max(axis=1)
    max_index = processed[f'max_feature_{feature}_index'].values
    max_mask = (max_index.reshape(
        (-1, 1)) == np.arange(feature_values.shape[1]).reshape((1, -1)))
    fact_max = feature_values[max_mask]
    assert np.allclose(expected_max, fact_max)

    # check max_feature_{i}_abs_mean_diff: |row max - column mean of the
    # column where that max occurred|.
    expected_max_mean = np.broadcast_to(feature_values.mean(axis=0),
                                        shape=max_mask.shape)[max_mask]
    expected_abs_mean_diff = np.abs(expected_max - expected_max_mean)
    abs_mean_diff = processed[f'max_feature_{feature}_abs_mean_diff']
    assert np.allclose(expected_abs_mean_diff, abs_mean_diff)
Example No. 2
        # Build a large input file by appending 100 generated chunks of
        # 100,000 rows each (10M rows total, 512-dim feature vectors).
        # Only the first chunk writes the CSV header; subsequent appends
        # (mode='a') must not repeat it.
        header = True
        for seed in tqdm(range(100), desc='generating data'):
            data, _ = generate_data(
                n_rows=100_000,
                feature=feature,
                feature_size=512,
                seed=seed  # distinct seed per chunk -> distinct data
            )
            data.to_csv(
                input_path,
                sep=sep,
                index=False,
                mode='a',
                header=header
            )
            header = False

        # Stream the file in 10k-row chunks so the 10M-row input is never
        # fully materialized in memory.
        reader_params = {
            'chunksize': 10_000,
            'sep': sep,
        }

        transformers = (
            Standardizer,
            MaxIndex,
            MaxFeatureAbsMeanDiff,
        )
        processor = FileProcessor(transformers, reader_params=reader_params)
        # Two passes over the same file: fit transformer statistics, then
        # write the transformed output.
        processor.train(input_path)
        processor.process(input_path, output_path)
Example No. 3
parser.add_argument(
    '-o', '--output',
    default=os.path.join(DATA_DIR, OUTPUT_FILE),
    help='Path to file, where processed data will be stored.',
)
parser.add_argument(
    '-c', '--chunk_size',
    # type=int is required: without it a user-supplied "-c 256" arrives
    # as the string "256" and is passed to the reader's `chunksize`,
    # while the default stays an int — inconsistent and buggy.
    type=int,
    default=128,
    help='Rows to process at a time.',
)
args = parser.parse_args()


if __name__ == '__main__':
    # Fit on the training file, then transform the test file.
    train_path = args.train
    test_path = args.input
    output_path = args.output
    reader_params = {
        'chunksize': args.chunk_size,
        'sep': SEPARATOR,
    }

    transformers = (
        Standardizer,
        MaxIndex,
        MaxFeatureAbsMeanDiff,
    )
    processor = FileProcessor(transformers, reader_params=reader_params)
    processor.train(train_path)
    processor.process(test_path, output_path)