def test_processor(feature_size, chunksize):
    """Tests FileProcessor.

    :param feature_size: int, size of feature vector.
    :param chunksize: int, FileProcessor chunk size.
    """
    sep = '\t'
    n_rows = 50
    feature = 3
    with TemporaryDirectory() as tmp_dir:
        input_path = os.path.join(tmp_dir, 'data.tsv')
        output_path = os.path.join(tmp_dir, 'data_proc.tsv')
        data, feature_values = generate_data(
            n_rows=n_rows,
            feature=feature,
            feature_size=feature_size,
            seed=42,
        )
        data.to_csv(input_path, sep=sep, index=False)
        reader_params = {
            'chunksize': chunksize,
            'sep': sep,
        }
        transformers = (
            Standardizer,
            MaxIndex,
            MaxFeatureAbsMeanDiff,
        )
        processor = FileProcessor(transformers, reader_params=reader_params)
        processor.train(input_path)
        processor.process(input_path, output_path)
        processed = pd.read_csv(output_path, sep=sep)

        # check feature_{i}_stand_{index}
        expected_stand = (
            (feature_values - feature_values.mean(axis=0))
            / feature_values.std(axis=0, ddof=1)
        )
        stand = processed.filter(regex=f'feature_{feature}_stand_[0-9]+')
        assert np.allclose(expected_stand, stand)
        assert np.allclose(stand.mean(axis=0), 0)
        assert np.allclose(stand.std(axis=0, ddof=1), 1)

        # check max_feature_{i}_index
        expected_max = feature_values.max(axis=1)
        max_index = processed[f'max_feature_{feature}_index'].values
        max_mask = (
            max_index.reshape((-1, 1))
            == np.arange(feature_values.shape[1]).reshape((1, -1))
        )
        fact_max = feature_values[max_mask]
        assert np.allclose(expected_max, fact_max)

        # check max_feature_{i}_abs_mean_diff
        expected_max_mean = np.broadcast_to(
            feature_values.mean(axis=0), shape=max_mask.shape
        )[max_mask]
        expected_abs_mean_diff = np.abs(expected_max - expected_max_mean)
        abs_mean_diff = processed[f'max_feature_{feature}_abs_mean_diff']
        assert np.allclose(expected_abs_mean_diff, abs_mean_diff)
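# For reference, a small self-contained numpy example of the three quantities the test
# above asserts on: the column-wise z-score with ddof=1 (Standardizer), the per-row
# argmax index (MaxIndex), and the absolute difference between the row maximum and the
# mean of the argmax column (MaxFeatureAbsMeanDiff). The toy values are illustrative only.
import numpy as np

values = np.array([[1.0, 5.0, 2.0],
                   [4.0, 0.0, 6.0]])

# Column-wise standardization with the sample standard deviation (ddof=1).
stand = (values - values.mean(axis=0)) / values.std(axis=0, ddof=1)

# Index of the maximum value within each row.
max_index = values.argmax(axis=1)  # -> array([1, 2])

# |row maximum - column mean of the argmax column| for each row.
abs_mean_diff = np.abs(values.max(axis=1) - values.mean(axis=0)[max_index])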
# Build a large input file by appending 100 generated blocks of 100,000 rows each,
# writing the header only once, then run the processor over it in 10,000-row chunks.
header = True
for seed in tqdm(range(100), desc='generating data'):
    data, _ = generate_data(
        n_rows=100_000,
        feature=feature,
        feature_size=512,
        seed=seed,
    )
    data.to_csv(
        input_path,
        sep=sep,
        index=False,
        mode='a',
        header=header,
    )
    header = False
reader_params = {
    'chunksize': 10_000,
    'sep': sep,
}
transformers = (
    Standardizer,
    MaxIndex,
    MaxFeatureAbsMeanDiff,
)
processor = FileProcessor(transformers, reader_params=reader_params)
processor.train(input_path)
processor.process(input_path, output_path)
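# reader_params is forwarded to the pandas CSV reader, so processing is chunked and the
# 10,000,000-row file never has to fit in memory at once. A minimal sketch of the
# chunked-reading pattern this relies on (illustrative only, not FileProcessor's code):
import pandas as pd


def iter_chunks(path, chunksize=10_000, sep='\t'):
    """Yield successive DataFrame chunks of at most `chunksize` rows each."""
    reader = pd.read_csv(path, sep=sep, chunksize=chunksize)
    for chunk in reader:
        yield chunk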
parser.add_argument(
    '-o', '--output',
    default=os.path.join(DATA_DIR, OUTPUT_FILE),
    help='Path to the file where the processed data will be stored.',
)
parser.add_argument(
    '-c', '--chunk_size',
    type=int,
    default=128,
    help='Number of rows to process at a time.',
)
args = parser.parse_args()

if __name__ == '__main__':
    train_path = args.train
    test_path = args.input
    output_path = args.output
    reader_params = {
        'chunksize': args.chunk_size,
        'sep': SEPARATOR,
    }
    transformers = (
        Standardizer,
        MaxIndex,
        MaxFeatureAbsMeanDiff,
    )
    processor = FileProcessor(transformers, reader_params=reader_params)
    processor.train(train_path)
    processor.process(test_path, output_path)
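# The script above implies a two-phase contract: train() makes a pass over the training
# file so the transformers can accumulate whatever statistics they need, and process()
# streams the input file chunk by chunk, applies the fitted transformers, and writes the
# result to the output file. The skeleton below is a hypothetical sketch of that contract,
# not the project's actual FileProcessor; the fit_chunk/transform method names are assumed.
import pandas as pd


class FileProcessorSketch:
    def __init__(self, transformers, reader_params):
        # Transformer classes are passed in (as in the tuples above) and instantiated here;
        # reader_params are forwarded to pandas.read_csv.
        self.transformers = [cls() for cls in transformers]
        self.reader_params = reader_params

    def train(self, train_path):
        # First pass: each transformer sees every chunk and updates its running statistics
        # (e.g. per-column mean/std for standardization).
        for chunk in pd.read_csv(train_path, **self.reader_params):
            for transformer in self.transformers:
                transformer.fit_chunk(chunk)  # hypothetical method name

    def process(self, input_path, output_path):
        # Second pass: transform each chunk and append it to the output file, writing the
        # header only for the first chunk.
        sep = self.reader_params.get('sep', ',')
        first = True
        for chunk in pd.read_csv(input_path, **self.reader_params):
            out = pd.concat(
                [t.transform(chunk) for t in self.transformers],  # hypothetical method name
                axis=1,
            )
            out.to_csv(
                output_path,
                sep=sep,
                index=False,
                mode='w' if first else 'a',
                header=first,
            )
            first = False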