Example #1
import os
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd

# generate_data, FileProcessor and the transformer classes (Standardizer,
# MaxIndex, MaxFeatureAbsMeanDiff) are imported from the project under test.


def test_processor(feature_size, chunksize):
    """Tests FileProcessor.

    :param feature_size: int, size of feature vector.
    :param chunksize: int, FileProcessor chunk size.
    """
    sep = '\t'
    n_rows = 50
    feature = 3

    with TemporaryDirectory() as tmp_dir:
        input_path = os.path.join(tmp_dir, 'data.tsv')
        output_path = os.path.join(tmp_dir, 'data_proc.tsv')

        data, feature_values = generate_data(n_rows=n_rows,
                                             feature=feature,
                                             feature_size=feature_size,
                                             seed=42)

        data.to_csv(input_path, sep=sep, index=False)

        reader_params = {
            'chunksize': chunksize,
            'sep': sep,
        }

        transformers = (
            Standardizer,
            MaxIndex,
            MaxFeatureAbsMeanDiff,
        )
        processor = FileProcessor(transformers, reader_params=reader_params)
        processor.train(input_path)
        processor.process(input_path, output_path)

        processed = pd.read_csv(output_path, sep=sep)

    # check feature_{i}_stand_{index}
    expected_stand = ((feature_values - feature_values.mean(axis=0))
                      / feature_values.std(axis=0, ddof=1))
    stand = processed.filter(regex=f'feature_{feature}_stand_[0-9]+')
    assert np.allclose(expected_stand, stand)
    assert np.allclose(stand.mean(axis=0), 0)
    assert np.allclose(stand.std(axis=0, ddof=1), 1)

    # check max_feature_{i}_index
    expected_max = feature_values.max(axis=1)
    max_index = processed[f'max_feature_{feature}_index'].values
    max_mask = (max_index.reshape((-1, 1))
                == np.arange(feature_values.shape[1]).reshape((1, -1)))
    fact_max = feature_values[max_mask]
    assert np.allclose(expected_max, fact_max)

    # check max_feature_{i}_abs_mean_diff
    expected_max_mean = np.broadcast_to(feature_values.mean(axis=0),
                                        shape=max_mask.shape)[max_mask]
    expected_abs_mean_diff = np.abs(expected_max - expected_max_mean)
    abs_mean_diff = processed[f'max_feature_{feature}_abs_mean_diff']
    assert np.allclose(expected_abs_mean_diff, abs_mean_diff)
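
Since test_processor takes feature_size and chunksize as arguments, it is presumably driven by pytest parametrization. A minimal sketch of such a setup follows; the concrete parameter values are assumptions, not taken from the project.

import pytest

# Hypothetical parameter grid; the values actually used by the project may
# differ. The decorators sit directly above the test_processor definition
# shown above.
@pytest.mark.parametrize('chunksize', [5, 17, 50])
@pytest.mark.parametrize('feature_size', [1, 4, 64])
def test_processor(feature_size, chunksize):
    ...  # body as in Example #1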
Example #2
        # Append 100 blocks of 100_000 rows each (10M rows in total) to the
        # input file; the header is written only with the first block.
        header = True
        for seed in tqdm(range(100), desc='generating data'):
            data, _ = generate_data(
                n_rows=100_000,
                feature=feature,
                feature_size=512,
                seed=seed
            )
            data.to_csv(
                input_path,
                sep=sep,
                index=False,
                mode='a',
                header=header
            )
            header = False

        reader_params = {
            'chunksize': 10_000,
            'sep': sep,
        }

        transformers = (
            Standardizer,
            MaxIndex,
            MaxFeatureAbsMeanDiff,
        )
        processor = FileProcessor(transformers, reader_params=reader_params)
        processor.train(input_path)
        processor.process(input_path, output_path)
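
The reader_params dict is passed through to pandas: with chunksize set, pd.read_csv returns an iterator of DataFrames rather than a single frame, which is presumably what lets FileProcessor work through the 10M-row file without loading it all at once. A minimal sketch of that reading pattern, independent of FileProcessor internals (which are not shown here):

import pandas as pd

input_path = 'data.tsv'          # the file written in the snippet above
reader_params = {'chunksize': 10_000, 'sep': '\t'}

# With chunksize set, read_csv yields DataFrames of at most 10_000 rows each,
# so only one chunk has to fit in memory at a time.
total_rows = 0
for chunk in pd.read_csv(input_path, **reader_params):
    total_rows += len(chunk)
print(total_rows)  # 10_000_000 for the data generated above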
Example #3
parser.add_argument(
    '-o', '--output',
    default=os.path.join(DATA_DIR, OUTPUT_FILE),
    help='Path to file, where processed data will be stored.',
)
parser.add_argument(
    '-c', '--chunk_size',
    type=int,  # argparse passes strings by default; chunksize must be an int
    default=128,
    help='Rows to process at a time.',
)
args = parser.parse_args()


if __name__ == '__main__':
    train_path = args.train
    test_path = args.input
    output_path = args.output
    reader_params = {
        'chunksize': args.chunk_size,
        'sep': SEPARATOR,
    }

    transformers = (
        Standardizer,
        MaxIndex,
        MaxFeatureAbsMeanDiff,
    )
    processor = FileProcessor(transformers, reader_params=reader_params)
    processor.train(train_path)
    processor.process(test_path, output_path)
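
The main block reads args.train and args.input, so the parser presumably defines those two arguments before the excerpt shown above. A minimal sketch of that setup follows; the short flags, the TRAIN_FILE and INPUT_FILE constants, and the help texts are assumptions, not taken from the project.

# Hypothetical definitions of the arguments the main block relies on; the
# actual flag names, defaults and help strings in the project may differ.
parser.add_argument(
    '-t', '--train',
    default=os.path.join(DATA_DIR, TRAIN_FILE),   # assumed constant
    help='Path to the file used to fit the transformers.',
)
parser.add_argument(
    '-i', '--input',
    default=os.path.join(DATA_DIR, INPUT_FILE),   # assumed constant
    help='Path to the file that will be processed.',
)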