Ejemplo n.º 1
0
def test_get_features_shape():
    """Verify the shapes produced by the preprocessing/feature pipeline.

    Runs the raw data through preprocessing, then checks that the
    extracted feature matrix and label vector have the expected sizes
    (150 rows, 4 feature columns — the Iris dimensions).
    """
    raw_df = read_raw_data()
    prepared = preprocess_data(raw_df)
    # NOTE(review): `get_featues` looks like a typo for `get_features` —
    # confirm against the project source before renaming.
    feature_matrix = get_featues(prepared)
    target_vector = get_label(prepared)

    assert feature_matrix.shape == (150, 4)
    assert target_vector.shape == (150, )
Ejemplo n.º 2
0
def test_get_features_shape():
    """Verify the shapes produced by the preprocessing/feature pipeline.

    Runs the raw data through preprocessing, then checks that the
    extracted feature matrix and target vector have the expected sizes.
    """
    dframe = read_raw_data()
    processed = preprocess_data(dframe)
    features = get_features(processed)
    label = get_target(processed)

    assert features.shape == (34109, 7)
    # BUG FIX: `.shape` is a tuple, so the expected value must be a tuple
    # too.  `(34109)` is just the int 34109 (no trailing comma), and a
    # tuple never compares equal to an int — the original assert could
    # never pass.
    assert label.shape == (34109,)
def main(targets):
    """Dispatch the build targets given on the command line.

    Recognized targets: ``all``, ``clean``, ``test``, ``data``,
    ``features``, ``train``, ``generate``.  An empty target list is
    treated like ``all``.  The ``test`` target switches configuration to
    the test directory and, when given alone, also runs every standard
    target.  ``generate`` is deliberately excluded from ``all``.

    :param targets: sequence of target name strings.
    :return: None.
    """
    # Will change to test config path if test target is seen.
    # NOTE(review): logging.json is read below *before* 'test' swaps
    # config_dir, so the test target still uses the non-test logging
    # configuration — confirm whether that is intended.
    config_dir = 'config'
    run_all = False

    # Set up logging
    with open(Path(config_dir, 'logging.json')) as f:
        logging_params = json.load(f)

    if logging_params['produce_logs']:
        log_file = logging_params['log_file']
        ensure_path_exists(log_file)
        logging.basicConfig(
            filename=log_file,
            filemode='a',
            format='%(asctime)s, %(name)s %(levelname)s %(message)s',
            datefmt='%H:%M:%S',
            level=logging.DEBUG)
        logging.info(f"{'*'*80}\nBEGIN RUN\n{'*'*80}")

    # Regardless of if a logfile is being collected, we should also get the logs
    # to show up in standard out.
    logging.getLogger().addHandler(logging.StreamHandler())

    # An empty target list is equivalent to 'all' (idiomatic truthiness check
    # instead of len(targets) == 0).
    if 'all' in targets or not targets:
        run_all = True

    if 'clean' in targets:
        # Would probably just delete the data folder... but should truly look at
        # the configuration to decide what to delete.
        raise NotImplementedError

    if 'test' in targets:
        # If `test` is the only target seen, then run all targets with the
        # configs and data found in the test directory.
        #
        # Otherwise, if additional targets are specified then only run those
        # targets but still use test config (and therefore test data).
        # print('Test target recognized. Will use test configuration files.')
        logging.info(
            'Test target recognized. Will use test configuration files.')
        config_dir = 'test/config'

        if len(targets) == 1:
            # print('Testing all targets: `data`, `features`, `train`.')
            run_all = True

    if 'data' in targets or run_all:
        # Load, clean, and preprocess data. Then store preprocessed data to
        # configured intermediate directory.
        # print('Data target recognized.')
        logging.info('Data target recognized.')

        with open(Path(config_dir, 'data-params.json'), 'r') as f:
            data_params = json.load(f)

        # CONSISTENCY FIX: every other target has its print() calls commented
        # out in favor of logging; the StreamHandler attached above already
        # echoes these messages to the console.
        # print('Running ETL pipeline.')
        logging.info('Running ETL pipeline.')
        preprocess_data(**data_params)
        # print('ETL pipeline complete.')
        logging.info('ETL pipeline complete.')

    if 'features' in targets or run_all:
        # Creates features for preprocessed data and stores feature-engineered
        # data to a configured csv and directory.
        # print('Features target recognized.')
        logging.info('Features target recognized.')

        with open(Path(config_dir, 'features-params.json'), 'r') as f:
            features_params = json.load(f)

        # print('Engineering features.')
        logging.info('Engineering features.')
        create_features(**features_params)
        # print('Feature engineering complete.')
        logging.info('Feature engineering complete.')

    if 'train' in targets or run_all:
        # Trains model based on feature-engineeered data, report some of its
        # scores, and save the model.
        # print('Train target recognized.')
        logging.info('Train target recognized.')

        with open(Path(config_dir, 'train-params.json'), 'r') as f:
            train_params = json.load(f)

        # print('Training model.')
        logging.info('Training model.')
        train_model(**train_params)
        # print('Model training complete.')
        logging.info('Model training complete.')

    if 'generate' in targets:
        # Generates data from network-stats
        #
        # NOTE: This target should *not* be included in `all`.
        # print('Generate target recognized.')
        logging.info('Generate target recognized.')

        with open(Path(config_dir, 'generate-params.json'), 'r') as f:
            generate_params = json.load(f)

        # print('Collecting data with network-stats.')
        logging.info('Collecting data with network-stats.')
        collect_data(**generate_params)
        # print('Data collection complete.')
        logging.info('Data collection complete.')
Ejemplo n.º 4
0
import os
from argparse import ArgumentParser

from src.data import preprocess_data

# Command-line interface for running the preprocessing step standalone.
argparser = ArgumentParser()
argparser.add_argument("--data_path", help="Path to data pickle file.")
argparser.add_argument("--data_name",
                       choices=["seizure", "normal"],
                       help="Name of data.")
argparser.add_argument(
    "--output_path",
    help="Path to directory where the preprocessed data will be saved.")
argparser.add_argument("--feature_size",
                       default=128,
                       type=int,
                       # BUG FIX: help text said "Batch size." — copy-paste
                       # error; this flag controls the feature size.
                       help="Feature size.")

args = argparser.parse_args()

# Validate through the parser rather than `assert`: asserts are stripped when
# Python runs with -O (silently skipping validation), and argparser.error()
# exits with a proper usage message and non-zero status.
if not args.data_path:
    argparser.error("--data_path is required")
if not args.data_name:
    argparser.error("--data_name is required")
if not (args.output_path and os.path.isdir(args.output_path)):
    argparser.error("--output_path must be an existing directory")

preprocess_data(data_fname=args.data_path,
                data_name=args.data_name,
                output_dir=args.output_path,
                feature_size=args.feature_size)