Example #1
def _preprocess_csv_for_training(
        features,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """
    Method to pre-process csv data
    :param features: list of all features (input + output)
    :param data_csv: path to the csv data
    :param data_train_csv:  training csv data
    :param data_validation_csv: validation csv data
    :param data_test_csv: test csv data
    :param train_set_metadata_json: train set metadata json
    :param skip_save_processed_input: if False, the pre-processed data is saved
    as .hdf5 files in the same location as the csvs with the same names.
    :param preprocessing_params: preprocessing parameters
    :param random_seed: random seed
    :return: training, test, validation datasets, training metadata
    """
    train_set_metadata = None
    if train_set_metadata_json is not None:
        train_set_metadata = load_metadata(train_set_metadata_json)

    if data_csv is not None:
        # Use data_csv and ignore the _train, _validation and _test CSVs;
        # needs preprocessing.
        logger.info(
            'Using full raw csv, no hdf5 and json file '
            'with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logger.info('Writing train set metadata with vocabulary')

            train_set_metadata_json_fp = replace_file_extension(
                data_csv,
                'json'
            )
            data_utils.save_json(
                train_set_metadata_json_fp, train_set_metadata)

        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_csv is not None:
        # Use data_train_csv (plus _validation and _test if present) and
        # ignore data_csv; needs preprocessing.
        logger.info(
            'Using training raw csv, no hdf5 and json '
            'file with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(
            data_train_csv,
            data_validation_csv,
            data_test_csv
        )
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
            data_utils.save_hdf5(
                data_train_hdf5_fp,
                training_set,
                train_set_metadata
            )
            if validation_set is not None:
                data_validation_hdf5_fp = replace_file_extension(
                    data_validation_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_validation_hdf5_fp,
                    validation_set,
                    train_set_metadata
                )
            if test_set is not None:
                data_test_hdf5_fp = replace_file_extension(data_test_csv,
                                                           'hdf5')
                data_utils.save_hdf5(
                    data_test_hdf5_fp,
                    test_set,
                    train_set_metadata
                )
            logger.info('Writing train set metadata with vocabulary')
            train_set_metadata_json_fp = replace_file_extension(data_train_csv,
                                                                'json')
            data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    return training_set, test_set, validation_set, train_set_metadata
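
A minimal usage sketch for the helper above (the `model_definition` dict and the CSV file names are placeholders, not part of the original snippet):

features = model_definition['input_features'] + model_definition['output_features']
training_set, test_set, validation_set, train_set_metadata = (
    _preprocess_csv_for_training(
        features,
        data_train_csv='train.csv',
        data_validation_csv='validation.csv',
        data_test_csv='test.csv',
        skip_save_processed_input=True  # do not write hdf5/json next to the CSVs
    )
)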
Example #2
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
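    """
    Pre-processes data for training from whichever input is provided: in-memory
    dataframes, raw CSV files, or already processed hdf5 files together with a
    train set metadata JSON. Returns training, validation and test Dataset
    objects plus the train set metadata.
    """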
    # Check if hdf5 and json already exist
    data_hdf5_fp = None
    data_train_hdf5_fp = None
    data_validation_hdf5_fp = None
    data_test_hdf5_fp = None
    train_set_metadata_json_fp = 'metadata.json'
    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(data_csv)[0] + '.json'
        if (os.path.isfile(data_hdf5_fp)
                and os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = os.path.splitext(data_train_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(
            data_train_csv)[0] + '.json'
        if (os.path.isfile(data_train_hdf5_fp)
                and os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = os.path.splitext(
            data_validation_csv)[0] + '.hdf5'
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = os.path.splitext(data_test_csv)[0] + '.hdf5'
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide whether to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv, concatenate_df, build_dataset,
     build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset_df(data_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
        logging.info('Writing train set metadata with vocabulary')
        data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_df(data_train_df, data_validation_df,
                                         data_test_df)
        data, train_set_metadata = build_dataset_df(concatenated_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp, training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp, validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp, test_set,
                                     train_set_metadata)
        logging.info('Writing train set metadata with vocabulary')
        data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    elif data_csv is not None:
        # Use data_csv and ignore the _train, _validation and _test CSVs;
        # also ignore any train set metadata. Needs preprocessing.
        logging.info('Using full raw csv, no hdf5 and json file '
                     'with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(data_csv,
                                                 features,
                                                 preprocessing_params,
                                                 random_seed=random_seed)
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_csv is not None:
        # Use data_train_csv (plus _validation and _test if present) and
        # ignore data_csv and train set metadata; needs preprocessing.
        logging.info('Using training raw csv, no hdf5 and json '
                     'file with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(data_train_csv, data_validation_csv,
                                          data_test_csv)
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(concatenated_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp, training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp, validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp, test_set,
                                     train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True)
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(data_train_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(data_validation_hdf5,
                                       model_definition['input_features'],
                                       model_definition['output_features'],
                                       split_data=False)
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(data_test_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(model_definition,
                               [training_set, validation_set, test_set])

    training_dataset = Dataset(training_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(validation_set,
                                     model_definition['input_features'],
                                     model_definition['output_features'],
                                     data_hdf5_fp)

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(test_set, model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    return (training_dataset, validation_dataset, test_dataset,
            train_set_metadata)
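
A hypothetical call with an in-memory dataframe (the `model_definition` below is a placeholder with one text input and one binary output, and `reviews_df` is an assumed pandas DataFrame; neither comes from the original snippet):

model_definition = {
    'input_features': [{'name': 'review', 'type': 'text'}],
    'output_features': [{'name': 'positive', 'type': 'binary'}],
}
(training_dataset, validation_dataset, test_dataset,
 train_set_metadata) = preprocess_for_training(
    model_definition,
    data_df=reviews_df,              # assumed pandas DataFrame
    skip_save_processed_input=True   # skip writing the processed hdf5
)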
Example #3
        type=yaml.safe_load,
        default='{}',
        help='the parameters for preprocessing the different features'
    )

    parser.add_argument(
        '-rs',
        '--random_seed',
        type=int,
        default=42,
        help='a random seed that is going to be used anywhere there is a call '
             'to a random number generator: data splitting, parameter '
             'initialization and training set shuffling'
    )

    args = parser.parse_args()

    data, train_set_metadata = build_dataset(
        args.dataset_csv,
        args.train_set_metadata_json,
        args.features,
        args.preprocessing_parameters,
        args.random_seed
    )

    # write train set metadata, dataset
    logger.info('Writing train set metadata with vocabulary')
    data_utils.save_json(args.output_metadata_json, train_set_metadata)
    logger.info('Writing dataset')
    data_utils.save_hdf5(args.output_dataset_h5, data, train_set_metadata)
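
After the script above runs, the written metadata JSON can be inspected with the standard library; the file name below stands in for whatever path was passed as the output metadata JSON argument:

import json

with open('train_set_metadata.json') as f:
    train_set_metadata = json.load(f)
# typically one entry per feature, holding vocabularies and other preprocessing info
print(list(train_set_metadata.keys()))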
Example #4
def save(self, cache_path, dataset, config, training_set_metadata, tag):
    data_utils.save_hdf5(cache_path, dataset)
    if tag == TRAINING:
        training_set_metadata[DATA_TRAIN_HDF5_FP] = cache_path
    return dataset
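
A minimal sketch of how the cache-save hook above might be exercised; `TRAINING`, `DATA_TRAIN_HDF5_FP` and the stub class are assumptions standing in for the real module's constants and owner class, and the hdf5 write is replaced by a no-op:

TRAINING = 'training'                      # assumed value of the constant
DATA_TRAIN_HDF5_FP = 'data_train_hdf5_fp'  # assumed value of the constant

class CacheSaverStub:
    def save(self, cache_path, dataset, config, training_set_metadata, tag):
        # the real implementation calls data_utils.save_hdf5(cache_path, dataset)
        if tag == TRAINING:
            training_set_metadata[DATA_TRAIN_HDF5_FP] = cache_path
        return dataset

metadata = {}
CacheSaverStub().save('train.hdf5', {'split': []}, {}, metadata, TRAINING)
assert metadata[DATA_TRAIN_HDF5_FP] == 'train.hdf5'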