Example 1
import logging

import numpy as np
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# `preprocessing_pipeline` and `Model` are project-local helpers defined
# elsewhere in the project; their import paths are not shown in this listing.


def main(input_filepath, output_filepath, config_file):
    """Runs data loading and cleaning and pre-processing scripts and
    saves data in ../processed."""
    logger = logging.getLogger(__name__)
    logger.info('Loading training data set, setting up pipeline, tuning, '
                'training and evaluating final model.')

    # Parse config file
    # config = parse_config(config_file)

    # Load training data
    X_train = pd.read_csv(input_filepath + '/X_train.csv')
    y_train = pd.read_csv(input_filepath + '/y_train.csv').values.ravel()

    # Pre-processing and modeling pipeline
    cat_features = X_train.select_dtypes(exclude='float64').columns
    num_features = X_train.select_dtypes(include='float64').columns

    pipe = Pipeline([('preprocessing',
                      preprocessing_pipeline(cat_features, num_features)),
                     ('model',
                      TransformedTargetRegressor(regressor=SVR(),
                                                 func=np.log1p,
                                                 inverse_func=np.expm1))])

    # Tune or select model
    #   kf = KFold(config['modeling']['num_folds'], shuffle=True,
    #   random_state=rng).get_n_splits(X_train.values)

    model = Model(model=pipe)

    # Train model
    model.train(X_train, y_train)

    # Save model
    model.save(output_filepath + '/' + model.name + '.pkl')
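
# NOTE: `preprocessing_pipeline(cat_features, num_features)` is a project-local
# helper that is not shown in this listing. The sketch below is only a plausible
# implementation, assuming one-hot encoding for the categorical columns and
# standard scaling for the numeric ones; the actual transformers may differ.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def preprocessing_pipeline(cat_features, num_features):
    """Encode categorical features and scale numeric ones (illustrative sketch)."""
    return ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
                      ('num', StandardScaler(), num_features)])
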
    # Build the validation dataset; the `Dataset` class and its arguments other
    # than `training=False` are assumed here.
    val_dataset = Dataset(config, training=False)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=config.batch_size,
                            num_workers=2,
                            drop_last=False,
                            shuffle=False)
    sample_iterator = val_dataset.create_iterator(config.sample_size)

    model = Model(config, logger=logger)
    model.load(is_test=False)
    steps_per_epoch = len(train_dataset) // config.batch_size
    iteration = model.iteration
    epoch = model.iteration // steps_per_epoch
    logger.info('Start from epoch:{}, iteration:{}'.format(epoch, iteration))

    model.train()
    keep_training = True
    best_score = {}
    while keep_training:
        epoch += 1

        stateful_metrics = ['epoch', 'iter', 'g_lr']
        progbar = Progbar(len(train_dataset),
                          max_iters=steps_per_epoch,
                          width=20,
                          stateful_metrics=stateful_metrics)
        for items in train_loader:
            model.train()
            items = to_cuda(items, config.device)
            _, g_loss, d_loss, logs = model.get_losses(items)
            model.backward(g_loss=g_loss, d_loss=d_loss)
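
# NOTE: `to_cuda(items, config.device)` is a project-local helper that is not
# shown in this listing. A minimal sketch, assuming `items` is a tuple or list
# of tensors returned by the DataLoader:
import torch


def to_cuda(items, device):
    """Move every tensor in `items` to the given device (illustrative sketch)."""
    return [item.to(device) if torch.is_tensor(item) else item for item in items]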