def main(frac: float = default_sample_frac):
    """
    Creates a sample of the solar wind data and saves it.
    # Parameters
    frac: `float`
        Should be between 0.0 and 1.0; the proportion of the
        dataset to include in the sample dataset.
    """
    logging.info(f'making sample with frac={frac}')
    logging.info('reading config file')
    config = load_data.read_config_file('./config/config.yml')
    directories = config['directories']
    interim_path = Path(directories['interim'])
    # reading ground-truth data
    logging.info('reading training data')
    solar_wind = load_data.read_feather(interim_path / 'solar_wind.feather')

    logging.info('splitting dataset')
    _, valid_idx = load_data.split_train_data(solar_wind,
                                              test_frac=frac,
                                              eval_mode=True)

    sample_data = solar_wind.loc[valid_idx, :]
    sample_data.reset_index(drop=True, inplace=True)
    logging.info('saving file..')
    sample_data.to_feather(interim_path / 'sample_solar_wind.feather')
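

# Hedged sketch (not part of the original script): `load_data.split_train_data`
# is project code not shown here. A minimal stand-in that holds out the last
# `test_frac` fraction of rows as the validation split, a common choice for
# time series:
import numpy as np


def _sketch_split_train_data(df, test_frac=0.2, eval_mode=True):
    n = len(df)
    cut = int(n * (1 - test_frac)) if eval_mode else n
    return np.arange(cut), np.arange(cut, n)
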
def main():
    """
    This function will save the solar wind data
    as a Feather file
    """
    # read the main config file
    config = load_data.read_config_file('./config/config.yml')
    # get the raw and interim directory paths
    directories = config['directories']
    raw_path = Path(directories['raw'])
    interim_path = Path(directories['interim'])
    interim_path.mkdir(exist_ok=True, parents=True)
    logging.info('reading solar wind data..')
    # reading CSV file
    solar_wind = load_data.read_csv(raw_path / 'solar_wind.csv')
    logging.info('saving to feather..')
    # saving as feather file
    solar_wind.to_feather(interim_path / 'solar_wind.feather')
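

# Hedged sketch (not part of the original script) of the CSV -> Feather
# conversion performed above, using plain pandas (load_data.read_csv is
# assumed to wrap pd.read_csv; paths are illustrative):
import pandas as pd


def _sketch_csv_to_feather(csv_path: str, feather_path: str) -> None:
    df = pd.read_csv(csv_path)
    # to_feather requires a default RangeIndex, hence the reset
    df.reset_index(drop=True).to_feather(feather_path)
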
def main(use_sample: bool = False, n_jobs: int = 1):
    """
    This function applies all the steps needed to create
    a dataset ready for model training.
    The steps are:
        - read the data
        - compute the solar wind features
        - compute satellite position features
        - take the log of smoothed_ssn values
        - create the target for the current time t and t + 1 hour
        - merge all datasets into a single one
        - save the dataset for future modeling
    # Params
    use_sample: `bool`, optional (default=False)
        Whether or not to use the sample dataset
    n_jobs: `int`, optional (default=1)
        The number of jobs to run in parallel

    """
    logging.info(f'use_sample={use_sample}, n_jobs={n_jobs}')
    logging.info('reading config file')
    config = load_data.read_config_file('./config/config.yml')
    # directories
    directories = config['directories']
    raw_path = Path(directories['raw'])
    interim_path = Path(directories['interim'])
    processed_path = Path(directories['processed'])
    processed_path.mkdir(exist_ok=True, parents=True)

    # reading ground-truth data
    solar_wind_file = ('sample_solar_wind.feather'
                       if use_sample else 'solar_wind.feather')
    logging.info('reading training data')
    dst_labels = load_data.read_csv(raw_path / 'dst_labels.csv')
    solar_wind = load_data.read_feather(interim_path / solar_wind_file)
    sunspots = load_data.read_csv(raw_path / 'sunspots.csv')
    stl_pos = load_data.read_csv(raw_path / 'satellite_positions.csv')

    logging.info('preprocessing solar wind')
    # preprocessing solar wind
    # setting timedelta as index
    solar_wind.set_index('timedelta', inplace=True)
    # preprocessing solar wind time series
    solar_wind = solar_wind_preprocessing(solar_wind)
    logging.info('computing features')
    start = time.time()
    # computing solar wind features
    data = split_into_period(solar_wind,
                             features=default.init_features,
                             n_jobs=n_jobs)
    elapsed_time = (time.time() - start) / 60
    logging.info(f'elapsed time {elapsed_time:.4f} minutes')

    logging.info('merging other datasets')
    # create target
    target = create_target(dst_labels)
    # preprocessing satellite positions
    stl_pos = stl_preprocessing(stl_pos)
    # taking the log of smoothed_ssn values
    sunspots['smoothed_ssn'] = np.log(sunspots['smoothed_ssn'])
    # merging dataframes to the main dataframe
    data = merge_daily(data, stl_pos)
    data = merge_daily(data, sunspots)
    # merging target dataframe to the main dataframe
    data = data.merge(target, how='left', on=['period', 'timedelta'])
    # dropping trailing rows where target data is not available
    data.dropna(subset=['t0', 't1'], inplace=True)
    # reset index
    data.reset_index(inplace=True, drop=True)
    logging.info('saving')
    output_filename = 'fe' if not use_sample else 'fe_sample'
    # saving to feather format
    data.to_feather(processed_path / f'{output_filename}.feather')
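

# Hedged sketch (not part of the original script): `create_target` is project
# code not shown here. Given that its output is merged on
# ['period', 'timedelta'] and supplies 't0' and 't1' columns, one plausible
# construction from the hourly dst labels is (the 'dst' column name is an
# assumption):
import pandas as pd


def _sketch_create_target(dst_labels: pd.DataFrame) -> pd.DataFrame:
    out = dst_labels.sort_values(['period', 'timedelta']).copy()
    out['t0'] = out['dst']                              # target at time t
    out['t1'] = out.groupby('period')['dst'].shift(-1)  # target at t + 1 hour
    return out[['period', 'timedelta', 't0', 't1']]
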
def main(experiment_path: str,
         eval_mode: bool = True,
         use_sample: bool = False,
         test_frac: float = 0.2,
         message: str = None,
         fi_threshold: float = None):
    """
    A function to train or validate an Experiment
    # Parameters
    experiment_path: `str`
        A path to the experiment folder containing its config file.
        The config file must be named config.yml and
        it must contain the following keys:
            model: `str`
                the path to the model config file
            pipeline: `str`
                the path to the pipeline config file
            optimizer: `Dict[str, Any]`
                the parameters for the Adam optimizer
            epochs: `int`, optional (default=10)
                the number of epochs to train the model
            use_sigmoid: `bool`, optional (default=False)
                Whether or not to use the sigmoid as the final
                activation function of the model

    eval_mode: `bool`, optional (default=True)
        if False, the model is trained on all available data and the
        trained model is saved for inference; if True, part of the data
        is held out and the model is evaluated on it.
    use_sample: `bool`, optional (default=False)
        if True, only a sample of the dataset is used.
        Run make_sample.py beforehand to create this
        sample dataset.
    test_frac: `float`, optional (default=0.2)
        if eval_mode is True, the validation dataset will contain a
        {test_frac} fraction of the main dataset.
    message: `str`, optional (default=None)
        mlflow is used to keep track of all parameters and errors of each
        experiment; any string passed here is registered in the experiment
        record in mlflow.
    fi_threshold: `float`, optional (default=None)
        if a feature importance file already exists, this value is used to
        keep only the features whose importance is greater than
        {fi_threshold}.
    """
    # getting experiment name
    experiment = os.path.basename(experiment_path)
    logging.info(f'running {experiment}')
    logging.info(f'eval_mode={eval_mode}, use_sample={use_sample}')
    logging.info('reading config file')
    # creating experiment path and loading experiment config file
    experiment_path = Path(experiment_path)
    config = load_data.read_config_file('./config/config.yml')
    experiment_config = load_data.read_config_file(experiment_path /
                                                   'config.yml')
    # reading experiment's model and pipeline config file
    pipeline_config = load_data.read_config_file(experiment_config['pipeline'])
    model_config = load_data.read_config_file(experiment_config['model'])

    directories = config['directories']
    # getting the data path
    processed_path = Path(directories['processed'])
    # creating a prediction folder to save prediction after training
    prediction_path = experiment_path / 'prediction'
    prediction_path.mkdir(exist_ok=True, parents=True)
    # creating a model path to save models after training
    model_path = experiment_path / 'models'

    # reading preprocessed data
    filename = ('fe' if not use_sample else 'fe_sample')
    logging.info('reading training data')
    data = load_data.read_feather(processed_path / f'{filename}.feather')

    logging.info('splitting dataset')
    train_idx, valid_idx = load_data.split_train_data(data,
                                                      test_frac=test_frac,
                                                      eval_mode=eval_mode)
    train_data = data.loc[train_idx, :]
    valid_data = data.loc[valid_idx, :]

    train_data.reset_index(drop=True, inplace=True)
    valid_data.reset_index(drop=True, inplace=True)
    # building pipeline
    logging.info('building pipeline')
    pipeline = build_pipeline(pipeline_config)
    logging.info(f'{pipeline}')

    # fit pipeline
    logging.info('training pipeline')
    pipeline.fit(train_data)
    # transform both training and valid dataset
    logging.info('transforming datasets')
    train_data = pipeline.transform(train_data)
    valid_data = pipeline.transform(valid_data)

    # getting the lower and upper limits of the target
    # in case we want to use the sigmoid function
    # as the final activation function of our model
    use_sigmoid = experiment_config.pop('use_sigmoid', False)
    y_limit = ((train_data['t0'].agg(('max', 'min')) * 1.2).to_list()
               if use_sigmoid else None)
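    # (Hedged note: how the model consumes y_limit is defined in model code
    # not shown here; a typical pattern is
    # out = sigmoid(x) * (upper - lower) + lower, keeping predictions within
    # 1.2x the observed target range.)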

    # loading the features to train our model;
    # if a feature importance file exists,
    # we can use it to train our model only with relevant features
    features = load_data.get_features(train_data,
                                      experiment_path=experiment_path,
                                      fi_threshold=fi_threshold,
                                      ignore_features=default.ignore_features)
    in_features = len(features)
    logging.info(f'modeling using {len(features)} features')
    logging.info(f'{features[:30]}')

    # creating datasets
    train_ds = Dataset.from_dataframe(train_data,
                                      features=features,
                                      target=target_name,
                                      device=device)
    valid_ds = Dataset.from_dataframe(valid_data,
                                      features=features,
                                      target=target_name,
                                      device=device)
    # creating dataloaders
    train_dl = DataLoader(dataset=train_ds,
                          batch_size=batch_size,
                          shuffle=True)
    not_shuffle_train_dl = DataLoader(dataset=train_ds,
                                      batch_size=batch_size,
                                      shuffle=False)
    valid_dl = DataLoader(dataset=valid_ds,
                          batch_size=batch_size,
                          shuffle=False)
    # creating databunch
    bunch = DataBunch(train_dl, valid_dl)

    # importing the model instance
    model_instance = model_library[model_config['instance']]
    # init the model
    model = model_instance(in_features=in_features,
                           out_features=len(target_name),
                           y_limit=y_limit,
                           **model_config['parameters']).to(device=device)
    # init optimizer
    optimizer = optim.Adam(model.parameters(),
                           **experiment_config['optimizer'])

    # creating learner instance
    logging.info('creating learner instance')
    cbs = [
        Recoder,
        MetricRecorderCallBack(metrics.torch_rmse),
        ModelCheckpointCallBack,
        ProgressBarCallBack,
    ]

    learner = Learner(model, optimizer, bunch, callbacks=cbs)

    logging.info('training model')
    # importing epochs, default is 10
    epochs = experiment_config.pop('epochs', 10)
    # train the model
    learner.fit(epochs, seed=2020)

    # average the weights of the last 5 epochs
    top_models = np.arange(epochs)[-5:]
    learner.modelcheckpoint.load_averaged_model(top_models)

    logging.info('predicting h0 and h1')
    # predicting
    valid_output = predict_dl(learner.model, valid_dl)
    train_output = predict_dl(learner.model, not_shuffle_train_dl)
    valid_data[['yhat_t0', 'yhat_t1']] = valid_output['prediction'].numpy()
    train_data[['yhat_t0', 'yhat_t1']] = train_output['prediction'].numpy()

    # computing metrics
    train_error = compute_metrics(train_data, suffix='_train')
    valid_error = compute_metrics(valid_data, suffix='_valid')

    train_error_period = compute_metrics_per_period(train_data,
                                                    suffix='_train')
    valid_error_period = compute_metrics_per_period(valid_data,
                                                    suffix='_valid')

    logging.info('errors')
    logging.info(f'{train_error}')
    logging.info(f'{valid_error}')
    logging.info('period errors')
    logging.info(f'{train_error_period}')
    logging.info(f'{valid_error_period}')
    if eval_mode:
        with mlflow.start_run(run_name=experiment):
            # saving predictions
            train_prediction = train_data.loc[:, default.keep_columns]
            train_prediction.to_csv(prediction_path / 'train.csv', index=False)
            # saving training progress
            learner.metrics_table.to_csv(experiment_path / 'trn_progress.csv',
                                         index=False)
            # saving errors
            train_error_period.to_csv(experiment_path / 'train_errors.csv',
                                      index=False)
            valid_error_period.to_csv(experiment_path / 'valid_errors.csv',
                                      index=False)
            # valid_prediction = valid_data.loc[:, default.keep_columns]
            valid_data.to_csv(prediction_path / 'valid.csv', index=False)
            # saving feature importances if available
            fi = permutation_importance(model=learner.model,
                                        data=valid_data,
                                        features=features,
                                        target=target_name,
                                        score_func=metrics.rmse)
            if fi_threshold is None:
                fi.to_csv(experiment_path / 'fi_h0.csv', index=False)
                fi.to_csv(experiment_path / 'fi_h1.csv', index=False)
            # saving to mlflow
            # saving metrics
            mlflow.log_metrics(train_error)
            mlflow.log_metrics(valid_error)
            # saving model parameters
            mlflow.log_params(model_config['parameters'])
            mlflow.log_params(experiment_config['optimizer'])
            mlflow.log_params({
                'epochs': epochs,
                'use_sigmoid': use_sigmoid,
                'fi_threshold': fi_threshold,
                'in_features': in_features
            })
            tags = {
                'use_sample': use_sample,
                'model_instance': model_config['instance'],
                'experiment': experiment
            }
            if message is not None:
                tags['message'] = message
            mlflow.set_tags(tags)
    else:
        # computing a check error, then saving model artifacts
        test_error = calculate_error_on_test(train_data)
        test_error = pd.DataFrame([test_error])
        test_error.to_csv(experiment_path / 'check_test_error.csv',
                          index=False)
        model_path.mkdir(exist_ok=True, parents=True)
        joblib.dump(learner.model, model_path / 'model_h0.pkl')
        joblib.dump(pipeline, model_path / 'pipeline.pkl')
        joblib.dump(features, model_path / 'features.pkl')
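

# Hedged sketch (not part of the original script):
# `modelcheckpoint.load_averaged_model` is project code not shown here.
# Averaging the weights of the last few checkpoints can be done with plain
# torch state dicts (checkpoint paths are illustrative):
import torch


def _sketch_average_checkpoints(paths):
    states = [torch.load(p, map_location='cpu') for p in paths]
    return {k: torch.stack([s[k].float() for s in states]).mean(dim=0)
            for k in states[0]}
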
def main(experiment_path: str,
         eval_mode: bool = True,
         use_sample: bool = False,
         test_frac: float = 0.2,
         message: str = None,
         fi_threshold: float = None):
    """
    A function to train or validate an Experiment
    # Parameters
    experiment_path: `str`
        A path to the experiment folder containing its config file.
        The config file must be named config.yml and
        it must contain the following keys:
            model: `str`
                the path to the model config file
            pipeline: `str`
                the path to the pipeline config file

    eval_mode: `bool`, optional (default=True)
        if False, the model is trained on all available data and the
        trained model is saved for inference; if True, part of the data
        is held out and the model is evaluated on it.
    use_sample: `bool`, optional (default=False)
        if True, only a sample of the dataset is used.
        Run make_sample.py beforehand to create this
        sample dataset.
    test_frac: `float`, optional (default=0.2)
        if eval_mode is True, the validation dataset will contain a
        {test_frac} fraction of the main dataset.
    message: `str`, optional (default=None)
        mlflow is used to keep track of all parameters and errors of each
        experiment; any string passed here is registered in the experiment
        record in mlflow.
    fi_threshold: `float`, optional (default=None)
        if a feature importance file already exists, this value is used to
        keep only the features whose importance is greater than
        {fi_threshold}.
    """
    # getting experiment name
    experiment = os.path.basename(experiment_path)
    logging.info(f'running {experiment}')
    logging.info(f'eval_mode={eval_mode}, use_sample={use_sample}')
    logging.info('reading config file')
    # creating experiment path and loading experiment config file
    experiment_path = Path(experiment_path)
    config = load_data.read_config_file('./config/config.yml')
    experiment_config = load_data.read_config_file(experiment_path /
                                                   'config.yml')
    # reading experiment's model and pipeline config file
    pipeline_config = load_data.read_config_file(experiment_config['pipeline'])
    model_config = load_data.read_config_file(experiment_config['model'])

    directories = config['directories']
    # getting the data path
    processed_path = Path(directories['processed'])
    # creating a prediction folder to save prediction after training
    prediction_path = experiment_path / 'prediction'
    prediction_path.mkdir(exist_ok=True, parents=True)
    # creating a model path to save models after training
    model_path = experiment_path / 'models'

    # reading preprocessed data
    filename = ('fe' if not use_sample else 'fe_sample')
    logging.info('reading training data')
    data = load_data.read_feather(processed_path / f'{filename}.feather')

    logging.info('splitting dataset')
    train_idx, valid_idx = load_data.split_train_data(data,
                                                      test_frac=test_frac,
                                                      eval_mode=eval_mode)
    train_data = data.loc[train_idx, :]
    valid_data = data.loc[valid_idx, :]

    train_data.reset_index(drop=True, inplace=True)
    valid_data.reset_index(drop=True, inplace=True)
    # building pipeline
    logging.info('building pipeline')
    pipeline = build_pipeline(pipeline_config)
    logging.info(f'{pipeline}')

    # fit pipeline
    logging.info('training pipeline')
    pipeline.fit(train_data)
    # transform both training and valid dataset
    logging.info('transforming datasets')
    train_data = pipeline.transform(train_data)
    valid_data = pipeline.transform(valid_data)

    # loading the features to train our model;
    # if a feature importance file exists,
    # we can use it to train our model only with relevant features
    features = load_data.get_features(train_data,
                                      experiment_path=experiment_path,
                                      fi_threshold=fi_threshold,
                                      ignore_features=default.ignore_features)
    in_features = len(features)
    logging.info(f'modeling using {len(features)} features')
    logging.info(f'{features[:30]}')

    # importing model instance
    model_instance = model_library[model_config['instance']]
    logging.info('training horizon 0 model')
    # training model for horizon 0
    model_h0 = model_instance(**model_config['parameters'])
    model_h0.fit(train_data.loc[:, features], train_data.loc[:, 't0'])

    logging.info('training horizon 1 model')
    # training model for horizon 1
    model_h1 = model_instance(**model_config['parameters'])
    model_h1.fit(train_data.loc[:, features], train_data.loc[:, 't1'])

    logging.info('predicting with h0 and h1 models')
    # predicting
    train_data['yhat_t0'] = model_h0.predict(train_data.loc[:, features])
    train_data['yhat_t1'] = model_h1.predict(train_data.loc[:, features])
    valid_data['yhat_t0'] = model_h0.predict(valid_data.loc[:, features])
    valid_data['yhat_t1'] = model_h1.predict(valid_data.loc[:, features])

    # compute errors
    train_error = compute_metrics(train_data, suffix='_train')
    valid_error = compute_metrics(valid_data, suffix='_valid')

    train_error_period = compute_metrics_per_period(train_data,
                                                    suffix='_train')
    valid_error_period = compute_metrics_per_period(valid_data,
                                                    suffix='_valid')
    logging.info('errors')
    logging.info(f'{train_error}')
    logging.info(f'{valid_error}')
    logging.info('period errors')
    logging.info(f'{train_error_period}')
    logging.info(f'{valid_error_period}')
    if eval_mode:
        with mlflow.start_run(run_name=experiment):
            # saving predictions
            train_prediction = train_data.loc[:, default.keep_columns]
            train_prediction.to_csv(prediction_path / 'train.csv', index=False)
            # saving errors
            train_error_period.to_csv(experiment_path / 'train_errors.csv',
                                      index=False)
            valid_error_period.to_csv(experiment_path / 'valid_errors.csv',
                                      index=False)
            # valid_prediction = valid_data.loc[:, default.keep_columns]
            valid_data.to_csv(prediction_path / 'valid.csv', index=False)
            # saving feature importances if available
            fi_h0 = feature_importances(model_h0, features)
            fi_h1 = feature_importances(model_h1, features)
            if (fi_h0 is not None and fi_h1 is not None
                    and fi_threshold is None):
                fi_h0.to_csv(experiment_path / 'fi_h0.csv', index=False)
                fi_h1.to_csv(experiment_path / 'fi_h1.csv', index=False)
            # saving to mlflow
            # saving metrics
            mlflow.log_metrics(train_error)
            mlflow.log_metrics(valid_error)
            mlflow.log_params({
                'fi_threshold': fi_threshold,
                'in_features': in_features
            })
            # saving model parameters
            mlflow.log_params(model_config['parameters'])
            tags = {
                'use_sample': use_sample,
                'model_instance': model_config['instance'],
                'experiment': experiment
            }
            if message is not None:
                tags['message'] = message
            mlflow.set_tags(tags)
    else:
        # computing a check error, then saving model artifacts
        test_error = calculate_error_on_test(train_data)
        test_error = pd.DataFrame([test_error])
        test_error.to_csv(experiment_path / 'check_test_error.csv',
                          index=False)
        model_path.mkdir(exist_ok=True, parents=True)
        joblib.dump(model_h0, model_path / 'model_h0.pkl')
        joblib.dump(model_h1, model_path / 'model_h1.pkl')
        joblib.dump(pipeline, model_path / 'pipeline.pkl')
        joblib.dump(features, model_path / 'features.pkl')
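

# Hedged sketch (not part of the original script): `feature_importances` is
# project code not shown here. Given the None-check above, it plausibly
# returns None for models without a fitted `feature_importances_` attribute
# (e.g. non-tree sklearn models):
import pandas as pd


def _sketch_feature_importances(model, features):
    if not hasattr(model, 'feature_importances_'):
        return None
    return (pd.DataFrame({'feature': features,
                          'importance': model.feature_importances_})
            .sort_values('importance', ascending=False)
            .reset_index(drop=True))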