Exemple #1
0
def train_pipeline(params: PipelineParams):
    """Run the end-to-end training flow described by ``params``.

    The steps run in this order: read the data from
    ``params.train_data_path``, split it into train and validation parts,
    fit the feature transformer on the train part, build features for both
    parts, train the model, score it on the validation part, and finally
    write the metrics (JSON), the model and the pickled transformer to the
    paths configured in ``params``.

    Returns:
        A tuple ``(path_to_model, metrics)``: the value returned by
        ``dump_model`` and the value returned by ``evaluate_model``.
    """
    logger.info(f"Start train with params {params}.")

    full_frame = read_data(params.train_data_path)
    logger.info(f"Data shape is {full_frame.shape}")

    train_frame, val_frame = split_train_val_data(full_frame, params.split_params)
    logger.info(f"Train data shape is {train_frame.shape}")
    logger.info(f"Validation data shape is {val_frame.shape}")

    # Train part: take the target first, then drop the 'target' column
    # before the transformer is fitted on the remaining columns.
    y_train = extract_target(train_frame, params.features_params)
    train_frame = train_frame.drop(columns=['target'])
    feature_transformer = build_transformer(params.features_params)
    feature_transformer.fit(train_frame)
    x_train = make_features(feature_transformer, train_frame)
    logger.info(f"Train features shape is {x_train.shape}")

    # Validation part: same treatment, reusing the already fitted transformer.
    y_val = extract_target(val_frame, params.features_params)
    val_frame = val_frame.drop(columns=['target'])
    x_val = make_features(feature_transformer, val_frame)
    logger.info(f"Validation features shape is {x_val.shape}")

    fitted_model = train_model(x_train, y_train, params.train_params)
    val_predictions = predict_model(fitted_model, x_val)
    metrics = evaluate_model(val_predictions, y_val)

    # Persist the artifacts: metrics as JSON, then the model, then the transformer.
    with open(params.metric_path, "w") as metrics_out:
        json.dump(metrics, metrics_out)
    logger.info(f"Metrics are: {metrics}")

    path_to_model = dump_model(fitted_model, params.model_path)
    logger.info(f"Model saved at {params.model_path}")

    with open(params.transformer_path, "wb") as transformer_out:
        pickle.dump(feature_transformer, transformer_out)
    logger.info(f"Feature transformer saved at {params.transformer_path}")

    logger.info("Finished.")
    return path_to_model, metrics
Exemple #2
0
def test_train_model(
    features_and_target: Tuple[pd.DataFrame, pd.Series],
    training_params: LogisticRegressionParams,
):
    """Check that ``train_model`` returns a usable ``LogisticRegression``.

    The trained model must be a ``LogisticRegression`` instance and must
    produce exactly one prediction per row of the training target.
    """
    train_features, train_target = features_and_target

    fitted = train_model(train_features, train_target, training_params)
    assert isinstance(fitted, LogisticRegression)

    predictions = fitted.predict(train_features)
    assert predictions.shape[0] == train_target.shape[0]
def train_pipeline(training_pipeline_params: TrainingPipelineParams, model: SklearnClassifierModel):
    """Train ``model`` on the configured data and report validation metrics.

    The data is read from ``input_data_path``, reduced with ``drop_columns``
    and split into train and validation frames. A transformer is fitted on
    the train frame, the model is trained on the train features, predictions
    are made on the validation features, and the resulting metrics are
    written as JSON to ``metric_path``. The trained model is serialized to
    ``output_model_path``.

    A warning is emitted on both ``logger`` and ``warning_logger`` when the
    train frame has fewer rows than ``NOT_ENOUGH_DATA_THRESHOLD``; training
    continues regardless.

    Returns:
        A tuple ``(path_to_model, metrics)``: the value returned by
        ``serialize_model`` and the value returned by ``evaluate_model``.
    """
    params = training_pipeline_params
    logger.info(f"start train pipeline with params {params}")

    frame = read_data(params.input_data_path)
    logger.info(f"data.shape is {frame.shape}")

    feature_params = params.feature_params
    frame = drop_columns(frame, feature_params)
    logger.info(f"data.shape after dropping some columns is {frame.shape}")

    train_part, val_part = split_train_val_data(frame, params.splitting_params)
    logger.info(f"train_df.shape is {train_part.shape}")
    logger.info(f"val_df.shape is {val_part.shape}")

    if train_part.shape[0] < NOT_ENOUGH_DATA_THRESHOLD:
        msg = "No enough data to build good model"
        # Report on the regular logger first, then on the dedicated warning logger.
        for sink in (logger, warning_logger):
            sink.warning(msg)

    feature_transformer = build_transformer(feature_params)
    feature_transformer.fit(train_part)
    x_train = make_features(feature_transformer, train_part)
    y_train = extract_target(train_part, feature_params)
    logger.info(f"train_features.shape is {x_train.shape}")

    fitted_model = train_model(x_train, y_train, model)

    x_val = make_features(feature_transformer, val_part)
    y_val = extract_target(val_part, feature_params)
    logger.info(f"val_features.shape is {x_val.shape}")

    # The same flag is forwarded to both prediction and evaluation.
    log_trick = feature_params.use_log_trick
    val_predictions = predict_model(fitted_model, x_val, log_trick)
    metrics = evaluate_model(val_predictions, y_val, use_log_trick=log_trick)

    with open(params.metric_path, "w") as metrics_out:
        json.dump(metrics, metrics_out)
    logger.info(f"metrics is {metrics}")

    path_to_model = serialize_model(fitted_model, params.output_model_path)
    return path_to_model, metrics
Exemple #4
0
from src.models.train_model import *

# A model with 3 outputs:
# Pointing signs (binary, weight = 1)
# Depicting signs (binary, weight = 1)
# Lexical signs (categorical, 4 different lexical signs (plus one NULL sign), weight = 1)
# The three lists given to get_model are aligned by position with the three
# outputs described above (names, then 2/2/5, then the weights 1/1/1).
model_1 = get_model(['PT', 'DS', 'fls'], [2, 2, 5], [1, 1, 1])
# Training data: 'DictaSign' with video indices 0..69.
# NOTE(review): the four integers in the last list presumably identify the 4
# lexical signs mentioned above -- confirm against get_data_concatenated.
features_1_train, annot_1_train = get_data_concatenated(
    'DictaSign',
    'mixed', ['PT', 'DS', 'fls'], [[1], [1], [41891, 43413, 43422, 42992]],
    video_indices=np.arange(0, 70))
# Validation data: same arguments as the training call, video indices 70..93.
features_1_valid, annot_1_valid = get_data_concatenated(
    'DictaSign',
    'mixed', ['PT', 'DS', 'fls'], [[1], [1], [41891, 43413, 43422, 42992]],
    video_indices=np.arange(70, 94))
# NOTE(review): the meaning of the trailing positional arguments (2000, 5, 100)
# is not visible here -- check the train_model signature before changing them.
t = train_model(model_1, features_1_train, annot_1_train, features_1_valid,
                annot_1_valid, 2000, 5, 100)
# `t` supports .keys(); print them to show what train_model returned.
print(t.keys())

# A model with 1 output matrix:
# [other, Pointing, Depicting, Lexical]
# A single output named 'PT-DS-fls' with 4 classes and weight 1.
model_2 = get_model(['PT-DS-fls'], [4], [1])
# Training data: 'NCSLGR' with video indices 0..9.
# NOTE(review): each inner list presumably groups the annotation labels that
# are counted as the sign type at the same position in ['PT', 'DS', 'fls']
# -- confirm against get_data_concatenated.
features_2_train, annot_2_train = get_data_concatenated(
    'NCSLGR',
    'sign_types', ['PT', 'DS', 'fls'],
    [['IX_1p', 'IX_2p', 'IX_3p'],
     ['DCL', 'LCL', 'SCL', 'BCL', 'ICL', 'BPCL', 'PCL'],
     ['lexical_with_ns_not_fs', 'fingerspelling', 'fingerspelled_loan_signs']],
    video_indices=np.arange(0, 10))
features_2_valid, annot_2_valid = get_data_concatenated(
    'NCSLGR',
    'sign_types', ['PT', 'DS', 'fls'],
Exemple #5
0
import sys
from argparse import ArgumentParser
from pathlib import Path

sys.path.append("..")

from src.models import Model
from src.models import train_model

if __name__ == "__main__":
    # Trainer-level options are registered here; the model class then adds
    # its own options to the parser before the command line is parsed.
    cli = ArgumentParser()
    cli.add_argument("--max_epochs", type=int, default=1000)
    cli.add_argument("--patience", type=int, default=10)

    full_cli = Model.add_model_specific_args(cli)
    hparams = full_cli.parse_args()

    # The same parsed namespace configures both the model and the training run.
    train_model(Model(hparams), hparams)
Exemple #6
0
    def test_segmenter_end_to_end(self, tmp_path):
        """Smoke test: build a ``Segmenter`` and run ``train_model`` on it.

        The test prepares its data under ``tmp_path`` via ``self._setup`` and
        passes if training completes without raising.
        """
        self._setup(data_dir=tmp_path)
        segmenter_args = self._get_args(tmp_path)
        train_model(Segmenter(segmenter_args), segmenter_args)
def _load_params(config_dir, file_name):
    """Return the parsed JSON content of ``file_name`` located in ``config_dir``."""
    with open(Path(config_dir, file_name), 'r') as f:
        return json.load(f)


def main(targets):
    """Run the pipeline stages selected by ``targets``.

    Recognized targets:

    * ``all`` (or an empty ``targets``): run ``data``, ``features``, ``train``.
    * ``test``: read stage parameters from ``test/config`` instead of
      ``config``; if it is the only target, all three stages are run.
    * ``data``: run ``preprocess_data`` with ``data-params.json``.
    * ``features``: run ``create_features`` with ``features-params.json``.
    * ``train``: run ``train_model`` with ``train-params.json``.
    * ``generate``: run ``collect_data`` with ``generate-params.json``;
      never implied by ``all``.
    * ``clean``: not implemented.

    Args:
        targets: Collection of target names (for example a list of strings).

    Returns:
        None.

    Raises:
        NotImplementedError: if ``clean`` is among the targets.
    """
    # Will change to test config path if test target is seen. Note that the
    # logging settings below are always read from this default directory.
    config_dir = 'config'

    # Set up logging
    logging_params = _load_params(config_dir, 'logging.json')

    if logging_params['produce_logs']:
        log_file = logging_params['log_file']
        ensure_path_exists(log_file)
        logging.basicConfig(
            filename=log_file,
            filemode='a',
            format='%(asctime)s, %(name)s %(levelname)s %(message)s',
            datefmt='%H:%M:%S',
            level=logging.DEBUG)
        logging.info(f"{'*'*80}\nBEGIN RUN\n{'*'*80}")

    # Regardless of if a logfile is being collected, the logs should also show
    # up on the console (StreamHandler writes to stderr by default).
    #
    # The level has to be set explicitly: without it the root logger stays at
    # WARNING whenever basicConfig was skipped (no logfile requested) or was a
    # no-op (handlers already installed), and every info message below would
    # be silently dropped.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    root_logger.addHandler(logging.StreamHandler())

    run_all = 'all' in targets or not targets

    if 'clean' in targets:
        # Would probably just delete the data folder... but should truly look at
        # the configuration to decide what to delete.
        raise NotImplementedError

    if 'test' in targets:
        # If `test` is the only target seen, then run all targets with the
        # configs and data found in the test directory.
        #
        # Otherwise, if additional targets are specified then only run those
        # targets but still use test config (and therefore test data).
        logging.info(
            'Test target recognized. Will use test configuration files.')
        config_dir = 'test/config'

        if len(targets) == 1:
            # Testing all targets: `data`, `features`, `train`.
            run_all = True

    if 'data' in targets or run_all:
        # Load, clean, and preprocess data. Then store preprocessed data to
        # configured intermediate directory.
        logging.info('Data target recognized.')
        data_params = _load_params(config_dir, 'data-params.json')

        # These two messages are also printed to stdout; kept so existing
        # console output does not change.
        print('Running ETL pipeline.')
        logging.info('Running ETL pipeline.')
        preprocess_data(**data_params)
        print('ETL pipeline complete.')
        logging.info('ETL pipeline complete.')

    if 'features' in targets or run_all:
        # Creates features for preprocessed data and stores feature-engineered
        # data to a configured csv and directory.
        logging.info('Features target recognized.')
        features_params = _load_params(config_dir, 'features-params.json')

        logging.info('Engineering features.')
        create_features(**features_params)
        logging.info('Feature engineering complete.')

    if 'train' in targets or run_all:
        # Trains model based on feature-engineered data, report some of its
        # scores, and save the model.
        logging.info('Train target recognized.')
        train_params = _load_params(config_dir, 'train-params.json')

        logging.info('Training model.')
        train_model(**train_params)
        logging.info('Model training complete.')

    if 'generate' in targets:
        # Generates data from network-stats
        #
        # NOTE: This target should *not* be included in `all`.
        logging.info('Generate target recognized.')
        generate_params = _load_params(config_dir, 'generate-params.json')

        logging.info('Collecting data with network-stats.')
        collect_data(**generate_params)
        logging.info('Data collection complete.')
"""Run Source Code through command line"""

__author__ = "Abhijit Pai"
__email__ = "*****@*****.**"

# Imports from modules in codebase 'src'.
from src.preprocess import clean_data
from src.features import add_features
from src.models import train_model

if __name__ == '__main__':
    # Chain the three stages: clean the raw file, add features to the cleaned
    # data, then hand the result to model training.
    final_df = train_model(
        add_features(
            clean_data(raw_file_name="OnlineRetail.csv")
        )
    )