def train_pipeline(params: PipelineParams):
    """Run the end-to-end training workflow configured by ``params``.

    Steps, in order: read the table at ``params.train_data_path``, split it
    into train and validation parts, pull the target out of each part, fit
    the feature transformer on the train part only, train the model, score
    it on the validation part, and persist the metrics (JSON), the model and
    the pickled transformer at the paths given in ``params``.

    Args:
        params: Pipeline configuration; this function reads its
            ``train_data_path``, ``split_params``, ``features_params``,
            ``train_params``, ``metric_path``, ``model_path`` and
            ``transformer_path`` attributes.

    Returns:
        A ``(path_to_model, metrics)`` tuple, where ``path_to_model`` is the
        value returned by ``dump_model`` and ``metrics`` is the value
        returned by ``evaluate_model``.
    """
    logger.info(f"Start train with params {params}.")

    data = read_data(params.train_data_path)
    logger.info(f"Data shape is {data.shape}")

    data_train, data_val = split_train_val_data(data, params.split_params)
    logger.info(f"Train data shape is {data_train.shape}")
    logger.info(f"Validation data shape is {data_val.shape}")

    feature_cfg = params.features_params

    # Train part: the target is extracted before the 'target' column is
    # dropped, and the transformer is fitted on the remaining columns only.
    y_train = extract_target(data_train, feature_cfg)
    data_train = data_train.drop(columns=['target'])
    feature_builder = build_transformer(feature_cfg)
    feature_builder.fit(data_train)
    features_train = make_features(feature_builder, data_train)
    logger.info(f"Train features shape is {features_train.shape}")

    # Validation part: reuses the transformer fitted above (no re-fit).
    y_val = extract_target(data_val, feature_cfg)
    data_val = data_val.drop(columns=['target'])
    features_val = make_features(feature_builder, data_val)
    logger.info(f"Validation features shape is {features_val.shape}")

    fitted_model = train_model(features_train, y_train, params.train_params)
    val_predictions = predict_model(fitted_model, features_val)
    metrics = evaluate_model(val_predictions, y_val)

    with open(params.metric_path, "w") as metrics_out:
        json.dump(metrics, metrics_out)
    logger.info(f"Metrics are: {metrics}")

    path_to_model = dump_model(fitted_model, params.model_path)
    logger.info(f"Model saved at {params.model_path}")

    with open(params.transformer_path, "wb") as transformer_out:
        pickle.dump(feature_builder, transformer_out)
    logger.info(f"Feature transformer saved at {params.transformer_path}")

    logger.info("Finished.")
    return path_to_model, metrics
def test_train_model(
    features_and_target: Tuple[pd.DataFrame, pd.Series],
    training_params: LogisticRegressionParams,
):
    """Check that ``train_model`` yields a usable ``LogisticRegression``.

    The returned model must be a ``LogisticRegression`` instance and must
    produce exactly one prediction per target row when applied to the
    features it was trained on.
    """
    train_x, train_y = features_and_target

    fitted = train_model(train_x, train_y, training_params)
    assert isinstance(fitted, LogisticRegression)

    # One prediction per training row.
    prediction_count = fitted.predict(train_x).shape[0]
    expected_count = train_y.shape[0]
    assert prediction_count == expected_count
def train_pipeline(training_pipeline_params: TrainingPipelineParams, model: SklearnClassifierModel):
    """Train ``model`` on the configured data set and score it on a hold-out.

    The input table is read, the columns selected by the feature parameters
    are dropped, and the rest is split into train and validation frames. The
    feature transformer is fitted on the train frame only. A warning is
    emitted on both ``logger`` and ``warning_logger`` when the train frame
    has fewer than ``NOT_ENOUGH_DATA_THRESHOLD`` rows. Metrics are written
    as JSON to ``metric_path`` and the model is serialized to
    ``output_model_path``.

    Args:
        training_pipeline_params: Pipeline configuration; this function reads
            its ``input_data_path``, ``feature_params``, ``splitting_params``,
            ``metric_path`` and ``output_model_path`` attributes.
        model: The model object handed to ``train_model``.

    Returns:
        A ``(path_to_model, metrics)`` tuple, where ``path_to_model`` is the
        value returned by ``serialize_model`` and ``metrics`` is the value
        returned by ``evaluate_model``.
    """
    logger.info(f"start train pipeline with params {training_pipeline_params}")
    feature_params = training_pipeline_params.feature_params

    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"data.shape is {data.shape}")

    data = drop_columns(data, feature_params)
    logger.info(f"data.shape after dropping some columns is {data.shape}")

    train_df, val_df = split_train_val_data(
        data, training_pipeline_params.splitting_params
    )
    logger.info(f"train_df.shape is {train_df.shape}")
    logger.info(f"val_df.shape is {val_df.shape}")

    train_row_count = train_df.shape[0]
    if train_row_count < NOT_ENOUGH_DATA_THRESHOLD:
        # Reported on both loggers; training still proceeds.
        too_small_notice = "No enough data to build good model"
        logger.warning(too_small_notice)
        warning_logger.warning(too_small_notice)

    # Fit on the train frame only; the validation frame reuses this fit.
    feature_builder = build_transformer(feature_params)
    feature_builder.fit(train_df)

    train_features = make_features(feature_builder, train_df)
    y_train = extract_target(train_df, feature_params)
    logger.info(f"train_features.shape is {train_features.shape}")

    fitted_model = train_model(train_features, y_train, model)

    val_features = make_features(feature_builder, val_df)
    y_val = extract_target(val_df, feature_params)
    logger.info(f"val_features.shape is {val_features.shape}")

    val_predictions = predict_model(
        fitted_model, val_features, feature_params.use_log_trick
    )
    metrics = evaluate_model(
        val_predictions, y_val, use_log_trick=feature_params.use_log_trick
    )

    with open(training_pipeline_params.metric_path, "w") as metrics_out:
        json.dump(metrics, metrics_out)
    logger.info(f"metrics is {metrics}")

    path_to_model = serialize_model(
        fitted_model, training_pipeline_params.output_model_path
    )
    return path_to_model, metrics
from src.models.train_model import * # A model with 3 outputs: # Pointing signs (binary, weight = 1) # Depicting signs (binary, weight = 1) # Lexical signs (categorical, 4 different lexical signs (plus one NULL sign), weight = 1) model_1 = get_model(['PT', 'DS', 'fls'], [2, 2, 5], [1, 1, 1]) features_1_train, annot_1_train = get_data_concatenated( 'DictaSign', 'mixed', ['PT', 'DS', 'fls'], [[1], [1], [41891, 43413, 43422, 42992]], video_indices=np.arange(0, 70)) features_1_valid, annot_1_valid = get_data_concatenated( 'DictaSign', 'mixed', ['PT', 'DS', 'fls'], [[1], [1], [41891, 43413, 43422, 42992]], video_indices=np.arange(70, 94)) t = train_model(model_1, features_1_train, annot_1_train, features_1_valid, annot_1_valid, 2000, 5, 100) print(t.keys()) # A model with 1 output matrix: # [other, Pointing, Depicting, Lexical] model_2 = get_model(['PT-DS-fls'], [4], [1]) features_2_train, annot_2_train = get_data_concatenated( 'NCSLGR', 'sign_types', ['PT', 'DS', 'fls'], [['IX_1p', 'IX_2p', 'IX_3p'], ['DCL', 'LCL', 'SCL', 'BCL', 'ICL', 'BPCL', 'PCL'], ['lexical_with_ns_not_fs', 'fingerspelling', 'fingerspelled_loan_signs']], video_indices=np.arange(0, 10)) features_2_valid, annot_2_valid = get_data_concatenated( 'NCSLGR', 'sign_types', ['PT', 'DS', 'fls'],
import sys
from argparse import ArgumentParser
from pathlib import Path

# Add the parent of the current working directory to the import path before
# importing from `src` (assumes the script is launched from a direct
# subdirectory of the project root -- TODO confirm).
sys.path.append("..")

from src.models import Model
from src.models import train_model

if __name__ == "__main__":
    # Generic training options are declared here; the model class then adds
    # its own options and hands back the parser used for the actual parsing.
    base_parser = ArgumentParser()
    base_parser.add_argument("--max_epochs", type=int, default=1000)
    base_parser.add_argument("--patience", type=int, default=10)
    full_parser = Model.add_model_specific_args(base_parser)
    model_args = full_parser.parse_args()

    # The same parsed namespace configures both the model and the training run.
    train_model(Model(model_args), model_args)
def test_segmenter_end_to_end(self, tmp_path):
    """Smoke test: build a ``Segmenter`` and run ``train_model`` on it.

    The test passes when set-up, model construction and training all finish
    without raising; it makes no assertion about the training result.
    """
    # Set-up uses tmp_path as the data directory.
    self._setup(data_dir=tmp_path)

    # One set of hyper-parameters drives both the model and the training run.
    run_args = self._get_args(tmp_path)
    segmenter = Segmenter(run_args)
    train_model(segmenter, run_args)
def _load_params(config_dir, file_name):
    """Return the parsed JSON configuration `file_name` found in `config_dir`."""
    with open(Path(config_dir, file_name), 'r', encoding='utf-8') as f:
        return json.load(f)


def main(targets):
    """Run the pipeline stages selected by `targets`.

    Recognized targets:

    * ``all`` (or an empty `targets`): run ``data``, ``features``, ``train``.
    * ``test``: read stage configuration from ``test/config`` instead of
      ``config``; when it is the only target, all three stages are run.
    * ``data``: call ``preprocess_data`` with ``data-params.json``.
    * ``features``: call ``create_features`` with ``features-params.json``.
    * ``train``: call ``train_model`` with ``train-params.json``.
    * ``generate``: call ``collect_data`` with ``generate-params.json``;
      deliberately never part of ``all``.
    * ``clean``: not implemented.

    Logging is configured from ``config/logging.json`` (always the non-test
    directory). Progress messages are emitted through ``logging`` only, and
    they reach the console whether or not a log file is being produced.

    Args:
        targets: Collection of target names (strings).

    Raises:
        NotImplementedError: If ``clean`` is among `targets`.
    """
    # Will change to test config path if test target is seen
    config_dir = 'config'
    run_all = False

    # Set up logging
    logging_params = _load_params(config_dir, 'logging.json')
    if logging_params['produce_logs']:
        log_file = logging_params['log_file']
        ensure_path_exists(log_file)
        logging.basicConfig(
            filename=log_file, filemode='a',
            format='%(asctime)s, %(name)s %(levelname)s %(message)s',
            datefmt='%H:%M:%S',
            level=logging.DEBUG)
        logging.info(f"{'*'*80}\nBEGIN RUN\n{'*'*80}")

    # Regardless of whether a log file is being collected, the logs should
    # also show up on the console (StreamHandler() writes to stderr).
    root_logger = logging.getLogger()
    root_logger.addHandler(logging.StreamHandler())
    # Without a log file basicConfig is never called and the root logger
    # keeps its default WARNING level, which would silently drop every
    # progress message below. Use the same level as the log-file setup.
    root_logger.setLevel(logging.DEBUG)

    if 'all' in targets or not targets:
        run_all = True

    if 'clean' in targets:
        # Would probably just delete the data folder... but should truly look
        # at the configuration to decide what to delete.
        raise NotImplementedError

    if 'test' in targets:
        # If `test` is the only target seen, then run all targets with the
        # configs and data found in the test directory.
        #
        # Otherwise, if additional targets are specified then only run those
        # targets but still use test config (and therefore test data).
        logging.info(
            'Test target recognized. Will use test configuration files.')
        config_dir = 'test/config'
        if len(targets) == 1:
            run_all = True

    if 'data' in targets or run_all:
        # Load, clean, and preprocess data. Then store preprocessed data to
        # configured intermediate directory.
        logging.info('Data target recognized.')
        data_params = _load_params(config_dir, 'data-params.json')
        logging.info('Running ETL pipeline.')
        preprocess_data(**data_params)
        logging.info('ETL pipeline complete.')

    if 'features' in targets or run_all:
        # Creates features for preprocessed data and stores feature-engineered
        # data to a configured csv and directory.
        logging.info('Features target recognized.')
        features_params = _load_params(config_dir, 'features-params.json')
        logging.info('Engineering features.')
        create_features(**features_params)
        logging.info('Feature engineering complete.')

    if 'train' in targets or run_all:
        # Trains model based on feature-engineered data, report some of its
        # scores, and save the model.
        logging.info('Train target recognized.')
        train_params = _load_params(config_dir, 'train-params.json')
        logging.info('Training model.')
        train_model(**train_params)
        logging.info('Model training complete.')

    if 'generate' in targets:
        # Generates data from network-stats
        #
        # NOTE: This target should *not* be included in `all`.
        logging.info('Generate target recognized.')
        generate_params = _load_params(config_dir, 'generate-params.json')
        logging.info('Collecting data with network-stats.')
        collect_data(**generate_params)
        logging.info('Data collection complete.')

    return
"""Run Source Code through command line"""

__author__ = "Abhijit Pai"
__email__ = "*****@*****.**"

# Imports from modules in codebase 'src'.
from src.preprocess import clean_data
from src.features import add_features
from src.models import train_model

if __name__ == '__main__':
    # Three sequential stages; each one consumes the previous stage's output.
    cleaned = clean_data(raw_file_name="OnlineRetail.csv")
    enriched = add_features(cleaned)
    trained = train_model(enriched)