def run_validate(config: str, mode: str, bucket: str):
    # Libraries ------------------------------------------------------------------------------------------------------
    import logging
    import sys

    import yaml

    from src.validate import DataValidator
    from src.helpers import load_data

    # Settings -------------------------------------------------------------------------------------------------------
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    try:
        logging.info('Initializing configuration...')
        # Use a context manager so the config file handle is always closed
        with open(config, 'r') as stream:
            config = yaml.load(stream=stream, Loader=yaml.FullLoader)

        validator = DataValidator(config=config)
        df = load_data(input_path=config['raw_path'],
                       input_data=config['raw_data'],
                       mode=mode,
                       bucket=bucket)
        validation_status = validator.validate(df=df)
        validator.check_validity(validation_status=validation_status)

    except RuntimeError as error:
        logging.error(error)
        sys.exit(1)
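# Usage sketch for run_validate. The file name and mode value below are
# hypothetical; the config keys it relies on (`raw_path`, `raw_data`) are the
# ones read above, and a non-'cloud' mode is assumed to mean a local run:
#
#     run_validate(config='config/config.yaml', mode='', bucket='')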
def load():
    """Load the global dataset and an example."""
    print('Loading and calculating initial data.')
    global DATASET, INITIAL, PROBES, SETTINGS

    # Global dataset
    DATASET = load_data()

    # List of available probes (unique() already returns an array, so sorted() suffices)
    PROBES = sorted(DATASET['ads'].unique())

    # Example dataset
    INITIAL = select_data(DATASET,
                          None,
                          SETTINGS['t_abs'],
                          SETTINGS['t_tol'],
                          SETTINGS['g1'],
                          SETTINGS['g2'])
    print('Data load complete.')
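# Usage sketch for load(). SETTINGS must be populated before the call, since
# select_data reads its 't_abs', 't_tol', 'g1' and 'g2' keys; the values below
# are hypothetical placeholders:
#
#     SETTINGS = {'t_abs': 300, 't_tol': 5, 'g1': 'A', 'g2': 'B'}
#     load()
#     print(PROBES[:3])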
def main():
    train_x, train_y = helpers.load_data()
    viz.corr_heatmap(train_x)
def main():
    train_x, train_y = helpers.load_data()
    alg = ensemble.AdaBoostClassifier()
    analysis.select_feature(alg, train_x, train_y)
def main():
    train_x, train_y = helpers.load_data()
    alg = ensemble.AdaBoostClassifier()
    analysis.param_search(alg, train_x, train_y)
from typing import NamedTuple


def run_generate_features(
        config: str,
        mode: str,
        bucket: str,
        train_path: 'GCSPath',
        test_path: 'GCSPath',
        val_path: 'GCSPath'
) -> NamedTuple('output_paths', [('train', 'GCSPath'),
                                 ('test', 'GCSPath'),
                                 ('val', 'GCSPath')]):
    # Libraries ------------------------------------------------------------------------------------------------------
    import logging.config
    import os
    import sys

    import yaml

    from src.generate_features import FeaturesGenerator
    from src.helpers import load_data, save_data

    # Settings -------------------------------------------------------------------------------------------------------
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    try:
        logging.info('Initializing configuration...')
        with open(config, 'r') as stream:
            config = yaml.load(stream=stream, Loader=yaml.FullLoader)

        feats_generator = FeaturesGenerator(config=config)

        logging.info('Initiating features engineering process...')
        if mode == 'cloud':
            output_paths_gcs = []

            # Train ------------------------------------------------------------------------------------------------
            train_data = load_data(input_data=train_path, mode=mode)
            y_train = train_data[[config['target']]]
            x_train = train_data[train_data.columns.difference([config['target']])]

            # Fit both estimators (TF-IDF, then scaler) on the train split only
            feats_generator.fit(x=x_train, est_param=config['est_params'][0])
            x_train_tf_idf_df = feats_generator.transform(x=x_train, est_param=config['est_params'][0])
            feats_generator.fit(x=x_train_tf_idf_df, est_param=config['est_params'][1])
            x_train_scaled = feats_generator.transform(x=x_train_tf_idf_df, est_param=config['est_params'][1])

            train_path_gcs = save_data(x_df=x_train_scaled,
                                       y_df=y_train,
                                       path=config['featured_path'],
                                       out_data=config['featured_data'][0],
                                       mode=mode,
                                       bucket=bucket)
            output_paths_gcs.append(train_path_gcs)

            # Test - Val -------------------------------------------------------------------------------------------
            for input_path, out_filename in zip([test_path, val_path], config['featured_data'][1:]):
                data = load_data(input_data=input_path, mode=mode)
                y = data[[config['target']]]
                x = data[data.columns.difference([config['target']])]

                # Transform only: reuse the estimators fitted on the train split
                x_tf_idf_matrix = feats_generator.transform(x=x, est_param=config['est_params'][0])
                x_scaled = feats_generator.transform(x=x_tf_idf_matrix, est_param=config['est_params'][1])

                x_path_gcs = save_data(x_df=x_scaled,
                                       y_df=y,
                                       path=config['featured_path'],
                                       out_data=out_filename,
                                       mode=mode,
                                       bucket=bucket)
                output_paths_gcs.append(x_path_gcs)

            return tuple(output_paths_gcs)

        else:
            output_paths = []
            data_path = os.path.join(config['processed_path'], config['processed_data'][0])

            # Train ------------------------------------------------------------------------------------------------
            train_data = load_data(input_data=data_path, mode=mode)
            y_train = train_data[[config['target']]]
            x_train = train_data[train_data.columns.difference([config['target']])]

            feats_generator.fit(x=x_train, est_param=config['est_params'][0])
            x_train_tf_idf_df = feats_generator.transform(x=x_train, est_param=config['est_params'][0])
            feats_generator.fit(x=x_train_tf_idf_df, est_param=config['est_params'][1])
            x_train_scaled = feats_generator.transform(x=x_train_tf_idf_df, est_param=config['est_params'][1])

            train_path = save_data(x_df=x_train_scaled,
                                   y_df=y_train,
                                   path=config['featured_path'],
                                   out_data=config['featured_data'][0],
                                   mode=mode,
                                   bucket=bucket)
            output_paths.append(train_path)

            # Test - Val -------------------------------------------------------------------------------------------
            for input_path, out_filename in zip(config['processed_data'][1:], config['featured_data'][1:]):
                data_path = os.path.join(config['processed_path'], input_path)
                data = load_data(input_data=data_path, mode=mode)
                y = data[[config['target']]]
                x = data[data.columns.difference([config['target']])]

                x_tf_idf_matrix = feats_generator.transform(x=x, est_param=config['est_params'][0])
                x_scaled = feats_generator.transform(x=x_tf_idf_matrix, est_param=config['est_params'][1])

                x_path = save_data(x_df=x_scaled,
                                   y_df=y,
                                   path=config['featured_path'],
                                   out_data=out_filename,
                                   mode=mode,
                                   bucket=bucket)
                output_paths.append(x_path)

            return tuple(output_paths)

    except RuntimeError as error:
        logging.error(error)
        sys.exit(1)
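# Sketch of the config keys run_generate_features consumes, inferred from the
# lookups above (the example values are hypothetical):
#
#     target: label
#     est_params: [tf_idf, scaler]
#     processed_path: data/processed
#     processed_data: [train.csv, test.csv, val.csv]
#     featured_path: data/featured
#     featured_data: [train.csv, test.csv, val.csv]
#
# Note that both estimators are fit on the train split only and reused to
# transform test and val, which keeps test statistics from leaking into the
# features.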
def main():
    train_x, train_y = helpers.load_data()
    alg = tree.DecisionTreeClassifier(random_state=0)
    analysis.select_feature(alg, train_x, train_y)
from typing import NamedTuple


def run_train(
        config: str,
        mode: str,
        bucket: str,
        train_path: 'GCSPath',
        test_path: 'GCSPath',
        classifier='logit'
) -> NamedTuple('output_paths', [('train', 'GCSPath'),
                                 ('test', 'GCSPath'),
                                 ('model', 'GCSPath')]):
    # Libraries ------------------------------------------------------------------------------------------------------
    import logging.config
    import os
    import pprint
    import sys

    import yaml
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from xgboost import XGBClassifier
    from lightgbm import LGBMClassifier

    from src.train_model import Modeler
    from src.helpers import load_data, save_data

    # Settings -------------------------------------------------------------------------------------------------------
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    logging.info('Initializing configuration...')
    with open(config, 'r') as stream:
        config = yaml.load(stream=stream, Loader=yaml.FullLoader)

    logging.info('Initializing model...')
    if classifier == 'logit':
        modeler = Modeler(LogisticRegression,
                          params={'max_iter': config['models']['logit']['max_iter'],
                                  'random_state': config['random_state']})
    elif classifier == 'dtree':
        modeler = Modeler(DecisionTreeClassifier, params={'random_state': config['random_state']})
    elif classifier == 'rf':
        modeler = Modeler(RandomForestClassifier, params={'random_state': config['random_state']})
    elif classifier == 'gb':
        modeler = Modeler(GradientBoostingClassifier, params={'random_state': config['random_state']})
    elif classifier == 'xgb':
        modeler = Modeler(XGBClassifier,
                          params={'use_label_encoder': config['models']['xgb']['use_label_encoder'],
                                  'random_state': config['random_state']})
    elif classifier == 'lightgb':
        modeler = Modeler(LGBMClassifier, params={'random_state': config['random_state']})
    else:
        # Fail fast instead of hitting an unbound `modeler` below
        raise ValueError(f'Unknown classifier: {classifier}')
    logging.info(f'{classifier} model successfully initialized!')

    try:
        logging.info('Starting model training...')
        if not mode:
            train_path = os.path.join(config['featured_path'], config['featured_data'][0])
            test_path = os.path.join(config['featured_path'], config['featured_data'][1])

        # Train ----------------------------------------------------------------------------------------------------
        logging.info(f'Training {classifier} model...')
        train_data = load_data(input_data=train_path, mode=mode)
        y_train = train_data[config['target']]
        x_train = train_data[train_data.columns.difference([config['target']])]
        modeler.train(x_train, y_train)
        logging.info(f'{classifier} model successfully trained!')

        # Predict and Evaluate -------------------------------------------------------------------------------------
        logging.info(f'Testing {classifier} model...')
        test_data = load_data(input_data=test_path, mode=mode)
        y_test = test_data[config['target']]
        x_test = test_data[train_data.columns.difference([config['target']])]
        y_pred = modeler.predict(x_test)
        # Evaluate against the held-out test labels, not y_train
        metrics = modeler.evaluate(y_test, y_pred)
        # TODO: Store metrics. Figure out how to consume in the next stage for model validation.
        pprint.pprint(metrics)

    except RuntimeError as error:
        logging.error(error)
        sys.exit(1)
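# Usage sketch for run_train in local mode. With a falsy `mode`, the
# train/test paths passed in are overridden by the featured-data paths from
# the config, so placeholder arguments suffice (file name is hypothetical):
#
#     run_train(config='config/config.yaml', mode='', bucket='',
#               train_path='', test_path='', classifier='rf')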
def main():
    train_x, train_y = helpers.load_data()
    analysis.model_comparison(train_x, train_y)
from typing import NamedTuple


def run_prepare(
        config: str,
        mode: str,
        bucket: str,
        train_path: 'GCSPath',
        test_path: 'GCSPath',
        val_path: 'GCSPath'
) -> NamedTuple('output_paths', [('train', 'GCSPath'),
                                 ('test', 'GCSPath'),
                                 ('val', 'GCSPath')]):
    # Libraries ------------------------------------------------------------------------------------------------------
    import logging.config
    import os
    import sys

    import yaml

    from src.prepare import DataPreparer
    from src.helpers import load_data, save_data

    # Settings -------------------------------------------------------------------------------------------------------
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    try:
        logging.info('Initializing configuration...')
        with open(config, 'r') as stream:
            config = yaml.load(stream=stream, Loader=yaml.FullLoader)

        preparer = DataPreparer(config=config)
        input_paths = [train_path, test_path, val_path]

        if mode == 'cloud':
            output_paths_gcs = []
            for input_path, out_filename in zip(input_paths, config['processed_data']):
                data = load_data(input_data=input_path, mode=mode)
                processed_data = preparer.transform(data=data)
                # TODO: Add metadata in the pipeline
                print(processed_data.head(5))
                out_path_gcs = save_data(df=processed_data,
                                         path=config['processed_path'],
                                         out_data=out_filename,
                                         mode=mode,
                                         bucket=bucket)
                output_paths_gcs.append(out_path_gcs)
            return tuple(output_paths_gcs)
        else:
            output_paths = []
            for input_filename, out_filename in zip(config['interim_data'], config['processed_data']):
                data_path = os.path.join(config['interim_path'], input_filename)
                data = load_data(input_data=data_path, mode=mode)
                processed_data = preparer.transform(data=data)
                # TODO: Add metadata in the pipeline
                print(processed_data.head(5))
                out_path = save_data(df=processed_data,
                                     path=config['processed_path'],
                                     out_data=out_filename,
                                     mode=mode,
                                     bucket=bucket)
                output_paths.append(out_path)
            return tuple(output_paths)

    except RuntimeError as error:
        logging.error(error)
        sys.exit(1)
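# Usage sketch for run_prepare in local mode, where the train/test/val path
# arguments are ignored and inputs are resolved from the config's
# `interim_path`/`interim_data` keys (file name is hypothetical):
#
#     paths = run_prepare(config='config/config.yaml', mode='', bucket='',
#                         train_path='', test_path='', val_path='')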