def run_validate(config: str, mode: str, bucket: str):

    # Libraries --------------------------------------------------------------------------------------------------------
    import logging
    import yaml
    from src.validate import DataValidator
    from src.helpers import load_data
    import sys

    # Settings ---------------------------------------------------------------------------------------------------------
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    try:
        logging.info('Initializing configuration...')
        with open(config, 'r') as stream:
            config = yaml.load(stream=stream, Loader=yaml.FullLoader)
        validator = DataValidator(config=config)
        df = load_data(input_path=config['raw_path'],
                       input_data=config['raw_data'],
                       mode=mode,
                       bucket=bucket)
        validation_status = validator.validate(df=df)
        validator.check_validity(validation_status=validation_status)
    except RuntimeError as error:
        logging.error(error)
        sys.exit(1)
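A minimal sketch of invoking this component locally, assuming the convention visible in the other components (a non-'cloud' mode selects local I/O). The file name config.yaml and the YAML values in the comment are assumptions; only the raw_path and raw_data keys are actually read by the code above.

# Hypothetical local invocation; the YAML behind 'config.yaml' would
# need at least the two keys this component reads, e.g.:
#   raw_path: data/raw
#   raw_data: reviews.csv
run_validate(config='config.yaml', mode='', bucket='')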
Example #2
def load():
    """Load the global dataset and an example."""
    print('Loading and calculating initial data.')
    global DATASET, INITIAL, PROBES, SETTINGS
    # Global dataset
    DATASET = load_data()
    # List of available probes
    PROBES = sorted(DATASET['ads'].unique())
    # Example dataset
    INITIAL = select_data(
        DATASET, None,
        SETTINGS['t_abs'], SETTINGS['t_tol'],
        SETTINGS['g1'], SETTINGS['g2'])
    print('Data load complete.')
Example #3
def main():

    train_x, train_y = helpers.load_data()
    viz.corr_heatmap(train_x)
Example #4
def main():
    train_x, train_y = helpers.load_data()

    alg = ensemble.AdaBoostClassifier()
    analysis.select_feature(alg, train_x, train_y)
Example #5
def main():
    train_x, train_y = helpers.load_data()

    alg = ensemble.AdaBoostClassifier()
    analysis.param_search(alg, train_x, train_y)
Example #6
def run_generate_features(
    config: str, mode: str, bucket: str, train_path: 'GCSPath',
    test_path: 'GCSPath', val_path: 'GCSPath'
) -> NamedTuple('output_paths', [('train', 'GCSPath'), ('test', 'GCSPath'),
                                 ('val', 'GCSPath')]):
    # Libraries --------------------------------------------------------------------------------------------------------
    import logging
    import yaml
    import sys
    import os
    from src.generate_features import FeaturesGenerator
    from src.helpers import load_data, save_data

    # Settings ---------------------------------------------------------------------------------------------------------
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    try:
        logging.info('Initializing configuration...')
        with open(config, 'r') as stream:
            config = yaml.load(stream=stream, Loader=yaml.FullLoader)
        feats_generator = FeaturesGenerator(config=config)

        logging.info('Initiating features engineering process...')
        if mode == 'cloud':
            output_paths_gcs = []

            # Train ----------------------------------------------------------------------------------------------------
            train_data = load_data(input_data=train_path, mode=mode)
            y_train = train_data[[config['target']]]
            x_train = train_data[train_data.columns.difference(
                [config['target']])]

            feats_generator.fit(x=x_train, est_param=config['est_params'][0])
            x_train_tf_idf_df = feats_generator.transform(
                x=x_train, est_param=config['est_params'][0])

            feats_generator.fit(x=x_train_tf_idf_df,
                                est_param=config['est_params'][1])
            x_train_scaled = feats_generator.transform(
                x=x_train_tf_idf_df, est_param=config['est_params'][1])

            train_path_gcs = save_data(x_df=x_train_scaled,
                                       y_df=y_train,
                                       path=config['featured_path'],
                                       out_data=config['featured_data'][0],
                                       mode=mode,
                                       bucket=bucket)
            output_paths_gcs.append(train_path_gcs)

            # Test - Val -----------------------------------------------------------------------------------------------
            for input_path, out_filename in zip([test_path, val_path],
                                                config['featured_data'][1:]):
                data = load_data(input_data=input_path, mode=mode)
                y = data[[config['target']]]
                x = data[data.columns.difference([config['target']])]

                x_tf_idf_matrix = feats_generator.transform(
                    x=x, est_param=config['est_params'][0])
                x_scaled = feats_generator.transform(
                    x=x_tf_idf_matrix, est_param=config['est_params'][1])

                x_path_gcs = save_data(x_df=x_scaled,
                                       y_df=y,
                                       path=config['featured_path'],
                                       out_data=out_filename,
                                       mode=mode,
                                       bucket=bucket)

                output_paths_gcs.append(x_path_gcs)

            return tuple(output_paths_gcs)
        else:
            output_paths = []
            data_path = os.path.join(config['processed_path'],
                                     config['processed_data'][0])

            # Train ----------------------------------------------------------------------------------------------------
            train_data = load_data(input_data=data_path, mode=mode)
            y_train = train_data[[config['target']]]
            x_train = train_data[train_data.columns.difference(
                [config['target']])]

            feats_generator.fit(x=x_train, est_param=config['est_params'][0])
            x_train_tf_idf_df = feats_generator.transform(
                x=x_train, est_param=config['est_params'][0])

            feats_generator.fit(x=x_train_tf_idf_df,
                                est_param=config['est_params'][1])
            x_train_scaled = feats_generator.transform(
                x=x_train_tf_idf_df, est_param=config['est_params'][1])

            train_path = save_data(x_df=x_train_scaled,
                                   y_df=y_train,
                                   path=config['featured_path'],
                                   out_data=config['featured_data'][0],
                                   mode=mode,
                                   bucket=bucket)
            output_paths.append(train_path)

            # Test - Val -----------------------------------------------------------------------------------------------
            for input_path, out_filename in zip(config['processed_data'][1:],
                                                config['featured_data'][1:]):
                data_path = os.path.join(config['processed_path'], input_path)
                data = load_data(input_data=data_path, mode=mode)
                y = data[[config['target']]]
                x = data[data.columns.difference([config['target']])]

                x_tf_idf_matrix = feats_generator.transform(
                    x=x, est_param=config['est_params'][0])
                x_scaled = feats_generator.transform(
                    x=x_tf_idf_matrix, est_param=config['est_params'][1])

                x_path = save_data(x_df=x_scaled,
                                   y_df=y,
                                   path=config['featured_path'],
                                   out_data=out_filename,
                                   mode=mode,
                                   bucket=bucket)

                output_paths.append(x_path)
            return tuple(output_paths)

    except RuntimeError as error:
        logging.error(error)
        sys.exit(1)
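For orientation, a sketch of the configuration shape this component expects, written as the equivalent Python dict. Every value is an illustrative placeholder; only the key names come from the code above.

# Illustrative shape of the loaded YAML config; all values are placeholders.
config = {
    'target': 'label',                       # label column name
    'est_params': ['tf_idf', 'scaler'],      # two estimator configs, applied in order
    'processed_path': 'data/processed',      # local input dir for the non-cloud branch
    'processed_data': ['train.csv', 'test.csv', 'val.csv'],
    'featured_path': 'data/featured',        # output dir or GCS prefix
    'featured_data': ['train.csv', 'test.csv', 'val.csv'],
}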
Example #7
def main():
    train_x, train_y = helpers.load_data()

    alg = tree.DecisionTreeClassifier(random_state=0)
    analysis.select_feature(alg, train_x, train_y)
Example #8
def run_train(
    config: str,
    mode: str,
    bucket: str,
    train_path: 'GCSPath',
    test_path: 'GCSPath',
    classifier='logit'
) -> NamedTuple('output_paths', [('train', 'GCSPath'), ('test', 'GCSPath'),
                                 ('model', 'GCSPath')]):
    # Libraries --------------------------------------------------------------------------------------------------------

    import logging
    import yaml
    import sys
    import os
    import pprint
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from xgboost import XGBClassifier
    from lightgbm import LGBMClassifier
    from src.train_model import Modeler
    from src.helpers import load_data, save_data

    # Settings ---------------------------------------------------------------------------------------------------------
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    logging.info('Initializing configuration...')
    with open(config, 'r') as stream:
        config = yaml.load(stream=stream, Loader=yaml.FullLoader)

    logging.info("Initializing model...")
    if classifier == 'logit':
        modeler = Modeler(LogisticRegression,
                          params={
                              'max_iter':
                              config['models']['logit']['max_iter'],
                              'random_state': config['random_state']
                          })
    elif classifier == 'dtree':
        modeler = Modeler(DecisionTreeClassifier,
                          params={'random_state': config['random_state']})
    elif classifier == 'rf':
        modeler = Modeler(RandomForestClassifier,
                          params={'random_state': config['random_state']})
    elif classifier == 'gb':
        modeler = Modeler(GradientBoostingClassifier,
                          params={'random_state': config['random_state']})
    elif classifier == 'xgb':
        modeler = Modeler(XGBClassifier,
                          params={
                              'use_label_encoder':
                              config['models']['xgb']['use_label_encoder'],
                              'random_state':
                              config['random_state']
                          })
    elif classifier == 'lightgb':
        modeler = Modeler(LGBMClassifier,
                          params={'random_state': config['random_state']})
    else:
        raise ValueError(f"Unsupported classifier: '{classifier}'")
    logging.info(f"{classifier} model successfully initialized!")

    try:
        logging.info('Starting model training...')
        if not mode:
            train_path = os.path.join(config['featured_path'],
                                      config['featured_data'][0])
            test_path = os.path.join(config['featured_path'],
                                     config['featured_data'][1])

        # Train --------------------------------------------------------------------------------------------------------
        logging.info(f'Training {classifier} model...')
        train_data = load_data(input_data=train_path, mode=mode)
        y_train = train_data[config['target']]
        x_train = train_data[train_data.columns.difference([config['target']])]
        modeler.train(x_train, y_train)
        logging.info(f'{classifier} model successfully trained!')

        logging.info(f'Testing {classifier} model...')
        # Predict and Evaluate -----------------------------------------------------------------------------------------
        test_data = load_data(input_data=test_path, mode=mode)
        y_test = test_data[config['target']]
        x_test = test_data[train_data.columns.difference([config['target']])]
        y_pred = modeler.predict(x_test)
        metrics = modeler.evaluate(y_test, y_pred)
        # TODO: Store metrics. Figure out how to consume in the next stage for model validation.
        pprint.pprint(metrics)

    except RuntimeError as error:
        logging.error(error)
        sys.exit(1)
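A hedged sketch of a local call. It leans on the `if not mode:` branch above, which resolves the featured train/test paths from the config, so the path arguments can stay empty; config.yaml and the classifier choice are assumptions.

# Hypothetical local run: an empty mode makes run_train read
# config['featured_path'] / config['featured_data'] instead of
# the train_path/test_path arguments.
run_train(config='config.yaml', mode='', bucket='',
          train_path='', test_path='', classifier='rf')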
Example #9
def main():
    train_x, train_y = helpers.load_data()

    analysis.model_comparison(train_x, train_y)
Example #10
def run_prepare(
    config: str, mode: str, bucket: str, train_path: 'GCSPath',
    test_path: 'GCSPath', val_path: 'GCSPath'
) -> NamedTuple('output_paths', [('train', 'GCSPath'), ('test', 'GCSPath'),
                                 ('val', 'GCSPath')]):
    # Libraries --------------------------------------------------------------------------------------------------------
    import logging
    import yaml
    import sys
    import os
    from src.prepare import DataPreparer
    from src.helpers import load_data, save_data

    # Settings ---------------------------------------------------------------------------------------------------------
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)
    try:
        logging.info('Initializing configuration...')
        with open(config, 'r') as stream:
            config = yaml.load(stream=stream, Loader=yaml.FullLoader)
        preparer = DataPreparer(config=config)
        input_paths = [train_path, test_path, val_path]

        if mode == 'cloud':
            output_paths_gcs = []
            for input_path, out_filename in zip(input_paths,
                                                config['processed_data']):
                data = load_data(input_data=input_path, mode=mode)
                processed_data = preparer.transform(data=data)
                # TODO: Add metadata in the pipeline
                print(processed_data.head(5))
                out_path_gcs = save_data(df=processed_data,
                                         path=config['processed_path'],
                                         out_data=out_filename,
                                         mode=mode,
                                         bucket=bucket)
                output_paths_gcs.append(out_path_gcs)
            return tuple(output_paths_gcs)

        else:
            output_paths = []
            for input_filename, out_filename in zip(config['interim_data'],
                                                    config['processed_data']):
                data_path = os.path.join(config['interim_path'],
                                         input_filename)
                data = load_data(input_data=data_path, mode=mode)
                processed_data = preparer.transform(data=data)
                # TODO: Add metadata in the pipeline
                print(processed_data.head(5))
                out_path = save_data(df=processed_data,
                                     path=config['processed_path'],
                                     out_data=out_filename,
                                     mode=mode,
                                     bucket=bucket)
                output_paths.append(out_path)
            return tuple(output_paths)

    except RuntimeError as error:
        logging.error(error)
        sys.exit(1)
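Taken together, the components above chain into a local end-to-end run. A sketch under the same assumptions as before (an empty mode selects each component's local branch, and all paths come from config.yaml):

# Hypothetical local pipeline; the *_path arguments are ignored in
# non-cloud mode, so they are left empty.
run_prepare(config='config.yaml', mode='', bucket='',
            train_path='', test_path='', val_path='')
run_generate_features(config='config.yaml', mode='', bucket='',
                      train_path='', test_path='', val_path='')
run_train(config='config.yaml', mode='', bucket='',
          train_path='', test_path='', classifier='logit')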