import json
import sys
from optparse import OptionParser
from os import listdir
from os.path import abspath, isfile, join

from tulsa.config import create_engine_from_config_file


def main():
    """
    Use command-line arguments parsed with optparse's OptionParser
    to upload text files to a database table.
    """
    parser = OptionParser()
    parser.add_option('-f',
                      '--file',
                      dest='filename_to_upload',
                      help='just upload this one file')
    parser.add_option('-d',
                      '--dir',
                      dest='dir_to_upload',
                      help='upload everything in this dir')
    parser.add_option('-c',
                      '--config',
                      dest='config_file',
                      help='where to get database credential file')
    parser.add_option('-n',
                      '--names',
                      dest='file_table_names_file_path',
                      help='where the file_table_names json file is stored')

    (options, args) = parser.parse_args()

    filenames_to_upload = []
    if options.filename_to_upload:
        absfile = abspath(options.filename_to_upload)
        filenames_to_upload.append(absfile)

    elif options.dir_to_upload:
        for f in listdir(options.dir_to_upload):
            absfile = join(options.dir_to_upload, f)
            if isfile(absfile):
                filenames_to_upload.append(absfile)
    else:
        print('no files specified to upload...quitting')
        sys.exit(0)

    # create engine to connect to postgres
    print(options.config_file)
    engine = create_engine_from_config_file(options.config_file)

    with open(options.file_table_names_file_path, 'r') as f:
        global file_table_names
        file_table_names = json.load(f)

    for filename in filenames_to_upload:
        df_dict = read_file_to_df(filename)
        upload_df_to_db(engine, df_dict)
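

# The helpers called above are not shown in this excerpt. Below is a minimal,
# hypothetical sketch of what read_file_to_df and upload_df_to_db could look
# like -- not the project's actual implementation. It assumes the input files
# are delimited text, that file_table_names maps a file's base name to its
# destination table, and that tables live in a raw_data schema.
import pandas as pd
from os.path import basename


def read_file_to_df(filename):
    # sniff the delimiter and load the whole file into a DataFrame
    df = pd.read_csv(filename, sep=None, engine='python')
    # look up the destination table for this file (assumed key: base name)
    table_name = file_table_names[basename(filename)]
    return {table_name: df}


def upload_df_to_db(engine, df_dict):
    # write each DataFrame to its table, replacing any existing contents
    for table_name, df in df_dict.items():
        df.to_sql(table_name, engine, schema='raw_data',
                  if_exists='replace', index=False)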


if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-c',
                      '--config',
                      dest='config_file',
                      help='where to get database credential file')
    parser.add_option('-v',
                      '--view_name',
                      dest='view_name',
                      help='what the final view name should be')
    parser.add_option('-e',
                      '--date_cols_from_title',
                      action="store_true",
                      dest='time_from_title',
                      help='''turn this flag on if you want date/season columns
                              extracted from the table names''')
    parser.add_option('-o',
                      '--corrections',
                      dest='corrections',
                      help='''point to a json dict of column names you want to
                              change''')

    (options, args) = parser.parse_args()

    if not options.config_file:
        print('no database credentials json file specified; use --config /path')
        sys.exit(1)
    if not options.view_name:
        print('no destination view name specified; use --view_name name')
        sys.exit(1)

    # build the engine only after a credentials file has been confirmed
    engine = create_engine_from_config_file(options.config_file)
    main(engine, options)
"""
Create clean iRead tables and view
"""
import pandas as pd

from tulsa.config import create_engine_from_config_file

config_file = '/path/to/configs/dbcreds.json'
engine = create_engine_from_config_file(config_file)

get_table_info_query = """
                       SELECT *
                       FROM pg_catalog.pg_tables
                       WHERE schemaname = 'raw_data'
                           AND tablename ~ 'iread.*';
                       """
tables = engine.execute(get_table_info_query)
tables = tables.fetchall()
table_names = [table['tablename'] for table in tables]


# look at column names
def column_names(table):
    # note: .keys() gets the column names
    return engine.execute(
        '''SELECT * FROM raw_data."{table}"'''.format(table=table)).keys()
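
# usage example (illustrative): column_names(table_names[0]) returns the
# column names of the first matching raw_data table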


# create a dictionary with column names for each table
columns_dict_old = {
    tablename: column_names(tablename)
    for tablename in table_names
}


def main(dbcreds, params_file, log_level, log_location, regenerate, run_name,
         state_location, report):
    """Run all the models!!!!

    You will need to specify the database credentials for DBCREDS and a
    parameter file as PARAMS_FILE.

    The format of DBCREDS is a json with the keys 'host', 'port', 'database',
    'user', and 'password' pointing to your postgres database.

    The format of your PARAMS_FILE is also a json which should look like
    the following::

        {
            "labels_to_make": ["eventualnot186_with2nd"],
            "feature_groups_to_make": ["reenroll"],
            "models_to_make": {
                "LR": { "penalty": ["l1"], "C": [0.01]},
                "RF": {"n_estimators": [1000], "max_depth": [50],
                       "max_features": ["log2"], "min_samples_split": [10]}
            },
            "metrics_to_make": [],
            "split_strategy": ["predict_new"],
            "scale": true
        }

    The potential values for each key are as follows:
        * labels_to_make: ['eventualnot186', 'eventualnot186_with2nd']
                                  see features.py for all options
        * feature_groups_to_make: ['all', 'map', 'actionable', ...]
                                  see feature_groups.py for all options
        * models_to_make: ['RF', 'LR', 'SVM', ...]
                                  see evaluate.py for all options
        * metrics_to_make: ['precision_at_{n}', 'recall_at_{n}', 'auc',
                             'prec_rec_n_graph', 'pred_probs_hist']
                                  see metrics.py for more information
        * split_strategy: ['cohort', 'predict_new']
                                  see splits.py for more information


    Note that each "model to make" takes a dictionary that is used by
    scikit-learn to make a ParameterGrid object. So if you only want to
    run a single model, all of the lists that appear should be of length one.
    """
    # implement logging
    numeric_level = getattr(logging, log_level.upper())

    logging_params = {
        'level': numeric_level,
        'format': '%(asctime)s [%(levelname)s]: %(message)s'
    }
    if log_location:
        logging_params['filename'] = log_location
    else:
        logging_params['stream'] = sys.stderr

    logging.basicConfig(**logging_params)
    logging.info('Process started')
    logging.info('===============')
    start_time = time.time()

    params = json.load(params_file)
    labels_to_make = params['labels_to_make']
    feature_groups_to_make = params['feature_groups_to_make']
    models_to_make = params['models_to_make']
    metrics_to_make = params['metrics_to_make']
    split_strategies = params['split_strategy']
    try:
        scale = params['scale']
    except KeyError:
        scale = None
    # create engine
    engine = create_engine_from_config_file(dbcreds)

    # feature group mapping
    logging.info('feature groups requested are: %s', feature_groups_to_make)
    features_to_make = set()
    for group in feature_groups_to_make:
        features_to_make.update(feature_groups[group])
    features_to_make = list(features_to_make)
    logging.info('individual features to make are: %s', features_to_make)

    # instantiate a model
    for label_name in labels_to_make:
        tm = model.TulsaModel(engine, features_to_make, label_name,
                              models_to_make, metrics_to_make,
                              split_strategies, regenerate, run_name,
                              state_location, scale)

        logging.info('Initializing Tulsa model')

        tm.generate_labels()
        logging.info('Labels generated')

        tm.generate_features()
        logging.info('Features generated (%s features)',
                     len(tm.label_feature_df.columns))

        tm.impute_missing_values()
        logging.info('imputed missing values')

        tm.warn_nans()
        logging.info('I tried to warn you about NaNs')

        tm.make_splits()
        logging.info('Splits made')

        tm.fit_models_and_metrics()
        logging.info('Model evaluated')

        tm.touch_run_number()
        logging.info('Touching run number')

        if report:
            logging.info('Reporting type is: %s', report)
            tm.report_predictions(report)
            logging.info('Reporting predictions to CSV')

    end_time = time.time()
    time_taken = end_time - start_time
    logging.info('Time taken: %s seconds', time_taken)
    logging.info('Done')