Example #1
0
def main():
    """Load the model configuration, build the train/test datasets and
    optionally dump them to CSV.

    Relies on module-level state: ``args`` (parsed CLI options),
    ``cfg_main`` (main config dict), ``logger``, and ``path_to_dumps``.

    Raises:
        ExperimentExists: if logging is enabled and a MongoDB record for
            the configured experiment name already exists.
    """
    config_file = args.path_to_config_file
    config, config_raw = configure_model(config_file)

    # If logging is enabled, check that there are no records for
    # the selected experiment
    if not args.notlog:
        logger_uri = cfg_main['logger']['uri']
        logger_db = cfg_main['logger']['db']
        logger_collection = cfg_main['logger']['collection']
        mongo_logger = Logger(logger_uri, logger_db, logger_collection)
        if mongo_logger.experiment_exists(config['experiment_name']):
            raise ExperimentExists(config['experiment_name'])

    # datasets
    logger.info('Loading datasets...')
    train, test = make_datasets(config)
    logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
        test.x.shape))

    # Dump datasets if dump option was selected
    if args.dump:
        logger.info('Dumping train and tests sets')
        datasets = [(train, 'train'),
                    (test, 'test')]
        for data, name in datasets:
            if data is not None:
                filename = '{}_{}.csv'.format(config["experiment_name"], name)
                try:
                    # Try to convert to dataframe, it will fail if data is empty
                    df = data.to_df()
                    # BUGFIX: write inside the try, not in a finally block.
                    # The original ran df.to_csv() in finally, so when
                    # to_df() raised, df was either undefined (NameError)
                    # or stale from a previous iteration — silently dumping
                    # the wrong data under this iteration's filename.
                    df.to_csv(os.path.join(path_to_dumps, filename))
                except Exception as e:
                    # Python 3 compatible syntax ("except X as e"), replacing
                    # the Python-2-only "except X, e" form.
                    logger.info('Error saving {} as csv: {}'.format(filename, e))
Example #2
0
def main():
    """Load the model configuration, build the train/test datasets,
    validate their NA proportions, and optionally dump them to CSV.

    Relies on module-level state: ``args`` (parsed CLI options),
    ``cfg_main`` (main config dict), ``logger``, ``path_to_dumps`` and
    ``NAS_PROPORTION_THRESHOLD``.

    Raises:
        ExperimentExists: if logging is enabled and a MongoDB record for
            the configured experiment name already exists.
    """
    config_file = args.path_to_config_file
    config, config_raw = configure_model(config_file)

    # If logging is enabled, check that there are no records for
    # the selected experiment
    if not args.notlog:
        logger_uri = cfg_main['logger']['uri']
        logger_db = cfg_main['logger']['db']
        logger_collection = cfg_main['logger']['collection']
        mongo_logger = Logger(logger_uri, logger_db, logger_collection)
        if mongo_logger.experiment_exists(config['experiment_name']):
            raise ExperimentExists(config['experiment_name'])

    # datasets
    logger.info('Loading datasets...')
    train, test = make_datasets(config)
    logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
        test.x.shape))

    # Check percentage of NAs for every feature,
    # raise an error if at least one feature has more NAs than the
    # acceptable threshold
    logger.info('Checking training set NAs...')
    prop = check_nas_threshold(train.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)
    logger.info('Checking testing set NAs...')
    prop = check_nas_threshold(test.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)

    # Dump datasets if dump option was selected
    if args.dump:
        logger.info('Dumping train and tests sets')
        datasets = [(train, 'train'),
                    (test, 'test')]
        for data, name in datasets:
            if data is not None:
                filename = '{}_{}.csv'.format(config["experiment_name"], name)
                try:
                    # Try to convert to dataframe, it will fail if data is empty
                    df = data.to_df()
                    # BUGFIX: write inside the try, not in a finally block.
                    # The original ran df.to_csv() in finally, so when
                    # to_df() raised, df was either undefined (NameError)
                    # or stale from a previous iteration — silently dumping
                    # the wrong data under this iteration's filename.
                    df.to_csv(os.path.join(path_to_dumps, filename))
                except Exception as e:
                    # Python 3 compatible syntax ("except X as e"), replacing
                    # the Python-2-only "except X, e" form.
                    logger.info('Error saving {} as csv: {}'.format(filename, e))
def main():
    """Configure logging handlers, load the model configuration, build
    the train/test (and optional prediction) datasets, validate their NA
    proportions, and optionally dump them to CSV.

    Relies on module-level state: ``args`` (parsed CLI options),
    ``cfg_main`` (main config dict), ``logger``, ``path_to_dumps`` and
    ``NAS_PROPORTION_THRESHOLD``.

    Raises:
        ValueError: if --predicttop is combined with --notlog (top
            predictions can only be saved when logging is enabled).
        ExperimentExists: if logging is enabled, the experiment already
            has a MongoDB record, and --overwritelog was not passed.
    """
    # Optional file handlers for warning- and debug-level logs
    if args.warninglog:
        myhandler = logging.FileHandler(os.path.abspath(args.warninglog))
        myhandler.setLevel('WARNING')
        logger.addHandler(myhandler)
    if args.debuglog:
        myhandler2 = logging.FileHandler(os.path.abspath(args.debuglog))
        myhandler2.setLevel('DEBUG')
        logger.addHandler(myhandler2)

    config_file = args.path_to_config_file
    config, config_raw = configure_model(config_file)

    if args.predicttop and args.notlog:
        raise ValueError("You cannot save the top X predictions "
                "on all parcels without also logging.")

    # If logging is enabled, check that there are no records for
    # the selected experiment
    if not args.notlog:

        logger_uri = cfg_main['logger']['uri']
        logger_db = cfg_main['logger']['db']
        logger_collection = cfg_main['logger']['collection']
        mongo_logger = Logger(logger_uri, logger_db, logger_collection)

        if mongo_logger.experiment_exists(config['experiment_name']):

            # if the user hasn't selected to overwrite the record, throw error
            if not args.overwritelog:
                raise ExperimentExists(config['experiment_name'])
            else:
                mongo_logger.delete_experiment(config['experiment_name'])

    # datasets
    logger.info('Loading datasets...')
    if not args.predicttop:
        train, test = make_datasets(config, predictset=False)
        logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
            test.x.shape))
    else:
        train, test, preds = make_datasets(config, predictset=True)
        logger.debug('Train x shape: {} Test x shape: {} Prediction x shape {}'\
                .format(train.x.shape, test.x.shape, preds.x.shape))

    # Check percentage of NAs for every feature,
    # raise an error if at least one feature has more NAs than the
    # acceptable threshold
    logger.info('Checking training set NAs...')
    prop = check_nas_threshold(train.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)
    logger.info('Checking testing set NAs...')
    prop = check_nas_threshold(test.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)

    # Dump datasets if dump option was selected
    if args.dump:
        logger.info('Dumping train and tests sets')
        datasets = [(train, 'train'),
                    (test, 'test')]
        if args.predicttop:
            logger.info('Dumping prediction sets')
            datasets.append((preds, 'prediction'))
        for data, name in datasets:
            if data is not None:
                filename = '{}_{}.csv'.format(config["experiment_name"], name)
                try:
                    # Try to convert to dataframe, it will fail if data is empty
                    df = data.to_df()
                    df.to_csv(os.path.join(path_to_dumps, filename))
                except Exception as e:
                    # Python 3 compatible syntax ("except X as e"), replacing
                    # the Python-2-only "except X, e" form.
                    logger.info('Error saving {} as csv: {}'.format(filename, e))
            else:
                logger.info('{} is None, skipping dump...'.format(name))