Example #1
def learn(train, dev, test, args, sargs_str):

    # Read strategy-specific args
    sargs = util.parse(parser, sargs_str.split())

    # Clean out the sandbox
    util.mkdir(sargs['sandbox'], clean=True)

    # Feature columns describe how to use the input
    my_feature_columns = []
    for key in train[0].keys():
        my_feature_columns.append(tf.feature_column.numeric_column(key=key))

    # Calculate epoch length
    steps_per_epoch = math.ceil(len(train[0]) / sargs['batch'])
    total_steps = sargs['epochs'] * steps_per_epoch

    # Train a classifier
    extra_args = {
        'classes': CLASSES,
        'columns': my_feature_columns,
        'steps_per_epoch': steps_per_epoch,
        'learning_rate': sargs['lr'],
        'model_dir': sargs['sandbox'],
        'warm_start_dir': None
    }
    merged_args = {**args, **sargs, **extra_args}

    # Create a new classifier instance
    classifier = cl.create_classifier(merged_args)

    # Train the model for the requested number of epochs
    classifier.train(
        input_fn=lambda: pandas2tf.train_input_fn(train, sargs['batch']),
        steps=total_steps)

    # Evaluate the model
    train_result = classifier.evaluate(
        input_fn=lambda: pandas2tf.eval_input_fn(train, sargs['batch']))
    dev_result = classifier.evaluate(
        input_fn=lambda: pandas2tf.eval_input_fn(dev, sargs['batch']))
    test_result = classifier.evaluate(
        input_fn=lambda: pandas2tf.eval_input_fn(test, sargs['batch']))
    return train_result, dev_result, test_result, classifier
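The pandas2tf helpers referenced above are not shown on this page. A minimal sketch of what train_input_fn and eval_input_fn could look like on the tf.estimator API, assuming each split is a (features_df, labels) pair as the train[0] indexing suggests:

import tensorflow as tf

def train_input_fn(data, batch_size):
    # Convert a (features, labels) pandas pair into a shuffled, repeating Dataset
    features, labels = data
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    return dataset.shuffle(len(features)).repeat().batch(batch_size)

def eval_input_fn(data, batch_size):
    # Same conversion, but deterministic and single-pass for evaluation
    features, labels = data
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    return dataset.batch(batch_size)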
Example #2
def learn(train, dev, test, args, sargs_str):
    sargs = util.parse(parser, sargs_str.split())
    return util.sklearn_wrapper(train, dev, test, LogisticRegression(**sargs))
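util.sklearn_wrapper is not shown either. Judging by the four-value return in Example #1 and the 'fmes' key the driver in Example #4 reads, a plausible sketch is:

from sklearn.metrics import f1_score

def sklearn_wrapper(train, dev, test, model):
    # Fit on the training split only
    model.fit(train[0], train[1])
    # Score the fitted model on all three splits with the same metric dict
    stats = []
    for features, labels in (train, dev, test):
        stats.append({'fmes': f1_score(labels, model.predict(features))})
    return stats[0], stats[1], stats[2], model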
Example #3
def predict(classifier, test, args, sargs_str, threshold=None):
    sargs = util.parse(parser, sargs_str.split())
    # Raw model outputs; thresholding assumes these are scores, not hard labels
    preds = classifier.predict(test[0])
    if threshold is not None:
        # Binarize the scores at the given cut-off
        preds = [1 if x >= threshold else 0 for x in preds]
    return preds
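Thresholding only makes sense when classifier.predict returns scores rather than hard labels. With a scikit-learn classifier such as those in Examples #2 and #6-#8, the equivalent would threshold predict_proba instead; a sketch, with the helper name being ours:

def predict_with_threshold(classifier, features, threshold=0.5):
    # Column 1 of predict_proba holds the positive-class probability
    scores = classifier.predict_proba(features)[:, 1]
    return [1 if s >= threshold else 0 for s in scores]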
Example #4
        logger.info('%s[%s] results:', strategy, sargs)
        logger.info('train: %s', train_stats)
        logger.info('dev:   %s', dev_stats)
        logger.info('test:  %s', test_stats)

        if dwf_logging is not None:
            result = dwf_logging.pack_results(train_stats, dev_stats, test_stats)
            dwf_logging.report_result(result, client_info['client_id'])

        table.append([
            args['resample'],
            args['resample_amount'],
            args['preprocess'],
            strategy,
            sargs,
            train_stats['fmes'],
            dev_stats['fmes'],
            test_stats['fmes'],
            train_stats,
            dev_stats,
            test_stats,
        ])

    with open(os.path.join(args['output'], 'dbh.csv'), 'a') as f:
        for line in table:
            f.write(';'.join([str(item) for item in line]) + '\n')

if __name__ == '__main__':
    main(util.parse(parser, sys.argv[1:]))
Example #5
parser = argparse.ArgumentParser()
parser.add_argument('--csv', required=True, help='csv to read the data from')
parser.add_argument('--label',
                    required=True,
                    help='name of the label to predict')
parser.add_argument('--seed',
                    default=1337,
                    type=int,
                    help='random seed for repeatability')
parser.add_argument('--amount',
                    type=float,
                    default=100,
                    help='percentage of the input to keep')
parser.add_argument('--output', required=True, help='csv to save to')

args = util.parse(parser, sys.argv[1:])

# read full dataset
data = pd.read_csv(args['csv'], header=0)
print('Before resampling:\n%s' % data[args['label']].value_counts())

# split to classes
bins = []
bins.append(data[data[args['label']] == 0])
bins.append(data[data[args['label']] != 0])

# resample ALL classes
for i, item in enumerate(bins):
    current = len(item)
    target = int(current * (args['amount'] / 100))
    bins[i] = resamp(bins[i],
                     replace=target > current,   # oversample only when the class must grow
                     n_samples=target,
                     random_state=args['seed'])  # arguments assumed; the source snippet is truncated here
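Assuming resamp aliases sklearn.utils.resample, the remaining steps would plausibly recombine the resampled classes, reshuffle, and write to args['output']; a sketch of that continuation:

# Recombine the resampled classes and shuffle rows (assumed continuation)
data = pd.concat(bins).sample(frac=1, random_state=args['seed'])
print('After resampling:\n%s' % data[args['label']].value_counts())
data.to_csv(args['output'], index=False)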
Example #6
def learn(train, dev, test, args, sargs_str):
    sargs = util.parse(parser, sargs_str.split())
    return util.sklearn_wrapper(train, dev, test, SVC(**sargs))
Example #7
def learn(train, dev, test, args, sargs_str):
    sargs = util.parse(parser, sargs_str.split())
    return util.sklearn_wrapper(train, dev, test, DecisionTreeClassifier(**sargs))
Example #8
def learn(train, dev, test, args, sargs_str):
    sargs = util.parse(parser, sargs_str.split())
    return util.sklearn_wrapper(train, dev, test, KNeighborsClassifier(**sargs))
Example #9
def learn(train, dev, test, args, sargs_str):

    # Read strategy-specific args
    sargs = util.parse(parser, sargs_str.split())
   
    # Clean out the sandbox
    util.mkdir(sargs['sandbox'], clean=True)

    # Feature columns describe how to use the input
    my_feature_columns = []
    for key in train[0].keys():
        my_feature_columns.append(tf.feature_column.numeric_column(key=key))

    # Calculate epoch length
    steps_per_epoch = math.ceil(len(train[0]) / sargs['batch'])

    # Train a classifier
    # Repeat until the model consecutively "misses" a set number of times
    rounds = 1
    misses = miss_streak = 0
    best_result = {'fmes': -1}
    best_model_dir = None
    best_classifier = None
    while miss_streak < sargs['max_misses']:

        model_dir = os.path.join(sargs['sandbox'], 'run_' + str(rounds) + '_' + str(miss_streak))

        extra_args = {
            'classes': CLASSES,
            'columns': my_feature_columns,
            'steps_per_epoch': steps_per_epoch,
            'learning_rate': sargs['lr'] / (2 ** misses),
            'model_dir': model_dir,
            'warm_start_dir': best_model_dir
        }
        merged_args = {**args, **sargs, **extra_args}

        # Create a new classifier instance
        classifier = cl.create_classifier(merged_args)

        # Train the model for exactly 1 epoch
        classifier.train(
            input_fn=lambda: pandas2tf.train_input_fn(train, sargs['batch']),
            steps=steps_per_epoch)

        # Evaluate the model
        eval_result = classifier.evaluate(input_fn=lambda: pandas2tf.eval_input_fn(dev, sargs['batch']))
        log('Round ' + str(rounds) + '_' + str(miss_streak) + ', Fmes: ' + str(best_result['fmes']) + ' --> ' + str(eval_result['fmes']))
        if eval_result['fmes'] > best_result['fmes']:
            best_result = eval_result
            best_model_dir = model_dir
            best_classifier = classifier
            miss_streak = 0
            rounds += 1
            log('Improvement, go on...')
        else:
            miss_streak += 1
            misses += 1
            log('Miss #' + str(misses) + ', (streak = ' + str(miss_streak) + ')')
        
        # Cleanup sandbox not to run out of space due to models
        for m_dir in os.listdir(sargs['sandbox']):
            abs_m_dir = os.path.join(sargs['sandbox'], m_dir)
            if best_model_dir != abs_m_dir and model_dir != abs_m_dir:
                tf.summary.FileWriterCache.clear()
                shutil.rmtree(abs_m_dir)                

    final_result_train = best_classifier.evaluate(input_fn=lambda: pandas2tf.eval_input_fn(train, sargs['batch']))
    final_result_dev = best_classifier.evaluate(input_fn=lambda: pandas2tf.eval_input_fn(dev, sargs['batch']))
    final_result_test = best_classifier.evaluate(input_fn=lambda: pandas2tf.eval_input_fn(test, sargs['batch']))
    return final_result_train, final_result_dev, final_result_test, best_classifier
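cl.create_classifier is shared by Examples #1 and #9 but not shown here. Given the keys in merged_args, a minimal sketch on the TF 1.x tf.estimator API might look like this; the hidden layer sizes are hypothetical, and the real implementation must also attach the custom 'fmes' (F-measure) metric, e.g. via tf.estimator.add_metrics:

import tensorflow as tf

def create_classifier(cargs):
    return tf.estimator.DNNClassifier(
        feature_columns=cargs['columns'],
        hidden_units=[128, 64],  # hypothetical; the actual sizes are not shown
        n_classes=cargs['classes'],
        model_dir=cargs['model_dir'],
        warm_start_from=cargs['warm_start_dir'],
        optimizer=tf.train.AdamOptimizer(learning_rate=cargs['learning_rate']))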
       
Example #10
def predict(classifier, test, args, sargs_str):
    sargs = util.parse(parser, sargs_str.split())
    preds = classifier.predict(input_fn=lambda:pandas2tf.eval_input_fn(test, sargs['batch']))
    return [pred['class_ids'] for pred in preds]
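Each element yielded by an Estimator's predict generator is a dict whose 'class_ids' entry is a length-1 array, so a caller would typically flatten the returned list into plain ints:

# Usage sketch: collapse the length-1 class_ids arrays into scalar labels
predictions = predict(classifier, test, args, sargs_str)
labels = [int(p[0]) for p in predictions]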