def learn(train, dev, test, args, sargs_str):
    """Train a TF classifier for a fixed number of epochs in a clean sandbox.

    Parses the strategy-specific argument string, builds one numeric feature
    column per input column, trains for ``epochs * steps_per_epoch`` steps,
    then evaluates on all three splits.

    Returns a 4-tuple: (train eval, dev eval, test eval, trained classifier).
    """
    sargs = util.parse(parser, sargs_str.split())

    # Start every run from an empty sandbox directory.
    util.mkdir(sargs['sandbox'], clean=True)

    # One numeric feature column per column of the training features.
    feature_columns = [
        tf.feature_column.numeric_column(key=name) for name in train[0].keys()
    ]

    # Derive the step budget from the dataset size and batch size.
    steps_per_epoch = math.ceil(len(train[0]) / sargs['batch'])
    total_steps = sargs['epochs'] * steps_per_epoch

    # Strategy-level settings layered on top of the global + strategy args.
    overrides = {
        'classes': CLASSES,
        'columns': feature_columns,
        'steps_per_epoch': steps_per_epoch,
        'learning_rate': sargs['lr'],
        'model_dir': sargs['sandbox'],
        'warm_start_dir': None,
    }
    classifier = cl.create_classifier({**args, **sargs, **overrides})

    # Train for the full requested number of epochs in a single call.
    classifier.train(
        input_fn=lambda: pandas2tf.train_input_fn(train, sargs['batch']),
        steps=total_steps)

    # Evaluate each split with the same batch size used for training.
    def _evaluate(split):
        return classifier.evaluate(
            input_fn=lambda: pandas2tf.eval_input_fn(split, sargs['batch']))

    return _evaluate(train), _evaluate(dev), _evaluate(test), classifier
def learn(train, dev, test, args, sargs_str):
    """Fit a logistic-regression model through the shared sklearn wrapper.

    The strategy argument string is parsed and passed verbatim as keyword
    arguments to ``LogisticRegression``.
    """
    strategy_args = util.parse(parser, sargs_str.split())
    model = LogisticRegression(**strategy_args)
    return util.sklearn_wrapper(train, dev, test, model)
def predict(classifier, test, args, sargs_str, threshold=None):
    """Predict labels for *test*, optionally binarizing scores at *threshold*.

    The strategy args are parsed for their validation side effects even
    though the values are not consumed here.
    """
    util.parse(parser, sargs_str.split())
    predictions = classifier.predict(test[0])
    if threshold is None:
        return predictions
    # Map each score to 1 when it reaches the threshold, else 0.
    return [int(score >= threshold) for score in predictions]
# NOTE(review): this fragment references names (strategy, sargs, table,
# train_stats, dev_stats, test_stats, client_info) that are defined outside
# this view -- it appears to be the tail of a per-strategy loop inside main().
# Indentation below is reconstructed; confirm against the full file.

# Log the evaluation stats for the current strategy on all three splits.
logger.info('%s[%s] results:', strategy, sargs)
logger.info('train: %s', train_stats)
logger.info('dev: %s', dev_stats)
logger.info('test: %s', test_stats)

# Optionally forward the packed results to the DWF logging service.
if dwf_logging is not None:
    result = dwf_logging.pack_results(train_stats, dev_stats, test_stats)
    dwf_logging.report_result(result, client_info['client_id'])

# Accumulate one summary row per strategy run; the 'fmes' columns are the
# headline F-measure, the trailing columns keep the full stat dicts.
table.append([
    args['resample'],
    args['resample_amount'],
    args['preprocess'],
    strategy,
    sargs,
    train_stats['fmes'],
    dev_stats['fmes'],
    test_stats['fmes'],
    train_stats,
    dev_stats,
    test_stats,
])

# Append every accumulated row to the semicolon-separated results CSV.
with open(os.path.join(args['output'], 'dbh.csv'), 'a') as f:
    for line in table:
        f.write(';'.join([str(item) for item in line]) + '\n')

if __name__ == '__main__':
    main(util.parse(parser, sys.argv[1:]))
# Stand-alone resampling script: reads a labeled CSV, splits rows into the
# zero-label and nonzero-label classes, and resamples BOTH classes to
# --amount percent of their original size.
parser = argparse.ArgumentParser()
parser.add_argument('--csv', required=True, help='csv to read the data from')
parser.add_argument('--label', required=True, help='name of the label to predict')
parser.add_argument('--seed', default=1337, type=int, help='random seed for repeatability')
parser.add_argument('--amount', type=float, default=100, help='percentage of the input to keep')
parser.add_argument('--output', required=True, help='csv to to save to')
args = util.parse(parser, sys.argv[1:])

# read full dataset
data = pd.read_csv(args['csv'], header=0)

# NOTE(review): print() does not apply %-style formatting -- this prints the
# literal '%s' and the Series as a 2-tuple. Lazy %-args only work with the
# logging module; consider an f-string here.
print('Before resampling:\n%s', data[args['label']].value_counts())

# split to classes
bins = []
bins.append(data[data[args['label']] == 0])
bins.append(data[data[args['label']] != 0])

# resample ALL classes
for i, item in enumerate(bins):
    current = len(item)
    # Target size is --amount percent of the class's current row count.
    target = int(current * (args['amount'] / 100))
    # NOTE(review): chunk is truncated here mid-call; remaining resamp()
    # arguments are outside this view.
    bins[i] = resamp(bins[i],
def learn(train, dev, test, args, sargs_str):
    """Fit a support-vector classifier through the shared sklearn wrapper.

    Strategy args are parsed from *sargs_str* and forwarded verbatim as
    ``SVC`` keyword arguments.
    """
    strategy_args = util.parse(parser, sargs_str.split())
    model = SVC(**strategy_args)
    return util.sklearn_wrapper(train, dev, test, model)
def learn(train, dev, test, args, sargs_str):
    """Fit a decision-tree classifier through the shared sklearn wrapper.

    Strategy args are parsed from *sargs_str* and forwarded verbatim as
    ``DecisionTreeClassifier`` keyword arguments.
    """
    strategy_args = util.parse(parser, sargs_str.split())
    model = DecisionTreeClassifier(**strategy_args)
    return util.sklearn_wrapper(train, dev, test, model)
def learn(train, dev, test, args, sargs_str):
    """Fit a k-nearest-neighbors classifier through the shared sklearn wrapper.

    Strategy args are parsed from *sargs_str* and forwarded verbatim as
    ``KNeighborsClassifier`` keyword arguments.
    """
    strategy_args = util.parse(parser, sargs_str.split())
    model = KNeighborsClassifier(**strategy_args)
    return util.sklearn_wrapper(train, dev, test, model)
def learn(train, dev, test, args, sargs_str):
    """Train a TF classifier with a restart-on-miss search loop.

    Trains one epoch at a time; each round warm-starts from the best model
    so far. On a dev-set F-measure improvement the round counter advances
    and the miss streak resets; on a miss, the learning rate is halved for
    the next attempt. Stops after ``max_misses`` consecutive misses.

    Returns a 4-tuple: (train eval, dev eval, test eval, best classifier),
    all taken from the best-scoring model.
    """
    # Read strategy-specific args
    sargs = util.parse(parser, sargs_str.split())
    # Clean out the sandbox
    util.mkdir(sargs['sandbox'], clean=True)
    # Feature columns describe how to use the input
    my_feature_columns = []
    for key in train[0].keys():
        my_feature_columns.append(tf.feature_column.numeric_column(key=key))
    # Calculate epoch length
    steps_per_epoch = math.ceil(len(train[0]) / sargs['batch'])
    # Train a classifier
    # Repeat until the model consecutively "misses" a set number of times
    rounds = 1
    misses = miss_streak = 0
    # Sentinel below any real F-measure so the first round always "improves".
    best_result = {'fmes': -1}
    best_model_dir = None
    best_classifier = None
    while miss_streak < sargs['max_misses']:
        # Each attempt gets its own model dir, named by round and streak.
        model_dir = os.path.join(sargs['sandbox'], 'run_' + str(rounds) + '_' + str(miss_streak))
        extra_args = {
            'classes': CLASSES,
            'columns': my_feature_columns,
            'steps_per_epoch': steps_per_epoch,
            # Halve the learning rate after every cumulative miss.
            'learning_rate': sargs['lr'] / (2 ** misses),
            'model_dir': model_dir,
            # Warm-start from the best checkpoint so far (None on round 1).
            'warm_start_dir': best_model_dir
        }
        merged_args = {**args, **sargs, **extra_args}
        # Create a new classifier instance
        classifier = cl.create_classifier(merged_args)
        # Train the model for exactly 1 epoch
        classifier.train(
            input_fn=lambda: pandas2tf.train_input_fn(train, sargs['batch']),
            steps=steps_per_epoch)
        # Evaluate the model on the dev split only; test is held out until the end.
        eval_result = classifier.evaluate(input_fn=lambda: pandas2tf.eval_input_fn(dev, sargs['batch']))
        log('Round ' + str(rounds) + '_' + str(miss_streak) + ', Fmes: ' + str(best_result['fmes']) + ' --> ' + str(eval_result['fmes']))
        if eval_result['fmes'] > best_result['fmes']:
            # Improvement: adopt this model as the new best and reset the streak.
            best_result = eval_result
            best_model_dir = model_dir
            best_classifier = classifier
            miss_streak = 0
            rounds += 1
            log('Improvement, go on...')
        else:
            # Miss: lengthen the streak; `misses` never resets, so the
            # learning-rate halving above is cumulative across the whole run.
            miss_streak += 1
            misses += 1
            log('Miss #' + str(misses) + ', (streak = ' + str(miss_streak) + ')')
        # Cleanup sandbox not to run out of space due to models:
        # drop every model dir except the best one and the current one.
        for m_dir in os.listdir(sargs['sandbox']):
            abs_m_dir = os.path.join(sargs['sandbox'], m_dir)
            if best_model_dir != abs_m_dir and model_dir != abs_m_dir:
                tf.summary.FileWriterCache.clear()
                shutil.rmtree(abs_m_dir)
    # Final evaluation of the best model on all three splits.
    final_result_train = best_classifier.evaluate(input_fn=lambda: pandas2tf.eval_input_fn(train, sargs['batch']))
    final_result_dev = best_classifier.evaluate(input_fn=lambda: pandas2tf.eval_input_fn(dev, sargs['batch']))
    final_result_test = best_classifier.evaluate(input_fn=lambda: pandas2tf.eval_input_fn(test, sargs['batch']))
    return final_result_train, final_result_dev, final_result_test, best_classifier
def predict(classifier, test, args, sargs_str):
    """Return the predicted class ids for *test* from a trained TF classifier."""
    sargs = util.parse(parser, sargs_str.split())
    batch_size = sargs['batch']
    predictions = classifier.predict(
        input_fn=lambda: pandas2tf.eval_input_fn(test, batch_size))
    # Collect only the class id from each per-example prediction dict.
    class_ids = []
    for prediction in predictions:
        class_ids.append(prediction['class_ids'])
    return class_ids