def setup_experiment(experiment, estimator, hidden_layers, steps, batch_size,
                     epochs, optimizer, learning_rate, force, data_path,
                     s3_profile, s3_url):
    """Define experiment parameters and hyper parameters.

    Builds the experiment definition (estimator name, params and
    hyper-parameters) and stores it via ``gather_results.save_experiment``,
    either on local disk (``data_path``) or on s3 (``s3_url``/``s3_profile``).
    Exits with status 1 if the experiment already exists and ``force`` is
    not set.

    ``hidden_layers`` is a '/'-separated list of layer sizes, e.g.
    "16/8/4" -> [16, 8, 4].

    Supported optimizers:

    * 'Adagrad': Returns an `AdagradOptimizer`.
    * 'Adam': Returns an `AdamOptimizer`.
    * 'Ftrl': Returns an `FtrlOptimizer`.
    * 'RMSProp': Returns an `RMSPropOptimizer`.
    * 'SGD': Returns a `GradientDescentOptimizer`.
    """
    # s3 support, only for loading the dataset
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)
    # Prevent overwrite by mistake
    if gather_results.load_experiment(
            experiment, data_path=data_path, s3=s3) and not force:
        print("Experiment %s already configured" % experiment)
        sys.exit(1)
    params = {}
    hyper_params = {
        'steps': steps,
        'batch_size': batch_size,
        'epochs': epochs,
        # Parse "16/8/4" style spec into a list of layer widths
        'hidden_units': [int(x) for x in hidden_layers.split('/')],
        'optimizer': optimizer,
        'learning_rate': learning_rate
    }
    experiment_data = {
        'estimator': estimator,
        'params': params,
        'hyper_params': hyper_params
    }
    # Store the experiment to disk
    gather_results.save_experiment(experiment_data, experiment,
                                   data_path=data_path, s3=s3)
    print("Experiment %s saved successfully." % experiment)
    print("\testimator: %s" % estimator)
    print("\tparameters: %s" % params)
    print("\thyper parameters: %s" % hyper_params)
def local_trainer(dataset, experiment, eval_dataset, gpu, debug, data_path,
                  s3_profile, s3_url):
    """Train, evaluate and predict for an experiment on the local machine.

    Loads the experiment definition and the normalized dataset (from local
    disk or s3), builds the configured tf estimator, trains it, evaluates
    it on the experiment's own 'test' set plus every dataset listed in
    ``eval_dataset``, and finally stores predictions made on the 'dev' set.
    Exits with status 1 if the experiment or the dataset cannot be found.
    """
    # s3 support. When both using s3, dataset and experiment must be stored
    # in the same bucket
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)
    # Load experiment data
    experiment_data = gather_results.load_experiment(
        experiment, data_path=data_path, s3=s3)
    if not experiment_data:
        print("Experiment %s not found" % experiment)
        sys.exit(1)
    # Load dataset data
    dataset_data = gather_results.load_model_config(
        dataset, data_path=data_path, s3=s3)
    if not dataset_data:
        print("Dataset %s not found" % dataset)
        sys.exit(1)
    # Read hyper_params and params
    estimator = experiment_data['estimator']
    hyper_params = experiment_data['hyper_params']
    params = experiment_data['params']
    steps = int(hyper_params['steps'])
    num_epochs = int(hyper_params['epochs'])
    batch_size = int(hyper_params['batch_size'])
    optimizer = hyper_params['optimizer']
    learning_rate = float(hyper_params['learning_rate'])
    class_label = dataset_data['class_label']

    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)

    # Load the normalized data
    labels = gather_results.load_dataset(
        dataset, 'labels', data_path=data_path, s3=s3)['labels']
    training_data = gather_results.load_dataset(
        dataset, 'training', data_path=data_path, s3=s3)
    # NOTE(review): test_data is loaded but never used below; the eval loop
    # reloads 'test' per eval set. Kept for now — confirm before removing.
    test_data = gather_results.load_dataset(
        dataset, 'test', data_path=data_path, s3=s3)
    print("Training data shape: (%d, %d)" % training_data['examples'].shape)

    # Fixed label vocabulary for provider classification; None lets the
    # estimator work without a predefined vocabulary.
    if class_label == 'node_provider':
        label_vocabulary = {
            'rax', 'ovh', 'packethost-us-west-1', 'vexxhost',
            'limestone-regionone', 'inap-mtl01', 'fortnebula-regionone'
        }
    elif class_label == 'node_provider_all':
        label_vocabulary = {
            'rax-iad', 'ovh-bhs1', 'packethost-us-west-1', 'rax-dfw',
            'vexxhost-ca-ymq-1', 'ovh-gra1', 'limestone-regionone',
            'inap-mtl01', 'rax-ord', 'vexxhost-sjc1',
            'fortnebula-regionone'
        }
    else:
        label_vocabulary = None

    # Get the estimator; 'estimator' is rebound from the estimator name
    # (a string) to the actual estimator object.
    model_dir = gather_results.get_model_folder(dataset, experiment)
    estimator = tf_trainer.get_estimator(
        estimator, hyper_params, params, labels, model_dir,
        optimizer=_OPTIMIZER_CLS_NAMES[optimizer](
            learning_rate=learning_rate),
        label_vocabulary=label_vocabulary, gpu=gpu)

    def train_and_eval():
        # Train
        tf_trainer.get_training_method(estimator)(
            input_fn=tf_trainer.get_input_fn(
                shuffle=True,
                batch_size=batch_size,
                num_epochs=num_epochs,
                labels=labels,
                **training_data),
            steps=steps)
        # Eval on the experiment dataset + any other requested
        eval_sets = [dataset]
        eval_sets.extend(eval_dataset)
        for eval_dataset_name in eval_sets:
            eval_data = gather_results.load_dataset(
                eval_dataset_name, 'test', data_path=data_path, s3=s3)
            eval_size = len(eval_data['example_ids'])
            # Run tf evaluation and store the metrics
            print("Evaluation data shape: (%d, %d)" %
                  eval_data['examples'].shape)
            eval_loss = estimator.evaluate(
                input_fn=tf_trainer.get_input_fn(
                    batch_size=eval_size,
                    num_epochs=1,
                    labels=labels,
                    **eval_data),
                name=eval_dataset_name)
            # Saving and Logging loss
            print('Training eval data for %s: %r' % (
                eval_dataset_name, eval_loss))
            eval_name = "eval_" + eval_dataset_name
            gather_results.save_data_json(
                dataset, eval_loss, eval_name, sub_folder=experiment)
        # Run a prediction on the "dev" set, which we use as prod,
        # and store it
        prod_data = gather_results.load_dataset(
            dataset, 'dev', data_path=data_path, s3=s3)
        prod_size = len(prod_data['example_ids'])
        prediction = estimator.predict(
            input_fn=tf_trainer.get_input_fn(
                batch_size=prod_size,
                num_epochs=1,
                labels=labels,
                **prod_data))
        # Convert bytes fields to string for serialization
        serializable_pred = []
        for pred in prediction:
            pred['classes'] = [x.decode("utf-8") for x in pred['classes']]
            serializable_pred.append(pred)
        prediction_name = "prediction_" + dataset
        pred_data = zip(prod_data['example_ids'], serializable_pred,
                        prod_data['classes'])
        gather_results.save_data_json(
            dataset, list(pred_data), prediction_name,
            sub_folder=experiment)

    # Now do the training and evaluation. train_and_eval() returns nothing,
    # so its result is deliberately not captured.
    if gpu:
        with tf.device('/gpu:0'):
            train_and_eval()
    else:
        train_and_eval()