# TF 1.x Estimator pipeline. Imports assumed by this snippet; helpers such as
# read_schemata, build_schema, input_fn, predict_input_fn, pairwise_probability,
# build_diags, optimize and the globals input_path, params, model, model_dir,
# grid are defined elsewhere in the project.
import glob
import logging
from functools import partial

from tensorflow.estimator import (Estimator, EvalSpec, RunConfig, TrainSpec,
                                  train_and_evaluate)


def main():
    sequence_schema_path = f'{input_path}/train/sequence_schema'
    context_schema_path = f'{input_path}/train/context_schema'

    context_schema, sequence_schema = read_schemata(context_schema_path,
                                                    sequence_schema_path)
    tf_ctx_schema, tf_seq_schema = build_schema(context_schema, sequence_schema)

    # Sharded part files written by the preprocessing job.
    train_parts = glob.glob(input_path + '/train' + '/part-*')
    validation_parts = glob.glob(input_path + '/test' + '/part-*')

    run_config = RunConfig(log_step_count_steps=10,
                           save_checkpoints_steps=100,
                           save_summary_steps=200,
                           keep_checkpoint_max=32)

    # Bind the schemata once, then specialize for the train/validation shards.
    shared_input_fn = partial(input_fn, params, tf_seq_schema, tf_ctx_schema)
    train_input_fn = partial(shared_input_fn, train_parts)
    validation_input_fn = partial(shared_input_fn, validation_parts)

    train_spec = TrainSpec(train_input_fn, max_steps=1000000)
    eval_spec = EvalSpec(validation_input_fn,
                         steps=200,
                         name='validation',
                         start_delay_secs=30,
                         throttle_secs=1)

    estimator = Estimator(model_fn=model.model_fn,
                          model_dir=model_dir,
                          params=params,
                          config=run_config)

    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO)
    logging.getLogger('tensorflow').propagate = False

    train_and_evaluate(estimator=estimator, train_spec=train_spec,
                       eval_spec=eval_spec)

    prediction = list(
        estimator.predict(input_fn=partial(predict_input_fn,
                                           {'epochs': 1, 'batch_size': 10},
                                           grid)))
    scores = [p.tolist() for p in prediction]

    # Turn raw scores into pairwise win probabilities and solve for a ranking.
    pairwise_prob = pairwise_probability(scores)
    zero = pairwise_prob[0]
    A_zero = build_diags(zero)
    print(optimize(A_zero).x)
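# `input_fn` above is project code that is not shown. A minimal sketch of what
# it could look like -- assuming the part-* files are TFRecords of
# tf.SequenceExample protos, that the schemata are feature-spec dicts, and that
# params carries 'epochs' and 'batch_size' (as the predict call above suggests).
# The name input_fn_sketch and the fixed-shape batching are assumptions:
import tensorflow as tf


def input_fn_sketch(params, seq_schema, ctx_schema, file_parts):
    def parse(serialized):
        # Split one serialized record into context and sequence feature dicts.
        ctx, seq = tf.parse_single_sequence_example(
            serialized,
            context_features=ctx_schema,
            sequence_features=seq_schema)
        ctx.update(seq)  # the model_fn is assumed to split labels back out
        return ctx

    return (tf.data.TFRecordDataset(file_parts)
            .map(parse)
            .repeat(params['epochs'])
            .batch(params['batch_size'])  # padded_batch if lengths vary
            .prefetch(1))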
# BERT-features text classifier (TF 1.x). The snippet opens mid-function: the
# `return` below is the tail of the get_encodes helper referenced in the
# py_func call, whose body is truncated here (a sketch follows after this
# snippet). MODEL_ID, laws, laws_str, batch_size, train_fp and eval_fp are
# defined elsewhere.
import tensorflow as tf
from tensorflow.estimator import (DNNClassifier, EvalSpec, RunConfig,
                                  TrainSpec, train_and_evaluate)


def get_encodes(x):
    ...  # body truncated in the original snippet
    return features, labels


config = tf.ConfigProto()
config.gpu_options.allow_growth = True
run_config = RunConfig(model_dir='/data/cips/save/%s' % MODEL_ID,
                       session_config=config,
                       save_checkpoints_steps=2000)

estimator = DNNClassifier(
    hidden_units=[512],
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=len(laws),
    config=run_config,
    label_vocabulary=laws_str,
    dropout=0.1)

# Shuffle/repeat raw text lines, batch them, encode each batch to 768-d BERT
# feature vectors (plus labels) inside a py_func, then feed the estimator.
input_fn = lambda fp: (
    tf.data.TextLineDataset(fp)
    .apply(tf.contrib.data.shuffle_and_repeat(buffer_size=10000))
    .batch(batch_size)
    .map(lambda x: tf.py_func(get_encodes, [x], [tf.float32, tf.string],
                              name='bert_client'))
    .map(lambda x, y: ({'feature': x}, y))
    .prefetch(20))

train_spec = TrainSpec(input_fn=lambda: input_fn(train_fp))
eval_spec = EvalSpec(input_fn=lambda: input_fn(eval_fp), throttle_secs=0)
train_and_evaluate(estimator, train_spec, eval_spec)
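# One possible body for get_encodes, as a minimal sketch only: it assumes a
# running bert-as-service server reachable via bert_serving.client.BertClient
# (whose encode() returns 768-d float32 vectors, matching the numeric_column
# above) and tab-separated "text<TAB>label" lines. The real record format and
# labelling scheme of the original data are assumptions here.
import numpy as np
from bert_serving.client import BertClient

bc = BertClient()


def get_encodes(x):
    # x is one batch of raw byte strings coming out of TextLineDataset.
    lines = [line.decode('utf-8') for line in x]
    texts, labels = zip(*(line.split('\t') for line in lines))
    features = bc.encode(list(texts))  # (batch_size, 768) float32 matrix
    # DNNClassifier with label_vocabulary expects string labels.
    return features, np.array([[label] for label in labels])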
# CIFAR-10 Keras-to-Estimator training job (TF 1.x). Imports assumed by this
# snippet; cifar10_data, cifar10_model_cnn, cifar10_model_resnet, shapes,
# collect_stats and ExamplesPerSecondHook are defined elsewhere in the project.
import datetime
import getpass
import logging
import os
import shutil
import threading
import time
from urllib import parse

import tensorflow as tf
import keras.backend.tensorflow_backend as ktf
from tensorflow.estimator import (EvalSpec, RunConfig, TrainSpec,
                                  train_and_evaluate)

model_dir_hdfs = False


def main(mname, model_dir, batch_size, epochs, eval_steps, eps_log_steps):
    global model_dir_hdfs
    if model_dir.startswith('hdfs'):
        model_dir_hdfs = True

    tf.logging.set_verbosity(tf.logging.DEBUG)
    # get TF logger
    log = logging.getLogger('tensorflow')
    log.setLevel(logging.DEBUG)

    # create formatter and add it to the handlers
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # create file handler which logs even debug messages
    if not model_dir_hdfs:
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        log_dir = model_dir
    else:
        model_dir = os.path.join(
            model_dir,
            "job_cifar10_" + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M'))
        log_dir = '.'

    # clear old log files
    with open(log_dir + '/tensorflow.log', 'w'):
        pass
    with open(log_dir + '/gpu.csv', 'w'):
        pass
    with open(log_dir + '/cpu.csv', 'w'):
        pass

    fh = logging.FileHandler(log_dir + '/tensorflow.log')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    log.addHandler(fh)

    log.info("TF version: %s", tf.__version__)
    log.info("Model directory: %s", model_dir)
    log.info("Batch size: %s", batch_size)
    log.info("Prefetch data all to memory: %s", True)
    log.info("Train epochs: %s", epochs)

    config = tf.ConfigProto()
    # dynamically grow the memory used on the GPU
    config.gpu_options.allow_growth = True
    # log device placement (on which device each operation runs)
    config.log_device_placement = True
    sess = tf.Session(config=config)
    # set this TensorFlow session as the default session for Keras
    ktf.set_session(sess)

    # Integer division: TrainSpec/EvalSpec expect integral step counts.
    steps_per_epoch = cifar10_data.train_len() // batch_size
    log.info("Steps per epoch: %s", steps_per_epoch)
    if eval_steps is None:
        eval_steps = steps_per_epoch
    log.info("Evaluating every %i steps", eval_steps)

    if mname == "cnn":
        model = cifar10_model_cnn.cifar_model()
    else:
        model = cifar10_model_resnet.cifar_model()

    global input_name
    input_name = 'input_1'
    model.summary()

    def train_input_fn():
        dataset = tf.data.Dataset.from_generator(
            generator=cifar10_data.generator_train,
            output_types=(tf.float32, tf.float32),
            output_shapes=shapes)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(buffer_size=batch_size)
        # dataset = dataset.repeat(20)
        iterator = dataset.make_one_shot_iterator()
        features_tensors, labels = iterator.get_next()
        features = {input_name: features_tensors}
        return features, labels

    def eval_input_fn():
        dataset = tf.data.Dataset.from_generator(
            generator=cifar10_data.generator_test,
            output_types=(tf.float32, tf.float32),
            output_shapes=shapes)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(buffer_size=batch_size)
        iterator = dataset.make_one_shot_iterator()
        features_tensors, labels = iterator.get_next()
        features = {input_name: features_tensors}
        return features, labels

    my_config = RunConfig(
        # Save checkpoints every n steps and run the evaluation.
        save_checkpoints_steps=eval_steps,
        # keep_checkpoint_max=5  # retain the n most recent checkpoints (default 5)
    )
    estimator = tf.keras.estimator.model_to_estimator(model,
                                                      config=my_config,
                                                      model_dir=model_dir)

    examples_sec_hook = ExamplesPerSecondHook(batch_size,
                                              every_n_steps=eps_log_steps)
    # stopping_hook = early_stopping.stop_if_higher_hook(estimator, "accuracy", 0.5)
    train_hooks = [examples_sec_hook]

    train_spec = TrainSpec(input_fn=train_input_fn,
                           hooks=train_hooks,
                           max_steps=steps_per_epoch * epochs)
    eval_spec = EvalSpec(input_fn=eval_input_fn,
                         steps=cifar10_data.val_len() // batch_size,  # default is 100 steps
                         throttle_secs=5)

    global is_training
    is_training = True
    # Sample CPU/GPU utilization into cpu.csv/gpu.csv while training runs.
    threading.Thread(target=lambda: collect_stats(log_dir)).start()

    start = time.time()
    train_and_evaluate(estimator, train_spec, eval_spec)
    elapsed = time.time() - start
    is_training = False
    log.info("total time taken (seconds): %s ", elapsed)

    if model_dir_hdfs:
        # Upload the log files next to the model directory via WebHDFS.
        parse_res = parse.urlsplit(model_dir)
        netloc = parse_res[1]
        path = parse_res[2]
        webhdfs_model_dir = 'http://' + netloc + ':50070/webhdfs/v1' + path
        username = getpass.getuser()
        component_name = estimator.config.task_type + str(
            estimator.config.task_id)
        log.info("Uploading log files for %s as %s to HDFS path: %s",
                 component_name, username, webhdfs_model_dir)
        logging.shutdown()
        os.system('curl -L -i -T tensorflow.log "' + webhdfs_model_dir +
                  '/tensorflow-' + component_name +
                  '.log?op=CREATE&overwrite=false&user.name=' + username + '"')
        os.system('curl -L -i -T cpu.csv "' + webhdfs_model_dir + '/cpu-' +
                  component_name + '.csv?op=CREATE&overwrite=false&user.name=' +
                  username + '"')
        os.system('curl -L -i -T gpu.csv "' + webhdfs_model_dir + '/gpu-' +
                  component_name + '.csv?op=CREATE&overwrite=false&user.name=' +
                  username + '"')
    else:
        log.info("Creating zip archive of job results")
        logging.shutdown()
        shutil.make_archive(model_dir, 'zip', model_dir)
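# ExamplesPerSecondHook is referenced but not defined in this snippet. Below is
# a minimal sketch of a compatible hook, assuming the same constructor
# signature (batch_size, every_n_steps); it is a simplified take on the
# examples/sec hooks used in the TensorFlow model garden, not the original
# implementation.
import time

import tensorflow as tf


class ExamplesPerSecondHook(tf.train.SessionRunHook):
    """Logs the average number of examples processed per second."""

    def __init__(self, batch_size, every_n_steps=100):
        self._batch_size = batch_size
        self._every_n_steps = every_n_steps
        self._step = 0
        self._last_time = None

    def begin(self):
        self._last_time = time.time()

    def after_run(self, run_context, run_values):
        self._step += 1
        if self._step % self._every_n_steps == 0:
            now = time.time()
            examples_per_sec = (self._batch_size * self._every_n_steps /
                                (now - self._last_time))
            tf.logging.info("examples/sec: %.1f", examples_per_sec)
            self._last_time = now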