def map_func(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print("cluster:" + str(cluster_json))
    print("job name:" + job_name)
    print("current index:" + str(index))
    sys.stdout.flush()

    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 log_device_placement=False,
                                 device_filters=["/job:ps", "/job:worker/task:%d" % index])
    t = time.time()
    if 'ps' == job_name:
        # Parameter servers just park; the in-process tf.train.Server keeps serving.
        from time import sleep
        while True:
            sleep(1)
    else:
        # Workers build the graph and run a few steps in a MonitoredTrainingSession.
        with tf.device(tf.train.replica_device_setter(
                worker_device='/job:worker/task:' + str(index), cluster=cluster)):
            train_ops = build_graph()
            print("python worker index:" + str(index))
            sys.stdout.flush()
            try:
                hooks = [tf.train.StopAtStepHook(last_step=2)]
                with tf.train.MonitoredTrainingSession(master=server.target,
                                                       config=sess_config,
                                                       checkpoint_dir="./target/tmp/s1/" + str(t),
                                                       hooks=hooks) as mon_sess:
                    while not mon_sess.should_stop():
                        # `a` is the input placeholder created by build_graph().
                        print(mon_sess.run(train_ops, feed_dict={a: [1.0, 2.0, 3.0]}))
                        sys.stdout.flush()
                        time.sleep(1)
            finally:
                SummaryWriterCache.clear()
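map_func above feeds a placeholder a and calls a build_graph() helper that the snippet does not show. A minimal sketch of what such a helper could look like, assuming the graph only needs an op whose execution bumps the global step so that StopAtStepHook(last_step=2) can stop the session; the names and the toy computation are hypothetical, not the original test graph:

a = None  # set by build_graph(); fed from map_func's feed_dict

def build_graph():
    # Hypothetical graph: sum the fed vector and advance the global step.
    global a
    a = tf.placeholder(tf.float32, shape=[3], name="a")
    total = tf.reduce_sum(a, name="total")
    global_step = tf.train.get_or_create_global_step()
    step_inc = tf.assign_add(global_step, 1)
    with tf.control_dependencies([step_inc]):
        train_op = tf.identity(total, name="train_op")
    return train_op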
def main_on_flink(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            # Hyperparameters arrive from Flink as a single string property and are
            # parsed into tf.app.flags; unknown flags are ignored (known_only=True).
            tf_hyperparameter = context.properties.get('TF_Hyperparameter')
            argv = tf.app.flags.FLAGS(tf_hyperparameter.split(' '), known_only=True)
            if FLAGS.mode == 'train':
                training_on_flink(tf_context, sess_config, server.target)
            elif FLAGS.mode == 'decode':
                inference_on_flink(tf_context, sess_config, server.target)
            else:
                raise ValueError("The 'mode' flag must be one of train/eval/decode")
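main_on_flink assumes that the model code declares its flags (including mode) with tf.app.flags and supplies training_on_flink / inference_on_flink. A rough sketch of that surrounding scaffolding under those assumptions; only the 'mode' flag name comes from the snippet, everything else is illustrative:

tf.app.flags.DEFINE_string('mode', 'train', "Run mode: 'train' or 'decode'.")
FLAGS = tf.app.flags.FLAGS

def training_on_flink(tf_context, sess_config, master_target):
    # Illustrative stub: build the training graph, then drive it with a
    # MonitoredTrainingSession connected to `master_target` and `sess_config`,
    # reading records from tf_context.flink_stream_dataset().
    raise NotImplementedError

def inference_on_flink(tf_context, sess_config, master_target):
    # Illustrative stub: restore a checkpoint and decode the records streamed
    # from Flink, writing results back through the context.
    raise NotImplementedError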
def map_fun(context):
    print(tf.__version__)
    sys.stdout.flush()
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()

    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            # Decode CSV records streamed in from Flink: int32, int64, float32,
            # float64 and string columns, batched three records at a time.
            record_defaults = [[9], [tf.constant(value=9, dtype=tf.int64)],
                               [9.0], [tf.constant(value=9.0, dtype=tf.float64)],
                               ["9.0"]]
            dataset = context.flinkStreamDataSet(buffer_size=0)
            dataset = dataset.map(lambda record: tf.decode_csv(
                record, record_defaults=record_defaults))
            dataset = dataset.batch(3)
            iterator = dataset.make_one_shot_iterator()
            input_records = iterator.get_next()

            global_step = tf.train.get_or_create_global_step()
            global_step_inc = tf.assign_add(global_step, 1)
            # Re-encode a subset of the columns back into a CSV string.
            out_list = [input_records[0], input_records[2], input_records[4]]
            out = tff_ops.encode_csv(input_list=out_list)
            is_chief = (index == 0)
            t = time.time()
            try:
                with tf.train.MonitoredTrainingSession(
                        master=server.target,
                        is_chief=is_chief,
                        config=sess_config,
                        checkpoint_dir="./target/tmp/input_output/" + str(t)) as mon_sess:
                    # while not mon_sess.should_stop():
                    while True:
                        print(index, mon_sess.run([global_step_inc, out]))
                        sys.stdout.flush()
                        # time.sleep(1)
            except Exception as e:
                print('traceback.print_exc():')
                traceback.print_exc()
                sys.stdout.flush()
            finally:
                SummaryWriterCache.clear()
def map_fun(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()

    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            global_step = tf.contrib.framework.get_or_create_global_step()
            global_step_inc = tf.assign_add(global_step, 1)
            # Encode three constant columns as a '|'-delimited CSV string and
            # write it back to the Flink operator on every step.
            input_records = [tf.constant([1, 2, 3]),
                             tf.constant([1.0, 2.0, 3.0]),
                             tf.constant(['1.0', '2.0', '3.0'])]
            out = tff_ops.encode_csv(input_list=input_records, field_delim='|')
            fw = tff_ops.FlinkTFRecordWriter(address=context.toFlink())
            w = fw.write([out])
            is_chief = (index == 0)
            t = time.time()
            try:
                hooks = [tf.train.StopAtStepHook(last_step=50)]
                with tf.train.MonitoredTrainingSession(
                        master=server.target,
                        config=sess_config,
                        is_chief=is_chief,
                        checkpoint_dir="./target/tmp/with_output/" + str(t),
                        hooks=hooks) as mon_sess:
                    while not mon_sess.should_stop():
                        print(index, mon_sess.run([global_step_inc, w]))
                        sys.stdout.flush()
                        time.sleep(1)
            finally:
                SummaryWriterCache.clear()
def map_func(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()
    if "worker" == job_name and 0 == index:
        time.sleep(3)
        print("worker 0 finish!")
        sys.stdout.flush()
    else:
        while True:
            print("hello world!")
            sys.stdout.flush()
            time.sleep(3)
def flink_stream_train(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    export_model_path = tf_context.get_property("model_save_path")
    train(cluster_json, job_name, index, export_model_path,
          tf_context.flink_stream_dataset())
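flink_stream_train only wires the cluster description and the streaming dataset into a train function provided by the model code. Its signature can be read off the call site; the body below is just a sketch of the usual ps/worker split under that assumption:

def train(cluster_json, job_name, index, export_model_path, dataset):
    # Signature inferred from the call above; the body is illustrative only.
    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    if job_name == 'ps':
        server.join()  # parameter servers block and serve variables
        return
    with tf.device(tf.train.replica_device_setter(
            worker_device='/job:worker/task:%d' % index, cluster=cluster)):
        # Build the model on top of `dataset`, train it with a
        # MonitoredTrainingSession(master=server.target), and have the chief
        # export a SavedModel to `export_model_path` when training finishes.
        pass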
def test_source_sink(context):
    tf_context = TFContext(context)
    if 'ps' == tf_context.get_role_name():
        from time import sleep
        while True:
            sleep(1)
    else:
        index = tf_context.get_index()
        job_name = tf_context.get_role_name()
        cluster_json = tf_context.get_tf_cluster()
        cluster = tf.train.ClusterSpec(cluster=cluster_json)
        server = tf.train.Server(cluster, job_name=job_name, task_index=index)
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps", "/job:worker/task:%d" % index])
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            # with flink_server_device(tf_context) as (device, server, sess_config):
            reader = FlinkReader(tf_context)
            writer = FlinkWriter(tf_context)

            with tf.train.ChiefSessionCreator(
                    master=server.target, config=sess_config).create_session() as sess:
                # Echo every batch received from Flink back to the sink until the
                # upstream source is exhausted.
                while True:
                    batch = reader.next_batch(sess)
                    if batch is None:
                        break
                    tf.logging.info("[TF][%s]process %s" % (str(datetime.datetime.now()), str(batch)))
                    writer.write_result(sess, batch)
                writer.close(sess)
                sys.stdout.flush()
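Both this test and the next rely on FlinkReader / FlinkWriter helpers that live alongside the tests and are not shown here. A rough sketch of the interface they expose, assuming the reader is backed by flink_stream_dataset() and the writer by the same FlinkTFRecordWriter op used earlier; treat this as pseudocode for the contract, not the real helpers:

class FlinkReader(object):
    # Illustrative: pulls string records streamed from Flink via a one-shot iterator.
    def __init__(self, context, batch_size=1):
        dataset = context.flink_stream_dataset().batch(batch_size)
        self._next_batch = dataset.make_one_shot_iterator().get_next()

    def next_batch(self, sess):
        try:
            return sess.run(self._next_batch)
        except tf.errors.OutOfRangeError:
            return None  # upstream Flink source finished


class FlinkWriter(object):
    # Illustrative: feeds a string placeholder into the writer op from the
    # earlier snippet. The accessor for the Flink address is assumed here.
    def __init__(self, context):
        self._example = tf.placeholder(dtype=tf.string)
        writer = tff_ops.FlinkTFRecordWriter(address=context.toFlink())
        self._write_op = writer.write([self._example])

    def write_result(self, sess, result):
        sess.run(self._write_op, feed_dict={self._example: result})

    def close(self, sess):
        # The real helper presumably flushes and closes the Flink-side queue here.
        pass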
def test_example_coding_without_encode(context):
    tf_context = TFContext(context)
    if 'ps' == tf_context.get_role_name():
        from time import sleep
        while True:
            sleep(1)
    else:
        index = tf_context.get_index()
        job_name = tf_context.get_role_name()
        cluster_json = tf_context.get_tf_cluster()
        cluster = tf.train.ClusterSpec(cluster=cluster_json)
        server = tf.train.Server(cluster, job_name=job_name, task_index=index)
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps", "/job:worker/task:%d" % index])
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            # with flink_server_device(tf_context) as (device, server, sess_config):
            # reader = FlinkReader(tf_context)
            writer = FlinkWriter(tf_context)

            with tf.train.ChiefSessionCreator(
                    master=server.target, config=sess_config).create_session() as sess:
                # while True:
                #     batch = reader.next_batch(sess)
                #     if batch is None:
                #         break
                #     writer.write_result(sess, batch)
                for i in range(10):
                    writer.write_result(sess, ['output-%d' % i])
                writer.close(sess)
                sys.stdout.flush()
def map_fun(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    task_index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()

    props = context.properties
    batch_size = int(props.get("batch_size"))
    checkpoint_dir = props.get("checkpoint_dir")
    export_dir = props.get("export_dir")

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)

    def feed_dict(images, labels):
        xs = numpy.array(images)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.0
        ys = numpy.array(labels)
        ys = ys.astype(numpy.uint8)
        return (xs, ys)

    if job_name == "ps":
        from time import sleep
        while True:
            sleep(1)
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:" + str(task_index),
                    cluster=cluster)):

            # Placeholders or QueueRunner/Readers for input data
            x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
            y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

            # Variables of the hidden layer
            hid_w = tf.Variable(
                tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                    stddev=1.0 / IMAGE_PIXELS), name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            # Variables of the softmax layer
            sm_w = tf.Variable(
                tf.truncated_normal([hidden_units, 10],
                                    stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.train.get_or_create_global_step()
            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

            iter = input_iter(tf_context, batch_size)
            next_batch = iter.get_next()

            is_chief = (task_index == 0)
            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=False,
                device_filters=["/job:ps", "/job:worker/task:%d" % task_index])

            # The MonitoredTrainingSession takes care of session initialization, restoring from
            # a checkpoint, and closing when done or an error occurs
            mon_sess = tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=is_chief,
                checkpoint_dir=checkpoint_dir,
                stop_grace_period_secs=10,
                max_wait_secs=300,
                config=sess_config,
                chief_only_hooks=[ExportHook(export_dir, x, prediction)])

            processed = 0
            while not mon_sess.should_stop():
                # Run a training step asynchronously
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                try:
                    images, labels = mon_sess.run(next_batch)
                    processed += images.shape[0]
                    # print mon_sess.run(next_batch)
                except tf.errors.OutOfRangeError:
                    break

                batch_xs, batch_ys = feed_dict(images, labels)
                feed = {x: batch_xs, y_: batch_ys}

                if len(batch_xs) > 0 and not mon_sess.should_stop():
                    _, step = mon_sess.run([train_op, global_step], feed_dict=feed)
                    if step % 100 == 0:
                        print("{0}, Task {1} step: {2} accuracy: {3}".format(
                            datetime.now().isoformat(), task_index, step,
                            mon_sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
                        sys.stdout.flush()

            print(str(processed) + " records processed.")
            print("{0} stopping MonitoredTrainingSession".format(datetime.now().isoformat()))
            mon_sess.close()
            SummaryWriterCache.clear()
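The MNIST worker above depends on two helpers the snippet does not define: input_iter, which turns the records streamed from Flink into (images, labels) batches, and ExportHook, a chief-only SessionRunHook that exports the trained model for serving when the session ends. A sketch of what input_iter might look like, assuming each CSV record carries 784 pixel values followed by a 10-element one-hot label (the record layout is an assumption):

def input_iter(tf_context, batch_size):
    # Illustrative only: the real parsing depends on how the Flink job encodes records.
    record_defaults = [[0.0]] * (28 * 28 + 10)

    def _parse(record):
        columns = tf.decode_csv(record, record_defaults=record_defaults)
        pixels = tf.stack(columns[:28 * 28])
        label = tf.stack(columns[28 * 28:])
        return pixels, label

    dataset = tf_context.flink_stream_dataset()
    dataset = dataset.map(_parse).batch(batch_size)
    return dataset.make_one_shot_iterator()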
def entry_func(self, context: Context):
    tf_context = TFContext(context)
    properties = tf_context.properties
    print('properties', properties, flush=True)

    # intra_op_parallelism is set by akdl, because there is a bug in TensorFlow 1.x
    # See: https://stackoverflow.com/questions/34426268/restricting-number-of-cores-used
    intra_op_parallelism = int(properties['ALINK:intra_op_parallelism'])
    if self.engine_type == TF1_TYPE:
        tf_helper.set_intra_op_parallelism(intra_op_parallelism_threads=intra_op_parallelism)
    elif self.engine_type == TF2_TYPE:
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.threading.set_intra_op_parallelism_threads(intra_op_parallelism)

    num_workers = int(properties['ALINK:num_workers'])
    work_dir = properties['ALINK:work_dir']
    cluster, task_type, task_index = tf_context.export_estimator_cluster()

    if self.is_batch():
        java_queue_file = JavaFile(context.from_java(), context.to_java())
        dataset_file = os.path.join(work_dir, 'dataset.tfrecords')
        dataset, dataset_length = io_helper.convert_java_queue_file_to_repeatable_dataset(
            java_queue_file, dataset_file)
        print("number of records: " + str(dataset_length), flush=True)
        dataset_fn: Callable[[], tf.data.TFRecordDataset] = \
            lambda: tf.data.TFRecordDataset(dataset_file)
    else:
        dataset_fn: Callable[[], tf.data.TFRecordDataset] = \
            lambda: tf_context.flink_stream_dataset()
        dataset = None
        dataset_file = None
        dataset_length = None

    saved_model_dir = os.path.join(work_dir, 'savedmodel')

    user_params: Dict = json.loads(properties['ALINK:user_defined_params'])
    for i in range(1, 1024):
        key = "ALINK:bc_" + str(i)
        if key in properties:
            user_params[key] = context.properties[key]

    key = "ALINK:model_dir"
    if key in properties:
        user_params[key] = properties[key]

    output_writer = DirectOutputWriter(tf_context.from_java(), tf_context.to_java())

    locals_copy = locals().copy()
    locals_copy.pop("self")
    print("locals_copy = ", locals_copy, flush=True)

    args = self.construct_args(**locals_copy)

    func = self.get_func_by_name(self.func_name)
    func(args)

    print("task_type = {}, task_index = {}: done tf_user_main".format(task_type, task_index), flush=True)

    local_vars = locals().copy()
    local_vars.pop('self')
    self.post_process(**local_vars)

    print("task_type = {}, task_index = {}: exit".format(task_type, task_index), flush=True)

    output_writer.close()