Example #1
def map_func(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print ("cluster:" + str(cluster_json))
    print ("job name:" + job_name)
    print ("current index:" + str(index))
    sys.stdout.flush()
    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False,
                                 device_filters=["/job:ps", "/job:worker/task:%d" % index])
    t = time.time()
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(tf.train.replica_device_setter(worker_device='/job:worker/task:' + str(index), cluster=cluster)):
            train_ops = build_graph()
            print("python worker index:" + str(index))
            sys.stdout.flush()
            try:
                hooks = [tf.train.StopAtStepHook(last_step=2)]
                with tf.train.MonitoredTrainingSession(master=server.target, config=sess_config,
                                                       checkpoint_dir="./target/tmp/s1/" + str(t),
                                                       hooks=hooks) as mon_sess:
                    while not mon_sess.should_stop():
                        print(mon_sess.run(train_ops, feed_dict={a: [1.0, 2.0, 3.0]}))
                        sys.stdout.flush()
                        time.sleep(1)
            finally:
                SummaryWriterCache.clear()
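Example #1 feeds a placeholder `a` and runs ops returned by `build_graph()`, neither of which is shown above. A minimal sketch of what such a helper could look like follows; the placeholder name, the variable, and the loss are assumptions inferred from the `feed_dict` and the `StopAtStepHook` in the example.

a = None  # module-level placeholder handle shared with map_func (assumed)

def build_graph():
    # Hypothetical graph builder: one placeholder, one variable, and a train op
    # that increments the global step so StopAtStepHook can fire.
    global a
    a = tf.placeholder(tf.float32, shape=[3], name='a')
    b = tf.get_variable('b', shape=[3], initializer=tf.zeros_initializer())
    loss = tf.reduce_sum(tf.square(a - b))
    global_step = tf.train.get_or_create_global_step()
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
        loss, global_step=global_step)
    return [train_op, loss]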
Example #2
def main_on_flink(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()

    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            tf_hyperparameter = context.properties.get('TF_Hyperparameter')

            argv = tf.app.flags.FLAGS(tf_hyperparameter.split(' '),
                                      known_only=True)  # Parse known flags.
            if FLAGS.mode == 'train':
                training_on_flink(tf_context, sess_config, server.target)
            elif FLAGS.mode == 'decode':
                inference_on_flink(tf_context, sess_config, server.target)
            else:
                raise ValueError(
                    "The 'mode' flag must be one of train/eval/decode")
Example #3
def map_fun(context):
    print(tf.__version__)
    sys.stdout.flush()
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()
    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            record_defaults = [[9], [tf.constant(value=9, dtype=tf.int64)],
                               [9.0],
                               [tf.constant(value=9.0, dtype=tf.float64)],
                               ["9.0"]]
            dataset = context.flinkStreamDataSet(buffer_size=0)
            dataset = dataset.map(lambda record: tf.decode_csv(
                record, record_defaults=record_defaults))
            dataset = dataset.batch(3)
            iterator = dataset.make_one_shot_iterator()
            input_records = iterator.get_next()

            global_step = tf.train.get_or_create_global_step()
            global_step_inc = tf.assign_add(global_step, 1)
            out_list = [input_records[0], input_records[2], input_records[4]]
            out = tff_ops.encode_csv(input_list=out_list)
            is_chief = (index == 0)
            t = time.time()
            try:
                with tf.train.MonitoredTrainingSession(
                        master=server.target,
                        is_chief=is_chief,
                        config=sess_config,
                        checkpoint_dir="./target/tmp/input_output/" +
                        str(t)) as mon_sess:
                    # while not mon_sess.should_stop():
                    while True:
                        print(index, mon_sess.run([global_step_inc, out]))
                        sys.stdout.flush()
                        # time.sleep(1)
            except Exception as e:
                print('traceback.print_exc():')
                traceback.print_exc()
                sys.stdout.flush()
            finally:
                SummaryWriterCache.clear()
Example #4
def map_fun(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()
    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):

            global_step = tf.contrib.framework.get_or_create_global_step()
            global_step_inc = tf.assign_add(global_step, 1)
            input_records = [
                tf.constant([1, 2, 3]),
                tf.constant([1.0, 2.0, 3.0]),
                tf.constant(['1.0', '2.0', '3.0'])
            ]
            out = tff_ops.encode_csv(input_list=input_records, field_delim='|')
            fw = tff_ops.FlinkTFRecordWriter(address=context.toFlink())
            w = fw.write([out])
            is_chief = (index == 0)
            t = time.time()
            try:
                hooks = [tf.train.StopAtStepHook(last_step=50)]
                with tf.train.MonitoredTrainingSession(
                        master=server.target,
                        config=sess_config,
                        is_chief=is_chief,
                        checkpoint_dir="./target/tmp/with_output/" + str(t),
                        hooks=hooks) as mon_sess:
                    while not mon_sess.should_stop():
                        print(index, mon_sess.run([global_step_inc, w]))
                        sys.stdout.flush()
                        time.sleep(1)
            finally:
                SummaryWriterCache.clear()
Example #5
def map_func(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()
    if "worker" == job_name and 0 == index:
        time.sleep(3)
        print("worker 0 finish!")
        sys.stdout.flush()
    else:
        while True:
            print("hello world!")
            sys.stdout.flush()
            time.sleep(3)
Example #6
def flink_stream_train(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    export_model_path = tf_context.get_property("model_save_path")
    train(cluster_json, job_name, index, export_model_path, tf_context.flink_stream_dataset())
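flink_stream_train hands everything off to a `train(...)` helper defined elsewhere. A minimal sketch of such a helper, reusing the ClusterSpec/Server pattern from the other examples, is shown below; only the argument list comes from the call above, the body is an assumption.

def train(cluster_json, job_name, index, export_model_path, dataset):
    # Hypothetical trainer: ps tasks block, workers consume the Flink stream dataset.
    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    if job_name == 'ps':
        server.join()
        return
    with tf.device(tf.train.replica_device_setter(
            worker_device='/job:worker/task:%d' % index, cluster=cluster)):
        next_record = dataset.make_one_shot_iterator().get_next()
        global_step = tf.train.get_or_create_global_step()
        step_inc = tf.assign_add(global_step, 1)
    # export_model_path is reused as the checkpoint dir in this sketch.
    with tf.train.MonitoredTrainingSession(master=server.target,
                                           is_chief=(index == 0),
                                           config=None,
                                           checkpoint_dir=export_model_path) as sess:
        while not sess.should_stop():
            try:
                sess.run([step_inc, next_record])
            except tf.errors.OutOfRangeError:
                break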
Example #7
def test_source_sink(context):
    tf_context = TFContext(context)
    if 'ps' == tf_context.get_role_name():
        from time import sleep
        while True:
            sleep(1)
    else:
        index = tf_context.get_index()
        job_name = tf_context.get_role_name()
        cluster_json = tf_context.get_tf_cluster()
        cluster = tf.train.ClusterSpec(cluster=cluster_json)

        server = tf.train.Server(cluster, job_name=job_name, task_index=index)
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % index])
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            # with flink_server_device(tf_context) as (device, server, sess_config):
            reader = FlinkReader(tf_context)
            writer = FlinkWriter(tf_context)

            with tf.train.ChiefSessionCreator(
                    master=server.target,
                    config=sess_config).create_session() as sess:
                while True:
                    batch = reader.next_batch(sess)
                    if batch is None:
                        break
                    tf.logging.info("[TF][%s]process %s" %
                                    (str(datetime.datetime.now()), str(batch)))

                    writer.write_result(sess, batch)
                writer.close(sess)
                sys.stdout.flush()
Example #8
def test_example_coding_without_encode(context):
    tf_context = TFContext(context)
    if 'ps' == tf_context.get_role_name():
        from time import sleep
        while True:
            sleep(1)
    else:
        index = tf_context.get_index()
        job_name = tf_context.get_role_name()
        cluster_json = tf_context.get_tf_cluster()
        cluster = tf.train.ClusterSpec(cluster=cluster_json)

        server = tf.train.Server(cluster, job_name=job_name, task_index=index)
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % index])
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            # with flink_server_device(tf_context) as (device, server, sess_config):
            #     reader = FlinkReader(tf_context)
            writer = FlinkWriter(tf_context)

            with tf.train.ChiefSessionCreator(
                    master=server.target,
                    config=sess_config).create_session() as sess:
                # while True:
                #     batch = reader.next_batch(sess)
                #     if batch is None:
                #         break
                # writer.write_result(sess, batch)
                for i in range(10):
                    writer.write_result(sess, ['output-%d' % i])
                writer.close(sess)
                sys.stdout.flush()
Example #9
def map_fun(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    task_index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()

    props = context.properties
    batch_size = int(props.get("batch_size"))
    checkpoint_dir = props.get("checkpoint_dir")
    export_dir = props.get("export_dir")

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)

    def feed_dict(images, labels):
        xs = numpy.array(images)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.0
        ys = numpy.array(labels)
        ys = ys.astype(numpy.uint8)
        return (xs, ys)

    if job_name == "ps":
        from time import sleep
        while True:
            sleep(1)
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(worker_device="/job:worker/task:" + str(task_index), cluster=cluster)):

            # Placeholders or QueueRunner/Readers for input data
            x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
            y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

            # Variables of the hidden layer
            hid_w = tf.Variable(
                tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], stddev=1.0 / IMAGE_PIXELS),
                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            # Variables of the softmax layer
            sm_w = tf.Variable(
                tf.truncated_normal([hidden_units, 10], stddev=1.0 / math.sqrt(hidden_units)),
                name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.train.get_or_create_global_step()

            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

            train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)

            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

            data_iter = input_iter(tf_context, batch_size)
            next_batch = data_iter.get_next()

            is_chief = (task_index == 0)
            sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False,
                                         device_filters=["/job:ps", "/job:worker/task:%d" % task_index])

            # The MonitoredTrainingSession takes care of session initialization, restoring from
            #  a checkpoint, and closing when done or an error occurs
            mon_sess = tf.train.MonitoredTrainingSession(master=server.target, is_chief=is_chief,
                                                         checkpoint_dir=checkpoint_dir,
                                                         stop_grace_period_secs=10, max_wait_secs=300,
                                                         config=sess_config,
                                                         chief_only_hooks=[ExportHook(export_dir, x,
                                                                                      prediction)])
            processed = 0
            while not mon_sess.should_stop():
                # Run a training step asynchronously
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                try:
                    images, labels = mon_sess.run(next_batch)
                    processed += images.shape[0]
                    # print mon_sess.run(next_batch)
                except tf.errors.OutOfRangeError:
                    break

                batch_xs, batch_ys = feed_dict(images, labels)
                feed = {x: batch_xs, y_: batch_ys}

                if len(batch_xs) > 0 and not mon_sess.should_stop():
                    _, step = mon_sess.run([train_op, global_step], feed_dict=feed)
                    if step % 100 == 0:
                        print("{0}, Task {1} step: {2} accuracy: {3}".format(
                            datetime.now().isoformat(), task_index, step,
                            mon_sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
                        sys.stdout.flush()

            print(str(processed) + " records processed.")
            print("{0} stopping MonitoredTrainingSession".format(datetime.now().isoformat()))
            mon_sess.close()

    SummaryWriterCache.clear()
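The MNIST worker above registers `ExportHook(export_dir, x, prediction)` as a chief-only hook, but the hook itself is not shown. A minimal sketch, assuming the hook exports a SavedModel once training ends (the signature names and the saver handling are assumptions):

class ExportHook(tf.train.SessionRunHook):
    """Hypothetical chief-only hook: export a SavedModel when the session ends."""

    def __init__(self, export_dir, input_tensor, output_tensor):
        self._export_dir = export_dir
        self._input = input_tensor
        self._output = output_tensor

    def begin(self):
        # Build the saver and builder here, before MonitoredTrainingSession
        # finalizes the graph.
        self._saver = tf.train.Saver()
        self._builder = tf.saved_model.builder.SavedModelBuilder(self._export_dir)
        self._signature = tf.saved_model.signature_def_utils.predict_signature_def(
            inputs={'image': self._input}, outputs={'prediction': self._output})

    def end(self, session):
        self._builder.add_meta_graph_and_variables(
            session, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'serving_default': self._signature},
            saver=self._saver)
        self._builder.save()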
Example #10
    def entry_func(self, context: Context):
        tf_context = TFContext(context)
        properties = tf_context.properties
        print('properties', properties, flush=True)

        # intra_op_parallelism is set by akdl, because there is a bug in TensorFlow 1.x
        # See: https://stackoverflow.com/questions/34426268/restricting-number-of-cores-used
        intra_op_parallelism = int(properties['ALINK:intra_op_parallelism'])
        if self.engine_type == TF1_TYPE:
            tf_helper.set_intra_op_parallelism(
                intra_op_parallelism_threads=intra_op_parallelism)
        elif self.engine_type == TF2_TYPE:
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            tf.config.threading.set_intra_op_parallelism_threads(
                intra_op_parallelism)

        num_workers = int(properties['ALINK:num_workers'])
        work_dir = properties['ALINK:work_dir']
        cluster, task_type, task_index = tf_context.export_estimator_cluster()

        if self.is_batch():
            java_queue_file = JavaFile(context.from_java(), context.to_java())
            dataset_file = os.path.join(work_dir, 'dataset.tfrecords')
            dataset, dataset_length = io_helper.convert_java_queue_file_to_repeatable_dataset(
                java_queue_file, dataset_file)
            print("number of records: " + str(dataset_length), flush=True)
            dataset_fn: Callable[
                [], tf.data.TFRecordDataset] = lambda: tf.data.TFRecordDataset(
                    dataset_file)
        else:
            dataset_fn: Callable[
                [], tf.data.
                TFRecordDataset] = lambda: tf_context.flink_stream_dataset()
            dataset = None
            dataset_file = None
            dataset_length = None

        saved_model_dir = os.path.join(work_dir, 'savedmodel')

        user_params: Dict = json.loads(properties['ALINK:user_defined_params'])
        for i in range(1, 1024):
            key = "ALINK:bc_" + str(i)
            if key in properties:
                user_params[key] = context.properties[key]

        key = "ALINK:model_dir"
        if key in properties:
            user_params[key] = properties[key]

        output_writer = DirectOutputWriter(tf_context.from_java(),
                                           tf_context.to_java())

        locals_copy = locals().copy()
        locals_copy.pop("self")
        print("locals_copy = ", locals_copy, flush=True)
        args = self.construct_args(**locals_copy)

        func = self.get_func_by_name(self.func_name)
        func(args)

        print("task_type = {}, task_index = {}: done tf_user_main".format(
            task_type, task_index),
              flush=True)
        local_vars = locals().copy()
        local_vars.pop('self')
        self.post_process(**local_vars)
        print("task_type = {}, task_index = {}: exit".format(
            task_type, task_index),
              flush=True)

        output_writer.close()