Example 1
def export():
    tf.logging.set_verbosity(tf.logging.INFO)
    inp = tf.placeholder(tf.float32, [None], name=INPUT_TENSOR_NAME)
    model_fn(dict(data=inp), None, tf.estimator.ModeKeys.PREDICT)
    sess = get_session()
    tf.train.Saver().save(sess, os.path.join(EXPORT_FOLDER, 'checkpoint.ckpt'))
    tf.train.write_graph(sess.graph_def, EXPORT_FOLDER, 'graph.pbtxt', True)
    sess.close()
    print("Freezing graph")
    lp = get_latest_export()
    ckpt = tf.train.get_checkpoint_state(EXPORT_FOLDER)
    freeze_graph(
        os.path.join(EXPORT_FOLDER, 'graph.pbtxt'),
        None,
        False,
        ckpt.model_checkpoint_path,
        OUTPUT_TENSOR_NAME,
        'save/restore_all',
        'save/Const:0',
        os.path.join(EXPORT_FOLDER, 'frozen.pb'),
        True,
        ''
    )
    input_graph_def = tf.GraphDef()
    with tf.gfile.Open(os.path.join(EXPORT_FOLDER, 'frozen.pb'), "rb") as f:
        input_graph_def.ParseFromString(f.read())
    output_graph = optimize_for_inference(
        input_graph_def,
        [INPUT_TENSOR_NAME],
        [OUTPUT_TENSOR_NAME],
        tf.float32.as_datatype_enum
    )
    with tf.gfile.FastGFile(EXPORTED_MODEL_NAME, 'wb') as f:
        f.write(output_graph.SerializeToString())
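
The snippet above freezes the checkpointed graph and runs optimize_for_inference on it. As a rough sketch (not part of the original source, and assuming the same INPUT_TENSOR_NAME, OUTPUT_TENSOR_NAME and EXPORTED_MODEL_NAME constants), the optimized graph could later be reloaded and run roughly like this:

import tensorflow as tf

def run_exported_graph(samples):
    # Read the serialized, optimized GraphDef written by export().
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(EXPORTED_MODEL_NAME, 'rb') as f:
        graph_def.ParseFromString(f.read())
    # Import it into a fresh graph and look up the input/output tensors.
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, name='')
        inp = graph.get_tensor_by_name(INPUT_TENSOR_NAME + ':0')
        out = graph.get_tensor_by_name(OUTPUT_TENSOR_NAME + ':0')
        with tf.Session(graph=graph) as sess:
            return sess.run(out, feed_dict={inp: samples})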
Example 2
def build_and_run_exports(latest, job_dir, serving_input_fn, hidden_units):
  """Given the latest checkpoint file export the saved model.

  Args:
    latest (string): Latest checkpoint file
    job_dir (string): Location of checkpoints and model files
    serving_input_fn (function): Function returning the serving-time features
      and the inputs dict used to build the export signature
    hidden_units (list): Number of hidden units in each DNN layer
  """

  prediction_graph = tf.Graph()
  with prediction_graph.as_default():
    features, inputs_dict = serving_input_fn()
    prediction_dict = model.model_fn(
        model.PREDICT,
        features.copy(),
        None,  # labels
        hidden_units=hidden_units,
        learning_rate=None  # learning_rate unused in prediction mode
    )
    saver = tf.train.Saver()

  with tf.Session(graph=prediction_graph) as session:
    session.run([tf.local_variables_initializer(), tf.tables_initializer()])
    saver.restore(session, latest)
    saved_model_util.simple_save(
        session, os.path.join(job_dir, 'export'), inputs_dict, prediction_dict)
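
simple_save writes a standard SavedModel under <job_dir>/export. A minimal sketch (not in the original source) of how that export could be reloaded, using the default serving signature that simple_save records:

import os
import tensorflow as tf

def load_exported_model(job_dir):
    with tf.Session(graph=tf.Graph()) as session:
        meta_graph = tf.saved_model.loader.load(
            session,
            [tf.saved_model.tag_constants.SERVING],
            os.path.join(job_dir, 'export'))
        signature = meta_graph.signature_def[
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        # Tensor names recorded for the serving inputs and outputs.
        input_names = {k: v.name for k, v in signature.inputs.items()}
        output_names = {k: v.name for k, v in signature.outputs.items()}
        return input_names, output_names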
Example 3
def test(args):
    # import parameter data
    hps = params_utils.parse_params(params.mnist_classification_args)

    # load dataset
    dataset = load_mnist(show_info=False)

    # inputs setting
    with tf.variable_scope('input'):
        if hps.data_params.is_flattened:
            shape = [None, np.prod(hps.data_params.image_size)]
        else:
            shape = [None] + list(hps.data_params.image_size)
        x = tf.placeholder(tf.float32, shape)
        y = tf.placeholder(tf.float32, [None, 10])
        is_training = tf.placeholder(tf.bool, shape=None)

    # format image
    if hps.data_params.is_flattened:
        if len(hps.data_params.image_size) == 3:
            image_shape = [-1] + list(hps.data_params.image_size)
        elif len(hps.data_params.image_size) == 2:
            image_shape = [-1] + list(hps.data_params.image_size) + [1]
        else:
            raise NotImplementedError('image shape should be NHW or NHWC')
    _x = tf.reshape(x, image_shape)

    # input -> model -> output
    y_hat = model.model_fn(_x, hps, is_training)

    with tf.name_scope('metrics'):
        with tf.name_scope('accuracy'):
            correctness = tf.equal(tf.argmax(y_hat, axis=1), tf.argmax(y, axis=1))
            correctness = tf.cast(correctness, tf.float32)
            accuracy = tf.reduce_mean(correctness)

    saver = tf.train.Saver()
    path_prefix = Path(args.prefix)
    save_path = hps.paths.model_path / path_prefix
    ckpt = tf.train.get_checkpoint_state(save_path)

    with tf.Session() as sess:
        if ckpt:
            print('restore variable')
            last_model = ckpt.model_checkpoint_path
            saver.restore(sess, last_model)
        else:
            raise Exception('no checkpoint found in {}'.format(save_path))

        accs = []
        for step in tqdm(range(hps.hyper_parameters.step)):
            batch = dataset.test.next_batch(hps.hyper_parameters.batch_size)
            acc = sess.run(accuracy,
                           feed_dict={
                               x: batch[0],
                               y: batch[1],
                               is_training: False
                           })
            accs.append(acc)
        print(np.mean(accs))
Example 4
def evaluate(args):
    # Build model
    model = model_fn(
        first_layer=[args.inp_kernel, args.inp_window],
        gcn_layers=[args.n_GC_units] * args.n_GC_layers,
        conv_layer_filters=[int(k) for k in args.conv_kernels.split(',')],
        conv_layer_windows=[int(k) for k in args.conv_windows.split(',')],
        nBins=1250,
        lr=args.lr,
        nMarks=args.n_marks,
        verbose=1
    )

    model.load_weights('v11_temp_model_7.h5')

    # Use the model to predict chromosome 2
    ch = 'chr2'
    res = 1000  # Input resolution: 1000 bp

    # Load epigenetic data
    epi_names = ['ATAC_seq', 'CTCF', 'H3K4me1', 'H3K4me3',
                 'H3K9ac', 'H3K27ac', 'H3K27me3', 'H3K36me3']
    epigenetic_data = load_epigenetic_data([ch], epi_names, args.cell_line)

    path = '/nfs/turbo/umms-drjieliu/proj/4dn/data/microC/high_res_map_project_training_data'

    for pos in range(100000, 242100000 - 249999, 125000):
        print(pos)
        hic = np.load(f'{path}/HiC/{args.cell_line}/{ch}/{ch}_{res}bp_{pos}_{pos + 250000}.npy')
        epi = epigenetic_data[ch][pos // 200 - (args.inp_window // 2): pos // 200 + 1250 + (args.inp_window // 2), :]
        hics = np.array([hic])
        epis = np.array([epi])
        m = model.predict([hics, epis])[0, :, :]  # Model Input: HiC and Epigenetic data
        np.save(f'outputs/pred_{ch}_{res}bp_{pos}.npy', m)
Example 5
def dispatch(train_files,
             eval_files,
             job_dir,
             train_steps,
             eval_steps,
             train_batch_size,
             eval_batch_size,
             learning_rate,
             eval_frequency,
             first_layer_size,
             num_layers,
             scale_factor,
             eval_num_epochs,
             num_epochs,
             checkpoint_epochs):
  retinopathy_model = model.model_fn(CLASS_SIZE)

  try:
    os.makedirs(job_dir)
  except OSError:
    pass

  # Unhappy hack to work around h5py not being able to write to GCS.
  # Force snapshots and saves to local filesystem, then copy them over to GCS.
  checkpoint_path = FILE_PATH
  if not job_dir.startswith("gs://"):
    checkpoint_path = os.path.join(job_dir, checkpoint_path)

  # Model checkpoint callback
  checkpoint = keras.callbacks.ModelCheckpoint(
      checkpoint_path,
      monitor='val_loss',
      verbose=2,
      period=checkpoint_epochs,
      mode='max')

  # Continuous eval callback
  evaluation = ContinuousEval(eval_frequency,
                              job_dir)

  # Tensorboard logs callback
  tblog = keras.callbacks.TensorBoard(
      log_dir=os.path.join(job_dir, 'logs'),
      histogram_freq=0,
      write_graph=True,
      embeddings_freq=0)

  callbacks = [checkpoint, evaluation, tblog]

  X_train, Y_train = model.read_train_data()

  datagen = ImageDataGenerator(
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True)
Example 6
def test_set(x_test_labelled, x_test_unlabelled, y_test_labelled,
             y_test_unlabelled, params):
    '''
    Helper function for training
    '''
    # Prepare input and model
    x_l = tf.placeholder(x_test_labelled.dtype, x_test_labelled.shape)
    y_l = tf.placeholder(y_test_labelled.dtype, y_test_labelled.shape)
    x_u = tf.placeholder(x_test_unlabelled.dtype, x_test_unlabelled.shape)
    y_u = tf.placeholder(y_test_unlabelled.dtype, y_test_unlabelled.shape)
    inputs = test_input_fn(x_l, x_u, y_l, y_u, params['batch_size'],
                           params['buffer_size'])
    model = model_fn(inputs, tf.estimator.ModeKeys.EVAL, params)
    l_accuracy = model['l_accuracy']
    u_accuracy = model['u_accuracy']
    update_l_acc = model['update_l_acc']
    update_u_acc = model['update_u_acc']
    acc_init = model['acc_init']
    loss_sum = 0.0
    total_l_acc = 0.0
    total_u_acc = 0.0
    step = 0
    saver = tf.train.Saver()
    #chkp.print_tensors_in_checkpoint_file(os.path.join(params['model_dir'], 'model.ckpt'), tensor_name='', all_tensors=False, all_tensor_names=True)

    with tf.Session() as sess:
        tf.logging.info('Starting testing')
        sess.run(model['var_init'])
        sess.run(acc_init)
        # Restoring trained weights from training in params['model_dir']
        saver.restore(sess, os.path.join(params['model_dir'], 'model.ckpt'))
        sess.run(
            model['iter_init'], {
                x_l: x_test_labelled,
                x_u: x_test_unlabelled,
                y_l: y_test_labelled,
                y_u: y_test_unlabelled
            })
        #writer = tf.summary.FileWriter(os.path.join(params['model_dir'], 'test_summaries'), sess.graph)
        while True:
            try:
                step += 1
                # Update accuracy values
                _, _, l_acc_val, u_acc_val = \
                    sess.run([update_l_acc, update_u_acc, l_accuracy, u_accuracy])
                total_l_acc = l_acc_val
                total_u_acc = u_acc_val
                if params['log_step']:
                    if step % params['log_step'] == 0:
                        # Log accuracies so far if appropriate
                        tf.logging.info(
                            'Loss: {}; Labelled Accuracy: {}; Unlabelled Accuracy: {}'
                            .format(loss_sum / step, total_l_acc, total_u_acc))
            except tf.errors.OutOfRangeError:
                # At the end of the dataset
                break
    return total_l_acc, total_u_acc
Example 7
def train():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.log_device_placement = True
    sess = tf.Session(config=config)
    tf.keras.backend.set_session(sess)
    model = model_fn((height, width, 3), learning_rate)
    model.fit(training_input_fn, steps_per_epoch=steps_per_epoch, epochs=epochs,
              validation_data=validation_input_fn, validation_steps=1,
              callbacks=[tf.keras.callbacks.TensorBoard(log_dir=log_dir)])
Example 8
def build_and_run_exports(latest, job_dir, name, serving_input_fn,
                          hidden_units):
    """Given the latest checkpoint file export the saved model.

  Args:
    latest (string): Latest checkpoint file
    job_dir (string): Location of checkpoints and model files
    name (string): Name of the checkpoint to be exported. Used in building the
      export path.
    serving_input_fn (function): Function returning the serving-time features
      and the inputs dict used to build the export signature
    hidden_units (list): Number of hidden units in each DNN layer
  """

    prediction_graph = tf.Graph()
    exporter = tf.saved_model.builder.SavedModelBuilder(
        os.path.join(job_dir, 'export', name))
    with prediction_graph.as_default():
        features, inputs_dict = serving_input_fn()
        prediction_dict = model.model_fn(
            model.PREDICT,
            features,
            None,  # labels
            hidden_units=hidden_units,
            learning_rate=None  # learning_rate unused in prediction mode
        )
        saver = tf.train.Saver()

        inputs_info = {
            name: tf.saved_model.utils.build_tensor_info(tensor)
            for name, tensor in inputs_dict.items()
        }
        output_info = {
            name: tf.saved_model.utils.build_tensor_info(tensor)
            for name, tensor in prediction_dict.items()
        }
        signature_def = tf.saved_model.signature_def_utils.build_signature_def(
            inputs=inputs_info,
            outputs=output_info,
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)

    with tf.Session(graph=prediction_graph) as session:
        session.run(
            [tf.local_variables_initializer(),
             tf.tables_initializer()])
        saver.restore(session, latest)
        exporter.add_meta_graph_and_variables(
            session,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                signature_def
            },
        )

    exporter.save()
Example 9
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             train_batch_size, eval_batch_size, learning_rate, eval_frequency,
             first_layer_size, num_layers, scale_factor, eval_num_epochs,
             num_epochs, checkpoint_epochs):
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)

    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor='val_loss',
                                                 verbose=1,
                                                 period=checkpoint_epochs,
                                                 mode='max')

    # Continuous eval callback
    evaluation = ContinuousEval(eval_frequency,
                                eval_files,
                                learning_rate,
                                job_dir,
                                steps=train_steps)

    # Tensorboard logs callback
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        histogram_freq=0,
                                        write_graph=True,
                                        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tblog]

    census_model.fit_generator(model.generator_input(train_files,
                                                     chunk_size=CHUNK_SIZE),
                               steps_per_epoch=train_steps,
                               epochs=num_epochs,
                               callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if job_dir.startswith("gs://"):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(job_dir, CENSUS_MODEL))

    # Convert the Keras model to TensorFlow SavedModel
    model.to_savedmodel(census_model, os.path.join(job_dir, 'export'))
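
copy_file_to_gcs is called above but not defined in this snippet. A plausible implementation (a sketch, not taken from the original source) simply streams the local file into the GCS job directory via tf.gfile, which understands gs:// paths:

import os
import tensorflow as tf

def copy_file_to_gcs(job_dir, file_path):
    # Read the local file and write it under the (possibly gs://) job_dir.
    with tf.gfile.GFile(file_path, 'rb') as input_f:
        with tf.gfile.GFile(os.path.join(job_dir, file_path), 'wb') as output_f:
            output_f.write(input_f.read())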
Example 10
def build_and_run_exports(latest, job_dir, serving_input_fn, hidden_units):
  """Given the latest checkpoint file export the saved model.

  Args:
    latest (string): Latest checkpoint file
    job_dir (string): Location of checkpoints and model files
    serving_input_fn (function): Function returning the serving-time features
      and the inputs dict used to build the export signature
    hidden_units (list): Number of hidden units in each DNN layer
  """

  prediction_graph = tf.Graph()
  exporter = tf.saved_model.builder.SavedModelBuilder(
      os.path.join(job_dir, 'export'))
  with prediction_graph.as_default():
    features, inputs_dict = serving_input_fn()
    prediction_dict = model.model_fn(
        model.PREDICT,
        features.copy(),
        None,  # labels
        hidden_units=hidden_units,
        learning_rate=None  # learning_rate unused in prediction mode
    )
    saver = tf.train.Saver()

    inputs_info = {
        name: tf.saved_model.utils.build_tensor_info(tensor)
        for name, tensor in inputs_dict.items()
    }
    output_info = {
        name: tf.saved_model.utils.build_tensor_info(tensor)
        for name, tensor in prediction_dict.items()
    }
    signature_def = tf.saved_model.signature_def_utils.build_signature_def(
        inputs=inputs_info,
        outputs=output_info,
        method_name=sig_constants.PREDICT_METHOD_NAME
    )

  with tf.Session(graph=prediction_graph) as session:
    session.run([tf.local_variables_initializer(), tf.tables_initializer()])
    saver.restore(session, latest)
    exporter.add_meta_graph_and_variables(
        session,
        tags=[tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            sig_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def
        },
        legacy_init_op=main_op()
    )

  exporter.save()
Example 11
def train_and_evaluate(args):
    # Build model
    model = model_fn(
        first_layer=[args.inp_kernel, args.inp_window],
        gcn_layers=[args.n_GC_units] * args.n_GC_layers,
        conv_layer_filters=[int(k) for k in args.conv_kernels.split(',')],
        conv_layer_windows=[int(k) for k in args.conv_windows.split(',')],
        nBins=1250,
        lr=args.lr,
        nMarks=args.n_marks,
        verbose=1)

    # Load chromosomes
    st_ed_pos = chromosome_sizes[args.cell_line]
    chromosomes = ['chr' + elm for elm in args.chrs.split(',')
                   ] if args.chrs.lower() != 'all' else st_ed_pos.keys()

    # Load resolutions
    resolutions = [int(elm) for elm in args.inp_resolutions.split(',')]

    # Load epigenetic data
    epi_names = [
        'ATAC_seq', 'CTCF', 'H3K4me1', 'H3K4me3', 'H3K9ac', 'H3K27ac',
        'H3K27me3', 'H3K36me3'
    ]
    epigenetic_data = load_epigenetic_data(args.cell_line, chromosomes,
                                           epi_names)
    # epigenetic_data = load_processed_epigenetic_data_2(chromosomes, epi_names, args.cell_line)

    for i in range(args.epochs):
        print('Epoch', i, ':')
        t1 = time.time()
        for (epi, hics), micros in generate_batches(args.cell_line,
                                                    chromosomes, resolutions,
                                                    epi_names, epigenetic_data,
                                                    args.batch_size,
                                                    args.inp_window):
            t2 = time.time()
            print(' - Loading data:', t2 - t1, 's')
            model.train_on_batch([hics, epi], micros)
            t3 = time.time()
            print(' - Training:', t3 - t2, 's')
            mse = model.evaluate([hics, epi],
                                 micros,
                                 batch_size=args.batch_size,
                                 verbose=0)
            t1 = time.time()
            print(' - Evaluating:', t1 - t3, 's')
            print(' - MSE:', mse)

        if i % args.checkpoint_frequency == 0:
            model.save_weights('temp_model_{0}.h5'.format(i))
Example 12
def load_model(args):
    model = model_fn(
        first_layer=[args.inp_kernel, args.inp_window],
        gcn_layers=[args.n_GC_units] * args.n_GC_layers,
        conv_layer_filters=[int(k) for k in args.conv_kernels.split(',')],
        conv_layer_windows=[int(k) for k in args.conv_windows.split(',')],
        nBins=1250,
        lr=args.lr,
        nMarks=args.n_marks,
        verbose=1)

    model.load_weights('v16_temp_model_11.h5')
    return model
Example 13
    def _run_export(self):

        export_dir = 'export_ckpt_' + re.findall(r'\d+',
                                                 self._latest_checkpoint)[-1]
        tf.logging.info('Exporting model from checkpoint {0}'.format(
            self._latest_checkpoint))
        prediction_graph = tf.Graph()
        try:
            exporter = tf.saved_model.builder.SavedModelBuilder(
                os.path.join(self._checkpoint_dir, export_dir))
        except IOError:
            tf.logging.info(
                'Checkpoint {0} already exported, continuing...'.format(
                    self._latest_checkpoint))
            return

        with prediction_graph.as_default():
            image, name, inputs_dict = model.serving_input_fn()
            prediction_dict = model.model_fn(model.PREDICT, name, image, None,
                                             6, None)

            saver = tf.train.Saver()

            inputs_info = {
                name: tf.saved_model.utils.build_tensor_info(tensor)
                for name, tensor in inputs_dict.items()
            }

            output_info = {
                name: tf.saved_model.utils.build_tensor_info(tensor)
                for name, tensor in prediction_dict.items()
            }

            signature_def = tf.saved_model.signature_def_utils.build_signature_def(
                inputs=inputs_info,
                outputs=output_info,
                method_name=sig_constants.PREDICT_METHOD_NAME)

        with tf.Session(graph=prediction_graph) as session:
            saver.restore(session, self._latest_checkpoint)
            exporter.add_meta_graph_and_variables(
                session,
                tags=[tf.saved_model.tag_constants.SERVING],
                signature_def_map={
                    sig_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                    signature_def
                },
                legacy_init_op=my_main_op())

        exporter.save()
Example 14
def dispatch(train_files, eval_files, job_dir, learning_rate, eval_frequency,
             num_epochs, checkpoint_epochs):

    # setting the seed for reproducibility
    np.random.seed(13)

    forecast_model = model.model_fn()

    scaler = model.build_scaler(train_files + eval_files)

    try:
        os.makedirs(job_dir)
    except Exception as e:
        print(e)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 verbose=1,
                                                 period=checkpoint_epochs)

    # Continuous eval callback
    with ContinuousEval(eval_frequency, eval_files, learning_rate, job_dir,
                        scaler) as evaluation:

        # Tensorboard logs callback
        tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(
            job_dir, 'logs'),
                                            histogram_freq=0,
                                            write_graph=True,
                                            embeddings_freq=0)

        callbacks = [checkpoint, evaluation, tblog]

        x, y = model.load_features(train_files, scaler)
        forecast_model.fit(x, y, epochs=num_epochs, callbacks=callbacks)

        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem, then copy them over to GCS.
        if job_dir.startswith("gs://"):
            forecast_model.save(MODEL_FILENAME)
            copy_file_to_gcs(job_dir, MODEL_FILENAME)
        else:
            forecast_model.save(os.path.join(job_dir, MODEL_FILENAME))
Example 15
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             train_batch_size, eval_batch_size, learning_rate, eval_frequency,
             first_layer_size, num_layers, scale_factor, eval_num_epochs,
             num_epochs, checkpoint_epochs):
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)

    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    # Model checkpoint callback
    checkpoint = keras.callbacks.ModelCheckpoint(os.path.join(
        job_dir, FILE_PATH),
                                                 monitor='val_loss',
                                                 verbose=1,
                                                 period=checkpoint_epochs,
                                                 mode='max')

    # Continuous eval callback
    evaluation = ContinuousEval(eval_frequency, eval_files, learning_rate,
                                job_dir)

    # Tensorboard logs callback
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        histogram_freq=0,
                                        write_graph=True,
                                        embeddings_freq=0)

    # TODO: This needs to be fixed in h5py so that writes to GCS are possible
    # Don't attempt to create checkpoints on Cloud ML Engine for now because
    # h5py doesn't come with native GCS write capability
    if job_dir.startswith('gs://'):
        callbacks = [evaluation, tblog]
    else:
        callbacks = [checkpoint, evaluation, tblog]

    start_time = time.time()

    census_model.fit_generator(model.generator_input(train_files,
                                                     chunk_size=CHUNK_SIZE),
                               steps_per_epoch=train_steps,
                               epochs=num_epochs,
                               callbacks=callbacks)

    print "\nTime used.", time.time() - start_time

    census_model.save(os.path.join(job_dir, CENSUS_MODEL))
Example 16
def run(args):
    ms.context.set_context(
        mode=ms.context.GRAPH_MODE,
        device_target=args.device,
        save_graphs=False,
    )

    net = model_fn()

    loss = ms.nn.loss.SoftmaxCrossEntropyWithLogits(
        sparse=True,
        reduction='mean',
    )
    opt = build_optimizer(args, net)

    if args.mode == 'init':
        save_checkpoint(
            net,
            ckpt_file_name=os.path.join('seeds', '%d.ckpt' % (time.time())),
        )

    if args.mode == 'train':
        ds_train = create_dataset(
            data_path=os.path.join(args.data_path, 'train'),
            batch_size=args.device_batch_size,
        )

        if args.init_ckpt:
            print('using init checkpoint %s' % (args.init_ckpt))
            load_ckpt(net, args.init_ckpt)
        train(args, net, loss, opt, ds_train)

    if args.mode == 'test':
        ds_test = create_dataset(
            data_path=os.path.join(args.data_path, 'test'),
            batch_size=args.device_batch_size,
        )

        if args.ckpt_files:
            checkpoints = args.ckpt_files.split(',')
        else:
            steps = [10, 20, 30, 40]
            checkpoints = [get_ckpt_file_name(args, i) for i in steps]
        print('will test %d checkpoints' % (len(checkpoints)))
        # for i, n in enumerate(checkpoints):
        #     print('[%d]=%s' % (i, n))
        test(args, net, loss, opt, ds_test, checkpoints)
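
build_optimizer is referenced above but not shown here. A minimal sketch of what such a factory might look like (the Momentum optimizer, args.lr and the momentum value are assumptions, not taken from the original source):

def build_optimizer(args, net):
    # Plain SGD with momentum over all trainable parameters of the network.
    return ms.nn.Momentum(net.trainable_params(),
                          learning_rate=args.lr,
                          momentum=0.9)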
Example 17
def train_and_evaluate(hparams):
  img_shape, num_classes, test_input_fn, train_input_fn = get_dataset(hparams.dataset)  

  model = model_fn(img_shape, num_classes, hparams.learning_rate)

#  model.summary()
  model.load_weights('my_model_weights.h5')
  strategy = {
          'mirror': MirroredStrategy(num_gpus=hparams.num_gpus, prefetch_on_device=True),
          'collective': CollectiveAllReduceStrategy(num_gpus_per_worker=hparams.num_gpus)
          }[hparams.dist]

#  config = tf.estimator.RunConfig(train_distribute=strategy, save_checkpoints_steps=500)
   
  dist_config = tf.contrib.distribute.DistributeConfig(
           train_distribute=strategy,
           eval_distribute=strategy,
           remote_cluster=None
           )
  session_config = tf.ConfigProto(
           inter_op_parallelism_threads=0,
           intra_op_parallelism_threads=0,
           allow_soft_placement=True
           )
  config = tf.estimator.RunConfig(
           save_checkpoints_steps=2000,
           session_config=session_config,
           experimental_distribute=dist_config
           )

  estimator = tf.keras.estimator.model_to_estimator(model,
          model_dir=hparams.job_dir,
          config=config)
  
  train_spec = tf.estimator.TrainSpec(
          input_fn=lambda: train_input_fn(hparams.batch_size),
          max_steps=8000)

  eval_spec = tf.estimator.EvalSpec(
          input_fn=lambda: test_input_fn(hparams.batch_size),
          steps=50,
          start_delay_secs=10,
          throttle_secs=10)

  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example 18
def evaluate_files(pattern, x_splits=7, y_splits=5):
    """
       Evaluates the images matching the pattern and outputs the results as a generator
    """
    image_reader = tf.WholeFileReader()
    sip = tf.train.string_input_producer(
        tf.train.match_filenames_once(pattern), num_epochs=1)
    image_name, image_file = image_reader.read(sip)
    image = tf.image.decode_png(image_file)
    images = []
    shape = tf.shape(image)
    height, width = shape[0], shape[1]
    dx = (width - CROPPED_IMAGE_SIZE) / (x_splits - 1)
    dy = (height - CROPPED_IMAGE_SIZE) / (y_splits - 1)
    for x in range(x_splits):
        for y in range(y_splits):
            images.append(
                tf.image.crop_to_bounding_box(
                    image,
                    tf.minimum(tf.to_int32(dy * y),
                               height - CROPPED_IMAGE_SIZE),
                    tf.minimum(tf.to_int32(dx * x),
                               width - CROPPED_IMAGE_SIZE), CROPPED_IMAGE_SIZE,
                    CROPPED_IMAGE_SIZE))
    images = tf.reshape(
        tf.to_float(images) / 255.0,
        (-1, CROPPED_IMAGE_SIZE, CROPPED_IMAGE_SIZE, 3))
    nn = model_fn(dict(input=images), None, tf.estimator.ModeKeys.PREDICT)
    preds = tf.reshape(nn.predictions['predictions'], (x_splits, y_splits))
    sess = get_session()
    coord = tf.train.Coordinator()
    tf.train.start_queue_runners(sess, coord)
    try:
        while True:
            name, predictions = sess.run([image_name, preds])
            yield name, predictions
    except KeyboardInterrupt:
        pass
    except tf.errors.OutOfRangeError:
        pass
    finally:
        coord.request_stop()
        coord.join()
        sess.close()
Example 19
def create_model(args):
    model = keras.Model(*model_fn(args))
    if args.nb_ipus_per_replica > 1:
        set_pipeline_options(model, args)
        model.print_pipeline_stage_assignment_summary()
    elif args.nb_ipus_per_replica == 1:
        model.set_gradient_accumulation_options(
            gradient_accumulation_steps_per_replica=args.
            gradient_accumulation_count,
            offload_weight_update_variables=False)
    model.compile(
        optimizer=get_optimizer(args),
        loss=dice_ce_loss,
        # Number of micro batches to process sequentially in a single execution
        steps_per_execution=args.steps_per_execution
        if args.nb_ipus_per_replica > 0 else None,
        metrics=[dice_coef_accuracy_fn, ce_loss])

    return model
Example 20
def evaluate():
    tf.logging.set_verbosity(tf.logging.INFO)
    input_dict, label_dict = input_fn()
    nn = model_fn(input_dict, None, tf.estimator.ModeKeys.PREDICT)
    stats = PredictionStats()
    sess = get_session()
    coord = tf.train.Coordinator()
    tf.train.start_queue_runners(sess, coord)
    try:
        while True:
            pred, act = sess.run(
                [nn.predictions['predictions'], label_dict['labels']])
            stats.add_predictions(pred, act)
            print('Predictions: %00000d, Accuracy: %.4f' %
                  (stats.get_amount(), stats.get_accuracy()))
    except KeyboardInterrupt:
        pass
    finally:
        coord.request_stop()
        coord.join()
        sess.close()
    print()
    stats.print_result()
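
PredictionStats is used above but not defined in this snippet. A minimal sketch of such a helper (the class layout is an assumption based on how it is called above):

import numpy as np

class PredictionStats(object):
    def __init__(self):
        self.correct = 0
        self.total = 0

    def add_predictions(self, predictions, actuals):
        # Accumulate element-wise agreement between predictions and labels.
        predictions = np.asarray(predictions).reshape(-1)
        actuals = np.asarray(actuals).reshape(-1)
        self.correct += int(np.sum(predictions == actuals))
        self.total += predictions.size

    def get_amount(self):
        return self.total

    def get_accuracy(self):
        return self.correct / float(self.total) if self.total else 0.0

    def print_result(self):
        print('Total predictions: %d, accuracy: %.4f'
              % (self.get_amount(), self.get_accuracy()))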
Example 21
class ModelWrapper(object):
    config = load_config()
    model = model_fn(config)
    model_path = '../model/best/%s-%d' % (config.model_name, 5)
    model.load_weights(model_path)
    # Hack: This fixes a keras bug with TF backend in async environment,
    # see https://github.com/keras-team/keras/issues/2397 for details.
    graph = tf.get_default_graph()
    print('successfully loaded model from: %s' % model_path)

    @classmethod
    def normalize(cls, img):
        """
        cropping and zero-centering
        """
        img = imutils.crop(img, std_size=(cls.config.width, cls.config.height))
        img = (img - 128.0) / 255.0
        return np.array([img])

    @classmethod
    def predict(cls, img):
        with cls.graph.as_default():
            X = cls.normalize(img)
            return id2building[np.argmax(cls.model.predict(X)[0])]
Example 22
                        type=str,
                        choices=['one', 'all'],
                        required=True)
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = get_args()

    X = np.load(X_path)
    y = np.load(y_path)
    ids = np.load(ids_path)
    config = load_config()

    model = model_fn(config)

    if args.mode == 'all':
        for epoch in range(20):
            model.load_weights('../model/%s-%d' % (config.model_name, epoch))

            score = model.evaluate(X, y)
            print('epoch:', epoch)
            print('score:', score)
    else:
        model.load_weights('../model/best/%s-%d' %
                           (config.model_name, args.epoch))

        score = model.evaluate(X, y)
        print('score:', score)
Example 23
def run():

    config = Config()  # Load configs

    save_path = 'keras_models/keras_model'  # Model save path

    x_train_path = 'data/xtrain.txt'
    x_test_path = 'data/xtest.txt'
    y_train_path = 'data/ytrain.txt'

    x_idx = prep.Indexer()

    X = prep.read_file(x_train_path, raw=True)
    y = prep.read_file(y_train_path, label=True)

    t = CountVectorizer(analyzer='char',
                        ngram_range=(config.ngram, config.ngram))
    t.fit(X)
    X = prep.transform(X, t, x_idx)
    X = np.array(pad_sequences(X, config.maxlen))

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=config.test_size, shuffle=config.shuffle)

    #############################################
    # Train model
    print("BEGINNING TRAINING")
    tsv_logger = CSVLogger('training-data.tsv', append=True, separator='\t')

    m = model_fn(config=config, input_length=x_idx.max_number() + 1)

    # m = load_model(save_path)

    m.fit(x_train,
          y_train,
          epochs=config.n_epochs,
          batch_size=config.batch_size,
          verbose=1,
          shuffle=True,
          callbacks=[tsv_logger],
          validation_data=(x_test, y_test))

    m.save(save_path)

    print("MODEL REPORT")
    score, acc = m.evaluate(x_test, y_test)

    print("\nSCORE: ", score)
    print("ACCURACY: ", acc)

    pred = [np.argmax(label) for label in m.predict(x_test)]

    report = classification_report(y_test, pred)

    print(report)

    ###############################################
    # Predict and write labels for xtest.txt

    print("PREDICTION")

    X = prep.read_file(x_test_path, raw=True)
    X = prep.transform(X, t, x_idx, add_if_new=False)
    X = np.array(pad_sequences(X, config.maxlen))

    pred = [np.argmax(label) for label in m.predict(X)]

    with open("".join(["keras_prediction/ytest.txt"]), "w+",
              encoding="utf-8") as rec:
        for label in pred:
            rec.write("%s\n" % label)

        rec.close()
Example 24
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
    """Run the training and evaluation graph.
  Args:
    target (string): Tensorflow server target
    is_chief (bool): Boolean flag to specify a chief server
    train_steps (int): Maximum number of training steps
    eval_steps (int): Number of steps to run evaluation for at each checkpoint.
      If eval_steps is None, evaluation will run for 1 epoch.
    job_dir (string): Output dir for checkpoint and summary
    train_files (string): List of CSV files to read train data
    eval_files (string): List of CSV files to read eval data
    train_batch_size (int): Batch size for training
    eval_batch_size (int): Batch size for evaluation
    learning_rate (float): Learning rate for Gradient Descent
    eval_frequency (int): Run evaluation every n training steps.
      Do not evaluate too frequently, or you will pay a performance cost;
      do not evaluate too infrequently, or you will not know how early
      training could be stopped. Start with the default value.
    first_layer_size (int): Size of the first DNN layer
    num_layers (int): Number of hidden layers in the DNN
    scale_factor (float): Decay rate for the size of hidden layers
    num_epochs (int): Maximum number of training data epochs on which to train
    export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. The input format
      for the exported SavedModel binary.
  """

    # Calculate the number of hidden units
    hidden_units = [
        max(2, int(first_layer_size * scale_factor**i))
        for i in range(num_layers)
    ]
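    # For example, first_layer_size=100, num_layers=4 and scale_factor=0.5
    # give hidden_units == [100, 50, 25, 12].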

    # If this server is the chief (the `master` task).
    # In between-graph replication, the chief is the one node in
    # the cluster with extra responsibilities; by default it is
    # worker task zero. Here the master is assigned as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and motivation about chief.
    if is_chief:
        tf.logging.info("Created DNN hidden units {}".format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():

            # Features and label tensors
            features, labels = model.input_fn(
                eval_files,
                num_epochs=None if eval_steps else 1,
                batch_size=eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics
            # model.model_fn returns the dict when EVAL mode
            metric_dict = model.model_fn(model.EVAL,
                                         features.copy(),
                                         labels,
                                         hidden_units=hidden_units,
                                         learning_rate=learning_rate)

        hooks = [
            EvaluationRunHook(
                job_dir,
                metric_dict,
                evaluation_graph,
                eval_frequency,
                eval_steps=eval_steps,
            )
        ]
    else:
        hooks = []

    # Create a new graph and specify that as default
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):

            # Features and label tensors as read using filename queue
            features, labels = model.input_fn(train_files,
                                              num_epochs=num_epochs,
                                              batch_size=train_batch_size)

            # Returns the training graph and global step tensor
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        # Creates a MonitoredSession for training
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as session:
            # Global step to keep track of global number of steps particularly in
            # distributed setting
            step = global_step_tensor.eval(session=session)

            # Run the training graph which returns the step number as tracked by
            # the global step tensor.
            # When train epochs is reached, session.should_stop() will be true.
            while (train_steps is None
                   or step < train_steps) and not session.should_stop():
                step, _ = session.run([global_step_tensor, train_op])

        # Find the filename of the latest saved checkpoint file
        latest_checkpoint = tf.train.latest_checkpoint(job_dir)

        # Only perform this if chief
        if is_chief:
            build_and_run_exports(latest_checkpoint, job_dir,
                                  model.SERVING_INPUT_FUNCTIONS[export_format],
                                  hidden_units)
Example 25
    # Load Vocabularies
    words = tf.contrib.lookup.index_table_from_file(path_words,
                                                    num_oov_buckets=1)

    # Create the input data pipeline
    logging.info("Creating the datasets...")
    train_sentences = load_dataset_from_text(path_train_sentences)

    eval_sentences = load_dataset_from_text(path_eval_sentences)

    # Specify other parameters for the dataset and the model
    params.eval_size = params.dev_size
    params.buffer_size = params.train_size  # buffer size for shuffling
    params.id_pad_word = words.lookup(tf.constant(params.pad_word))

    # Create the two iterators over the two datasets
    train_inputs = input_fn('train', train_sentences, words, params)
    eval_inputs = input_fn('eval', eval_sentences, words, params)
    logging.info("- done.")

    # Define the models (2 different set of nodes that share weights for train and eval)
    logging.info("Creating the model...")
    train_model_spec = model_fn('train', train_inputs, params)
    eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True)
    logging.info("- done.")

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
    train_and_evaluate(train_model_spec, eval_model_spec, args.model_dir,
                       params, args.restore_dir)
Example 26
    def build_model(self):
        x_shape = self.train_iter.get_shape("x")

        self.model = model_fn(x_shape=x_shape, rnn=self.config.rnn)
Example 27
def main():
    parser = argparse.ArgumentParser(
        description='train the model for all model')
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--model_dir', type=str, default='training_model')
    parser.add_argument('--n_threads', type=int, default=4)

    args = parser.parse_args()

    epochs = args.epochs
    batch_size = args.batch_size
    model_dir = args.model_dir
    input_shape = [28, 28, 1]
    num_classes = 10
    mnist_generator = MnistDataGenerator(args.batch_size)
    custom_runner = CustomRunner(input_shape, num_classes, args.batch_size,
                                 mnist_generator.train_iterator_heavy)

    images, labels = custom_runner.get_inputs()
    train_inputs = {'x': images, 'y': labels}
    valid_inputs = {
        'x': tf.placeholder(tf.float32, [None] + input_shape),
        'y': tf.placeholder(tf.float32, [None, num_classes])
    }

    train_model_spec = model_fn(train_inputs, is_train=True)
    valid_model_spec = model_fn(valid_inputs, reuse=True, is_train=False)

    os.makedirs(model_dir, exist_ok=True)
    set_logger(os.path.join(model_dir, 'train.log'))
    save_dir = os.path.join(model_dir, 'weights')
    save_path = os.path.join(save_dir, 'epoch')
    begin_at_epoch = 0
    steps_per_epoch = mnist_generator.num_train_sample // batch_size

    with tf.Session() as sess:
        saver = tf.train.Saver(max_to_keep=5)  # will keep last 5 epochs
        sess.run(tf.global_variables_initializer())

        tf.train.start_queue_runners(sess=sess)
        custom_runner.start_threads(sess, n_threads=args.n_threads)

        if os.path.isdir(save_dir):
            restore_from = tf.train.latest_checkpoint(save_dir)
            begin_at_epoch = int(restore_from.split('-')[-1])
            saver.restore(sess, restore_from)
            epochs += begin_at_epoch

        for epoch in range(begin_at_epoch, epochs):
            logging.info('Epoch {}/{}'.format(epoch + 1, epochs))
            train_loss, train_acc = train(sess, train_model_spec,
                                          steps_per_epoch)
            valid_loss, valid_acc = eval(sess, valid_model_spec,
                                         mnist_generator.test_iterator)
            logging.info('train/acc: {:.4f}, train/loss: {:.4f}'.format(
                train_acc, train_loss))
            logging.info('valid/acc: {:.4f}, valid/loss: {:.4f}'.format(
                valid_acc, valid_loss))
            saver.save(sess, save_path, global_step=epoch + 1)
Example 28
metrics = {}

# Count batches
print('Counting batches')
num_batches = 0
for _ in batch_generator(hparams, 'test'):
    num_batches += 1
print(num_batches, 'batches')

for mode in ['eval', 'eval_sample']:
    # Model
    tf.reset_default_graph()
    tf.set_random_seed(0)
    inputs = tf.placeholder(tf.int32, [None, None])
    labels = tf.placeholder(tf.int32, [None, None])
    probs, losses = model_fn(inputs, hparams, mode, labels)

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # This hack helps to run evaluation a second time with a different graph
    gc.collect()

    # Evaluate model on test set
    with tf.Session() as sess:
        # Load model
        saver.restore(sess, os.path.join(hparams['model_path'], 'model.ckpt'))
        # Get batches from generator
        with PG(batch_generator(hparams, 'test'), 10) as g:
            g = GeneratorLen(g, num_batches)  # use progressbar
            for batch_x, batch_y in tqdm(g):
Example 29
def train(args: Dict):
    # import parameter data
    hps = params_utils.parse_params(params.mnist_classification_args)

    # load dataset
    dataset = load_mnist(show_info=False)

    # inputs setting
    with tf.variable_scope('input'):
        if hps.data_params.is_flattened:
            shape = [None, np.prod(hps.data_params.image_size)]
        else:
            shape = [None] + list(hps.data_params.image_size)
        x = tf.placeholder(tf.float32, shape)
        y = tf.placeholder(tf.float32, [None, 10])
        is_training = tf.placeholder(tf.bool, shape=None)

    # format image
    if hps.data_params.is_flattened:
        if len(hps.data_params.image_size) == 3:
            image_shape = [-1] + list(hps.data_params.image_size)
        elif len(hps.data_params.image_size) == 2:
            image_shape = [-1] + list(hps.data_params.image_size) + [1]
        else:
            raise NotImplementedError('image shape should be NHW or NHWC')
    _x = tf.reshape(x, image_shape)

    # input -> model -> output
    y_hat = model.model_fn(_x, hps, is_training)

    # setup metrics
    with tf.name_scope('metrics'):
        with tf.name_scope('accuracy'):
            correctness = tf.equal(tf.argmax(y_hat, axis=1), tf.argmax(y, axis=1))
            correctness = tf.cast(correctness, tf.float32)
            accuracy = tf.reduce_mean(correctness)
        with tf.name_scope('loss'):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=y, logits=y_hat)
            loss = tf.reduce_mean(cross_entropy)

        # note:
        # xxx is THE value, xxx_op is the OPERATION to update xxx
        with tf.name_scope('train'):
            train_loss, train_loss_op = tf.metrics.mean(loss,
                                                        name='train_loss')
            train_acc, train_acc_op = tf.metrics.mean(accuracy,
                                                      name='train_acc')
            tf.summary.scalar('loss', train_loss, collections=['train'])
            tf.summary.scalar('acc', train_acc, collections=['train'])

        with tf.name_scope('val'):
            val_loss, val_loss_op = tf.metrics.mean(loss, name='val_loss')
            val_acc, val_acc_op = tf.metrics.mean(accuracy, name='val_acc')
            tf.summary.scalar('loss', val_loss, collections=['val'])
            tf.summary.scalar('acc', val_acc, collections=['val'])

        # metrics initializer
        train_metrics_initialize_op = tf.variables_initializer(
            [var for var in tf.local_variables() if 'train/' in var.name])
        val_metrics_initialize_op = tf.variables_initializer(
            [var for var in tf.local_variables() if 'val/' in var.name])

        # gathered summary operation
        train_summary_op = tf.summary.merge_all('train')
        val_summary_op = tf.summary.merge_all('val')

    # optimizer settings
    with tf.name_scope('optimizer'):
        global_step = tf.Variable(0, trainable=False, name='global_step')
        learning_rate = hps.hyper_parameters.learning_rate
        if hps.hyper_parameters.optimizer == model.Optimizer.ADAM:
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        elif hps.hyper_parameters.optimizer == model.Optimizer.SGD:
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        else:
            raise NotImplementedError('optimizer is in {}'.format(
                list(model.Optimizer)))
        train_step = optimizer.minimize(loss, global_step=global_step)

    init_op = tf.global_variables_initializer()
    local_init_op = tf.local_variables_initializer()

    saver = tf.train.Saver()
    path_prefix = Path(args.prefix)
    save_path = hps.paths.model_path / path_prefix
    save_path.mkdir(parents=True, exist_ok=True)
    ckpt = tf.train.get_checkpoint_state(save_path)

    with tf.Session() as sess:
        if ckpt:
            print('restore variable')
            last_model = ckpt.model_checkpoint_path
            saver.restore(sess, last_model)
            sess.run(train_metrics_initialize_op)
            sess.run(val_metrics_initialize_op)
            writer = tf.summary.FileWriter(hps.paths.log_path / path_prefix,
                                           sess.graph)

        else:
            # initialize all variables and metrics
            sess.run(init_op)
            sess.run(local_init_op)
            sess.run(train_metrics_initialize_op)
            sess.run(val_metrics_initialize_op)
            writer = tf.summary.FileWriter(hps.paths.log_path / path_prefix,
                                           sess.graph)

        for step in tqdm(range(hps.hyper_parameters.step)):
            step += 1
            batch = dataset.train.next_batch(hps.hyper_parameters.batch_size)
            sess.run([train_step, train_loss_op, train_acc_op],
                     feed_dict={
                         x: batch[0],
                         y: batch[1],
                         is_training: True
                     })
            # train_log
            if step % 100 == 0:
                summary, gstep = sess.run([train_summary_op, global_step])
                writer.add_summary(summary, global_step=gstep)
                sess.run(train_metrics_initialize_op)
                saver.save(sess,
                           save_path / Path('model.ckpt'),
                           global_step=global_step)
            # validation log
            if step % 1000 == 0:
                sess.run(val_metrics_initialize_op)
                for _ in range(50):
                    val_batch = dataset.train.next_batch(100)
                    sess.run([val_loss_op, val_acc_op],
                             feed_dict={
                                 x: val_batch[0],
                                 y: val_batch[1],
                                 is_training: False
                             })
                summary, gstep = sess.run([val_summary_op, global_step])
                writer.add_summary(summary, global_step=gstep)
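
The "xxx is THE value, xxx_op is the OPERATION to update xxx" note above describes TensorFlow's streaming metrics. A small self-contained sketch of that pattern (not part of the original source):

import tensorflow as tf

values = tf.placeholder(tf.float32, [None])
# tf.metrics.mean returns a value tensor and an update op backed by local variables.
mean_value, mean_update = tf.metrics.mean(values, name='demo_mean')
# Re-initializing those local variables resets the running mean.
reset_op = tf.variables_initializer(
    [v for v in tf.local_variables() if 'demo_mean' in v.name])

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(mean_update, feed_dict={values: [1.0, 2.0, 3.0]})
    sess.run(mean_update, feed_dict={values: [5.0]})
    print(sess.run(mean_value))  # 2.75: the mean over all four values so far
    sess.run(reset_op)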
Example 30
def run(target,
        cluster_spec,
        is_chief,
        train_steps,
        eval_steps,
        job_dir,
        train_files,
        eval_files,
        train_batch_size,
        eval_batch_size,
        learning_rate,
        eval_frequency,
        first_layer_size,
        num_layers,
        scale_factor,
        num_epochs,
        export_format):

  """Run the training and evaluation graph.
  Args:
    target (string): Tensorflow server target
    is_chief (bool): Boolean flag to specify a chief server
    train_steps (int): Maximum number of training steps
    eval_steps (int): Number of steps to run evaluation for at each checkpoint.
      If eval_steps is None, evaluation will run for 1 epoch.
    job_dir (string): Output dir for checkpoint and summary
    train_files (string): List of CSV files to read train data
    eval_files (string): List of CSV files to read eval data
    train_batch_size (int): Batch size for training
    eval_batch_size (int): Batch size for evaluation
    learning_rate (float): Learning rate for Gradient Descent
    eval_frequency (int): Run evaluation every n training steps.
      Do not evaluate too frequently, or you will pay a performance cost;
      do not evaluate too infrequently, or you will not know how early
      training could be stopped. Start with the default value.
    first_layer_size (int): Size of the first DNN layer
    num_layers (int): Number of hidden layers in the DNN
    scale_factor (float): Decay rate for the size of hidden layers
    num_epochs (int): Maximum number of training data epochs on which to train
    export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. The input format
      for the exported SavedModel binary.
  """

  # Calculate the number of hidden units
  hidden_units = [
      max(2, int(first_layer_size * scale_factor**i))
      for i in range(num_layers)
  ]

  # If this server is the chief (the `master` task).
  # In between-graph replication, the chief is the one node in
  # the cluster with extra responsibilities; by default it is
  # worker task zero. Here the master is assigned as the chief.
  #
  # See https://youtu.be/la_M6bCV91M?t=1203 for details on
  # distributed TensorFlow and motivation about chief.
  if is_chief:
    tf.logging.info("Created DNN hidden units {}".format(hidden_units))
    evaluation_graph = tf.Graph()
    with evaluation_graph.as_default():

      # Features and label tensors
      features, labels = model.input_fn(
          eval_files,
          num_epochs=None if eval_steps else 1,
          batch_size=eval_batch_size,
          shuffle=False
      )
      # Accuracy and AUROC metrics
      # model.model_fn returns the dict when EVAL mode
      metric_dict = model.model_fn(
          model.EVAL,
          features.copy(),
          labels,
          hidden_units=hidden_units,
          learning_rate=learning_rate
      )

    hooks = [EvaluationRunHook(
        job_dir,
        metric_dict,
        evaluation_graph,
        eval_frequency,
        eval_steps=eval_steps,
    )]
  else:
    hooks = []

  # Create a new graph and specify that as default
  with tf.Graph().as_default():
    # Placement of ops on devices using replica device setter
    # which automatically places the parameters on the `ps` server
    # and the `ops` on the workers
    #
    # See:
    # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
    with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):

      # Features and label tensors as read using filename queue
      features, labels = model.input_fn(
          train_files,
          num_epochs=num_epochs,
          batch_size=train_batch_size
      )

      # Returns the training graph and global step tensor
      train_op, global_step_tensor = model.model_fn(
          model.TRAIN,
          features.copy(),
          labels,
          hidden_units=hidden_units,
          learning_rate=learning_rate
      )

    # Creates a MonitoredSession for training
    # MonitoredSession is a Session-like object that handles
    # initialization, recovery and hooks
    # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
    with tf.train.MonitoredTrainingSession(master=target,
                                           is_chief=is_chief,
                                           checkpoint_dir=job_dir,
                                           hooks=hooks,
                                           save_checkpoint_secs=20,
                                           save_summaries_steps=50) as session:
      # Global step to keep track of global number of steps particularly in
      # distributed setting
      step = global_step_tensor.eval(session=session)

      # Run the training graph which returns the step number as tracked by
      # the global step tensor.
      # When train epochs is reached, session.should_stop() will be true.
      while (train_steps is None or
             step < train_steps) and not session.should_stop():
        step, _ = session.run([global_step_tensor, train_op])

    # Find the filename of the latest saved checkpoint file
    latest_checkpoint = tf.train.latest_checkpoint(job_dir)

    # Only perform this if chief
    if is_chief:
      build_and_run_exports(latest_checkpoint,
                            job_dir,
                            model.SERVING_INPUT_FUNCTIONS[export_format],
                            hidden_units)
Example 31
import tensorflow as tf
import numpy as np
from multiprocessing_generator import ParallelGenerator as PG
import os
from tqdm import tqdm

from featurizer import Encoder
from model import model_fn
from my_utils import *

from load_hparams import hparams, PrintHparamsInfo
PrintHparamsInfo(hparams)

# Model
inputs = tf.placeholder(tf.float32, [None, hparams['latent_size']])
decoded = model_fn(inputs, hparams, 'decode')
text_encoder = Encoder(hparams)

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Create output file
opath = os.path.join(hparams['output_path'], 'decoded.txt')
ofile = open(opath, 'w')

# Decode sequences
with tf.Session() as sess:
    # Load model
    saver.restore(sess, os.path.join(hparams['model_path'], 'model.ckpt'))
    # Get vectors from file
    for line in tqdm(
Example 32
from model import model_fn, pipeline_model_fn
from utils import load_data, parse_params
import time, os

# Store class and shape information.
num_classes = 10
input_shape = (28, 28, 1)

x_train, y_train, x_test, y_test = load_data()
args = parse_params()

start = time.time()  # record the start time

if not args.use_ipu:
    # Model.__init__ takes two required arguments, inputs and outputs.
    model = keras.Model(*model_fn(input_shape, num_classes))

    # Compile our model with Stochastic Gradient Descent as an optimizer
    # and Categorical Cross Entropy as a loss.
    model.compile(optimizer='sgd',
                  loss='categorical_crossentropy',
                  metrics=["accuracy"])
    model.summary()
    print('Training')
    model.fit(x_train, y_train, epochs=3, batch_size=64)
    print('Evaluation')
    model.evaluate(x_test, y_test)
else:
    # Standard IPU TensorFlow setup.
    ipu_config = ipu.utils.create_ipu_config()
    ipu_config = ipu.utils.auto_select_ipus(ipu_config, 2)