def _run_batch_prediction(self, output_dir, use_target):
    reglinear.batch_predict(
        training_dir=self._train_output,
        prediction_input_file=(self._csv_eval_filename if use_target
                               else self._csv_predict_filename),
        output_dir=output_dir,
        mode='evaluation' if use_target else 'prediction',
        batch_size=4,
        output_format='csv')

    # check errors file is empty
    errors = file_io.get_matching_files(os.path.join(output_dir, 'errors*'))
    self.assertEqual(len(errors), 1)
    self.assertEqual(os.path.getsize(errors[0]), 0)

    # check predictions files are not empty
    predictions = file_io.get_matching_files(os.path.join(output_dir,
                                                          'predictions*'))
    self.assertGreater(os.path.getsize(predictions[0]), 0)

    # check the schema is correct
    schema_file = os.path.join(output_dir, 'csv_schema.json')
    self.assertTrue(os.path.isfile(schema_file))
    schema = json.loads(file_io.read_file_to_string(schema_file))
    self.assertEqual(schema[0]['name'], 'key')
    self.assertEqual(schema[1]['name'], 'predicted')
    if use_target:
      self.assertEqual(schema[2]['name'], 'target')
      self.assertEqual(len(schema), 3)
    else:
      self.assertEqual(len(schema), 2)
Example #2
def stereo_stream(dir):
    print("loading files...")
    left_files = sorted(file_io.get_matching_files("{}/left/*.png".format(dir)))
    right_files = sorted(file_io.get_matching_files("{}/right/*.png".format(dir)))
    norm_files = sorted(file_io.get_matching_files("{}/norms/*.png".format(dir)))
    envmap_files = sorted(file_io.get_matching_files("{}/envmaps/*.hdr".format(dir)))
    bg_files = sorted(file_io.get_matching_files("{}/bg/*.png".format(dir)))
    print("loaded files")
    left_shape = get_input_size(left_files[0])
    right_shape = get_input_size(right_files[0])
    norms_shape = get_input_size(norm_files[0])
    envmap_shape = get_input_size(envmap_files[0])
    bg_shape = get_input_size(bg_files[0])
    assert len(left_files) == len(right_files) == len(envmap_files) == len(norm_files) == len(bg_files)
    assert left_shape == right_shape
    left = tf.data.Dataset.from_tensor_slices(
        tf.convert_to_tensor(left_files, dtype=tf.string))
    right = tf.data.Dataset.from_tensor_slices(
        tf.convert_to_tensor(right_files, dtype=tf.string))
    envmaps = tf.data.Dataset.from_tensor_slices(
        tf.convert_to_tensor(envmap_files, dtype=tf.string))
    norms = tf.data.Dataset.from_tensor_slices(
        tf.convert_to_tensor(norm_files, dtype=tf.string))
    bgs = tf.data.Dataset.from_tensor_slices(
        tf.convert_to_tensor(bg_files, dtype=tf.string))
    print("prepared data size: {}".format(len(left_files)))
    return (left, left_shape), (right, right_shape), (envmaps, envmap_shape), (norms, norms_shape), (bgs, bg_shape)
def latest_checkpoint(checkpoint_dir, latest_filename=None):
  """Finds the filename of latest saved checkpoint file.

  Args:
    checkpoint_dir: Directory where the variables were saved.
    latest_filename: Optional name for the protocol buffer file that
      contains the list of most recent checkpoint filenames.
      See the corresponding argument to `Saver.save()`.

  Returns:
    The full path to the latest checkpoint or `None` if no checkpoint was found.
  """
  # Pick the latest checkpoint based on checkpoint state.
  ckpt = get_checkpoint_state(checkpoint_dir, latest_filename)
  if ckpt and ckpt.model_checkpoint_path:
    # Look for either a V2 path or a V1 path, with priority for V2.
    v2_path = _prefix_to_checkpoint_path(ckpt.model_checkpoint_path,
                                         saver_pb2.SaverDef.V2)
    v1_path = _prefix_to_checkpoint_path(ckpt.model_checkpoint_path,
                                         saver_pb2.SaverDef.V1)
    if file_io.get_matching_files(v2_path) or file_io.get_matching_files(
        v1_path):
      return ckpt.model_checkpoint_path
    else:
      logging.error("Couldn't match files for checkpoint %s",
                    ckpt.model_checkpoint_path)
  return None
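A minimal usage sketch for the helper above; the training directory path is hypothetical, not from the original snippet:

# Hypothetical usage of latest_checkpoint(); the path is illustrative only.
ckpt_path = latest_checkpoint('/tmp/train_output')
if ckpt_path is None:
    print('No checkpoint found yet.')
else:
    print('Resuming from %s' % ckpt_path)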
def run_experiment(args):
    if args.restart_training:
        shutil.rmtree(args.job_dir, ignore_errors=True)

    content_size = style_size = (args.image_size, args.image_size)

    num_content_samples = sum(1 for f in file_io.get_matching_files(args.content_files)
                            for _ in tf.python_io.tf_record_iterator(f))

    num_style_samples = sum(1 for f in file_io.get_matching_files(args.style_files)
                            for _ in tf.python_io.tf_record_iterator(f))

    print("Number of training content samples: " + str(num_content_samples))
    print("Number of training style samples: " + str(num_style_samples))

    model_fn = create_model_fn(data_format=args.data_format)

    loss_model_fn = create_loss_model_fn(weights_path=args.vgg_path, data_format=args.data_format)

    loss_fn = create_loss_fn(data_format=args.data_format)

    estimator_fn = create_estimator_fn(
        model_fn=model_fn,
        loss_model_fn=loss_model_fn,
        loss_fn=loss_fn,
        data_format=args.data_format)

    config = tf.estimator.RunConfig(
        tf_random_seed=42,
        save_summary_steps=args.log_iter,
        save_checkpoints_steps=args.checkpoint_iter,
        log_step_count_steps=args.log_iter,
        model_dir=args.job_dir)

    params = tf_training.HParams(
        learning_rate=args.learning_rate,
        content_features=args.content_features,
        style_features=args.style_features,
        content_weight=args.content_weight,
        style_weight=args.style_weight)

    estimator = tf.estimator.Estimator(
        model_fn=estimator_fn,
        params=params,
        config=config)

    style_epochs = args.num_epochs * num_content_samples // args.batch_size

    train_inputs_fn = create_inputs_fn(
        content_tfrecords=args.content_files,
        style_tfrecords=args.style_files,
        content_size=content_size,
        style_size=style_size,
        content_epochs=args.num_epochs,
        style_epochs=style_epochs,
        batch_size=args.batch_size,
        shuffle_buffer_size=args.shuffle_buffer_size,
        scope="train_inputs")

    estimator.train(input_fn=train_inputs_fn)
Example #6
def get_train_eval_files(input_dir):
  """Get preprocessed training and eval files."""
  data_dir = _get_latest_data_dir(input_dir)
  train_pattern = os.path.join(data_dir, 'train*.tfrecord.gz')
  eval_pattern = os.path.join(data_dir, 'eval*.tfrecord.gz')
  train_files = file_io.get_matching_files(train_pattern)
  eval_files = file_io.get_matching_files(eval_pattern)
  return train_files, eval_files
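A brief usage sketch, assuming a hypothetical preprocessed-data directory:

# Hypothetical usage of get_train_eval_files(); the bucket path is illustrative only.
train_files, eval_files = get_train_eval_files('gs://my-bucket/preprocessed')
print('%d train shards, %d eval shards' % (len(train_files), len(eval_files)))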
Example #8
  def delete_backup(self):
    """Delete the backup directories.

    Delete the backup directories which should not exist after `fit()`
    successfully finishes.
    """
    for pathname in file_io.get_matching_files(
        self.write_checkpoint_manager._prefix + '*'):
      _delete_file_or_dir(pathname)
    for pathname in file_io.get_matching_files(
        os.path.join(self.write_checkpoint_manager.directory, 'checkpoint')):
      _delete_file_or_dir(pathname)
Example #9
def train_eval(traindir, evaldir, batchsize, bucket, epochs, outputdir, hidden_units, feat_eng_cols, job_dir, learn_rate, dropout,  **kwargs):
    # define classifier config
    
    classifier_config=tf.estimator.RunConfig(save_checkpoints_steps=10)
    
    hidden_units = hidden_units.split(',')
    real_feature_columns, all_feature_columns = get_features(feat_eng_cols)

    optimizer = tf.train.ProximalAdagradOptimizer(
            learning_rate=float(learn_rate),
            l1_regularization_strength=0.1,
            l2_regularization_strength=0.01
            )
    # define classifier
 
    classifier = tf.estimator.DNNLinearCombinedClassifier(
        linear_feature_columns=all_feature_columns,
        dnn_feature_columns=real_feature_columns,
        dnn_hidden_units = hidden_units,
        n_classes=len(class_labels),
        label_vocabulary=class_labels,
        model_dir=job_dir,
        config=classifier_config, 
        dnn_dropout=float(dropout),
        # dnn_optimizer=optimizer
        )
    
    # load training and eval files    
    traindata = file_io.get_matching_files(traindir + '/trajectories.csv*')
    evaldata = file_io.get_matching_files(evaldir + '/trajectories.csv*')

    # define training and eval params
    train_input = lambda: my_input_fn(
            traindata,
            batch_size=batchsize,
            epochs=epochs,
            perform_shuffle=True
        )

    eval_input = lambda: my_input_fn(
        evaldata,
        perform_shuffle=False,
        epochs=1
    )

    # define training and eval specs for train_and_evaluate, including the exporter
    train_spec = tf.estimator.TrainSpec(train_input, 
                                        max_steps=1000
                                        )
    
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(eval_input,
                                    name='trajectory-eval',
                                    exporters=[exporter]
                                    )                                  
    # run training and evaluation
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
Example #10
  def delete_backup(self):
    """Delete the backup directories.

    Delete the backup directories which should not exist after `fit()`
    successfully finishes.
    """
    # pylint: disable=protected-access
    for pathname in file_io.get_matching_files(
        self.write_checkpoint_manager._prefix + '*'):
      file_io.delete_recursively(pathname)
    for pathname in file_io.get_matching_files(
        os.path.join(self.write_checkpoint_manager.directory, 'checkpoint')):
      file_io.delete_recursively(pathname)
Example #11
def _get_training_validation_testing_dataset(bottleneck_dir, label_table,
                                             testing_dataset_size):
    """Gets the dataset for training, validation and testing.

  Args:
    bottleneck_dir: Directory containing the bottleneck TFRecords.
    label_table: A tensorflow Table of all labels found in dataset.
    testing_dataset_size: The number of images in the testing dataset.


  Returns:
    (training_dataset, validation_dataset, testing_dataset) tuple.

    training_dataset is a Dataset containing training images.
    validation_dataset is a Dataset containing validation images.
    testing_dataset is a Dataset containing testing images.
  """
    def _full_tfrecord_parser(serialized_example):
        """Parses a tf.Example into (image, label index, bottleneck) Tensors."""
        features = {
            'image_path':
            tf.FixedLenFeature((), tf.string),
            'label':
            tf.FixedLenFeature((), tf.string),
            'bottleneck':
            tf.FixedLenFeature([INCEPTION_V3_BOTTLENECK_SIZE], tf.float32),
        }
        example = tf.parse_single_example(serialized_example,
                                          features=features)
        label_index = label_table.lookup(example['label'])
        return example['image_path'], label_index, example['bottleneck']

    training_bottleneck_files = file_io.get_matching_files(
        os.path.join(bottleneck_dir, constants.TRAINING_DATASET + '*'))
    training_dataset = tf.data.TFRecordDataset(training_bottleneck_files).map(
        _full_tfrecord_parser).repeat().batch(FLAGS.train_batch_size)

    validation_bottleneck_files = file_io.get_matching_files(
        os.path.join(bottleneck_dir, constants.VALIDATION_DATASET + '*'))
    validation_dataset = tf.data.TFRecordDataset(
        validation_bottleneck_files).map(_full_tfrecord_parser).repeat().batch(
            FLAGS.validation_batch_size)

    testing_bottleneck_files = file_io.get_matching_files(
        os.path.join(bottleneck_dir, constants.TESTING_DATASET + '*'))
    testing_dataset = tf.data.TFRecordDataset(testing_bottleneck_files).map(
        _full_tfrecord_parser).batch(testing_dataset_size)
    return training_dataset, validation_dataset, testing_dataset
    def begin(self):
        """
    Restore parameters if a pre-trained model is available and
    we haven't trained previously.
    """
        if not self.initialized:
            #checkpoint = tf.train.latest_checkpoint(self.model_path)
            all_checkpoints = file_io.get_matching_files(
                os.path.join(self.model_path, 'model.ckpt-*.index'))

            if not all_checkpoints:
                raise ValueError('No checkpoint files found matching %s.' %
                                 (self.model_path + '*'))

            all_checkpoints = [
                x.replace('.index', '') for x in all_checkpoints
            ]
            all_checkpoints = sorted(all_checkpoints,
                                     key=lambda x: int(x.split('-')[-1]))
            checkpoint = all_checkpoints[-1]

            if checkpoint is None:
                logging.info('No pre-trained model is available at %s, '
                             'training from scratch.' % self.model_path)
            else:
                logging.info(
                    'Pre-trained model {0} found in {1} - warmstarting.'.
                    format(checkpoint, self.model_path))
                tf.train.warm_start(checkpoint)
            self.initialized = True
Example #13
def _batch_predict(args, cell):
  if args['cloud_config'] and not args['cloud']:
    raise ValueError('"cloud_config" is provided but no "--cloud". '
                     'Do you want local run or cloud run?')

  if args['cloud']:
    parts = args['model'].split('.')
    if len(parts) != 2:
      raise ValueError('Invalid model name for cloud prediction. Use "model.version".')

    version_name = ('projects/%s/models/%s/versions/%s' %
                    (Context.default().project_id, parts[0], parts[1]))

    cloud_config = args['cloud_config'] or {}
    job_id = cloud_config.pop('job_id', None)
    job_request = {
      'version_name': version_name,
      'data_format': 'TEXT',
      'input_paths': file_io.get_matching_files(args['prediction_data']['csv']),
      'output_path': args['output'],
    }
    job_request.update(cloud_config)
    job = datalab_ml.Job.submit_batch_prediction(job_request, job_id)
    _show_job_link(job)
  else:
    print('local prediction...')
    _local_predict.local_batch_predict(args['model'],
                                       args['prediction_data']['csv'],
                                       args['output'],
                                       args['format'],
                                       args['batch_size'])
    print('done.')
Example #14
def create_object_test():
    """Verifies file_io's object manipulation methods ."""
    starttime = int(round(time.time() * 1000))
    dir_name = "%s/tf_gcs_test_%s" % (FLAGS.gcs_bucket_url, starttime)
    print("Creating dir %s." % dir_name)
    file_io.create_dir(dir_name)

    # Create a file in this directory.
    file_name = "%s/test_file.txt" % dir_name
    print("Creating file %s." % file_name)
    file_io.write_string_to_file(file_name, "test file creation.")

    list_files_pattern = "%s/test_file*.txt" % dir_name
    print("Getting files matching pattern %s." % list_files_pattern)
    files_list = file_io.get_matching_files(list_files_pattern)
    print(files_list)

    assert len(files_list) == 1
    assert files_list[0] == file_name

    # Cleanup test files.
    print("Deleting file %s." % file_name)
    file_io.delete_file(file_name)

    # Delete directory.
    print("Deleting directory %s." % dir_name)
    file_io.delete_recursively(dir_name)
Example #15
    def _run_batch_prediction(self):
        """Run batch prediction using the cloudml engine prediction service.

    There is no local version of this step as it's the last step.
    """

        job_name = 'test_mltoolbox_batchprediction_%s' % uuid.uuid4().hex
        cmd = [
            'gcloud ml-engine jobs submit prediction ' + job_name,
            '--data-format=TEXT',
            '--input-paths=' + self._csv_predict_filename,
            '--output-path=' + self._prediction_output,
            '--model-dir=' + os.path.join(self._train_output, 'model'),
            '--runtime-version=1.0', '--region=us-central1'
        ]
        self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
        subprocess.check_call(' '.join(cmd), shell=True)  # async call.
        subprocess.check_call('gcloud ml-engine jobs stream-logs ' + job_name,
                              shell=True)

        # Check that there were no errors.
        error_files = file_io.get_matching_files(
            os.path.join(self._prediction_output, 'prediction.errors_stats*'))
        self.assertEqual(1, len(error_files))
        error_str = file_io.read_file_to_string(error_files[0])
        self.assertEqual('', error_str)
Example #16
def input_fn(files, num_epochs=None, shuffle=False, shared_name=None):

    # get file names
    if file_io.is_directory(files[0]):
        file_names = file_io.get_matching_files(files[0] + '/*tfrecord')
    else:
        file_names = files

    # shuffle if required
    if shuffle:
        shuffle_fn(file_names)

    # queue with the file names that can be shared amongst workers during training
    filename_queue = tf.FIFOQueue(100, tf.string, shared_name=shared_name)
    enqueue_op = filename_queue.enqueue_many([tf.train.limit_epochs(file_names, num_epochs)])
    close_op = filename_queue.close(cancel_pending_enqueues=True)

    # create queue runner and add it to queue runners
    qr = tf.train.QueueRunner(filename_queue, [enqueue_op], close_op,
                              queue_closed_exception_types=(tf.errors.OutOfRangeError, tf.errors.CancelledError))
    tf.train.add_queue_runner(qr)

    # read example from file
    reader = tf.TFRecordReader()
    _, example = reader.read(filename_queue)

    # parse example
    image, ground_truth, example_name = parse_example(example)

    return image, ground_truth, example_name
Example #17
 def get_checkpoint(self, last_global_step_val):
     if FLAGS.start_eval_from_ckpt:
         files = file_io.get_matching_files(
             join(self.train_dir, 'model.ckpt-*.index'))
         # No files
         if not files:
             return None, None
         files = sorted(files, key=self._get_global_step_from_ckpt)
         start_at = FLAGS.start_eval_from_ckpt
         if str(start_at).isdigit():
             start_at = int(start_at)
             files = list(
                 filter(
                     lambda x: self._get_global_step_from_ckpt(x) >
                     start_at, files))
         for filename in files:
             filename_global_step = self._get_global_step_from_ckpt(filename)
             if last_global_step_val < filename_global_step:
                 return filename[:-6], filename_global_step
         return None, None
     else:
         latest_checkpoint = tf.train.latest_checkpoint(self.train_dir)
         if latest_checkpoint is None:
             return None, None
         global_step = self._get_global_step_from_ckpt(latest_checkpoint)
         return latest_checkpoint, global_step
Example #18
    def __init__(self, mode, batch_size, img_size, preprocessfunc=None):
        self.mode = mode
        self.batch_size = batch_size
        self.img_size = img_size
        self.preprocessfunc = preprocessfunc
        self.data = []
        self.file_location = join(GCP_paths['data'], mode)
        print('Loading data from {}'.format(self.file_location))

        for fpath in file_io.get_matching_files(
                join(self.file_location, '*.jpg')):
            fname = fpath.split('/')[-1]
            if self.mode != 'test':
                split = fname[0:-4].split('_')
                labels = split[3][1:-1].split(',')
                labels = list(map(int, labels))
                self.data.append({
                    'id': int(split[1]),
                    'labels': labels,
                    'file': fpath
                })
            else:
                self.data.append({'id': fname[0:-4], 'file': fpath})
        self.data = pd.DataFrame(self.data, columns=['id', 'labels', 'file'])
        self.n_samples = len(self.data)
        self.n_batches = int(np.ceil(self.n_samples / self.batch_size))

        print('SequenceFromGCP <{}>: {} samples'.format(
            self.mode, self.n_samples))
Example #19
    def input_fn(self, mode: tf.estimator.ModeKeys, params, data_dir):
        file_paths = file_io.get_matching_files(
            os.path.join(data_dir, "part-r-*"))  #返回一个列表
        data_set = tf.data.TFRecordDataset(file_paths, buffer_size=800 * 1024)
        batch_size = params["batch_size"]

        def parse(raw):
            context_dic, seq_dic = tf.parse_single_sequence_example(
                serialized=raw,
                context_features=self.conttext_spec,
                sequence_features=self.sequence_spec)
            label = context_dic.pop(params["item_relevance"])
            label = label * 3
            #seq_dic[self.params["user_car_serial"]] = context_dic[self.params["user_car_serial"]]
            for k, v in six.iteritems(seq_dic):
                context_dic[k] = v
            return context_dic, label

        if mode == tf.estimator.ModeKeys.TRAIN:
            data_set = data_set.repeat(None).map(parse).batch(
                batch_size)  #.prefetch(buffer_size=None)
        elif mode == tf.estimator.ModeKeys.EVAL:
            data_set = data_set.repeat(None) \
                .take(3000).map(parse).batch(60)#.prefetch(buffer_size=None)
        it = data_set.make_one_shot_iterator()
        feature, label = it.get_next()
        label = tf.sparse_tensor_to_dense(label, default_value=-1)
        return feature, label
Example #20
def main(argv):
  if FLAGS.output_type not in VALID_OUTPUT_TYPES:
    raise ValueError('output_type "%s" not in allowed types: %s' %
                     (FLAGS.output_type, VALID_OUTPUT_TYPES))

  # Exclude argv[0], which is the current binary.
  patterns = argv[1:]
  if not patterns:
    raise ValueError('PNG file glob(s) must be specified')
  input_paths = []
  for pattern in patterns:
    pattern_paths = file_io.get_matching_files(pattern)
    if not pattern_paths:
      raise ValueError('Pattern "%s" failed to match any files' % pattern)
    input_paths.extend(pattern_paths)

  start = time.time()
  output = run(
      input_paths,
      FLAGS.glyphs_saved_model,
      output_notesequence=FLAGS.output_type == 'NoteSequence')
  end = time.time()
  sys.stderr.write('OMR elapsed time: %.2f\n' % (end - start))

  if FLAGS.output_type == 'MusicXML':
    output_bytes = conversions.score_to_musicxml(output)
  else:
    if FLAGS.text_format:
      output_bytes = text_format.MessageToString(output).encode('utf-8')
    else:
      output_bytes = output.SerializeToString()
  file_io.write_string_to_file(FLAGS.output, output_bytes)
  def testAPIBackwardsCompatibility(self):
    # Extract all API stuff.
    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

    public_api_visitor = public_api.PublicAPIVisitor(visitor)
    public_api_visitor.do_not_descend_map[''].append('contrib')
    traverse.traverse(tf, public_api_visitor)

    proto_dict = visitor.GetProtos()

    # Read all golden files.
    expression = os.path.join(
        resource_loader.get_root_dir_with_all_resources(),
        _KeyToFilePath('*'))
    golden_file_list = file_io.get_matching_files(expression)

    def _ReadFileToProto(filename):
      """Read a filename, create a protobuf from its contents."""
      ret_val = api_objects_pb2.TFAPIObject()
      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
      return ret_val

    golden_proto_dict = {
        _FileNameToKey(filename): _ReadFileToProto(filename)
        for filename in golden_file_list
    }

    # Diff them. Do not fail if called with update.
    # If the test is run to update goldens, only report diffs but do not fail.
    self._AssertProtoDictEquals(
        golden_proto_dict,
        proto_dict,
        verbose=FLAGS.verbose_diffs,
        update_goldens=FLAGS.update_goldens)
Example #22
    def _get_list_checkpoint(self, n_export, model_dir):
        """Get the checkpoints that we want to export.

    Args:
      n_export: Number of models to export.
      model_dir: Directory containing the checkpoints.

    Returns:
      List of checkpoint paths.

    If n_export==1, we take only the last checkpoint.
    Otherwise, we consider the list of steps for which we have a checkpoint,
    and choose n_export checkpoints whose steps are as equidistant as possible.
    """

        checkpoints = file_io.get_matching_files(
            os.path.join(model_dir, 'model.ckpt-*.index'))
        checkpoints = [x.replace('.index', '') for x in checkpoints]
        checkpoints = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))

        if n_export == 1:
            return [checkpoints[-1]]

        # We want to cover a distance of (len(checkpoints) - 1) index positions
        # (for 3 checkpoints the distance is 2) with (n_export - 1) steps,
        # because one checkpoint is always placed at the end.
        step = float(len(checkpoints) - 1) / (n_export - 1)
        if step <= 1:  # Fewer checkpoints available than the desired number.
            return checkpoints

        checkpoints_to_export = [
            checkpoints[int(i * step)] for i in range(n_export - 1)
        ]
        checkpoints_to_export.append(checkpoints[-1])

        return checkpoints_to_export
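A small worked illustration of the equidistant selection arithmetic above, using a hypothetical list of seven checkpoint names and n_export=3 (not part of the original snippet):

# Hypothetical checkpoint names; only the index arithmetic matters here.
checkpoints = ['model.ckpt-%d' % s for s in (10, 20, 30, 40, 50, 60, 70)]
n_export = 3
step = float(len(checkpoints) - 1) / (n_export - 1)      # 6 / 2 = 3.0
selected = [checkpoints[int(i * step)] for i in range(n_export - 1)]
selected.append(checkpoints[-1])
print(selected)  # ['model.ckpt-10', 'model.ckpt-40', 'model.ckpt-70']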
def _load_tf_custom_op(model_path):
    """Loads a custom TF OP (in .so format) from /assets.extra directory."""
    assets_dir = os.path.join(model_path, _CUSTOM_OP_DIRECTORY_NAME)
    if file_io.is_directory(assets_dir):
        custom_ops_pattern = os.path.join(assets_dir, _CUSTOM_OP_SUFFIX)
        for custom_op_path_original in file_io.get_matching_files(
                custom_ops_pattern):
            logging.info("Found custom op file: %s", custom_op_path_original)
            if custom_op_path_original.startswith("gs://"):
                if not os.path.isdir(_CUSTOM_OP_LOCAL_DIR):
                    os.makedirs(_CUSTOM_OP_LOCAL_DIR)
                custom_op_path_local = os.path.join(
                    _CUSTOM_OP_LOCAL_DIR,
                    os.path.basename(custom_op_path_original))
                logging.info("Copying custop op from: %s to: %s",
                             custom_op_path_original, custom_op_path_local)
                file_io.copy(custom_op_path_original, custom_op_path_local,
                             True)
            else:
                custom_op_path_local = custom_op_path_original
            try:
                import tensorflow as tf  # pylint: disable=g-import-not-at-top
                logging.info("Loading custom op: %s", custom_op_path_local)
                logging.info("TF Version: %s", tf.__version__)
                tf.load_op_library(custom_op_path_local)
            except RuntimeError as e:
                logging.exception(
                    "Failed to load custom op: %s with error: %s. Prediction "
                    "will likely fail due to missing operations.",
                    custom_op_path_local, e)
Example #24
def _get_image_label_info(bottleneck_dir):
    # type: (str) -> (Dict[str, int], List[str])
    """Calculates per-dataset image counts and the unique labels in the dataset.

  Args:
    bottleneck_dir: Directory containing the bottleneck TFRecords.

  Returns:
    dataset_to_image_count: Map from dataset name to its number of images.
    label_list: List of labels found in the dataset.

  This function parses the TFRecords found in bottleneck_dir and returns only
  the labels and the per-dataset image counts.
  """
    labels = OrderedDict()
    dataset_to_image_count = defaultdict(int)
    bottleneck_files = file_io.get_matching_files(
        os.path.join(bottleneck_dir, '*'))
    for bottleneck_file in bottleneck_files:
        for it in tf.compat.v1.io.tf_record_iterator(bottleneck_file):
            example = tf.train.Example()
            example.ParseFromString(it)
            label = example.features.feature['label'].bytes_list.value[0]
            labels[label] = True
            dataset = example.features.feature['dataset'].bytes_list.value[0]
            dataset_to_image_count[dataset] += 1

    label_list = []
    for key in labels:
        label_list.append(key)
    return dataset_to_image_count, label_list
Example #25
  def _GetBaseApiMap(self):
    """Get a map from graph op name to its base ApiDef.

    Returns:
      Dictionary mapping graph op name to corresponding ApiDef.
    """
    # Convert base ApiDef in Multiline format to Proto format.
    converted_base_api_dir = os.path.join(
        test.get_temp_dir(), 'temp_base_api_defs')
    subprocess.check_call(
        [os.path.join(resource_loader.get_root_dir_with_all_resources(),
                      _CONVERT_FROM_MULTILINE_SCRIPT),
         _BASE_API_DIR, converted_base_api_dir])

    name_to_base_api_def = {}
    base_api_files = file_io.get_matching_files(
        os.path.join(converted_base_api_dir, 'api_def_*.pbtxt'))
    for base_api_file in base_api_files:
      if file_io.file_exists(base_api_file):
        api_defs = api_def_pb2.ApiDefs()
        text_format.Merge(
            file_io.read_file_to_string(base_api_file), api_defs)
        for api_def in api_defs.op:
          name_to_base_api_def[api_def.graph_op_name] = api_def
    return name_to_base_api_def
Example #26
  def _load_metadata_files(self):
    """Load and parse metadata files in the dump root.

    Check that all metadata files have a common tfdbg_run_id, and raise
    a ValueError if their tfdbg_run_ids differ.

    Returns:
      A list of metadata file paths in ascending order of their starting
        wall_time timestamp.
    """

    metadata_paths = file_io.get_matching_files(
        os.path.join(self._dump_root, "*%s" % self._METADATA_SUFFIX))
    if not metadata_paths:
      raise ValueError("Cannot find any tfdbg metadata file in directory: %s" %
                       self._dump_root)
    wall_times = []
    run_ids = []
    tensorflow_versions = []
    file_versions = []
    for metadata_path in metadata_paths:
      reader = tf_record.tf_record_random_reader(metadata_path)
      try:
        record = reader.read(0)[0]
        debug_event = debug_event_pb2.DebugEvent.FromString(record)
        wall_times.append(debug_event.wall_time)
        run_ids.append(debug_event.debug_metadata.tfdbg_run_id)
        tensorflow_versions.append(
            debug_event.debug_metadata.tensorflow_version)
        file_versions.append(debug_event.debug_metadata.file_version)
      finally:
        reader.close()
    self._starting_wall_time = wall_times[0]
    self._tfdbg_run_id = run_ids[0]
    self._tensorflow_version = tensorflow_versions[0]
    self._file_version = file_versions[0]
    if len(metadata_paths) == 1:
      # Fast path for a common case (only one DebugEvent file set.)
      return metadata_paths

    num_no_id = len([run_id for run_id in run_ids if not run_id])
    if num_no_id:
      paths_without_run_id = [
          metadata_path
          for metadata_path, run_id in zip(metadata_paths, run_ids)
          if not run_id
      ]
      raise ValueError(
          "Found %d tfdbg metadata files and %d of them do not "
          "have tfdbg run ids. The metadata files without run ids are: %s" %
          (len(run_ids), num_no_id, paths_without_run_id))
    elif len(set(run_ids)) != 1:
      raise ValueError(
          "Unexpected: Found multiple (%d) tfdbg2 runs in directory %s" %
          (len(set(run_ids)), self._dump_root))
    # Return the metadata files in ascending order of their timestamps.
    paths_and_timestamps = sorted(
        zip(metadata_paths, wall_times), key=lambda t: t[1])
    self._starting_wall_time = paths_and_timestamps[0][1]
    return [path[0] for path in paths_and_timestamps]
Example #28
def get_all_checkpoints(output_dir):
    """docstring."""
    ckpt = cm.get_checkpoint_state(output_dir, None)
    res = []
    if not ckpt:
        return None
    for path in ckpt.all_model_checkpoint_paths:
        # Look for either a V2 path or a V1 path, with priority for V2.
        v2_path = cm._prefix_to_checkpoint_path(path, saver_pb2.SaverDef.V2)
        v1_path = cm._prefix_to_checkpoint_path(path, saver_pb2.SaverDef.V1)
        if file_io.get_matching_files(v2_path) or file_io.get_matching_files(
                v1_path):
            res.append(path)
        else:
            tf.logging.error("Couldn't match files for checkpoint %s", path)
    return res
Example #29
  def raw_training_input_fn():
    """Training input function that reads raw data and applies transforms."""

    if isinstance(raw_data_file_pattern, six.string_types):
      filepath_list = [raw_data_file_pattern]
    else:
      filepath_list = raw_data_file_pattern

    files = []
    for path in filepath_list:
      files.extend(file_io.get_matching_files(path))

    filename_queue = tf.train.string_input_producer(
        files, num_epochs=num_epochs, shuffle=randomize_input)

    csv_id, csv_lines = tf.TextLineReader().read_up_to(filename_queue, training_batch_size)

    queue_capacity = (reader_num_threads + 3) * training_batch_size + min_after_dequeue
    if randomize_input:
      _, batch_csv_lines = tf.train.shuffle_batch(
          tensors=[csv_id, csv_lines],
          batch_size=training_batch_size,
          capacity=queue_capacity,
          min_after_dequeue=min_after_dequeue,
          enqueue_many=True,
          num_threads=reader_num_threads,
          allow_smaller_final_batch=allow_smaller_final_batch)

    else:
      _, batch_csv_lines = tf.train.batch(
          tensors=[csv_id, csv_lines],
          batch_size=training_batch_size,
          capacity=queue_capacity,
          enqueue_many=True,
          num_threads=reader_num_threads,
          allow_smaller_final_batch=allow_smaller_final_batch)

    csv_header, record_defaults = csv_header_and_defaults(features, schema, stats, keep_target=True)
    parsed_tensors = tf.decode_csv(batch_csv_lines, record_defaults, name='csv_to_tensors')
    raw_features = dict(zip(csv_header, parsed_tensors))

    transform_fn = make_preprocessing_fn(analysis_output_dir, features, keep_target=True)
    transformed_tensors = transform_fn(raw_features)

    # Expand the dims of non-sparse tensors. This is needed by tf.learn.
    transformed_features = {}
    for k, v in six.iteritems(transformed_tensors):
      if isinstance(v, tf.Tensor) and v.get_shape().ndims == 1:
        transformed_features[k] = tf.expand_dims(v, -1)
      else:
        transformed_features[k] = v

    # Remove the target tensor, and return it directly
    target_name = get_target_name(features)
    if not target_name or target_name not in transformed_features:
      raise ValueError('Cannot find target transform in features')

    transformed_target = transformed_features.pop(target_name)

    return transformed_features, transformed_target
Example #30
    def __init__(self, dump_root):
        if not file_io.is_directory(dump_root):
            raise ValueError("Specified dump_root is not a directory: %s" %
                             dump_root)
        metadata_paths = file_io.get_matching_files(
            os.path.join(dump_root, "*.metadata"))
        if not metadata_paths:
            raise ValueError("Cannot find any metadata file in directory: %s" %
                             dump_root)
        elif len(metadata_paths) > 1:
            raise ValueError(
                "Unexpected: Found multiple (%d) metadata in directory: %s" %
                (len(metadata_paths), dump_root))
        self._metadata_path = compat.as_bytes(metadata_paths[0])
        self._metadata_reader = None

        prefix = metadata_paths[0][:-len(".metadata")]
        self._source_files_path = compat.as_bytes("%s.source_files" % prefix)
        self._stack_frames_path = compat.as_bytes("%s.stack_frames" % prefix)
        self._graphs_path = compat.as_bytes("%s.graphs" % prefix)
        self._execution_path = compat.as_bytes("%s.execution" % prefix)
        self._graph_execution_traces_path = compat.as_bytes(
            "%s.graph_execution_traces" % prefix)
        self._readers = dict()  # A map from file path to reader.
        # A map from file path to current reading offset.
        self._reader_offsets = dict()
        # Lock for reader creation.
        self._readers_lock = threading.Lock()
        # Locks for read operation on individual readers.
        self._reader_read_locks = dict()

        self._offsets = dict()
  def checkBackwardsCompatibility(self, root, golden_file_pattern, api_version):
    # Extract all API stuff.
    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

    public_api_visitor = public_api.PublicAPIVisitor(visitor)
    public_api_visitor.do_not_descend_map['tf'].append('contrib')
    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
    traverse.traverse(root, public_api_visitor)

    proto_dict = visitor.GetProtos()

    # Read all golden files.
    golden_file_list = file_io.get_matching_files(golden_file_pattern)

    def _ReadFileToProto(filename):
      """Read a filename, create a protobuf from its contents."""
      ret_val = api_objects_pb2.TFAPIObject()
      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
      return ret_val

    golden_proto_dict = {
        _FileNameToKey(filename): _ReadFileToProto(filename)
        for filename in golden_file_list
    }

    # Diff them. Do not fail if called with update.
    # If the test is run to update goldens, only report diffs but do not fail.
    self._AssertProtoDictEquals(
        golden_proto_dict,
        proto_dict,
        verbose=FLAGS.verbose_diffs,
        update_goldens=FLAGS.update_goldens,
        api_version=api_version)
  def testAPIBackwardsCompatibility(self):
    # Extract all API stuff.
    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

    public_api_visitor = public_api.PublicAPIVisitor(visitor)
    public_api_visitor.do_not_descend_map['tf'].append('contrib')
    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
    traverse.traverse(tf, public_api_visitor)

    proto_dict = visitor.GetProtos()

    # Read all golden files.
    expression = os.path.join(
        resource_loader.get_root_dir_with_all_resources(),
        _KeyToFilePath('*'))
    golden_file_list = file_io.get_matching_files(expression)

    def _ReadFileToProto(filename):
      """Read a filename, create a protobuf from its contents."""
      ret_val = api_objects_pb2.TFAPIObject()
      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
      return ret_val

    golden_proto_dict = {
        _FileNameToKey(filename): _ReadFileToProto(filename)
        for filename in golden_file_list
    }

    # Diff them. Do not fail if called with update.
    # If the test is run to update goldens, only report diffs but do not fail.
    self._AssertProtoDictEquals(
        golden_proto_dict,
        proto_dict,
        verbose=FLAGS.verbose_diffs,
        update_goldens=FLAGS.update_goldens)
Example #33
    def _checkBackwardsCompatibility(self,
                                     root,
                                     golden_file_patterns,
                                     api_version,
                                     additional_private_map=None,
                                     omit_golden_symbols_map=None):
        # Extract all API stuff.
        visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

        public_api_visitor = public_api.PublicAPIVisitor(visitor)
        public_api_visitor.private_map['tf'].append('contrib')
        if api_version == 2:
            public_api_visitor.private_map['tf'].append('enable_v2_behavior')

        public_api_visitor.do_not_descend_map['tf.GPUOptions'] = [
            'Experimental'
        ]
        # Do not descend into these numpy classes because their signatures may be
        # different between internal and OSS.
        public_api_visitor.do_not_descend_map['tf.experimental.numpy'] = [
            'bool_', 'complex_', 'complex128', 'complex64', 'float_',
            'float16', 'float32', 'float64', 'inexact', 'int_', 'int16',
            'int32', 'int64', 'int8', 'object_', 'string_', 'uint16', 'uint32',
            'uint64', 'uint8', 'unicode_', 'iinfo'
        ]
        if FLAGS.only_test_core_api:
            public_api_visitor.do_not_descend_map['tf'].extend(
                _NON_CORE_PACKAGES)
        if additional_private_map:
            public_api_visitor.private_map.update(additional_private_map)

        traverse.traverse(root, public_api_visitor)
        proto_dict = visitor.GetProtos()

        # Read all golden files.
        golden_file_list = file_io.get_matching_files(golden_file_patterns)
        if FLAGS.only_test_core_api:
            golden_file_list = _FilterNonCoreGoldenFiles(golden_file_list)

        def _ReadFileToProto(filename):
            """Read a filename, create a protobuf from its contents."""
            ret_val = api_objects_pb2.TFAPIObject()
            text_format.Merge(file_io.read_file_to_string(filename), ret_val)
            return ret_val

        golden_proto_dict = {
            _FileNameToKey(filename): _ReadFileToProto(filename)
            for filename in golden_file_list
        }
        golden_proto_dict = _FilterGoldenProtoDict(golden_proto_dict,
                                                   omit_golden_symbols_map)

        # Diff them. Do not fail if called with update.
        # If the test is run to update goldens, only report diffs but do not fail.
        self._AssertProtoDictEquals(golden_proto_dict,
                                    proto_dict,
                                    verbose=FLAGS.verbose_diffs,
                                    update_goldens=FLAGS.update_goldens,
                                    api_version=api_version)
Example #34
    def _run_transform(self):
        """Runs DataFlow for makint tf.example files.

    Only the train file uses DataFlow, the eval file runs beam locally to save
    time.
    """
        cloud = True
        extra_args = []
        if cloud:
            extra_args = [
                '--cloud',
                '--job-name=test-mltoolbox-df-%s' % uuid.uuid4().hex,
                '--project-id=%s' % self._get_default_project_id(),
                '--num-workers=3'
            ]

        cmd = [
            'python %s' % os.path.join(CODE_PATH, 'transform.py'),
            '--csv-file-pattern=' + self._csv_train_filename,
            '--output-dir-from-analysis-step=' + self._analysis_output,
            '--output-filename-prefix=features_train',
            '--output-dir=' + self._transform_output, '--shuffle'
        ] + extra_args

        self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
        subprocess.check_call(' '.join(cmd), shell=True)

        # Don't waste time running a second Dataflow job; run it locally.
        cmd = [
            'python %s' % os.path.join(CODE_PATH, 'transform.py'),
            '--csv-file-pattern=' + self._csv_eval_filename,
            '--output-dir-from-analysis-step=' + self._analysis_output,
            '--output-filename-prefix=features_eval',
            '--output-dir=' + self._transform_output
        ]

        self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
        subprocess.check_call(' '.join(cmd), shell=True)

        # Check the files were made
        train_files = file_io.get_matching_files(
            os.path.join(self._transform_output, 'features_train*'))
        eval_files = file_io.get_matching_files(
            os.path.join(self._transform_output, 'features_eval*'))
        self.assertNotEqual([], train_files)
        self.assertNotEqual([], eval_files)
Example #35
def load_dataset(directory):
    files = gcsfile.get_matching_files(directory + "/*")
    labels = list(
        map(lambda filename: int(os.path.basename(filename)[0:1] == '1'),
            files))
    boxes = tf.zeros(shape=[len(files), 4])
    return tf.contrib.data.Dataset.from_tensor_slices(
        (tf.constant(files), tf.constant(labels), boxes)), len(files)
Example #36
 def testGetMatchingFiles(self):
   dir_path = os.path.join(self._base_dir, "temp_dir")
   file_io.create_dir(dir_path)
   files = ["file1.txt", "file2.txt", "file3.txt"]
   for name in files:
     file_path = os.path.join(dir_path, name)
     file_io.FileIO(file_path, mode="w").write("testing")
   expected_match = [os.path.join(dir_path, name) for name in files]
   self.assertItemsEqual(
       file_io.get_matching_files(os.path.join(dir_path, "file*.txt")),
       expected_match)
   self.assertItemsEqual(file_io.get_matching_files(tuple()), [])
   files_subset = [
       os.path.join(dir_path, files[0]), os.path.join(dir_path, files[2])
   ]
   self.assertItemsEqual(
       file_io.get_matching_files(files_subset), files_subset)
   file_io.delete_recursively(dir_path)
   self.assertFalse(file_io.file_exists(os.path.join(dir_path, "file3.txt")))
  def _run_transform(self):
    """Runs DataFlow for makint tf.example files.

    Only the train file uses DataFlow, the eval file runs beam locally to save
    time.
    """
    cloud = True
    extra_args = []
    if cloud:
      extra_args = ['--cloud',
                    '--job-name=test-mltoolbox-df-%s' % uuid.uuid4().hex,
                    '--project-id=%s' % self._get_default_project_id(),
                    '--num-workers=3']

    cmd = ['python %s' % os.path.join(CODE_PATH, 'transform.py'),
           '--csv=' + self._csv_train_filename,
           '--analysis=' + self._analysis_output,
           '--prefix=features_train',
           '--output=' + self._transform_output,
           '--shuffle'] + extra_args

    self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
    subprocess.check_call(' '.join(cmd), shell=True)

    # Don't waste time running a second Dataflow job; run it locally.
    cmd = ['python %s' % os.path.join(CODE_PATH, 'transform.py'),
           '--csv=' + self._csv_eval_filename,
           '--analysis=' + self._analysis_output,
           '--prefix=features_eval',
           '--output=' + self._transform_output]

    self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
    subprocess.check_call(' '.join(cmd), shell=True)

    # Check the files were made
    train_files = file_io.get_matching_files(
        os.path.join(self._transform_output, 'features_train*'))
    eval_files = file_io.get_matching_files(
        os.path.join(self._transform_output, 'features_eval*'))
    self.assertNotEqual([], train_files)
    self.assertNotEqual([], eval_files)
Example #38
 def testGetMatchingFiles(self):
   dir_path = os.path.join(self._base_dir, "temp_dir")
   file_io.create_dir(dir_path)
   files = ["file1.txt", "file2.txt", "file3.txt"]
   for name in files:
     file_path = os.path.join(dir_path, name)
     file_io.write_string_to_file(file_path, "testing")
   expected_match = [os.path.join(dir_path, name) for name in files]
   self.assertItemsEqual(
       file_io.get_matching_files(os.path.join(dir_path, "file*.txt")),
       expected_match)
   file_io.delete_recursively(dir_path)
   self.assertFalse(file_io.file_exists(os.path.join(dir_path, "file3.txt")))
def checkpoint_exists(checkpoint_prefix):
  """Checks whether a V1 or V2 checkpoint exists with the specified prefix.

  This is the recommended way to check if a checkpoint exists, since it takes
  into account the naming difference between V1 and V2 formats.

  Args:
    checkpoint_prefix: the prefix of a V1 or V2 checkpoint, with V2 taking
      priority.  Typically the result of `Saver.save()` or that of
      `tf.train.latest_checkpoint()`, regardless of sharded/non-sharded or
      V1/V2.
  Returns:
    A bool, true iff a checkpoint referred to by `checkpoint_prefix` exists.
  """
  pathname = _prefix_to_checkpoint_path(checkpoint_prefix,
                                        saver_pb2.SaverDef.V2)
  if file_io.get_matching_files(pathname):
    return True
  elif file_io.get_matching_files(checkpoint_prefix):
    return True
  else:
    return False
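A minimal usage sketch for checkpoint_exists(); the prefix below is hypothetical and would normally come from Saver.save() or tf.train.latest_checkpoint():

# Hypothetical checkpoint prefix.
prefix = '/tmp/train_output/model.ckpt-1000'
if checkpoint_exists(prefix):
    print('Checkpoint %s exists (V1 or V2 format).' % prefix)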
  def testNewAPIBackwardsCompatibility(self):
    # Extract all API stuff.
    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

    public_api_visitor = public_api.PublicAPIVisitor(visitor)
    public_api_visitor.do_not_descend_map['tf'].append('contrib')
    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
    # TODO(annarev): Make slide_dataset available in API.
    public_api_visitor.private_map['tf'] = ['slide_dataset']
    traverse.traverse(api, public_api_visitor)

    proto_dict = visitor.GetProtos()

    # Read all golden files.
    expression = os.path.join(
        resource_loader.get_root_dir_with_all_resources(),
        _KeyToFilePath('*'))
    golden_file_list = file_io.get_matching_files(expression)

    def _ReadFileToProto(filename):
      """Read a filename, create a protobuf from its contents."""
      ret_val = api_objects_pb2.TFAPIObject()
      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
      return ret_val

    golden_proto_dict = {
        _FileNameToKey(filename): _ReadFileToProto(filename)
        for filename in golden_file_list
    }

    # user_ops is an empty module. It is currently available in TensorFlow API
    # but we don't keep empty modules in the new API.
    # We delete user_ops from golden_proto_dict to make sure assert passes
    # when diffing new API against goldens.
    # TODO(annarev): remove user_ops from goldens once we switch to new API.
    tf_module = golden_proto_dict['tensorflow'].tf_module
    for i in range(len(tf_module.member)):
      if tf_module.member[i].name == 'user_ops':
        del tf_module.member[i]
        break

    # Diff them. Do not fail if called with update.
    # If the test is run to update goldens, only report diffs but do not fail.
    self._AssertProtoDictEquals(
        golden_proto_dict,
        proto_dict,
        verbose=FLAGS.verbose_diffs,
        update_goldens=False,
        additional_missing_object_message=
        'Check if tf_export decorator/call is missing for this symbol.')
  def _checkBackwardsCompatibility(self,
                                   root,
                                   golden_file_pattern,
                                   api_version,
                                   additional_private_map=None,
                                   omit_golden_symbols_map=None):
    # Extract all API stuff.
    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

    public_api_visitor = public_api.PublicAPIVisitor(visitor)
    public_api_visitor.private_map['tf'] = ['contrib']
    if api_version == 2:
      public_api_visitor.private_map['tf'].append('enable_v2_behavior')

    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
    if FLAGS.only_test_core_api:
      public_api_visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
    if additional_private_map:
      public_api_visitor.private_map.update(additional_private_map)

    traverse.traverse(root, public_api_visitor)
    proto_dict = visitor.GetProtos()

    # Read all golden files.
    golden_file_list = file_io.get_matching_files(golden_file_pattern)
    if FLAGS.only_test_core_api:
      golden_file_list = _FilterNonCoreGoldenFiles(golden_file_list)

    def _ReadFileToProto(filename):
      """Read a filename, create a protobuf from its contents."""
      ret_val = api_objects_pb2.TFAPIObject()
      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
      return ret_val

    golden_proto_dict = {
        _FileNameToKey(filename): _ReadFileToProto(filename)
        for filename in golden_file_list
    }
    golden_proto_dict = _FilterGoldenProtoDict(golden_proto_dict,
                                               omit_golden_symbols_map)

    # Diff them. Do not fail if called with update.
    # If the test is run to update goldens, only report diffs but do not fail.
    self._AssertProtoDictEquals(
        golden_proto_dict,
        proto_dict,
        verbose=FLAGS.verbose_diffs,
        update_goldens=FLAGS.update_goldens,
        api_version=api_version)
Example #42
def get_latest_checkpoint():
  index_files = file_io.get_matching_files(os.path.join(FLAGS.train_dir, 'model.ckpt-*.index'))

  # No files
  if not index_files:
    return None


  # Index file path with the maximum step size.
  latest_index_file = sorted(
      [(int(os.path.basename(f).split("-")[-1].split(".")[0]), f)
       for f in index_files])[-1][1]

  # Chop off .index suffix and return
  return latest_index_file[:-6]
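A quick check of the slice above: '.index' is six characters long, so dropping the last six characters of the index file path yields the checkpoint prefix (the path below is hypothetical):

# Hypothetical index-file path.
latest_index_file = '/tmp/train_dir/model.ckpt-20000.index'
print(latest_index_file[:-6])  # /tmp/train_dir/model.ckpt-20000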
Example #43
def copy_data_to_tmp(input_files):
  """Copies data to /tmp/ and returns glob matching the files."""
  files = []
  for e in input_files:
    for path in e.split(','):
      files.extend(file_io.get_matching_files(path))

  for path in files:
    if not path.startswith('gs://'):
      return input_files

  tmp_path = os.path.join('/tmp/', str(uuid.uuid4()))
  os.makedirs(tmp_path)
  subprocess.check_call(['gsutil', '-m', '-q', 'cp', '-r'] + files + [tmp_path])
  return [os.path.join(tmp_path, '*')]
Example #44
def read_examples(input_files, batch_size, shuffle, num_epochs=None):
  """Creates readers and queues for reading example protos."""
  files = []
  for e in input_files:
    for path in e.split(','):
      files.extend(file_io.get_matching_files(path))
  thread_count = multiprocessing.cpu_count()

  # The minimum number of instances in a queue from which examples are drawn
  # randomly. The larger this number, the more randomness at the expense of
  # higher memory requirements.
  min_after_dequeue = 1000

  # When batching data, the queue's capacity will be larger than the batch_size
  # by some factor. The recommended formula is (num_threads + a small safety
  # margin). Reading uses one thread per CPU core, so the multiplier is
  # thread_count plus a small safety margin.
  queue_size_multiplier = thread_count + 3

  # Convert num_epochs == 0 -> num_epochs is None, if necessary
  num_epochs = num_epochs or None

  # Build a queue of the filenames to be read.
  filename_queue = tf.train.string_input_producer(files, num_epochs, shuffle)

  options = tf.python_io.TFRecordOptions(
      compression_type=tf.python_io.TFRecordCompressionType.GZIP)
  example_id, encoded_example = tf.TFRecordReader(options=options).read_up_to(
      filename_queue, batch_size)

  if shuffle:
    capacity = min_after_dequeue + queue_size_multiplier * batch_size
    return tf.train.shuffle_batch(
        [example_id, encoded_example],
        batch_size,
        capacity,
        min_after_dequeue,
        enqueue_many=True,
        num_threads=thread_count)

  else:
    capacity = queue_size_multiplier * batch_size
    return tf.train.batch(
        [example_id, encoded_example],
        batch_size,
        capacity=capacity,
        enqueue_many=True,
        num_threads=thread_count)
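
# A rough usage sketch, assuming TF 1.x graph mode and GZIP TFRecord inputs; the
# file pattern below is a placeholder. The local-variables initializer is needed
# for the epoch counter created by string_input_producer.
example_ids, encoded_examples = read_examples(
    ['gs://my-bucket/data/examples-*.tfrecord.gz'],
    batch_size=64, shuffle=True, num_epochs=1)
with tf.Session() as sess:
  sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)
  try:
    while not coord.should_stop():
      ids, protos = sess.run([example_ids, encoded_examples])
  except tf.errors.OutOfRangeError:
    pass  # Input exhausted after the requested number of epochs.
  finally:
    coord.request_stop()
    coord.join(threads)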
def local_batch_predict(model_dir, csv_file_pattern, output_dir, output_format, batch_size=100):
  """ Batch Predict with a specified model.

  It does batch prediction, saves results to output files and also creates an output
  schema file. The output file names are input file names prepended by 'predict_results_'.

  Args:
    model_dir: The model directory containing a SavedModel (usually saved_model.pb).
    csv_file_pattern: a pattern of csv files as batch prediction source.
    output_dir: the path of the output directory.
    output_format: csv or json.
    batch_size: Larger batch_size improves performance but may
        cause more memory usage.
  """

  file_io.recursive_create_dir(output_dir)
  csv_files = file_io.get_matching_files(csv_file_pattern)
  if len(csv_files) == 0:
    raise ValueError('No files found given ' + csv_file_pattern)

  with tf.Graph().as_default(), tf.Session() as sess:
    input_alias_map, output_alias_map = _tf_load_model(sess, model_dir)
    csv_tensor_name = list(input_alias_map.values())[0]
    output_schema = _get_output_schema(sess, output_alias_map)
    for csv_file in csv_files:
      output_file = os.path.join(
          output_dir,
          'predict_results_' +
          os.path.splitext(os.path.basename(csv_file))[0] + '.' + output_format)
      with file_io.FileIO(output_file, 'w') as f:
        prediction_source = _batch_csv_reader(csv_file, batch_size)
        for batch in prediction_source:
          batch = [l.rstrip() for l in batch if l]
          predict_results = sess.run(fetches=output_alias_map, feed_dict={csv_tensor_name: batch})
          formatted_results = _format_results(output_format, output_schema, predict_results)
          f.write('\n'.join(formatted_results) + '\n')

  file_io.write_string_to_file(os.path.join(output_dir, 'predict_results_schema.json'),
                               json.dumps(output_schema, indent=2))
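
# Hedged usage sketch; the model directory and CSV pattern below are placeholders.
local_batch_predict(
    model_dir='/tmp/training_output/model',
    csv_file_pattern='/tmp/data/eval-*.csv',
    output_dir='/tmp/batch_prediction',
    output_format='csv',
    batch_size=100)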
Example #46
  def testMatchingFilesPermission(self):
    # Create top level directory test_dir.
    dir_path = os.path.join(self._base_dir, "test_dir")
    file_io.create_dir(dir_path)
    # Create second level directories `noread` and `any`.
    noread_path = os.path.join(dir_path, "noread")
    file_io.create_dir(noread_path)
    any_path = os.path.join(dir_path, "any")
    file_io.create_dir(any_path)
    files = ["file1.txt", "file2.txt", "file3.txt"]
    for name in files:
      file_path = os.path.join(any_path, name)
      file_io.FileIO(file_path, mode="w").write("testing")
    file_path = os.path.join(noread_path, "file4.txt")
    file_io.FileIO(file_path, mode="w").write("testing")
    # Remove all access from the `noread` directory.
    os.chmod(noread_path, 0)
    expected_match = [os.path.join(any_path, name) for name in files]
    self.assertItemsEqual(
        file_io.get_matching_files(os.path.join(dir_path, "*", "file*.txt")),
        expected_match)
    # Restore permissions so `noread` can be cleaned up during tearDown.
    os.chmod(noread_path, 0o777)
  def _run_batch_prediction(self):
    """Run batch prediction using the cloudml engine prediction service.

    There is no local version of this step as it's the last step.
    """

    job_name = 'test_mltoolbox_batchprediction_%s' % uuid.uuid4().hex
    cmd = ['gcloud ml-engine jobs submit prediction ' + job_name,
           '--data-format=TEXT',
           '--input-paths=' + self._csv_predict_filename,
           '--output-path=' + self._prediction_output,
           '--model-dir=' + os.path.join(self._train_output, 'model'),
           '--runtime-version=1.0',
           '--region=us-central1']
    self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
    # The submit command returns once the job is queued; the job itself runs
    # asynchronously, so stream the logs below to wait for completion.
    subprocess.check_call(' '.join(cmd), shell=True)
    subprocess.check_call('gcloud ml-engine jobs stream-logs ' + job_name, shell=True)

    # Check that there were no errors.
    error_files = file_io.get_matching_files(
        os.path.join(self._prediction_output, 'prediction.errors_stats*'))
    self.assertEqual(1, len(error_files))
    error_str = file_io.read_file_to_string(error_files[0])
    self.assertEqual('', error_str)
Example #48
def run_local_analysis(output_dir, csv_file_pattern, schema, inverted_features):
  """Use pandas to analyze csv files.

  Produces a stats file and vocab files.

  Args:
    output_dir: output folder
    csv_file_pattern: list of csv file paths, may contain wildcards
    schema: BQ schema list
    inverted_features: inverted_features dict

  Raises:
    ValueError: on unknown transforms or schema types.
  """
  sys.stdout.write('Expanding any file patterns...\n')
  sys.stdout.flush()
  header = [column['name'] for column in schema]
  input_files = []
  for file_pattern in csv_file_pattern:
    input_files.extend(file_io.get_matching_files(file_pattern))
  sys.stdout.write('file list computed.\n')
  sys.stdout.flush()

  # Make a copy of inverted_features and update the target transform to be
  # identity or one hot depending on the schema.
  inverted_features_target = copy.deepcopy(inverted_features)
  for name, transform_set in six.iteritems(inverted_features_target):
    if transform_set == set([constant.TARGET_TRANSFORM]):
      target_schema = next(col['type'].lower() for col in schema if col['name'] == name)
      if target_schema in constant.NUMERIC_SCHEMA:
        inverted_features_target[name] = {constant.IDENTITY_TRANSFORM}
      else:
        inverted_features_target[name] = {constant.ONE_HOT_TRANSFORM}

  # initialize the results
  def _init_numerical_results():
    return {'min': float('inf'),
            'max': float('-inf'),
            'count': 0,
            'sum': 0.0}
  numerical_results = collections.defaultdict(_init_numerical_results)
  vocabs = collections.defaultdict(lambda: collections.defaultdict(int))

  num_examples = 0
  # for each file, update the numerical stats from that file, and update the set
  # of unique labels.
  for input_file in input_files:
    sys.stdout.write('Analyzing file %s...\n' % input_file)
    sys.stdout.flush()
    with file_io.FileIO(input_file, 'r') as f:
      for line in csv.reader(f):
        if len(header) != len(line):
          raise ValueError('Schema has %d columns but a csv line only has %d columns.' %
                           (len(header), len(line)))
        parsed_line = dict(zip(header, line))
        num_examples += 1

        for col_name, transform_set in six.iteritems(inverted_features_target):
          # All transforms in transform_set require the same analysis. So look
          # at the first transform.
          transform_name = next(iter(transform_set))
          if transform_name in constant.TEXT_TRANSFORMS:
            split_strings = parsed_line[col_name].split(' ')

            # If a label appears in the row N times, increase its vocab count by 1.
            # This is needed for TFIDF, but it's also an interesting stat.
            for one_label in set(split_strings):
              # Filter out empty strings
              if one_label:
                vocabs[col_name][one_label] += 1
          elif transform_name in constant.CATEGORICAL_TRANSFORMS:
            if parsed_line[col_name]:
              vocabs[col_name][parsed_line[col_name]] += 1
          elif transform_name in constant.NUMERIC_TRANSFORMS:
            if not parsed_line[col_name].strip():
              continue

            numerical_results[col_name]['min'] = (
              min(numerical_results[col_name]['min'],
                  float(parsed_line[col_name])))
            numerical_results[col_name]['max'] = (
              max(numerical_results[col_name]['max'],
                  float(parsed_line[col_name])))
            numerical_results[col_name]['count'] += 1
            numerical_results[col_name]['sum'] += float(parsed_line[col_name])

    sys.stdout.write('file %s analyzed.\n' % input_file)
    sys.stdout.flush()

  # Write the vocab files. Each label is on its own line.
  vocab_sizes = {}
  for name, label_count in six.iteritems(vocabs):
    # df is now:
    # label1,count
    # label2,count
    # ...
    # where label1 is the most frequent label, and label2 is the 2nd most, etc.
    df = pd.DataFrame([{'label': label, 'count': count}
                       for label, count in sorted(six.iteritems(label_count),
                                                  key=lambda x: x[1],
                                                  reverse=True)],
                      columns=['label', 'count'])
    csv_string = df.to_csv(index=False, header=False)

    file_io.write_string_to_file(
        os.path.join(output_dir, constant.VOCAB_ANALYSIS_FILE % name),
        csv_string)

    vocab_sizes[name] = {'vocab_size': len(label_count)}

  # Update numerical_results to just have min/max/mean.
  for col_name in numerical_results:
    if float(numerical_results[col_name]['count']) == 0:
      raise ValueError('Column %s has a zero count' % col_name)
    mean = (numerical_results[col_name]['sum'] /
            float(numerical_results[col_name]['count']))
    del numerical_results[col_name]['sum']
    del numerical_results[col_name]['count']
    numerical_results[col_name]['mean'] = mean

  # Write the stats file.
  numerical_results.update(vocab_sizes)
  stats = {'column_stats': numerical_results, 'num_examples': num_examples}
  file_io.write_string_to_file(
      os.path.join(output_dir, constant.STATS_FILE),
      json.dumps(stats, indent=2, separators=(',', ': ')))
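
# Hedged usage sketch; the schema, feature names, and paths are illustrative only,
# and the transform constants are the ones referenced in the function above.
example_schema = [
    {'name': 'num', 'type': 'integer'},
    {'name': 'category', 'type': 'string'},
    {'name': 'target', 'type': 'float'}]
example_inverted_features = {
    'num': {constant.IDENTITY_TRANSFORM},
    'category': {constant.ONE_HOT_TRANSFORM},
    'target': {constant.TARGET_TRANSFORM}}
run_local_analysis(
    output_dir='/tmp/analysis_output',
    csv_file_pattern=['/tmp/data/train-*.csv'],
    schema=example_schema,
    inverted_features=example_inverted_features)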
def load_session_bundle_from_path(export_dir,
                                  target="",
                                  config=None,
                                  meta_graph_def=None):
  """Load session bundle from the given path.

  The function reads input from the export_dir, adds the graph data to the
  default graph, and restores the parameters for the session it creates.

  Args:
    export_dir: the directory that contains files exported by exporter.
    target: The execution engine to connect to. See target in
      tf.compat.v1.Session()
    config: A ConfigProto proto with configuration options. See config in
      tf.compat.v1.Session()
    meta_graph_def: optional object of type MetaGraphDef. If this object is
      present, then it is used instead of parsing MetaGraphDef from export_dir.

  Returns:
    session: a tensorflow session created from the variable files.
    meta_graph: a meta graph proto saved in the exporter directory.

  Raises:
    RuntimeError: if the required files are missing or contain unrecognizable
    fields, i.e. the exported model is invalid.
  """
  if not meta_graph_def:
    meta_graph_filename = os.path.join(export_dir,
                                       constants.META_GRAPH_DEF_FILENAME)
    if not file_io.file_exists(meta_graph_filename):
      raise RuntimeError("Expected meta graph file missing %s" %
                         meta_graph_filename)
    # Reads meta graph file.
    meta_graph_def = meta_graph_pb2.MetaGraphDef()
    meta_graph_def.ParseFromString(
        file_io.read_file_to_string(meta_graph_filename, binary_mode=True))

  variables_filename = ""
  variables_filename_list = []
  checkpoint_sharded = False

  variables_index_filename = os.path.join(export_dir,
                                          constants.VARIABLES_INDEX_FILENAME_V2)
  checkpoint_v2 = file_io.file_exists(variables_index_filename)

  # Find matching checkpoint files.
  if checkpoint_v2:
    # The checkpoint is in v2 format.
    variables_filename_pattern = os.path.join(
        export_dir, constants.VARIABLES_FILENAME_PATTERN_V2)
    variables_filename_list = file_io.get_matching_files(
        variables_filename_pattern)
    checkpoint_sharded = True
  else:
    variables_filename = os.path.join(export_dir, constants.VARIABLES_FILENAME)
    if file_io.file_exists(variables_filename):
      variables_filename_list = [variables_filename]
    else:
      variables_filename = os.path.join(export_dir,
                                        constants.VARIABLES_FILENAME_PATTERN)
      variables_filename_list = file_io.get_matching_files(variables_filename)
      checkpoint_sharded = True

  # Prepare the files to restore a session.
  if not variables_filename_list:
    restore_files = ""
  elif checkpoint_v2 or not checkpoint_sharded:
    # For checkpoint v2 or v1 with non-sharded files, use "export" to restore
    # the session.
    restore_files = constants.VARIABLES_FILENAME
  else:
    restore_files = constants.VARIABLES_FILENAME_PATTERN

  assets_dir = os.path.join(export_dir, constants.ASSETS_DIRECTORY)

  collection_def = meta_graph_def.collection_def
  graph_def = graph_pb2.GraphDef()
  if constants.GRAPH_KEY in collection_def:
    # Use serving graph_def in MetaGraphDef collection_def if exists
    graph_def_any = collection_def[constants.GRAPH_KEY].any_list.value
    if len(graph_def_any) != 1:
      raise RuntimeError("Expected exactly one serving GraphDef in : %s" %
                         meta_graph_def)
    else:
      graph_def_any[0].Unpack(graph_def)
      # Replace the graph def in meta graph proto.
      meta_graph_def.graph_def.CopyFrom(graph_def)

  ops.reset_default_graph()
  sess = session.Session(target, graph=None, config=config)
  # Import the graph.
  saver = saver_lib.import_meta_graph(meta_graph_def)
  # Restore the session.
  if restore_files:
    saver.restore(sess, os.path.join(export_dir, restore_files))

  init_op_tensor = None
  if constants.INIT_OP_KEY in collection_def:
    init_ops = collection_def[constants.INIT_OP_KEY].node_list.value
    if len(init_ops) != 1:
      raise RuntimeError("Expected exactly one serving init op in : %s" %
                         meta_graph_def)
    init_op_tensor = ops.get_collection(constants.INIT_OP_KEY)[0]

  # Create asset input tensor list.
  asset_tensor_dict = {}
  if constants.ASSETS_KEY in collection_def:
    assets_any = collection_def[constants.ASSETS_KEY].any_list.value
    for asset in assets_any:
      asset_pb = manifest_pb2.AssetFile()
      asset.Unpack(asset_pb)
      asset_tensor_dict[asset_pb.tensor_binding.tensor_name] = os.path.join(
          assets_dir, asset_pb.filename)

  if init_op_tensor:
    # Run the init op.
    sess.run(fetches=[init_op_tensor], feed_dict=asset_tensor_dict)

  return sess, meta_graph_def
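
# Hedged usage sketch; the export path and tensor names are placeholders and
# depend entirely on how the model was exported.
sess, meta_graph = load_session_bundle_from_path('/tmp/exported_model')
predictions = sess.run('output:0', feed_dict={'input:0': [[1.0, 2.0, 3.0]]})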
def _GetGoldenApiDefs():
  old_api_def_files = file_io.get_matching_files(_GetApiDefFilePath('*'))
  return {file_path: file_io.read_file_to_string(file_path)
          for file_path in old_api_def_files}
Example #51
def create_object_test():
  """Verifies file_io's object manipulation methods ."""
  starttime_ms = int(round(time.time() * 1000))
  dir_name = "%s/tf_gcs_test_%s" % (FLAGS.gcs_bucket_url, starttime_ms)
  print("Creating dir %s." % dir_name)
  file_io.create_dir(dir_name)

  num_files = 5
  # Create files of 2 different patterns in this directory.
  files_pattern_1 = ["%s/test_file_%d.txt" % (dir_name, n)
                     for n in range(num_files)]
  files_pattern_2 = ["%s/testfile%d.txt" % (dir_name, n)
                     for n in range(num_files)]

  starttime_ms = int(round(time.time() * 1000))
  files_to_create = files_pattern_1 + files_pattern_2
  for file_name in files_to_create:
    print("Creating file %s." % file_name)
    file_io.write_string_to_file(file_name, "test file creation.")
  elapsed_ms = int(round(time.time() * 1000)) - starttime_ms
  print("Created %d files in %s milliseconds" %
        (len(files_to_create), elapsed_ms))

  # Listing files of pattern1.
  list_files_pattern = "%s/test_file*.txt" % dir_name
  print("Getting files matching pattern %s." % list_files_pattern)
  starttime_ms = int(round(time.time() * 1000))
  files_list = file_io.get_matching_files(list_files_pattern)
  elapsed_ms = int(round(time.time() * 1000)) - starttime_ms
  print("Listed files in %s milliseconds" % elapsed_ms)
  print(files_list)
  assert set(files_list) == set(files_pattern_1)

  # Listing files of pattern2.
  list_files_pattern = "%s/testfile*.txt" % dir_name
  print("Getting files matching pattern %s." % list_files_pattern)
  starttime_ms = int(round(time.time() * 1000))
  files_list = file_io.get_matching_files(list_files_pattern)
  elapsed_ms = int(round(time.time() * 1000)) - starttime_ms
  print("Listed files in %s milliseconds" % elapsed_ms)
  print(files_list)
  assert set(files_list) == set(files_pattern_2)

  # Test renaming file.
  file_to_rename = "%s/oldname.txt" % dir_name
  file_new_name = "%s/newname.txt" % dir_name
  file_io.write_string_to_file(file_to_rename, "test file.")
  assert file_io.file_exists(file_to_rename)
  assert not file_io.file_exists(file_new_name)

  print("Will try renaming file %s to %s" % (file_to_rename, file_new_name))
  starttime_ms = int(round(time.time() * 1000))
  file_io.rename(file_to_rename, file_new_name)
  elapsed_ms = int(round(time.time() * 1000)) - starttime_ms
  print("File %s renamed to %s in %s milliseconds" % (
      file_to_rename, file_new_name, elapsed_ms))
  assert not file_io.file_exists(file_to_rename)
  assert file_io.file_exists(file_new_name)

  # Delete directory.
  print("Deleting directory %s." % dir_name)
  file_io.delete_recursively(dir_name)
def load_session_bundle_from_path(export_dir, target="", config=None):
  """Load session bundle from the given path.

  The function reads input from the export_dir, adds the graph data to the
  default graph, and restores the parameters for the session it creates.

  Args:
    export_dir: the directory that contains files exported by exporter.
    target: The execution engine to connect to. See target in tf.Session()
    config: A ConfigProto proto with configuration options. See config in
    tf.Session()

  Returns:
    session: a tensorflow session created from the variable files.
    meta_graph: a meta graph proto saved in the exporter directory.

  Raises:
    RuntimeError: if the required files are missing or contain unrecognizable
    fields, i.e. the exported model is invalid.
  """
  meta_graph_filename = os.path.join(export_dir,
                                     constants.META_GRAPH_DEF_FILENAME)
  if not file_io.file_exists(meta_graph_filename):
    raise RuntimeError("Expected meta graph file missing %s" %
                       meta_graph_filename)
  variables_filename = os.path.join(export_dir,
                                    constants.VARIABLES_FILENAME)
  if not file_io.file_exists(variables_filename):
    variables_filename = os.path.join(
        export_dir, constants.VARIABLES_FILENAME_PATTERN)
    if not file_io.get_matching_files(variables_filename):
      # If graph_util.convert_variables_to_constants() is called on a model
      # it won't have any variables, and that's OK.
      #
      # TODO(yxshi): verify that the graph_def in fact does not have any
      # reachable variables.
      variables_filename = None
  assets_dir = os.path.join(export_dir, constants.ASSETS_DIRECTORY)

  # Reads meta graph file.
  meta_graph_def = meta_graph_pb2.MetaGraphDef()
  meta_graph_def.ParseFromString(file_io.read_file_to_string(
      meta_graph_filename))

  collection_def = meta_graph_def.collection_def
  graph_def = tf.GraphDef()
  if constants.GRAPH_KEY in collection_def:
    # Use serving graph_def in MetaGraphDef collection_def if exists
    graph_def_any = collection_def[constants.GRAPH_KEY].any_list.value
    if len(graph_def_any) != 1:
      raise RuntimeError(
          "Expected exactly one serving GraphDef in : %s" % meta_graph_def)
    else:
      graph_def_any[0].Unpack(graph_def)
      # Replace the graph def in meta graph proto.
      meta_graph_def.graph_def.CopyFrom(graph_def)

  tf.reset_default_graph()
  sess = tf.Session(target, graph=None, config=config)
  # Import the graph.
  saver = tf.train.import_meta_graph(meta_graph_def)
  # Restore the session.
  if variables_filename:
    saver.restore(sess, variables_filename)

  init_op_tensor = None
  if constants.INIT_OP_KEY in collection_def:
    init_ops = collection_def[constants.INIT_OP_KEY].node_list.value
    if len(init_ops) != 1:
      raise RuntimeError(
          "Expected exactly one serving init op in : %s" % meta_graph_def)
    init_op_tensor = tf.get_collection(constants.INIT_OP_KEY)[0]

  # Create asset input tensor list.
  asset_tensor_dict = {}
  if constants.ASSETS_KEY in collection_def:
    assets_any = collection_def[constants.ASSETS_KEY].any_list.value
    for asset in assets_any:
      asset_pb = manifest_pb2.AssetFile()
      asset.Unpack(asset_pb)
      asset_tensor_dict[asset_pb.tensor_binding.tensor_name] = os.path.join(
          assets_dir, asset_pb.filename)

  if init_op_tensor:
    # Run the init op.
    sess.run(fetches=[init_op_tensor], feed_dict=asset_tensor_dict)

  return sess, meta_graph_def
  def transformed_training_input_fn():
    """Training input function that reads transformed data."""

    if isinstance(raw_data_file_pattern, six.string_types):
      filepath_list = [raw_data_file_pattern]
    else:
      filepath_list = raw_data_file_pattern

    files = []
    for path in filepath_list:
      files.extend(file_io.get_matching_files(path))

    filename_queue = tf.train.string_input_producer(
        files, num_epochs=num_epochs, shuffle=randomize_input)

    options = tf.python_io.TFRecordOptions(
        compression_type=tf.python_io.TFRecordCompressionType.GZIP)
    ex_id, ex_str = tf.TFRecordReader(options=options).read_up_to(
        filename_queue, training_batch_size)

    queue_capacity = (reader_num_threads + 3) * training_batch_size + min_after_dequeue
    if randomize_input:
      _, batch_ex_str = tf.train.shuffle_batch(
          tensors=[ex_id, ex_str],
          batch_size=training_batch_size,
          capacity=queue_capacity,
          min_after_dequeue=min_after_dequeue,
          enqueue_many=True,
          num_threads=reader_num_threads,
          allow_smaller_final_batch=allow_smaller_final_batch)

    else:
      _, batch_ex_str = tf.train.batch(
          tensors=[ex_id, ex_str],
          batch_size=training_batch_size,
          capacity=queue_capacity,
          enqueue_many=True,
          num_threads=reader_num_threads,
          allow_smaller_final_batch=allow_smaller_final_batch)

    feature_spec = {}
    feature_info = get_transformed_feature_info(features, schema)
    for name, info in six.iteritems(feature_info):
      if info['size'] is None:
        feature_spec[name] = tf.VarLenFeature(dtype=info['dtype'])
      else:
        feature_spec[name] = tf.FixedLenFeature(shape=[info['size']], dtype=info['dtype'])

    parsed_tensors = tf.parse_example(batch_ex_str, feature_spec)

    # Expand the dims of non-sparse tensors. This is needed by tf.learn.
    transformed_features = {}
    for k, v in six.iteritems(parsed_tensors):
      if isinstance(v, tf.Tensor) and v.get_shape().ndims == 1:
        transformed_features[k] = tf.expand_dims(v, -1)
      else:
        # Sparse tensor
        transformed_features[k] = v

    transformed_features = image_feature_engineering(
        features=features,
        feature_tensors_dict=transformed_features)

    # Remove the target tensor, and return it directly
    target_name = get_target_name(features)
    if not target_name or target_name not in transformed_features:
      raise ValueError('Cannot find target transform in features')

    transformed_target = transformed_features.pop(target_name)

    return transformed_features, transformed_target
  def match_maybe_append(pathname):
    fnames = file_io.get_matching_files(pathname)
    if fnames:
      mtimes.append(file_io.stat(fnames[0]).mtime_nsec / 1e9)
      return True
    return False
def _delete_file_if_exists(filespec):
  """Deletes files matching `filespec`."""
  for pathname in file_io.get_matching_files(filespec):
    file_io.delete_file(pathname)
def run_numerical_categorical_analysis(args, schema_list):
  """Makes the numerical and categorical analysis files.

  Args:
    args: the command line args
    schema_list: python object of the schema json file.

  Raises:
    ValueError: if schema contains unknown column types.
  """
  header = [column['name'] for column in schema_list]
  input_files = file_io.get_matching_files(args.input_file_pattern)

  # Check the schema is valid
  for col_schema in schema_list:
    col_type = col_schema['type'].lower()
    if col_type not in ('string', 'integer', 'float'):
      raise ValueError('Schema contains an unsupported type %s.' % col_type)

  # initialize the results
  def _init_numerical_results():
    return {'min': float('inf'),
            'max': float('-inf'),
            'count': 0,
            'sum': 0.0}
  numerical_results = collections.defaultdict(_init_numerical_results)
  categorical_results = collections.defaultdict(set)

  # for each file, update the numerical stats from that file, and update the set
  # of unique labels.
  for input_file in input_files:
    with file_io.FileIO(input_file, 'r') as f:
      for line in f:
        parsed_line = dict(zip(header, line.strip().split(',')))

        for col_schema in schema_list:
          col_name = col_schema['name']
          col_type = col_schema['type']
          if col_type.lower() == 'string':
            categorical_results[col_name].update([parsed_line[col_name]])
          else:
            # numerical column.

            # if empty, skip
            if not parsed_line[col_name].strip():
              continue

            numerical_results[col_name]['min'] = (
              min(numerical_results[col_name]['min'],
                  float(parsed_line[col_name])))
            numerical_results[col_name]['max'] = (
              max(numerical_results[col_name]['max'],
                  float(parsed_line[col_name])))
            numerical_results[col_name]['count'] += 1
            numerical_results[col_name]['sum'] += float(parsed_line[col_name])

  # Update numerical_results to just have min/max/mean.
  for col_schema in schema_list:
    if col_schema['type'].lower() != 'string':
      col_name = col_schema['name']
      mean = numerical_results[col_name]['sum'] / numerical_results[col_name]['count']
      del numerical_results[col_name]['sum']
      del numerical_results[col_name]['count']
      numerical_results[col_name]['mean'] = mean

  # Write the numerical_results to a json file.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, NUMERICAL_ANALYSIS_FILE),
      json.dumps(numerical_results, indent=2, separators=(',', ': ')))

  # Write the vocab files. Each label is on its own line.
  for name, unique_labels in six.iteritems(categorical_results):
    labels = '\n'.join(list(unique_labels))
    file_io.write_string_to_file(
        os.path.join(args.output_dir, CATEGORICAL_ANALYSIS_FILE % name),
        labels)
Example #57
  def testMultipleColumnsTransformed(self):
    """Test training starting from tf.example."""
    output_dir = tempfile.mkdtemp()
    try:
      features = {
          'num': {'transform': 'identity'},
          'num2': {'transform': 'key', 'source_column': 'num'},
          'target': {'transform': 'target'},
          'text': {'transform': 'bag_of_words'},
          'text2': {'transform': 'tfidf', 'source_column': 'text'},
          'text3': {'transform': 'key', 'source_column': 'text'}}
      schema = [
          {'name': 'num', 'type': 'integer'},
          {'name': 'target', 'type': 'float'},
          {'name': 'text', 'type': 'string'}]
      data = ['1,2,hello world\n', '4,8,bye moon\n', '5,10,hello moon\n', '11,22,moon moon\n']
      file_io.recursive_create_dir(output_dir)
      file_io.write_string_to_file(os.path.join(output_dir, 'schema.json'),
                                   json.dumps(schema, indent=2))
      file_io.write_string_to_file(os.path.join(output_dir, 'features.json'),
                                   json.dumps(features, indent=2))
      file_io.write_string_to_file(os.path.join(output_dir, 'data.csv'),
                                   ''.join(data))

      cmd = ['python %s' % os.path.join(CODE_PATH, 'analyze.py'),
             '--output=' + os.path.join(output_dir, 'analysis'),
             '--csv=' + os.path.join(output_dir, 'data.csv'),
             '--schema=' + os.path.join(output_dir, 'schema.json'),
             '--features=' + os.path.join(output_dir, 'features.json')]
      subprocess.check_call(' '.join(cmd), shell=True)

      cmd = ['python %s' % os.path.join(CODE_PATH, 'transform.py'),
             '--output=' + os.path.join(output_dir, 'transform'),
             '--csv=' + os.path.join(output_dir, 'data.csv'),
             '--analysis=' + os.path.join(output_dir, 'analysis'),
             '--prefix=features']
      subprocess.check_call(' '.join(cmd), shell=True)

      # Check tf.example file has the expected features
      file_list = file_io.get_matching_files(os.path.join(output_dir, 'transform', 'features*'))
      options = tf.python_io.TFRecordOptions(
          compression_type=tf.python_io.TFRecordCompressionType.GZIP)
      record_iter = tf.python_io.tf_record_iterator(path=file_list[0], options=options)
      tf_example = tf.train.Example()
      tf_example.ParseFromString(next(record_iter))

      self.assertEqual(1, len(tf_example.features.feature['num'].int64_list.value))
      self.assertEqual(1, len(tf_example.features.feature['num2'].int64_list.value))
      self.assertEqual(1, len(tf_example.features.feature['target'].float_list.value))
      self.assertEqual(2, len(tf_example.features.feature['text_ids'].int64_list.value))
      self.assertEqual(2, len(tf_example.features.feature['text_weights'].float_list.value))
      self.assertEqual(2, len(tf_example.features.feature['text2_ids'].int64_list.value))
      self.assertEqual(2, len(tf_example.features.feature['text2_weights'].float_list.value))
      self.assertEqual(1, len(tf_example.features.feature['text3'].bytes_list.value))

      cmd = ['cd %s && ' % CODE_PATH,
             'python -m trainer.task',
             '--train=' + os.path.join(output_dir, 'data.csv'),
             '--eval=' + os.path.join(output_dir, 'data.csv'),
             '--job-dir=' + os.path.join(output_dir, 'training'),
             '--analysis=' + os.path.join(output_dir, 'analysis'),
             '--model=linear_regression',
             '--train-batch-size=4',
             '--eval-batch-size=4',
             '--max-steps=200',
             '--learning-rate=0.1',
             '--transform']
      subprocess.check_call(' '.join(cmd), shell=True)

      result = run_exported_model(
          model_path=os.path.join(output_dir, 'training', 'model'),
          csv_data=['20,hello moon'])

      # check keys were made
      self.assertEqual(20, result['num2'])
      self.assertEqual('hello moon', result['text3'])
    finally:
      shutil.rmtree(output_dir)
Example #58
def _files(pattern):
    """Converts a file pattern to a list of files."""
    files = file_io.get_matching_files(pattern)
    if not files:
        raise IOError('Unable to find input files.')
    return files
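
# Hedged usage sketch; the bucket and pattern are placeholders.
input_files = _files('gs://my-bucket/data/part-*.tfrecord')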