Example #1
import tensorflow as tf
from tensorflow.python.training import queue_runner

def _make_batch_queue(input, capacity, num_threads=1):
    """Builds a padding FIFO queue (plus queue runners) for the given tensors."""
    queue = tf.PaddingFIFOQueue(capacity=capacity,
                                dtypes=[s.dtype for s in input],
                                shapes=[s.get_shape() for s in input])
    # Track queue occupancy in TensorBoard.
    tf.summary.scalar("fraction_of_%d_full" % capacity,
                      tf.cast(queue.size(), tf.float32) * (1. / capacity))
    # One enqueue op per worker; the QueueRunner spawns one thread per op.
    enqueue_ops = [queue.enqueue(input)] * num_threads
    queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops))
    return queue
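A queue built this way only fills once its queue runners are started. A minimal usage sketch (the input tensors here are illustrative, not from the original project):

# Illustrative inputs: a variable-length int sequence and its length.
batch_tensors = [tf.constant([1, 2, 3]), tf.constant(3)]
batch_queue = _make_batch_queue(batch_tensors, capacity=32, num_threads=2)
batch = batch_queue.dequeue()
with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for _ in range(100):
        sess.run(batch)  # dequeues one padded element
    coord.request_stop()
    coord.join(threads)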
Example #2
  def __init__(self, config):
    self.config = config
    self.context_embeddings = util.EmbeddingDictionary(config["context_embeddings"])
    self.context_embeddings_size = self.context_embeddings.size

    self.char_embedding_size = config["char_embedding_size"]
    self.char_dict = util.load_char_dict(config["char_vocab_path"])

    if self.config["lm_path"].lower() == "none":
      self.lm_file = None
    else:
      self.lm_file = h5py.File(self.config["lm_path"], "r")
    self.lm_layers = self.config["lm_layers"]
    self.lm_size = self.config["lm_size"]

    self.eval_data = None  # Load eval data lazily.
    self.ner_types = self.config['ner_types']
    self.ner_maps = {ner: (i + 1) for i, ner in enumerate(self.ner_types)}
    self.num_types = len(self.ner_types)

    input_props = []
    input_props.append((tf.string, [None, None]))  # Tokens.
    input_props.append((tf.float32, [None, None, self.context_embeddings_size]))  # Context embeddings.
    input_props.append((tf.float32, [None, None, self.lm_size, self.lm_layers]))  # LM embeddings.
    input_props.append((tf.int32, [None, None, None]))  # Character indices.
    input_props.append((tf.int32, [None]))  # Text lengths.
    input_props.append((tf.bool, []))  # Is training.
    input_props.append((tf.int32, [None]))  # Gold NER labels.

    self.queue_input_tensors = [tf.placeholder(dtype, shape) for dtype, shape in input_props]
    dtypes, shapes = zip(*input_props)
    queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes, shapes=shapes)
    self.enqueue_op = queue.enqueue(self.queue_input_tensors)
    self.input_tensors = queue.dequeue()

    self.predictions, self.loss = self.get_predictions_and_loss(self.input_tensors)
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.reset_global_step = tf.assign(self.global_step, 0)
    learning_rate = tf.train.exponential_decay(self.config["learning_rate"], self.global_step,
                                               self.config["decay_frequency"], self.config["decay_rate"],
                                               staircase=True)
    trainable_params = tf.trainable_variables()
    gradients = tf.gradients(self.loss, trainable_params)
    gradients, _ = tf.clip_by_global_norm(gradients, self.config["max_gradient_norm"])
    optimizers = {
      "adam": tf.train.AdamOptimizer,
      "sgd": tf.train.GradientDescentOptimizer
    }
    optimizer = optimizers[self.config["optimizer"]](learning_rate)
    self.train_op = optimizer.apply_gradients(zip(gradients, trainable_params), global_step=self.global_step)
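In this pattern the placeholders are never fed by the training loop itself; a background thread pushes tensorized examples through enqueue_op while the graph reads from queue.dequeue(). A hedged sketch of such a feeder (tensorize_example is a hypothetical helper that must return one numpy value per entry in input_props, in the same order):

import threading

def start_enqueue_thread(model, session, examples):
    def _enqueue_loop():
        while True:
            for example in examples:
                # tensorize_example is hypothetical, not from the original.
                feed_dict = dict(zip(model.queue_input_tensors,
                                     tensorize_example(example)))
                session.run(model.enqueue_op, feed_dict=feed_dict)
    thread = threading.Thread(target=_enqueue_loop, daemon=True)
    thread.start()
    return thread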
Example #3
    def init_queue(self):
        queue_types = []
        queue_shapes = []
        self.placeholders = []
        for (name, shape) in self.data_names_and_shapes:
            if self.data_types is not None and name in self.data_types:
                types = self.data_types[name]
            else:
                types = tf.float32
            queue_shapes.append(shape)
            queue_types.append(types)
            self.placeholders.append(
                tf.placeholder(types, shape, name='placeholder_' + name))

        self.queue = tf.PaddingFIFOQueue(self.queue_size, queue_types,
                                         queue_shapes)
        self.enqueue = self.queue.enqueue(self.placeholders)
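For reference, the attributes this method reads could be initialized as follows (a hypothetical configuration for illustration, not taken from the original):

        self.queue_size = 16
        self.data_names_and_shapes = [
            ("features", [None, 40]),  # variable-length frames, 40-dim features
            ("labels", [None]),        # variable-length label sequence
        ]
        self.data_types = {"labels": tf.int32}  # others default to tf.float32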
Example #4
def prefetch(tensor_dict, capacity):
    """Creates a prefetch queue for tensors.

    Creates a FIFO queue to asynchronously enqueue tensor_dicts and returns a
    dequeue op that evaluates to a tensor_dict. This function is useful in
    prefetching preprocessed tensors so that the data is readily available for
    consumers.

    Example input pipeline when you don't need batching:
    ----------------------------------------------------
    key, string_tensor = slim.parallel_reader.parallel_read(...)
    tensor_dict = decoder.decode(string_tensor)
    tensor_dict = preprocessor.preprocess(tensor_dict, ...)
    prefetch_queue = prefetcher.prefetch(tensor_dict, capacity=20)
    tensor_dict = prefetch_queue.dequeue()
    outputs = Model(tensor_dict)
    ...
    ----------------------------------------------------

    For input pipelines with batching, refer to core/batcher.py.

    Args:
      tensor_dict: a dictionary of tensors to prefetch.
      capacity: the size of the prefetch queue.

    Returns:
      a FIFO prefetcher queue
    """
    names = list(tensor_dict.keys())
    dtypes = [t.dtype for t in tensor_dict.values()]
    shapes = [t.get_shape() for t in tensor_dict.values()]
    prefetch_queue = tf.PaddingFIFOQueue(capacity,
                                         dtypes=dtypes,
                                         shapes=shapes,
                                         names=names,
                                         name='prefetch_queue')
    enqueue_op = prefetch_queue.enqueue(tensor_dict)
    tf.train.queue_runner.add_queue_runner(
        tf.train.queue_runner.QueueRunner(prefetch_queue, [enqueue_op]))
    tf.summary.scalar(
        'queue/%s/fraction_of_%d_full' % (prefetch_queue.name, capacity),
        tf.cast(prefetch_queue.size(), dtype=tf.float32) * (1. / capacity))
    return prefetch_queue
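Because the queue is built with names=names, its dequeue() returns a dictionary keyed like the original tensor_dict rather than a list. A minimal sketch (tensor contents are illustrative); as in the earlier examples, the queue only fills after tf.train.start_queue_runners is called:

tensor_dict = {
    "image": tf.random_uniform([480, 640, 3]),
    "label": tf.constant([1, 2, 3]),
}
prefetch_queue = prefetch(tensor_dict, capacity=20)
dequeued = prefetch_queue.dequeue()  # a dict: {"image": ..., "label": ...}
model_input = dequeued["image"]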
Example #5
    def __init__(self, config):
        self.config = config
        self.context_embeddings = util.EmbeddingDictionary(
            config["context_embeddings"], config['datapath'])
        self.head_embeddings = util.EmbeddingDictionary(
            config["context_embeddings"],
            config['datapath'],
            maybe_cache=self.context_embeddings)
        self.char_embedding_size = config["char_embedding_size"]
        self.char_dict = util.load_char_dict(
            os.path.join(config['datapath'], config["char_vocab_path"]))
        self.max_span_width = config["max_span_width"]
        self.genres = {g: i for i, g in enumerate(config["genres"])}
        if config["lm_path"]:
            self.lm_file = h5py.File(
                os.path.join(config['datapath'], self.config["lm_path"]), "r")
        else:
            self.lm_file = None
        if config["lm_model_name"]:
            self.bert_tokenizer, self.bert_model = bert.load_bert(
                self.config["lm_model_name"])
        else:
            self.bert_tokenizer = None
            self.bert_model = None
        self.lm_layers = self.config["lm_layers"]
        self.lm_size = self.config["lm_size"]
        self.eval_data = None  # Load eval data lazily.

        input_props = []
        input_props.append((tf.string, [None, None]))  # Tokens.
        # Context embeddings.
        input_props.append(
            (tf.float32, [None, None, self.context_embeddings.size]))
        # Head embeddings.
        input_props.append(
            (tf.float32, [None, None, self.head_embeddings.size]))
        # LM embeddings.
        input_props.append(
            (tf.float32, [None, None, self.lm_size, self.lm_layers]))
        # Character indices.
        input_props.append((tf.int32, [None, None, None]))
        input_props.append((tf.int32, [None]))  # Text lengths.
        input_props.append((tf.int32, []))  # Genre.
        input_props.append((tf.bool, []))  # Is training.
        input_props.append((tf.int32, [None]))  # Gold starts.
        input_props.append((tf.int32, [None]))  # Gold ends.
        input_props.append((tf.int32, [None]))  # Cluster ids.

        self.queue_input_tensors = [
            tf.placeholder(dtype, shape) for dtype, shape in input_props
        ]
        dtypes, shapes = zip(*input_props)
        queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes, shapes=shapes)
        self.enqueue_op = queue.enqueue(self.queue_input_tensors)
        self.input_tensors = queue.dequeue()

        self.predictions, self.loss = self.get_predictions_and_loss(
            *self.input_tensors)
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.reset_global_step = tf.assign(self.global_step, 0)
        learning_rate = tf.train.exponential_decay(
            self.config["learning_rate"],
            self.global_step,
            self.config["decay_frequency"],
            self.config["decay_rate"],
            staircase=True)
        trainable_params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, trainable_params)
        gradients, _ = tf.clip_by_global_norm(gradients,
                                              self.config["max_gradient_norm"])
        optimizers = {
            "adam": tf.train.AdamOptimizer,
            "sgd": tf.train.GradientDescentOptimizer
        }
        optimizer = optimizers[self.config["optimizer"]](learning_rate)
        self.train_op = optimizer.apply_gradients(zip(gradients,
                                                      trainable_params),
                                                  global_step=self.global_step)
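A hedged sketch of the training loop these ops support, assuming a background feeder thread like the one after Example #2 is already filling the queue:

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    # An enqueue thread must already be feeding model.enqueue_op.
    for _ in range(1000):
        _, loss, step = session.run(
            [model.train_op, model.loss, model.global_step])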