Example #1
0
def _read_pretrained_embeddings(pretrained_file, tmp_dir, embedding_dim, vocab,
                                vocab_namespace):
    """Load pretrained embeddings for a vocab namespace, with optional caching.

    Args:
        pretrained_file: path to the raw embedding file (only ``.txt`` is
            currently supported).
        tmp_dir: optional directory in which the parsed embeddings are cached
            as ``embedding.pkl.gz``; created if it does not exist. Falsy
            disables caching.
        embedding_dim: dimensionality of each embedding vector.
        vocab: vocabulary object whose tokens select which vectors to keep.
        vocab_namespace: vocab namespace to look tokens up in.

    Returns:
        The parsed embeddings, or None when the file is missing or its
        extension is unsupported.
    """
    if not os.path.exists(pretrained_file):
        # Message fixed: the original read "is not existing" and did not say
        # which path was missing.
        logger.error("Pretrained embedding file %s does not exist." % pretrained_file)
        return None
    # Resolve the cache path once instead of re-testing tmp_dir twice.
    cache_embedding_file = None
    if tmp_dir:
        if not os.path.exists(tmp_dir):
            tf.gfile.MakeDirs(tmp_dir)
        cache_embedding_file = os.path.join(tmp_dir, "embedding.pkl.gz")
    if cache_embedding_file and os.path.exists(cache_embedding_file):
        logger.info("loading cache embedding from %s." % cache_embedding_file)
        # NOTE(review): pickle is only safe because this cache is produced by
        # the writer below; never point tmp_dir at untrusted data.
        with gzip.open(cache_embedding_file, 'rb') as pkl_file:
            embeddings = pickle.load(pkl_file)
    else:
        file_ext = get_file_extension(pretrained_file)
        if file_ext != '.txt':
            logger.error("Do not support this embedding file type.")
            return None
        embeddings = _read_pretrained_embeddings_text(
            pretrained_file, embedding_dim, vocab, vocab_namespace)
        if cache_embedding_file:
            with gzip.open(cache_embedding_file, 'wb') as pkl_file:
                pickle.dump(embeddings, pkl_file)
    return embeddings
Example #2
0
 def valid(self):
     """Check that index_to_token and token_to_index agree in every namespace.

     Returns True when each (index, token) pair of ``self._index_to_token``
     maps back to the same integer index in ``self._token_to_index``;
     otherwise logs the first mismatch and returns False.
     """
     for ns, idx_to_tok in self._index_to_token.items():
         for idx, tok in idx_to_tok.items():
             if self._token_to_index[ns][tok] == int(idx):
                 continue
             logger.error("index/token in index_to_token with namespace %s: %s/%s not in token_to_index" %
                          (ns, idx, tok))
             return False
     return True
Example #3
0
 def init_from_instances(cls, instances, vocab_init_files=None, pretrained_files=None, only_include_pretrained_words=False):
     """Build a vocabulary by accumulating token counts over *instances*.

     Every instance contributes counts through its ``count_vocab`` method;
     the per-namespace counters are then handed to the class constructor.
     A StopIteration escaping the instance iterator is logged and whatever
     counts were gathered so far are used.
     """
     logger.info("create vocab from instance")
     counts = collections.defaultdict(lambda: collections.defaultdict(int))
     try:
         for instance in tqdm.tqdm(instances):
             instance.count_vocab(counts)
     except StopIteration:
         logger.error("The data reader builds vocabulary error with StopIteration.")
     return cls(counts,
                pretrained_files=pretrained_files,
                vocab_init_files=vocab_init_files,
                only_include_pretrained_words=only_include_pretrained_words)
Example #4
0
 def get_token_index(self, token, namespace):
     """Map *token* (coerced to str) to its integer id inside *namespace*.

     Unknown tokens fall back to the OOV token's id; if the namespace lacks
     an OOV entry as well, the KeyError is logged and re-raised.
     """
     token = str(token)
     mapping = self._token_to_index[namespace]
     try:
         return mapping[token]
     except KeyError:
         pass
     try:
         return mapping[self._oov_token]
     except KeyError:
         logger.error('Namespace: %s', namespace)
         logger.error('Token: %s', token)
         raise
Example #5
0
    def get_schedule(self, train_steps=None, warmup_proportion=None, decay_steps=None,
                     decay_rate=None, decay_type='polynomial'):
        """Build a (TF1) learning-rate-multiplier tensor starting at 1.0.

        The multiplier is decayed by the chosen policy and, optionally,
        ramped up linearly during a warmup phase at the start of training.

        Args:
            train_steps: total number of training steps; required for
                polynomial decay and for warmup.
            warmup_proportion: fraction of ``train_steps`` spent in linear
                warmup; falsy disables warmup.
            decay_steps: step interval for exponential decay.
            decay_rate: per-interval factor for exponential decay.
            decay_type: 'polynomial' or 'exponential'; anything else is
                logged as an error and leaves the rate constant at 1.0.

        Returns:
            A scalar float32 tensor that depends on the global step.
        """
        global_step = tf.train.get_or_create_global_step()
        rate = tf.constant(value=1.0, shape=[], dtype=tf.float32)
        num_train_steps = train_steps
        if decay_type == 'polynomial':
            if num_train_steps:
                # Linear (power=1.0) decay from 1.0 down to 0.0 over training.
                rate = tf.train.polynomial_decay(
                    rate,
                    global_step,
                    num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
        elif decay_type == 'exponential':
            if decay_steps and decay_rate:
                rate = tf.train.exponential_decay(
                    rate,
                    global_step,
                    decay_steps,
                    decay_rate,
                    staircase=False, name=None)
        else:
            logger.error("The decay type %s is not supported." % decay_type)

        if warmup_proportion and num_train_steps:
            # BUG FIX: the guard above tests the *parameter* but the original
            # body read self._warmup_proportion, silently ignoring the value
            # callers passed in. Use the parameter consistently.
            num_warmup_steps = int(num_train_steps * warmup_proportion)
            global_steps_int = tf.cast(global_step, tf.int32)
            warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

            global_steps_float = tf.cast(global_steps_int, tf.float32)
            warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

            # Linear ramp: rate * (step / warmup_steps) while step < warmup,
            # the decayed rate afterwards (BERT-style warmup blend).
            warmup_percent_done = global_steps_float / warmup_steps_float
            warmup_rate = rate * warmup_percent_done

            is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
            rate = (
                    (1.0 - is_warmup) * rate + is_warmup * warmup_rate)
        return rate
Example #6
0
 def get_vocab_path(self, namespace):
     """Return the vocab file path registered for *namespace*, else None."""
     known = namespace in self._namespace_to_path
     if not known:
         logger.error("%s vocab file does not exist." % namespace)
     return self._namespace_to_path[namespace] if known else None