import collections
import gzip
import logging
import os
import pickle

import tensorflow as tf
import tqdm

logger = logging.getLogger(__name__)


def _read_pretrained_embeddings(pretrained_file, tmp_dir, embedding_dim, vocab, vocab_namespace):
    """Read pretrained embeddings for ``vocab``, caching the result in ``tmp_dir``."""
    if not os.path.exists(pretrained_file):
        logger.error("Pretrained embedding file does not exist.")
        return None
    if tmp_dir:
        if not os.path.exists(tmp_dir):
            tf.gfile.MakeDirs(tmp_dir)
        cache_embedding_file = os.path.join(tmp_dir, "embedding.pkl.gz")
    else:
        cache_embedding_file = None
    if tmp_dir and os.path.exists(cache_embedding_file):
        logger.info("loading cached embedding from %s." % cache_embedding_file)
        with gzip.open(cache_embedding_file, 'rb') as pkl_file:
            embeddings = pickle.load(pkl_file)
    else:
        file_ext = get_file_extension(pretrained_file)
        if file_ext in ['.txt']:
            embeddings = _read_pretrained_embeddings_text(
                pretrained_file, embedding_dim, vocab, vocab_namespace)
        else:
            logger.error("Embedding file type %s is not supported." % file_ext)
            return None
        if cache_embedding_file:
            # Persist the freshly read embeddings so later runs hit the cache.
            with gzip.open(cache_embedding_file, 'wb') as pkl_file:
                pickle.dump(embeddings, pkl_file)
    return embeddings

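# A minimal, self-contained sketch of the gzip+pickle caching pattern used
# above. The cache path and the toy embedding values are hypothetical and
# stand in for the real embedding matrix; only the caching mechanics match
# the function above.
def _example_embedding_cache(tmp_dir="/tmp/emb_cache"):
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    cache_file = os.path.join(tmp_dir, "embedding.pkl.gz")
    if os.path.exists(cache_file):
        # Cache hit: deserialize the previously stored matrix.
        with gzip.open(cache_file, 'rb') as pkl_file:
            return pickle.load(pkl_file)
    # Cache miss: "compute" a toy embedding matrix and store it compressed.
    embeddings = [[0.1, 0.2], [0.3, 0.4]]
    with gzip.open(cache_file, 'wb') as pkl_file:
        pickle.dump(embeddings, pkl_file)
    return embeddings
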
def valid(self):
    """Check that index_to_token and token_to_index are mutually consistent."""
    for namespace, mapping in self._index_to_token.items():
        for index, token in mapping.items():
            if self._token_to_index[namespace][token] != int(index):
                logger.error(
                    "index/token pair %s/%s in index_to_token (namespace %s) "
                    "does not match token_to_index" % (index, token, namespace))
                return False
    return True

@classmethod
def init_from_instances(cls, instances, vocab_init_files=None, pretrained_files=None,
                        only_include_pretrained_words=False):
    """Build a vocabulary by counting tokens across a collection of instances."""
    logger.info("create vocab from instances")
    namespace_counter = collections.defaultdict(lambda: collections.defaultdict(int))
    try:
        for instance in tqdm.tqdm(instances):
            instance.count_vocab(namespace_counter)
    except StopIteration:
        logger.error("The data reader raised StopIteration while building the vocabulary.")
    return cls(namespace_counter,
               pretrained_files=pretrained_files,
               vocab_init_files=vocab_init_files,
               only_include_pretrained_words=only_include_pretrained_words)

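# A hedged sketch of the nested-defaultdict counting that init_from_instances
# relies on. The namespace name and token data are made up for illustration;
# in the real flow each instance adds its own tokens via count_vocab().
def _example_namespace_counting():
    namespace_counter = collections.defaultdict(lambda: collections.defaultdict(int))
    for token in ["the", "cat", "the"]:
        # The outer key is the namespace, the inner key is the token string.
        namespace_counter["tokens"][token] += 1
    # namespace_counter["tokens"] -> {"the": 2, "cat": 1}
    return namespace_counter
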
def get_token_index(self, token, namespace):
    """Look up the index of ``token``, falling back to the OOV token if unknown."""
    token = str(token)
    if token in self._token_to_index[namespace]:
        return self._token_to_index[namespace][token]
    else:
        try:
            return self._token_to_index[namespace][self._oov_token]
        except KeyError:
            logger.error('Namespace: %s', namespace)
            logger.error('Token: %s', token)
            raise

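# A small stand-in for the OOV fallback in get_token_index, using a plain
# dict. The vocabulary contents and the "@@UNKNOWN@@" token value are
# assumptions for illustration, not the class's actual reserved token.
def _example_oov_lookup(token, token_to_index=None, oov_token="@@UNKNOWN@@"):
    if token_to_index is None:
        token_to_index = {"@@UNKNOWN@@": 0, "the": 1, "cat": 2}
    if token in token_to_index:
        return token_to_index[token]
    # Unknown tokens map to the reserved OOV index; a vocabulary without an
    # OOV entry would raise KeyError here, mirroring the method above.
    return token_to_index[oov_token]
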
def get_schedule(self, train_steps=None, warmup_proportion=None, decay_steps=None,
                 decay_rate=None, decay_type='polynomial'):
    """Build a learning-rate multiplier with optional decay and linear warmup."""
    global_step = tf.train.get_or_create_global_step()
    rate = tf.constant(value=1.0, shape=[], dtype=tf.float32)
    num_train_steps = train_steps
    if decay_type == 'polynomial':
        if num_train_steps:
            # power=1.0 decays the rate linearly to 0 over num_train_steps.
            rate = tf.train.polynomial_decay(
                rate,
                global_step,
                num_train_steps,
                end_learning_rate=0.0,
                power=1.0,
                cycle=False)
    elif decay_type == 'exponential':
        if decay_steps and decay_rate:
            rate = tf.train.exponential_decay(
                rate, global_step, decay_steps, decay_rate, staircase=False, name=None)
    else:
        logger.error("The decay type %s is not supported." % decay_type)

    if warmup_proportion and num_train_steps:
        # Scale the rate linearly from 0 up to its scheduled value during warmup.
        num_warmup_steps = int(num_train_steps * warmup_proportion)
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_rate = rate * warmup_percent_done
        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        rate = (1.0 - is_warmup) * rate + is_warmup * warmup_rate
    return rate

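# A pure-Python sketch of how the schedule above combines linear polynomial
# decay with linear warmup, so the interaction is easy to check by hand; the
# step values are illustrative only, not defaults from the class above.
def _example_schedule_value(step, train_steps=1000, warmup_proportion=0.1):
    # Polynomial decay with power=1.0 is a straight line from 1.0 to 0.0.
    rate = max(0.0, 1.0 - float(step) / train_steps)
    num_warmup_steps = int(train_steps * warmup_proportion)
    if step < num_warmup_steps:
        # During warmup the rate ramps linearly from 0 toward the decayed value.
        rate *= float(step) / num_warmup_steps
    return rate
# e.g. _example_schedule_value(50) == 0.475: halfway through warmup (0.5)
# times the decayed rate at step 50 (0.95).
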
def get_vocab_path(self, namespace):
    if namespace not in self._namespace_to_path:
        logger.error("%s vocab file does not exist." % namespace)
    return self._namespace_to_path.get(namespace, None)