def load_featurizer(params):
  """Builds a Featurizer from the hyperparameters in `params`."""
  tokenizer = featurization.Tokenizer(
      vocab_path=params["vocab_path"],
      do_lower_case=params["do_lower_case"])
  return featurization.Featurizer(
      query_seq_len=params["query_seq_len"],
      candidate_seq_len=params["candidate_seq_len"],
      num_candidates=params["num_candidates"],
      max_masks=params["max_masks"],
      tokenizer=tokenizer)
def load_featurizer():
  """Loads a Featurizer."""
  tokenizer = featurization.Tokenizer(
      vocab_path=FLAGS.vocab_path,
      do_lower_case=FLAGS.do_lower_case)
  featurizer = featurization.Featurizer(
      query_seq_len=FLAGS.query_seq_len,
      candidate_seq_len=FLAGS.candidate_seq_len,
      num_candidates=FLAGS.num_candidates,
      max_masks=FLAGS.max_masks,
      tokenizer=tokenizer)
  logging.info('Loaded featurizer.')
  return featurizer
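# A minimal sketch of the absl flag definitions this loader assumes. The flag
# names mirror the FLAGS attributes used above; the defaults and help strings
# here are hypothetical.
from absl import flags

FLAGS = flags.FLAGS

flags.DEFINE_string("vocab_path", None, "Path to the vocabulary file.")
flags.DEFINE_boolean("do_lower_case", True, "Whether to lowercase input text.")
flags.DEFINE_integer("query_seq_len", 64, "Maximum query sequence length.")
flags.DEFINE_integer("candidate_seq_len", 288,
                     "Maximum candidate sequence length.")
flags.DEFINE_integer("num_candidates", 8, "Number of candidates per query.")
flags.DEFINE_integer("max_masks", 10, "Maximum number of masked positions.")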
def input_fn(params):
  """Constructs the dataset fed to Estimator."""
  # We cannot access self._featurizer via closure, because this function is
  # passed to another device. Hence, we need to reconstruct the featurizer
  # from its hyperparameters (passed through `params`).
  tokenizer = featurization.Tokenizer(
      vocab_path=params['vocab_path'],
      do_lower_case=params['do_lower_case'])
  featurizer = featurization.Featurizer(
      query_seq_len=params['query_seq_len'],
      candidate_seq_len=params['candidate_seq_len'],
      num_candidates=params['num_candidates'],
      max_masks=params['max_masks'],
      tokenizer=tokenizer,
      separate_candidate_segments=params['separate_candidate_segments'])

  dataset = get_documents_dataset()

  def featurize(doc_dict):
    return featurizer.featurize_document_tf(
        doc_dict['title_token_ids'], doc_dict['body_token_ids'])

  dataset = dataset.map(
      featurize, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Add a document index variable.
  dataset = dataset.enumerate()

  def _enumerate_to_dict(result_idx, tensor_dict):
    return dict(tensor_dict, result_idx=result_idx)

  dataset = dataset.map(
      _enumerate_to_dict, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Pad the end of the dataset with one full extra batch.
  # This ensures that we don't drop the remainder.
  # Note: `total_docs` and `batch_size` are captured from the enclosing scope.
  if total_docs % batch_size != 0:
    # Pad using the first value of the dataset, repeated batch_size times.
    pad_vals = dataset.take(1).repeat(batch_size)
    dataset = dataset.concatenate(pad_vals)

  # Batch the dataset.
  dataset = dataset.batch(batch_size, drop_remainder=True)
  dataset = dataset.prefetch(2)  # Prefetch for efficiency.
  return dataset
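# For context, a hedged sketch of how an input_fn like this is typically
# consumed at inference time. `make_estimator()` is a hypothetical helper, and
# this assumes the model_fn passes `result_idx` through to its predictions;
# only the predict() call pattern is standard tf.estimator usage.
def encode_all_documents():
  estimator = make_estimator()  # hypothetical: returns a tf.estimator.Estimator
  results = {}
  for prediction in estimator.predict(input_fn=input_fn):
    # Padded examples repeat the first document, so they reuse result_idx 0;
    # keying by result_idx drops those duplicates.
    results[int(prediction['result_idx'])] = prediction
  return results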
def load_featurizer():
  """Loads a featurizer from hyperparams specified in model_dir."""
  params_path = os.path.join(FLAGS.model_dir, "estimator_params.json")
  with tf.gfile.GFile(params_path) as f:
    params = json.load(f)
  tokenizer = featurization.Tokenizer(
      vocab_path=params["vocab_path"],
      do_lower_case=params["do_lower_case"])
  featurizer = featurization.Featurizer(
      query_seq_len=params["query_seq_len"],
      candidate_seq_len=params["candidate_seq_len"],
      num_candidates=params["num_candidates"],
      max_masks=params["max_masks"],
      tokenizer=tokenizer)
  logging.info("Loaded featurizer.")
  return featurizer
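# Counterpart sketch: writing estimator_params.json at training time so the
# loader above can find it. The key names must match the reads above; this
# helper is illustrative and uses the same `os`, `json`, and `tf` imports as
# the loader.
def save_estimator_params(model_dir, params):
  params_path = os.path.join(model_dir, "estimator_params.json")
  with tf.gfile.GFile(params_path, "w") as f:
    json.dump(params, f)

# Example: save_estimator_params(FLAGS.model_dir, {"vocab_path": ..., ...})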
def __init__(self, vocab_path, do_lower_case):
  self._tokenizer = featurization.Tokenizer(
      vocab_path=vocab_path, do_lower_case=do_lower_case)