import argparse
import pickle
from os.path import isfile, join

import tensorflow as tf

# Project-local imports; module paths are assumed to follow the
# document-qa layout
from docqa.data_processing.qa_training_data import ParagraphAndQuestionSpec
from docqa.data_processing.word_vectors import load_word_vectors
from docqa.eval.squad_full_document_eval import RecordParagraphSpanPrediction
from docqa.evaluator import AysncEvaluatorRunner
from docqa.utils import ResourceLoader


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("vecs")
    parser.add_argument("vocab")
    parser.add_argument("output")
    args = parser.parse_args()

    # Read the vocab file, one token per line
    voc = set()
    with open(args.vocab) as f:
        for line in f:
            voc.add(line.strip())

    # Prune the word vectors to the vocab and cache the result as a pickle
    vecs = load_word_vectors(args.vecs, voc)
    with open(args.output, "wb") as f:
        pickle.dump(vecs, f)


if __name__ == "__main__":
    main()
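# Usage sketch (hypothetical paths; assumes this script is saved as
# prune_vectors.py):
#
#   python prune_vectors.py /data/glove.840B.300d.txt vocab.txt pruned.pkl
#
# Reloading the pickle is then much faster than re-parsing the raw vector
# file:
def _example_reload_pruned(path="pruned.pkl"):
    # `pruned.pkl` is the hypothetical output written by the command above
    with open(path, "rb") as f:
        return pickle.load(f)  # mapping from word to vector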
def build_model_and_evaluator_runner(model_config, max_answer_len, n_paragraphs):
    # n_paragraphs is accepted for API compatibility but unused here.
    # Load the pickled model and point it at the local LM/embedding resources
    with open(model_config.model_pickle_file, 'rb') as f:
        model = pickle.load(f)
    model.lm_model.weight_file = model_config.lm_weights_file
    model.lm_model.lm_vocab_file = model_config.vocab_file
    model.lm_model.embed_weights_file = model_config.lm_token_weights_file
    model.lm_model.options_file = model_config.lm_options_file
    model.word_embed.vec_name = model_config.word_vector_file

    # Build the vocab to initialize the embedding matrix with, skipping
    # the LM's reserved special tokens
    vocab_to_ignore = {'<S>', '</S>', '<UNK>', '!!!MAXTERMID'}
    with open(model_config.vocab_file, encoding="utf-8") as f:
        vocab_to_init_with = {
            line.strip() for line in f if line.strip() not in vocab_to_ignore
        }

    sess = tf.Session()
    with sess.as_default():
        model.set_input_spec(
            ParagraphAndQuestionSpec(None, None, None, 14),
            vocab_to_init_with,
            word_vec_loader=ResourceLoader(
                load_vec_fn=lambda x, y: load_word_vectors(x, y, is_path=True)))
        # Note: AysncEvaluatorRunner is the class's actual (misspelled) name
        evaluator_runner = AysncEvaluatorRunner(
            [RecordParagraphSpanPrediction(max_answer_len, True)], model, 10)
        input_dict = {
            p: x for p, x in
            zip(model.get_placeholders(), evaluator_runner.dequeue_op)
        }
        pred = model.get_predictions_for(input_dict)
        evaluator_runner.set_input(pred)

    # Restore all variables except the bilm's, which are initialized from
    # the ELMo weight files rather than the checkpoint
    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    vars_to_restore = [x for x in all_vars if x.name not in lm_var_names]
    saver = tf.train.Saver(vars_to_restore)
    saver.restore(sess, model_config.checkpoint_file)
    sess.run(tf.variables_initializer(
        [x for x in all_vars if x.name in lm_var_names]))
    return sess, model, evaluator_runner
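# The restore/initialize split above is a reusable TF1 pattern: restore
# trained weights from a checkpoint while letting one named subgraph (here
# "bilm", whose variables load the ELMo weights via their own initializers)
# initialize separately. A minimal hedged sketch of the same pattern,
# factored into a hypothetical helper:
def _restore_all_but_prefix(sess, checkpoint_file, skip_prefix="bilm"):
    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    to_restore = [v for v in all_vars if not v.name.startswith(skip_prefix)]
    tf.train.Saver(to_restore).restore(sess, checkpoint_file)
    # Variables under the skipped prefix run their normal initializers
    sess.run(tf.variables_initializer(
        [v for v in all_vars if v.name.startswith(skip_prefix)]))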
def _build_model(self):
    # Tokens the LM reserves; keep them out of the embedding vocab
    # (the original referenced `vocab_to_ignore` without defining it)
    vocab_to_ignore = {'<S>', '</S>', '<UNK>', '!!!MAXTERMID'}
    with open(self.config.vocab_file, encoding="utf-8") as f:
        vocab_to_init_with = {
            line.strip() for line in f if line.strip() not in vocab_to_ignore
        }
    self.model.word_embed.vec_name = self.config.word_vector_file
    with self.sess.as_default():
        self.model.set_input_spec(
            ParagraphAndQuestionSpec(None, None, None, 14),
            vocab_to_init_with,
            word_vec_loader=ResourceLoader(
                load_vec_fn=lambda x, y: load_word_vectors(x, y, is_path=True)))
        # Feed placeholders directly rather than through the evaluator queue
        pred = self.model.get_production_predictions_for(
            {x: x for x in self.model.get_placeholders()})
    return pred.start_logits, pred.end_logits, self.model.context_rep
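# The start/end logits returned above are typically decoded into an answer
# span outside the graph. A hedged, self-contained sketch (not part of the
# source) of the usual decoding rule: maximize start_logits[i] +
# end_logits[j] over pairs with i <= j < i + max_len.
def _example_best_span(start_logits, end_logits, max_len=17):
    import numpy as np
    best, best_score = (0, 0), -np.inf
    for i in range(len(start_logits)):
        for j in range(i, min(i + max_len, len(end_logits))):
            score = start_logits[i] + end_logits[j]
            if score > best_score:
                best_score, best = score, (i, j)
    return best, best_score  # (start_ix, end_ix) and its score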
def get_pruned_word_vecs(self, word_vec_name, voc=None):
    """
    Loads word vectors that have been pruned to the case-insensitive vocab
    of this corpus. WARNING: this includes dev words.

    This exists since loading word vecs on each startup can be a big pain,
    so we cache the pruned vecs on-disk as a pickle (the file keeps a legacy
    .npy extension) that we can re-load quickly.
    """
    vec_file = join(self.dir, word_vec_name + self.WORD_VEC_SUFFIX + ".npy")
    if isfile(vec_file):
        print("Loading word vec %s for %s from cache" % (word_vec_name, self.name))
        with open(vec_file, "rb") as f:
            return pickle.load(f)
    else:
        print("Building pruned word vec %s for %s" % (word_vec_name, self.name))
        # The `voc` argument is ignored: the cache file name does not depend
        # on it, so we always prune against the full corpus vocab
        voc = self.get_vocab()
        vecs = load_word_vectors(word_vec_name, voc)
        with open(vec_file, "wb") as f:
            pickle.dump(vecs, f)
        return vecs
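# Usage sketch (hypothetical corpus object with `dir`/`name` attributes):
#
#   corpus = SquadCorpus()                                 # hypothetical
#   vecs = corpus.get_pruned_word_vecs("glove.840B.300d")  # build + cache
#   vecs = corpus.get_pruned_word_vecs("glove.840B.300d")  # cache hit
#
# Since the pruned vocab is case-insensitive, a common lookup pattern is to
# fall back to the lowercased token; a hedged sketch:
def _example_lookup(vecs, word):
    # `vecs` is assumed to be a dict mapping token -> vector
    return vecs.get(word, vecs.get(word.lower()))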
def load_word_vec(self, vec_name, voc=None):
    # Vectors live alongside the corpus; is_path=True means the first
    # argument is a file path rather than a named resource
    return load_word_vectors(join(self.path, vec_name), voc, is_path=True)
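# Usage sketch (hypothetical loader object whose `path` points at a
# directory of raw vector files):
#
#   vecs = loader.load_word_vec("glove.840B.300d.txt", voc={"the", "cat"})
#   # reads only those two words' vectors from the file under loader.path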