def test_serialization():
    """Smoke-test that every reader can store() and load() itself after setup.

    For each reader flavour we load a small fixture dataset, build the reader,
    run ``setup_from_data``, and round-trip it through ``store``/``load``.
    Readers for which no fixture is configured are skipped (``data is None``).
    """
    all_readers = [
        fastqa_reader,
        modular_qa_reader,
        # fastqa_reader_torch,
        dam_snli_reader,
        cbilstm_nli_reader,
        modular_nli_reader,
        distmult_reader,
        complex_reader,
        transe_reader,
    ]
    # NOTE(review): modular_qa_reader and modular_nli_reader match none of the
    # branches below, so they are silently skipped — confirm that is intended.
    for reader in all_readers:
        vocab, config = {}, {}
        data = None
        if reader in {distmult_reader, complex_reader, transe_reader}:
            data = loaders['jack']('tests/test_data/WN18/wn18-snippet.jack.json')
            config['repr_dim'] = 50
        elif reader in {cbilstm_nli_reader, dam_snli_reader}:
            data = loaders['snli']('tests/test_data/SNLI/1000_samples_snli_1.0_train.jsonl')
            embeddings = load_embeddings("data/GloVe/glove.the.50d.txt", 'glove')
            vocab = Vocab(emb=embeddings, init_from_embeddings=True)
            config['repr_dim_input'] = 50
            config['repr_dim'] = 50
        elif reader in {fastqa_reader}:
            data = loaders['squad']('data/SQuAD/snippet.json')
            embeddings = load_embeddings("data/GloVe/glove.the.50d.txt", 'glove')
            vocab = Vocab(emb=embeddings, init_from_embeddings=True)
            config['repr_dim_input'] = 50
            config['repr_dim'] = 50
        if data is not None:
            tf.reset_default_graph()
            shared_resources = SharedResources(vocab, config)
            reader_instance = reader(shared_resources)
            reader_instance.setup_from_data(data)
            # Fix: mkdtemp() leaked one directory per reader; the context
            # manager removes the temp dir even when an assertion fails.
            with tempfile.TemporaryDirectory() as temp_dir_path:
                reader_instance.store(temp_dir_path)
                reader_instance.load(temp_dir_path)
            assert reader_instance is not None
def test_shared_resources_store():
    """Round-trip a SharedResources object through store()/load().

    Checks that vocab type and attributes, the config dict, and the
    embeddings survive serialization unchanged.
    """
    embeddings_file = "data/GloVe/glove.the.50d.txt"
    embeddings = load_embeddings(embeddings_file, 'glove')
    config = {"embedding_file": embeddings_file, "embedding_format": "glove"}
    some_vocab = Vocab(vocab=embeddings.vocabulary)
    some_vocab('foo')
    shared_resources = SharedResources(some_vocab, config, embeddings)
    import tempfile
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = tmp_dir + "_resources"
        shared_resources.store(path)

        restored = SharedResources()
        restored.load(path)

        # Same vocab class and identical attribute values.
        assert type(restored.vocab) == type(shared_resources.vocab)
        for attr, value in restored.vocab.__dict__.items():
            assert value == shared_resources.vocab.__dict__[attr]

        # Config and embeddings come back intact.
        assert restored.config == shared_resources.config
        assert restored.embeddings.lookup.shape == embeddings.lookup.shape
        assert np.array_equal(restored.embeddings.get(b"the"), embeddings.get(b"the"))
def test_memory_map_dir():
    """Saving embeddings as a memory-map directory and reloading preserves them."""
    import tempfile
    from jack.io.embeddings.memory_map import save_as_memory_map_dir, load_memory_map_dir
    embeddings_file = "data/GloVe/glove.the.50d.txt"
    embeddings = load_embeddings(embeddings_file, 'glove')
    with tempfile.TemporaryDirectory() as tmp_dir:
        mem_map_dir = tmp_dir + "/glove.the.50d.memmap"
        save_as_memory_map_dir(mem_map_dir, embeddings)
        reloaded = load_memory_map_dir(mem_map_dir)

        # Shape and the single-word vocabulary survive the round trip.
        assert reloaded.shape == embeddings.shape
        assert len(reloaded.vocabulary) == 1
        assert reloaded.vocabulary["the"] == 0
        assert "foo" not in reloaded.vocabulary

        # Vector values match up to float32 storage precision.
        assert np.isclose(reloaded.get("the"), embeddings.get("the"), 1.e-5).all()
def main():
    """CLI entry point: convert an embedding file to a memory-map directory."""
    import argparse
    parser = argparse.ArgumentParser(description='Convert embeddings to memory map directory')
    parser.add_argument("input_file", help="The input embedding file.")
    parser.add_argument(
        "output_dir",
        help="The name of the directory to store the memory map in. Will be created if it doesn't "
             "exist.")
    parser.add_argument(
        "-f", "--input_format", help="Format of input embeddings.", default="glove",
        choices=["glove", "word2vec", "memory_map_dir"])
    args = parser.parse_args()

    # Load in the requested source format, then persist as a memory-map dir.
    embeddings = load_embeddings(args.input_file, typ=args.input_format)
    logging.info("Loaded embeddings from {}".format(args.input_file))
    save_as_memory_map_dir(args.output_dir, embeddings)
    logging.info("Stored embeddings to {}".format(args.output_dir))
def load(self, path: str): conf_file = os.path.join(path, "conf.yaml") emb_file = os.path.join(path, "emb.pkl") remainder_file = os.path.join(path, "remainder.pkl") if os.path.exists(conf_file): with open(conf_file, "r") as f: config = yaml.load(f) if config["embedding_file"] is not None: emb = load_embeddings(config["embedding_file"], typ=config.get("emb_format", None)) elif os.path.exists(emb_file): with open(emb_file, "rb") as f: emb = pickle.load(f) elif os.path.exists(emb_file): with open(emb_file, "rb") as f: emb = pickle.load(f) with open(remainder_file, "rb") as f: remaining = pickle.load(f) self.__dict__ = remaining self.__dict__["emb"] = emb
# NOTE(review): the next four statements are the tail of a loader function
# whose `def` line falls outside this view; they build an Embeddings object
# from a previously unpickled `meta` dict and the on-disk memory map.
    shape = meta['shape']
    # mode='r+' maps the file writable; presumably read-only ('r') would
    # suffice for loading — TODO confirm no caller mutates the lookup in place.
    mem_map = np.memmap(mem_map_file, dtype='float32', mode='r+', shape=shape)
    result = Embeddings(meta['vocab'], mem_map, filename=file_prefix, emb_format="mem_map")
    return result


def save_as_memory_map(file_prefix: str, emb: Embeddings):
    """Persist embeddings as two files: ``<file_prefix>_meta.pkl`` holding the
    vocabulary and shape, and ``<file_prefix>_memmap`` holding the raw float32
    lookup table as a numpy memory map."""
    meta_file = file_prefix + "_meta.pkl"
    mem_map_file = file_prefix + "_memmap"
    with open(meta_file, "wb") as f:
        pickle.dump({"vocab": emb.vocabulary, "shape": emb.shape}, f)
    mem_map = np.memmap(mem_map_file, dtype='float32', mode='w+', shape=emb.shape)
    mem_map[:] = emb.lookup[:]
    # flush() + del push the mapped pages to disk before we return.
    mem_map.flush()
    del mem_map


if __name__ == "__main__":
    # Minimal CLI: argv[1] = input embedding file, argv[2] = output prefix.
    input_name = sys.argv[1]
    output_prefix = sys.argv[2]
    embeddings = load_embeddings(input_name)
    logging.info("Loaded embeddings from {}".format(input_name))
    save_as_memory_map(output_prefix, embeddings)
    logging.info("Stored embeddings to {}".format(output_prefix))
# ---- Command-line flags for running a stored reader over a dataset ----------
tf.app.flags.DEFINE_string('file', None, 'dataset file')
tf.app.flags.DEFINE_string('dataset_type', 'squad', 'either squad or jack')
tf.app.flags.DEFINE_string('model', None, 'Name of the reader')
tf.app.flags.DEFINE_string('model_dir', None, 'directory to saved model')
tf.app.flags.DEFINE_string('embedding_path', None, 'path to embeddings')
tf.app.flags.DEFINE_string('embedding_format', 'glove', 'embeddings format')
tf.app.flags.DEFINE_string('device', "/cpu:0", 'device to use')
tf.app.flags.DEFINE_string('out', "results.json", 'Result file path.')
tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size')
tf.app.flags.DEFINE_integer('beam_size', 1, 'beam size')
tf.app.flags.DEFINE_string('kwargs', '{}', 'additional reader-specific configurations')

FLAGS = tf.app.flags.FLAGS

# Build a vocab from the pre-trained embeddings referenced on the command line.
logger.info("Loading embeddings from {}...".format(FLAGS.embedding_path))
emb = load_embeddings(FLAGS.embedding_path, FLAGS.embedding_format)
vocab = Vocab(emb=emb, init_from_embeddings=True)

# Restore the requested reader on the requested device.
logger.info("Creating and loading reader from {}...".format(FLAGS.model_dir))
# NOTE(review): --kwargs is parsed as JSON and may override any config key,
# including beam_size / batch_size set just above.
config = {"beam_size": FLAGS.beam_size, 'batch_size': FLAGS.batch_size, "max_support_length": None}
config.update(json.loads(FLAGS.kwargs))
reader = readers[FLAGS.model](vocab, config)
with tf.device(FLAGS.device):
    reader.load_and_setup(FLAGS.model_dir)

# Load the evaluation dataset; SQuAD input is converted to jack format first.
if FLAGS.dataset_type == "squad":
    dataset_jtr = convert_squad(FLAGS.file)
else:
    with open(FLAGS.file) as f:
        dataset_jtr = json.load(f)