def test_convert_with_output_pathlib():
    """An explicit ``pathlib.Path`` output should be forwarded to ``write`` unchanged."""
    data = random.choice([GLOVE, W2V, W2V_TEXT, LEADER])
    output_type = random.choice(list(FileType))
    output = pathlib.Path(rand_str())
    input_path = DATA / data
    with patch("word_vectors.convert_module.write") as write_patch:
        w, wv = read(input_path)
        convert(input_path, output, output_file_type=output_type)
        written_file, written_vocab, written_vectors, written_type = write_patch.call_args_list[0][0]
    # The output path, vocab, vectors, and format must all reach write intact.
    assert written_file == output
    assert written_vocab == w
    assert written_type == output_type
    np.testing.assert_allclose(written_vectors, wv)
def test_convert_pathlib():
    """When no output is given, ``convert`` derives one from the input name and output type."""
    data = random.choice([GLOVE, W2V, W2V_TEXT, LEADER])
    output_type = random.choice(list(FileType))
    input_path = DATA / data
    # Expected default: input path with its extension swapped for the output type's name.
    expected_output = os.path.splitext(str(input_path))[0] + "." + str(output_type)
    with patch("word_vectors.convert_module.write") as write_patch:
        w, wv = read(input_path)
        convert(input_path, output_file_type=output_type)
        written_file, written_vocab, written_vectors, written_type = write_patch.call_args_list[0][0]
    assert str(written_file) == expected_output
    assert written_vocab == w
    np.testing.assert_allclose(written_vectors, wv)
    assert written_type is output_type
def main():
    """Summarize token-length statistics for an embedding file.

    Reads the vectors, computes statistics over token lengths (in UTF-8
    bytes), prints them, writes them to ``<output>.json``, and plots the
    length distribution to ``<output>.png``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("embedding")
    parser.add_argument("--embed-name", "--embed_name", help="Force a name if the file name isn't descriptive")
    parser.add_argument("--format", type=FileType.from_string)
    parser.add_argument(
        "--output",
        help="The name of the output files sans extensions. Will create a .json for stats and a .png for the distribution graph",
    )
    args = parser.parse_args()
    v, wv = read(args.embedding, args.format)
    # Map each token length (in UTF-8 bytes) to how many tokens have that length.
    lengths = Counter(len(token.encode("utf-8")) for token in v)
    # Expand the Counter back to one entry per token so mean/std are weighted
    # by frequency. Iterating the Counter directly (as the original did) yields
    # only the distinct length values, which skews the average and std.
    per_token_lengths = list(lengths.elements())
    vsz = len(v)
    dsz = wv.shape[-1]
    min_l = min(lengths)
    max_l = max(lengths)
    avg_l = mean(per_token_lengths)
    std_l = std(per_token_lengths)
    print(f"Vocab size: {vsz}")
    print(f"Vector size: {dsz}")
    print(f"Shortest token: {min_l}")
    print(f"Longest token: {max_l}")
    print(f"Average token length: {avg_l}")
    print(f"Std of token length: {std_l}")
    # Default the embedding's display name to the file's base name.
    embed_name = os.path.splitext(os.path.basename(args.embedding))[0] if args.embed_name is None else args.embed_name
    data = {
        "name": embed_name,
        "stats": {
            "vocab_size": vsz,
            "vector_size": dsz,
            "min_length": min_l,
            "max_length": max_l,
            "avg_length": avg_l,
            "std_length": std_l,
        },
        # JSON serialization turns the int length keys into strings.
        "counts": {**lengths},
    }
    if args.output is None:
        args.output = embed_name
    with open(f"{args.output}.json", "w") as wf:
        json.dump(data, wf, indent=2)
    plot_distribution(lengths, title=f"Type Length Distribution for {embed_name}", output=f"{args.output}.png")
def test_convert_with_input_pathlib():
    """An explicit input format should be forwarded to ``read`` along with the path."""
    data = random.choice([GLOVE, W2V, W2V_TEXT, LEADER])
    input_type = INPUT_MAPPING[data]
    output_type = random.choice(list(FileType))
    input_path = DATA / data
    output = pathlib.Path(rand_str())
    with patch("word_vectors.convert_module.read") as read_patch, patch(
        "word_vectors.convert_module.write"
    ) as write_patch:
        w, wv = read(input_path)
        read_patch.return_value = (w, wv)
        convert(input_path, output, output_file_type=output_type, input_file_type=input_type)
        # read must receive both the path and the explicit input format.
        read_patch.assert_called_once_with(input_path, input_type)
        written_file, written_vocab, written_vectors, written_type = write_patch.call_args_list[0][0]
    assert written_file == output
    assert written_vocab == w
    assert written_type == output_type
    np.testing.assert_allclose(written_vectors, wv)
def test_convert_with_output_open():
    """Already-opened input and output file objects should pass straight through ``convert``.

    Fix: the original rebound ``output`` from the file name (str) to the file
    handle inside the ``with`` block, so the ``finally: os.remove(output.name)``
    cleanup raised ``AttributeError`` (and leaked the temp file) whenever the
    first ``open`` failed. Distinct names plus a guarded removal make the
    cleanup safe on every path.
    """
    data = random.choice([GLOVE, W2V, W2V_TEXT, LEADER])
    output_type = random.choice(list(FileType))
    output_name = rand_str()
    input_path = DATA / data
    print(output_name)
    try:
        input_mode = "r" if data in (GLOVE, W2V_TEXT) else "rb"
        output_mode = "w" if output_type in (FileType.GLOVE, FileType.W2V_TEXT) else "wb"
        with open(input_path, input_mode) as input_file:
            with open(output_name, output_mode) as output_file:
                with patch("word_vectors.convert_module.write") as write_patch:
                    w, wv = read(input_file)
                    convert(input_file, output_file, output_file_type=output_type)
                    call_file, call_w, call_wv, call_type = write_patch.call_args_list[0][0]
                    # The opened output handle itself should be handed to write.
                    assert call_file == output_file
                    assert call_w == w
                    assert call_type == output_type
                    np.testing.assert_allclose(call_wv, wv)
    finally:
        # Only remove the file if the open actually created it.
        if os.path.exists(output_name):
            os.remove(output_name)
def convert(
    f: Union[str, TextIO, BinaryIO],
    output: Optional[str] = None,
    output_file_type: FileType = FileType.LEADER,
    input_file_type: Optional[FileType] = None,
):
    """Convert vectors from one format to another.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
        output_file_type: The vector serialization format to use when
            writing out the vectors.
        input_file_type: An explicit vector format to use when reading.
    """
    LOGGER.info("Reading vectors from %s", f)
    vocab, vectors = read(f, input_file_type)
    # Derive a default output path from the input when none was supplied.
    if output is None:
        output = create_output_path(f, output_file_type)
    LOGGER.info("Writing vectors to %s", output)
    write(output, vocab, vectors, output_file_type)
def test_read_dupped_opened(dupped_vocab, dupped_vectors):
    """``read`` handles an already-opened file that contains duplicated tokens.

    Fix: the original passed ``open(...)`` directly to ``read``, leaking the
    file handle; a ``with`` block closes it deterministically.
    """
    data = random.choice([GLOVE_DUPPED, W2V_DUPPED, LEADER_DUPPED])
    # GLOVE_DUPPED is a text format; the others are binary.
    mode = "r" if data == GLOVE_DUPPED else "rb"
    with open(DATA / data, mode) as f:
        w, wv = read(f)
    assert w == dupped_vocab
    np.testing.assert_allclose(wv, dupped_vectors)
def test_read_dupped_pathlib(dupped_vocab, dupped_vectors):
    """``read`` accepts a ``pathlib.Path`` to a file containing duplicated tokens."""
    choice = random.choice([GLOVE_DUPPED, W2V_DUPPED, LEADER_DUPPED])
    vocab, vectors = read(DATA / choice)
    assert vocab == dupped_vocab
    np.testing.assert_allclose(vectors, dupped_vectors)
def test_read_opened(gold_vocab, gold_vectors):
    """``read`` handles an already-opened file in any supported format.

    Fix: the original passed ``open(...)`` directly to ``read``, leaking the
    file handle; a ``with`` block closes it deterministically.
    """
    data = random.choice([GLOVE, W2V, LEADER, W2V_TEXT])
    # GLOVE and W2V_TEXT are text formats; W2V and LEADER are binary.
    mode = "r" if data == GLOVE or data == W2V_TEXT else "rb"
    with open(DATA / data, mode) as f:
        w, wv = read(f)
    assert w == gold_vocab
    np.testing.assert_allclose(wv, gold_vectors)
def test_read_pathlib(gold_vocab, gold_vectors):
    """``read`` accepts a ``pathlib.Path`` for any supported format."""
    choice = random.choice([GLOVE, W2V, LEADER, W2V_TEXT])
    vocab, vectors = read(DATA / choice)
    assert vocab == gold_vocab
    np.testing.assert_allclose(vectors, gold_vectors)
def test_read(gold_vocab, gold_vectors):
    """``read`` accepts a plain string path for any supported format."""
    choice = random.choice([GLOVE, W2V, LEADER, W2V_TEXT])
    vocab, vectors = read(str(DATA / choice))
    assert vocab == gold_vocab
    np.testing.assert_allclose(vectors, gold_vectors)