def test_convert_with_output_pathlib():
    """When an explicit pathlib output is given, convert forwards it to write unchanged."""
    vector_file = random.choice([GLOVE, W2V, W2V_TEXT, LEADER])
    file_type = random.choice(list(FileType))
    out_path = pathlib.Path(rand_str())
    in_path = DATA / vector_file
    with patch("word_vectors.convert_module.write") as write_patch:
        vocab, vectors = read(in_path)
        convert(in_path, out_path, output_file_type=file_type)
        # Inspect the positional args of the first (and only) write call.
        written_file, written_vocab, written_vectors, written_type = write_patch.call_args_list[0][0]
        assert written_file == out_path
        assert written_vocab == vocab
        assert written_type == file_type
        np.testing.assert_allclose(written_vectors, vectors)
def test_convert_pathlib():
    """With no explicit output, convert derives the output name from the input stem and format."""
    vector_file = random.choice([GLOVE, W2V, W2V_TEXT, LEADER])
    file_type = random.choice(list(FileType))
    in_path = DATA / vector_file
    # Expected default output: same base name, extension replaced by the format name.
    stem, _ = os.path.splitext(str(in_path))
    expected_output = f"{stem}.{file_type}"
    with patch("word_vectors.convert_module.write") as write_patch:
        vocab, vectors = read(in_path)
        convert(in_path, output_file_type=file_type)
        written_file, written_vocab, written_vectors, written_type = write_patch.call_args_list[0][0]
        assert str(written_file) == expected_output
        assert written_vocab == vocab
        np.testing.assert_allclose(written_vectors, vectors)
        assert written_type is file_type
def main():
    """Summarize token-length statistics of an embedding file.

    Reads an embedding, prints vocab/vector sizes and token byte-length stats,
    dumps the stats as ``<output>.json``, and renders a length-distribution
    plot to ``<output>.png``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("embedding")
    parser.add_argument("--embed-name", "--embed_name", help="Force a name if the file name isn't descriptive")
    parser.add_argument("--format", type=FileType.from_string)
    parser.add_argument(
        "--output",
        help="The name of the output files sans extensions. Will create a .json for stats and a .png for the distribution graph",
    )
    args = parser.parse_args()

    v, wv = read(args.embedding, args.format)

    # Histogram mapping UTF-8 byte length -> number of tokens with that length.
    lengths = Counter(len(token.encode("utf-8")) for token in v)
    # Expand the histogram back into one entry per token so the mean/std are
    # weighted by frequency; iterating the Counter directly would average the
    # *distinct* length values and ignore how common each length is.
    per_token_lengths = list(lengths.elements())

    vsz = len(v)
    dsz = wv.shape[-1]
    min_l = min(lengths)
    max_l = max(lengths)
    avg_l = mean(per_token_lengths)
    std_l = std(per_token_lengths)

    print(f"Vocab size: {vsz}")
    print(f"Vector size: {dsz}")
    print(f"Shortest token: {min_l}")
    print(f"Longest token: {max_l}")
    print(f"Average token length: {avg_l}")
    print(f"Std of token length: {std_l}")

    # Default the embedding's display name to the input file's base name.
    embed_name = os.path.splitext(os.path.basename(args.embedding))[0] if args.embed_name is None else args.embed_name

    data = {
        "name": embed_name,
        "stats": {
            "vocab_size": vsz,
            "vector_size": dsz,
            "min_length": min_l,
            "max_length": max_l,
            "avg_length": avg_l,
            "std_length": std_l,
        },
        "counts": {**lengths},
    }

    if args.output is None:
        args.output = embed_name

    with open(f"{args.output}.json", "w") as wf:
        json.dump(data, wf, indent=2)

    plot_distribution(lengths, title=f"Type Length Distribution for {embed_name}", output=f"{args.output}.png")
def test_convert_with_input_pathlib():
    """An explicit input_file_type is passed straight through to read."""
    vector_file = random.choice([GLOVE, W2V, W2V_TEXT, LEADER])
    in_type = INPUT_MAPPING[vector_file]
    out_type = random.choice(list(FileType))
    in_path = DATA / vector_file
    out_path = pathlib.Path(rand_str())
    # Patch both ends so we can observe exactly what convert hands to each.
    with patch("word_vectors.convert_module.read") as read_patch, patch(
        "word_vectors.convert_module.write"
    ) as write_patch:
        vocab, vectors = read(in_path)
        read_patch.return_value = (vocab, vectors)
        convert(in_path, out_path, output_file_type=out_type, input_file_type=in_type)
        read_patch.assert_called_once_with(in_path, in_type)
        written_file, written_vocab, written_vectors, written_type = write_patch.call_args_list[0][0]
        assert written_file == out_path
        assert written_vocab == vocab
        assert written_type == out_type
        np.testing.assert_allclose(written_vectors, vectors)
def test_convert_with_output_open():
    """An already-open output file handle is forwarded to write unchanged."""
    data = random.choice([GLOVE, W2V, W2V_TEXT, LEADER])
    output_type = random.choice(list(FileType))
    output_name = rand_str()
    input_path = DATA / data
    # Text formats use text-mode handles, binary formats use binary-mode ones.
    input_mode = "r" if data in (GLOVE, W2V_TEXT) else "rb"
    output_mode = "w" if output_type in (FileType.GLOVE, FileType.W2V_TEXT) else "wb"
    try:
        with open(input_path, input_mode) as input_file:
            with open(output_name, output_mode) as output_file:
                with patch("word_vectors.convert_module.write") as write_patch:
                    w, wv = read(input_file)
                    convert(input_file, output_file, output_file_type=output_type)
                    call_file, call_w, call_wv, call_type = write_patch.call_args_list[0][0]
                    assert call_file == output_file
                    assert call_w == w
                    assert call_type == output_type
                    np.testing.assert_allclose(call_wv, wv)
    finally:
        # Use the original name (not a possibly-unbound handle) so cleanup
        # works even if one of the open() calls raised before binding a file.
        if os.path.exists(output_name):
            os.remove(output_name)
Beispiel #6
0
def convert(
    f: Union[str, TextIO, BinaryIO],
    output: Optional[str] = None,
    output_file_type: FileType = FileType.LEADER,
    input_file_type: Optional[FileType] = None,
):
    """Convert vectors from one format to another.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
        output_file_type: The vector serialization format to use when
            writing out the vectors.
        input_file_type: An explicit vector format to use when reading.
    """
    LOGGER.info("Reading vectors from %s", f)
    vocab, vectors = read(f, input_file_type)
    # Derive a default output path from the input when none was supplied.
    if output is None:
        output = create_output_path(f, output_file_type)
    LOGGER.info("Writing vectors to %s", output)
    write(output, vocab, vectors, output_file_type)
Beispiel #7
0
def test_read_dupped_opened(dupped_vocab, dupped_vectors):
    """Reading from an already-open handle still de-dupes vocab entries."""
    data = random.choice([GLOVE_DUPPED, W2V_DUPPED, LEADER_DUPPED])
    mode = "r" if data == GLOVE_DUPPED else "rb"
    # Context manager so the handle is closed even if an assertion fails
    # (the original leaked the file object returned by open()).
    with open(DATA / data, mode) as f:
        w, wv = read(f)
    assert w == dupped_vocab
    np.testing.assert_allclose(wv, dupped_vectors)
Beispiel #8
0
def test_read_dupped_pathlib(dupped_vocab, dupped_vectors):
    """Reading via a pathlib.Path de-dupes vocab entries correctly."""
    vector_file = random.choice([GLOVE_DUPPED, W2V_DUPPED, LEADER_DUPPED])
    vocab, vectors = read(DATA / vector_file)
    assert vocab == dupped_vocab
    np.testing.assert_allclose(vectors, dupped_vectors)
Beispiel #9
0
def test_read_opened(gold_vocab, gold_vectors):
    """Reading from an already-open handle yields the gold vocab and vectors."""
    data = random.choice([GLOVE, W2V, LEADER, W2V_TEXT])
    # Text formats need text mode; membership test replaces the chained `or`.
    mode = "r" if data in (GLOVE, W2V_TEXT) else "rb"
    # Context manager so the handle is closed even if an assertion fails
    # (the original leaked the file object returned by open()).
    with open(DATA / data, mode) as f:
        w, wv = read(f)
    assert w == gold_vocab
    np.testing.assert_allclose(wv, gold_vectors)
Beispiel #10
0
def test_read_pathlib(gold_vocab, gold_vectors):
    """Reading via a pathlib.Path yields the gold vocab and vectors."""
    vector_file = random.choice([GLOVE, W2V, LEADER, W2V_TEXT])
    vocab, vectors = read(DATA / vector_file)
    assert vocab == gold_vocab
    np.testing.assert_allclose(vectors, gold_vectors)
Beispiel #11
0
def test_read(gold_vocab, gold_vectors):
    """Reading via a plain string path yields the gold vocab and vectors."""
    vector_file = random.choice([GLOVE, W2V, LEADER, W2V_TEXT])
    vocab, vectors = read(str(DATA / vector_file))
    assert vocab == gold_vocab
    np.testing.assert_allclose(vectors, gold_vectors)