Example #1
def vectors_from_file_object(file_like_object,
                             delimiter=",",
                             unk_tensor=None,
                             num_cpus=10):
    r"""Create a Vectors object from a csv file like object.

    Note that the tensor corresponding to each vector is of type `torch.float`.

    Format for csv file:
        token1<delimiter>num1 num2 num3
        token2<delimiter>num4 num5 num6
        ...
        token_n<delimiter>num_m num_j num_k

    Args:
        file_like_object (FileObject): a file-like object to read data from.
        delimiter (char): a character to delimit the token from its vector. Default: ",".
        unk_tensor (Tensor): a 1d tensor representing the vector associated with an unknown token.
        num_cpus (int): the number of CPUs to use when loading the vectors from file. Default: 10.

    Returns:
        Vectors: a Vectors object.

    Raises:
        ValueError: if duplicate tokens are found in the file.

    """
    vectors_obj, dup_tokens = _load_token_and_vectors_from_file(
        file_like_object.name, delimiter, num_cpus, unk_tensor)
    if dup_tokens:
        raise ValueError("Found duplicate tokens in file: {}".format(
            str(dup_tokens)))
    return Vectors(vectors_obj)
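
A minimal usage sketch for this loader follows, assuming vectors_from_file_object is importable from torchtext.experimental.vectors (the module these examples appear to come from). Because the implementation reads file_like_object.name, the object must be backed by an actual file on disk rather than an in-memory buffer such as io.StringIO.

import tempfile

from torchtext.experimental.vectors import vectors_from_file_object

# Write a small vectors file in the documented format:
# token<delimiter>num1 num2 num3
with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
    f.write("hello,0.1 0.2 0.3\n")
    f.write("world,0.4 0.5 0.6\n")
    path = f.name

with open(path) as file_like_object:
    vectors = vectors_from_file_object(file_like_object, delimiter=",")

print(vectors["hello"])  # expected: tensor([0.1000, 0.2000, 0.3000])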
Example #2
def FastText(language="en", unk_tensor=None, root=".data", validate_file=True, num_cpus=32):
    r"""Create a FastText Vectors object.

    Args:
        language (str): the language to use for FastText. The list of supported language options
                        can be found at https://fasttext.cc/docs/en/language-identification.html
        unk_tensor (Tensor): a 1d tensor representing the vector associated with an unknown token.
        root (str): folder used to store downloaded files in. Default: '.data'.
        validate_file (bool): flag to determine whether to validate the downloaded files checksum.
                              Should be `False` when running tests with a local asset.
        num_cpus (int): the number of CPUs to use when loading the vectors from file. Default: 32.

    Returns:
        torchtext.experimental.vectors.Vectors: a Vectors object.

    Raises:
        ValueError: if duplicate tokens are found in FastText file.

    """
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.vec".format(language)

    checksum = None
    if validate_file:
        checksum = CHECKSUMS_FAST_TEXT.get(url, None)

    downloaded_file_path = download_from_url(url, root=root, hash_value=checksum)
    cpp_vectors_obj, dup_tokens = _load_token_and_vectors_from_file(downloaded_file_path, ' ', num_cpus, unk_tensor)

    if dup_tokens:
        raise ValueError("Found duplicate tokens in file: {}".format(str(dup_tokens)))

    vectors_obj = Vectors(cpp_vectors_obj)
    return vectors_obj
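
A short usage sketch, again assuming FastText is importable from torchtext.experimental.vectors. Note that the first call downloads wiki.en.vec (a multi-gigabyte file) into the root folder before loading it.

from torchtext.experimental.vectors import FastText

# Downloads and caches the English wiki vectors on first use,
# then loads them with num_cpus worker processes.
fasttext_vectors = FastText(language="en")
vec = fasttext_vectors["hello"]  # a 300-dimensional torch.float tensor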
Example #3
def GloVe(name="840B",
          dim=300,
          unk_tensor=None,
          root=".data",
          validate_file=True,
          num_cpus=32):
    r"""Create a GloVe Vectors object.

    Args:
        name (str): the name of the GloVe dataset to use. Options are:
            - 42B
            - 840B
            - twitter.27B
            - 6B
        dim (int): the dimension for the GloVe dataset to load. Options are:
            42B:
                - 300
            840B:
                - 300
            twitter.27B:
                - 25
                - 50
                - 100
                - 200
            6B:
                - 50
                - 100
                - 200
                - 300
        unk_tensor (Tensor): a 1d tensor representing the vector associated with an unknown token.
        root (str): folder used to store downloaded files in. Default: '.data'.
        validate_file (bool): flag to determine whether to validate the downloaded files checksum.
                              Should be `False` when running tests with a local asset.
        num_cpus (int): the number of CPUs to use when loading the vectors from file. Default: 32.

    Returns:
        Vectors: a Vectors object.

    Raises:
        ValueError: if unexpected duplicate tokens are found in GloVe file.

    """
    dup_token_glove_840b = [
        "����������������������������������������������������������������������"
        "����������������������������������������������������������������������"
        "����������������������������������������������������������������������"
        "����������������������������������������������������������������������"
        "������������������������������������������������������"
    ]
    urls = {
        "42B": "https://nlp.stanford.edu/data/glove.42B.300d.zip",
        "840B": "https://nlp.stanford.edu/data/glove.840B.300d.zip",
        "twitter.27B": "https://nlp.stanford.edu/data/glove.twitter.27B.zip",
        "6B": "https://nlp.stanford.edu/data/glove.6B.zip",
    }
    valid_glove_file_names = {
        "glove.42B.300d.txt", "glove.840B.300d.txt",
        "glove.twitter.27B.25d.txt", "glove.twitter.27B.50d.txt",
        "glove.twitter.27B.100d.txt", "glove.twitter.27B.200d.txt",
        "glove.6B.50d.txt", "glove.6B.100d.txt", "glove.6B.200d.txt",
        "glove.6B.300d.txt"
    }

    file_name = "glove.{}.{}d.txt".format(name, str(dim))
    if file_name not in valid_glove_file_names:
        raise ValueError(
            "Could not find GloVe file with name {}. Please check that `name` and `dim` "
            "are valid.".format(file_name))

    url = urls[name]
    checksum = None
    if validate_file:
        checksum = CHECKSUMS_GLOVE.get(url, None)

    downloaded_file_path = download_from_url(url,
                                             root=root,
                                             hash_value=checksum)
    extracted_file_paths = extract_archive(downloaded_file_path)
    # need to get the full path to the correct file in the case when multiple files are extracted with different dims
    extracted_file_path_with_correct_dim = [
        path for path in extracted_file_paths if file_name in path
    ][0]
    cpp_vectors_obj, dup_tokens = _load_token_and_vectors_from_file(
        extracted_file_path_with_correct_dim, ' ', num_cpus, unk_tensor)

    # Ensure there is only 1 expected duplicate token present for 840B dataset
    if dup_tokens and dup_tokens != dup_token_glove_840b:
        raise ValueError("Found duplicate tokens in file: {}".format(
            str(dup_tokens)))

    vectors_obj = Vectors(cpp_vectors_obj)
    return vectors_obj
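
A corresponding usage sketch for GloVe, assuming it is importable from torchtext.experimental.vectors. The 6B dataset is the smallest download; dim must match one of the options listed in the docstring for the chosen name, and unk_tensor lets you choose the vector returned for out-of-vocabulary tokens.

import torch

from torchtext.experimental.vectors import GloVe

# Out-of-vocabulary tokens will map to a zero vector of the chosen dim.
unk = torch.zeros(50)
glove_vectors = GloVe(name="6B", dim=50, unk_tensor=unk)
vec = glove_vectors["hello"]  # a 50-dimensional torch.float tensor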