def vectors_from_file_object(file_like_object, delimiter=",", unk_tensor=None, num_cpus=10):
    r"""Create a Vectors object from a csv file like object.

    Note that the tensor corresponding to each vector is of type `torch.float`.

    Format for csv file:
        token1<delimiter>num1 num2 num3
        token2<delimiter>num4 num5 num6
        ...
        token_n<delimiter>num_m num_j num_k

    Args:
        file_like_object (FileObject): a file like object to read data from.
        delimiter (char): a character to delimit between the token and the vector.
            Default value is ","
        unk_tensor (Tensor): a 1d tensor representing the vector associated with an
            unknown token.
        num_cpus (int): the number of cpus to use when loading the vectors from file.
            Default: 10.

    Returns:
        Vectors: a Vectors object.

    Raises:
        ValueError: if duplicate tokens are found in the file.
    """
    # NOTE: the C++ loader reads from the file *path*, not the object itself,
    # hence `.name` — the object must be backed by a real file on disk.
    vectors_obj, dup_tokens = _load_token_and_vectors_from_file(
        file_like_object.name, delimiter, num_cpus, unk_tensor)
    if dup_tokens:
        raise ValueError("Found duplicate tokens in file: {}".format(str(dup_tokens)))
    return Vectors(vectors_obj)
def FastText(language="en", unk_tensor=None, root=".data", validate_file=True, num_cpus=32):
    r"""Create a FastText Vectors object.

    Args:
        language (str): the language to use for FastText. The list of supported
            languages options can be found at
            https://fasttext.cc/docs/en/language-identification.html
        unk_tensor (Tensor): a 1d tensor representing the vector associated with an
            unknown token.
        root (str): folder used to store downloaded files in. Default: '.data'.
        validate_file (bool): flag to determine whether to validate the downloaded
            files checksum. Should be `False` when running tests with a local asset.
        num_cpus (int): the number of cpus to use when loading the vectors from file.
            Default: 32.

    Returns:
        torchtext.experimental.vectors.Vectors: a Vectors object.

    Raises:
        ValueError: if duplicate tokens are found in FastText file.
    """
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.vec".format(language)

    # Only look up the expected hash when validation is requested; a None hash
    # makes download_from_url skip the checksum comparison.
    checksum = None
    if validate_file:
        checksum = CHECKSUMS_FAST_TEXT.get(url, None)

    downloaded_file_path = download_from_url(url, root=root, hash_value=checksum)
    cpp_vectors_obj, dup_tokens = _load_token_and_vectors_from_file(
        downloaded_file_path, ' ', num_cpus, unk_tensor)
    if dup_tokens:
        raise ValueError("Found duplicate tokens in file: {}".format(str(dup_tokens)))

    vectors_obj = Vectors(cpp_vectors_obj)
    return vectors_obj
def GloVe(name="840B", dim=300, unk_tensor=None, root=".data", validate_file=True, num_cpus=32):
    r"""Create a GloVe Vectors object.

    Args:
        name (str): the name of the GloVe dataset to use. Options are:
            - 42B
            - 840B
            - twitter.27B
            - 6B
        dim (int): the dimension for the GloVe dataset to load. Options are:
            42B:
                - 300
            840B:
                - 300
            twitter.27B:
                - 25
                - 50
                - 100
                - 200
            6B:
                - 50
                - 100
                - 200
                - 300
        unk_tensor (Tensor): a 1d tensor representing the vector associated with an
            unknown token.
        root (str): folder used to store downloaded files in (.data)
        validate_file (bool): flag to determine whether to validate the downloaded
            files checksum. Should be `False` when running tests with a local asset.
        num_cpus (int): the number of cpus to use when loading the vectors from file.
            Default: 32.

    Returns:
        Vectors: a Vectors object.

    Raises:
        ValueError: if unexpected duplicate tokens are found in GloVe file.
    """
    # The 840B dataset is known to contain exactly one duplicate (mojibake) token;
    # finding it is expected and must not raise.
    dup_token_glove_840b = [
        "����������������������������������������������������������������������"
        "����������������������������������������������������������������������"
        "����������������������������������������������������������������������"
        "����������������������������������������������������������������������"
        "������������������������������������������������������"
    ]
    urls = {
        "42B": "https://nlp.stanford.edu/data/glove.42B.300d.zip",
        "840B": "https://nlp.stanford.edu/data/glove.840B.300d.zip",
        "twitter.27B": "https://nlp.stanford.edu/data/glove.twitter.27B.zip",
        "6B": "https://nlp.stanford.edu/data/glove.6B.zip",
    }
    valid_glove_file_names = {
        "glove.42B.300d.txt",
        "glove.840B.300d.txt",
        "glove.twitter.27B.25d.txt",
        "glove.twitter.27B.50d.txt",
        "glove.twitter.27B.100d.txt",
        "glove.twitter.27B.200d.txt",
        "glove.6B.50d.txt",
        "glove.6B.100d.txt",
        "glove.6B.200d.txt",
        "glove.6B.300d.txt",
    }

    file_name = "glove.{}.{}d.txt".format(name, str(dim))
    if file_name not in valid_glove_file_names:
        raise ValueError(
            "Could not find GloVe file with name {}. Please check that `name` and `dim` "
            "are valid.".format(str(file_name)))

    url = urls[name]
    checksum = None
    if validate_file:
        checksum = CHECKSUMS_GLOVE.get(url, None)

    downloaded_file_path = download_from_url(url, root=root, hash_value=checksum)
    extracted_file_paths = extract_archive(downloaded_file_path)
    # need to get the full path to the correct file in the case when multiple
    # files are extracted with different dims
    extracted_file_path_with_correct_dim = [
        path for path in extracted_file_paths if file_name in path
    ][0]
    cpp_vectors_obj, dup_tokens = _load_token_and_vectors_from_file(
        extracted_file_path_with_correct_dim, ' ', num_cpus, unk_tensor)

    # Ensure there is only 1 expected duplicate token present for 840B dataset
    if dup_tokens and dup_tokens != dup_token_glove_840b:
        raise ValueError("Found duplicate tokens in file: {}".format(str(dup_tokens)))

    vectors_obj = Vectors(cpp_vectors_obj)
    return vectors_obj