def test_embeddings_text_file(self):
    txt_path = str(self.FIXTURES_ROOT / "utf-8_sample/utf-8_sample.txt")

    # This is for sure a correct way to read an utf-8 encoded text file
    with open(txt_path, "rt", encoding="utf-8") as f:
        correct_text = f.read()

    # Check if we get the correct text on plain and compressed versions of the file
    paths = [txt_path] + [txt_path + ext for ext in [".gz", ".zip"]]
    for path in paths:
        with EmbeddingsTextFile(path) as f:
            text = f.read()
        assert text == correct_text, "Test failed for file: " + path

    # Check for a file contained inside an archive with multiple files
    for ext in [".zip", ".tar.gz", ".tar.bz2", ".tar.lzma"]:
        archive_path = str(self.FIXTURES_ROOT / "utf-8_sample/archives/utf-8") + ext
        file_uri = format_embeddings_file_uri(archive_path, "folder/utf-8_sample.txt")
        with EmbeddingsTextFile(file_uri) as f:
            text = f.read()
        assert text == correct_text, "Test failed for file: " + archive_path

    # Passing a second-level path when not reading an archive should raise an error
    with pytest.raises(ValueError):
        with EmbeddingsTextFile(format_embeddings_file_uri(txt_path, "a/fake/path")):
            pass
def re_read_embeddings_from_text_file(file_uri, embedding_dim, vocab, namespace):
    """Return the vocabulary indices of tokens that have a pretrained vector in the file."""
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Skip malformed lines whose vector length does not match embedding_dim.
                    continue
                vector = np.asarray(fields[1:], dtype="float32")
                embeddings[token] = vector

    # Collect the vocabulary rows that received a pretrained vector; these are the
    # rows the caller should leave untouched (not optimize).
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    rows_not_to_optimize = []
    for i in range(vocab_size):
        token = index_to_token[i]
        if token in embeddings:
            rows_not_to_optimize.append(i)
    return rows_not_to_optimize
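# A hedged sketch of one way the indices returned above might be consumed:
# freezing the pretrained rows of an embedding layer by zeroing their gradients.
# The Embedding layer, the gradient hook, and the names `file_uri` / `vocab`
# below are illustrative assumptions, not part of the function above.
import torch

rows_not_to_optimize = re_read_embeddings_from_text_file(file_uri, 300, vocab, "tokens")
frozen_rows = torch.tensor(rows_not_to_optimize, dtype=torch.long)

embedding = torch.nn.Embedding(vocab.get_vocab_size("tokens"), 300)

def _zero_pretrained_grads(grad):
    # Leave the pretrained rows untouched during optimization.
    grad = grad.clone()
    grad[frozen_rows] = 0.0
    return grad

embedding.weight.register_hook(_zero_pretrained_grads)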
def read_embeddings_from_text_file(
        filepath: str,
        debug: bool = False,
        emb_size: int = None,
        first_line: bool = False,
        use_padding: bool = False,
        split_char: str = '\t',
        use_cache: bool = True) -> Tuple[Dict[str, int], np.ndarray]:
    print('load emb from {} ...'.format(filepath))
    id2ind = defaultdict(lambda: len(id2ind))
    emb = []
    if use_padding:
        # Reserve index 0 for the padding token.
        _ = id2ind[PADDING]
    with EmbeddingsTextFile(filepath) as embeddings_file:
        if first_line:
            # Skip the header line.
            embeddings_file.readline()
        for i, line in tqdm(enumerate(embeddings_file)):
            token = line.split(split_char, 1)[0]
            _ = id2ind[token]
            if debug:
                # In debug mode, use dummy vectors instead of parsing the file
                # (requires emb_size to be set).
                emb.append([0.1] * emb_size)
            else:
                vector = np.asarray(line.rstrip().split(split_char)[1:], dtype='float32')
                if emb_size and len(vector) != emb_size:
                    raise ValueError('emb dim incorrect')
                emb.append(vector)
    if use_padding:
        # Insert the all-zero padding vector at the first position.
        if emb_size:
            emb.insert(0, [0] * emb_size)
        else:
            emb.insert(0, [0] * len(emb[0]))
    return dict(id2ind), np.array(emb, dtype=np.float32)
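# A minimal, hypothetical usage of the loader above. The file name "glove.tsv"
# and the 50-dimensional size are placeholders; the file is expected to be
# tab-separated (the default split_char), one token per line followed by its vector.
id2ind, emb = read_embeddings_from_text_file('glove.tsv', emb_size=50, use_padding=True)
index = id2ind['the']      # token index; 0 is reserved for padding
vector = emb[index]        # numpy array of shape (50,)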
def __init__(self, glove_path: str, embedding_dim: int, trainable: bool = False) -> None:
    super(GloveContextualizer, self).__init__()
    self.embedding_dim = embedding_dim
    self.trainable = trainable
    # Read the GloVe file, and produce a dictionary of tokens to indices, a dictionary
    # of indices to tokens, and a weight matrix for a PyTorch embedding layer.
    self.token_to_idx = {DEFAULT_OOV_TOKEN: 0}
    self.idx_to_token = {0: DEFAULT_OOV_TOKEN}

    # First we read all the embeddings from the file.
    logger.info("Reading pretrained embeddings from file")
    embeddings = {}
    with EmbeddingsTextFile(glove_path) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            fields = line.rstrip().split(' ')
            if len(fields) - 1 != self.embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column). We skip those lines. Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped. It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                    self.embedding_dim, len(fields) - 1, line)
                continue
            vector = numpy.asarray(fields[1:], dtype='float32')
            embeddings[token] = vector
            self.token_to_idx[token] = len(self.token_to_idx)
            self.idx_to_token[len(self.idx_to_token)] = token

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    vocab_size = len(self.token_to_idx)
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, self.embedding_dim).normal_(
        embeddings_mean, embeddings_std)
    # Start at 1, since the 0th token is OOV, and fill in the embedding matrix
    for i in range(1, vocab_size):
        embedding_matrix[i] = torch.FloatTensor(embeddings[self.idx_to_token[i]])
    self.weight = torch.nn.Parameter(embedding_matrix, requires_grad=self.trainable)
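# A hedged instantiation sketch for the contextualizer above. The file path and
# dimension are placeholders; after construction, `weight` holds the GloVe matrix
# and `token_to_idx` maps tokens to rows (index 0 is the OOV token).
contextualizer = GloveContextualizer("glove.6B.300d.txt", embedding_dim=300, trainable=False)
idx = contextualizer.token_to_idx.get("cat", 0)   # unknown tokens fall back to the OOV row
vector = contextualizer.weight[idx]               # tensor of shape (300,)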
def read_embedding(pretrained_path=PUBMED_PRETRAINED_PATH):
    """
    Read PubMed pretrained embeddings from Amazon S3 and return a dictionary
    mapping tokens to their vectors.
    """
    embeddings = {}
    with EmbeddingsTextFile(pretrained_path) as embeddings_file:
        for line in embeddings_file:
            token = line.split(' ', 1)[0]
            if token in p_dict:
                fields = line.rstrip().split(' ')
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
    return embeddings
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    # Moving this import to the top breaks everything (circular import, I guess)
    from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

    logger.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = line[:20] + '...' if len(line) > 20 else line
                logger.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    """
    Helper method for loading pretrained tokens from a given file, typically an
    embedding file such as GloVe, where each line contains a token followed by
    its embedding.

    Parameters
    ----------
    embeddings_file_uri : ``str``
        The path to the embedding file to load.
    """
    # Moving this import to the top breaks everything (circular import, I guess)
    from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

    LOGGER.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = line[:20] + '...' if len(line) > 20 else line
                LOGGER.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
def check_num_tokens(first_line, expected_num_tokens):
    with open(test_filename, "w") as f:
        f.write(first_line)
    with EmbeddingsTextFile(test_filename) as f:
        assert f.num_tokens == expected_num_tokens, \
            f"Wrong num tokens for line: {first_line}"
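# Possible calls to the helper above, assuming EmbeddingsTextFile interprets a
# first line consisting of one or two integers as a token-count header (keeping
# the larger value) and reports num_tokens as None otherwise. test_filename is
# assumed to be defined in the enclosing test, as in the helper itself.
check_num_tokens("42", expected_num_tokens=42)
check_num_tokens("42 300", expected_num_tokens=300)
check_num_tokens("hello 0.2 0.5 0.1", expected_num_tokens=None)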
def _read_embeddings_from_text_file(
    file_uri: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens"
) -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a possibly compressed text file, which may itself be
    contained inside an archive with multiple files. The text file is assumed to be utf-8
    encoded with space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical fields does not match `embedding_dim` raise a warning
    and are skipped.

    The remainder of the docstring is identical to `_read_pretrained_embeddings_file`.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    char_embeddings = {}
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")
    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column). We skip those lines. Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped. It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1,
                        line,
                    )
                    continue

                vector = numpy.asarray(fields[1:], dtype="float32")
                # Accumulate a (sum, count) pair per character so we can later build
                # average character embeddings for out-of-embedding tokens.
                for char in list(token):
                    if char in char_embeddings:
                        char_embeddings[char] = (
                            char_embeddings[char][0] + vector,
                            char_embeddings[char][1] + 1,
                        )
                    else:
                        char_embeddings[char] = (vector, 1)
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary"
        )

    # Average the accumulated character vectors.
    char_embeddings = {
        char: char_embeddings[char][0] / char_embeddings[char][1] for char in char_embeddings
    }
    chars = set(char_embeddings.keys())

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std
    )
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        elif len(set(token) - chars) == 0:
            # Every character of this token has an average character embedding: back off to
            # the sum of those character vectors.
            embedding_matrix[i] = torch.FloatTensor(
                [char_embeddings[char] for char in list(token)]
            ).sum(dim=-2)
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.", token
            )

    logger.info(
        "Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size
    )

    return embedding_matrix
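# A hypothetical end-to-end call of the loader above. The GloVe path and the toy
# vocabulary are placeholders; the function expects an AllenNLP Vocabulary and a
# (possibly compressed or archived) text file of space-separated vectors.
import torch
from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_tokens_to_namespace(["the", "cat", "sat"], namespace="tokens")

weight = _read_embeddings_from_text_file(
    "glove.6B.50d.txt", embedding_dim=50, vocab=vocab, namespace="tokens"
)
embedding_layer = torch.nn.Embedding.from_pretrained(weight, freeze=False)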