Example #1
    def test_embeddings_text_file(self):
        txt_path = str(self.FIXTURES_ROOT / "utf-8_sample/utf-8_sample.txt")

        # Read the reference text directly; this is the known-correct way to read a UTF-8 encoded text file
        with open(txt_path, "rt", encoding="utf-8") as f:
            correct_text = f.read()

        # Check if we get the correct text on plain and compressed versions of the file
        paths = [txt_path] + [txt_path + ext for ext in [".gz", ".zip"]]
        for path in paths:
            with EmbeddingsTextFile(path) as f:
                text = f.read()
            assert text == correct_text, "Test failed for file: " + path

        # Check for a file contained inside an archive with multiple files
        for ext in [".zip", ".tar.gz", ".tar.bz2", ".tar.lzma"]:
            archive_path = str(
                self.FIXTURES_ROOT / "utf-8_sample/archives/utf-8") + ext
            file_uri = format_embeddings_file_uri(archive_path,
                                                  "folder/utf-8_sample.txt")
            with EmbeddingsTextFile(file_uri) as f:
                text = f.read()
            assert text == correct_text, "Test failed for file: " + archive_path

        # Passing a second level path when not reading an archive
        with pytest.raises(ValueError):
            with EmbeddingsTextFile(
                    format_embeddings_file_uri(txt_path, "a/fake/path")):
                pass
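
A minimal sketch of the archive-URI pattern exercised above, assuming the AllenNLP import path shown in Example #6; the archive path and the inner file name below are placeholders, not real fixture files.

from allennlp.modules.token_embedders.embedding import (
    EmbeddingsTextFile, format_embeddings_file_uri)

uri = format_embeddings_file_uri("/data/vectors.zip", "folder/vectors.txt")
with EmbeddingsTextFile(uri) as embeddings_file:
    first_line = embeddings_file.readline()  # reads the archived file transparently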
Example #2
def re_read_embeddings_from_text_file(file_uri, embedding_dim, vocab, namespace):
    """
    Re-read an embeddings text file and return the indices of the vocabulary rows that
    have a pretrained vector of the expected dimension (i.e. rows that should not be
    re-optimized). Lines whose number of fields does not match ``embedding_dim`` are skipped.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    continue

                vector = np.asarray(fields[1:], dtype="float32")
                embeddings[token] = vector

    index_to_token = vocab.get_index_to_token_vocabulary(namespace)

    rows_not_to_optimize = []
    for i in range(vocab_size):
        token = index_to_token[i]

        if token in embeddings:
            rows_not_to_optimize.append(i)

    return rows_not_to_optimize
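
A hypothetical driver for the helper above, assuming AllenNLP's Vocabulary and a placeholder embeddings path; the helper itself is assumed to be importable from wherever this snippet lives.

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_tokens_to_namespace(["the", "cat"], namespace="tokens")
rows = re_read_embeddings_from_text_file(
    "vectors.txt.gz", embedding_dim=50, vocab=vocab, namespace="tokens")
# 'rows' lists the vocabulary indices that already have a pretrained vector,
# e.g. so those embedding rows can be frozen while the remaining rows are trained.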
Example #3
def read_embeddings_from_text_file(
        filepath: str,
        debug: bool = False,
        emb_size: Optional[int] = None,
        first_line: bool = False,
        use_padding: bool = False,
        split_char: str = '\t',
        use_cache: bool = True) -> Tuple[Dict[str, int], np.ndarray]:
    print('load emb from {} ...'.format(filepath))
    id2ind = defaultdict(lambda: len(id2ind))
    emb = []
    if use_padding:  # put padding at the first position
        _ = id2ind[PADDING]
    with EmbeddingsTextFile(filepath) as embeddings_file:
        if first_line:
            embeddings_file.readline()
        for line in tqdm(embeddings_file):
            token = line.split(split_char, 1)[0]
            _ = id2ind[token]  # assign the next free index to this token
            if debug:
                emb.append([0.1] * emb_size)
            else:
                vec = np.asarray(line.rstrip().split(split_char)[1:],
                                 dtype='float32')
                if emb_size and len(vec) != emb_size:
                    raise ValueError('emb dim incorrect')
                emb.append(vec)
    if use_padding:  # put padding at the first position
        if emb_size:
            emb.insert(0, [0] * emb_size)
        else:
            emb.insert(0, [0] * len(emb[0]))
    return dict(id2ind), np.array(emb, dtype=np.float32)
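
A hypothetical call to the loader above; the path is a placeholder, PADDING is assumed to be the module-level constant the snippet relies on, and the separator is overridden because GloVe-style files are space-separated.

id2ind, emb = read_embeddings_from_text_file(
    'vectors.txt.gz',      # placeholder path
    emb_size=50,
    use_padding=True,
    split_char=' ')
assert emb.shape[1] == 50
assert id2ind[PADDING] == 0  # the all-zero padding row sits at index 0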
Example #4
    def __init__(self,
                 glove_path: str,
                 embedding_dim: int,
                 trainable: bool = False) -> None:
        super(GloveContextualizer, self).__init__()
        self.embedding_dim = embedding_dim
        self.trainable = trainable
        # Read the GloVe file, and produce a dictionary of tokens to indices, a dictionary
        # of indices to tokens, and a PyTorch Embedding object.
        self.token_to_idx = {DEFAULT_OOV_TOKEN: 0}
        self.idx_to_token = {0: DEFAULT_OOV_TOKEN}

        # First we read the embeddings from the file, only keeping vectors for the words we need.
        logger.info("Reading pretrained embeddings from file")
        embeddings = {}
        with EmbeddingsTextFile(glove_path) as embeddings_file:
            for line in Tqdm.tqdm(embeddings_file):
                token = line.split(' ', 1)[0]
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != self.embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        self.embedding_dim,
                        len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
                self.token_to_idx[token] = len(self.token_to_idx)
                self.idx_to_token[len(self.idx_to_token)] = token
        if not embeddings:
            raise ConfigurationError(
                "No embeddings of correct dimension found; you probably "
                "misspecified your embedding_dim parameter, or didn't "
                "pre-populate your Vocabulary")

        all_embeddings = numpy.asarray(list(embeddings.values()))
        embeddings_mean = float(numpy.mean(all_embeddings))
        embeddings_std = float(numpy.std(all_embeddings))
        # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
        # then filling in the word vectors we just read.
        vocab_size = len(self.token_to_idx)
        logger.info("Initializing pre-trained embedding layer")
        embedding_matrix = torch.FloatTensor(vocab_size,
                                             self.embedding_dim).normal_(
                                                 embeddings_mean,
                                                 embeddings_std)
        # Start at 1, since the 0th token is OOV, and fill in the embedding matrix
        for i in range(1, vocab_size):
            embedding_matrix[i] = torch.FloatTensor(
                embeddings[self.idx_to_token[i]])
        self.weight = torch.nn.Parameter(embedding_matrix,
                                         requires_grad=self.trainable)
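
A hypothetical way to use the lookup tables and weight matrix built above; the path is a placeholder and the rest of the GloveContextualizer class is not shown here.

contextualizer = GloveContextualizer("vectors.txt.gz", embedding_dim=50)
row = contextualizer.token_to_idx.get("the", 0)  # index 0 falls back to the OOV row
vector = contextualizer.weight[row]              # shape: (embedding_dim,)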
Example #5
def read_embedding(pretrained_path=PUBMED_PRETRAINED_PATH):
    """
    Read the PubMed pretrained embeddings from Amazon S3 and
    return a dictionary mapping tokens to their vectors.
    """
    embeddings = {}
    with EmbeddingsTextFile(pretrained_path) as embeddings_file:
        for line in embeddings_file:
            token = line.split(' ', 1)[0]
            if token in p_dict:  # keep only tokens present in the target dictionary
                fields = line.rstrip().split(' ')
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
    return embeddings
Example #6
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    # Moving this import to the top breaks everything (circular import, most likely)
    from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

    logger.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = (line[:20] + '...') if len(line) > 20 else line
                logger.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
Example #7
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    """
    Helper method for loading pretrained tokens from a given file. This is typically
    the case for e.g. GloVe files, where each line contains a token followed by that
    token's embedding.

    Parameters
    ----------
    embeddings_file_uri : ``str``
        The path to the embedding file to load.
    """
    # Moving this import to the top breaks everything (circular import, most likely)
    from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

    LOGGER.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = (line[:20] + '...') if len(line) > 20 else line
                LOGGER.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
Example #8
    def check_num_tokens(first_line, expected_num_tokens):
        with open(test_filename, "w") as f:
            f.write(first_line)
        with EmbeddingsTextFile(test_filename) as f:
            assert (
                f.num_tokens == expected_num_tokens
            ), f"Wrong num tokens for line: {first_line}"
Example #9
def _read_embeddings_from_text_file(
        file_uri: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens"
) -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a possibly compressed text file, which may itself be
    contained inside an archive with multiple files. The text file is assumed to be utf-8
    encoded with space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical fields does not match `embedding_dim` trigger a warning
    and are skipped.

    The remainder of the docstring is identical to `_read_pretrained_embeddings_file`.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    char_embeddings = {}
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1,
                        line,
                    )
                    continue

                vector = numpy.asarray(fields[1:], dtype="float32")
                # Accumulate a running (sum, count) per character so that OOV tokens can
                # later be approximated from averaged character vectors.
                for char in token:
                    if char in char_embeddings:
                        char_sum, char_count = char_embeddings[char]
                        char_embeddings[char] = (char_sum + vector, char_count + 1)
                    else:
                        char_embeddings[char] = (vector, 1)
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary"
        )

    char_embeddings = {
        char: char_sum / char_count for char, (char_sum, char_count) in char_embeddings.items()
    }
    chars = set(char_embeddings.keys())

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std
    )
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        elif len(set(token) - chars) == 0:
            # Every character of this token has an averaged character vector, so
            # approximate the missing token embedding by summing those vectors.
            embedding_matrix[i] = torch.FloatTensor(
                numpy.stack([char_embeddings[char] for char in token])
            ).sum(dim=0)
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.", token
            )

    logger.info(
        "Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size
    )

    return embedding_matrix
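
A toy illustration of the character fallback above, with made-up 2-dimensional vectors: a token that is absent from the file but whose characters all have averaged vectors gets the sum of those averages as its row.

import numpy

char_avg = {"a": numpy.array([0.1, 0.2]), "b": numpy.array([0.3, -0.1])}
oov_vector = sum(char_avg[c] for c in "ab")  # array([0.4, 0.1])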