    def test_embeddings_are_as_expected(self):
        loaded_sentences, loaded_embeddings = self._load_sentences_embeddings()

        assert len(loaded_sentences) == len(loaded_embeddings)
        batch_size = len(loaded_sentences)

        # The sentences and embeddings are organized in an idiosyncratic way because of how TensorFlow handles batching.
        # We are going to reorganize them linearly so they can be grouped into batches by AllenNLP.
        sentences = []
        expected_embeddings = []
        for batch_number in range(len(loaded_sentences[0])):
            for index in range(batch_size):
                sentences.append(loaded_sentences[index][batch_number].split())
                expected_embeddings.append(
                    loaded_embeddings[index][batch_number])

        assert len(expected_embeddings) == len(sentences)

        embedder = ElmoEmbedder(options_file=self.options_file,
                                weight_file=self.weight_file)
        embeddings = list(embedder.embed_sentences(sentences, batch_size))

        assert len(embeddings) == len(sentences)

        for tensor, expected in zip(embeddings, expected_embeddings):
            numpy.testing.assert_array_almost_equal(tensor[2], expected)

    def test_embed_batch_contains_empty_sentence(self):
        embedder = ElmoEmbedder(options_file=self.options_file,
                                weight_file=self.weight_file)
        embeddings = list(
            embedder.embed_sentences(["This is a test".split(), []]))

        assert len(embeddings) == 2
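
A minimal sketch of calling embed_sentences directly, outside the test harness above (assuming the default pretrained ELMo options/weights can be downloaded); each yielded array stacks the three biLM layers per token:

from allennlp.commands.elmo import ElmoEmbedder

embedder = ElmoEmbedder()  # falls back to the default options/weights when none are given
vectors = list(embedder.embed_sentences([
    "This is a test".split(),
    "Another sentence .".split(),
]))
# each element of vectors is a numpy array of shape (3, num_tokens, 1024)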
Example no. 3
class ElmoEmbedder(GenericEmbedder):
    def __init__(self) -> None:
        super().__init__()
        self._elmo = AllenNLPElmoEmbedder()

    def embed_sentence(self,
                       text,
                       tokenized=True,
                       term_vectors=False,
                       **kwargs):
        if tokenized:
            tokens = text.split()
        else:
            tokens = self._spacy.word_tokenize(text)

        vectors = self._elmo.embed_sentence(tokens)
        return self._get_term_or_seq_vectors(vectors, term_vectors,
                                             kwargs.get("pooling", "mean"))

    def embed_collection(self,
                         iterable,
                         tokenized=True,
                         term_vectors=False,
                         **kwargs):
        collection_vectors = self._elmo.embed_sentences(
            TokenizerWrapper(self._spacy, iterable, tokenized))
        for vectors in collection_vectors:
            yield self._get_term_or_seq_vectors(vectors, term_vectors,
                                                kwargs.get("pooling", "mean"))
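
The _spacy tokenizer and _get_term_or_seq_vectors helper come from the GenericEmbedder base class, which is not shown here. A rough sketch, under that assumption, of the kind of mean pooling such a helper might apply to the (3, num_tokens, 1024) ELMo output (the helper name below is hypothetical):

import numpy as np

def mean_pool_sequence(vectors: np.ndarray) -> np.ndarray:
    # hypothetical helper: average over the 3 ELMo layers, then over the tokens -> (1024,)
    return vectors.mean(axis=0).mean(axis=0)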
Example no. 4
def get_protein_nodes(raw_dir):
    """
    Retrieves features for the proteins. Embeddings are computed from protein sequences using the ELMo model from
    https://github.com/rostlab/SeqVec

    :param raw_dir: Path to the directory containing preprocessed protein embeddings
    :return: protein_nodes dataframe. Each line corresponds to a gene and contains an embedding of the gene based on
        the sequence of the corresponding protein
    """
    print("Processing protein nodes..")

    if os.path.isfile(os.path.join(raw_dir, "protein_embeddings.csv")):
        protein_nodes = pd.read_csv(os.path.join(raw_dir,
                                                 "protein_embeddings.csv"),
                                    index_col="gene_hgnc_id")
    else:
        print("Computing protein embeddings, only happens the first time")
        from allennlp.commands.elmo import ElmoEmbedder

        # Embedder of protein sequences
        embedder = ElmoEmbedder(
            os.path.join(get_project_root(),
                         "DrugComb/raw/uniref50_v2/options.json"),
            os.path.join(get_project_root(),
                         "DrugComb/raw/uniref50_v2/weights.hdf5"),
            cuda_device=-1,
        )

        unique_gene_id = (rsv.get_proteins()["gene_hgnc_id"].drop_duplicates(
            keep="first").index)

        # Get dataframe containing protein sequences
        protein_nodes = (rsv.get_proteins()[[
            "gene_hgnc_id", "protein_sequence"
        ]].loc[unique_gene_id].set_index("gene_hgnc_id"))

        # Embed the protein sequences
        seqs = list(protein_nodes["protein_sequence"])
        seqs = [list(seq) for seq in seqs]
        seqs.sort(key=len)
        embedding = []
        for i in tqdm(range(len(seqs) // 10 + 1)):
            batch_embd = list(
                embedder.embed_sentences(seqs[10 * i:10 * (i + 1)]))
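            # each embd has shape (3, seq_len, 1024): summing the layers and averaging over
            # the residues yields one 1024-dim vector per protein sequence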
            batch_embd = [embd.sum(axis=0).mean(axis=0) for embd in batch_embd]

            embedding.extend(batch_embd)

        protein_nodes = pd.DataFrame(
            embedding,
            index=protein_nodes.index,
            columns=["emb_" + str(i) for i in range(len(embedding[0]))],
        )
        protein_nodes["is_drug"] = 0

        # Save the embeddings
        protein_nodes.to_csv(os.path.join(raw_dir, "protein_embeddings.csv"))

    return protein_nodes
Example no. 5
class ElmoVectorEmbedderRunner(object):
    def __init__(self):
        self.max_sentences = 4
        self.max_words = 40
        self.embedding_size = 1024 * 3
        self.model = ElmoEmbedder()
        self.null_vector = np.zeros((self.max_sentences, 1024 * 3))

    def get_embedding(self, tokens):
        # return [self.model.embed_sentence(i) for i in p1["tokens"]]
        # sentences =  [i[0] for i in self.model.embed_sentences(tokens[0:self.max_sentences])]
        sentences = [
            i for i in self.model.embed_sentences(tokens[0:self.max_sentences])
        ]
        for idx in range(len(sentences)):
            sentence = sentences[idx]
            f1, f2, f3 = sentence
            f1 = f1.mean(0)
            f2 = f2.mean(0)
            f3 = f3.mean(0)
            combined = np.concatenate([f1, f2, f3], 0)
            # if sentence.shape[0] < self.max_words:
            #     word_diff = self.max_words - sentence.shape[0]
            #     zshape = (word_diff, sentence.shape[1])
            #     sentence = np.concatenate([sentence, np.zeros(zshape)], 0)
            # sentences[idx] = sentence.mean(0)
            sentences[idx] = combined

        sentences = np.asarray(sentences)

        try:
            if sentences.shape[0] < self.max_sentences:
                sentence_diff = self.max_sentences - sentences.shape[0]
                # zshape = (sentence_diff, self.max_words, self.embedding_size)
                zshape = (sentence_diff, self.embedding_size)
                sentences = np.concatenate([sentences, np.zeros(zshape)], 0)
        except ValueError:
            return None

        return sentences

    def map_function(self, text_tokens):
        results = []
        mlength = len(text_tokens)
        for idx, tokens in enumerate(text_tokens):
            embedded = self.get_embedding(tokens)
            if embedded is not None:
                results.append(embedded)
            else:
                results.append(self.null_vector)
                print("Problem with: {}".format(idx))
            if idx % 100 == 0:
                print("{} out of {}".format(idx, mlength))
        return results
Example no. 6
def get_page_sectionwise_elmo_vecs(page_outline_dict):
    outline_elmo_dict = dict()
    elmo = ElmoEmbedder()
    for page in page_outline_dict.keys():
        tokenized_headings = [page.split()]
        sections = page_outline_dict[page]
        for s in sections:
            tokenized_headings.append(s.split())
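        # note: embed_sentences returns a lazy generator; the vectors are only computed when iterated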
        embed_vecs = elmo.embed_sentences(tokenized_headings)
        outline_elmo_dict[page] = embed_vecs
    return outline_elmo_dict
Example no. 7
def get_elmo_fea(data, op, wg):
    '''
    Took this method from public kernel:
    https://www.kaggle.com/wochidadonggua/elmo-baseline

    modified it to concatenate all 3 layers
    '''

    def get_nearest(slot, target):
        for i in range(target, -1, -1):
            if i in slot:
                return i

    # add parameter cuda_device=0 to use GPU
    elmo = ElmoEmbedder(options_file=op, weight_file=wg)

    tk = word_tokenizer.WordTokenizer()
    tokens = tk.batch_tokenize(data.Text)
    idx = []

    for i in range(len(tokens)):
        idx.append([x.idx for x in tokens[i]])
        tokens[i] = [x.text for x in tokens[i]]

    vectors = elmo.embed_sentences(tokens)

    ans = []
    for i, vector in enumerate([v for v in vectors]):
        P_l = data.iloc[i].Pronoun
        A_l = data.iloc[i].A.split()
        B_l = data.iloc[i].B.split()

        P_offset = data.iloc[i]['Pronoun-offset']
        A_offset = data.iloc[i]['A-offset']
        B_offset = data.iloc[i]['B-offset']

        if P_offset not in idx[i]:
            P_offset = get_nearest(idx[i], P_offset)
        if A_offset not in idx[i]:
            A_offset = get_nearest(idx[i], A_offset)
        if B_offset not in idx[i]:
            B_offset = get_nearest(idx[i], B_offset)

        # P is a single token. For A and B, average over tokens in the span.
        emb_P = vector[:, idx[i].index(P_offset), :]
        emb_A = np.mean(vector[:, idx[i].index(A_offset):idx[i].index(A_offset) + len(A_l), :], axis=1)
        emb_B = np.mean(vector[:, idx[i].index(B_offset):idx[i].index(B_offset) + len(B_l), :], axis=1)

        ans.append(np.concatenate([emb_A[0], emb_A[1], emb_A[2], emb_B[0], emb_B[1], emb_B[2],
                                   emb_P[0], emb_P[1], emb_P[2]], axis=0).reshape(1, -1))

    emb = np.concatenate(ans, axis=0)
    return emb
Example no. 8
class EmbeddingBaseline:
    def _warm_up_elmo(self):
        # running a few sentences in elmo will set it to a better state than initial zeros
        warm_up_sent = "En efecto , rematado ya su juicio , vino a dar en el más " \
                       "extraño pensamiento que jamás dio loco en el mundo ; y fue que " \
                       "le pareció convenible y necesario , así para el aumento de su honra " \
                       "como para el servicio de su república , hacerse caballero andante , e irse " \
                       "por todo el mundo con sus armas y caballo a buscar las " \
                       "aventuras y a ejercitarse en todo aquello que él había leído que " \
                       "los caballeros andantes se ejercitaban , deshaciendo todo género de agravio , y poniéndose " \
                       "en ocasiones y peligros donde , acabándolos , cobrase eterno nombre y fama .".split()
        for _ in range(3):
            _ = list(
                self.elmo.embed_sentences([warm_up_sent] * self.batch_size,
                                          self.batch_size))

    def __init__(self, cuda_device, weights_path, options_path, batch_size=40):
        super().__init__()
        logging.info('creating elmo in device %d. weight path %s '
                     ' batch_size: %d' %
                     (cuda_device, weights_path, batch_size))
        self.elmo = ElmoEmbedder(cuda_device=cuda_device,
                                 weight_file=weights_path,
                                 options_file=options_path)

        self.batch_size = batch_size

        logging.info('warming up elmo')
        self._warm_up_elmo()

    def embed_sentences(self, inst_id_to_sentence):
        inst_id_sent_tuples = list(inst_id_to_sentence.items())
        target = inst_id_sent_tuples[0][0].rsplit('.', 1)[0]
        to_embed = []

        for _, (tokens, _) in inst_id_sent_tuples:

            to_embed.append(tokens)

        logging.info('embedding %d sentences for target %s' %
                     (len(to_embed), target))
        embedded = list(self.elmo.embed_sentences(to_embed, self.batch_size))
        instance_embedding = dict()

        for index, (inst_id, _) in enumerate(inst_id_sent_tuples):

            instance_embedding[inst_id] = embedded[index][2][
                inst_id_to_sentence[inst_id][1]]
        return instance_embedding
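
A usage sketch for the class above; the file paths, instance id and token index are made-up values for illustration:

baseline = EmbeddingBaseline(cuda_device=-1,
                             weights_path="weights.hdf5",   # hypothetical path
                             options_path="options.json")   # hypothetical path
instance_to_sentence = {"bank.n.1": ("I sat on the river bank".split(), 5)}
vectors = baseline.embed_sentences(instance_to_sentence)
# vectors["bank.n.1"] is the 1024-dim layer-2 ELMo vector of the token at index 5 ("bank")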
Example no. 9
class Elmo_embedder():
    def __init__(
            self,
            model_dir='/home/go96bix/projects/deep_eve/seqvec/uniref50_v2',
            weights="/weights.hdf5",
            options="/options.json"):
        # torch.set_num_threads(multiprocessing.cpu_count()//2)
        self.model_dir = model_dir
        self.weights = self.model_dir + weights
        self.options = self.model_dir + options
        self.seqvec = ElmoEmbedder(self.options, self.weights, cuda_device=-1)

    def elmo_embedding(self, X):
        X_parsed = self.seqvec.embed_sentences(X, 100)
        return list(X_parsed)
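
A usage sketch for the SeqVec wrapper above; the model directory is an assumption, and SeqVec expects one character per amino-acid residue:

embedder = Elmo_embedder(model_dir="/path/to/uniref50_v2")  # hypothetical path
per_residue = embedder.elmo_embedding([list("MKTAYIAKQR")])
# per_residue[0] has shape (3, 10, 1024): three ELMo layers for each of the 10 residues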
Example no. 10
def read_parse_write(elmo: ElmoEmbedder,
                     insts: List[Instance],
                     mode: str = "average") -> None:
    """
    Attach the averaged ELMo vectors to the instances.
    :param elmo: ELMo embedder
    :param insts: list of instances
    :param mode: the mode of elmo vectors
    :return:
    """
    all_vecs = elmo.embed_sentences([inst.input.words for inst in insts])
    index = 0
    for vec in all_vecs:
        insts[index].elmo_vec = np.average(vec, 0)
        index += 1
Example no. 11
def get_mean_elmo_embeddings(docid):
    sentences = preproc_doctext_dict[docid]
    elmo = ElmoEmbedder()
    embed_vecs = elmo.embed_sentences(sentences)
    doc_embed_vecs = []
    for i in range(len(sentences)):
        doc_embed_vecs.append(next(embed_vecs))

    cont_vec = doc_embed_vecs[0]
    for i in range(1, len(doc_embed_vecs)):
        cont_vec = np.hstack((cont_vec, doc_embed_vecs[i]))

    concat_vec = cont_vec[0]
    concat_vec = np.hstack((concat_vec, cont_vec[1]))
    concat_vec = np.hstack((concat_vec, cont_vec[2]))

    mean_vec = np.mean(concat_vec, axis=0)

    doc_embed_dict[docid] = mean_vec
Example no. 12
    def compute_elmo_embeddings(
        self,
        options_file:
        str = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/"
        "elmo_2x4096_512_2048cnn_2xhighway_options.json",
        weight_file:
        str = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/"
        "elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    ) -> Tuple[np.ndarray, ElmoEmbedder]:
        """
        Calculates the ELMo embeddings of the sentences within the corpus. Each embedding has 3 layers and 1024
        dimensions.

        :param options_file: from ELMo, the default options for the model
        :param weight_file: from ELMo, the pre-trained weights for the model
        :return: sentence_embeddings: a np.array containing sentence-level embeddings for each sentence in
                 list_of_sentences
                 elmo: the ElmoEmbedder model used to embed list_of_sentences
        """

        tok_sentences = [
            nltk.word_tokenize(sentence) for sentence in self.list_of_sentences
        ]
        logger.info('Initializing ElmoEmbedder.')
        logger.debug(
            f'Parameters: \n options_file: {options_file} \n weight_file: {weight_file}'
        )
        elmo = ElmoEmbedder(options_file, weight_file)

        word_embeddings = elmo.embed_sentences(
            tok_sentences, batch_size=16)  # this returns a generator such that
        # len(list(embedding_iter)) = len(tok_sentences)

        sentence_embeddings = []
        for embedding in word_embeddings:
            sentence_embedding = np.mean(embedding, axis=1)[
                2]  # taking last layer (ELMo layer is number 3, hence [2]).
            sentence_embeddings.append(sentence_embedding)
        sentence_embeddings = np.array(
            sentence_embeddings)  # shape (len(tok_sentences), 1024)

        return sentence_embeddings, elmo
Example no. 13
def get_elmo_embeddings(paralist):
    paralist_index_dict = dict()
    start_index = 0
    for para in paralist:
        sent_count = len(preproc_paratext_dict[para])
        paralist_index_dict[para] = (start_index, start_index + sent_count)
        start_index += sent_count
    sentences = []
    for para in paralist:
        sentences = sentences + preproc_paratext_dict[para]
    elmo = ElmoEmbedder()
    embed_vecs = elmo.embed_sentences(sentences, 10)

    for para in paralist_index_dict.keys():
        para_embed_vecs = []
        for i in range(paralist_index_dict[para][0],
                       paralist_index_dict[para][1]):
            para_embed_vecs.append(next(embed_vecs))
        para_embed_dict[para] = para_embed_vecs
    print("{} paras embedded".format(len(paralist)))
Example no. 14
def get_elmo_embeddings(sentences, max_tokens):
    #create a pretrained elmo model (requires internet connection)
    elmo = ElmoEmbedder(cuda_device=0)
    embeddings = []

    #loop through the input sentences
    for index, elmo_embedding in enumerate(elmo.embed_sentences(sentences)):
        print("elmo:", index)
        # Average the 3 layers returned from Elmo
        avg_elmo_embedding = np.average(elmo_embedding, axis=0)
        padding_length = max_tokens - avg_elmo_embedding.shape[0]
        if (padding_length > 0):
            avg_elmo_embedding = np.append(avg_elmo_embedding,
                                           np.zeros(
                                               (padding_length,
                                                avg_elmo_embedding.shape[1])),
                                           axis=0)
        else:
            avg_elmo_embedding = avg_elmo_embedding[:max_tokens]
        embeddings.append(avg_elmo_embedding)
    # return one 1024-dimensional vector per word (output shape: num_sentences x max_tokens x 1024)
    return np.array(embeddings)
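
A usage sketch of the padding/truncation behaviour above, assuming a GPU at cuda_device=0 and pre-tokenized input:

sents = ["The cat sat on the mat".split(), "Hello".split()]
embs = get_elmo_embeddings(sents, max_tokens=8)
# embs.shape == (2, 8, 1024); shorter sentences are zero-padded, longer ones truncated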
Example no. 15
class Elmo_embedder():
    def __init__(
            self,
            model_dir='/home/go96bix/projects/deep_eve/seqvec/uniref50_v2',
            weights="/weights.hdf5",
            options="/options.json"):
        torch.set_num_threads(multiprocessing.cpu_count() // 2)
        self.model_dir = model_dir
        self.weights = self.model_dir + weights
        self.options = self.model_dir + options
        self.seqvec = ElmoEmbedder(self.options, self.weights, cuda_device=-1)

    def elmo_embedding(self, X, start=None, stop=None):
        # X_trimmed = X[:, start:stop]
        assert start is None and stop is None, "start/stop are deprecated; please trim sequences beforehand"

        if type(X[0]) == str:
            X = np.array([list(i.upper()) for i in X])
        embedding = self.seqvec.embed_sentences(X)
        X_parsed = []
        for i in embedding:
            X_parsed.append(i.mean(axis=0))
        return X_parsed
Example no. 16
def read_parse_write(elmo: ElmoEmbedder,
                     infile: str,
                     outfile: str,
                     mode: str = "average",
                     batch_size=0) -> None:
    """
    Read the input files and write the vectors to the output files
    :param elmo: ELMo embedder
    :param infile: input files for the sentences
    :param outfile: output vector files
    :param mode: the mode of elmo vectors
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    f = open(outfile, 'wb')
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)
    if batch_size < 1:  # Not using batch
        for sent in tqdm(all_sents, desc="Elmo Embedding"):
            elmo_vecs = elmo.embed_sentence(sent)
            vec = parse_sentence(elmo_vecs, mode=mode)
            all_vecs.append(vec)
    else:  # Batched prediction
        for elmo_vecs in tqdm(elmo.embed_sentences(all_sents,
                                                   batch_size=batch_size),
                              desc="Elmo Embedding",
                              total=len(all_sents)):
            vec = parse_sentence(elmo_vecs, mode=mode)
            all_vecs.append(vec)

    print("Finishing embedding ELMo sequences, saving the vector files.")
    pickle.dump(all_vecs, f)
    f.close()
Example no. 17
    def test_embeddings_are_as_expected(self):
        loaded_sentences, loaded_embeddings = self._load_sentences_embeddings()

        assert len(loaded_sentences) == len(loaded_embeddings)
        batch_size = len(loaded_sentences)

        # The sentences and embeddings are organized in an idiosyncratic way because of how TensorFlow handles batching.
        # We are going to reorganize them linearly so they can be grouped into batches by AllenNLP.
        sentences = []
        expected_embeddings = []
        for batch_number in range(len(loaded_sentences[0])):
            for index in range(batch_size):
                sentences.append(loaded_sentences[index][batch_number].split())
                expected_embeddings.append(loaded_embeddings[index][batch_number])

        assert len(expected_embeddings) == len(sentences)

        embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
        embeddings = list(embedder.embed_sentences(sentences, batch_size))

        assert len(embeddings) == len(sentences)

        for tensor, expected in zip(embeddings, expected_embeddings):
            numpy.testing.assert_array_almost_equal(tensor[2], expected)
Example no. 18
                # now processes the sents in batches
                outs = []
                # unlike the tensorflow version we can have dynamic batch sizes here!
                for batchnr in range(math.ceil(len(sents) / batchsize)):
                    fromidx = batchnr * batchsize
                    toidx = (batchnr + 1) * batchsize
                    actualtoidx = min(len(sents), toidx)
                    # print("Batch: from=",fromidx,"toidx=",toidx,"actualtoidx=",actualtoidx)
                    sentsbatch = sents[fromidx:actualtoidx]
                    sentsbatch = [s.split()[:maxtoks] for s in sentsbatch]
                    for s in sentsbatch:
                        if len(s) == 0:
                            s.append(
                                ""
                            )  # otherwise we get a shape (3,0,dims) result
                    ret = list(elmo.embed_sentences(sentsbatch))
                    # the ret is the original representation of three vectors per word
                    # We first combine per word through concatenation or average, then average
                    if concat:
                        ret = [np.concatenate(x, axis=1) for x in ret]
                    else:
                        ret = [np.average(x, axis=1) for x in ret]
                    # print("DEBUG tmpembs=", [l.shape for l in tmpembs])
                    ret = [np.average(x, axis=0) for x in ret]
                    # print("DEBUG finalreps=", [l.shape for l in finalreps])
                    outs.extend(ret)

                # print("Result lines:", len(outs))
                outs = [a.tolist() for a in outs]
                print(fields[0],
                      fields[1],
Example no. 19
class BilmElmo(Bilm):
    def __init__(self,
                 cuda_device,
                 weights_path,
                 vocab_path,
                 lemmatize_predictions,
                 batch_size=40,
                 cutoff=50,
                 cutoff_elmo_vocab=50000,
                 disable_symmetric_patterns=False):
        super().__init__()
        logging.info(
            'creating elmo in device %d. weight path %s, vocab_path %s, lemmatize_predictions %s,'
            ' batch_size: %d disable_symmetric_patterns:%s' %
            (cuda_device, weights_path, vocab_path, lemmatize_predictions,
             batch_size, disable_symmetric_patterns))
        self.elmo = ElmoEmbedder(cuda_device=cuda_device)

        self.batch_size = batch_size
        self.cutoff = cutoff
        self.disable_symmetric_patterns = disable_symmetric_patterns
        logging.info('warming up elmo')
        self._warm_up_elmo()
        logging.info('reading elmo weights')
        with h5py.File(weights_path, 'r') as fin:
            self.elmo_softmax_w = fin['softmax/W'][:].transpose()
        self.elmo_word_vocab = []

        def add_words_from_lines(lines):
            self.elmo_word_vocab = []
            stop_words = {'<UNK>', '<S>', '</S>', '--', '..', '...', '....'}
            rows_delete = []
            for idx, line in enumerate(lines):
                word = line.strip()
                if word in stop_words or len(word) <= 1:
                    rows_delete.append(idx)
                    continue
                self.elmo_word_vocab.append(word)
            self.elmo_softmax_w = np.delete(self.elmo_softmax_w, rows_delete,
                                            1)

        logging.info('reading elmo vocabulary')
        if lemmatize_predictions:
            if os.path.isfile(vocab_path + '.lemmatized'):
                with open(vocab_path + '.lemmatized') as fin:
                    add_words_from_lines(fin)
            else:
                with open(vocab_path) as fin:
                    unlem = [x.strip() for x in fin.readlines()]
                logging.info('lemmatizing ELMo vocabulary')
                print('lemmatizing ELMo vocabulary')
                import spacy
                nlp = spacy.load("en", disable=['ner', 'parser'])
                new_vocab = []
                for spacyed in tqdm(nlp.pipe(
                        unlem,
                        batch_size=1000,
                        n_threads=multiprocessing.cpu_count()),
                                    total=len(unlem)):
                    new_vocab.append(spacyed[0].lemma_ if spacyed[0].
                                     lemma_ != '-PRON-' else spacyed[0].lower_)
                with open(vocab_path + '.lemmatized', 'w') as fout:
                    for word in new_vocab:
                        fout.write('%s\n' % word)
                add_words_from_lines(new_vocab)
                logging.info('lemmatization done and cached to file')
                print('lemmatization done and cached to file')
        else:
            # no lemmatization
            with open(vocab_path) as fin:
                add_words_from_lines(fin)

        logging.info('caching cnn embeddings')
        # self.elmo.elmo_bilm.create_cached_cnn_embeddings(self.elmo_word_vocab)
        # self.elmo.elmo_bilm._has_cached_vocab = True

        self.elmo_word_vocab = self.elmo_word_vocab[:cutoff_elmo_vocab]
        self.elmo_softmax_w = self.elmo_softmax_w[:, :cutoff_elmo_vocab]

    def _warm_up_elmo(self):
        warm_up_sent = "Well , the line comes from deciding what the First Amendment interest is , " \
                       "and if this Court heed the First Amendment interest off of this difference " \
                       "between selecting who gets the benefit of 20 years of extension and just " \
                       "simply legislating in a general way prospectively , then this Court could " \
                       "hold , with respect to the prospective , that it 's not even necessary to " \
                       "raise the intermediate scrutiny in that context , but again , for Ashwander " \
                       "reasons we do n't think that this Court should address the prospective aspect " \
                       "of the CTEA even under the First Amendment .".split()
        for _ in range(3):
            _ = list(
                self.elmo.embed_sentences([warm_up_sent] * self.batch_size,
                                          self.batch_size))

    def _get_top_words_dist(self, state):
        log_probs = np.matmul(state, self.elmo_softmax_w)
        top_k_log_probs = np.argpartition(-log_probs,
                                          self.cutoff)[:self.cutoff]
        top_k_log_probs_vals = log_probs[top_k_log_probs]
        e_x = np.exp(top_k_log_probs_vals - np.max(top_k_log_probs_vals))
        probs = e_x / e_x.sum(axis=0)
        return top_k_log_probs, probs

    def predict_sent_substitute_representatives(
        self, inst_id_to_sentence: Dict[str, Tuple[List[str], int]],
        n_representatives: int, samples_per_side_per_representative: int
    ) -> Dict[str, List[Dict[str, int]]]:
        """
        a representative is a dictionary made out of samples from both sides of the BiLM, predicting substitutes
        for a contextualized token.
        an example might look like:
        {'forward_jump':2,'backward_leap':1, 'backward_climb':1} (samples_per_side_per_representative=2)
        we return a list of n_representatives of those

        :param inst_id_to_sentence: dictionary instance_id -> (sentence tokens list, target word index in tokens)
        :param n_representatives: number of representatives
        :param samples_per_side_per_representative: number of samples to draw from each side
        :return:
        """
        inst_id_sent_tuples = list(inst_id_to_sentence.items())
        target = inst_id_sent_tuples[0][0].rsplit('.', 1)[0]
        lemma = inst_id_sent_tuples[0][0].split('.')[0]
        to_embed = []
        if self.disable_symmetric_patterns:
            # w/o sym. patterns - predict for blanked out word.
            # if the target word is the first or last in sentence get empty prediction by embedding '.'
            for _, (tokens, target_idx) in inst_id_sent_tuples:
                forward = tokens[:target_idx]
                backward = tokens[target_idx + 1:]
                if not forward:
                    forward = ['.']
                if not backward:
                    backward = ['.']
                to_embed += [forward, backward]
        else:
            # w/ sym. patterns - include target word + "and" afterwards in both directions
            for _, (tokens, target_idx) in inst_id_sent_tuples:
                to_embed += [
                    tokens[:target_idx + 1] + ['and'],
                    ['and'] + tokens[target_idx:]
                ]
        logging.info('embedding %d sentences for target %s' %
                     (len(to_embed), target))
        embedded = list(self.elmo.embed_sentences(to_embed, self.batch_size))

        results = {}
        for i in range(len(inst_id_sent_tuples)):
            inst_id, (tokens, target_idx) = inst_id_sent_tuples[i]
            sentence = ' '.join([
                t if i != target_idx else '***%s***' % t
                for i, t in enumerate(tokens)
            ])
            logging.info('instance %s sentence: %s' % (inst_id, sentence))

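            # layer 2 output at the sentence edge; the first 512 state dims are the forward LM,
            # dims 512:1024 are the backward LM (see the comment in the later BilmElmo example)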
            forward_out_em = embedded[i * 2][2, -1, :512]
            backward_out_em = embedded[i * 2 + 1][2, 0, 512:]

            forward_idxs, forward_dist = self._get_top_words_dist(
                forward_out_em)
            backward_idxs, backward_dist = self._get_top_words_dist(
                backward_out_em)

            forward_samples = []
            # after removing samples equal to disamb. target,
            # we might end up with not enough samples, so repeat until we have enough samples
            while len(
                    forward_samples
            ) < n_representatives * samples_per_side_per_representative:
                new_samples = list(
                    np.random.choice(forward_idxs,
                                     n_representatives *
                                     samples_per_side_per_representative * 2,
                                     p=forward_dist))
                new_samples = [
                    x for x in new_samples
                    if self.elmo_word_vocab[x].lower() != lemma
                ]
                forward_samples += new_samples

            backward_samples = []
            while len(
                    backward_samples
            ) < n_representatives * samples_per_side_per_representative:
                new_samples = list(
                    np.random.choice(backward_idxs,
                                     n_representatives *
                                     samples_per_side_per_representative * 2,
                                     p=backward_dist))
                new_samples = [
                    x for x in new_samples
                    if self.elmo_word_vocab[x].lower() != lemma
                ]
                backward_samples += new_samples
            representatives = []
            for _ in range(n_representatives):
                representative = {}
                for _ in range(samples_per_side_per_representative):
                    forward_sampled_word = self.elmo_word_vocab[
                        forward_samples.pop()]
                    backward_sampled_word = self.elmo_word_vocab[
                        backward_samples.pop()]
                    representative['fw:%s' %
                                   forward_sampled_word] = representative.get(
                                       'fw:%s' % forward_sampled_word, 0) + 1
                    representative['bw:%s' %
                                   backward_sampled_word] = representative.get(
                                       'bw:%s' % backward_sampled_word, 0) + 1
                representatives.append(representative)
            logging.info('first 3 representatives out of %d:\n%s' %
                         (n_representatives, representatives[:3]))
            results[inst_id] = representatives
        return results
Example no. 20
class ElmoManager(object):
    def __init__(self, loc, load_bert=True):
        if load_bert:
            self.max_words = 200
            self.analyzer = ElmoTqaEnwikiAnalyzer(loc, self.max_words)
            self.analyzer.elmotize()
            self.model = ElmoEmbedder()
            self.embeddings = {}
            self.max_sentences = 4
            self.embedding_size = 1024

    def get_embedding(self, p1):
        # return [self.model.embed_sentence(i) for i in p1["tokens"]]
        sentences = [
            i[0] for i in self.model.embed_sentences(p1["tokens"]
                                                     [0:self.max_sentences])
        ]
        for idx in range(len(sentences)):
            sentence = sentences[idx]
            # if sentence.shape[0] < self.max_words:
            #     word_diff = self.max_words - sentence.shape[0]
            #     zshape = (word_diff, sentence.shape[1])
            #     sentence = np.concatenate([sentence, np.zeros(zshape)], 0)
            sentences[idx] = sentence.mean(0)

        sentences = np.asarray(sentences)

        if sentences.shape[0] < self.max_sentences:
            sentence_diff = self.max_sentences - sentences.shape[0]
            # zshape = (sentence_diff, self.max_words, self.embedding_size)
            zshape = (sentence_diff, self.embedding_size)
            sentences = np.concatenate([sentences, np.zeros(zshape)], 0)

        # return np.asarray(sentences)
        return sentences

    def run_test(self):
        t = len(self.analyzer.data)
        tqa_matrix = []
        sample_matrix = []
        label_matrix = []
        for idx, example in enumerate(self.analyzer.data):
            print("{} for {}".format(idx, t))
            # if idx > 1:
            #     break
            try:
                qid = example["qid"]
                qd = self.analyzer.bert_data[qid]
                tqa = qd["tqa"][0]
                tqa_embedding = self.get_embedding(tqa)

                for p in qd["enwiki"]:
                    embedding = self.get_embedding(p)
                    tqa_matrix.append(tqa_embedding)
                    label_matrix.append(1)
                    sample_matrix.append(embedding)

                for p in qd["negatives"]:
                    embedding = self.get_embedding(p)
                    tqa_matrix.append(tqa_embedding)
                    label_matrix.append(0)
                    sample_matrix.append(embedding)
            except RuntimeError:
                print("Error")
            except KeyError:
                print("Key Error")

        tqa_matrix = np.asarray(tqa_matrix)
        sample_matrix = np.asarray(sample_matrix)
        label_matrix = np.asarray(label_matrix)

        np.save("elmo_tqa.npy", tqa_matrix)
        np.save("elmo_sample.npy", sample_matrix)
        np.save("elmo_label.npy", label_matrix)

    def load_test(self):
        tqa_matrix = np.load("elmo_tqa.npy")
        sample_matrix = np.load("elmo_sample.npy")
        label_matrix = np.load("elmo_label.npy")

        tqas_train, tqas_test, \
        samples_train, samples_test, \
        labels_train, labels_test = train_test_split(tqa_matrix, sample_matrix, label_matrix,
                                                               test_size = 0.05, random_state = 422)

        # labels_train = np.where(labels_train == 0, -1, 1)
        # labels_test = np.where(labels_test == 0, -1, 1)

        model = BertLSTMModel
        loss_function = torch.nn.BCEWithLogitsLoss(reduction='mean')
        self.train_handler = DataHandler(predictors={
            'p1': tqas_train,
            'p2': samples_train
        },
                                         response=labels_train,
                                         policy=DataPolicy.ALL_DATA)
        self.trainer = BaseTrainer(data_handler=self.train_handler,
                                   model=model,
                                   loss_function=loss_function,
                                   lr=0.001)

        for i in range(10):
            self.trainer.model.is_train = True
            self.trainer.train(weight_decay=0.0000, n=5)
            self.trainer.model.is_train = False

            self.train_handler2 = DataHandler(predictors={
                'p1': tqas_test,
                'p2': samples_test
            },
                                              response=labels_test,
                                              policy=DataPolicy.ALL_DATA)

            results = self.trainer.model(self.train_handler2).detach().numpy()
            results = expit(results)
            results = np.where(results >= 0.5, 1, 0)
            results = np.squeeze(results)

            acc = (np.squeeze(results)
                   == np.squeeze(labels_test)).sum() / labels_test.shape[0]
            print("Acc Test: {}".format(acc))

            self.train_handler2 = DataHandler(predictors={
                'p1': tqas_train,
                'p2': samples_train
            },
                                              response=labels_train,
                                              policy=DataPolicy.ALL_DATA)

            tp = (((results == 1) * (labels_test == 1)).sum())
            tn = ((results == 0) * (labels_test == 0)).sum()
            fn = ((results == 0) * (labels_test == 1)).sum()
            fp = ((results == 1) * (labels_test == 0)).sum()
            print("tp: {}, tn: {}, fn: {}, fp: {}, recall: {}, precision: {}".
                  format(tp, tn, fn, fp, tp / (tp + fn), tp / (tp + fp)))

            results = self.trainer.model(self.train_handler2).detach().numpy()
            results = expit(results)
            results = np.where(results >= 0.5, 1, 0)

            acc = (np.squeeze(results)
                   == np.squeeze(labels_train)).sum() / labels_train.shape[0]
            print("Acc Train: {}".format(acc))

            # results = np.where(results > 0.5, 1, 1)
            #
            # acc = (np.squeeze(results) == np.squeeze(labels_test)).sum() / labels_test.shape[0]
            # print(acc)
            #
            # results = np.where(results > 0.5, 0, 0)
            #
            # acc = (np.squeeze(results) == np.squeeze(labels_test)).sum() / labels_test.shape[0]
            # print(acc)

        torch.save(self.trainer.model, "elmo_model")
Example no. 21
class FastElmo(EmbeddingCandidateGenerator):
    """
    A method to re-rank candidates according to vector similarities, based on the ELMo embeddings.
    """

    def __init__(self, *lookup_services: LookupService,
                 config=EmbeddingCandidateGeneratorConfig(max_subseq_len=0,
                                                          abstract='short',
                                                          abstract_max_tokens=15)):
        super().__init__(*lookup_services, config=config)
        self._model = ElmoEmbedder(cuda_device=0,
                                   weight_file=os.path.join(os.path.dirname(__file__),
                                                            'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'),
                                   options_file=os.path.join(os.path.dirname(__file__),
                                                             'elmo_2x4096_512_2048cnn_2xhighway_options.json'))

    def _embed_sentences(self, sentences: List[str], mode) -> List[np.ndarray]:
        """
        Generic method to generate sentence embeddings from ELMO.
        :param sentences: the list of sentences to embed
        :param mode: from which layer of ELMo you want the embedding.
                     "mean" averages the three ELMo layers for each token
        :return: a list of embeddings
        """
        model_outputs = self._model.embed_sentences([sentence.split() for sentence in sentences], batch_size=16)

        embeds = []
        if mode == "layer_2":
            embeds = [model_output[2] for model_output in model_outputs]

        if mode == "layer_1":
            embeds = [model_output[1] for model_output in model_outputs]

        if mode == "layer_0":
            embeds = [model_output[0] for model_output in model_outputs]

        if mode == "mean":
            embeds = [(model_output[0] + model_output[1] + model_output[2]) / 3 for model_output in model_outputs]

        embeds = [np.mean(embed, axis=0) if embed.size else embed for embed in embeds]

        return embeds

    def _embed_search_keys(self, search_keys: List[SearchKey], mode="layer_2") -> List[Embedding]:
        """
        Generates the sentence embeddings from ELMO for each search key in a list of SearchKey items.
        :param search_keys: the list of SearchKey to embed
        :param mode: from which layer of ELMo you want the embedding.
                     "mean" averages the three ELMo layers for each token
        :return: a list of embeddings
        """
        sentences = [" ".join([search_key.label] + [x[1] for x in search_key.context]) for search_key in search_keys]
        return [Embedding(search_key, embedding)
                for search_key, embedding in zip(search_keys, self._embed_sentences(sentences, mode))]

    def _embed_abstracts(self, abstracts: List[str], mode='layer_2') -> List[Embedding]:
        """
        Generates the sentence embeddings from ELMO for each abstract in list.
        :param abstracts: the list of abstracts to embed
        :return: a list of embeddings
        """
        return [Embedding(abstract, embedding)
                for abstract, embedding in zip(abstracts, self._embed_sentences(abstracts, mode))]
Example no. 22
class BilmElmo(Bilm):

    def __init__(self, cuda_device, weights_path, vocab_path, batch_size=40,
                 cutoff_elmo_vocab=50000):
        super().__init__()
        logging.info(
            'creating elmo in device %d. weight path %s, vocab_path %s '
            ' batch_size: %d' % (
                cuda_device, weights_path, vocab_path,
                batch_size))
        self.elmo = ElmoEmbedder(cuda_device=-1)

        self.batch_size = batch_size

        logging.info('warming up elmo')
        self._warm_up_elmo()

        logging.info('reading elmo weights')
        with h5py.File(weights_path, 'r', libver='latest', swmr=True) as fin:
            self.elmo_softmax_w = fin['softmax/W'][:cutoff_elmo_vocab, :].transpose()
            # self.elmo_softmax_b=fin['softmax/b'][:cutoff_elmo_vocab]
        self.elmo_word_vocab = []
        self.elmo_word_vocab_lemmatized = []

        # we prevent the prediction of these by removing their weights and their vocabulary altogether
        stop_words = {'<UNK>', '<S>', '</S>', '--', '..', '...', '....'}

        logging.info('reading elmo vocabulary')

        lines_to_remove = set()
        with open(vocab_path, encoding="utf-8") as fin:
            for idx, line in enumerate(fin):
                if idx == cutoff_elmo_vocab:
                    break
                word = line.strip()
                if len(word) == 1 or word in stop_words:
                    lines_to_remove.add(idx)
                self.elmo_word_vocab.append(word)

        with open(vocab_path + '.lemmatized', encoding="utf-8") as fin:
            for idx, line in enumerate(fin):
                if idx == cutoff_elmo_vocab:
                    break
                word = line.strip()
                if len(word) == 1 or word in stop_words:
                    lines_to_remove.add(idx)
                self.elmo_word_vocab_lemmatized.append(word)

        # remove stopwords
        self.elmo_word_vocab = [x for i, x in enumerate(self.elmo_word_vocab) if i not in lines_to_remove]
        self.elmo_word_vocab_lemmatized = [x for i, x in enumerate(self.elmo_word_vocab_lemmatized) if
                                           i not in lines_to_remove]
        self.elmo_softmax_w = np.delete(self.elmo_softmax_w, list(lines_to_remove), 1)
        # self.elmo_softmax_b = np.delete(self.elmo_softmax_b, list(lines_to_remove))
        # logging.info('caching cnn embeddings')
        # self.elmo.elmo_bilm.create_cached_cnn_embeddings(self.elmo_word_vocab)
        # self.elmo.elmo_bilm._has_cached_vocab = True

    @staticmethod
    def create_lemmatized_vocabulary_if_needed(vocab_path):
        """
        This creates a new vocabulary file, in the same directory as the ELMo vocab, in which the words have been lemmatized.
        :param vocab_path: path to ELMo vocabulary
        :return:
        """
        if not os.path.isfile(vocab_path + '.lemmatized'):
            # if there is no lemmatized vocabulary, create it
            with open(vocab_path, encoding="utf-8") as fin:
                unlem = [x.strip() for x in fin.readlines()]
            logging.info('lemmatizing ELMo vocabulary')
            print('lemmatizing ELMo vocabulary')
            import spacy
            nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
            new_vocab = []
            for spacyed in tqdm(
                    nlp.pipe(unlem, batch_size=1000, n_threads=multiprocessing.cpu_count()),
                    total=len(unlem)):
                new_vocab.append(spacyed[0].lemma_ if spacyed[0].lemma_ != '-PRON-' else spacyed[0].lower_)
            with open(vocab_path + '.lemmatized', 'w', encoding="utf-8") as fout:
                for word in new_vocab:
                    fout.write('%s\n' % word)
            logging.info('lemmatization done and cached to file')
            print('lemmatization done and cached to file')

    def _warm_up_elmo(self):
        # running a few sentences in elmo will set it to a better state than initial zeros
        warm_up_sent = "Well , the line comes from deciding what the First Amendment interest is , " \
                       "and if this Court heed the First Amendment interest off of this difference " \
                       "between selecting who gets the benefit of 20 years of extension and just " \
                       "simply legislating in a general way prospectively , then this Court could " \
                       "hold , with respect to the prospective , that it 's not even necessary to " \
                       "raise the intermediate scrutiny in that context , but again , for Ashwander " \
                       "reasons we do n't think that this Court should address the prospective aspect " \
                       "of the CTEA even under the First Amendment .".split()
        for _ in range(3):
            _ = list(self.elmo.embed_sentences([warm_up_sent] * self.batch_size, self.batch_size))

    def _get_top_words_dist(self, state, cutoff):
        log_probs = np.matmul(state, self.elmo_softmax_w)  # intentionally not adding self.elmo_softmax_b: ignoring the bias vector prevents unconditionally probable substitute predictions
        top_k_log_probs = np.argpartition(-log_probs, cutoff)[: cutoff]
        top_k_log_probs_vals = log_probs[top_k_log_probs]
        e_x = np.exp(top_k_log_probs_vals - np.max(top_k_log_probs_vals))
        probs = e_x / e_x.sum(axis=0)
        return top_k_log_probs, probs

    def _embed_sentences(self, inst_id_to_sentence: Dict[str, Tuple[List[str], int]], disable_symmetric_patterns) -> \
            Tuple[List, List]:
        inst_id_sent_tuples = list(inst_id_to_sentence.items())
        target = inst_id_sent_tuples[0][0].rsplit('.', 1)[0]
        to_embed = []

        if disable_symmetric_patterns:
            # w/o sym. patterns - predict for blanked out word.
            # if the target word is the first or last in sentence get empty prediction by embedding '.'
            for _, (tokens, target_idx) in inst_id_sent_tuples:
                forward = tokens[:target_idx]
                backward = tokens[target_idx + 1:]
                if not forward:
                    forward = ['.']
                if not backward:
                    backward = ['.']
                to_embed.append(forward)
                to_embed.append(backward)
        else:

            # w/ sym. patterns - include target word + "and" afterwards in both directions
            for _, (tokens, target_idx) in inst_id_sent_tuples:
                # forward sentence
                to_embed.append(tokens[:target_idx + 1] + ['and'])

                # backward sentence
                to_embed.append(['and'] + tokens[target_idx:])

        logging.info('embedding %d sentences for target %s' % (len(to_embed), target))
        embedded = list(self.elmo.embed_sentences(to_embed, self.batch_size))

        return inst_id_sent_tuples, embedded

    def predict_sent_substitute_representatives(self, inst_id_to_sentence: Dict[str, Tuple[List[str], int]],
                                                n_represent: int,
                                                n_samples_side: int, disable_symmetric_patterns: bool,
                                                disable_lemmatiziation: bool, prediction_cutoff: int) \
            -> Dict[str, List[Dict[str, int]]]:
        """
        a representative is a dictionary made out of samples from both sides of the BiLM, predicting substitutes
        for a contextualized token.
        an example might look like:
        {'forward_jump':2,'backward_leap':1, 'backward_climb':1} (n_samples_side=2)
        we return a list of n_representatives of those

        :param inst_id_to_sentence: dictionary instance_id -> (sentence tokens list, target word index in tokens)
        :param n_represent: number of representatives
        :param n_samples_side: number of samples to draw from each side
        :param disable_symmetric_patterns: if true words are predicted from context only
        :param disable_lemmatiziation: if true predictions are not lemmatized
        :param prediction_cutoff: only top prediction_cutoff LM prediction are considered
        :return: map from instance id to list of representatives
        """
        inst_id_sent_tuples, embedded = self._embed_sentences(inst_id_to_sentence, disable_symmetric_patterns)
        lemma = inst_id_sent_tuples[0][0].split('.')[0]

        vocabulary_used = self.elmo_word_vocab if disable_lemmatiziation else self.elmo_word_vocab_lemmatized

        results = {}
        for i in range(len(inst_id_sent_tuples)):
            inst_id, (tokens, target_idx) = inst_id_sent_tuples[i]
            target_word_lower = tokens[target_idx].lower()

            sentence = ' '.join([t if i != target_idx else '***%s***' % t for i, t in enumerate(tokens)])
            logging.info('instance %s sentence: %s' % (inst_id, sentence))

            # these will be multiplied by ELMo's output matrix, [layer-number,token-index, state dims]
            # (first 512 state dims in elmo are the forward LM, 512:1024 are the backward LM)
            forward_out_em = embedded[i * 2][2, -1, :512]
            backward_out_em = embedded[i * 2 + 1][2, 0, 512:]

            forward_idxs, forward_dist = self._get_top_words_dist(forward_out_em, prediction_cutoff)
            backward_idxs, backward_dist = self._get_top_words_dist(backward_out_em, prediction_cutoff)

            forward_samples = []

            # after removing samples equal to disamb. target,
            # we might end up with not enough samples, so repeat until we have enough samples
            while len(forward_samples) < n_represent * n_samples_side:
                new_samples = list(
                    np.random.choice(forward_idxs, n_represent * n_samples_side * 2,
                                     p=forward_dist))
                new_samples = [vocabulary_used[x] for x in new_samples if
                               vocabulary_used[x].lower() != lemma and vocabulary_used[x].lower() != target_word_lower]
                forward_samples += new_samples

            backward_samples = []
            while len(backward_samples) < n_represent * n_samples_side:
                new_samples = list(
                    np.random.choice(backward_idxs, n_represent * n_samples_side * 2,
                                     p=backward_dist))
                new_samples = [vocabulary_used[x] for x in new_samples if
                               vocabulary_used[x].lower() != lemma and vocabulary_used[x].lower() != target_word_lower]
                backward_samples += new_samples
            logging.info('some forward samples: %s' % [x for x in forward_samples[:5]])
            logging.info('some backward samples: %s' % [x for x in backward_samples[:5]])
            representatives = []
            for _ in range(n_represent):
                representative = dict()
                for _ in range(n_samples_side):
                    for sample_src in forward_samples, backward_samples:
                        sample_word = sample_src.pop()
                        representative[sample_word] = representative.get(sample_word, 0) + 1
                representatives.append(representative)
            logging.info('first 3 representatives out of %d:\n%s' % (n_represent, representatives[:3]))
            results[inst_id] = representatives
        return results
Example no. 23
    def test_embed_batch_contains_empty_sentence(self):
        embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
        embeddings = list(embedder.embed_sentences(["This is a test".split(), []]))

        assert len(embeddings) == 2
Example no. 24
class ELMoWordEmbeddings:
    def __init__(self,
                 embeddings_path,
                 elmo_options_file=DEFAULT_OPTIONS_FILE,
                 elmo_weight_file=DEFAULT_WEIGHT_FILE,
                 elmo_mode='average',
                 elmo_cuda_device=-1):
        self.embeddings_path = embeddings_path
        self.embedding_name = os.path.splitext(
            os.path.basename(
                embeddings_path))[0] if embeddings_path is not None else 'None'
        self.word2Idx = None
        self.embeddings = None

        self.elmo_options_file = elmo_options_file
        self.elmo_weight_file = elmo_weight_file
        self.elmo_cuda_device = elmo_cuda_device

        self.elmo_mode = elmo_mode
        self.elmo = None

        self.cache_computed_elmo_embeddings = False
        self.cache = {}
        self.lazyCacheFiles = []

    def getConfig(self):
        return {
            "embeddings_path": self.embeddings_path,
            "elmo_options_file": self.elmo_options_file,
            "elmo_weight_file": self.elmo_weight_file,
            "elmo_mode": self.elmo_mode,
            "elmo_cuda_device": self.elmo_cuda_device
        }

    def sentenceLookup(self, sentences):
        elmo_vectors = None

        # :: Elmo ::
        if self.elmo_mode is not None:
            elmo_vectors = self.getElmoEmbedding(sentences)

        # :: Word Embedding ::
        tokens_vectors = None
        if self.embeddings_path is not None:
            if self.word2Idx is None or self.embeddings is None:
                self.word2Idx, self.embeddings = self.readEmbeddings(
                    self.embeddings_path)

            tokens_vectors = []
            for sentence in sentences:
                per_token_embedding = []
                for token in sentence['tokens']:
                    vecId = self.word2Idx['UNKNOWN_TOKEN']

                    if token in self.word2Idx:
                        vecId = self.word2Idx[token]
                    elif token.lower() in self.word2Idx:
                        vecId = self.word2Idx[token.lower()]
                    per_token_embedding.append(self.embeddings[vecId])
                per_token_embedding = np.asarray(per_token_embedding)
                tokens_vectors.append(per_token_embedding)

        out_vectors = {}
        if tokens_vectors is not None:
            out_vectors['tokens'] = tokens_vectors

        if elmo_vectors is not None:
            out_vectors['elmo'] = elmo_vectors

        return out_vectors

    def batchLookup(self, sentences, feature_name):
        if feature_name == 'tokens':
            if self.word2Idx is None or self.embeddings is None:
                self.word2Idx, self.embeddings = self.readEmbeddings(
                    self.embeddings_path)

            tokens_vectors = []
            for sentence in sentences:
                per_token_embedding = []
                for token in sentence['tokens']:
                    vecId = self.word2Idx['UNKNOWN_TOKEN']

                    if token in self.word2Idx:
                        vecId = self.word2Idx[token]
                    elif token.lower() in self.word2Idx:
                        vecId = self.word2Idx[token.lower()]
                    per_token_embedding.append(self.embeddings[vecId])
                per_token_embedding = np.asarray(per_token_embedding)
                tokens_vectors.append(per_token_embedding)

            return np.asarray(tokens_vectors)
        elif feature_name == 'elmo':
            return np.asarray(self.getElmoEmbedding(sentences))
        else:
            print("Unknown feature name was passed to batchLookup")
            assert False

    def applyElmoMode(self, elmo_vectors):
        if self.elmo_mode == 'average':
            return np.average(elmo_vectors, axis=0).astype(np.float32)
        elif self.elmo_mode == 'weighted_average':
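            # keep all three ELMo layers, reordered to (num_tokens, 3, 1024),
            # presumably so a downstream component can apply its own layer weighting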
            return np.swapaxes(elmo_vectors, 0, 1)
        elif self.elmo_mode == 'last':
            return elmo_vectors[-1, :, :]
        elif isinstance(self.elmo_mode, int):
            return elmo_vectors[int(self.elmo_mode), :, :]
        else:
            print("Unknown ELMo mode: %s" % str(self.elmo_mode))
            assert False

    def getElmoEmbedding(self, sentences):
        if len(self.lazyCacheFiles) > 0:
            self._loadLazyCache()

        elmo_embeddings = []
        non_cached_sentences = []
        non_cached_sentences_indices = []

        # :: Lookup cached sentences ::
        for sentence in sentences:
            tokens = sentence['tokens']
            cache_key = tuple(tokens)
            if len(self.cache) > 0 and cache_key in self.cache:
                elmo_embeddings.append(
                    self.applyElmoMode(self.cache[cache_key]))
            else:
                non_cached_sentences.append(tokens)
                non_cached_sentences_indices.append(len(elmo_embeddings))
                elmo_embeddings.append(None)

        # :: Compute ELMo on the fly ::
        if len(non_cached_sentences) > 0:
            if self.elmo is None:
                self.loadELMo()

            idx = 0
            for elmo_vectors in self.elmo.embed_sentences(
                    non_cached_sentences):
                assert elmo_embeddings[non_cached_sentences_indices[idx]] is None
                elmo_embeddings[non_cached_sentences_indices[idx]] = \
                    self.applyElmoMode(elmo_vectors)

                if self.cache_computed_elmo_embeddings:
                    tokens = non_cached_sentences[idx]
                    cache_key = tuple(tokens)
                    self.cache[cache_key] = elmo_vectors

                idx += 1

        return elmo_embeddings

    def getIdentifier(self):
        """Returns a unique identifier for this lookup function"""
        return "ELMoWordEmbeddings_" + self.embedding_name + "_" + str(
            self.elmo_mode)

    def loadELMo(self):
        self.elmo = ElmoEmbedder(self.elmo_options_file, self.elmo_weight_file,
                                 self.elmo_cuda_device)

    def loadCache(self, inputPath):
        self.lazyCacheFiles.append(inputPath)

    def storeCache(self, outputPath):
        with open(outputPath, 'wb') as f:
            pkl.dump(self.cache, f, -1)

    def addToCache(self, sentences):
        if self.elmo is None:
            self.loadELMo()

        idx = 0
        for elmoEmbedding in self.elmo.embed_sentences(sentences):
            cache_key = tuple(sentences[idx])
            self.cache[cache_key] = elmoEmbedding

            idx += 1

    def _loadLazyCache(self):
        while len(self.lazyCacheFiles) > 0:
            inputPath = self.lazyCacheFiles.pop()

            if not os.path.isfile(inputPath):
                print("ELMo cache file not found:", inputPath)
                continue

            with open(inputPath, 'rb') as f:
                loaded_cache = pkl.load(f)

            if len(self.cache) == 0:
                self.cache = loaded_cache
            else:
                self.cache.update(loaded_cache)

    def readEmbeddings(self, embeddingsPath):
        filename = os.path.basename(embeddingsPath)
        if not os.path.isfile(embeddingsPath):
            if filename in [
                    'komninos_english_embeddings.gz',
                    'levy_english_dependency_embeddings.gz',
                    'reimers_german_embeddings.gz'
            ]:
                self.getEmbeddings(filename, embeddingsPath)
            else:
                print("The embeddings file %s was not found" % embeddingsPath)
                exit()

        # :: Read in word embeddings ::
        logging.info("Read file: %s" % embeddingsPath)
        word2Idx = {}
        embeddings = []
        if embeddingsPath.endswith('.gz'):
            embeddingsIn = gzip.open(embeddingsPath, "rt")
        else:
            embeddingsIn = open(embeddingsPath, encoding="utf8")
        embeddingsDimension = None

        for line in embeddingsIn:
            split = line.rstrip().split(" ")
            word = split[0]

            if embeddingsDimension is None:
                embeddingsDimension = len(split) - 1

            # Ensure that all lines in the embeddings file have the same dimensionality
            if (len(split) - 1) != embeddingsDimension:
                print("ERROR: A line in the embeddings file had more or fewer dimensions than expected. Skipping token.")
                continue

            if len(word2Idx) == 0:  # Add padding+unknown
                word2Idx["PADDING_TOKEN"] = len(word2Idx)
                vector = np.zeros(embeddingsDimension)
                embeddings.append(vector)

                word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
                rndState = np.random.RandomState(
                    seed=12345
                )  # Fixed rnd seed for unknown token, so that it is always the same
                vector = rndState.uniform(
                    -0.25, 0.25, embeddingsDimension
                )  # Alternativ -sqrt(3/dim) ... sqrt(3/dim)

                embeddings.append(vector)

            vector = np.array([float(num) for num in split[1:]])

            embeddings.append(vector)
            word2Idx[word] = len(word2Idx)

        return word2Idx, embeddings

    def getEmbeddings(self, filename, savePath):
        if not os.path.isfile(savePath):
            self.download(
                "https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/"
                + filename, savePath)

    def download(self, url, savePath, silent=False):
        filename = os.path.basename(
            urlparse.urlparse(url).path) or 'downloaded.file'

        def get_size():
            meta = urllib2.urlopen(url).info()
            meta_func = meta.getheaders if hasattr(
                meta, 'getheaders') else meta.get_all
            meta_length = meta_func('Content-Length')
            try:
                return int(meta_length[0])
            except:
                return 0

        def kb_to_mb(kb):
            return kb / 1024.0 / 1024.0

        def callback(blocks, block_size, total_size):
            current = blocks * block_size
            percent = 100.0 * current / total_size
            line = '[{0}{1}]'.format('=' * int(percent / 2),
                                     ' ' * (50 - int(percent / 2)))
            status = '\r{0:3.0f}%{1} {2:3.1f}/{3:3.1f} MB'
            sys.stdout.write(
                status.format(percent, line, kb_to_mb(current),
                              kb_to_mb(total_size)))

        logging.info('Downloading: {0} ({1:3.1f} MB)'.format(
            url, kb_to_mb(get_size())))
        try:
            (savePath, headers) = urlretrieve(url, savePath,
                                              None if silent else callback)
        except:
            os.remove(savePath)
            raise Exception("Can't download {0}".format(savePath))
        else:
            print()
            logging.info('Downloaded to: {0}'.format(savePath))

        return savePath
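
# A minimal usage sketch for the class above (not part of the original file).
# It assumes the default ELMo option/weight files (DEFAULT_OPTIONS_FILE /
# DEFAULT_WEIGHT_FILE) are reachable, and skips the static word-embedding
# lookup by passing embeddings_path=None.
if __name__ == '__main__':
    lookup = ELMoWordEmbeddings(embeddings_path=None,
                                elmo_mode='average',
                                elmo_cuda_device=-1)
    demo_sentences = [{'tokens': ['Berlin', 'is', 'a', 'city']},
                      {'tokens': ['ELMo', 'embeddings', 'are', 'contextual']}]
    demo_vectors = lookup.sentenceLookup(demo_sentences)
    # demo_vectors['elmo'] is a list with one (num_tokens, 1024) array per sentence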
Esempio n. 25
left_to_batch = 1
batch = []

while True:
    s = input(">>> ")
    if s.startswith("!!! to batch = "):
        left_to_batch = int(s.replace("!!! to batch = ", ""))
    elif s.startswith("!!! emb_type = "):
        emb_type = s.replace("!!! emb_type = ", "")
    elif s == "EXIT":
        exit()
    else:
        batch.append(s.split())
        left_to_batch -= 1
        if (left_to_batch == 0):
            ress = embedder.embed_sentences(batch)

            for sent, res in zip(batch, ress):
                print("words %d" % len(sent))
                for word_position in range(len(sent)):
                    if emb_type == "forward-top":
                        vec = res[layer, word_position, :half_dimension]
                    elif emb_type == "backward-top":
                        vec = res[layer, word_position, half_dimension:]
                    elif emb_type == "concat-top":
                        vec = res[layer, word_position, :]
                    elif emb_type == "average-top":
                        fwd_vec = res[layer, word_position, :half_dimension]
                        bck_vec = res[layer, word_position, half_dimension:]
                        vec = (fwd_vec + bck_vec) / 2
                    elif emb_type == "local":
Esempio n. 26
class CNN_Text(nn.Module):

    def __init__(self, config_dict, text_field, embedding_file, eval_measures):
        super(CNN_Text, self).__init__()
        self.config_dict = config_dict

        V = config_dict['embed_num']
        D = config_dict['embedding']['emb_size']
        C = config_dict['class_num']
        Ci = 1
        Co = config_dict['class_model']['cnn_max_pooling_parmas']['kernel_num']
        Ks = config_dict['kernel_sizes']
        H = config_dict['class_model']['cnn_max_pooling_parmas']['last_mlp_dim']

        # Handling embedding component - either we use Elmo model / glove like pre-trained model / none of these
        # glove/w2vec option
        if eval(config_dict['embedding']['use_pretrained']) and config_dict['embedding']['model_type'] != 'elmo':
            self.embed = nn.Embedding(V, D)
            pre_trained_embedding = build_embedding_matrix(embedding_file, text_field,
                                                           emb_size=config_dict['embedding']['emb_size'])
            self.embed.weight.data.copy_(torch.from_numpy(pre_trained_embedding))
        # elmo option
        elif eval(config_dict['embedding']['use_pretrained']) and config_dict['embedding']['model_type'] == 'elmo':
            options_file = config_dict['embedding']['elmo_options_file']
            weight_file = config_dict['embedding']['elmo_weight_file']
            self.embed = ElmoEmbedder(options_file, weight_file)
            #self.embed.training = False
            #for p in self.embed.parameters():
            #    p.requires_grad = False

        # none of these (just random constant values)
        else:
            self.embed = nn.Embedding(V, D)
        # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(config_dict['class_model']['nn_params']['dropout'])
        # in case the model should also use meta-features data
        # if eval(config_dict['meta_data_usage']['use_meta']):
        self.fc1 = torch.nn.Sequential(
            torch.nn.Linear(len(Ks) * Co + config_dict['meta_features_dim'], H),
            torch.nn.Linear(H, int(H/2)),
            #torch.nn.ReLU(),
            torch.nn.Linear(int(H/2), C),
        )
        self.eval_measures = eval_measures
        self.eval_results = defaultdict(list)
        self.text_field = text_field

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x, explanatory_meta_features):
        # if any sentence is shorter than the largest kernel, it cannot be convolved, so raise an error
        min_sent_length = max([ker.kernel_size[0] for ker in self.convs1])
        if any(len(cur_sent) < min_sent_length for cur_sent in x):
            raise IOError("A sentence is shorter than the largest kernel size given! Please fix")

        if eval(self.config_dict['embedding']['use_pretrained']) and self.config_dict['embedding']['model_type'] == 'elmo':
            self.embed.training = False
            x_as_text = [[self.text_field.vocab.itos[cur_idx] for cur_idx in cur_sent] for cur_sent in x]
            embeddings = list(self.embed.embed_sentences(x_as_text))
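            # e[2] is the top ELMo layer for each sentence; note that stacking with
            # torch.Tensor assumes all sentences in the batch have the same length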
            x_embedded = torch.Tensor([e[2] for e in embeddings])
            '''
            Old way calling Elmo - was found as VERY slow, so moved to using the ElmoEmbedder class
            start_time = datetime.datetime.now()
            x_as_text = [[self.text_field.vocab.itos[cur_idx] for cur_idx in cur_sent if cur_idx != 1] for cur_sent in x]
            duration = (datetime.datetime.now() - start_time).seconds
            print("x_as_text loading time: {} sec".format(duration))
            start_time = datetime.datetime.now()
            character_ids = batch_to_ids(x_as_text)
            embeddings = self.embed(character_ids)
            x_embedded = embeddings['elmo_representations'][0]
            duration = (datetime.datetime.now() - start_time).seconds
            print("Elmo model loading time: {} sec".format(duration))
            '''
        else:
            #start_time = datetime.datetime.now()
            x_embedded = self.embed(x)  # (N, W, D)
            #duration = (datetime.datetime.now() - start_time).seconds
            #print("embed model loading time: {} sec".format(duration))
        x_embedded_unsqueezed = x_embedded.unsqueeze(1)  # (N, Ci, W, D)
        # now converting x to list of 3 Tensors (one for each convolution)
        x_convultioned = [F.relu(conv(x_embedded_unsqueezed)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        x_max_pooled = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x_convultioned]  # [(N, Co), ...]*len(Ks)
        # concatenate all kernel-sizes (by default there are 3 such ones)
        x_concat = torch.cat(x_max_pooled, 1)
        # calculating the average per column over all instances
        x_concat_avg = torch.mean(input=x_concat, dim=0)
        # adding the explanatory meta features - sorting them by name and then concatenating them to the NN output
        if explanatory_meta_features is not None:
            meta_features_sorted = [value for (key, value) in sorted(explanatory_meta_features[0].items())]
            # if we run on the GPU, move the meta features to the GPU as well
            if self.config_dict['cuda']:
                meta_features_sorted = torch.FloatTensor(meta_features_sorted).cuda()
            x_concat_avg_with_meta = torch.cat([x_concat_avg, torch.tensor(meta_features_sorted)])
            x_dropped_out = self.dropout(x_concat_avg_with_meta)  # (N, len(Ks)*Co)
        else:
            x_dropped_out = self.dropout(x_concat_avg)  # (N, len(Ks)*Co)
        # x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
        # x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
        # x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
        # x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co)
        # the dropout above does not change the shape; it only randomly zeroes some of the activations
        logit = self.fc1(x_dropped_out)  # (N, C)
        gc.collect()
        return logit

    def calc_eval_measures(self, y_true, y_pred, nomalize_y=True):
        """
        calculation of the evaluation measures for a given prediciton vector and the y_true vector
        :param y_true: list of ints
            list containing the true values of y. Any value > 0 is considered as 1 (drawing),
            all others are 0 (not drawing)
        :param y_pred: list of floats
            list containing prediction values for each sr. It represnts the probability of the sr to be a drawing one
        :param nomalize_y: boolean. default: True
            whether or not to normalize the y_true and the predictions
        :return: dict
            dictionary with all the evalution measures calculated
        """
        if nomalize_y:
            y_true = [1 if y > 0 else 0 for y in y_true]
            binary_y_pred = [1 if p > 0.5 else 0 for p in y_pred]
        else:
            binary_y_pred = [1 if p > 0.5 else -1 for p in y_pred]
        for name, func in self.eval_measures.items():
            self.eval_results[name].append(func(y_true, binary_y_pred))
        return self.eval_results
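
# Illustrative sketch (not from the original project) of the eval_measures dict
# that CNN_Text expects: a mapping from a metric name to a callable taking
# (y_true, binary_y_pred). The sklearn metrics chosen here are assumptions.
from sklearn.metrics import accuracy_score, precision_score, recall_score

example_eval_measures = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
}
# cnn_model = CNN_Text(config_dict, text_field, embedding_file, example_eval_measures)
# cnn_model.calc_eval_measures(y_true=[0, 2, 0, 1], y_pred=[0.1, 0.9, 0.4, 0.7])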
Esempio n. 27
class ElmoEncoder(object):
    def __init__(self):
        self.elmo = ElmoEmbedder()

    # return: numpy array
    def encode_batch(self, sents):
        vec_seq = self.elmo.embed_sentences(sents)
        vecs = []
        for vec in vec_seq:
            vecs.append(self.collapse_vec(vec))
        # vecs = torch.stack(vecs)
        vecs = np.stack(vecs)
        return vecs

    def collapse_vec(self,
                     vec_seq,
                     time_combine_method="max",
                     layer_combine_method="add"):
        if time_combine_method == "max":
            vec = vec_seq.max(axis=1)
        elif time_combine_method == "mean":
            vec = vec_seq.mean(axis=1)
        elif time_combine_method == "concat":
            vec = np.concatenate(vec_seq, axis=1)
        elif time_combine_method == "last":
            vec = vec_seq[:, -1]
        else:
            raise NotImplementedError

        if layer_combine_method == "add":
            vec = vec.sum(axis=0)
        elif layer_combine_method == "mean":
            vec = vec.mean(axis=0)
        elif layer_combine_method == "concat":
            vec = np.concatenate(vec, axis=0)
        elif layer_combine_method == "last":
            vec = vec[-1]
        else:
            raise NotImplementedError

        return vec

    def encode(self,
               sents,
               time_combine_method="max",
               layer_combine_method="add"):
        """ Encode sents one at a time and return a dict keyed by the joined sentence """
        vecs = {}
        for sent in sents:
            vec_seq = self.elmo.embed_sentence(sent)
            vecs[' '.join(sent)] = self.collapse_vec(vec_seq,
                                                     time_combine_method,
                                                     layer_combine_method)
        return vecs
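
# Minimal usage sketch (an illustration, not part of the original class); requires
# the default ELMo weights. Sentences are pre-tokenized lists of strings, matching
# embed_sentences / embed_sentence.
if __name__ == '__main__':
    encoder = ElmoEncoder()
    demo_sents = [['a', 'small', 'test'], ['another', 'short', 'sentence']]
    batch_vecs = encoder.encode_batch(demo_sents)   # (2, 1024) with the default max/add combination
    sent_vecs = encoder.encode(demo_sents)          # {'a small test': 1024-d vector, ...}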
Esempio n. 28
def get_elmo_emb(data_name, op, wg):
    elmo = ElmoEmbedder(options_file=op, weight_file=wg, cuda_device=0)

    # data = pd.read_csv("input/gap-validation.tsv", sep = '\t')

    data = pd.read_csv(f'input/{data_name}.tsv', sep='\t')

    index = data.index
    columns = ['emb_A', 'emb_B', 'emb_P', 'label']
    emb = pd.DataFrame(index=index, columns=columns)
    emb.index.name = 'ID'

    tk = word_tokenizer.WordTokenizer()
    tokens = tk.batch_tokenize(data.Text)
    idx = []

    for i in range(len(tokens)):
        idx.append([x.idx for x in tokens[i]])
        tokens[i] = [x.text for x in tokens[i]]

    vectors = elmo.embed_sentences(tokens)

    ans = []
    for i, vector in enumerate(vectors):
        P_l = data.iloc[i].Pronoun
        A_l = data.iloc[i].A.split()
        B_l = data.iloc[i].B.split()

        P_offset = data.iloc[i]['Pronoun-offset']
        A_offset = data.iloc[i]['A-offset']
        B_offset = data.iloc[i]['B-offset']

        if P_offset not in idx[i]:
            P_offset = get_nearest(idx[i], P_offset)
        if A_offset not in idx[i]:
            A_offset = get_nearest(idx[i], A_offset)
        if B_offset not in idx[i]:
            B_offset = get_nearest(idx[i], B_offset)

        emb_P = np.mean(vector[1:3, idx[i].index(P_offset), :],
                        axis=0,
                        keepdims=True)

        emb_A = np.mean(vector[1:3,
                               idx[i].index(A_offset):idx[i].index(A_offset) +
                               len(A_l), :],
                        axis=(1, 0),
                        keepdims=True)
        emb_A = np.squeeze(emb_A, axis=0)

        emb_B = np.mean(vector[1:3,
                               idx[i].index(B_offset):idx[i].index(B_offset) +
                               len(B_l), :],
                        axis=(1, 0),
                        keepdims=True)
        emb_B = np.squeeze(emb_B, axis=0)

        emb_A = emb_A.reshape((1024, ))
        emb_B = emb_B.reshape((1024, ))
        emb_P = emb_P.reshape((1024, ))

        label = 'Neither'
        if data.loc[i, 'A-coref']:
            label = 'A'
        if data.loc[i, 'B-coref']:
            label = 'B'

        emb.iloc[i] = [emb_A, emb_B, emb_P, label]
    return emb
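
# Hedged usage sketch; the option/weight file names below are assumptions about the
# standard 5.5B ELMo release, and 'gap-test' must exist as input/gap-test.tsv.
# test_emb = get_elmo_emb('gap-test',
#                         op='elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json',
#                         wg='elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')
# test_emb.to_pickle('gap-test-elmo.pkl')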
Esempio n. 29
class SeqVec:
    def __init__(self, model_dir, cuda_device=-1, tokens_per_batch=16000):
        """
        Wrapper for efficient embedding of protein sequences with SeqVec (Heinzinger et al., 2019)
        :param model_dir: Directory storing SeqVec files (weights.hdf5 and options.json)
        :param cuda_device: Index of the CUDA device to use when encoding (-1 if CPU)
        :param tokens_per_batch: Number of tokens (amino acids) per encoded sequence batch - depends on available RAM
        """
        weights = os.path.join(model_dir, 'weights.hdf5')
        options = os.path.join(model_dir, 'options.json')
        self.seqvec = ElmoEmbedder(options, weights, cuda_device=cuda_device)
        self.tokens_per_batch = tokens_per_batch

    def encode(self,
               data,
               to_file=True,
               out_path=None,
               sum_axis=True,
               cut_out=False):
        """
        Encodes sequences stored in 'data' DataFrame
        :param data: pandas DataFrame storing sequences ('sequence' column) and optionally 'beg' and 'end' indices
        to cut out the embeddings
        :param to_file: If True, save the embeddings to 'out_path' for further use
        :param out_path: Directory to store embeddings if to_file is True. Filenames match the indexes of the 'data'.
        :param sum_axis: Specifies whether the first (layer) axis of the embedding will be summed up.
        This results in an Nx1024 embedding for a protein sequence of length N.
        :param cut_out: Optionally cut the embedding with the 'beg' and 'end' indices. Useful when calculating the
        embedding for the whole sequence and cutting out only a part of it. If True, data must contain 'beg' and 'end' columns.
        :return results: if 'to_file' is false returns dictionary with data indexes as keys and embedding as values.
        """
        # Validate input DataFrame
        if not isinstance(data, pd.DataFrame):
            raise TypeError('Data must be a pandas DataFrame!')
        if 'sequence' not in data.columns:
            raise ValueError('DataFrame must contain sequence column!')
        if cut_out and ('beg' not in data.columns or 'end' not in data.columns):
            raise ValueError(
                'DataFrame must contain beg and end columns if cut_out is True!'
            )
        if to_file and not os.path.isdir(out_path):
            raise OSError('Output directory does not exist!')

        # Process input DataFrame
        tmp_df = data.copy()
        tmp_df['seq_len'] = tmp_df['sequence'].apply(
            len)  # Calculate length of each sequence in DataFrame
        tmp_df = tmp_df.sort_values(by='seq_len')  # Sort sequences by length
        tmp_df['cum_seq_len'] = tmp_df['seq_len'].cumsum(
        )  # Calculate cumulative sequence lengths to split into batches
        tmp_df['batch'] = tmp_df['cum_seq_len'] // self.tokens_per_batch
        # Encode sequences in batches to speed up the process. Each batch contains at most 'tokens_per_batch' amino acids.
        results = {}
        for batch in tmp_df['batch'].unique():
            df = tmp_df[tmp_df['batch'] == batch]
            sequences = df['sequence'].tolist()
            if cut_out:
                beg_indices = df['beg'].tolist()
                end_indices = df['end'].tolist()
            embs = self.seqvec.embed_sentences(sequences)
            # Sum first axis if specified
            if sum_axis:
                embs = [emb.sum(axis=0) for emb in embs]
            # Cut out sequence chunks if specified
            if cut_out:
                embs = [
                    emb[beg:end]
                    for emb, beg, end in zip(embs, beg_indices, end_indices)
                ]
            # Save results
            for emb, _id in zip(embs, df.index.values):
                if to_file:
                    np.save('{}/{}.npy'.format(out_path, _id), emb)
                else:
                    results[_id] = emb
        if not to_file:
            return results
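
# Hedged usage sketch (not from the original code); 'seqvec_model' is an assumed
# directory containing the SeqVec weights.hdf5 and options.json files.
if __name__ == '__main__':
    demo_df = pd.DataFrame({'sequence': ['MKTAYIAKQR', 'GAVLILLV']},
                           index=['prot1', 'prot2'])
    seqvec = SeqVec(model_dir='seqvec_model', cuda_device=-1, tokens_per_batch=4000)
    demo_embeddings = seqvec.encode(demo_df, to_file=False)
    # demo_embeddings maps each index to a (sequence_length, 1024) numpy array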
Esempio n. 30
"""

from allennlp.commands.elmo import ElmoEmbedder
import numpy as np

#define max token length
max_tokens = 60

#input sentences
sentences = [
    "how are you doing", "what is your name", "can you subscribe to my channel"
]

#create a pretrained elmo model (requires internet connection)
elmo = ElmoEmbedder(cuda_device=0)
embeddings = []

#loop through the input sentences
for index, elmo_embedding in enumerate(elmo.embed_sentences(sentences)):
    print("elmo:", index)
    # Average the 3 layers returned from Elmo
    avg_elmo_embedding = np.average(elmo_embedding, axis=0)
    padding_length = max_tokens - avg_elmo_embedding.shape[0]
    if (padding_length > 0):
        avg_elmo_embedding = np.append(avg_elmo_embedding,
                                       np.zeros((padding_length,
                                                 avg_elmo_embedding.shape[1])),
                                       axis=0)
    else:
        avg_elmo_embedding = avg_elmo_embedding[:max_tokens]
    embeddings.append(avg_elmo_embedding)
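
#optionally stack the padded per-sentence embeddings into one batch array
#(a follow-up step added here for illustration, not in the original snippet)
batch_embeddings = np.stack(embeddings)
print(batch_embeddings.shape)   # (len(sentences), max_tokens, 1024)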
Esempio n. 31
class BowElmoEmbedder(nn.Module, ClassNursery):
    def __init__(
        self,
        emb_dim: int = 1024,
        dropout_value: float = 0.0,
        layer_aggregation: str = "sum",
        cuda_device_id: int = -1,
    ):
        """ Bag of words Elmo Embedder which aggregates elmo embedding for every token

        Parameters
        ----------
        emb_dim : int
            Embedding dimension
        dropout_value : float
            Any input dropout to be applied to the embeddings
        layer_aggregation : str
            You can choose one of ``[sum, average, last, first]``,
            which decides how to aggregate the different layers of ELMo. ELMo produces three
            layers of representations

            sum
                Representations from different layers are summed
            average
                Representations from different layers are averaged
            last
                Representations from the last layer are considered
            first
                Representations from the first layer are considered

        cuda_device_id : int
            Cuda device id on which representations will be transferred
            -1 indicates cpu
        """
        super(BowElmoEmbedder, self).__init__()
        self.emb_dim = emb_dim
        self.dropout_value = dropout_value
        self.layer_aggregation_type = layer_aggregation
        self.allowed_layer_aggregation_types = [
            "sum", "average", "last", "first"
        ]
        self.cuda_device_id = cuda_device_id
        self.device = (torch.device("cpu") if cuda_device_id < 0 else
                       torch.device(f"cuda:{cuda_device_id}"))
        self.msg_printer = wasabi.Printer()

        assert (
            self.layer_aggregation_type in self.allowed_layer_aggregation_types
        ), self.msg_printer.fail(
            f"For bag of words elmo encoder, the allowable aggregation "
            f"types are {self.allowed_layer_aggregation_types}. You passed {self.layer_aggregation_type}"
        )

        # load the elmo embedders
        with self.msg_printer.loading("Creating Elmo object"):
            self.elmo = ElmoEmbedder(cuda_device=self.cuda_device_id)
        self.msg_printer.good("Finished Loading Elmo object")

    def forward(self, iter_dict: Dict[str, Any]) -> torch.Tensor:
        """

        Parameters
        ----------
        iter_dict : Dict[str, Any]
            ``iter_dict`` from any dataset. Expects ``instance`` to be present in the
            ``iter_dict``, where ``instance`` is a list of sentences whose tokens are
            separated by spaces

        Returns
        -------
        torch.Tensor
            Returns the representation for every token in the instance
            ``[batch_size, max_len, emb_dim]``. In case of Elmo the ``emb_dim`` is 1024


        """
        # [np.array] - A generator of embeddings
        # each array in the list is of the shape (3, #words_in_sentence, 1024)
        x = iter_dict["instance"]
        x = x if isinstance(x, list) else [x]
        x = [instance.split() for instance in x]

        embedded = list(self.elmo.embed_sentences(x))

        # bs, 3, #words_in_sentence, 1024
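        # note: building a single FloatTensor assumes every sentence in the batch
        # has the same number of words; ragged batches would need padding first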
        embedded = torch.FloatTensor(embedded)

        embedding_ = None
        # aggregate of word embeddings
        if self.layer_aggregation_type == "sum":
            # bs, #words_in_sentence, 1024
            embedding_ = torch.sum(embedded, dim=1)

        elif self.layer_aggregation_type == "average":
            # mean across all layers
            embedding_ = torch.mean(embedded, dim=1)

        elif self.layer_aggregation_type == "last":
            # bs, max_len, 1024
            embedding_ = embedded[:, -1, :, :]

        elif self.layer_aggregation_type == "first":
            # bs, max_len, 1024
            embedding_ = embedded[:, 0, :, :]

        embedding_ = embedding_.to(self.device)

        return embedding_
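
# Hedged usage sketch based on the forward() docstring above (not part of the
# original class); it requires downloading the default ELMo weights. The two
# instances share a token count because the class stacks the raw ELMo outputs
# without padding.
if __name__ == "__main__":
    bow_elmo = BowElmoEmbedder(layer_aggregation="average", cuda_device_id=-1)
    demo_iter_dict = {"instance": ["the cat sat down", "a very small test"]}
    token_embeddings = bow_elmo(demo_iter_dict)
    print(token_embeddings.shape)   # torch.Size([2, 4, 1024])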
Esempio n. 32
class BowElmoEmbedder(nn.Module, BaseEmbedder, ClassNursery):
    def __init__(
        self,
        datasets_manager: DatasetsManager = None,
        layer_aggregation: str = "sum",
        device: Union[str, torch.device] = torch.device("cpu"),
        word_tokens_namespace="tokens",
    ):
        """ Bag of words Elmo Embedder which aggregates elmo embedding for every token

        Parameters
        ----------
        layer_aggregation : str
            You can choose one of ``[sum, average, last, first]``,
            which decides how to aggregate the different layers of ELMo. ELMo produces three
            layers of representations

            sum
                Representations from different layers are summed
            average
                Representations from different layers are averaged
            last
                Representations from the last layer are considered
            first
                Representations from the first layer are considered

        device : Union[str, torch.device]
            device for running the model on

        word_tokens_namespace : str
            Namespace where all the word tokens are stored
        """
        super(BowElmoEmbedder, self).__init__()
        self.dataset_manager = datasets_manager
        self.embedding_dimension = self.get_embedding_dimension()
        self.embedder_name = "elmo"
        self.word_tokens_namespace = word_tokens_namespace
        self.layer_aggregation_type = layer_aggregation
        self.allowed_layer_aggregation_types = [
            "sum", "average", "last", "first"
        ]
        self.device = torch.device(device)

        if self.device.type == "cuda" and self.device.index is not None:
            self.cuda_device_id = self.device.index
        else:
            self.cuda_device_id = -1
        self.msg_printer = wasabi.Printer()

        assert (
            self.layer_aggregation_type in self.allowed_layer_aggregation_types
        ), self.msg_printer.fail(
            f"For bag of words elmo encoder, the allowable aggregation "
            f"types are {self.allowed_layer_aggregation_types}. You passed {self.layer_aggregation_type}"
        )

        # load the elmo embedders
        with self.msg_printer.loading("Creating Elmo object"):
            self.elmo = ElmoEmbedder(cuda_device=self.cuda_device_id)
        self.msg_printer.good("Finished Loading Elmo object")

    def forward(self, lines: List[Line]) -> torch.Tensor:
        """

        Parameters
        ----------
        lines : List[Line]
            Just a list of lines

        Returns
        -------
        torch.Tensor
            Returns the representation for every token in the instance
            ``[batch_size, max_num_words, emb_dim]``. In case of Elmo the ``emb_dim`` is 1024


        """
        # [np.array] - A generator of embeddings
        # each array in the list is of the shape (3, #words_in_sentence, 1024)

        batch_tokens = []
        token_lengths = []
        for line in lines:
            line_tokens = line.tokens[self.word_tokens_namespace]
            line_tokens = [tok.text for tok in line_tokens]
            batch_tokens.append(line_tokens)
            token_lengths.append(len(line_tokens))

        max_len = max(token_lengths)
        embedded = list(self.elmo.embed_sentences(batch_tokens))

        batch_embeddings = []

        for idx, (line, embedding) in enumerate(zip(lines, embedded)):
            tokens = line.tokens[self.word_tokens_namespace]
            line_embeddings = []
            padding_length = max_len - len(tokens)
            embedding = torch.FloatTensor(embedding)
            embedding = embedding.to(self.device)

            # 3, #words_in_sentence, 1024

            # aggregate of word embeddings
            if self.layer_aggregation_type == "sum":
                # words_in_sentence, 1024
                embedding = torch.sum(embedding, dim=0)

            elif self.layer_aggregation_type == "average":
                # mean across all layers
                embedding = torch.mean(embedding, dim=0)

            elif self.layer_aggregation_type == "last":
                # words_in_sentence, 1024
                embedding = embedding[-1, :, :]

            elif self.layer_aggregation_type == "first":
                # words_in_sentence, 1024
                embedding = embedding[0, :, :]
            else:
                raise ValueError(
                    "Layer aggregation can be one of sum, average, last and first"
                )

            for token, token_emb in zip(tokens, embedding):
                token.set_embedding(self.embedder_name, token_emb)
                line_embeddings.append(token_emb)

            # for batching
            for i in range(padding_length):
                zeros = torch.zeros(self.embedding_dimension,
                                    device=self.device)
                line_embeddings.append(zeros)

            line_embeddings = torch.stack(line_embeddings)
            batch_embeddings.append(line_embeddings)

        batch_embeddings = torch.stack(batch_embeddings)
        return batch_embeddings

    def get_embedding_dimension(self) -> int:
        return 1024
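
# Illustration of the four layer_aggregation options on a dummy ELMo output of shape
# (3 layers, num_words, 1024); this mirrors the branches in forward() above without
# needing the DatasetsManager / Line machinery (the dummy tensor is an assumption).
if __name__ == "__main__":
    import torch

    dummy = torch.randn(3, 5, 1024)
    summed = torch.sum(dummy, dim=0)     # (5, 1024)
    averaged = torch.mean(dummy, dim=0)  # (5, 1024)
    last = dummy[-1, :, :]               # (5, 1024)
    first = dummy[0, :, :]               # (5, 1024)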