Example #1
def distance(euphemism_example, index, neutral, synonym):
    from allennlp.commands.elmo import ElmoEmbedder
    import scipy.spatial.distance

    euphemism_example = euphemism_example.split()

    # Build the two variant sentences on copies so they do not overwrite each other
    # (assigning the list directly would only create aliases of the same object).
    neutral_example = euphemism_example.copy()
    neutral_example[index] = neutral

    synonym_example = euphemism_example.copy()
    synonym_example[index] = synonym

    elmo = ElmoEmbedder('options.json', 'model.hdf5')

    neutral_vectors = elmo.embed_sentence(neutral_example)

    assert (len(neutral_vectors) == 3)  # one for each layer in the ELMo output
    assert (len(neutral_vectors[0]) == len(neutral_example)
            )  # the vector elements correspond with the input tokens

    synonym_vectors = elmo.embed_sentence(synonym_example)
    neutral_synonym_distance = scipy.spatial.distance.cosine(
        neutral_vectors[2][index], synonym_vectors[2][index])

    euphemism_vectors = elmo.embed_sentence(euphemism_example)
    euphemism_synonym_distance = scipy.spatial.distance.cosine(
        euphemism_vectors[2][index], synonym_vectors[2][index])

    return neutral_synonym_distance, euphemism_synonym_distance
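A minimal call sketch for the function above; the sentence, index, and replacement words are hypothetical, and the 'options.json' / 'model.hdf5' files are assumed to exist locally:

# Euphemism "passed" at token index 1, compared against a neutral word and a direct synonym.
neutral_dist, euphemism_dist = distance("he passed yesterday", 1, "left", "died")
print(neutral_dist, euphemism_dist)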
Example #2
def get_elmo_score(synonyms, tok_sentence, index):
    from allennlp.commands.elmo import ElmoEmbedder
    import numpy
    import scipy.spatial.distance

    elmo = ElmoEmbedder()

    vectors = elmo.embed_sentence(tok_sentence)
    original_vector = vectors[2][index]

    distances = []

    for synonym in synonyms:
        new_sentence = tok_sentence.copy()
        del new_sentence[index]
        for i, word in enumerate(synonym):
            new_sentence.insert((index + i), word)

        new_vectors = elmo.embed_sentence(new_sentence)
        if len(synonym) == 1:
            new_vector = new_vectors[2][index]
        else:
            phrase_vectors = []
            for i, word in enumerate(synonym):
                phrase_vectors.append(numpy.array(new_vectors[2][(index + i)]))
            new_vector = numpy.mean(phrase_vectors, axis=0)

        distances.append(
            scipy.spatial.distance.cosine(original_vector, new_vector))

    return distances
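A minimal call sketch for get_elmo_score above; the tokenized sentence, index, and candidate synonyms are hypothetical. Note that each synonym is itself a list of tokens, so multi-word substitutions are supported:

sentence = ["She", "purchased", "a", "new", "car", "."]
# Cosine distances between "purchased" in context and each substituted candidate.
print(get_elmo_score([["bought"], ["paid", "for"]], sentence, 1))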
Example #3
def single_sequence_processing(batch: List[Tuple[str, str]],
                               model: ElmoEmbedder,
                               model_dir: Path) -> EmbedderReturnType:
    """
    Single-sequence processing in case of a runtime error due to
    a) a very long sequence or b) too large a batch size.
    If this fails, you might want to consider lowering the batch size and/or
    cutting very long sequences into smaller chunks.

    Yields the raw (unprocessed) embeddings.
    """
    for sample_id, seq in batch:
        try:
            with torch.no_grad():
                embedding = model.embed_sentence(list(seq))
            yield sample_id, embedding

        except RuntimeError as e:
            logger.error("RuntimeError for {} with {} residues: {}".format(
                sample_id, len(seq), e))
            logger.error(
                "Single sequence processing failed. Switching to CPU now. " +
                "This slows down the embedding process.")
            model = get_elmo_model(model_dir, cpu=True)
            with torch.no_grad():
                embedding = model.embed_sentence(list(seq))
            yield sample_id, embedding
Example #4
class Vectorize:
    def __init__(self):
        with open(GAME_PATH) as f:
            games = json.load(f)
        with open(FACE_PATH) as f:
            descriptions = json.load(f)

        path_ = '/roaming/tcastrof/drew'
        with open(os.path.join(path_, 'game.txt'), 'w') as f:
            f.write('\n'.join([' '.join(snt['tokens']) for snt in games]))

        with open(os.path.join(path_, 'face.txt'), 'w') as f:
            f.write('\n'.join(
                [' '.join(snt['tokens']) for snt in descriptions]))

        self.elmo = ElmoEmbedder(cuda_device=1)

        vectors = [self.elmo.embed_sentence(snt['tokens']) for snt in games]
        path = os.path.join(path_, 'game_elmo.hdf5')
        with h5py.File(path, 'w') as hf:
            for i, vector in enumerate(vectors):
                hf.create_dataset(str(i), data=vector)

        vectors = [
            self.elmo.embed_sentence(snt['tokens']) for snt in descriptions
        ]
        path = os.path.join(path_, 'face_elmo.hdf5')
        with h5py.File(path, 'w') as hf:
            for i, vector in enumerate(vectors):
                hf.create_dataset(str(i), data=vector)
Example #5
def elmo_Model(total_samples, data_df):

    words_context1 = [[] for _ in range(total_samples)]
    words_context2 = [[] for _ in range(total_samples)]

    word1_context1 = [0] * total_samples
    word2_context1 = [0] * total_samples
    word1_context2 = [0] * total_samples
    word2_context2 = [0] * total_samples

    similarityScore_context1 = [0] * total_samples
    similarityScore_context2 = [0] * total_samples

    difference = [0] * total_samples

    # Load the ELMo model once, outside the loop, rather than re-initialising it per sample.
    elmo_embeddingModel = ElmoEmbedder()

    for i in range(total_samples):

        words_context1[i] = data_df['clean_context1'][i].split(' ')
        words_context2[i] = data_df['clean_context2'][i].split(' ')

        if data_df['clean_word1'][i] in words_context1[i]:
            word1_context1[i] = words_context1[i].index(data_df['clean_word1'][i])

        if data_df['clean_word2'][i] in words_context1[i]:
            word2_context1[i] = words_context1[i].index(data_df['clean_word2'][i])

        if data_df['clean_word1'][i] in words_context2[i]:
            word1_context2[i] = words_context2[i].index(data_df['clean_word1'][i])

        if data_df['clean_word2'][i] in words_context2[i]:
            word2_context2[i] = words_context2[i].index(data_df['clean_word2'][i])

        tokens = words_context1[i]
        vectors = elmo_embeddingModel.embed_sentence(tokens)

        assert (len(vectors) == 3)  # one for each layer in the ELMo output
        assert (len(vectors[0]) == len(tokens))

        similarityScore_context1[i] = 1 - scipy.spatial.distance.cosine(
            vectors[2][word1_context1[i]], vectors[2][word2_context1[i]])

        tokens2 = words_context2[i]
        vectors2 = elmo_embeddingModel.embed_sentence(tokens2)

        assert (len(vectors2) == 3)  # one for each layer in the ELMo output
        assert (len(vectors2[0]) == len(tokens2))

        similarityScore_context2[i] = 1 - scipy.spatial.distance.cosine(
            vectors2[2][word1_context2[i]], vectors2[2][word2_context2[i]])

        difference[i] = similarityScore_context2[i] - similarityScore_context1[i]

    return difference, similarityScore_context1, similarityScore_context2
Example #6
def use_allen():
    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_weights.hdf5"

    eltest = ElmoEmbedder(options_file, weight_file)

    test_list = ['First', 'test']
    sec_test = ['Second', 'test']

    vec = eltest.embed_sentence(test_list)
    vec2 = eltest.embed_sentence(sec_test)

    print(vec)
    print(vec2)
    # cosine() expects 1-D vectors, so compare the top-layer vectors of the first tokens
    # rather than the full (3, num_tokens, 1024) arrays.
    print(scipy.spatial.distance.cosine(vec[2][0], vec2[2][0]))
Example #7
    def test_average_embedding_works(self):
        tempdir = tempfile.mkdtemp()
        sentences_path = os.path.join(tempdir, "sentences.txt")
        output_path = os.path.join(tempdir, "output.txt")

        sentence = "Michael went to the store to buy some eggs ."
        with open(sentences_path, 'w') as f:
            f.write(sentence)

        sys.argv = ["run.py",  # executable
                    "elmo",  # command
                    sentences_path,
                    output_path,
                    "--average",
                    "--options-file",
                    self.options_file,
                    "--weight-file",
                    self.weight_file]

        main()

        assert os.path.exists(output_path)

        embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
        expected_embedding = embedder.embed_sentence(sentence.split())
        expected_embedding = (expected_embedding[0] + expected_embedding[1] + expected_embedding[2]) / 3

        with h5py.File(output_path, 'r') as h5py_file:
            assert list(h5py_file.keys()) == [sentence]
            # The vectors in the test configuration are smaller (32 length)
            embedding = h5py_file.get(sentence)
            assert embedding.shape == (len(sentence.split()), 32)
            numpy.testing.assert_allclose(embedding, expected_embedding, rtol=1e-4)
Example #8
def encode(sents, time_combine_method="max", layer_combine_method="add"):
    """ Load ELMo and encode sents """
    elmo = ElmoEmbedder()
    vecs = {}
    for sent in sents:
        vec_seq = elmo.embed_sentence(sent)
        if time_combine_method == "max":
            vec = vec_seq.max(axis=1)
        elif time_combine_method == "mean":
            vec = vec_seq.mean(axis=1)
        elif time_combine_method == "concat":
            vec = np.concatenate(vec_seq, axis=1)
        elif time_combine_method == "last":
            vec = vec_seq[:, -1]
        else:
            raise NotImplementedError

        if layer_combine_method == "add":
            vec = vec.sum(axis=0)
        elif layer_combine_method == "mean":
            vec = vec.mean(axis=0)
        elif layer_combine_method == "concat":
            vec = np.concatenate(vec, axis=0)
        elif layer_combine_method == "last":
            vec = vec[-1]
        else:
            raise NotImplementedError
        vecs[' '.join(sent)] = vec
    return vecs
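A minimal usage sketch for encode above, with hypothetical tokenized sentences; with mean/mean pooling each sentence maps to a single 1024-dim vector:

sents = [["I", "like", "tea", "."], ["ELMo", "embeddings", "are", "contextual", "."]]
vecs = encode(sents, time_combine_method="mean", layer_combine_method="mean")
print(vecs["I like tea ."].shape)  # (1024,)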
Example #9
    def test_top_embedding_works(self):
        sentence = "Michael went to the store to buy some eggs ."
        with open(self.sentences_path, 'w') as f:
            f.write(sentence)

        sys.argv = [
            "run.py",  # executable
            "elmo",  # command
            self.sentences_path,
            self.output_path,
            "--top",
            "--options-file",
            self.options_file,
            "--weight-file",
            self.weight_file
        ]

        main()

        assert os.path.exists(self.output_path)

        embedder = ElmoEmbedder(options_file=self.options_file,
                                weight_file=self.weight_file)
        expected_embedding = embedder.embed_sentence(sentence.split())[2]

        with h5py.File(self.output_path, 'r') as h5py_file:
            assert list(h5py_file.keys()) == [sentence]
            # The vectors in the test configuration are smaller (32 length)
            embedding = h5py_file.get(sentence)
            assert embedding.shape == (len(sentence.split()), 32)
            numpy.testing.assert_allclose(embedding,
                                          expected_embedding,
                                          rtol=1e-4)
Example #10
    def _buildElmoEmbedding(self, **kwargs):
        if os.path.exists(self.elmo_embedding_filename + ".pkl"):
            with open(self.elmo_embedding_filename + ".pkl", "rb") as pickle_in:
                elmo_embedding = pickle.load(pickle_in)
            self.elmo_embedding = elmo_embedding
            return elmo_embedding

        layer_num = kwargs.get('layer_num', 2)

        vocab = np.genfromtxt('data/elmo/vocabulary.txt', dtype='str')
        options_file = "data/elmo/biomed_elmo_options.json"
        weight_file = "data/elmo/biomed_elmo_weights.hdf5"

        elmo = ElmoEmbedder(options_file, weight_file)

        token_array_dict = self.data.getTokenArrayDict()

        elmo_embedding = {}
        for pmid in self.pmids:
            tokens = token_array_dict[pmid]
            # Embedding is of shape (# of layers, # of tokens, 1024)
            # We take layer `layer_num` (index 2, i.e. the top layer, by default) and then sum over all tokens
            elmo_embedding[pmid] = np.sum(
                elmo.embed_sentence(tokens)[layer_num], axis=0)

        # Saving the embedding dictionary in a pickle file
        with open(self.elmo_embedding_filename + ".pkl", "wb") as f:
            pickle.dump(elmo_embedding, f)

        self.elmo_embedding = elmo_embedding

        return elmo_embedding
Example #11
class ElmoEmbedder(GenericEmbedder):
    def __init__(self) -> None:
        super().__init__()
        self._elmo = AllenNLPElmoEmbedder()

    def embed_sentence(self,
                       text,
                       tokenized=True,
                       term_vectors=False,
                       **kwargs):
        if tokenized:
            tokens = text.split()
        else:
            tokens = self._spacy.word_tokenize(text)

        vectors = self._elmo.embed_sentence(tokens)
        return self._get_term_or_seq_vectors(vectors, term_vectors,
                                             kwargs.get("pooling", "mean"))

    def embed_collection(self,
                         iterable,
                         tokenized=True,
                         term_vectors=False,
                         **kwargs):
        collection_vectors = self._elmo.embed_sentences(
            TokenizerWrapper(self._spacy, iterable, tokenized))
        for vectors in collection_vectors:
            yield self._get_term_or_seq_vectors(vectors, term_vectors,
                                                kwargs.get("pooling", "mean"))
Example #12
class ElmoEmbedderTransformer:

    def __init__(self):
        self.elmo = ElmoEmbedder()

    def __call__(self, data):
        return self.elmo.embed_sentence(data)
Example #13
def single_sequence_processing(
    batch: List[Tuple[str, str]], model: ElmoEmbedder
) -> EmbedderReturnType:
    """
    Single sequence processing in case of runtime error due to
    a) very long sequence or b) too large batch size
    If this fails, you might want to consider lowering batchsize and/or
    cutting very long sequences into smaller chunks

    Returns unprocessed embeddings
    """
    for sample_id, seq in batch:
        try:
            embedding = model.embed_sentence(list(seq))
            yield sample_id, embedding

        except RuntimeError as e:
            logger.error(
                "RuntimeError for {} with {} residues: {}".format(
                    sample_id, len(seq), e
                )
            )
            logger.error(
                "Single sequence processing failed. Skipping this sequence. "
                + "Consider splitting the sequence into smaller parts or using the CPU."
            )
            yield sample_id, None
Example #14
    def get_elmo_representations(self):
        """
        Get ELMo representations of contexts.
        """
        elmo = ElmoEmbedder()
        sentence_vecs = []

        # for each context, for each word, get the ELMO representation
        # then average the hidden layers into a 1xD dimensional vector
        for context_idx, tokenized_context in enumerate(self.spacy_contexts):
            raw_elmo_rep = elmo.embed_sentence(tokenized_context)
            # dimension is 3 x num_tokens x 1024

            sentence_vec = compute_elmo_tokenized_rep(
                tokenized_context,
                self.word,
                raw_elmo_rep,
                self.tf_idf_weighting,
                self.tf_idf_dicts,
                context_idx,
                self.subtract_context,
            )

            if len(sentence_vec) != 0:
                sentence_vecs.append(sentence_vec)

        vec_df = pd.DataFrame(np.row_stack(sentence_vecs))

        self.num_dims = vec_df.shape[1]
        self.embedding_representation = vec_df
        self.num_contexts = vec_df.shape[0]
Example #15
def Elmo_embedding(s):
    try:
        elmo_embedding = ElmoEmbedder(options_file=options_file, weight_file=weight_file)
        words = clean_text(s)
        words = word_tokenize(words)
        words = [w for w in words if w not in stop_words]
        words = [w for w in words if w.isalpha()]
        vectors = elmo_embedding.embed_sentence(words)
    except Exception:
        # Fall back to a zero vector if cleaning, tokenization or embedding fails
        return np.zeros(1024)
    embedd = np.array(vectors)
    v = embedd.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(1024)
    return embedd


def sent2vec(embeddings_index, s ):
    words = clean_text(s)
    words = word_tokenize(words)
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(RetornarVetor(embeddings_index, w))
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

def RetornarVetor(embeddings_index, palavra):
    return embeddings_index[palavra]  # return embedding for each word
Example #16
def read_parse_write(elmo: ElmoEmbedder, infile: str, outfile: str, mode: str = "average", batch_size=0) -> None:
    """
    Read the input file and write the vectors to the output file
    :param elmo: ELMo embedder
    :param infile: input file with the sentences
    :param outfile: output vector file
    :param mode: the pooling mode used for the ELMo vectors
    :param batch_size: embed sentences one by one if < 1, otherwise in batches of this size
    :return: None
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)
    if batch_size < 1:  # Not using batch
        for sent in tqdm(all_sents, desc="Elmo Embedding"):
            elmo_vecs = elmo.embed_sentence(sent)
            vec = parse_sentence(elmo_vecs, mode=mode)
            all_vecs.append(vec)
    else:   # Batched prediction
        for elmo_vecs in tqdm(elmo.embed_sentences(all_sents, batch_size=batch_size), desc="Elmo Embedding", total=len(all_sents)):
            vec = parse_sentence(elmo_vecs, mode=mode)
            all_vecs.append(vec)

    print("Finishing embedding ELMo sequences, saving the vector files.")
    pickle.dump(all_vecs, f)
    f.close()
Example #17
def SeqEncode_seqvec(data_df, model_dir, DataDir, save=True, cuda_device=0):
	x = []
	model_dir = Path(model_dir)
	weights = model_dir / 'weights.hdf5'
	options = model_dir / 'options.json'
	seqvec  = ElmoEmbedder(options, weights, cuda_device=cuda_device)

	if save:
		for idx, row in data_df.iterrows():
			print(idx)
			wild_seq = row['wild_seq']
			mut_seq = row['mut_seq']
			wild_embedding = seqvec.embed_sentence(list(wild_seq))
			mut_embedding = seqvec.embed_sentence(list(mut_seq))
			np.save(DataDir+'humvar/humvar_disorder_x_wild_len1000_seqvec.%s.npy' % idx, wild_embedding)
			np.save(DataDir+'humvar/humvar_disorder_x_mut_len1000_seqvec.%s.npy' % idx, mut_embedding)
Example #18
class _ElmoEmbedder:
    def __init__(self, dataset):
        self.dataset = dataset
        self.static_embedding_dir = STATIC_EMBEDDING_DIR.format(
            'elmo', dataset)
        self.elmo = ElmoEmbedder()

    def _embed(self, sentence):
        return self.elmo.embed_sentence(sentence).mean(0)

    def get_embeddings(self, sentences):
        try:
            with open(self.static_embedding_dir, 'rb') as f:
                embeddings = pickle.load(f)
            print('Loaded pre-extracted ELMo embeddings.')
        except FileNotFoundError:
            print('Pre-extracted ELMo embeddings for', self.dataset,
                  'not found. Extracting now...')
            embeddings = []
            for sent in sentences:
                # store average of the 3 ELMo embeddings
                embeddings.append(self._embed(sent))

            embeddings = np.array(embeddings)
            with open(self.static_embedding_dir, 'wb') as f:
                pickle.dump(embeddings, f)
            print('Saved extracted ELMo embeddings to:',
                  self.static_embedding_dir)
        return embeddings
Example #19
def word_embedding_elmo(sentence: List[str],
                        elmo_model: ElmoEmbedder,
                        remove_stopwords=False,
                        avg_all_layers=True) -> np.ndarray:
    """
    different from sentence_embedding_elmo, this method returns all word context embedding (with avg of all layers states)

    ELMo will compute representation of words from context given a sentence based on a N nearest neighbor approach
    "use the biLM to compute representations for a given target word and take the nearest neighbor sense from the
    training set, falling back to the first sense from WordNet for lemmas not observed during training"

    :param sentence:
    :param elmo_model:
    :param remove_stopwords:
    :param avg_all_layers:
    :return: (seq_size, feature_dim)
    """
    if remove_stopwords:
        sentence = list(stop_words_filter(sentence))
        # print("sentence filtered by stopwords: ", sentence)

    sentence_vectors = elmo_model.embed_sentence(sentence)

    if not avg_all_layers:
        # get the third/top layer's output for the sentence representation (i.e.,contextual representation)
        # In the simplest case, ELMo just selects the top layer
        sentence_word_embeddings = sentence_vectors[2][:]
    else:
        #  averaging all 3 layers improves development accuracy for SNLI
        avg_all_layer_sent_embedding = np.mean(sentence_vectors,
                                               axis=0,
                                               dtype='float32')
        return avg_all_layer_sent_embedding

    return sentence_word_embeddings
Example #20
def elmo_to_text(vocabulary_file, output_path, layer='nocontext'):
    """
    :param vocabulary_file: Vocabulary file. Note that usually no vocabulary file is provided with ELMo embeddings.
    :param output_path: Output file path
    :param layer: Either 'full', which corresponds to the full ELMo output after the second biLSTM layer,
                  or 'nocontext' (the context-insensitive character-based layer)

    (Reused from original CogniVal paper)
    """
    if layer == 'full':
        layer_idx = 2
    elif layer == 'nocontext':
        layer_idx = 0
    else:
        raise ValueError('"layer" must be either "full" or "nocontext"')

    elmo = ElmoEmbedder()

    with open(vocabulary_file, 'r') as f:
        words = f.readlines()

    # Create directory
    os.makedirs(output_path.parent, exist_ok=True)

    with open(output_path, 'w') as embedding_file:
        with ProgressBar() as pb:
            for word in pb(words):
                word = word.strip()
                # embed_sentence expects a list of tokens, so wrap the single word
                vectors = elmo.embed_sentence([word])

                # embedding of the word at the selected layer
                embedding = ' '.join(map(str, vectors[layer_idx][0]))
                print(word, embedding, file=embedding_file)
Example #21
class ElmoEmbedding(Embedding):
    """
    Reference: https://allennlp.org/elmo
    """

    settings = {
        'weights':
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5',
        'options':
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json',
    }

    def __init__(self):
        from allennlp.modules.elmo import _ElmoCharacterEncoder
        if not path.isdir(self.path('elmo')):
            makedirs(self.path('elmo'))
        self.fweights = self.ensure_file(path.join('elmo', 'weights.hdf5'),
                                         url=self.settings['weights'])
        self.foptions = self.ensure_file(path.join('elmo', 'options.json'),
                                         url=self.settings['options'])
        self.embeddings = _ElmoCharacterEncoder(self.foptions, self.fweights)
        self.sentence_embedder = ElmoEmbedder()

    def emb(self, word, default=None):
        # If `word` is a list, return contextualized vectors
        if isinstance(word, list):
            return self.sentence_embedder.embed_sentence(word)
        # Otherwise, return a single, static vector
        else:
            idx = batch_to_ids([[word]])
            emb = self.embeddings(idx)['token_embedding']
            return emb[0, 1].tolist()
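A minimal usage sketch for the class above; note that instantiating it downloads the 5.5B ELMo options/weights for the character encoder (plus the default ELMo model) on first use:

emb = ElmoEmbedding()

print(len(emb.emb("bank")))                     # single, context-insensitive vector
print(emb.emb(["the", "river", "bank"]).shape)  # contextual vectors, shape (3, 3, 1024)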
Example #22
    def test_average_embedding_works(self):
        sentence = "Michael went to the store to buy some eggs ."
        with open(self.sentences_path, 'w') as f:
            f.write(sentence)

        sys.argv = ["run.py",  # executable
                    "elmo",  # command
                    self.sentences_path,
                    self.output_path,
                    "--average",
                    "--options-file",
                    self.options_file,
                    "--weight-file",
                    self.weight_file]

        main()

        assert os.path.exists(self.output_path)

        embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
        expected_embedding = embedder.embed_sentence(sentence.split())
        expected_embedding = (expected_embedding[0] + expected_embedding[1] + expected_embedding[2]) / 3

        with h5py.File(self.output_path, 'r') as h5py_file:
            assert set(h5py_file.keys()) == {"0", "sentence_to_index"}
            # The vectors in the test configuration are smaller (32 length)
            embedding = h5py_file.get("0")
            assert embedding.shape == (len(sentence.split()), 32)
            numpy.testing.assert_allclose(embedding, expected_embedding, rtol=1e-4)
            assert json.loads(h5py_file.get("sentence_to_index")[0]) == {sentence: "0"}
Example #23
class LSTM(nn.Module):

    def __init__(self, n_labels, hidden_size, embedding_path, dropout=0.2, label_ignore_idx=0,
                batch_size=32, head_init_range=0.04, device='cuda:1',
                vocab_size=320, input_size=300, num_layers=2, embed_size=1024):
        super().__init__()
        self.input_size = input_size
        self.batch_size = batch_size
        self.n_labels = n_labels + 1
        self.label_ignore_idx = label_ignore_idx
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed_size = embed_size
        self.init_linear = nn.Linear(self.embed_size, self.embed_size)
        self.lstm = nn.LSTM(self.embed_size, self.hidden_size, self.num_layers, batch_first=True, bidirectional=True)
        self.classification_head = nn.Linear(self.num_layers*self.hidden_size, self.n_labels)
        self.classification_head.weight.data.normal_(mean=0.0, std=head_init_range)
        #self.crf = CRF(self.n_labels)


        options_file = embedding_path + "/options.json"
        weight_file = embedding_path + "/weights.hdf5"
        self.elmo = ElmoEmbedder(options_file, weight_file, 0)


    def init_hidden(self):
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_size),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_size))


    def forward(self, inputs_ids, labels, labels_mask, valid_mask):
        linear_input = self.init_linear(inputs_ids)
        lstm_out, self.hidden = self.lstm(linear_input)
        logits = self.classification_head(lstm_out)
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=self.label_ignore_idx)
            # Only keep active parts of the loss
            if labels_mask is not None:
                active_loss = valid_mask.view(-1) == 1
                active_logits = logits.view(-1, self.n_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(
                    logits.view(-1, self.n_labels), labels.view(-1))
            return loss
        else:
            return logits


    def encode_word(self, sentence):
        """
        takes a tokenized sentence and returns its top-layer ELMo embeddings,
        one 1024-dim vector per token
        """
        embeddings = self.elmo.embed_sentence(sentence)[2]
        return embeddings
Example #24
def context_insensitive_character_embeddings(weights_path,
                                             options_path,
                                             word2idx,
                                             cuda=False,
                                             cache_dir=None):
    """
    Embeddings are always saved in sorted order (by vocab) and loaded according to word2idx.
    """
    validate_word2idx(word2idx)

    vocab = list(sorted(word2idx.keys()))
    sorted_word2idx = {k: i for i, k in enumerate(vocab)}
    order = [
        sorted_word2idx[w]
        for w, i in sorted(word2idx.items(), key=lambda x: x[1])
    ]

    if cache_dir is not None:
        key = hash_vocab(vocab)
        cache_path = os.path.join(cache_dir, 'elmo_{}.npy'.format(key))

        if os.path.exists(cache_path):
            print('Loading cached elmo vectors: {}'.format(cache_path))
            return load_elmo_cache(cache_path, order)

    if cuda:
        device = 0
    else:
        device = -1

    batch_size = 256
    nbatches = len(vocab) // batch_size + 1

    # TODO: Does not support padding.
    elmo = ElmoEmbedder(options_file=options_path,
                        weight_file=weights_path,
                        cuda_device=device)
    vec_lst = []
    for i in tqdm(range(nbatches), desc='elmo'):
        start = i * batch_size
        batch = vocab[start:start + batch_size]
        if len(batch) == 0:
            continue
        vec = elmo.embed_sentence(batch)
        vec_lst.append(vec)

    vectors = np.concatenate([x[0] for x in vec_lst], axis=0)

    # Index via the sorted vocabulary: `vectors` is in sorted order here and is only
    # re-ordered to word2idx order on return.
    vectors[sorted_word2idx['_PAD']] = 0
    vectors[sorted_word2idx['[SEP]']] = np.random.randn(vectors.shape[1])
    vectors[sorted_word2idx['[CLS]']] = np.random.randn(vectors.shape[1])

    if cache_dir is not None:
        print('Saving cached elmo vectors: {}'.format(cache_path))
        save_elmo_cache(cache_path, vectors)

    return vectors[order]
Example #25
        def get_elmo_embedding(self):
            '''Creates ELMo word embeddings for the given tokenized sentence
            param: uses self.tokenized_sent (list of tokens)
            returns: ndarray of shape (3, num_tokens, 1024) with the ELMo
              embeddings of the tokens of the sentence'''

            elmo = ElmoEmbedder()
            elmo_embedding = elmo.embed_sentence(self.tokenized_sent)

            return elmo_embedding
Example #26
class ElmoVectorizer(Vectorizer):
    def __init__(self, options_path: str, weights_path: str, device: int = 0):
        self.model = ElmoEmbedder(options_path,
                                  weights_path,
                                  cuda_device=device)

    def embed(self, context: List[str]) -> torch.Tensor:
        vectors = self.model.embed_sentence(context)
        vectors = np.average(vectors, axis=0)
        return torch.from_numpy(vectors)
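A minimal usage sketch for ElmoVectorizer above; the options/weights paths are placeholders, and cuda_device=-1 selects the CPU:

vectorizer = ElmoVectorizer("options.json", "weights.hdf5", device=-1)

# Averaging the three ELMo layers yields one 1024-dim vector per token.
vectors = vectorizer.embed(["The", "bank", "was", "closed", "."])
print(vectors.shape)  # torch.Size([5, 1024])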
Example #27
def SeqEncode_seqvec(train_df, model_dir):
    x = []
    model_dir = Path(model_dir)
    weights = model_dir / 'weights.hdf5'
    options = model_dir / 'options.json'
    seqvec = ElmoEmbedder(options, weights, cuda_device=0)
    wild_seqs = []
    mut_seqs = []

    for idx, row in train_df.iterrows():
        print(idx)
        wild_seq = row['wild_seq']
        mut_seq = row['mut_seq']
        wild_embedding = seqvec.embed_sentence(list(wild_seq))
        mut_embedding = seqvec.embed_sentence(list(mut_seq))
        x.append([wild_embedding, mut_embedding])

    y = train_df['label']

    return x, y
Example #28
class ELMo(Vectorizer):
    def __init__(self):
        self.elmo = ElmoEmbedder()

    def vectorize(self, sentence: str) -> numpy.ndarray:
        """
        Return a tensor representation of the sentence of size (3 layers, num tokens, 1024 dim).
        """
        # tokenizer's tokens must be converted to string tokens first
        tokens = list(map(str, spacy_tokenizer(sentence)))
        embeddings = self.elmo.embed_sentence(tokens)
        return embeddings
Example #29
class Elmo:
    def __init__(self):
        self.elmo = ElmoEmbedder()

    def get_elmo_vector(self, tokens, layer):
        vectors = self.elmo.embed_sentence(tokens)
        X = []
        for vector in vectors[layer]:
            X.append(vector)

        X = np.array(X)

        return X
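A minimal usage sketch for the Elmo helper above; the tokens are hypothetical and layer=2 selects the top ELMo layer:

elmo = Elmo()
X = elmo.get_elmo_vector(["The", "bank", "was", "closed", "."], layer=2)
print(X.shape)  # (5, 1024)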
Example #30
def embed_with_fallback(
    batch: List[Tuple[str, str]],
    model: ElmoEmbedder,
    model_dir: Path,
) -> EmbedderReturnType:
    """ Tries to get the embeddings in this order:
      * Full batch GPU
      * Single Sequence GPU
      * Single Sequence CPU

    Single sequence processing is done in case of runtime error due to
    a) very long sequence or b) too large batch size
    If this fails, you might want to consider lowering batchsize and/or
    cutting very long sequences into smaller chunks

    Returns unprocessed embeddings
    """
    global _cpu_elmo_model

    # create List[List[str]] for batch-processing of ELMo
    tokens = [list(seq) for _, seq in batch]
    batch_ids = [identifier for identifier, _ in batch]

    try:  # try to get the embedding for the current sequence
        with torch.no_grad():
            embeddings = model.embed_batch(tokens)
        assert len(batch) == len(embeddings)
        for sequence_id, embedding in zip(batch_ids, embeddings):
            yield sequence_id, embedding
    except RuntimeError as e:
        logger.error("Error processing batch of {} sequences: {}".format(
            len(batch), e))
        logger.error("Sequences in the failing batch: {}".format(batch_ids))
        logger.error("Starting single sequence processing")

        for sample_id, seq in batch:
            try:
                with torch.no_grad():
                    embedding = model.embed_sentence(list(seq))
                yield sample_id, embedding
            except RuntimeError as e:
                logger.error("RuntimeError for {} with {} residues: {}".format(
                    sample_id, len(seq), e))
                logger.error(
                    "Single sequence processing failed. Switching to CPU now. "
                    + "This slows down the embedding process.")
                if not _cpu_elmo_model:
                    _cpu_elmo_model = get_elmo_model(model_dir, cpu=True)
                with torch.no_grad():
                    embedding = _cpu_elmo_model.embed_sentence(list(seq))
                yield sample_id, embedding
Example #31
def context_insensitive_elmo(weights_path,
                             options_path,
                             word2idx,
                             cuda=False,
                             cache_dir=None):
    logger = get_logger()

    vocab = [w for w, i in sorted(word2idx.items(), key=lambda x: x[1])]

    validate_word2idx(word2idx)

    if cache_dir is not None:
        key = hash_vocab(vocab)
        cache_path = os.path.join(cache_dir, 'elmo_{}.npy'.format(key))

        if os.path.exists(cache_path):
            logger.info('Loading cached elmo vectors: {}'.format(cache_path))
            return load_elmo_cache(cache_path)

    if cuda:
        device = 0
    else:
        device = -1

    batch_size = 256
    nbatches = len(vocab) // batch_size + 1

    logger.info('Begin caching vectors. nbatches={} device={}'.format(
        nbatches, device))
    logger.info('Initialize ELMo Model.')

    # TODO: Does not support padding.
    elmo = ElmoEmbedder(options_file=options_path,
                        weight_file=weights_path,
                        cuda_device=device)
    vec_lst = []
    for i in tqdm(range(nbatches), desc='elmo'):
        start = i * batch_size
        batch = vocab[start:start + batch_size]
        if len(batch) == 0:
            continue
        vec = elmo.embed_sentence(batch)
        vec_lst.append(vec)

    vectors = np.concatenate([x[0] for x in vec_lst], axis=0)

    if cache_dir is not None:
        logger.info('Saving cached elmo vectors: {}'.format(cache_path))
        save_elmo_cache(cache_path, vectors)

    return vectors
Example #32
    def test_embed_batch_is_empty_sentence(self):
        embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
        embeddings = embedder.embed_sentence([])

        assert embeddings.shape == (3, 0, 1024)