Example No. 1
 def transform(self, X: dt.Frame):
     X.replace([None, math.inf, -math.inf], self._repl_val)
     from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
     if self.embedding_name in ["glove", "en"]:
         self.embedding = WordEmbeddings(self.embedding_name)
     elif self.embedding_name in ["bert"]:
         self.embedding = BertEmbeddings()
     self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
     output = []
     X = X.to_pandas()
     text1_arr = X.iloc[:, 0].values
     text2_arr = X.iloc[:, 1].values
     for ind, text1 in enumerate(text1_arr):
         try:
             text1 = Sentence(str(text1).lower())
             self.doc_embedding.embed(text1)
             text2 = text2_arr[ind]
             text2 = Sentence(str(text2).lower())
             self.doc_embedding.embed(text2)
             score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                       text2.get_embedding().reshape(1, -1))[0, 0]
             output.append(score)
         except:
             output.append(-99)
     return np.array(output)
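The transform above also relies on module-level imports that the snippet does not show; a minimal sketch of those assumed imports (self._repl_val and self.embedding_name are set elsewhere in the recipe):

# Assumed module-level imports for the transform() recipe above (not shown in the snippet).
import math
import numpy as np
import datatable as dt
from sklearn.metrics.pairwise import cosine_similarity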
Example No. 2
def other_embeddings(embd):
    sess = tf.InteractiveSession()
    train_data_list = []
    test_data_list = []
    val_data_list = []
    if embd == 'glove':
        print('Starting Glove Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting Fasttext Embedding...')
        fasttext_embedding = WordEmbeddings('en')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[elmo_embedding])
    else:
        # init Flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        # now create the DocumentPoolEmbeddings object that combines all embeddings
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])
    print('Train embedding Started...')
    for text in final_train['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        train_data_list.append(emb)
    print('Embedded Train data!!')
    print('Test embedding Started...')
    for text in final_test['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        test_data_list.append(emb)
    print('Embedded Test data!!')
    print('Validation embedding Started...')
    for text in final_val['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        val_data_list.append(emb)
    print('Embedded Validation data!!')
    return train_data_list, test_data_list, val_data_list
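The function above depends on imports and globals that are not part of the snippet: the TensorFlow 1.x session API (InteractiveSession, Tensor.eval), several Flair embedding classes, and the final_train/final_test/final_val DataFrames. A hedged sketch of the assumed imports, valid for older Flair releases that still ship XLNetEmbeddings and ELMoEmbeddings (the latter additionally requires allennlp):

# Assumed imports for other_embeddings(); final_train, final_test and final_val are
# assumed to be pandas DataFrames with a 'text' column, defined elsewhere.
import tensorflow as tf  # TensorFlow 1.x style session API is assumed
from flair.data import Sentence
from flair.embeddings import (WordEmbeddings, FlairEmbeddings, XLNetEmbeddings,
                              ELMoEmbeddings, DocumentPoolEmbeddings)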
Example No. 3
def answer_similarity(ans1, real):
    sent1 = Sentence(ans1)
    sent2 = Sentence(real)
    document_embeddings.embed(sent1)
    document_embeddings.embed(sent2)
    emb1 = sent1.get_embedding()
    emb2 = sent2.get_embedding()
    emb1 /= torch.sqrt((emb1**2).sum())
    emb2 /= torch.sqrt((emb2**2).sum())

    return max(0., (emb1.T @ emb2).item())
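document_embeddings is a module-level object this function assumes; a minimal usage sketch with pooled GloVe embeddings standing in for whatever the original used:

# Hypothetical setup for answer_similarity(); the original embedding choice is not shown.
import torch
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

document_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])
print(answer_similarity('The sky is blue.', 'The sky looks blue today.'))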
Example No. 4
def criterion(str1, str2, embed):
    try:
        s1 = Sentence(str1)
        s2 = Sentence(str2)
        embed.embed(s1)
        s1_emb = s1.get_embedding()
        embed.embed(s2)
        s2_emb = s2.get_embedding()

        return torch.cosine_similarity(s1_emb.unsqueeze(0), s2_emb.unsqueeze(0)).item()
    
    except:
        return 0.5
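A hedged usage sketch for criterion, passing a pooled GloVe document embedding as the embed argument (the original caller's embedder is not shown):

# Hypothetical usage; the GloVe pooling below is an assumption.
import torch
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

embed = DocumentPoolEmbeddings([WordEmbeddings('glove')])
print(criterion('the grass is green', 'the grass looks green', embed))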
Example No. 5
    def get_embeddings(self, sentence):

        # document_embeddings = DocumentPoolEmbeddings(
        #    [self.glove_embedding,  # initialize the document embeddings, mode = mean
        #     self.flair_embedding_backward,
        #     self.flair_embedding_forward])

        # Glove + BPE
        document_embeddings = DocumentPoolEmbeddings(
            [self.glove_embedding, self.bpe_embedding])

        # NILC fastText 600 embedding
        #document_embeddings = DocumentPoolEmbeddings(
        #            [self.fast_text_embedding])

        # Flair
        #document_embeddings = DocumentPoolEmbeddings(
        #    [self.flair_embedding_forward])

        # ELMo
        #document_embeddings = DocumentPoolEmbeddings(
        #    [self.elmo_embedding])

        # create an example sentence
        sentence = Sentence(sentence)

        # embed the sentence with our document embedding
        document_embeddings.embed(sentence)

        # now check out the embedded sentence.
        return sentence.get_embedding()
Example No. 6
    def chunked_embed(corpus, embeddings, chunk_size=256):
        def find_nth(n, substring, text, start):
            index = start
            for _ in range(n):
                index = text.find(substring, index + 1)
            return index

        try:
            partial_embeddings = []
            i = 0
            while i < len(corpus):
                next_i = find_nth(chunk_size, " ", corpus, i)
                if next_i < i:
                    next_i = len(corpus)
                chunk = corpus[i:next_i]
                sentence = Sentence(chunk, use_tokenizer=False)
                embeddings.embed(sentence)
                partial_embeddings.append(sentence.get_embedding().numpy())
                i = next_i
            avg = np.average(np.asarray(partial_embeddings), axis=0)
            return avg
        except RuntimeError:
            print(
                "Please ignore the warning above about the sentence being too long; "
                "the text has been re-embedded with a smaller chunk size."
            )
            return FeatureExtractor.chunked_embed(corpus, embeddings,
                                                  int(chunk_size / 2))
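A possible call site for chunked_embed, assuming it is a static method of a FeatureExtractor class (the recursive call suggests so) and using pooled GloVe embeddings as the document embedder:

# Hypothetical usage; FeatureExtractor and the GloVe choice are assumptions.
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])
long_text = ' '.join(['token'] * 1000)  # stand-in for a long document
doc_vector = FeatureExtractor.chunked_embed(long_text, embeddings, chunk_size=256)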
Example No. 7
def get_pooling_embedding(document):
    tokens = [token.text for token in nlp(document)]
    text = ' '.join(tokens)
    sentence = Sentence(text)
    document_pooling_embeddings.embed(sentence)

    return sentence.get_embedding().squeeze().tolist()
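nlp and document_pooling_embeddings are globals not shown in this snippet; a minimal sketch of a setup they could correspond to (the spaCy model name and the GloVe pooling are assumptions):

# Hypothetical globals for get_pooling_embedding().
import spacy
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

nlp = spacy.load('en_core_web_sm')
document_pooling_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])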
Example No. 8
 def get_sentence_vector(self, text):
     sentence = Sentence(clean_text(text))
     _ = self.embeddings.embed(sentence)
     a = sentence.get_embedding()
     result = a.cpu().detach().numpy()
     if np.sum(result[0:5]) == 0:
         result = np.random.randn(self.n_dims)
     return result
Example No. 9
def get_fastText_embeding(text_str):
    '''
    Returns the embedding of a text.
    :param text_str: text to embed
    :return: the embedding vector corresponding to the text
    '''
    text = Sentence(text_str)
    pool_embeddings.embed(text)
    return text.get_embedding()
Example No. 10
def glove_cosine_similarity(question, sentence_list):
    question = Sentence(question)
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    document_embeddings.embed(question)
    q_emd = question.get_embedding()
    q_emd = q_emd.unsqueeze(0)
    sentence_vectors = torch.empty((1, EMBEDDING_DIM))  # .to(device)
    for idx, sent in enumerate(sentence_list):
        sent = Sentence(sent)
        document_embeddings.embed(sent)
        sent_emd = sent.get_embedding()
        if idx == 0:
            sentence_vectors = sent_emd.unsqueeze(0)
        else:
            sentence_vectors = torch.cat(
                (sentence_vectors, sent_emd.unsqueeze(0)))

    output = cos(q_emd, sentence_vectors)
    return output
Example No. 11
def glove_eucleadian(question, sentence_list):
    question = Sentence(question)
    euc = nn.PairwiseDistance(p=2)
    document_embeddings.embed(question)
    q_emd = question.get_embedding()
    q_emd = q_emd.unsqueeze(0)
    sentence_vectors = torch.empty((1, EMBEDDING_DIM))  # .to(device)
    for idx, sent in enumerate(sentence_list):
        sent = Sentence(sent)
        document_embeddings.embed(sent)
        sent_emd = sent.get_embedding()
        if idx == 0:
            sentence_vectors = sent_emd.unsqueeze(0)
        else:
            sentence_vectors = torch.cat(
                (sentence_vectors, sent_emd.unsqueeze(0)))

    output = euc(q_emd, sentence_vectors)
    return output
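Both glove_cosine_similarity and glove_eucleadian rely on a module-level document_embeddings and EMBEDDING_DIM; a hedged sketch of that shared setup (Flair's 'glove' vectors are 100-dimensional):

# Hypothetical globals shared by the two ranking helpers above.
import torch
import torch.nn as nn
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

EMBEDDING_DIM = 100
document_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])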
Example No. 12
  def construct_vector(self, original_sentence):
    """
    Given a sentence, construct and return a vector based on different stacked embeddings.
    """
    
    sentence = Sentence(original_sentence)
    self.stacked_embedding.embed(sentence)
    sentence_embedding = sentence.get_embedding()
    sentence_embedding_array = sentence_embedding.detach().numpy()

    return sentence_embedding_array
Example No. 13
def save_json(jfile):
    global dataset

    user_id = os.path.basename(jfile).split('.')[0]
    save_path = os.path.join('..', DATA_DIR,
                             'pos_tags_{}_embeds'.format(dataset),
                             '{}.json'.format(user_id))

    if os.path.isfile(save_path):
        print("Skipping user {} file already exists".format(user_id))
        return

    document_embeddings = get_doc_embeddings()

    with open(jfile, encoding='utf-8') as f:
        user_data = json.load(f)

        if len(user_data['tokens']) > 500:
            print('User {}.json has {} posts NOT skipping'.format(
                user_id, len(user_data['tokens'])))
            # return

        posts_list = user_data['tokens']  # each post is a list of tokens
        pos_tags_list = user_data['posTags']
        posts_lowercase_list = []
        posts_embeddings_list = []
        pos_tags_list_lowercase = []

        for i, (post, pos_tags) in enumerate(zip(posts_list, pos_tags_list)):
            post_lowercase = [token.lower() for token in post]
            if any("http" in word for word in post_lowercase):
                continue
            if 0 < len(post_lowercase):
                posts_lowercase_list.append(post_lowercase)
                pos_tags_list_lowercase.append(pos_tags)
                post_sentence = Sentence(' '.join(post_lowercase))
                document_embeddings.embed(post_sentence)
                posts_embeddings_list.append(
                    post_sentence.get_embedding().tolist())
            elif len(post_lowercase) > 100:
                print('long post')
            else:
                continue

        user_data["tokens"] = posts_lowercase_list
        user_data["posTags"] = pos_tags_list_lowercase
        user_data["embeddings"] = posts_embeddings_list

        with open(save_path, 'w') as out_file:
            json.dump(user_data, out_file)
        print('Finished with file {}.json'.format(user_id))
Example No. 14
def embed_tweet(tweetList):
    # initialize the word embeddings
    tr_embedding = WordEmbeddings('tr')
    char_embedding = CharacterEmbeddings()

    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings(
        [tr_embedding, char_embedding])

    tweetTensors = []
    for tweet in tweetList:
        #print(norm_tweet(tweet))
        sentence = Sentence(norm_tweet(tweet))
        document_embeddings.embed(sentence)
        tweetTensors.append(sentence.get_embedding().data)
    return tweetTensors
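norm_tweet is not defined in the snippet; a hypothetical stand-in that lowercases the tweet and strips URLs and user mentions:

# Hypothetical normalizer assumed by embed_tweet(); the original implementation is not shown.
import re

def norm_tweet(tweet):
    tweet = re.sub(r'https?://\S+', ' ', tweet)  # drop URLs
    tweet = re.sub(r'@\w+', ' ', tweet)          # drop user mentions
    return ' '.join(tweet.lower().split())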
Example No. 15
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, OneHotEmbeddings, \
 DocumentRNNEmbeddings

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')
# embeddings = OneHotEmbeddings(corpus)

glove_embedding = WordEmbeddings('glove')

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings(
    [glove_embedding],
    # flair_embedding_backward, flair_embedding_forward],
    # pooling='min',
    fine_tune_mode='nonlinear')
document_embeddings = DocumentRNNEmbeddings([glove_embedding])

document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding],
                                                 rnn_type='LSTM')

# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.get_embedding())
Example No. 16
import numpy as np
from pandas import read_csv
import pickle

from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings, Sentence

data = read_csv('data/abcnews-date-text.csv', error_bad_lines=False)
documents = data[['headline_text']].values.reshape(-1).tolist()
# documents = list(pickle.load(open( "./corpus/df_proyectosFECYT.pkl", "rb" ) )['LEMAS_UC3M'])

glove_embedding = WordEmbeddings('glove')
document_embeddings = DocumentRNNEmbeddings([glove_embedding], hidden_size=512)
embeddings = []

count = 0

try:
    for document in documents:
        count += 1
        sentence = Sentence(document)

        document_embeddings.embed(sentence)

        embeddings.append(sentence.get_embedding().tolist())

        if (count % 1000 == 0): print(count)

finally:  # In case an error occurs before finish, we store previous results
    embeddings_array = np.array(embeddings)
    np.save("embeds_abcnews_512_2.npy", embeddings_array)
Example No. 17
db = acl('localhost', 50051)

doc = parse_epub_content(
    '/home/developer/PycharmProjects/medindex_semantic_search_prototype/phys_train.epub'
)

paragraphs = []

for section in doc['sections']:
    paragraphs.extend(section['paragraphs'])

for paragraph in paragraphs[:20]:
    for s in sent_tokenize(paragraph):
        sentence = Sentence(s)
        document_embeddings.embed(sentence)
        embs = sentence.get_embedding()
        sample = db.convertDocument(embs, {"text": s})
        db.addDocuments([sample])

query = 'Which are the key exclusion criteria for patients with subacute phase of ischaemic or haemorrhagic stroke'
sentence = Sentence(query)
document_embeddings.embed(sentence)
query_embs = sentence.get_embedding()
query_vec = db.convertMatrix(query_embs)

k = 5
result = db.getNearest(query_vec, k)

r = json.loads(result.documents.decode('utf-8'))
r
Example No. 18
 def get_vector(self, text: str) -> np.ndarray:
     sentence = Sentence(text, use_tokenizer=False)
     self.document_embeddings.embed(sentence)
     with torch.no_grad():
         vector = sentence.get_embedding()
         return vector.numpy()
Example No. 19
def get_embedding(content, embedder=get_embedder()):
  paragraph = Sentence(str(content))
  embedder.embed(paragraph)
  return paragraph.get_embedding().unsqueeze(0)
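get_embedder() is not shown and, as a default argument, is evaluated once when get_embedding is defined, so it must already exist at that point. A hypothetical stand-in returning a pooled GloVe document embedding:

# Hypothetical factory assumed by get_embedding(); the original may return a different embedder.
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

def get_embedder():
    return DocumentPoolEmbeddings([WordEmbeddings('glove')])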
Example No. 20
    sentence = Sentence(row.text)
    label = row.label
    if count_id == count:
      test_sentences.append(sentence)
      test_labels.append(label)
    else:
      train_sentences.append(sentence)
      train_labels.append(label)

  #Training Embeddings:
  train_embeddings = []
  # embedding is memory-intensive, so sentences are fed to the document embedder in batches of 250
  for n in range(0, 5000, 250):  
    sents = train_sentences[n:n+250]
    bert_train_document_embeddings.embed(sents)
    train_embeddings += [np.array(sentence.get_embedding().detach()) for sentence in sents]

  #Test Embeddings:
  test_embeddings = []
  for n in range(0, 5000, 250):  
    sents = test_sentences[n:n+250]
    bert_test_document_embeddings.embed(sents)
    test_embeddings += [np.array(sentence.get_embedding().detach()) for sentence in sents]

  clf_b = DecisionTreeClassifier(max_depth=5, random_state=0)
  clf_b.fit(train_embeddings[:5000], train_labels[:5000])

  predicted_labels = clf_b.predict(test_embeddings[0:5000])
  end_time = datetime.now()
  
  print('Bert - round', count+1, ': Accuracy =',
Example No. 21
document_embeddings = DocumentPoolEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
])

#opening english file and embedding each sentence
eng = open('<PATH>/E_sentences.txt', 'r')
#vec_eng=open('/home/dheeraj/Desktop/IIITH-Intern/Major-TH-Tool/glossary/E_vecs.txt','a')
line = eng.readline()
eng_vecs = []
while (line):
    sentence = Sentence(line)
    document_embeddings.embed(sentence)
    li = sentence.get_embedding()
    li = li.tolist()
    eng_vecs.append(li)
    line = eng.readline()
eng.close()

#opening hindi file and embedding each hindi sentence
hin = open('<PATH>/H_sentences_eng.txt', 'r')
line = hin.readline()
hin_vecs = []
while (line):
    sentence = Sentence(line)
    document_embeddings.embed(sentence)
    li = sentence.get_embedding()
    li = li.tolist()
    hin_vecs.append(li)
    line = hin.readline()
hin.close()
Example No. 22
def _get_embedding(text):
    sentence = Sentence(text)
    embedding.embed(sentence)
    vector = sentence.get_embedding()
    return vector
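embedding here is again a module-level document embedder; a minimal setup and usage sketch with pooled GloVe as an assumed stand-in:

# Hypothetical setup and usage for _get_embedding().
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

embedding = DocumentPoolEmbeddings([WordEmbeddings('glove')])
vector = _get_embedding('The grass is green.')
print(vector.shape)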