Example #1
import nltk
nltk.download('punkt')
import torch
from models import InferSent

# Initialize
infersent = InferSent({
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1
})
infersent.load_state_dict(
    torch.load(
        '/Users/petermyers/Desktop/Other/data/InferSent/encoder/infersent1.pkl'
    ))
infersent.set_w2v_path(
    '/Users/petermyers/Desktop/Other/data/GloVe/glove.840B.300d.txt')

# My sentences
sentences = ["Hi I'm Peter", "Hi I'm Danny", "Hi I'm Ryan"]
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True)
print(embeddings)
infersent.visualize(sentences[0], tokenize=True)
Example #2
# In[11]:


import numpy as np
from random import randint


def cosine(u, v):
    # Cosine similarity between two vectors
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))


# In[12]:

cosine(
    model.encode(['the cat eats.'])[0],
    model.encode(['the cat drinks.'])[0])

# In[13]:

idx = randint(0, len(sentences) - 1)  # pick a random sentence index
_, _ = model.visualize(sentences[idx])

# In[14]:

my_sent = 'The cat is drinking milk.'
_, _ = model.visualize(my_sent)

# In[15]:

model.build_vocab_k_words(500000)  # load the 500K most frequent words into the vocabulary
my_sent = 'barack-obama is the former president of the United-States.'
_, _ = model.visualize(my_sent)

# In[ ]:
# Open the output files once, before the loop, so handles are not leaked
fo = open("./output/inference_outputs", "a+")
fa = open("./output/output_nos", "a+")
fb = open("./output/sim_scores", "a+")

for idx in range(len(sentences)):
    # Most similar target sentence for this input sentence
    top = max_similar(idx, embeddings, tembeddings)

    fo.write(esentences[top[0][0]] + '\n')   # best-matching sentence
    fa.write("%d" % (top[0][0] + 1) + '\n')  # its 1-based index
    fb.write("%f" % top[0][1] + '\n')        # its similarity score

fo.close()
fa.close()
fb.close()

print(sentences[max_similar(idx, embeddings)],
      cosine(embeddings[idx], embeddings[max_similar(idx, embeddings)]))

for i in range(9):
    _, _ = model.visualize(sentences[i], i)
Example #4
        refs.append(line[:-1])

hyps = []
with open(args.generated, 'r') as f:
    for line in f:
        hyps.append(line[:-1])

# build vocab
infersent.build_vocab(refs+hyps, tokenize=True)

# get embeddings
refs_embeds = infersent.encode(refs, tokenize=True)
hyps_embeds = infersent.encode(hyps, tokenize=True)

# compute cosine similarity
refs_norm = np.linalg.norm(refs_embeds, ord=2, axis=1)
hyps_norm = np.linalg.norm(hyps_embeds, ord=2, axis=1)

cosine = np.sum(refs_embeds * hyps_embeds, axis=1) / (refs_norm * hyps_norm)

if args.output_file is not None:
    with open(args.output_file, 'a') as f:
        print(json.dumps({'embedding_cosin': float(np.mean(cosine))}), file=f)
else:
    print('%s,%f' % (sys.argv[1].split('/')[-2], np.mean(cosine)))

'''
# visualize importance 
infersent.visualize('A man plays an instrument.', tokenize=True)
'''
Example #5
# In[20]:


infersent.build_vocab(train_doc, tokenize=True)


# In[21]:


embeddings = infersent.encode(train_doc, tokenize=True)


# In[22]:


infersent.visualize('A man plays an instrument.', tokenize=True)


# In[31]:


embeddings.shape
# encode() returned a numpy array of n vectors of dimension 4096, so this shows (n, 4096).


# Now we have every sentence in the form of a vector. We can try to match each one to some of our topics, or we can apply clustering to see how the sentences are distributed.
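# As a quick illustration of the clustering idea just mentioned (not part of the original notebook), the cell below is a minimal sketch that groups the 4096-dimensional InferSent vectors with scikit-learn's KMeans; n_clusters=5 is an arbitrary assumption, and train_doc is assumed to be the list of sentence strings encoded above.

# In[ ]:


from sklearn.cluster import KMeans

# Group the sentence embeddings into a handful of clusters
kmeans = KMeans(n_clusters=5, random_state=0)
labels = kmeans.fit_predict(embeddings)

# Show which cluster each sentence was assigned to
for sent, label in zip(train_doc, labels):
    print(label, sent[:60])
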

# # k-NN clustering applied

# In[34]: