def inferSent():
    """Encode the module-level ``dataset`` with InferSent v2 and return the embeddings.

    The function loads the encoder weights from ``encoder/infersent2.pkl``,
    registers ``fastText/crawl-300d-2M.vec`` as the word-vector file, builds
    the vocabulary from ``dataset``, encodes it, and visualizes one randomly
    chosen entry.

    ``dataset`` is not a parameter: it is read from the enclosing module scope.

    Returns:
        Whatever ``InferSent.encode`` returns for ``dataset``.
    """
    import random

    import nltk  # noqa: F401 -- unused here; kept because the original imported it
    # nltk.download('punkt')
    from InferSent.models import InferSent
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    MODEL_PATH = 'encoder/infersent2.pkl'
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0,
        'version': 2,
    }
    infersent = InferSent(params_model).to(device)
    # map_location keeps the load working when the checkpoint was written on
    # a device that is not available on this host.
    infersent.load_state_dict(torch.load(MODEL_PATH, map_location=device))

    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    print('set w2v')

    infersent.build_vocab(dataset, tokenize=True)
    embeddings = infersent.encode(dataset, bsize=64, tokenize=True)

    # The original drew the index with ``randint(0, len(dataset))``. If that
    # name is ``random.randint`` the upper bound is inclusive and the lookup
    # below can overrun; ``randrange`` always yields a valid index.
    idx = random.randrange(len(dataset))
    infersent.visualize(dataset[idx])
    print('done')
    return embeddings
def compute_intent_vectors(self, sentences):
    """Return one averaged InferSent embedding per intent.

    ``sentences`` is handed to ``self.get_utterances_dict``; the mapping it
    returns is iterated as ``intent -> list of utterances``. Every utterance
    list is encoded with InferSent v1 and the embeddings are averaged along
    axis 0.

    Args:
        sentences: Input forwarded unchanged to ``self.get_utterances_dict``.

    Returns:
        dict mapping each intent to the mean of its utterance embeddings.
    """
    # TODO IMPLEMENT CACHING!
    from InferSent.models import InferSent

    infersent_folder = Path('./InferSent')
    infersent_path = infersent_folder / 'encoder' / 'infersent1.pkl'
    MODEL_PARAMETERS = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 1,
    }
    W2V_PATH = infersent_folder / 'GloVe' / 'glove.840B.300d.txt'

    model = InferSent(MODEL_PARAMETERS)
    # Load onto the CPU first so the checkpoint is readable on hosts without
    # CUDA; the model is moved to the GPU afterwards when one is available.
    model.load_state_dict(torch.load(infersent_path, map_location='cpu'))
    if torch.cuda.is_available():
        model.cuda()
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)

    utterances_dict = self.get_utterances_dict(sentences)
    total = len(utterances_dict)
    vectors = {}
    # ``utterances`` (not ``sentences``) so the parameter is not shadowed.
    for i, (intent, utterances) in enumerate(utterances_dict.items(), start=1):
        embeddings = model.encode(utterances)
        vectors[intent] = np.mean(embeddings, axis=0)
        # Report progress only once the intent has actually been processed.
        LOGGER.info('{}/{} done'.format(i, total))
    return vectors
class _InferSent:
    """Small convenience wrapper around a pretrained InferSent encoder."""

    def __init__(self):
        """Create the encoder, load its weights and register the word vectors."""
        from InferSent.models import InferSent
        import torch

        version = 1
        weights_file = 'encoder/infersent%s.pkl' % version
        config = {
            'bsize': 256,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        }

        encoder = InferSent(config)
        encoder.load_state_dict(torch.load(weights_file))
        # NOTE(review): version 1 is paired with a fastText vector file here;
        # confirm this is the intended word-vector set for this encoder.
        encoder.set_w2v_path('fastText/crawl-300d-2M.vec')
        self.infersent = encoder

    def build_vocab(self, queries):
        """Build the encoder vocabulary from ``queries`` (tokenized)."""
        self.infersent.build_vocab(queries, tokenize=True)

    def update_vocab(self, text):
        """Extend the encoder vocabulary with the words of ``text`` (tokenized)."""
        self.infersent.update_vocab(text, tokenize=True)

    def predict(self, text):
        """Return the encoder output for ``text``; the vocabulary is not updated."""
        # self.update_vocab(text)
        encoded = self.infersent.encode(text, tokenize=True)
        return encoded
class UniversalSentenceEncoder:
    """Sentence-similarity scorer backed by a pretrained InferSent encoder."""

    def __init__(self):
        """Load InferSent v1, move it to the GPU when one exists, build the vocabulary."""
        super().__init__()
        model_version = 1
        MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
        W2V_PATH = (
            'InferSent/GloVe/glove.840B.300d.txt'
            if model_version == 1
            else 'fastText/crawl-300d-2M.vec'
        )
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version,
        }
        self.model = InferSent(params_model)
        # Load onto the CPU so the checkpoint is readable without CUDA.
        self.model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
        self.model.eval()
        # The original forced ``use_cuda = True`` and therefore failed on
        # CPU-only hosts; use the GPU only when it is actually present.
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def semantic_sim(self, sents1, sents2):
        """Score each pair ``(sents1[i], sents2[i])`` by angular similarity.

        Both inputs are encoded with ``tokenize=False``, the embeddings are
        L2-normalized row by row, and the score of a pair is
        ``1 - arccos(cosine)`` with the cosine clamped to ``[-1, 1]``.

        Args:
            sents1: First batch of sentences.
            sents2: Second batch of sentences, paired with ``sents1`` by position.

        Returns:
            numpy array with one score per pair.
        """
        embed1 = torch.tensor(self.model.encode(sents1, tokenize=False))
        embed2 = torch.tensor(self.model.encode(sents2, tokenize=False))

        # ``normalize`` divides by max(norm, eps), so an all-zero embedding
        # stays zero instead of becoming NaN as with a plain division.
        sts_encode1 = torch.nn.functional.normalize(embed1, p=2, dim=1)
        sts_encode2 = torch.nn.functional.normalize(embed2, p=2, dim=1)

        cosine_similarities = torch.sum(sts_encode1 * sts_encode2, dim=1)
        # Clamp away rounding overshoot so acos stays within its domain.
        clip_cosine_similarities = torch.clamp(cosine_similarities, -1.0, 1.0)
        scores = 1.0 - torch.acos(clip_cosine_similarities)
        return scores.cpu().numpy()
def generate_embeddings(df):
    """Map every context sentence and every question in ``df`` to its embedding.

    The unique values of ``df["context"]`` are joined with spaces and split
    into sentences by TextBlob. Each sentence, and then each value of
    ``df["question"]``, is encoded on its own, so every stored value is the
    encoder output for a one-element list.

    Args:
        df: Frame providing the ``"context"`` and ``"question"`` columns.

    Returns:
        dict keyed by sentence/question text. A text that occurs more than
        once keeps the value from its last encoding.
    """
    contexts = list(df["context"].drop_duplicates())
    print("Paragraph count:", len(contexts))

    joined_text = " ".join(contexts)
    sentences = [sentence.raw for sentence in TextBlob(joined_text).sentences]

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    })
    encoder.load_state_dict(torch.load(infersent_pretrained_path))
    # NOTE(review): the encoder is configured as version 2 while the vector
    # file comes from a variable named ``glove_path`` -- confirm the pairing.
    encoder.set_w2v_path(glove_path)

    print("Building Infersent vocabulary")
    encoder.build_vocab(sentences, tokenize=True)

    dict_embeddings = {}

    print("Building sentence embeddings")
    print("Sentence count:", len(sentences))
    for sentence in sentences:
        dict_embeddings[sentence] = encoder.encode([sentence], tokenize=True)

    print("Building question embeddings")
    questions = df["question"].tolist()
    print("Questions count:", len(questions))
    for question in questions:
        dict_embeddings[question] = encoder.encode([question], tokenize=True)

    return dict_embeddings
def Start_chatbot():
    """Load the InferSent encoder and the question/answer data for the chatbot.

    Questions and answers are read line by line from ``../data/questions.txt``
    and ``../data/answers.txt`` and paired by position.

    Returns:
        A tuple ``(model, answers_by_question, embedding_by_question)``: the
        encoder, a dict from question to answer, and a dict from question to
        the first row of the encoder output for that single question.
    """
    version = 1
    checkpoint = "../InferSent/encoder/infersent%s.pkl" % version
    settings = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': version,
    }

    model = InferSent(settings)
    model.load_state_dict(torch.load(checkpoint))

    use_cuda = False
    if use_cuda:
        model = model.cuda()

    if version == 1:
        vectors_file = '../data/glove.6B.300d.txt'
    else:
        vectors_file = '../dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(vectors_file)
    model.build_vocab_k_words(K=570000)

    with open('../data/questions.txt') as handle:
        questions = [line.strip() for line in handle]
    with open('../data/answers.txt') as handle:
        answers = [line.strip() for line in handle]

    answer_for = {}
    embeddings = {}
    # Indexing ``answers`` by position keeps the original failure mode: an
    # IndexError when there are fewer answers than questions.
    for position, question in enumerate(questions):
        answer_for[question] = answers[position]
        embeddings[question] = model.encode([question])[0]

    return model, answer_for, embeddings
'pool_type': 'max', 'dpout_model': 0.0, 'version': V } infersent = InferSent(params_model) infersent.load_state_dict(torch.load(MODEL_PATH)) W2V_PATH = 'fastText/crawl-300d-2M.vec' infersent.set_w2v_path(W2V_PATH) sentences = [ "I am an engineer now.", "You can be an engineer.", "Building stuff is very fun.", "Stuff breaks often too though." ] infersent.build_vocab(sentences, tokenize=True) embeddings = infersent.encode(sentences, tokenize=True) infersent.visualize('A man plays an instrument.', tokenize=True) encoded_sentences = embeddings # greedy decoder def greedy_decoder(data): # index for largest probability each row return [np.argmax(s) for s in data] # decode sequence result = greedy_decoder(encoded_sentences) print(result)
def process(channel):
    """Classify the sentences of every not-yet-visited file of ``channel``.

    For each ``*.p`` pickle under ``../files/CableNews/<channel>/`` that is
    not listed in ``<channel>_visit.p``, the function selects sentences from
    each entry's text, embeds them with InferSent, runs the classifier on the
    embeddings, sums the class probabilities per entry key and stores the two
    highest-scoring topics (plus gender/persons/locations metadata) in
    ``processed_data/<channel>/<counter>.p``.

    Args:
        channel: Channel name used to build the input, visit-list and output
            paths.

    Fixes over the original:
        * Per-key aggregation: the original wrote the probabilities summed
          for the *previous* key under the *current* key, wrote zeros for the
          first key and never emitted the last key's totals. Each key now
          receives the sum of its own sentence predictions.
        * Pickle files are opened with ``with`` so handles are closed and
          written data is flushed deterministically.
    """
    # Load the Classifier
    tf.reset_default_graph()
    NN = classifer()
    NN.load('nn-classifier-v2')

    # Load the sentence embedder
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
    W2V_PATH = (
        'InferSent/GloVe/glove.840B.300d.txt'
        if model_version == 1
        else 'InferSent/fastText/crawl-300d-2M.vec'
    )
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    visit_path = '%s_visit.p' % (channel)
    all_files = glob.glob('../files/CableNews/%s/*.p' % channel)
    # NOTE: pickle.load can execute arbitrary code; these files must come
    # from a trusted source.
    with open(visit_path, 'rb') as handle:
        read_files = pickle.load(handle)
    counter = len(read_files)

    for file in tqdm(all_files):
        if file in read_files:
            continue
        read_files.append(file)
        # Persist the visit list only occasionally (about 30% of the files).
        if np.random.rand() < 0.3:
            with open(visit_path, 'wb') as handle:
                pickle.dump(read_files, handle)

        with open(file, 'rb') as handle:
            res = pickle.load(handle)

        prev_text = ""
        all_text = []
        all_keys = []
        for key in res.keys():
            meta_data = res[key][0]  # First in the list
            if len(meta_data['text']) < 10:
                continue

            # Make sure we drop the duplicates: consecutive entries whose
            # first 10 characters match are treated as the same text.
            current_text = meta_data['text'][:10]
            if current_text == prev_text:
                continue
            prev_text = current_text

            text = tokenizer.tokenize(meta_data['text'])
            if len(text) <= 2:
                continue

            # Drop the first sentence
            text = text[1:]

            # Keep only sentences with 31 to 49 whitespace-separated tokens.
            senteces = [s for s in text if 30 < len(s.split()) < 50]
            if not senteces:
                continue

            all_text.extend(senteces)
            all_keys.extend([key] * len(senteces))

        if not all_text:
            continue

        # Calculate the embedding
        all_embed = model.encode(all_text, bsize=128, tokenize=True, verbose=False)
        all_predictions = NN.predict(all_embed)[0]

        # Sum the probabilities of all sentences that belong to the same key.
        summed_probs = {}
        for row, key in enumerate(all_keys):
            if key not in summed_probs:
                summed_probs[key] = np.zeros(13)
            summed_probs[key] += all_predictions[row, :]

        # Take the top 2 topics of every key, highest first.
        results = {}
        for key, total_prob in summed_probs.items():
            Topics = Ticks[np.flip(np.argsort(total_prob)[-2:])]
            Probs = np.flip(np.sort(total_prob)[-2:]) * 100
            results[key] = {
                'Topics': list(Topics),
                'Probs': list(Probs),
                'gender': res[key][0]['gender'],
                'persons': res[key][0]['persons'],
                'locations': res[key][0]['locations'],
            }

        out_path = 'processed_data/%s/%d.p' % (channel, counter)
        with open(out_path, 'wb') as handle:
            pickle.dump(results, handle)
        counter += 1
'dpout_model': 0.0, 'version': 1 } infer_sent_model = InferSent(params_model) infer_sent_model.load_state_dict(torch.load(model_pkl)) # In[111]: infer_sent_model.set_w2v_path(glove_w2v_loc) infer_sent_model.build_vocab_k_words(K=100000) # infer_sent_model.to(torch.device("cuda:0")) # In[112]: infer_sent_model.encode(["This man is playing computer games"], tokenize=True) # In[113]: def get_embedding_for_context(ctx): if not isinstance(ctx, list): # print("ctx is not list") ctx = [ctx] return infer_sent_model.encode(ctx, tokenize=True) # In[114]: from sklearn.metrics.pairwise import cosine_similarity
# Embed the sentences of every time-point file of one patient and pickle the
# result under the mirrored path below ``res``.
notes = os.listdir('{}/{}'.format(dis, patient))
for note in notes:
    source_dir = '{}/{}/{}'.format(dis, patient, note)
    target_dir = '{}/{}/{}'.format(res, patient, note)

    tps = os.listdir(source_dir)
    # Start from an empty output directory for this note.
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)

    for tp in tps:
        source_file = '{}/{}/{}/{}'.format(dis, patient, note, tp)
        print(source_file)
        with open(source_file, 'r') as f:
            sents = f.read()
        t_sents = sent_tokenize(sents)
        # print(len(t_sent))
        # time.sleep(0.4)
        val_sent = []
        # The last three characters of the input name are replaced by 'pkl'.
        with open('{}/{}/{}/{}pkl'.format(res, patient, note, tp[:-3]), 'wb') as f:
            for sent in t_sents:
                # print(sent)
                length = len(sent.split())
                # Only sentences with at least 10 tokens are embedded.
                if length >= 10:
                    val_sent.append(sent)
            # print(val_sent)
            # NOTE(review): when nothing qualifies, the already-opened output
            # file is left empty on disk -- confirm readers tolerate that.
            if not val_sent:
                continue
            embedding = model.encode(val_sent, bsize=128, tokenize=False, verbose=True)
            pickle.dump(embedding, f)
from InferSent.models import InferSent

# Load InferSent v1 from files located next to this script.
model_version = 1
dirname = os.path.dirname(__file__)
MODEL_PATH = os.path.join(
    dirname, "InferSent/encoder/infersent%s.pkl" % model_version)
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': model_version,
}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = os.path.join(dirname, 'InferSent/embs/glove.840B.300d.txt')
model.set_w2v_path(W2V_PATH)
model.build_vocab_k_words(K=100000)

# For every input file write "<name>.emb.tsv" into ``output_dir`` with one row
# per stripped input line: the sentence, then its embedding as
# space-separated values.
for file_name in files:
    with open(file_name) as istr:
        data = [line.strip() for line in istr]
    output_file = output_dir / file_name.with_suffix('.emb.tsv').name
    embeddings = model.encode(data, tokenize=True)
    # newline="" lets the csv module control line endings itself, as its
    # documentation requires; without it some platforms emit extra '\r'.
    with open(output_file, "w", newline="") as ostr:
        writer = csv.writer(ostr, delimiter="\t")
        writer.writerows(
            (sent, " ".join(map(str, emb.tolist())))
            for sent, emb in zip(data, embeddings)
        )
# Collect the unique contexts and the sentence list, then build the encoder
# vocabulary from the sentences.
paras = list(train_df["contexts"].drop_duplicates().reset_index(drop=True))
blob = TextBlob(" ".join(paras))
sentences = get_all_sentences(train_df['sentences'])
infersent.build_vocab(sentences, tokenize=True)

# # Build Embeddings

# In[15]:

# Sentence Embeddings: each text is encoded on its own and the first row of
# the encoder output is stored under that text.
dict_embeddings = {}
for sentence in sentences:
    dict_embeddings[sentence] = infersent.encode([sentence], tokenize=True)[0]

# Question Embeddings: stored in the same dict, so a question that equals a
# sentence overwrites that sentence's entry.
questions = list(train_df["questions"])
for question in questions:
    dict_embeddings[question] = infersent.encode([question], tokenize=True)[0]

# # Save Embeddings

# In[156]:

# Todo
# This will help to save the computation time
'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': V } infersent = InferSent(params_model) infersent.load_state_dict(torch.load(MODEL_PATH)) W2V_PATH = 'fastText/crawl-300d-2M.vec' infersent.set_w2v_path(W2V_PATH) # infersent.build_vocab(sentences, tokenize=True) infersent.build_vocab_k_words(K=100000) embeddings = infersent.encode(sentences, bsize=128, tokenize=False, verbose=True) print('nb sentences encoded : {0}'.format(len(embeddings))) #### End Paste parsedQs = [] with open(questions, "r+") as f: for q in f.readlines(): parsedQs.append(preprocessQs(q)) # print(parsedQs)s qEmbeddings = infersent.encode(parsedQs, bsize=128, tokenize=False, verbose=True) print('nb sentences encoded : {0}'.format(len(embeddings)))