class _InferSent:
    """Small wrapper that owns one pretrained InferSent sentence encoder.

    Construction loads the version-1 weights from ``encoder/infersent1.pkl``
    and points the encoder at ``fastText/crawl-300d-2M.vec``. No vocabulary is
    built until ``build_vocab`` or ``update_vocab`` is called.

    NOTE(review): version 1 weights are combined with the fastText vector
    file here; confirm this pairing is intentional.
    """

    def __init__(self):
        # Imported inside the constructor, so the dependencies are only
        # required once an instance is actually created.
        from InferSent.models import InferSent
        import torch

        version = 1
        weights_path = 'encoder/infersent%s.pkl' % version
        vectors_path = 'fastText/crawl-300d-2M.vec'
        encoder_config = {
            'bsize': 256,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        }

        encoder = InferSent(encoder_config)
        encoder.load_state_dict(torch.load(weights_path))
        encoder.set_w2v_path(vectors_path)
        self.infersent = encoder

    def build_vocab(self, queries):
        """Build the encoder vocabulary from ``queries`` (tokenized)."""
        self.infersent.build_vocab(queries, tokenize=True)

    def update_vocab(self, text):
        """Extend the encoder vocabulary with the words of ``text`` (tokenized)."""
        self.infersent.update_vocab(text, tokenize=True)

    def predict(self, text):
        """Return whatever ``encode`` produces for ``text`` (tokenized).

        The vocabulary is not updated here; callers must have prepared it.
        """
        # self.update_vocab(text)
        encoded = self.infersent.encode(text, tokenize=True)
        return encoded
def compute_intent_vectors(self, sentences):
    """Return one averaged InferSent embedding per intent.

    ``sentences`` is passed to ``self.get_utterances_dict``; the result is
    iterated with ``.items()`` as ``intent -> utterances``. Each intent's
    utterances are encoded and averaged along ``axis=0``.

    The encoder (version 1 weights, GloVe vectors, 100000-word vocabulary)
    is rebuilt on every call.
    """
    # TODO IMPLEMENT CACHING!
    from InferSent.models import InferSent

    root = Path('./InferSent')
    weights_file = root / 'encoder' / 'infersent1.pkl'
    vectors_file = root / 'GloVe' / 'glove.840B.300d.txt'
    encoder_config = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 1,
    }

    encoder = InferSent(encoder_config)
    encoder.load_state_dict(torch.load(weights_file))
    if torch.cuda.is_available():
        encoder.cuda()
    encoder.set_w2v_path(vectors_file)
    encoder.build_vocab_k_words(K=100000)

    by_intent = self.get_utterances_dict(sentences)
    intent_count = len(by_intent.items())

    vectors = {}
    for position, (intent, utterances) in enumerate(by_intent.items(), start=1):
        # The progress line is logged before the intent is encoded.
        LOGGER.info('{}/{} done'.format(position, intent_count))
        vectors[intent] = np.mean(encoder.encode(utterances), axis=0)
    return vectors
class UniversalSentenceEncoder:
    """Sentence-similarity scorer built on a pretrained InferSent encoder.

    Despite the class name, the constructor loads InferSent weights
    (``InferSent/encoder/infersent1.pkl``) with the GloVe vector file and
    builds a vocabulary of the 100000 most frequent words.
    """

    def __init__(self):
        super().__init__()
        model_version = 1
        MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
        W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version,
        }

        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(MODEL_PATH))
        self.model.eval()

        # Use the GPU only when one exists. The flag used to be hard-coded to
        # True, which made construction fail on CPU-only machines.
        use_cuda = torch.cuda.is_available()
        self.model = self.model.cuda() if use_cuda else self.model

        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def semantic_sim(self, sents1, sents2):
        """Score ``sents1`` against ``sents2`` row by row.

        Both inputs are encoded with ``tokenize=False``. Each embedding row is
        L2-normalised, and the cosine similarity of matching rows is clamped
        to ``[-1, 1]``. The returned score is ``1 - arccos(cosine)``.

        Returns:
            A NumPy array with one score per row.
        """
        embed1 = torch.tensor(self.model.encode(sents1, tokenize=False))
        embed2 = torch.tensor(self.model.encode(sents2, tokenize=False))

        unit1 = embed1 / torch.norm(embed1, p=2, dim=1, keepdim=True)
        unit2 = embed2 / torch.norm(embed2, p=2, dim=1, keepdim=True)

        cosine = torch.sum(unit1 * unit2, dim=1)
        # Rounding can push the cosine marginally outside acos' domain.
        cosine = torch.clamp(cosine, -1.0, 1.0)
        scores = 1.0 - torch.acos(cosine)
        return scores.cpu().numpy()
def inferSent():
    """Encode the module-level ``dataset`` with InferSent v2 and return the embeddings.

    The encoder is created on CUDA when available (CPU otherwise). Weights
    come from ``encoder/infersent2.pkl`` and word vectors from
    ``fastText/crawl-300d-2M.vec``. The vocabulary is built from ``dataset``
    itself, and one randomly chosen item is passed to ``visualize``.
    """
    import nltk  # hint from the original: nltk.download('punkt')
    from InferSent.models import InferSent
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder_config = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0,
        'version': 2,
    }
    weights_path = 'encoder/infersent2.pkl'
    vectors_path = 'fastText/crawl-300d-2M.vec'

    encoder = InferSent(encoder_config).to(device)
    encoder.load_state_dict(torch.load(weights_path))
    encoder.set_w2v_path(vectors_path)
    print('set w2v')

    encoder.build_vocab(dataset, tokenize=True)
    embeddings = encoder.encode(dataset, bsize=64, tokenize=True)

    # NOTE(review): if `randint` is `random.randint`, the upper bound is
    # inclusive and `dataset[len(dataset)]` can raise IndexError. Confirm
    # which `randint` this module imports.
    sample = dataset[randint(0, len(dataset))]
    _, _ = encoder.visualize(sample)
    print('done')
    return embeddings
def __init__(self,
             bsize=64,
             word_emb_dim=300,
             enc_lstm_dim=2048,
             pool_type='max',
             dpout_model=0.0,
             version=2,
             model_path='../infersent/infersent2.pkl',
             path_to_w2v='../fasttext/crawl-300d-2M.vec',
             use_cuda=True):
    """Store the hyper-parameters and load a pretrained InferSent encoder.

    The six hyper-parameters are kept as attributes and also form the
    configuration dict passed to ``InferSent``. Weights are read from
    ``model_path`` and the word-vector path is set to ``path_to_w2v``.
    The encoder is moved to CUDA only when ``use_cuda`` is true.
    """
    self.bsize = bsize
    self.word_emb_dim = word_emb_dim
    self.enc_lstm_dim = enc_lstm_dim
    self.pool_type = pool_type
    self.dpout_model = dpout_model
    self.version = version

    encoder = InferSent({
        'bsize': bsize,
        'word_emb_dim': word_emb_dim,
        'enc_lstm_dim': enc_lstm_dim,
        'pool_type': pool_type,
        'dpout_model': dpout_model,
        'version': version,
    })
    encoder.load_state_dict(torch.load(model_path))
    encoder.set_w2v_path(path_to_w2v)

    self.model = encoder.cuda() if use_cuda else encoder
    # Starts as True; the code that reads this flag is not visible here.
    self.first_call = True
def _load_pretrained_model(verbose=True):
    """Create the InferSent encoder described by the module-level settings.

    Uses ``_PARAMS_MODEL``, ``_MODEL_PATH``, ``_W2V_PATH`` and
    ``_K_WORDS_VOCAB``. When ``verbose`` is true a one-line notice naming the
    weights path is printed first.

    Returns:
        The encoder with weights loaded, word-vector path set and the
        ``_K_WORDS_VOCAB``-word vocabulary built.
    """
    if verbose:
        print(f">>> Loading pretrained model from {_MODEL_PATH}")

    encoder = InferSent(_PARAMS_MODEL)
    state = torch.load(_MODEL_PATH)
    encoder.load_state_dict(state)
    encoder.set_w2v_path(_W2V_PATH)
    encoder.build_vocab_k_words(K=_K_WORDS_VOCAB)
    return encoder
def get_infersent(V=2):
    '''
    Builds the infersent model using either GloVe or fastText

    V selects the InferSent release. 1 pairs ``encoder/infersent1.pkl`` with
    the GloVe vectors; 2 pairs ``encoder/infersent2.pkl`` with the fastText
    vectors. The returned model has its weights loaded and its word-vector
    path set. No vocabulary is built here.

    Raises ValueError for any other V. An unsupported version used to leave
    W2V_PATH unassigned, so the failure only appeared later and obscurely.
    '''
    w2v_by_version = {
        1: 'GloVe/glove.840B.300d.txt',
        2: 'fastText/crawl-300d-2M.vec',
    }
    try:
        W2V_PATH = w2v_by_version[V]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments such as lists.
        raise ValueError(
            'Unsupported InferSent version %r; expected 1 or 2' % (V,)) from None

    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V,
    }

    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent.set_w2v_path(W2V_PATH)
    return infersent
def __init__(self):
    """Load the InferSent v1 encoder from the project root and keep it on ``self.model``.

    Weights are read from ``encoder/infersent1.pkl`` and word vectors from
    ``GloVe/glove.840B.300d.txt``, both resolved against
    ``get_project_root()``. A 100000-word vocabulary is then built.
    """
    # print("Initializing Infersent..")
    model_version = 1
    weights_path = get_project_root() / Path("encoder/infersent%s.pkl" % model_version)

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    encoder.load_state_dict(torch.load(weights_path))

    # Word vector path for the model: GloVe for version 1, fastText otherwise.
    if model_version == 1:
        relative_vectors = 'GloVe/glove.840B.300d.txt'
    else:
        relative_vectors = '../fastText/crawl-300d-2M.vec'
    encoder.set_w2v_path(get_project_root() / Path(relative_vectors))

    # Build the vocabulary of word vectors.
    encoder.build_vocab_k_words(K=100000)

    self.model = encoder
def Start_chatbot():
    """Load the encoder and the question/answer data used by the chatbot.

    Questions and answers are read line by line from
    ``../data/questions.txt`` and ``../data/answers.txt`` and paired by line
    position. Each question is encoded on its own.

    Returns:
        A tuple ``(model, answers_by_question, embeddings)``. Both mappings
        are keyed by the stripped question text.
    """
    model_version = 1
    weights_path = "../InferSent/encoder/infersent%s.pkl" % model_version
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    encoder.load_state_dict(torch.load(weights_path))

    use_cuda = False
    if use_cuda:
        encoder = encoder.cuda()

    if model_version == 1:
        vectors_path = '../data/glove.6B.300d.txt'
    else:
        vectors_path = '../dataset/fastText/crawl-300d-2M.vec'
    encoder.set_w2v_path(vectors_path)
    encoder.build_vocab_k_words(K=570000)

    with open('../data/questions.txt') as handle:
        questions = [line.strip() for line in handle.readlines()]
    with open('../data/answers.txt') as handle:
        answers = [line.strip() for line in handle.readlines()]

    answers_by_question = {}
    embeddings = {}
    for position, question in enumerate(questions):
        # Indexing (not zip) keeps the IndexError when answers run short.
        answers_by_question[question] = answers[position]
        embeddings[question] = encoder.encode([question])[0]

    return encoder, answers_by_question, embeddings
def generate_embeddings(df):
    """Map every context sentence and every question in ``df`` to its embedding.

    The de-duplicated ``context`` column is joined with spaces and split into
    sentences by ``TextBlob``. The encoder vocabulary is built from those
    sentences only. Sentences and then questions are encoded one at a time,
    so a question that equals a sentence overwrites that sentence's entry.

    Returns:
        A dict from text to the result of ``encode([text], tokenize=True)``.
    """
    paras = list(df["context"].drop_duplicates().reset_index(drop=True))
    print("Paragraph count:", len(paras))

    sentences = [sentence.raw for sentence in TextBlob(" ".join(paras)).sentences]

    encoder_config = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    }
    infersent = InferSent(encoder_config)
    infersent.load_state_dict(torch.load(infersent_pretrained_path))
    # NOTE(review): the config says version 2 while the vector path variable
    # is named `glove_path`; confirm the vectors match the weights.
    infersent.set_w2v_path(glove_path)

    print("Building Infersent vocabulary")
    infersent.build_vocab(sentences, tokenize=True)

    dict_embeddings = {}

    print("Building sentence embeddings")
    print("Sentence count:", len(sentences))
    for sentence in sentences:
        dict_embeddings[sentence] = infersent.encode([sentence], tokenize=True)

    print("Building question embeddings")
    questions = df["question"].tolist()
    print("Questions count:", len(questions))
    for question in questions:
        dict_embeddings[question] = infersent.encode([question], tokenize=True)

    return dict_embeddings
from InferSent.models import InferSent

# Encoder settings: InferSent v2 weights with the fastText vector file.
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
W2V_PATH = 'fastText/crawl-300d-2M.vec'
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V,
}

# Four demo sentences; the vocabulary is built from exactly these.
sentences = [
    "I am an engineer now.",
    "You can be an engineer.",
    "Building stuff is very fun.",
    "Stuff breaks often too though.",
]

# Build the encoder, load its weights and point it at the word vectors.
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)

infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True)
infersent.visualize('A man plays an instrument.', tokenize=True)

# Alias under which the embeddings are kept for the code that follows.
encoded_sentences = embeddings

# greedy decoder
def process(channel):
    """Classify the topics of every not-yet-visited pickle file of ``channel``.

    For each file under ``../files/CableNews/<channel>/`` that is not listed
    in ``<channel>_visit.p``:

    1. Entries with fewer than 10 characters of text are skipped, as are
       entries whose first 10 characters repeat the previous kept entry.
    2. The text is split into sentences and the first sentence is dropped.
       Only sentences with 31 to 49 whitespace-separated words are kept.
    3. The kept sentences are embedded with InferSent and classified by the
       ``nn-classifier-v2`` model.
    4. Class probabilities are summed per entry key. The two largest sums
       (times 100) and their ``Ticks`` labels are stored with the entry's
       ``gender``, ``persons`` and ``locations``.

    One result dict per processed file is pickled to
    ``processed_data/<channel>/<counter>.p``.

    Fixes over the previous revision:

    * Each key was stored with the totals accumulated for the preceding key.
      The first key got all-zero totals and the last key's totals were
      dropped. Totals are now accumulated and stored under their own key.
    * Every file is opened in a ``with`` block, so no handle is leaked.

    NOTE: ``pickle.load`` can execute arbitrary code; the input files are
    assumed to be trusted.
    """
    # Load the Classifier
    tf.reset_default_graph()
    NN = classifer()
    NN.load('nn-classifier-v2')

    # Load the sentence embedder
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
    W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'InferSent/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    all_files = glob.glob('../files/CableNews/%s/*.p' % channel)
    visit_log = '%s_visit.p' % (channel)
    with open(visit_log, 'rb') as handle:
        read_files = pickle.load(handle)
    counter = len(read_files)

    for file in tqdm(all_files):
        if file in read_files:
            continue
        read_files.append(file)
        # The visit log is written only about 30% of the time.
        # NOTE(review): the most recent files may therefore be missing from
        # the log if the run stops; confirm this is acceptable.
        if np.random.rand() < 0.3:
            with open(visit_log, 'wb') as handle:
                pickle.dump(read_files, handle)

        with open(file, 'rb') as handle:
            res = pickle.load(handle)

        results = {}
        prev_text = ""
        all_text = []
        all_keys = []
        for key in res.keys():
            meta_data = res[key][0]  # First in the list
            if len(meta_data['text']) < 10:
                continue

            # Make sure we drop the duplicates: texts should be different
            current_text = meta_data['text'][:10]
            if current_text == prev_text:
                continue
            prev_text = current_text

            text = tokenizer.tokenize(meta_data['text'])
            if len(text) <= 2:
                continue

            # Drop the first sentence
            text = text[1:]

            # Drop super small and super large sentences
            sentences = [s for s in text if 30 < len(s.split()) < 50]
            if len(sentences) == 0:
                continue

            all_text.extend(sentences)
            all_keys.extend([key] * len(sentences))

        if len(all_text) == 0:
            continue

        # Calculate the embedding
        all_embed = model.encode(all_text, bsize=128, tokenize=True, verbose=False)
        all_predictions = NN.predict(all_embed)[0]

        # Merge the probabilities per key. Row i of the predictions belongs
        # to all_keys[i]; dicts keep insertion order, so key order is kept.
        total_by_key = {}
        for row, key in enumerate(all_keys):
            if key not in total_by_key:
                total_by_key[key] = np.zeros(13)
            total_by_key[key] += all_predictions[row, :]

        # Take the top 2 topics of each key from that key's own totals.
        for key, total_prob in total_by_key.items():
            Topics = Ticks[np.flip(np.argsort(total_prob)[-2:])]
            Probs = np.flip(np.sort(total_prob)[-2:]) * 100
            results[key] = {
                'Topics': list(Topics),
                'Probs': list(Probs),
                'gender': res[key][0]['gender'],
                'persons': res[key][0]['persons'],
                'locations': res[key][0]['locations'],
            }

        with open('processed_data/%s/%d.p' % (channel, counter), 'wb') as handle:
            pickle.dump(results, handle)
        counter += 1
# Build the InferSent encoder from the version-1 checkpoint. The settings
# below are passed unchanged to the InferSent constructor.
model_pkl = '../InferSent/encoder/infersent1.pkl'
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1
}
infer_sent_model = InferSent(params_model)
infer_sent_model.load_state_dict(torch.load(model_pkl))

# In[111]:

# Point the encoder at the word vectors (`glove_w2v_loc` is defined outside
# this excerpt) and load the 100000 most frequent words.
infer_sent_model.set_w2v_path(glove_w2v_loc)
infer_sent_model.build_vocab_k_words(K=100000)
# infer_sent_model.to(torch.device("cuda:0"))

# In[112]:

# Encode one sample sentence; the result is not stored.
infer_sent_model.encode(["This man is playing computer games"], tokenize=True)

# In[113]:


def get_embedding_for_context(ctx):
    if not isinstance(ctx, list):
        # print("ctx is not list")
        ctx = [ctx]
'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': 1 } model = InferSent(params_model) model.load_state_dict(torch.load(MODEL_PATH)) # CUDA use_cuda = True model = model.cuda() if use_cuda else model # If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings. W2V_PATH = 'GloVe/glove.840B.300d.txt' model.set_w2v_path(W2V_PATH) # Load embeddings of K most frequent words print('Load glove') model.build_vocab_k_words(K=2000000) # load sentence dis = '/home/shl183/nlp4note/classified_txt/discharge-sep' res = '/home/shl183/nlp4note/infersent' patients = os.listdir(dis) # exist = os.listdir(res) # with open('./tmp.pkl','wb') as f: # pickle.dump(exist,f) for patient in patients: # if patient in exist: # continue
return json.JSONEncoder.default(self, obj) if __name__ == "__main__": # Load InferSent model params_model = { 'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': V } model = InferSent(params_model) model.load_state_dict(torch.load(MODEL_PATH)) model.set_w2v_path(PATH_TO_W2V) params_senteval['infersent'] = model.cuda() se = senteval.engine.SE(params_senteval, batcher, prepare) # transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', # 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', # 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark'] transfer_tasks = ['AmenitySimilarEvents'] results = se.eval(transfer_tasks) print(results) if not os.path.exists(PATH_TO_RESULTS): os.mkdir(PATH_TO_RESULTS) with open(os.path.join(PATH_TO_RESULTS, 'infersent.json'),
'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': VERSION } if VERSION == 1: W2V = 'C:/users/georg/Desktop/GloVe/glove.840B.300d.txt' else: W2V = 'C:/Users/georg/Desktop/fastText/crawl-300d-2M.vec' VOCAB_SIZE = 100000 NUM_STEPS = 300 # set up model model = InferSent(PARAMS).to(DEVICE) model.load_state_dict(torch.load(WEIGHTS)) model.set_w2v_path(W2V) word2vec = model.build_vocab_k_words(K=VOCAB_SIZE) # setup the NN-classifer vec2word = KNeighborsClassifier(n_neighbors=1) vecs = [] words = [] for key, val in word2vec.items(): if val.shape == (300, ): vecs.append(val) words.append(key) X = np.vstack(vecs) y = np.array(words) vec2word.fit(X, y)