def inferSent():
    """Encode ``dataset`` with the InferSent v2 encoder and return the embeddings.

    ``dataset`` and ``randint`` are not defined here; they are read from the
    enclosing module scope.  The vocabulary is built from ``dataset`` itself,
    one randomly chosen sentence is passed to ``visualize``, and the result of
    ``encode`` is returned.

    Returns:
        Whatever ``InferSent.encode`` returns for ``dataset``.
    """
    import nltk  # nltk.download('punkt') must have been run once beforehand
    from InferSent.models import InferSent
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    MODEL_PATH = 'encoder/infersent2.pkl'
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0,
        'version': 2,
    }
    infersent = InferSent(params_model).to(device)
    # map_location keeps the load working when the checkpoint was saved on a
    # device that is not available on this machine.
    infersent.load_state_dict(torch.load(MODEL_PATH, map_location=device))

    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    print('set w2v')

    infersent.build_vocab(dataset, tokenize=True)
    embeddings = infersent.encode(dataset, bsize=64, tokenize=True)

    # random.randint includes its upper bound, numpy's randint excludes it;
    # clamping keeps the index valid whichever one is in scope.
    idx = min(randint(0, len(dataset)), len(dataset) - 1)
    _, _ = infersent.visualize(dataset[idx])
    print('done')
    return embeddings
class UniversalSentenceEncoder:
    """Sentence-similarity scorer backed by a pretrained InferSent encoder."""

    def __init__(self):
        """Load the InferSent checkpoint and a 100k-word vocabulary.

        The model is moved to the GPU only when CUDA is actually available,
        so construction also works on CPU-only machines.
        """
        super().__init__()
        model_version = 1
        MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
        W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version,
        }
        self.model = InferSent(params_model)
        # Load onto CPU first; the weights follow the module if it is moved.
        self.model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
        self.model.eval()
        use_cuda = torch.cuda.is_available()
        self.model = self.model.cuda() if use_cuda else self.model
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def semantic_sim(self, sents1, sents2):
        """Score paired sentences as ``1 - arccos(cosine similarity)``.

        Args:
            sents1: first batch of sentences, passed to ``encode`` untokenized.
            sents2: second batch, paired element-wise with ``sents1``.

        Returns:
            A NumPy array with one score per pair.
        """
        embed1 = torch.tensor(self.model.encode(sents1, tokenize=False))
        embed2 = torch.tensor(self.model.encode(sents2, tokenize=False))
        # L2-normalise each row so the row-wise dot product is the cosine.
        sts_encode1 = embed1 / torch.norm(embed1, p=2, dim=1, keepdim=True)
        sts_encode2 = embed2 / torch.norm(embed2, p=2, dim=1, keepdim=True)
        cosine_similarities = torch.sum(sts_encode1 * sts_encode2, dim=1)
        # Clamp rounding noise so acos stays inside its domain.
        clip_cosine_similarities = torch.clamp(cosine_similarities, -1.0, 1.0)
        scores = 1.0 - torch.acos(clip_cosine_similarities)
        return scores.cpu().numpy()
def __init__(self, model_path, w2v_path, vocab_size=100000, device='cpu', hp=False):
    """Set up the InferSent encoder, the word-vector vocabulary and the cache.

    Args:
        model_path: checkpoint handed to ``torch.load``.
        w2v_path: word-vector file forwarded to ``init_vocab``.
        vocab_size: stored as ``self.vocab_size``.
        device: device for the model and ``map_location`` for the checkpoint.
        hp: when true, the model is converted to half precision.
    """
    self.vocab = set()
    self.device = device
    self.hp = hp
    self.tokenizer = word_tokenize
    self.vocab_size = vocab_size

    hyper_params = dict(
        bsize=64,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=1,
    )
    encoder = InferSent(hyper_params)
    encoder = encoder.to(self.device)
    self.sent_model = encoder.eval()

    weights = torch.load(model_path, map_location=device)
    self.sent_model.load_state_dict(weights)
    if hp:
        self.sent_model = self.sent_model.half()

    self.init_vocab(w2v_path)
    self.cache = dict()
class _InferSent:
    """Thin wrapper around an InferSent encoder with vocabulary helpers."""

    def __init__(self, version=1, model_path=None,
                 w2v_path='fastText/crawl-300d-2M.vec'):
        """Load the encoder checkpoint and register the word-vector file.

        Args:
            version: InferSent model version; also selects the default
                checkpoint name.
            model_path: checkpoint path; defaults to
                ``'encoder/infersent<version>.pkl'``.
            w2v_path: word-vector file passed to ``set_w2v_path``.

        NOTE(review): the defaults pair the version-1 checkpoint with the
        fastText vectors.  The InferSent README pairs version 1 with GloVe and
        version 2 with fastText -- confirm this combination is intended.
        """
        from InferSent.models import InferSent
        import torch

        if model_path is None:
            model_path = 'encoder/infersent%s.pkl' % version
        params_model = {
            'bsize': 256,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        }
        self.infersent = InferSent(params_model)
        # The model is never moved off the CPU here, so load onto CPU explicitly.
        self.infersent.load_state_dict(torch.load(model_path, map_location='cpu'))
        self.infersent.set_w2v_path(w2v_path)

    def build_vocab(self, queries):
        """Build the encoder vocabulary from ``queries`` (tokenized by InferSent)."""
        self.infersent.build_vocab(queries, tokenize=True)

    def update_vocab(self, text):
        """Extend the encoder vocabulary with the words in ``text``."""
        self.infersent.update_vocab(text, tokenize=True)

    def predict(self, text):
        """Return ``encode(text, tokenize=True)``; the vocabulary is not updated."""
        return self.infersent.encode(text, tokenize=True)
def _load_pretrained_model(verbose=True):
    """Build an InferSent encoder from the module-level configuration.

    Reads ``_PARAMS_MODEL``, ``_MODEL_PATH``, ``_W2V_PATH`` and
    ``_K_WORDS_VOCAB`` from module scope.

    Args:
        verbose: when true, print the checkpoint path before loading.

    Returns:
        The encoder with weights loaded and its K-word vocabulary built.
    """
    if verbose:
        print(f">>> Loading pretrained model from {_MODEL_PATH}")

    encoder = InferSent(_PARAMS_MODEL)
    weights = torch.load(_MODEL_PATH)
    encoder.load_state_dict(weights)

    encoder.set_w2v_path(_W2V_PATH)
    encoder.build_vocab_k_words(K=_K_WORDS_VOCAB)
    return encoder
def __init__(self):
    """Create ``self.infersent`` from the version-1 checkpoint.

    NOTE(review): the version-1 checkpoint is combined with the fastText
    vector file; the InferSent README pairs version 1 with GloVe -- confirm.
    """
    from InferSent.models import InferSent
    import torch

    V = 1
    hyper_params = dict(
        bsize=256,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=V,
    )
    self.infersent = InferSent(hyper_params)

    checkpoint = 'encoder/infersent%s.pkl' % V
    weights = torch.load(checkpoint)
    self.infersent.load_state_dict(weights)

    self.infersent.set_w2v_path('fastText/crawl-300d-2M.vec')
def __init__(self, bsize=64, word_emb_dim=300, enc_lstm_dim=2048,
             pool_type='max', dpout_model=0.0, version=2,
             model_path='../infersent/infersent2.pkl',
             path_to_w2v='../fasttext/crawl-300d-2M.vec', use_cuda=True):
    """Store the hyper-parameters and load the InferSent encoder.

    Args:
        bsize, word_emb_dim, enc_lstm_dim, pool_type, dpout_model, version:
            kept as attributes of the same name and forwarded to ``InferSent``.
        model_path: checkpoint handed to ``torch.load``.
        path_to_w2v: word-vector file registered on the encoder.
        use_cuda: when true, the encoder is moved to the GPU.
    """
    self.version = version
    self.dpout_model = dpout_model
    self.pool_type = pool_type
    self.enc_lstm_dim = enc_lstm_dim
    self.word_emb_dim = word_emb_dim
    self.bsize = bsize

    hyper_params = dict(
        bsize=bsize,
        word_emb_dim=word_emb_dim,
        enc_lstm_dim=enc_lstm_dim,
        pool_type=pool_type,
        dpout_model=dpout_model,
        version=version,
    )
    encoder = InferSent(hyper_params)
    weights = torch.load(model_path)
    encoder.load_state_dict(weights)
    encoder.set_w2v_path(path_to_w2v)

    self.model = encoder.cuda() if use_cuda else encoder
    self.first_call = True
def __init__(self):
    """Load the InferSent checkpoint and a 100k-word vocabulary.

    The model is moved to the GPU only when CUDA is actually available, so
    construction also works on CPU-only machines.
    """
    super().__init__()
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    }
    self.model = InferSent(params_model)
    # Load onto CPU first; the weights follow the module if it is moved.
    self.model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
    self.model.eval()
    use_cuda = torch.cuda.is_available()
    self.model = self.model.cuda() if use_cuda else self.model
    self.model.set_w2v_path(W2V_PATH)
    self.model.build_vocab_k_words(K=100000)
def Start_chatbot():
    """Load the encoder plus the question/answer files and pre-embed the questions.

    Returns:
        A 3-tuple ``(model, answers_by_question, embeddings)`` where
        ``answers_by_question`` maps each stripped question line to the answer
        on the same line number, and ``embeddings`` maps each question to the
        first row of ``model.encode([question])``.

    Raises:
        IndexError: if the answers file has fewer lines than the questions file.
    """
    model_version = 1
    MODEL_PATH = "../InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    use_cuda = False
    model = model.cuda() if use_cuda else model
    W2V_PATH = '../data/glove.6B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=570000)

    with open('../data/questions.txt') as f:
        questions = [line.strip() for line in f]
    with open('../data/answers.txt') as f:
        answers = [line.strip() for line in f]

    # Fail before any encoding work instead of part-way through the loop.
    if len(answers) < len(questions):
        raise IndexError(
            'answers.txt has %d lines but questions.txt has %d'
            % (len(answers), len(questions)))

    # Named to avoid shadowing the builtin ``dict``.
    answers_by_question = {}
    embeddings = {}
    for question, answer in zip(questions, answers):
        answers_by_question[question] = answer
        embeddings[question] = model.encode([question])[0]
    return model, answers_by_question, embeddings
def generate_embeddings(df):
    """Embed every context sentence and every question in ``df`` with InferSent.

    Args:
        df: frame with a ``"context"`` and a ``"question"`` column.

    Returns:
        A dict mapping each sentence/question string to the result of encoding
        it on its own.  Reads ``infersent_pretrained_path`` and ``glove_path``
        from module scope.
    """
    unique_contexts = df["context"].drop_duplicates().reset_index(drop=True)
    paras = list(unique_contexts)
    print("Paragraph count:", len(paras))

    # Sentence-split the concatenation of all distinct paragraphs.
    blob = TextBlob(" ".join(paras))
    sentences = [sentence.raw for sentence in blob.sentences]

    hyper_params = dict(
        bsize=64,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=2,
    )
    infersent = InferSent(hyper_params)
    infersent.load_state_dict(torch.load(infersent_pretrained_path))
    infersent.set_w2v_path(glove_path)

    print("Building Infersent vocabulary")
    infersent.build_vocab(sentences, tokenize=True)

    dict_embeddings = {}

    print("Building sentence embeddings")
    print("Sentence count:", len(sentences))
    for sentence in sentences:
        dict_embeddings[sentence] = infersent.encode([sentence], tokenize=True)

    print("Building question embeddings")
    questions = df["question"].tolist()
    print("Questions count:", len(questions))
    for question in questions:
        dict_embeddings[question] = infersent.encode([question], tokenize=True)

    return dict_embeddings
def __init__(self):
    """Load the InferSent encoder from the project root into ``self.model``."""
    model_version = 1
    root = get_project_root()

    MODEL_PATH = root / Path("encoder/infersent%s.pkl" % model_version)
    hyper_params = dict(
        bsize=64,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=model_version,
    )
    encoder = InferSent(hyper_params)
    encoder.load_state_dict(torch.load(MODEL_PATH))

    # Word vectors: GloVe for version 1, fastText otherwise.
    if model_version == 1:
        relative_w2v = 'GloVe/glove.840B.300d.txt'
    else:
        relative_w2v = '../fastText/crawl-300d-2M.vec'
    W2V_PATH = get_project_root() / Path(relative_w2v)
    encoder.set_w2v_path(W2V_PATH)

    # Keep only the 100k most frequent words in the vocabulary.
    encoder.build_vocab_k_words(K=100000)
    self.model = encoder
def get_infersent(V=2):
    '''
    Builds the infersent model using either GloVe or fastText

    Args:
        V: model version; 1 selects the GloVe vectors, 2 the fastText vectors.

    Returns:
        The InferSent model with weights loaded and the word-vector path set.

    Raises:
        ValueError: if V is neither 1 nor 2.
    '''
    w2v_by_version = {
        1: 'GloVe/glove.840B.300d.txt',
        2: 'fastText/crawl-300d-2M.vec',
    }
    # Reject unsupported versions before touching the checkpoint; previously
    # W2V_PATH was simply left unbound for them.
    if V not in w2v_by_version:
        raise ValueError('Unsupported InferSent version: %r (expected 1 or 2)' % (V,))

    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    W2V_PATH = w2v_by_version[V]
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V,
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent.set_w2v_path(W2V_PATH)
    return infersent
# In[109]:

from InferSent.models import InferSent

# In[110]:

# Version-1 checkpoint and the hyper-parameters it is loaded with.
model_pkl = '../InferSent/encoder/infersent1.pkl'
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=1,
)
infer_sent_model = InferSent(params_model)
infer_sent_model.load_state_dict(torch.load(model_pkl))

# In[111]:

# ``glove_w2v_loc`` comes from an earlier cell; keep the 100k most frequent words.
infer_sent_model.set_w2v_path(glove_w2v_loc)
infer_sent_model.build_vocab_k_words(K=100000)
# infer_sent_model.to(torch.device("cuda:0"))

# In[112]:

infer_sent_model.encode(["This man is playing computer games"], tokenize=True)

# In[113]:
import shutil
import pickle
from nltk.tokenize import sent_tokenize

# Load Model
from InferSent.models import InferSent

MODEL_PATH = "encoder/infersent1.pkl"
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1,
}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

# CUDA: use the GPU only when one is present, so the script also runs on
# CPU-only machines.
use_cuda = torch.cuda.is_available()
model = model.cuda() if use_cuda else model

# If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
W2V_PATH = 'GloVe/glove.840B.300d.txt'
model.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
print('Load glove')
model.build_vocab_k_words(K=2000000)

# load sentence
if isinstance(obj, np.ndarray): return obj.tolist() return json.JSONEncoder.default(self, obj) if __name__ == "__main__": # Load InferSent model params_model = { 'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': V } model = InferSent(params_model) model.load_state_dict(torch.load(MODEL_PATH)) model.set_w2v_path(PATH_TO_W2V) params_senteval['infersent'] = model.cuda() se = senteval.engine.SE(params_senteval, batcher, prepare) # transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', # 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', # 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark'] transfer_tasks = ['AmenitySimilarEvents'] results = se.eval(transfer_tasks) print(results) if not os.path.exists(PATH_TO_RESULTS): os.mkdir(PATH_TO_RESULTS)
import numpy as np
import torch
from InferSent.models import InferSent

# Version-2 encoder with its checkpoint and hyper-parameters.
V = 2
MODEL_PATH = 'encoder/infersent{}.pkl'.format(V)
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

sentences = [
    "I am an engineer now.",
    "You can be an engineer.",
    "Building stuff is very fun.",
    "Stuff breaks often too though.",
]

# The vocabulary is restricted to the words of the sample sentences.
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True)
infersent.visualize('A man plays an instrument.', tokenize=True)

encoded_sentences = embeddings
# NOTE(review): the file is only read here, yet it is opened "r+" (which also
# needs write permission) -- confirm whether plain "r" would do.
with open(text, "r+") as f:
    raw_text = f.read().strip()
    sentences = sentTokenizer.tokenize(raw_text)

### pasted from https://github.com/facebookresearch/InferSent
V = 2
MODEL_PATH = 'encoder/infersent{}.pkl'.format(V)
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

# Vocabulary is the 100k most frequent words, not the words of ``sentences``.
infersent.build_vocab_k_words(K=100000)

embeddings = infersent.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))
#### End Paste
class SentEnc():
    """Cache of fixed-size vectors for text and numeric strings.

    Text is encoded with an InferSent model after dropping tokens that have no
    word vector; numeric strings get a hand-built positional encoding.
    Vectors are looked up with ``enc[sentence]`` after ``cache_sentences``.
    The module-level flag ``REG`` switches the numeric handling on.
    """

    def __init__(self, model_path, w2v_path, vocab_size=100000, device='cpu', hp=False):
        """Load the encoder and the word vectors.

        Args:
            model_path: InferSent checkpoint handed to ``torch.load``.
            w2v_path: word-vector file handed to ``load_WE``.
            vocab_size: number of word vectors requested from ``load_WE``.
            device: device for the model and for the text inputs.
            hp: when true, model and inputs are converted to half precision.
        """
        self.vocab = set()
        self.device = device
        self.hp = hp
        self.tokenizer = word_tokenize
        self.vocab_size = vocab_size
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1,
        }
        self.sent_model = InferSent(params_model).to(self.device).eval()
        self.sent_model.load_state_dict(
            torch.load(model_path, map_location=device))
        if hp:
            self.sent_model = self.sent_model.half()
        self.init_vocab(w2v_path)
        self.cache = dict()

    def init_vocab(self, w2v_path):
        """Load word vectors plus the begin/end markers and record the vocabulary."""
        self.w2v, self.bs, self.es = load_WE(w2v_path, self.vocab_size)
        self.vocab.update(self.w2v.keys())

    def prune_sent(self, sent):
        """Tokenize ``sent`` and keep only tokens that have a word vector.

        Returns the surviving tokens joined by single spaces; ``None`` yields
        the empty string.
        """
        sent_tok = [] if sent is None else self.tokenizer(sent)
        return ' '.join(tok for tok in sent_tok if tok in self.w2v)

    def get_number_encoding(self, num):
        """Return a 4096-long float32 vector encoding the non-negative ``num``.

        The first 256 entries hold a sinusoidal digit encoding; the remainder
        is zero padding.
        """
        d = 256
        device = 'cpu'
        # Only two decimal digits are kept (scale factor 1e2).
        # NOTE(review): leading zeros of the fraction are lost (3.05 and 3.5
        # both give b == "5"); left untouched so cached vectors stay stable.
        a = int(num)
        b = int((num - a) * 1e2)
        a = str(a)[::-1]  # integer digits, least significant first
        b = str(b)
        J = torch.arange(0, d)
        J_even = (J % 2 == 0).float().to(device)
        J = J.float().to(device)
        Ia = torch.arange(0, len(a)).float().to(device)
        Ib = (torch.arange(0, len(b)) + 1).float().to(device)
        A = torch.FloatTensor([float(x) for x in a]).to(device)
        B = torch.FloatTensor([float(x) for x in b]).to(device)
        J = J.float()
        # Tile digits and positions to (num_digits, d) so they broadcast with J.
        A = torch.cat([A.view(1, -1)] * d, dim=0).T
        Ia = torch.cat([Ia.view(1, -1)] * d, dim=0).T
        B = torch.cat([B.view(1, -1)] * d, dim=0).T
        Ib = torch.cat([Ib.view(1, -1)] * d, dim=0).T
        resA = A * (2**Ia) / 10 * (
            J_even * torch.sin(Ia / (10000**(J / d))) +
            (1 - J_even) * torch.cos(Ia / (10000**((J - 1) / d))))
        resB = B * (2.0**(-Ib)) / 10 * (
            J_even * torch.sin(-Ib / (10000**(J / d))) +
            (1 - J_even) * torch.cos(-Ib / (10000**((J - 1) / d))))
        res = torch.sum(resA, axis=0) + torch.sum(resB, axis=0)
        res = res / (len(a) + len(b))
        res = res.numpy()
        if d < 4096:
            res = np.append(res, np.zeros(4096 - d)).astype('float32')
        return res

    def is_number(self, string):
        """Return the absolute numeric value of ``string``, or ``None``.

        Accepts decimals of any length, and plain or comma-grouped integers
        shorter than five characters.  Strings made only of sign/comma
        characters (e.g. ``","``) are not numbers.
        """
        string = string.strip()
        if re.match(r'^[-+]?\d+\.\d+$', string):
            return abs(float(string))
        if len(string) < 5:
            if re.match(r'^[-+]?\d+$', string):
                return abs(int(string))
            if re.match(r'^[-+]?[\d,]+$', string):
                digits = string.replace(',', '')
                # Without at least one digit int() would raise ValueError.
                if re.search(r'\d', digits):
                    return abs(int(digits))
        return None

    def get_text_encoding(self, sent):
        """Encode an already-pruned sentence with the InferSent model.

        ``sent`` is split on whitespace; every token must be in ``self.w2v``.
        Returns the first row of the model output as a NumPy array.
        """
        sent_tok = sent.split()
        s = np.array([self.w2v[self.bs]] + [self.w2v[x] for x in sent_tok] +
                     [self.w2v[self.es]])
        l = len(s)
        s = s.reshape([l, 1, 300])
        s = torch.from_numpy(s).float()
        if self.hp:
            s = s.half()
        s = s.to(self.device)
        l = np.array([l], dtype='int64')
        with torch.no_grad():
            res = self.sent_model.forward((s, l))
        if res.shape[1] != 4096:
            # Diagnostic only; previously printed a variable that was not yet
            # assigned and crashed instead.
            print(res.shape, s)
        v = res.cpu().detach().numpy()[0]
        del res
        return v

    def cache_sentences(self, sentences):
        """Pre-compute and cache the encodings for ``sentences``.

        With ``REG`` set, strings recognised by ``is_number`` are cached under
        their raw text with the number encoding; everything else is pruned and
        cached under the pruned text with the text encoding.
        """
        self.sent_cache = set()
        self.num_cache = set()
        for s in sentences:
            if s is None:
                s = ''
            num = self.is_number(s) if REG else None
            if num is not None:
                self.num_cache.add((num, s))
            else:
                self.sent_cache.add(self.prune_sent(s))
        print(f'initialize {len(self.sent_cache)} text sentences...')
        for s in tqdm.tqdm(self.sent_cache):
            self.cache[s] = self.get_text_encoding(s)
        print(f'initialize {len(self.num_cache)} numeric sentences...')
        self.num_cache = list(self.num_cache)
        for n, n_str in tqdm.tqdm(self.num_cache):
            self.cache[n_str] = self.get_number_encoding(n)

    def __getitem__(self, sent):
        """Return the cached vector for ``sent`` (``None`` is treated as ``''``).

        Raises ``KeyError`` if the sentence was never passed to
        ``cache_sentences``.
        """
        if sent is None:
            sent = ''
        if REG and self.is_number(sent) is not None:
            # Numeric strings are keyed by their raw text.
            return self.cache[sent]
        return self.cache[self.prune_sent(sent)]
output_dir = pathlib.Path(args.output_dir)

from InferSent.models import InferSent

model_version = 1
dirname = os.path.dirname(__file__)
MODEL_PATH = os.path.join(
    dirname, "InferSent/encoder/infersent%s.pkl" % model_version)
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': model_version,
}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = os.path.join(dirname, 'InferSent/embs/glove.840B.300d.txt')
model.set_w2v_path(W2V_PATH)
model.build_vocab_k_words(K=100000)

# Embed each input file line by line and write "<sentence>\t<space-separated
# vector>" rows to <output_dir>/<input stem>.emb.tsv.
for file_name in files:
    with open(file_name) as istr:
        data = [line.strip() for line in istr]
    output_file = output_dir / file_name.with_suffix('.emb.tsv').name
    embeddings = model.encode(data, tokenize=True)
    # newline="" is what the csv module requires of its file objects; without
    # it every row ends in "\r\r\n" on Windows.
    with open(output_file, "w", newline="") as ostr:
        writer = csv.writer(ostr, delimiter="\t")
        for sent, emb in zip(data, embeddings):
            writer.writerow([sent, " ".join(map(str, emb.tolist()))])
def compute_intent_vectors(self, sentences):
    """Return one averaged InferSent embedding per intent.

    ``sentences`` is grouped by ``self.get_utterances_dict``; each group is
    encoded and averaged along axis 0.  The model is reloaded on every call.

    Returns:
        A dict mapping each intent to its mean embedding.
    """
    # TODO IMPLEMENT CACHING!
    from InferSent.models import InferSent

    infersent_folder = Path('./InferSent')
    infersent_path = Path(infersent_folder / 'encoder' / 'infersent1.pkl')
    W2V_PATH = infersent_folder / 'GloVe' / 'glove.840B.300d.txt'
    MODEL_PARAMETERS = dict(
        bsize=64,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=1,
    )

    model = InferSent(MODEL_PARAMETERS)
    model.load_state_dict(torch.load(infersent_path))
    if torch.cuda.is_available():
        model.cuda()
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)

    utterances_dict = self.get_utterances_dict(sentences)
    intent_count = len(utterances_dict.items())

    vectors = {}
    for position, (intent, utterances) in enumerate(utterances_dict.items(), start=1):
        LOGGER.info('{}/{} done'.format(position, intent_count))
        vectors[intent] = np.mean(model.encode(utterances), axis=0)
    return vectors
def process(channel):
    """Classify the sentences of every unvisited transcript file of ``channel``.

    For each pickle under ``../files/CableNews/<channel>/`` that is not yet in
    ``<channel>_visit.p``: select sentences, embed them with InferSent, run the
    classifier, sum the class probabilities per transcript key, and dump the
    two strongest topics per key to ``processed_data/<channel>/<n>.p``.

    Relies on module-level ``Ticks`` (label array), ``classifer``, ``InferSent``
    and the usual imports.

    NOTE(review): the input files are read with ``pickle.load``, which executes
    arbitrary code for untrusted data -- only use on files you produced.
    """
    # Load the Classifier
    tf.reset_default_graph()
    NN = classifer()
    NN.load('nn-classifier-v2')

    # Load the sentence embedder
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'InferSent/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    all_files = glob.glob('../files/CableNews/%s/*.p' % channel)

    visit_path = '%s_visit.p' % (channel)
    with open(visit_path, 'rb') as fh:
        read_files = pickle.load(fh)
    visited = set(read_files)  # O(1) membership; the list keeps the on-disk format
    counter = len(read_files)

    for file in tqdm(all_files):
        if file in visited:
            continue
        visited.add(file)
        read_files.append(file)
        # Checkpoint the visit list on roughly 30% of the files.
        if np.random.rand() < 0.3:
            with open(visit_path, 'wb') as fh:
                pickle.dump(read_files, fh)

        with open(file, 'rb') as fh:
            res = pickle.load(fh)

        prev_text = ""
        all_text = []
        all_keys = []
        for key in res.keys():
            meta_data = res[key][0]  # First in the list
            if len(meta_data['text']) < 10:
                continue
            # Make sure we drop the duplicates: consecutive texts sharing the
            # first 10 characters are treated as the same text.
            current_text = meta_data['text'][:10]
            if current_text == prev_text:
                continue
            prev_text = current_text

            text = tokenizer.tokenize(meta_data['text'])
            if len(text) <= 2:
                continue
            # Drop the first sentence
            text = text[1:]
            # Keep only sentences with more than 30 and fewer than 50 words.
            senteces = [s for s in text if 30 < len(s.split()) < 50]
            if len(senteces) == 0:
                continue
            all_text.extend(senteces)
            all_keys.extend([key] * len(senteces))

        if len(all_text) == 0:
            continue

        # Calculate the embedding and the class probabilities
        all_embed = model.encode(all_text, bsize=128, tokenize=True, verbose=False)
        all_predictions = NN.predict(all_embed)[0]

        # Merge the probabilities per key.  The earlier loop stored the
        # previous key's sum under the current key and dropped the last key.
        totals = {}
        for key, prediction in zip(all_keys, all_predictions):
            if key in totals:
                totals[key] += prediction
            else:
                totals[key] = np.array(prediction, dtype=float)

        # Take the top 2 topics for every key.
        results = {}
        for key, total_prob in totals.items():
            Topics = Ticks[np.flip(np.argsort(total_prob)[-2:])]
            Probs = np.flip(np.sort(total_prob)[-2:]) * 100
            results[key] = {
                'Topics': list(Topics),
                'Probs': list(Probs),
                'gender': res[key][0]['gender'],
                'persons': res[key][0]['persons'],
                'locations': res[key][0]['locations'],
            }

        with open('processed_data/%s/%d.p' % (channel, counter), 'wb') as fh:
            pickle.dump(results, fh)
        counter += 1
import numpy as np from numpy import dot from numpy.linalg import norm import nltk nltk.download('punkt') from InferSent.models import InferSent # Load Infersent Model V = 1 MODEL_PATH = './encoder/infersent%s.pkl' % V params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': V} infersent = InferSent(params_model) infersent.load_state_dict(torch.load(MODEL_PATH)) W2V_PATH = './Glove/glove.840B.300d.txt' infersent.set_w2v_path(W2V_PATH) print("infersent is loaded") def feature_value(sentences, sentences_embeddings, question_embedding, metric): result = [] for i in range(0,len(sentences)): question_embedding = [question_embedding] sentence_embedding = [sentences_embeddings[i]] if metric == 'cosine_similarity': metric = cosine_similarity(question_embedding, sentence_embedding)
import nltk
nltk.download('punkt')

# # Load Infersent Pre-trained Model

# In[7]:

from InferSent.models import InferSent

# Version-1 encoder together with the GloVe word vectors.
V = 1
MODEL_PATH = './encoder/infersent{}.pkl'.format(V)
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

W2V_PATH = './Glove/glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)

# # Read Data

# In[8]:

train = pd.read_json('./data/train-v1.1.json')

# In[9]:
'bsize': 1, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': VERSION } if VERSION == 1: W2V = 'C:/users/georg/Desktop/GloVe/glove.840B.300d.txt' else: W2V = 'C:/Users/georg/Desktop/fastText/crawl-300d-2M.vec' VOCAB_SIZE = 100000 NUM_STEPS = 300 # set up model model = InferSent(PARAMS).to(DEVICE) model.load_state_dict(torch.load(WEIGHTS)) model.set_w2v_path(W2V) word2vec = model.build_vocab_k_words(K=VOCAB_SIZE) # setup the NN-classifer vec2word = KNeighborsClassifier(n_neighbors=1) vecs = [] words = [] for key, val in word2vec.items(): if val.shape == (300, ): vecs.append(val) words.append(key) X = np.vstack(vecs) y = np.array(words) vec2word.fit(X, y)