class _InferSent:
    """Thin wrapper around a pretrained InferSent sentence encoder.

    The constructor loads the encoder weights and registers the word-vector
    file; the remaining methods forward to the wrapped model with
    ``tokenize=True``.
    """

    def __init__(self, version=1, model_path=None,
                 w2v_path='fastText/crawl-300d-2M.vec'):
        """Load the encoder weights and register the word-vector file.

        Args:
            version: InferSent model version, passed to the model as
                ``params_model['version']``.
            model_path: Path to the pickled state dict. Defaults to
                ``'encoder/infersent<version>.pkl'``.
            w2v_path: Path to the word-vector file handed to
                ``set_w2v_path``.

        NOTE(review): the upstream InferSent README pairs version 1 with
        GloVe vectors and version 2 with fastText vectors. The defaults
        here (version 1 + a fastText path) are kept unchanged for backward
        compatibility -- confirm that this pairing is intended.
        """
        # Imported inside the constructor so that importing this module
        # does not itself require InferSent or torch.
        from InferSent.models import InferSent
        import torch

        if model_path is None:
            model_path = 'encoder/infersent%s.pkl' % version

        params_model = {
            'bsize': 256,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        }
        self.infersent = InferSent(params_model)
        # The model is never moved off the CPU here, so map the checkpoint
        # to CPU as well; this lets a checkpoint that contains CUDA tensors
        # load on a machine without a GPU.
        self.infersent.load_state_dict(
            torch.load(model_path, map_location='cpu'))
        self.infersent.set_w2v_path(w2v_path)

    @staticmethod
    def _as_sentences(text):
        """Return *text* as a list of sentences, wrapping a lone string.

        The wrapped model's methods take a collection of sentences; a bare
        ``str`` passed straight through would presumably be iterated
        character by character, so it is wrapped in a one-element list.
        """
        return [text] if isinstance(text, str) else text

    def build_vocab(self, queries):
        """Build the encoder vocabulary from *queries* (a string or a list of strings)."""
        self.infersent.build_vocab(self._as_sentences(queries), tokenize=True)

    def update_vocab(self, text):
        """Extend the encoder vocabulary with the words of *text* (a string or a list of strings)."""
        self.infersent.update_vocab(self._as_sentences(text), tokenize=True)

    def predict(self, text):
        """Return whatever the wrapped model's ``encode`` yields for *text*.

        *text* may be a single string or a list of strings. The vocabulary
        is not updated here; call ``build_vocab`` / ``update_vocab`` first.
        """
        return self.infersent.encode(self._as_sentences(text), tokenize=True)
def inferSent():
    """Encode the module-level ``dataset`` with the InferSent version 2 encoder.

    Loads the weights from ``encoder/infersent2.pkl``, registers the
    fastText vector file, builds the vocabulary from ``dataset``, encodes
    every entry, and calls ``visualize`` on one randomly chosen entry.

    Returns:
        The value returned by ``infersent.encode(dataset, ...)``.

    Note:
        ``dataset`` is not a parameter; it is read from the enclosing
        (module) scope and must be defined before this function is called.
    """
    import random

    # nltk is not referenced directly below; the import is kept from the
    # original. Presumably ``tokenize=True`` relies on NLTK inside InferSent
    # and needs the 'punkt' data (``nltk.download('punkt')``) -- TODO confirm.
    import nltk  # noqa: F401
    from InferSent.models import InferSent
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    MODEL_PATH = 'encoder/infersent2.pkl'
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0,
        'version': 2,
    }
    infersent = InferSent(params_model).to(device)
    # Map the checkpoint onto the selected device so that a checkpoint
    # containing CUDA tensors still loads on a CPU-only machine.
    infersent.load_state_dict(torch.load(MODEL_PATH, map_location=device))

    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    print('set w2v')

    infersent.build_vocab(dataset, tokenize=True)
    embeddings = infersent.encode(dataset, bsize=64, tokenize=True)

    # randrange excludes its upper bound, so idx is always a valid index.
    # The previous ``randint(0, len(dataset))`` could yield len(dataset)
    # when ``randint`` is ``random.randint`` (inclusive on both ends).
    idx = random.randrange(len(dataset))
    _, _ = infersent.visualize(dataset[idx])
    print('done')
    return embeddings
def generate_embeddings(df):
    """Build InferSent embeddings for every context sentence and question in *df*.

    The unique values of ``df["context"]`` are joined into one text, split
    into sentences with TextBlob, and each sentence is encoded on its own.
    Each value of ``df["question"]`` is then encoded the same way.

    Args:
        df: Object indexable by ``"context"`` and ``"question"``; the code
            calls ``drop_duplicates``/``reset_index``/``tolist`` on those
            columns (presumably a pandas DataFrame -- confirm with caller).

    Returns:
        dict mapping each sentence / question string to the result of
        ``infersent.encode([text], tokenize=True)``. A question whose text
        equals a sentence overwrites that sentence's entry.

    Note:
        Relies on the module-level names ``TextBlob``, ``InferSent``,
        ``torch``, ``infersent_pretrained_path`` and ``glove_path``.
    """
    paras = list(df["context"].drop_duplicates().reset_index(drop=True))
    print("Paragraph count:", len(paras))

    blob = TextBlob(" ".join(paras))
    sentences = [item.raw for item in blob.sentences]
    questions = df["question"].tolist()

    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    }
    infersent = InferSent(params_model)
    # The model stays on the CPU here, so map the checkpoint to CPU too;
    # this lets a checkpoint with CUDA tensors load without a GPU.
    infersent.load_state_dict(
        torch.load(infersent_pretrained_path, map_location='cpu'))
    infersent.set_w2v_path(glove_path)

    print("Building Infersent vocabulary")
    # Include the questions: previously the vocabulary was built from the
    # context sentences only, so question words that never occur in a
    # context had no vector when the questions were encoded.
    infersent.build_vocab(sentences + questions, tokenize=True)

    dict_embeddings = {}

    print("Building sentence embeddings")
    print("Sentence count:", len(sentences))
    # Encoded one text at a time so every dict value is the result for a
    # single-element batch, as before.
    for sentence in sentences:
        dict_embeddings[sentence] = infersent.encode([sentence], tokenize=True)

    print("Building question embeddings")
    print("Questions count:", len(questions))
    for question in questions:
        dict_embeddings[question] = infersent.encode([question], tokenize=True)

    return dict_embeddings
'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': V } infersent = InferSent(params_model) infersent.load_state_dict(torch.load(MODEL_PATH)) W2V_PATH = 'fastText/crawl-300d-2M.vec' infersent.set_w2v_path(W2V_PATH) sentences = [ "I am an engineer now.", "You can be an engineer.", "Building stuff is very fun.", "Stuff breaks often too though." ] infersent.build_vocab(sentences, tokenize=True) embeddings = infersent.encode(sentences, tokenize=True) infersent.visualize('A man plays an instrument.', tokenize=True) encoded_sentences = embeddings # greedy decoder def greedy_decoder(data): # index for largest probability each row return [np.argmax(s) for s in data] # decode sequence