def load_infersent_model(model_path, w2v_path):
    """Create an InferSent (version 2 settings) encoder and restore its weights.

    :param model_path: checkpoint path read with ``torch.load``
    :param w2v_path: word-vector file registered through ``set_w2v_path``
    :return: the InferSent instance; no vocabulary is built here
    """
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    })
    weights = torch.load(model_path)
    encoder.load_state_dict(weights)
    encoder.set_w2v_path(w2v_path)
    return encoder
def embed_sent(datafile):
    """Encode the leading lines of *datafile* with InferSent and save them.

    Reading stops after 455820 lines. The embeddings are written with
    ``np.savetxt`` to a fixed relative path; nothing is returned.

    :param datafile: text file with one sentence per line
    """
    line_limit = 455820
    sentences = []
    with open(datafile, 'r') as handle:
        for line_no, raw in enumerate(handle, start=1):
            sentences.append(raw.replace('\n', ''))
            if line_no == line_limit:
                break

    V = 1
    MODEL_PATH = 'infersent%s.pkl' % V
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V,
    })
    encoder.load_state_dict(torch.load(MODEL_PATH))
    encoder.set_w2v_path('GloVe/glove.840B.300d.txt')

    # The vocabulary must cover the sentences before they can be encoded.
    encoder.build_vocab(sentences, tokenize=True)
    vectors = encoder.encode(sentences, tokenize=True)
    np.savetxt("../../wiki-split/Data/Infersent_vectors/complex_sent", vectors)
def prepare(model_path: str, word_vecs: str, out_path: str,
            sentences: Union[str, List[str]] = None, max_vocab: int = 0):
    """
    Adapt the vocabulary of an InferSent model and store the resulting state.

    Exactly one of ``sentences`` and ``max_vocab`` must be given.

    :param model_path: unadapted model state
    :param word_vecs: word vectors
    :param out_path: where to store the state
    :param sentences: training sentences for scanning the vocabulary; a list
        is used as is, any other value is handed to ``read_lines``
    :param max_vocab: maximum vocabulary size (optional)
    :return: None
    """
    assert bool(sentences) != bool(
        max_vocab), 'Either sentences or max_vocab should be given'
    model = InferSent(config=MODEL_CONF)
    # Fixed: both messages used to print out_path instead of the path read.
    log.info(f"Loading state from {model_path}")
    model.load_state_dict(torch.load(model_path))
    log.info(f"Loading word vecs from {word_vecs}")
    model.set_w2v_path(word_vecs)

    if sentences:
        if not isinstance(sentences, list):
            sentences = list(read_lines(sentences))
        log.info("Building vocabulary from sentences")
        model.build_vocab(sentences, tokenize=True)
    if max_vocab:
        log.info(f"Pruning vocabulary to top {max_vocab} types")
        model.build_vocab_k_words(K=max_vocab)

    log.info(f"Saving at {out_path}")
    state = SentenceEncoder._get_state(model)
    torch.save(state, out_path)
def load_inferSent(sentences):
    """Load the InferSent encoder and build its vocabulary from *sentences*.

    :param sentences: sentences passed to ``build_vocab`` with ``tokenize=True``
    :return: the InferSent model, moved to the GPU when CUDA is available
    :raises NotImplementedError: if the hard-coded version is neither 1 nor 2
    """
    logger.info('load InferSent')
    V = 2
    MODEL_PATH = 'Infersent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    if torch.cuda.is_available():
        infersent.cuda()

    # set word vector
    if V == 1:
        W2V_PATH = 'Infersent/Glove/glove.840B.300d.txt'
        logger.warning('Use Glove Embedding')
    elif V == 2:
        # Fixed: the directory was spelled 'Infersen/', unlike the checkpoint
        # and GloVe paths above which both live under 'Infersent/'.
        W2V_PATH = 'Infersent/fastText/crawl-300d-2M.vec'
        logger.warning('Use fastText Embedding')
    else:
        raise NotImplementedError
    infersent.set_w2v_path(W2V_PATH)

    # build vocab
    infersent.build_vocab(sentences, tokenize=True)
    return infersent
def create_embeddings(infer_path, data_path, em_type):
    """Encode YouTube or WikiHow titles with InferSent and save them.

    The vocabulary is built from both title sets; only the set selected by
    *em_type* is encoded and written with ``np.save``.

    :param infer_path: directory containing ``encoder/`` and ``GloVe/``
    :param data_path: directory the embedding file is saved into
    :param em_type: ``"yt"`` (YouTube) or ``"wh"`` (WikiHow)
    :raises ValueError: if *em_type* is neither ``"yt"`` nor ``"wh"``
    """
    save_names = {"yt": "yt_embed", "wh": "wh_embed"}
    # Validate before any I/O. The original did `raise "<str>"`, which is
    # itself a TypeError in Python 3 because a str is not an exception.
    if em_type not in save_names:
        raise ValueError("Unknown embedding type: {}".format(em_type))

    yt_titles = yt.get_yt_titles()
    with open("data/whtitles", "r") as f:
        wh_titles = [line.rstrip('\n') for line in f]

    save_f = os.path.join(data_path, save_names[em_type])
    titles = yt_titles if em_type == "yt" else wh_titles

    nltk.download('punkt')
    V = 1
    MODEL_PATH = os.path.join(infer_path, 'encoder/infersent%s.pkl' % V)
    params_model = {
        'bsize': 256,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent = infersent.cuda()
    W2V_PATH = os.path.join(infer_path, 'GloVe/glove.840B.300d.txt')
    infersent.set_w2v_path(W2V_PATH)

    infersent.build_vocab(yt_titles + wh_titles, tokenize=True)
    embed = infersent.encode(titles, tokenize=True)
    np.save(save_f, embed)
class Encoder2:
    '''
    Text encoder backed by an InferSent model (version 2 settings).

    ``start`` builds the vocabulary, ``encode`` embeds a batch and ``close``
    is a no-op.
    '''
    WORD_VECTORS_FILE = 'crawl-300d-2M.vec'
    MODEL_FILE = 'infersent2.pkl'

    def __init__(self, word_vectors_dir, models_dir):
        '''Load the checkpoint from *models_dir* and register the word vectors.'''
        vectors_path = os.path.join(word_vectors_dir, self.WORD_VECTORS_FILE)
        checkpoint_path = os.path.join(models_dir, self.MODEL_FILE)
        settings = dict(
            bsize=64,
            word_emb_dim=300,
            enc_lstm_dim=2048,
            pool_type='max',
            dpout_model=0.0,
            version=2,
        )
        self.model = InferSent(settings)
        self.model.load_state_dict(torch.load(checkpoint_path))
        self.model.set_w2v_path(vectors_path)

    def start(self, texts):
        '''Build the model vocabulary from *texts* (needs ``.values.tolist()``).'''
        self.model.build_vocab(texts.values.tolist(), tokenize=True)

    def close(self):
        '''Nothing to release.'''
        return None

    def encode(self, texts_batch):
        '''Return the InferSent embeddings for *texts_batch*.'''
        batch = texts_batch.values.tolist()
        return self.model.encode(batch, tokenize=True)
def infersent_embed_posts(posts, max_sent_cnt, embed_dim, data_fold_path):
    """Embed every post sentence by sentence into a zero-padded array.

    :param posts: sequence of posts, each a sequence of sentences that are
        passed to InferSent with ``tokenize=False``
    :param max_sent_cnt: number of sentence slots per post; extra sentences
        are dropped
    :param embed_dim: size of the last axis of the result
    :param data_fold_path: prefix prepended (plain string concatenation) to
        the checkpoint and word-vector file names
    :return: array of shape ``(len(posts), max_sent_cnt, embed_dim)``
    """
    model_path = data_fold_path + 'word_sent_embed/infersent2.pickle'
    word_emb_path = data_fold_path + 'word_sent_embed/fasttext.vec'
    posts_arr = np.zeros((len(posts), max_sent_cnt, embed_dim))

    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    model.set_w2v_path(word_emb_path)

    all_sents = [sentence for post in posts for sentence in post]
    model.build_vocab(all_sents, tokenize=False)

    for ind, sentences in enumerate(posts):
        # A post without sentences has nothing to encode; its row stays zero.
        # Skipping it also avoids handing an empty batch to the encoder.
        if len(sentences) == 0:
            continue
        embeddings = model.encode(sentences, tokenize=False, verbose=False)
        kept = min(max_sent_cnt, len(sentences))
        posts_arr[ind, :kept, :] = embeddings[:kept]
    return posts_arr
class InferSentFeatures:
    """Sentence-embedding helper around an InferSent model (version 1 settings)."""

    def __init__(self, lang_enc_dir, sentences):
        """Load the model found under *lang_enc_dir* and build its vocabulary.

        :param lang_enc_dir: directory holding ``InferSent/`` and ``glove/``
        :param sentences: sentences used to build the vocabulary
        """
        # The InferSent checkout is put on sys.path so `models` is importable.
        infersent_root = os.path.join(lang_enc_dir, 'InferSent/')
        sys.path.insert(0, infersent_root)
        from models import InferSent

        version = 1
        checkpoint = os.path.join(
            lang_enc_dir, 'InferSent/encoder/infersent%s.pkl' % version)
        settings = dict(
            bsize=64,
            word_emb_dim=300,
            enc_lstm_dim=2048,
            pool_type='max',
            dpout_model=0.0,
            version=version,
        )
        self.model = InferSent(settings)
        self.model.load_state_dict(torch.load(checkpoint))

        vectors = os.path.join(lang_enc_dir, 'glove/glove.6B.300d.txt')
        self.model.set_w2v_path(vectors)
        self.model.build_vocab(sentences, tokenize=True)

    def generate_embeddings(self, sentences):
        """Return the InferSent embeddings of *sentences*."""
        return self.model.encode(sentences, tokenize=True)
def calcule_eucl(text, question):
    """Score every sentence of *text* against *question* via ``eucl_sim``.

    :param text: iterable of strings, joined without separator before
        TextBlob splits the result into sentences
    :param question: question string, encoded as a one-element batch
    :return: ``(sentences, eucl)`` where ``eucl`` is whatever ``eucl_sim``
        returns for the per-sentence embeddings and the question embedding
    """
    sentences = [sent.raw for sent in TextBlob("".join(text)).sentences]

    V = 2
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V,
    })
    encoder.load_state_dict(torch.load('InferSent/encoder/infersent%s.pkl' % V))
    encoder.set_w2v_path('InferSent/crawl-300d-2M.vec')
    encoder.build_vocab(sentences, tokenize=True)

    # One encode call per sentence; a repeated sentence keeps its last result.
    by_sentence = {
        sentence: encoder.encode([sentence], tokenize=True)
        for sentence in sentences
    }
    question_vec = encoder.encode([question], tokenize=True)
    return sentences, eucl_sim(by_sentence, question_vec)
def load_model(FLAGS):
    """Load the sentence-representation model selected by ``FLAGS.sr_model``.

    * ``'IS'``  - InferSent (version 1 settings) from ``FLAGS.is_dir``
    * ``'QT'``  - Quick-Thought encoders described by ``FLAGS.model_config``
    * ``'USE'`` - Universal Sentence Encoder module from TF-Hub

    :param FLAGS: object exposing ``sr_model`` and the attributes above
    :return: the loaded model object
    :raises ValueError: if ``FLAGS.sr_model`` is not one of the three names
        (previously this fell through to an ``UnboundLocalError``)
    """
    if FLAGS.sr_model == 'IS':
        # Load InferSent
        MODEL_PATH = os.path.join(FLAGS.is_dir, 'encoder/infersent1.pkl')
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        model = InferSent(params_model)
        model.load_state_dict(torch.load(MODEL_PATH))
        W2V_PATH = os.path.join(FLAGS.is_dir,
                                'dataset/GloVe/glove.840B.300d.txt')
        model.set_w2v_path(W2V_PATH)
    elif FLAGS.sr_model == 'QT':
        # Load Quick-Thought
        model = encoder_manager.EncoderManager()
        with open(FLAGS.model_config) as json_config_file:
            model_config = json.load(json_config_file)
        # A single config object is treated as a one-element list.
        if isinstance(model_config, dict):
            model_config = [model_config]
        for mdl_cfg in model_config:
            # Separate name: the original rebound the list being iterated.
            encoder_config = configuration.model_config(mdl_cfg, mode='encode')
            model.load_model(encoder_config)
    elif FLAGS.sr_model == 'USE':
        model = hub.Module(
            'https://tfhub.dev/google/universal-sentence-encoder-large/2')
    else:
        raise ValueError(
            'Unknown sr_model: {!r}'.format(FLAGS.sr_model))
    return model
def get_loaded_model(force_gpu=False, k_most_frequent_words=1000000):
    """Load InferSent for the module-level ``model_version`` and build its vocabulary.

    :param force_gpu: raise instead of silently staying on the CPU when CUDA
        is unavailable
    :param k_most_frequent_words: ``K`` passed to ``build_vocab_k_words``
    :return: the InferSent model, on the GPU whenever CUDA is available
    :raises GPUNotFoundException: if *force_gpu* is set and CUDA is missing
    """
    checkpoint = "infersent/encoder/infersent{}.pkl".format(model_version)
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    model.load_state_dict(torch.load(checkpoint))

    gpu_ready = torch.cuda.is_available()
    if force_gpu and not gpu_ready:
        raise GPUNotFoundException()
    if gpu_ready:
        model = model.cuda()

    # Version 1 is paired with the GloVe file, any other version with fastText.
    if model_version == 1:
        vectors = 'infersent/dataset/GloVe/glove.840B.300d.txt'
    else:
        vectors = 'infersent/dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(vectors)

    # Load embeddings of the K most frequent words.
    model.build_vocab_k_words(K=k_most_frequent_words)
    return model
def getSentenceVector(doc, model_params: dict = None, encoder = "distilbert", model_name = 'distilbert-base-nli-mean-tokens' ):
    """Split *doc* into sentences and embed each one with the chosen encoder.

    :param doc: text handed to the spaCy ``en_core_web_sm`` pipeline for
        sentence splitting
    :param model_params: optional settings. ``tokenizer_args`` is forwarded
        to the transformer tokenizer; ``infersent_model_path``, when present,
        is loaded as the InferSent state dict
    :param encoder: one of ``bert``, ``xlnet``, ``longformer``, ``reformer``,
        ``distilbert``, ``roberta``, ``bart``, ``use``, ``infersent``,
        ``sent2vec``, ``laser``
    :param model_name: transformer checkpoint name (transformer encoders only)
    :return: list of ``(sentence, embedding)`` pairs
    :raises ValueError: if *encoder* is not one of the names above
    """
    transformer_encoders = ('bert', 'xlnet', 'longformer', 'reformer',
                            'distilbert', 'roberta', 'bart')
    other_encoders = ('use', 'infersent', 'sent2vec', 'laser')
    # Reject bad input before loading any model.
    if encoder not in transformer_encoders and encoder not in other_encoders:
        raise ValueError(
            'Invalid encoder {} or encoder Unavailable.'.format(encoder))
    # None instead of a shared mutable {} default.
    if model_params is None:
        model_params = {}

    sp = spacy.load('en_core_web_sm')
    sentences = [span.text for span in sp(doc).sents]

    if encoder in transformer_encoders:
        # Use encoder for mapping tokens to embeddings
        word_embedding_model = models.Transformer(
            model_name,
            tokenizer_args=model_params.get('tokenizer_args', {}))
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        sentence_embeddings = model.encode(sentences)
    elif encoder == 'use':
        # !pip install embedding-as-service
        from embedding_as_service.text.encode import Encoder
        en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
        sentence_embeddings = en.encode(texts=sentences)
    elif encoder == 'infersent':
        import nltk
        nltk.download('punkt')
        from models import InferSent
        params_model = {'bsize': 64, 'word_emb_dim': 300,
                        'enc_lstm_dim': 2048, 'pool_type': 'max',
                        'dpout_model': 0.0, 'version': 2}
        infersent = InferSent(params_model)
        # NOTE(review): the original never restored a checkpoint here, so the
        # encoder presumably ran with its initial parameters. Pass
        # model_params['infersent_model_path'] to load pretrained weights.
        infersent_model_path = model_params.get('infersent_model_path')
        if infersent_model_path:
            import torch
            infersent.load_state_dict(torch.load(infersent_model_path))
        W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
        infersent.set_w2v_path(W2V_PATH)
        infersent.build_vocab(sentences, tokenize=True)
        sentence_embeddings = infersent.encode(sentences, tokenize=True)
    elif encoder == 'sent2vec':
        import sent2vec
        model = sent2vec.Sent2vecModel()
        model.load_model('drive/My Drive/torontobooks_unigram.bin')
        sentence_embeddings = model.embed_sentences(sentences)
    else:
        # Only 'laser' can reach this branch after the check at the top.
        from laserembeddings import Laser
        laser = Laser()
        # Also used for multilingual sentence embeddings
        sentence_embeddings = laser.embed_sentences(sentences, lang='en')

    return list(zip(sentences, sentence_embeddings))
def load_infersent():
    """Load InferSent (version 2 settings) with the 100000 most frequent words.

    Checkpoint and word-vector paths are relative to the working directory.

    :return: the InferSent model with its vocabulary built
    """
    V = 2
    settings = dict(
        bsize=64,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=V,
    )
    encoder = InferSent(settings)
    weights = torch.load('encoder/infersent%s.pkl' % V)
    encoder.load_state_dict(weights)
    encoder.set_w2v_path('fastText/crawl-300d-2M.vec')
    encoder.build_vocab_k_words(K=100000)
    return encoder
def apply_logician(s1, s2, is_list=False, sick_model=False):
    """Classify sentence pairs with a linear layer over InferSent features.

    :param s1: first sentences; raw input when *is_list* is False, otherwise
        already a list of word lists
    :param s2: second sentences, same convention as *s1*
    :param is_list: False -> inputs go through ``convert_str2lst`` first;
        True -> inputs are used as given
    :param sick_model: True -> use the SICK weights, False -> the SNLI weights
    :return: softmax output of the linear classifier (3 classes per row)
    """
    # Load InferSent model
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V,
    })
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(PATH_TO_W2V)

    params_senteval = {
        'task_path': PATH_TO_DATA,
        'usepytorch': True,
        'kfold': 5,
        'classifier': {
            'nhid': 0,
            'optim': 'rmsprop',
            'batch_size': 128,
            'tenacity': 3,
            'epoch_size': 2,
        },
        'infersent': model.cuda(),
    }

    if not is_list:
        s1 = convert_str2lst(s1)
        s2 = convert_str2lst(s2)
    samples = s1 + s2

    params_senteval['batch_size'] = min(128, len(s1))
    params_senteval = utils.dotdict(params_senteval)
    params_senteval.usepytorch = True

    prepare(params_senteval, samples)
    emb_s1 = batcher(params_senteval, s1)
    emb_s2 = batcher(params_senteval, s2)

    abs_diff = np.abs(emb_s1 - emb_s2)
    product = emb_s1 * emb_s2
    if sick_model:
        features = np.c_[abs_diff, product]
        checkpoint = torch.load('./saved_sick.pth')
        print('[Contradiction  Neutral  Entailment]')
    else:
        features = np.c_[emb_s1, emb_s2, product, abs_diff]
        checkpoint = torch.load('./saved_snli_augment_ordered.pth')
        print('[ Entailment  Neutral Contradiction ]')

    n_classes = 3
    classifier = nn.Sequential(nn.Linear(features.shape[1], n_classes),).cuda()
    classifier.load_state_dict(checkpoint)

    logits = classifier(torch.FloatTensor(features).cuda())
    return nn.Softmax(1)(logits)
def init_models(vocal_size: int = VOCAB_SIZE):
    """Load the InferSent model and build its vocabulary.

    :param vocal_size: number of most frequent words to load (``K`` of
        ``build_vocab_k_words``); defaults to ``VOCAB_SIZE``
    :return: the InferSent model, on the GPU when ``USE_CUDA`` is truthy
    """
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': VERSION
    })
    model.load_state_dict(torch.load(MODEL_PATH))
    model = model.cuda() if USE_CUDA else model
    model.set_w2v_path(VECTOR_PATH)
    # Fixed: the argument was ignored and the global VOCAB_SIZE always used.
    model.build_vocab_k_words(K=vocal_size)
    return model
def resume_model(model_path, dict_path, version, use_cuda):
    """Restore an InferSent model from a checkpoint.

    The checkpoint tensors are mapped to the selected device while loading;
    this function does not call ``.cuda()`` / ``.to()`` on the model itself.

    :param model_path: checkpoint path
    :param dict_path: word-vector file registered via ``set_w2v_path``
    :param version: value stored under ``'version'`` in the model settings
    :param use_cuda: map the checkpoint to ``'cuda'`` if truthy, else ``'cpu'``
    :return: the InferSent model (no vocabulary built)
    """
    target = torch.device('cuda') if use_cuda else torch.device('cpu')
    settings = dict(
        bsize=64,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=version,
    )
    encoder = InferSent(settings)
    weights = torch.load(model_path, map_location=target)
    encoder.load_state_dict(weights)
    encoder.set_w2v_path(dict_path)
    return encoder
def infersent_flat_embed_posts(posts, embed_dim, data_fold_path):
    """Embed each post as a single InferSent input.

    :param posts: inputs handed to InferSent with ``tokenize=False``
    :param embed_dim: accepted but not used by this function
    :param data_fold_path: prefix prepended (plain string concatenation) to
        the checkpoint and word-vector file names
    :return: result of ``encode`` over *posts*
    """
    checkpoint = data_fold_path + 'word_sent_embed/infersent2.pickle'
    vectors = data_fold_path + 'word_sent_embed/fasttext.vec'
    encoder = InferSent(dict(
        bsize=64,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=2,
    ))
    encoder.load_state_dict(torch.load(checkpoint))
    encoder.set_w2v_path(vectors)
    encoder.build_vocab(posts, tokenize=False)
    encoded = encoder.encode(posts, tokenize=False, verbose=False)
    return encoded
def load_infersent_model():
    """Load InferSent (version 2 settings) from files next to this module.

    The checkpoint is expected at ``encoder/infersent2.pkl`` and the word
    vectors at ``fastText/crawl-300d-2M.vec``, both relative to the directory
    containing this file.

    :return: the InferSent model (no vocabulary built)
    """
    here = dirname(os.path.realpath(__file__))
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    })
    weights = torch.load(os.path.join(here, 'encoder', 'infersent2.pkl'))
    encoder.load_state_dict(weights)
    encoder.set_w2v_path(os.path.join(here, 'fastText', 'crawl-300d-2M.vec'))
    return encoder
def load_model():
    """Load InferSent (version 1 settings) with the 100000 most frequent words.

    Progress is reported with ``print`` around the vocabulary build.

    :return: the InferSent model with its vocabulary built
    """
    model_version = 1
    settings = dict(
        bsize=64,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=model_version,
    )
    encoder = InferSent(settings)
    weights = torch.load("encoder/infersent%s.pkl" % model_version)
    encoder.load_state_dict(weights)
    encoder.set_w2v_path('GloVe/glove.840B.300d.txt')

    print('building vocab')
    encoder.build_vocab_k_words(K=100000)
    print('done building vocab')
    return encoder
class Infersent:
    """Thin wrapper that loads InferSent (version 2 settings) once and embeds on demand."""

    def __init__(self):
        """Load the checkpoint and register the fastText vectors."""
        V = 2
        settings = dict(
            bsize=64,
            word_emb_dim=300,
            enc_lstm_dim=2048,
            pool_type='max',
            dpout_model=0.0,
            version=V,
        )
        self.infersent = InferSent(settings)
        weights = torch.load('encoder/infersent%s.pkl' % V)
        self.infersent.load_state_dict(weights)
        self.infersent.set_w2v_path('fastText/crawl-300d-2M.vec')

    def get(self, sentences):
        """Build the vocabulary from *sentences*, then return their embeddings."""
        encoder = self.infersent
        encoder.build_vocab(sentences, tokenize=True)
        return encoder.encode(sentences, tokenize=True)
def embed_dataset(dataset_path, infersent_path, force_cpu=False):
    """
    Compute one averaged InferSent embedding per intent of a dataset.

    To make this work, first run ./get_infersent.sh

    :param dataset_path: directory containing ``train.csv``; joined with
        ``/``, and must contain ``snips`` in its string form
    :param infersent_path: directory holding ``encoder/`` and ``GloVe/``,
        joined with ``/``
    :param force_cpu: load the checkpoint onto the CPU and skip ``.cuda()``
    :return: dict mapping intent -> mean embedding of its utterances
    :raises TypeError: for a non-snips dataset that has at least one data row
    """
    checkpoint = infersent_path / "encoder/infersent1.pkl"
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 1,
    })
    if force_cpu:
        weights = torch.load(checkpoint, map_location='cpu')
        model.load_state_dict(weights)
    else:
        model.load_state_dict(torch.load(checkpoint))
        model.cuda()

    model.set_w2v_path(infersent_path / 'GloVe/glove.840B.300d.txt')
    model.build_vocab_k_words(K=100000)

    rows = read_csv(dataset_path / 'train.csv')[1:]  # skip header
    utterances_by_intent = defaultdict(list)
    for row in rows:
        if 'snips' not in str(dataset_path):
            raise TypeError(
                "Unknown dataset type. Implement your own first. See the "
                "README")
        utterance, _labels, _delexicalised, intent = row
        utterances_by_intent[intent].append(utterance)

    vectors = {}
    n_intents = len(utterances_by_intent)
    for position, (intent, sentences) in enumerate(
            utterances_by_intent.items()):
        print('{}/{} done'.format(position, n_intents))
        vectors[intent] = np.mean(model.encode(sentences), axis=0)
    return vectors
def infersent_embeddings():
    """Encode the ``text`` column of ``final_train`` and ``final_test``.

    Both frames are read from module scope. The InferSent checkout directory
    is appended to ``sys.path`` so that ``models`` can be imported.

    :return: ``(train_embeddings, test_embeddings)``
    """
    sys.path.append(
        '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master')
    # Load model
    from models import InferSent

    model_version = 1
    checkpoint = "/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/encoder/infersent%s.pkl" % model_version
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    model.load_state_dict(torch.load(checkpoint))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    if use_cuda:
        model = model.cuda()

    # Version 1 is paired with the GloVe file, any other version with fastText.
    if model_version == 1:
        vectors = '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/glove.840B.300d-003.txt'
    else:
        vectors = '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(vectors)

    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    train_texts = final_train['text'].tolist()
    train_data_list = model.encode(
        train_texts, bsize=128, tokenize=False, verbose=True)
    print('nb sentences encoded : {0}'.format(len(train_data_list)))

    test_texts = final_test['text'].tolist()
    test_data_list = model.encode(
        test_texts, bsize=128, tokenize=False, verbose=True)
    print('nb sentences encoded : {0}'.format(len(test_data_list)))

    return train_data_list, test_data_list
def no_stopwords():
    """Embed three batches of abstracts after removing stop words.

    Each batch comes from ``get_batch_from_dataframe``, is stripped of tokens
    flagged ``is_stop`` by ``nlp`` and encoded with InferSent. The embeddings
    plus their ``set`` / ``catg`` values are written to a CSV file. The
    module-level ``current_idx`` is advanced after every batch and the elapsed
    time is printed.
    """
    global current_idx

    infersent2 = InferSent(params_model)
    infersent2.load_state_dict(torch.load(MODEL_PATH))
    infersent2.set_w2v_path(W2V_PATH)
    use_cuda = True
    # Fixed: the original called .cuda() on `infersent`, not the model loaded
    # just above, so the freshly loaded weights were discarded.
    infersent2 = infersent2.cuda() if use_cuda else infersent2

    frames = []
    start = time.time()
    for x in range(3):
        crix = current_idx
        abss, catg, sets, crix = get_batch_from_dataframe(crix)
        for index in range(len(abss)):
            doc = nlp(abss[index])
            abss[index] = ' '.join(
                token.text for token in doc if not token.is_stop)
        # The first batch creates the vocabulary, later batches extend it.
        if x == 0:
            infersent2.build_vocab(abss, tokenize=True)
        else:
            infersent2.update_vocab(abss, tokenize=True)
        embed = infersent2.encode(abss, tokenize=True)
        frames.append(pd.DataFrame({
            'embds': embed.tolist(),
            'set': sets,
            'catg': catg
        }))
        current_idx = crix

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    pdss = pd.concat(frames, ignore_index=True)
    end = time.time() - start
    print("Time without stopwords", end)
    pdss.to_csv("/home/psrivastava/Intern_Summer/data/embeds_no_stopwords.csv")
def init_infersent_model(self):
    """Load InferSent (version 1 settings) and store it on ``self.model``.

    The vocabulary is built from the 100000 most frequent words.
    """
    model_version = 1
    settings = dict(
        bsize=64,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=model_version,
    )
    encoder = InferSent(settings)
    encoder.load_state_dict(
        torch.load("encoder/infersent%s.pkl" % model_version))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    if use_cuda:
        encoder = encoder.cuda()

    # Version 1 is paired with the GloVe file, any other version with fastText.
    if model_version == 1:
        vectors = 'GloVe/glove.840B.300d.txt'
    else:
        vectors = 'fastText/crawl-300d-2M.vec'
    encoder.set_w2v_path(vectors)

    # Load embeddings of K most frequent words
    encoder.build_vocab_k_words(K=100000)
    self.model = encoder
def infersent_glove():
    """Load InferSent (version 1 settings) with GloVe vectors on the GPU.

    :return: the InferSent model with the 100000 most frequent words loaded
    """
    # Set model for InferSent + GloVe
    V = 1
    settings = dict(
        bsize=64,
        word_emb_dim=300,
        enc_lstm_dim=2048,
        pool_type='max',
        dpout_model=0.0,
        version=V,
    )
    modelg = InferSent(settings)
    modelg.load_state_dict(
        torch.load('/tmp/GloVe/encoder/infersent%s.pkl' % V))

    # Keep it on CPU or put it on GPU
    use_cuda = True
    if use_cuda:
        modelg = modelg.cuda()

    # Both alternatives point at a GloVe file; V selects the location.
    if V == 1:
        vectors = '/tmp/GloVe/glove.840B.300d.txt'
    else:
        vectors = '/home/ganesh/Quora_dev/tmp/GloVe/glove.840B.300d.txt'
    modelg.set_w2v_path(vectors)

    # Load embeddings of K most frequent words
    modelg.build_vocab_k_words(K=100000)
    return modelg
import numpy as np from models import InferSent from nltk.tokenize import word_tokenize from scipy.spatial.distance import pdist, squareform from sklearn.preprocessing import StandardScaler, MinMaxScaler #### Parameters PARSER = argparse.ArgumentParser(description='Ask a question') PARSER.add_argument('--question', metavar='string', required=True, help="The question you want answered") ARGS = PARSER.parse_args() question = ARGS.question sentences = [question] #### Load Facebook's InferSent (download the files from the internet) infersent = InferSent({'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}) infersent.load_state_dict(torch.load('/Users/petermyers/Desktop/Other/data/InferSent/encoder/infersent1.pkl')) infersent.set_w2v_path('/Users/petermyers/Desktop/Other/data/GloVe/glove.840B.300d.txt') # Extract the most relevant Wikipedia page #### Wikipedia recommends 10 pages wikipedia_pages = wikipedia.search(question) sentences = sentences + wikipedia_pages #### Convert sentences to numbers infersent.build_vocab(sentences, tokenize=True) embeddings = infersent.encode(sentences, tokenize=True, verbose=False) #### Choose the most relevant pages distances = pdist(np.array(embeddings), metric='euclidean') sentence_similarity_matrix = squareform(distances) most_relevant_pages = np.argsort(sentence_similarity_matrix[0][1:]) #### Extract the content on the most relevant page (tries multiple pages in case of failure) for page in most_relevant_pages: try:
# InferSent settings and file locations.
V = 1
MODEL_PATH = 'encoder/infersent%s.pkl' % V
W2V_PATH = 'final_text_vectors.txt'
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)

# Encode `sentences` with a vocabulary built from those same sentences.
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
model.set_w2v_path(W2V_PATH)
model.build_vocab(sentences, tokenize=True)
embeddings = model.encode(sentences, tokenize=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

# Normalize the embeddings and wrap them for the network below.
sen_vec = preprocessing.normalize(embeddings)
sen_vec = Variable(torch.from_numpy(sen_vec))

n = (1, 300)
nparray = np.zeros(n)

# From here on `model` names the `net()` instance, not the InferSent encoder.
model = net()
for i in sen_vec:
    out = model(i)
name = "InferSent"

# Module-level model shared by get_vector / get_vectors.
V = 2
MODEL_PATH = '/data/InferSent/encoder/infersent%s.pkl' % V
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
# NOTE(review): version 2 is paired with a GloVe file here - confirm this
# pairing is intended.
W2V_PATH = '/home/saradhix/glove.840B/glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)


def get_vector(sentence):
    """Return the embeddings for a single sentence (as a one-element batch)."""
    batch = [sentence]
    return get_vectors(batch)


def get_vectors(sentences):
    """Lower-case *sentences*, rebuild the vocabulary from them and encode."""
    lowered = [text.lower() for text in sentences]
    infersent.build_vocab(lowered, tokenize=True)
    return infersent.encode(lowered, tokenize=True)
# Load InferSent model
model_version = 2
MODEL_PATH = "/MAD/InferSent/encoder/infersent%s.pkl" % model_version
# AD: load in the InferSent model
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': model_version
}
model_infersent = InferSent(params_model)
model_infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = '/MAD/InferSent/dataset/crawl-300d-2M.vec'
model_infersent.set_w2v_path(W2V_PATH)
# Load embeddings of K most frequent words
model_infersent.build_vocab_k_words(K=1000000)
print("InferSent model loaded")

# input: src, tgt, tgt.translated (to src, being English).
# Each file is split on "\n" and the last piece is dropped (the empty string
# left by a trailing newline). `with` closes the handles, which the original
# bare open(...).read() calls leaked.
with open(args.path_src, "r") as src_file:
    src = src_file.read().split("\n")
src = src[:-1]
with open(args.path_tgt, "r") as tgt_file:
    tgt = tgt_file.read().split("\n")
tgt = tgt[:-1]
with open(args.path_tgt_translated) as translated_file:
    Txt_target_2_cross = translated_file.read().split("\n")
Txt_target_2_cross = Txt_target_2_cross[:-1]
} # Set up logger logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) if __name__ == "__main__": # Load InferSent model params_model = { 'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': V } model = InferSent(params_model) model.load_state_dict(torch.load(MODEL_PATH)) model.set_w2v_path(PATH_TO_W2V) params_senteval['infersent'] = model.cuda() se = senteval.engine.SE(params_senteval, batcher, prepare) transfer_tasks = [ 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth', 'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion' ] results = se.eval(transfer_tasks) print(results)
parser.add_argument('-b', '--batch-size', type=int, default=64, help='Batch size (default: 64)') parser.add_argument('files', nargs='+', help='List of files to extract sentence embeddings') args = parser.parse_args() params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': args.version} model = InferSent(params_model) model.load_state_dict(torch.load(args.model_path)) if not args.cpu: model = model.cuda() model.set_w2v_path(args.w2v_path) # Ensure directory if not os.path.exists(args.out_dir): os.makedirs(args.out_dir) # Read files and extract features for fpath in args.files: print('Reading file {}'.format(fpath)) sents = [] with open(fpath) as f: for line in f: line = line.strip() assert line, 'Empty line in {}'.format(fpath) sents.append(line)