Python InferSent.load_state_dict Examples, models.InferSent.load_state_dict Python Examples

Example #1

0

Show file

File: get_infersent.py Project: pangzhan27/layer_augmentation_qa

def load_infersent_model(model_path, w2v_path):
    """Create a version-2 InferSent encoder and restore its weights.

    The encoder is built with the fixed hyper-parameters below, its state
    dict is read from ``model_path`` via ``torch.load``, and ``w2v_path`` is
    registered as the word-vector file. No vocabulary is built here.

    Returns:
        The configured ``InferSent`` instance.
    """
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    })
    weights = torch.load(model_path)
    encoder.load_state_dict(weights)
    encoder.set_w2v_path(w2v_path)
    return encoder

Example #2

0

Show file

File: infersent_baseline.py Project: DrameMariama/Sentence_Composition

def embed_sent(datafile, max_sentences=455820):
    """Encode the first lines of ``datafile`` with InferSent v1 and save them.

    Each line of ``datafile`` is treated as one sentence (newline characters
    removed). The sentences are used to build the encoder vocabulary, are
    encoded, and the resulting matrix is written with ``np.savetxt`` to
    ``../../wiki-split/Data/Infersent_vectors/complex_sent``.

    Args:
        datafile: Path of a text file holding one sentence per line.
        max_sentences: Maximum number of lines to read. The default keeps the
            cap that used to be hard-coded; pass ``None`` to read every line.

    Returns:
        None. The embeddings are only written to disk.
    """
    # Local import so the function does not rely on the module's import block.
    from itertools import islice

    with open(datafile, 'r') as f:
        # islice stops after max_sentences lines without a hand-kept counter.
        sentences = [line.replace('\n', '')
                     for line in islice(f, max_sentences)]

    V = 1
    MODEL_PATH = 'infersent%s.pkl' % V
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'GloVe/glove.840B.300d.txt'
    infersent.set_w2v_path(W2V_PATH)

    # The vocabulary is restricted to the words of the sentences being encoded.
    infersent.build_vocab(sentences, tokenize=True)

    embeddings = infersent.encode(sentences, tokenize=True)

    np.savetxt("../../wiki-split/Data/Infersent_vectors/complex_sent", embeddings)

Example #3

0

Show file

File: unisenc.py Project: thammegowda/virtchar

    def prepare(model_path: str,
                word_vecs: str,
                out_path: str,
                sentences: Union[str, List[str]] = None,
                max_vocab: int = 0):
        """
        Adapt the vocabulary of an InferSent model and store the resulting state.

        Exactly one of ``sentences`` and ``max_vocab`` must be given: either the
        vocabulary is built from the sentences, or it is limited to the top
        ``max_vocab`` word types.

        :param model_path: unadapted model state
        :param word_vecs: word vectors
        :param out_path: where to store the state
        :param sentences: training sentences for scanning the vocabulary; a
            list is used as is, anything else is passed to ``read_lines``
        :param max_vocab: maximum vocabulary size (optional)
        :return: None; the adapted state is saved to ``out_path``
        """
        assert bool(sentences) != bool(
            max_vocab), 'Either sentences or max_vocab should be given'

        model = InferSent(config=MODEL_CONF)
        # Report the path that is actually read (this used to print out_path).
        log.info(f"Loading state from {model_path}")

        model.load_state_dict(torch.load(model_path))
        log.info(f"Loading word vecs from {word_vecs}")
        model.set_w2v_path(word_vecs)
        if sentences:
            if not isinstance(sentences, list):
                sentences = list(read_lines(sentences))
            log.info("Building vocabulary from sentences")
            model.build_vocab(sentences, tokenize=True)
        if max_vocab:
            log.info(f"Pruning vocabulary to top {max_vocab} types")
            model.build_vocab_k_words(K=max_vocab)
        log.info(f"Saving at {out_path}")

        state = SentenceEncoder._get_state(model)
        torch.save(state, out_path)

Example #4

0

Show file

File: cluster.py Project: superspray/transformers-for-question-generation

def load_inferSent(sentences):
    """Load the InferSent encoder and build its vocabulary from ``sentences``.

    The encoder version is fixed by the local ``V``; the matching word-vector
    file (GloVe for version 1, fastText for version 2) is registered before
    the vocabulary is built. The model is moved to the GPU when CUDA is
    available.

    Args:
        sentences: Sentences whose words make up the encoder vocabulary.

    Returns:
        The ready-to-use ``InferSent`` instance.

    Raises:
        NotImplementedError: If ``V`` is neither 1 nor 2.
    """
    logger.info('load InferSent')
    V = 2
    MODEL_PATH = 'Infersent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    if torch.cuda.is_available():
        infersent.cuda()

    # set word vector
    if V == 1:
        W2V_PATH = 'Infersent/Glove/glove.840B.300d.txt'
        logger.warning('Use Glove Embedding')
    elif V == 2:
        # Was 'Infersen/...': every other path in this function lives under
        # 'Infersent/', so the missing 't' looked like a typo.
        # NOTE(review): confirm the directory name on disk.
        W2V_PATH = 'Infersent/fastText/crawl-300d-2M.vec'
        logger.warning('Use fastText Embedding')
    else:
        raise NotImplementedError
    infersent.set_w2v_path(W2V_PATH)

    # build vocab
    infersent.build_vocab(sentences, tokenize=True)

    return infersent

Example #5

0

Show file

def create_embeddings(infer_path, data_path, em_type):
    """Encode YouTube or WikiHow titles with InferSent v1 and save them.

    The vocabulary is built from both title sets, but only the set selected
    by ``em_type`` is encoded. The result is stored with ``np.save`` under
    ``data_path`` as ``yt_embed`` or ``wh_embed``.

    Args:
        infer_path: Directory holding ``encoder/infersent1.pkl`` and
            ``GloVe/glove.840B.300d.txt``.
        data_path: Directory the embedding file is written to.
        em_type: ``"yt"`` for YouTube titles, ``"wh"`` for WikiHow titles.

    Raises:
        ValueError: If ``em_type`` is neither ``"yt"`` nor ``"wh"``.
    """
    # Validate first: raising a plain string is itself a TypeError in
    # Python 3, and there is no point loading anything for a bad argument.
    if em_type not in ("yt", "wh"):
        raise ValueError("Unknown embedding type: {}".format(em_type))

    yt_titles = yt.get_yt_titles()
    with open("data/whtitles", "r") as f:
        wh_titles = [line.rstrip('\n') for line in f]

    if em_type == "yt":  # Youtube
        save_f = os.path.join(data_path, "yt_embed")
        titles = yt_titles
    else:  # Wikihow
        save_f = os.path.join(data_path, "wh_embed")
        titles = wh_titles

    nltk.download('punkt')
    V = 1
    MODEL_PATH = os.path.join(infer_path, 'encoder/infersent%s.pkl' % V)
    params_model = {
        'bsize': 256,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent = infersent.cuda()

    W2V_PATH = os.path.join(infer_path, 'GloVe/glove.840B.300d.txt')
    infersent.set_w2v_path(W2V_PATH)

    infersent.build_vocab(yt_titles + wh_titles, tokenize=True)
    embed = infersent.encode(titles, tokenize=True)
    np.save(save_f, embed)

Example #6

0

Show file

File: preprocess_data.py Project: tsujuifu/code_ssi

class InferSentFeatures:
    """Sentence feature extractor backed by a version-1 InferSent encoder."""

    def __init__(self, lang_enc_dir, sentences):
        """Load the encoder found under ``lang_enc_dir`` and build its vocabulary.

        ``<lang_enc_dir>/InferSent/`` is put at the front of ``sys.path`` so
        that ``models.InferSent`` can be imported from it. The weights come
        from ``InferSent/encoder/infersent1.pkl`` and the word vectors from
        ``glove/glove.6B.300d.txt``, both below ``lang_enc_dir``. The
        vocabulary is built from ``sentences``.
        """
        infersent_root = os.path.join(lang_enc_dir, 'InferSent/')
        sys.path.insert(0, infersent_root)
        from models import InferSent

        version = 1
        checkpoint = os.path.join(
            lang_enc_dir, 'InferSent/encoder/infersent%s.pkl' % version)

        self.model = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        })
        self.model.load_state_dict(torch.load(checkpoint))

        glove_vectors = os.path.join(lang_enc_dir, 'glove/glove.6B.300d.txt')
        self.model.set_w2v_path(glove_vectors)
        self.model.build_vocab(sentences, tokenize=True)

    def generate_embeddings(self, sentences):
        """Return the encoder output for ``sentences`` (tokenization enabled)."""
        return self.model.encode(sentences, tokenize=True)

Example #7

0

Show file

File: similarite.py Project: WassilDahi/question-answering-master

def calcule_eucl(text, question):
    """Compare every sentence of ``text`` with ``question`` via InferSent v2.

    ``text`` is joined into one string and split into sentences with
    TextBlob. Each sentence and the question are encoded separately, and the
    embeddings are handed to ``eucl_sim``.

    Args:
        text: Iterable of strings that are concatenated into the document.
        question: The question to compare the sentences with.

    Returns:
        A tuple ``(sentences, eucl)`` where ``sentences`` is the list of raw
        sentences and ``eucl`` is whatever ``eucl_sim`` returns for them.
    """
    blob = TextBlob("".join(text))
    sentences = [item.raw for item in blob.sentences]

    V = 2
    MODEL_PATH = 'InferSent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'InferSent/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab(sentences, tokenize=True)

    # One embedding per distinct sentence, keyed by the sentence text.
    dict_embeddings = {
        sentence: infersent.encode([sentence], tokenize=True)
        for sentence in sentences
    }
    # Encode the question once, outside the loop: it does not depend on the
    # sentence, and this keeps the name bound when there are no sentences.
    encode_question = infersent.encode([question], tokenize=True)
    eucl = eucl_sim(dict_embeddings, encode_question)

    return sentences, eucl

Example #8

0

Show file

File: tsv2npz.py Project: Xilong-Zhang/RUSE

def load_model(FLAGS):
    """Instantiate the sentence-representation model chosen by ``FLAGS.sr_model``.

    Supported values of ``FLAGS.sr_model``:
        'IS'  -- InferSent v1; weights and GloVe vectors are looked up below
                 ``FLAGS.is_dir``.
        'QT'  -- Quick-Thought; one or more model configurations are read
                 from the JSON file ``FLAGS.model_config``.
        'USE' -- Universal Sentence Encoder loaded through ``hub.Module``.

    Returns:
        The loaded model object for the selected backend.

    Raises:
        ValueError: If ``FLAGS.sr_model`` is none of the values above.
    """
    if FLAGS.sr_model == 'IS':
        # Load InferSent
        MODEL_PATH = os.path.join(FLAGS.is_dir, 'encoder/infersent1.pkl')

        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        model = InferSent(params_model)
        model.load_state_dict(torch.load(MODEL_PATH))
        W2V_PATH = os.path.join(FLAGS.is_dir,
                                'dataset/GloVe/glove.840B.300d.txt')
        model.set_w2v_path(W2V_PATH)
    elif FLAGS.sr_model == 'QT':
        # Load Quick-Thought
        model = encoder_manager.EncoderManager()

        with open(FLAGS.model_config) as json_config_file:
            model_configs = json.load(json_config_file)
        # A single configuration object is treated as a one-element list.
        if isinstance(model_configs, dict):
            model_configs = [model_configs]

        for mdl_cfg in model_configs:
            # Separate name so the list being iterated is not rebound.
            encode_config = configuration.model_config(mdl_cfg, mode='encode')
            model.load_model(encode_config)
    elif FLAGS.sr_model == 'USE':
        model = hub.Module(
            'https://tfhub.dev/google/universal-sentence-encoder-large/2')
    else:
        # Without this branch the function failed on an unbound `model`.
        raise ValueError(
            'Unknown sr_model: {!r} (expected IS, QT or USE)'.format(
                FLAGS.sr_model))

    return model

Example #9

0

Show file

def infersent_embed_posts(posts, max_sent_cnt, embed_dim, data_fold_path):
    """Encode each post's sentences into a zero-padded 3-D array.

    Args:
        posts: Sequence of posts, each a sequence of sentences.
        max_sent_cnt: Number of sentence slots per post; sentences beyond
            this count are dropped.
        embed_dim: Size of the last axis of the output array.
        data_fold_path: Prefix joined by plain string concatenation with
            ``word_sent_embed/...`` to locate the encoder weights and the
            word vectors.

    Returns:
        Array of shape ``(len(posts), max_sent_cnt, embed_dim)``; unused
        sentence slots stay zero.
    """
    model_path = data_fold_path + 'word_sent_embed/infersent2.pickle'
    word_emb_path = data_fold_path + 'word_sent_embed/fasttext.vec'
    posts_arr = np.zeros((len(posts), max_sent_cnt, embed_dim))

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    })
    encoder.load_state_dict(torch.load(model_path))
    encoder.set_w2v_path(word_emb_path)

    # The vocabulary covers the sentences of all posts together.
    every_sentence = [sentence for post in posts for sentence in post]
    encoder.build_vocab(every_sentence, tokenize=False)

    for post_idx, post_sentences in enumerate(posts):
        vectors = encoder.encode(post_sentences, tokenize=False, verbose=False)
        kept = min(max_sent_cnt, len(post_sentences))
        posts_arr[post_idx, :kept, :] = vectors[:kept]

    return posts_arr

Example #10

0

Show file

File: commons.py Project: codecoffeeme/kylo

def get_loaded_model(force_gpu=False, k_most_frequent_words=1000000):
    """Load the InferSent encoder selected by the module-level ``model_version``.

    The model is moved to the GPU whenever CUDA is available. The word-vector
    file depends on the version (GloVe for version 1, fastText otherwise),
    and the vocabulary is limited to the most frequent words.

    Args:
        force_gpu: When true, refuse to run without CUDA.
        k_most_frequent_words: Number of most frequent words whose embeddings
            are loaded into the vocabulary.

    Returns:
        The loaded ``InferSent`` model.

    Raises:
        GPUNotFoundException: If ``force_gpu`` is set and CUDA is unavailable.
    """
    cuda_available = torch.cuda.is_available()
    # Fail before the expensive checkpoint load rather than after it.
    if force_gpu and not cuda_available:
        raise GPUNotFoundException()

    model_path = "infersent/encoder/infersent{}.pkl".format(model_version)
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }

    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))

    if cuda_available:
        model = model.cuda()

    # If infersent1 -> use GloVe embeddings.
    # Any other version -> use fastText embeddings.
    if model_version == 1:
        W2V_PATH = 'infersent/dataset/GloVe/glove.840B.300d.txt'
    else:
        W2V_PATH = 'infersent/dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=k_most_frequent_words)
    return model

Example #11

0

Show file

File: text_classifier.py Project: f-data/ADD

class Encoder2:
    """Text encoder that delegates to a version-2 InferSent model."""

    WORD_VECTORS_FILE = 'crawl-300d-2M.vec'
    MODEL_FILE = 'infersent2.pkl'

    def __init__(self, word_vectors_dir, models_dir):
        """Load the encoder weights and register the word-vector file.

        ``WORD_VECTORS_FILE`` is looked up in ``word_vectors_dir`` and
        ``MODEL_FILE`` in ``models_dir``.
        """
        vectors_path = os.path.join(word_vectors_dir, self.WORD_VECTORS_FILE)
        weights_path = os.path.join(models_dir, self.MODEL_FILE)

        self.model = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 2,
        })
        self.model.load_state_dict(torch.load(weights_path))
        self.model.set_w2v_path(vectors_path)

    def start(self, texts):
        """Build the vocabulary from ``texts`` (read via ``texts.values.tolist()``)."""
        self.model.build_vocab(texts.values.tolist(), tokenize=True)

    def close(self):
        """Do nothing; there is nothing to release."""
        return None

    def encode(self, texts_batch):
        """Return the encoder output for ``texts_batch.values.tolist()``."""
        batch_as_list = texts_batch.values.tolist()
        return self.model.encode(batch_as_list, tokenize=True)

Example #12

0

Show file

File: sent_embed.py Project: JRChow/InferSent

 def load_pretrained_model(model_version):
     """Create an InferSent encoder of ``model_version`` with its weights loaded.

     The checkpoint path is obtained by formatting ``RAW_MODEL_PATH`` with
     ``model_version``. No word-vector path is set and no vocabulary is built.
     """
     checkpoint_path = RAW_MODEL_PATH % model_version
     encoder = InferSent({
         'bsize': 64,
         'word_emb_dim': 300,
         'enc_lstm_dim': 2048,
         'pool_type': 'max',
         'dpout_model': 0.0,
         'version': model_version,
     })
     encoder.load_state_dict(torch.load(checkpoint_path))
     return encoder

Example #13

0

Show file

def load_infersent():
    """Return a version-2 InferSent encoder with a 100000-word vocabulary.

    Weights are read from ``encoder/infersent2.pkl`` and word vectors from
    ``fastText/crawl-300d-2M.vec``, both relative to the working directory.
    """
    version = 2
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': version,
    })
    state = torch.load('encoder/infersent%s.pkl' % version)
    encoder.load_state_dict(state)
    encoder.set_w2v_path('fastText/crawl-300d-2M.vec')
    # Only the embeddings of the K most frequent words are loaded.
    encoder.build_vocab_k_words(K=100000)
    return encoder

Example #14

0

Show file

File: apply.py Project: smit14/SentEval

def apply_logician(s1, s2, is_list=False, sick_model=False):
    """Classify sentence pairs with a linear layer on top of InferSent features.

    Args:
        s1: First sides of the pairs.
        s2: Second sides of the pairs.
        is_list: Keep False when passing sentences directly (they are then
            converted with ``convert_str2lst``); set True when passing lists
            of lists of words, which are used as given.
        sick_model: True selects the SICK classifier (``./saved_sick.pth``),
            False the SNLI classifier (``./saved_snli_augment_ordered.pth``).
            The feature layout and the printed label order differ between
            the two.

    Returns:
        The softmax (over dimension 1) of the 3-class classifier output.
    """
    # Load InferSent model
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V,
    })
    encoder.load_state_dict(torch.load(MODEL_PATH))
    encoder.set_w2v_path(PATH_TO_W2V)

    params_senteval = {
        'task_path': PATH_TO_DATA,
        'usepytorch': True,
        'kfold': 5,
        'classifier': {
            'nhid': 0,
            'optim': 'rmsprop',
            'batch_size': 128,
            'tenacity': 3,
            'epoch_size': 2,
        },
        'infersent': encoder.cuda(),
    }

    if not is_list:
        s1 = convert_str2lst(s1)
        s2 = convert_str2lst(s2)
    samples = s1 + s2
    params_senteval['batch_size'] = min(128, len(s1))
    params_senteval = utils.dotdict(params_senteval)
    params_senteval.usepytorch = True

    prepare(params_senteval, samples)

    emb_s1 = batcher(params_senteval, s1)
    emb_s2 = batcher(params_senteval, s2)
    product = emb_s1 * emb_s2
    abs_diff = np.abs(emb_s1 - emb_s2)

    if sick_model:
        features = np.c_[abs_diff, product]
        checkpoint = torch.load('./saved_sick.pth')
        print('[Contradiction  Neutral  Entailment]')
    else:
        features = np.c_[emb_s1, emb_s2, product, abs_diff]
        checkpoint = torch.load('./saved_snli_augment_ordered.pth')
        print('[ Entailment  Neutral Contradiction ]')

    num_classes = 3
    classifier = nn.Sequential(nn.Linear(features.shape[1], num_classes),).cuda()
    classifier.load_state_dict(checkpoint)

    logits = classifier(torch.FloatTensor(features).cuda())
    return nn.Softmax(1)(logits)

Example #15

0

Show file

File: encode_sentence.py Project: contemn1/sentence_evaluation

def resume_model(model_path, dict_path, version, use_cuda):
    """Restore an InferSent encoder of ``version`` from ``model_path``.

    The checkpoint tensors are mapped to the CUDA device when ``use_cuda``
    is true and to the CPU otherwise. ``dict_path`` is registered as the
    word-vector file; no vocabulary is built.

    Returns:
        The restored ``InferSent`` instance.
    """
    target = 'cuda' if use_cuda else 'cpu'
    device = torch.device(target)

    infer_sent = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': version,
    })
    state = torch.load(model_path, map_location=device)
    infer_sent.load_state_dict(state)
    infer_sent.set_w2v_path(dict_path)
    return infer_sent

Example #16

0

Show file

def init_models(vocal_size: int = VOCAB_SIZE):
    """Load the InferSent encoder configured by the module-level constants.

    Args:
        vocal_size: Number of most frequent words whose embeddings are loaded
            into the vocabulary. Defaults to ``VOCAB_SIZE``.

    Returns:
        The loaded model, moved to the GPU when ``USE_CUDA`` is true.
    """
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': VERSION
    })
    model.load_state_dict(torch.load(MODEL_PATH))
    model = model.cuda() if USE_CUDA else model

    model.set_w2v_path(VECTOR_PATH)
    # Honour the argument: the global VOCAB_SIZE used to be passed here
    # unconditionally, so the parameter had no effect.
    model.build_vocab_k_words(K=vocal_size)
    return model

Example #17

0

Show file

def infersent_flat_embed_posts(posts, embed_dim, data_fold_path):
    """Encode ``posts`` directly with a version-2 InferSent encoder.

    Args:
        posts: Texts used both to build the vocabulary and as encoder input.
        embed_dim: Not used by this function.
        data_fold_path: Prefix joined by plain string concatenation with
            ``word_sent_embed/...`` to locate the encoder weights and the
            word vectors.

    Returns:
        The encoder output for ``posts``.
    """
    weights_file = data_fold_path + 'word_sent_embed/infersent2.pickle'
    vectors_file = data_fold_path + 'word_sent_embed/fasttext.vec'

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    })
    encoder.load_state_dict(torch.load(weights_file))
    encoder.set_w2v_path(vectors_file)
    encoder.build_vocab(posts, tokenize=False)

    embeddings = encoder.encode(posts, tokenize=False, verbose=False)
    return embeddings

Example #18

0

Show file

File: sentence_embed.py Project: apoorvab93/Autodocs

def load_infersent_model():
    """Load the version-2 InferSent encoder stored next to this source file.

    Weights are read from ``encoder/infersent2.pkl`` and word vectors from
    ``fastText/crawl-300d-2M.vec``, both relative to the directory of this
    file. No vocabulary is built.
    """
    here = dirname(os.path.realpath(__file__))

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    })
    checkpoint = os.path.join(here, 'encoder', 'infersent2.pkl')
    encoder.load_state_dict(torch.load(checkpoint))

    encoder.set_w2v_path(os.path.join(here, 'fastText', 'crawl-300d-2M.vec'))
    return encoder

Example #19

0

Show file

def load_model():
    """Return a version-1 InferSent encoder with a 100000-word vocabulary.

    Weights come from ``encoder/infersent1.pkl`` and word vectors from
    ``GloVe/glove.840B.300d.txt``. Progress messages are printed around the
    vocabulary construction.
    """
    model_version = 1
    checkpoint = "encoder/infersent%s.pkl" % model_version

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    encoder.load_state_dict(torch.load(checkpoint))
    encoder.set_w2v_path('GloVe/glove.840B.300d.txt')

    print('building vocab')
    encoder.build_vocab_k_words(K=100000)
    print('done building vocab')
    return encoder

Example #20

0

Show file

class Infersent:
    """Small wrapper that encodes sentences with a version-2 InferSent model."""

    def __init__(self):
        """Load ``encoder/infersent2.pkl`` and register the fastText vectors."""
        version = 2
        checkpoint = 'encoder/infersent%s.pkl' % version

        self.infersent = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        })
        self.infersent.load_state_dict(torch.load(checkpoint))
        self.infersent.set_w2v_path('fastText/crawl-300d-2M.vec')

    def get(self, sentences):
        """Build the vocabulary from ``sentences`` and return their encodings."""
        encoder = self.infersent
        encoder.build_vocab(sentences, tokenize=True)
        return encoder.encode(sentences, tokenize=True)

Example #21

0

Show file

def embed_dataset(dataset_path, infersent_path, force_cpu=False):
    """
    Compute one averaged InferSent v1 embedding per intent of a dataset.

    To make this work, first run ./get_infersent.sh

    ``dataset_path / 'train.csv'`` is read (the first row is skipped as the
    header) and its utterances are grouped by intent. Only paths whose
    string form contains ``snips`` are supported; any data row of another
    dataset raises ``TypeError``. Both path arguments are combined with
    ``/``, so they need to support that operator.

    Returns a dict mapping each intent to the mean (over axis 0) of the
    embeddings of its utterances.
    """
    checkpoint = infersent_path / "encoder/infersent1.pkl"
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 1,
    })
    if force_cpu:
        state = torch.load(checkpoint, map_location='cpu')
        encoder.load_state_dict(state)
    else:
        encoder.load_state_dict(torch.load(checkpoint))
        encoder.cuda()

    encoder.set_w2v_path(infersent_path / 'GloVe/glove.840B.300d.txt')
    encoder.build_vocab_k_words(K=100000)

    rows = read_csv(dataset_path / 'train.csv')[1:]  # skip header
    is_snips = 'snips' in str(dataset_path)

    utterances_by_intent = defaultdict(list)
    for row in rows:
        if not is_snips:
            raise TypeError(
                "Unknown dataset type. Implement your own first. See the "
                "README")
        utterance, _labels, _delexicalised, intent = row
        utterances_by_intent[intent].append(utterance)

    vectors = {}
    intent_count = len(utterances_by_intent)
    for position, (intent, utterances) in enumerate(
            utterances_by_intent.items()):
        print('{}/{} done'.format(position, intent_count))
        vectors[intent] = np.mean(encoder.encode(utterances), axis=0)

    return vectors

Example #22

0

Show file

File: reuters-Ensemble-Methods.py Project: song6cy/one-class-text-classification-using-ensemble-approach

def infersent_embeddings():
    """Encode the train and test texts with a version-1 InferSent encoder.

    The InferSent checkout under ``/opt/notebooks/...`` is appended to
    ``sys.path`` so ``models.InferSent`` can be imported. The texts are taken
    from the ``'text'`` column of the module-level ``final_train`` and
    ``final_test`` objects and are encoded without tokenization.

    Returns:
        A tuple ``(train_embeddings, test_embeddings)``.
    """
    sys.path.append(
        '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master')
    # Load model
    from models import InferSent

    model_version = 1
    MODEL_PATH = "/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/encoder/infersent%s.pkl" % model_version
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    encoder.load_state_dict(torch.load(MODEL_PATH))

    # The model stays on the CPU unless this switch is flipped.
    use_cuda = False
    if use_cuda:
        encoder = encoder.cuda()

    # Version 1 uses the GloVe file, any other version the fastText file.
    if model_version == 1:
        W2V_PATH = '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/glove.840B.300d-003.txt'
    else:
        W2V_PATH = '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/fastText/crawl-300d-2M.vec'
    encoder.set_w2v_path(W2V_PATH)
    # Load embeddings of K most frequent words
    encoder.build_vocab_k_words(K=100000)

    train_texts = final_train['text'].tolist()
    train_data_list = encoder.encode(train_texts,
                                     bsize=128,
                                     tokenize=False,
                                     verbose=True)
    print('nb sentences encoded : {0}'.format(len(train_data_list)))

    test_texts = final_test['text'].tolist()
    test_data_list = encoder.encode(test_texts,
                                    bsize=128,
                                    tokenize=False,
                                    verbose=True)
    print('nb sentences encoded : {0}'.format(len(test_data_list)))
    return train_data_list, test_data_list

Example #23

0

Show file

def no_stopwords():
    """Embed three batches of texts after removing stop words and save them.

    For each batch returned by ``get_batch_from_dataframe``, stop-word tokens
    (as flagged by ``nlp``) are dropped, the InferSent vocabulary is built
    (first batch) or updated (later batches), and the texts are encoded. The
    embeddings with their ``set`` and ``catg`` values are collected into one
    DataFrame that is written to a fixed CSV path. The module-level
    ``current_idx`` is advanced after every batch, and the elapsed time is
    printed.
    """
    global current_idx

    infersent2 = InferSent(params_model)
    infersent2.load_state_dict(torch.load(MODEL_PATH))
    infersent2.set_w2v_path(W2V_PATH)
    use_cuda = True
    # Move the model that was just loaded. This line used to read a
    # different name (`infersent`), which discarded the loaded weights.
    infersent2 = infersent2.cuda() if use_cuda else infersent2

    batch_frames = []
    start = time.time()
    for x in range(3):
        crix = current_idx
        abss, catg, sets, crix = get_batch_from_dataframe(crix)
        for index in range(len(abss)):
            doc = nlp(abss[index])
            abss[index] = ' '.join(
                token.text for token in doc if not token.is_stop)

        if x == 0:
            infersent2.build_vocab(abss, tokenize=True)
        else:
            infersent2.update_vocab(abss, tokenize=True)

        embed = infersent2.encode(abss, tokenize=True)
        batch_frames.append(pd.DataFrame({
            'embds': embed.tolist(),
            'set': sets,
            'catg': catg
        }))

        current_idx = crix

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    pdss = pd.concat(batch_frames, ignore_index=True)
    end = time.time() - start
    print("Time without stopwords", end)
    pdss.to_csv("/home/psrivastava/Intern_Summer/data/embeds_no_stopwords.csv")

Example #24

0

Show file

    def init_infersent_model(self):
        """Load the version-1 InferSent encoder and store it on ``self.model``.

        Weights come from ``encoder/infersent1.pkl``; the vocabulary is
        limited to the 100000 most frequent words of the word-vector file.
        """
        model_version = 1
        checkpoint = "encoder/infersent%s.pkl" % model_version

        encoder = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version,
        })
        encoder.load_state_dict(torch.load(checkpoint))

        # The model stays on the CPU unless this switch is flipped.
        use_cuda = False
        if use_cuda:
            encoder = encoder.cuda()

        # Version 1 uses the GloVe file, any other version the fastText file.
        if model_version == 1:
            vectors_path = 'GloVe/glove.840B.300d.txt'
        else:
            vectors_path = 'fastText/crawl-300d-2M.vec'
        encoder.set_w2v_path(vectors_path)

        # Load embeddings of K most frequent words
        encoder.build_vocab_k_words(K=100000)
        self.model = encoder

Example #25

0

Show file

File: get_sentence_embeddings.py Project: ganesh292/Quora_QuestionPairs

def infersent_glove():
    """Return a GPU-resident version-1 InferSent encoder using GloVe vectors.

    Weights are read from ``/tmp/GloVe/encoder/infersent1.pkl`` and the
    vocabulary is limited to the 100000 most frequent words.
    """
    version = 1
    checkpoint = '/tmp/GloVe/encoder/infersent%s.pkl' % version

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': version,
    })
    encoder.load_state_dict(torch.load(checkpoint))

    # The model is moved to the GPU unless this switch is turned off.
    use_cuda = True
    if use_cuda:
        encoder = encoder.cuda()

    # Both branches point at a GloVe file; only its location differs.
    if version == 1:
        vectors_path = '/tmp/GloVe/glove.840B.300d.txt'
    else:
        vectors_path = '/home/ganesh/Quora_dev/tmp/GloVe/glove.840B.300d.txt'
    encoder.set_w2v_path(vectors_path)

    # Load embeddings of K most frequent words
    encoder.build_vocab_k_words(K=100000)
    return encoder

Example #26

0

Show file

# Module-level InferSent setup shared by the helper functions below.
# Fetch the encoder weights first, e.g.:
#   curl -Lo encoder/infersent2.pkl https://s3.amazonaws.com/senteval/infersent/infersent2.pkl
# Next, set the W2V_PATH variable below and the clickbait sentences accordingly.
# The output is a numpy array in a 4096-dimensional space.
name = "InferSent"
V = 2
MODEL_PATH = '/data/InferSent/encoder/infersent{}.pkl'.format(V)
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

# NOTE(review): V is 2 but this path points at GloVe vectors — confirm that
# this pairing is intended.
W2V_PATH = '/home/saradhix/glove.840B/glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)


def get_vector(sentence):
    """Encode one sentence by delegating to ``get_vectors`` with a 1-item list."""
    single_item_batch = [sentence]
    return get_vectors(single_item_batch)


def get_vectors(sentences):
    """Lower-case ``sentences``, build the vocabulary from them and encode them.

    Uses the module-level ``infersent`` encoder; the vocabulary is rebuilt
    from the given sentences on every call.
    """
    lowered = []
    for text in sentences:
        lowered.append(text.lower())
    infersent.build_vocab(lowered, tokenize=True)
    return infersent.encode(lowered, tokenize=True)

Example #27

0

Show file

File: run_senteval.py Project: codeaudit/DeCLUTR

def infersent(
    path_to_senteval: str,
    path_to_vectors: str,
    output_filepath: str = None,
    cuda_device: int = -1,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates an InferSent model against the SentEval benchmark
    (see: https://github.com/facebookresearch/InferSent for information on the pre-trained model).
    Adapted from: https://github.com/facebookresearch/SentEval/blob/master/examples/infersent.py.

    Args:
        path_to_senteval: Passed to the SentEval setup and run helpers.
        path_to_vectors: Word-vector file registered with the encoder.
        output_filepath: Passed through to ``_run_senteval``.
        cuda_device: Passed to ``_get_device`` to pick the torch device.
        prototyping_config: Passed through to ``_setup_senteval``.
        verbose: Passed through to ``_setup_senteval``.
    """
    from models import InferSent

    def prepare(params, samples):
        # Build the encoder vocabulary from the (cleaned) task samples.
        samples = _cleanup_batch(samples)
        params.infersent.build_vocab([" ".join(tokens) for tokens in samples],
                                     tokenize=False)

    def batcher(params, batch):
        # Encode one (cleaned) batch of token lists as joined sentences.
        batch = _cleanup_batch(batch)
        sentences = [" ".join(tokens) for tokens in batch]
        embeddings = params.infersent.encode(sentences,
                                             bsize=params.batch_size,
                                             tokenize=False)
        return embeddings

    # Determine the torch device
    device = _get_device(cuda_device)

    # Load InferSent model
    # TODO (John): Hardcoded these to move things along, but that should be fixed.
    V = 2
    MODEL_PATH = "resources/encoder/infersent%s.pkl" % V
    params_model = {
        "bsize": 64,
        "word_emb_dim": 300,
        "enc_lstm_dim": 2048,
        "pool_type": "max",
        "dpout_model": 0.0,
        "version": V,
    }
    infersent = InferSent(params_model)
    # Map the checkpoint onto the selected device while loading, so that a
    # checkpoint saved on a GPU can still be read on a CPU-only run.
    # NOTE(review): assumes _get_device returns a torch.device or device
    # string, as map_location requires — confirm.
    infersent.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    infersent.to(device)
    # Load and initialize the model with word vectors
    infersent.set_w2v_path(path_to_vectors)

    trainable_params = sum(p.numel() for p in infersent.parameters()
                           if p.requires_grad)
    typer.secho(
        (f"{SUCCESS} Loaded InferSent model {MODEL_PATH}"
         f" with {trainable_params} trainable parameters."),
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Performs a few setup steps and returns the SentEval params
    params_senteval = _setup_senteval(path_to_senteval, prototyping_config,
                                      verbose)
    params_senteval["infersent"] = infersent
    _run_senteval(params_senteval, path_to_senteval, batcher, prepare,
                  output_filepath)

Example #28

0

Show file

File: evaluate.py Project: zmddzf/BinarySentEmb


# define senteval params
# 'classifier' holds the settings of the classifier used during evaluation.
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5,
                   'classifier' :{'nhid': 0, 'optim': 'adam', 'batch_size': 64,
                                  'tenacity': 5, 'epoch_size': 4}
                }
# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    # Load InferSent model (version 1) with the continuous-encoder weights
    # and register the word-vector file.
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
    model = InferSent(params_model)
    model.load_state_dict(torch.load(PATH_TO_CONT_ENCODER))
    model.set_w2v_path(PATH_TO_W2V)

    # Pick the second-stage encoder from config.encoder_type.
    # NOTE(review): there is no else branch, so dis_encoder stays undefined
    # for any other encoder_type — confirm this is handled further down.
    model_name = config.encoder_type
    if config.encoder_type == 'AE':
        # Only the auto-encoder loads trained weights and extends model_name.
        dis_encoder = DisEnc.LinearAutoEncoder(config.dim)
        dis_encoder.load_state_dict(torch.load(PATH_TO_B_ENCODER))
        model_name = model_name + '_' + config.model_name #+'V'+str(config.INFERSENT_VERSION)
    elif config.encoder_type == 'PCA':
        dis_encoder = DisEnc.PCAEncoder(config.dim,config.PCA_LOAD_PATH)
    elif config.encoder_type == 'Random':
        dis_encoder = DisEnc.RandomEncoder(config.dim,config.RAN_LOAD_PATH)
    elif config.encoder_type == 'Id':
        dis_encoder = DisEnc.IdEncoder()
    elif config.encoder_type == 'HT':
        # NOTE(review): HT reuses RAN_LOAD_PATH, the same path as 'Random' — confirm.
        dis_encoder = DisEnc.HTEncoder(config.RAN_LOAD_PATH)

Example #29

0

Show file

    for line in f:
        sentences.append(line.strip())

# Load the version-1 InferSent encoder.
V = 1
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}

model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
# Word vectors come from a local file rather than a stock GloVe/fastText path.
W2V_PATH = 'final_text_vectors.txt'
model.set_w2v_path(W2V_PATH)
# The vocabulary is built from the sentences collected above.
model.build_vocab(sentences, tokenize=True)  #build_vocab_k_words(K=100000)

embeddings = model.encode(
    sentences,
    tokenize=True)  #(sentences, bsize=168, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

# Normalize the embeddings and wrap them as a torch Variable.
sen_vec = preprocessing.normalize(embeddings)
sen_vec = Variable(torch.from_numpy(sen_vec))
#sen_vec = nn.Linear(4096,300)
# NOTE: `model` is rebound here; the InferSent encoder is no longer reachable
# under this name after this line.
model = net()
# A zero array of shape (1, 300).
n = (1, 300)
nparray = np.zeros(n)

Example #30

0

Show file

File: questionanswering.py Project: Peter-32/questionanswering1

import wikipedia
import numpy as np
from models import InferSent
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import StandardScaler, MinMaxScaler
#### Parameters
# The question to answer is a required command-line argument.
# NOTE(review): `argparse` (and `torch`, used below) are not among the imports
# visible in this fragment — confirm they are imported earlier.
PARSER = argparse.ArgumentParser(description='Ask a question')
PARSER.add_argument('--question', metavar='string', required=True, help="The question you want answered")
ARGS = PARSER.parse_args()
question = ARGS.question
# The question is always element 0 of `sentences`.
sentences = [question]
#### Load Facebook's InferSent (download the files from the internet)
infersent = InferSent({'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1})
infersent.load_state_dict(torch.load('/Users/petermyers/Desktop/Other/data/InferSent/encoder/infersent1.pkl'))
infersent.set_w2v_path('/Users/petermyers/Desktop/Other/data/GloVe/glove.840B.300d.txt')

# Extract the most relevant Wikipedia page
#### Wikipedia recommends 10 pages
wikipedia_pages = wikipedia.search(question)
# The search results are appended after the question.
sentences = sentences + wikipedia_pages
#### Convert sentences to numbers
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True, verbose=False)
#### Choose the most relevant pages
# Pairwise Euclidean distances between all embeddings, as a square matrix.
distances = pdist(np.array(embeddings), metric='euclidean')
sentence_similarity_matrix = squareform(distances)
# Row 0 holds the distances from the question; dropping its first entry (the
# question itself) makes the sorted indices line up with `wikipedia_pages`,
# closest first.
most_relevant_pages = np.argsort(sentence_similarity_matrix[0][1:])
#### Extract the content on the most relevant page (tries multiple pages in case of failure)
for page in most_relevant_pages:

Example #31

0

Show file

File: extract_features.py Project: chingyi-lin/NLP-commitment-search

                        help='Passes tokenize=True to build_vocab()')
    parser.add_argument('-o', '--out-dir', type=str, required=True,
                        help='Output folder to save feature files')
    parser.add_argument('-c', '--cpu', action='store_true',
                        help='Use CPU instead of GPU.')
    parser.add_argument('-b', '--batch-size', type=int, default=64,
                        help='Batch size (default: 64)')
    parser.add_argument('files', nargs='+',
                        help='List of files to extract sentence embeddings')

    args = parser.parse_args()

    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': args.version}
    model = InferSent(params_model)
    model.load_state_dict(torch.load(args.model_path))

    if not args.cpu:
        model = model.cuda()

    model.set_w2v_path(args.w2v_path)

    # Ensure directory
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    # Read files and extract features
    for fpath in args.files:
        print('Reading file {}'.format(fpath))
        sents = []
        with open(fpath) as f: