def prepare_infersent(params, texts):
    import logging
    import os
    import torch
    from infersent.models import InferSent
    texts = [' '.join(text) for text in texts]
    V = 2
    MODEL_PATH = 'infersent/encoder/infersent%s.pickle' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    W2V_PATH = 'infersent/dataset/fastText/crawl-300d-2M-subword.vec'
    if params['infersent']['trained']:
        logging.info("Loading pretrained weights")
        infersent.load_state_dict(torch.load(MODEL_PATH))
        infersent.set_w2v_path(W2V_PATH)
    else:
        RANDOM_PATH = "infersent/dataset/fastText/random.vec"
        if not os.path.isfile(RANDOM_PATH):
            random_embeddings(W2V_PATH, RANDOM_PATH)
            logging.info("Written random word embeddings to file")
        infersent.set_w2v_path(RANDOM_PATH)
    infersent.build_vocab(texts, tokenize=False)
    params['encoder'] = infersent
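
A minimal usage sketch for the function above; the params layout ('infersent' -> 'trained') is taken from the function body, while the token lists and the follow-up encode() call are illustrative:

params = {'infersent': {'trained': True}}
texts = [['a', 'man', 'is', 'walking'], ['two', 'dogs', 'are', 'running']]
prepare_infersent(params, texts)
encoder = params['encoder']  # the prepared InferSent model
# tokenize=False matches the build_vocab call inside prepare_infersent
embeddings = encoder.encode([' '.join(t) for t in texts], tokenize=False)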
Example 2
    def load_infersent_model(self) -> nn.Module:
        # TODO: This module should not know such high-level info...
        project_path = self.config.firelab.project_path
        infersent_config = self.config.metrics.infersent
        repo_path = os.path.join(project_path, infersent_config.repo_path)
        model_path = os.path.join(project_path, infersent_config.model_path)
        fasttext_path = os.path.join(project_path,
                                     infersent_config.fasttext_path)

        # TODO: Is there any other way to load model class?
        sys.path.append(os.path.dirname(repo_path))
        from infersent.models import InferSent

        print('Loading InferSent model...', end='')
        # INFERSENT_PARAMS and VOCAB_SIZE are assumed to be module-level
        # constants; see the sketch after this snippet.
        model = InferSent(INFERSENT_PARAMS).to(self.config.device_name)
        model.load_state_dict(torch.load(model_path))
        print('Done!')

        print('Loading fastText embeddings...', end='')
        model.set_w2v_path(fasttext_path)
        print('Done!')

        print('Building vocab...', end='')
        model.build_vocab_k_words(K=VOCAB_SIZE)
        print('Done!')

        return model
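
The method above references INFERSENT_PARAMS and VOCAB_SIZE without defining them; they are presumably module-level constants. A sketch of plausible values, mirroring the standard InferSent v2 configuration (the real values live elsewhere in the project):

INFERSENT_PARAMS = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
VOCAB_SIZE = 100000  # assumed; matches the K=100000 used in other examples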
Example 3
def infersent_encoding(fidlist, reflist, predlist):
    import pickle
    import torch
    from infersent.models import InferSent
    model_version = 1
    model_path = "infersent/encoder/infersent%s.pkl" % model_version
    params_model = {'bsize':64, 'word_emb_dim':300, 'enc_lstm_dim':2048, 'pool_type':'max',
                    'dpout_model':0.0, 'version':model_version}
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    glove_path = 'infersent/GloVe/glove.840B.300d.txt'
    fasttext_path = 'infersent/fastText/crawl-300d-2M.vec'
    w2v_path = glove_path if model_version == 1 else fasttext_path
    model.set_w2v_path(w2v_path)
    model.build_vocab_k_words(K=100000)
    cosine_score_dict = dict()
    euclidean_distance_dict = dict()

    for fid, ref, pred in zip(fidlist, reflist, predlist):
        ref = ' '.join(ref).strip()
        pred = ' '.join(pred).strip()
        if pred == '':
            # An empty prediction cannot be encoded; score it as 0 in both
            # dictionaries so the two stay aligned.
            cosine_score_dict[fid] = 0
            euclidean_distance_dict[fid] = 0
            continue
        data = [ref, pred]
        data_emb = model.encode(data)
        
        css = cosine_similarity_score(data_emb[0].reshape(1, -1), data_emb[1].reshape(1, -1))[0][0]
        ess = euclidean_distance_score(data_emb[0].reshape(1, -1), data_emb[1].reshape(1, -1))[0][0]
        cosine_score_dict[fid] = css
        euclidean_distance_dict[fid] = ess
    # outpath is assumed to be defined at module level in the original script.
    pickle.dump(cosine_score_dict, open(outpath + '/iS_cosine.pkl', 'wb'))
    pickle.dump(euclidean_distance_dict, open(outpath + '/iS_euclidean.pkl', 'wb'))
    cosine_score_list = list(cosine_score_dict.values())
    euclidean_distance_list = list(euclidean_distance_dict.values())
    avg_css = sum(cosine_score_list)/len(cosine_score_list)
    avg_ess = sum(euclidean_distance_list)/len(euclidean_distance_list)

    ret = 'for %s functions\n' % len(predlist)
    ret += 'Cosine similarity score with sentence infersent encoder embedding is %s\n' % round(avg_css * 100, 2)
    ret += 'Euclidean distance score with sentence infersent encoder embedding is %s\n' % round(avg_ess * 100, 2)
    return ret
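
cosine_similarity_score and euclidean_distance_score are not defined in this snippet. Since the results are indexed with [0][0], they most likely wrap scikit-learn's pairwise functions; a hedged sketch of what they could look like:

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

def cosine_similarity_score(u, v):
    # Assumed helper: pairwise cosine similarity, returns a 2-D array.
    return cosine_similarity(u, v)

def euclidean_distance_score(u, v):
    # Assumed helper: pairwise Euclidean distance, returns a 2-D array.
    return euclidean_distances(u, v)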
Example 4
class InfersentSimilarityUtils:
    # Assumes module-level imports (numpy as np, torch, spaCy's English,
    # InferSent) and constants MODEL_PARAMS, MODEL_PATH and W2V_PATH.
    def __init__(self):
        self.model = InferSent(MODEL_PARAMS)
        self.model.load_state_dict(torch.load(MODEL_PATH))
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def sentencize(self, input_string):
        """Produces a list of sentences"""
        # spaCy 2.x API; building the pipeline on every call is wasteful,
        # so caching nlp as an instance attribute would be preferable.
        nlp = English()
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        doc = nlp(input_string)
        sentences = [s.text.strip() for s in doc.sents if s.text.strip() != '']
        return sentences

    def cosine(self, u, v):
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

    def get_similarity(self, sentence1, sentence2):
        encoding1 = self.model.encode([sentence1])[0]
        encoding2 = self.model.encode([sentence2])[0]
        similarity = self.cosine(encoding1, encoding2)
        return similarity
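
Hypothetical usage of the class above (MODEL_PARAMS, MODEL_PATH and W2V_PATH must exist at module level before instantiation; the sentences are illustrative):

utils = InfersentSimilarityUtils()
score = utils.get_similarity('A man is walking.', 'Someone walks down the street.')
print(round(float(score), 3))  # cosine similarity in [-1, 1]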
Example 5
def encode_infersent(texts, trained=True, tokenize=True):
    import logging
    import os
    import torch
    from infersent.models import InferSent
    V = 2
    MODEL_PATH = 'infersent/encoder/infersent%s.pickle' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    W2V_PATH = 'infersent/dataset/fastText/crawl-300d-2M-subword.vec'
    if trained:
        logging.info("Loading pretrained weights")
        infersent.load_state_dict(torch.load(MODEL_PATH))
        infersent.set_w2v_path(W2V_PATH)
    else:
        RANDOM_PATH = "infersent/dataset/fastText/random.vec"
        if not os.path.isfile(RANDOM_PATH):
            random_embeddings(W2V_PATH, RANDOM_PATH)
            logging.info("Written random word embeddings to file")
        infersent.set_w2v_path(RANDOM_PATH)
    infersent.build_vocab(texts, tokenize=tokenize)
    return torch.tensor(infersent.encode(texts, tokenize=tokenize))
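
A short usage sketch for the function above; the sentences are illustrative:

embeddings = encode_infersent(['a man is walking', 'a woman is running'])
print(embeddings.shape)  # torch.Size([2, 4096]): 2 * 2048 for the BiLSTM with max pooling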
Example 6
    def __init__(self):
        self.model = InferSent(MODEL_PARAMS)
        self.model.load_state_dict(torch.load(MODEL_PATH))
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)
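
This fragment, like Example 4, assumes module-level constants; a sketch with hypothetical values and paths:

MODEL_PARAMS = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
MODEL_PATH = 'infersent/encoder/infersent2.pkl'    # hypothetical location
W2V_PATH = 'infersent/fastText/crawl-300d-2M.vec'  # hypothetical location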
Example 7
            # Fragment: extend the fastText embedding matrix with three extra
            # rows (all zeros, all -1s, all +1s), presumably reserved for
            # padding/special tokens, then convert to a torch tensor.
            # fasttext_vectors and USE_CUDA are assumed to be defined earlier.
            fasttext_tensors = fasttext_vectors
            fasttext_tensors = np.vstack([fasttext_tensors, np.zeros(300)])
            fasttext_tensors = np.vstack([fasttext_tensors, -1 * np.ones(300)])
            fasttext_tensors = np.vstack([fasttext_tensors, np.ones(300)])
            fasttext_tensors = torch.Tensor(fasttext_tensors)
            if USE_CUDA:
                fasttext_tensors = fasttext_tensors.cuda()

if INPUT_EMBEDDING in ['INFERSENT']:
    import torch
    from infersent.models import InferSent
    #from InferSent import models
    V = 2
    MODEL_PATH = 'infersent/encoder/infersent%s.pkl' % V
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    #infersent = models.InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'infersent/fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)

    # get core sentences

    X = []
    print('Building InferSent vocabulary...')
    # input_filename is assumed to be defined earlier in the script.
    with open(input_filename, 'r') as f:
        for line in f:
            sentence = line.strip()  # strip() also removes the trailing newline
            if len(sentence) > 1:
                X.append(sentence)
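
The snippet stops after collecting the sentences. A plausible continuation (not in the original) would build the vocabulary from X and encode it:

    infersent.build_vocab(X, tokenize=True)
    embeddings = infersent.encode(X, tokenize=True)
    print(embeddings.shape)  # (len(X), 4096)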