예제 #1
0
class _InferSent:
    """Small wrapper around a pretrained InferSent sentence encoder.

    The constructor loads the version-1 weights from ``encoder/`` and points
    the encoder at a word-vector file. The other methods forward to the
    wrapped encoder with ``tokenize=True``.
    """

    def __init__(self):
        # Dependencies are imported at construction time, not at module import.
        from InferSent.models import InferSent
        import torch

        version = 1
        weights_path = 'encoder/infersent%s.pkl' % version
        # NOTE(review): version-1 weights are combined with fastText vectors
        # here; InferSent setups usually pair version 1 with GloVe and
        # version 2 with fastText -- confirm this combination is intended.
        vectors_path = 'fastText/crawl-300d-2M.vec'

        encoder = InferSent({
            'bsize': 256,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        })
        encoder.load_state_dict(torch.load(weights_path))
        encoder.set_w2v_path(vectors_path)
        self.infersent = encoder

    def build_vocab(self, queries):
        """Build the encoder vocabulary from ``queries`` (tokenized)."""
        self.infersent.build_vocab(queries, tokenize=True)

    def update_vocab(self, text):
        """Extend the encoder vocabulary with the words of ``text``."""
        self.infersent.update_vocab(text, tokenize=True)

    def predict(self, text):
        """Return the encoder output for ``text``.

        The vocabulary is not updated here; call ``build_vocab`` or
        ``update_vocab`` beforehand.
        """
        encoded = self.infersent.encode(text, tokenize=True)
        return encoded
예제 #2
0
    def compute_intent_vectors(self, sentences):
        """Return one averaged InferSent embedding per intent.

        ``sentences`` is passed to ``self.get_utterances_dict``; the result is
        iterated as ``intent -> utterances``. Each intent's utterances are
        encoded and their embeddings are averaged along axis 0.

        Returns:
            dict mapping each intent to the mean of its utterance embeddings.
        """
        # TODO IMPLEMENT CACHING!
        from InferSent.models import InferSent

        infersent_folder = Path('./InferSent')
        weights_file = Path(infersent_folder / 'encoder' / 'infersent1.pkl')
        word_vectors_file = infersent_folder / 'GloVe' / 'glove.840B.300d.txt'

        encoder = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1,
        })
        encoder.load_state_dict(torch.load(weights_file))
        # Run on the GPU whenever one is available.
        if torch.cuda.is_available():
            encoder.cuda()
        encoder.set_w2v_path(word_vectors_file)
        encoder.build_vocab_k_words(K=100000)

        utterances_dict = self.get_utterances_dict(sentences)
        intent_count = len(utterances_dict.items())

        vectors = {}
        for position, (intent, utterances) in enumerate(
                utterances_dict.items(), start=1):
            # Progress is logged before the intent is actually encoded.
            LOGGER.info('{}/{} done'.format(position, intent_count))
            vectors[intent] = np.mean(encoder.encode(utterances), axis=0)

        return vectors
예제 #3
0
class UniversalSentenceEncoder:
    """Sentence-similarity scorer backed by a pretrained InferSent model.

    The constructor loads the InferSent weights and word vectors; the model is
    kept on ``self.model``.
    """

    def __init__(self):
        super().__init__()
        model_version = 1
        MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
        # Version 1 uses GloVe vectors, any other version the fastText file.
        W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'

        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(MODEL_PATH))
        self.model.eval()
        # Use the GPU when one is present instead of unconditionally calling
        # .cuda(), which raises on CPU-only hosts.
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def semantic_sim(self, sents1, sents2):
        """Score the pairwise similarity of two aligned sentence batches.

        Both batches are encoded with ``tokenize=False``, the embeddings are
        L2-normalised row by row, and the row-wise cosine similarity is
        converted to ``1 - arccos(cosine)``.

        Args:
            sents1: first batch, passed to ``self.model.encode``.
            sents2: second batch; row ``i`` is compared with row ``i`` of
                ``sents1``.

        Returns:
            numpy array with one score per sentence pair. An all-zero
            embedding yields NaN because of the division by its norm.
        """
        embed1 = self.model.encode(sents1, tokenize=False)
        embed2 = self.model.encode(sents2, tokenize=False)
        embed1 = torch.tensor(embed1)
        embed2 = torch.tensor(embed2)
        sts_encode1 = embed1 / torch.norm(embed1, p=2, dim=1, keepdim=True)
        sts_encode2 = embed2 / torch.norm(embed2, p=2, dim=1, keepdim=True)
        cosine_similarities = torch.sum(sts_encode1 * sts_encode2, dim=1)
        # Clamp so rounding error cannot push the value outside acos' domain.
        clip_cosine_similarities = torch.clamp(cosine_similarities, -1.0, 1.0)
        scores = 1.0 - torch.acos(clip_cosine_similarities)
        return scores.cpu().numpy()
def inferSent():
    """Encode the module-level ``dataset`` with InferSent version 2.

    Loads the pretrained weights and fastText vectors, builds the vocabulary
    from ``dataset``, encodes every entry and visualizes one randomly chosen
    entry.

    Returns:
        The value returned by ``infersent.encode`` for the whole dataset.
    """
    import random
    # presumably imported so a missing nltk fails early -- confirm it is needed
    import nltk
    # nltk.download('punkt')
    from InferSent.models import InferSent
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    MODEL_PATH = 'encoder/infersent2.pkl'
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0,
        'version': 2
    }
    infersent = InferSent(params_model).to(device)
    # map_location lets a checkpoint saved on another device load here too.
    infersent.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    print('set w2v')

    infersent.build_vocab(dataset, tokenize=True)
    embeddings = infersent.encode(dataset, bsize=64, tokenize=True)
    # randrange excludes the upper bound, so idx is always a valid index;
    # the stdlib randint(0, len(dataset)) could return len(dataset).
    idx = random.randrange(len(dataset))
    _, _ = infersent.visualize(dataset[idx])
    print('done')
    return embeddings
    def __init__(self,
                 bsize=64,
                 word_emb_dim=300,
                 enc_lstm_dim=2048,
                 pool_type='max',
                 dpout_model=0.0,
                 version=2,
                 model_path='../infersent/infersent2.pkl',
                 path_to_w2v='../fasttext/crawl-300d-2M.vec',
                 use_cuda=True):
        """Load a pretrained InferSent model and keep it on ``self.model``.

        The hyper-parameters are stored on the instance and also passed to
        ``InferSent``. Weights are read from ``model_path`` and the
        word-vector file is set to ``path_to_w2v``. With ``use_cuda`` true the
        model is moved to the GPU.
        """
        # Keep the configuration available on the instance.
        self.bsize = bsize
        self.word_emb_dim = word_emb_dim
        self.enc_lstm_dim = enc_lstm_dim
        self.pool_type = pool_type
        self.dpout_model = dpout_model
        self.version = version

        config = {
            'bsize': bsize,
            'word_emb_dim': word_emb_dim,
            'enc_lstm_dim': enc_lstm_dim,
            'pool_type': pool_type,
            'dpout_model': dpout_model,
            'version': version,
        }
        encoder = InferSent(config)
        encoder.load_state_dict(torch.load(model_path))
        encoder.set_w2v_path(path_to_w2v)

        self.model = encoder.cuda() if use_cuda else encoder

        self.first_call = True
예제 #6
0
def _load_pretrained_model(verbose=True):
    """Build an InferSent model from the module-level configuration.

    Uses ``_PARAMS_MODEL``, ``_MODEL_PATH``, ``_W2V_PATH`` and
    ``_K_WORDS_VOCAB``. When ``verbose`` is true the weights path is printed
    first.

    Returns:
        The InferSent model with weights loaded and vocabulary built.
    """
    if verbose:
        print(f">>> Loading pretrained model from {_MODEL_PATH}")

    model = InferSent(_PARAMS_MODEL)
    state = torch.load(_MODEL_PATH)
    model.load_state_dict(state)

    model.set_w2v_path(_W2V_PATH)
    model.build_vocab_k_words(K=_K_WORDS_VOCAB)
    return model
예제 #7
0
def get_infersent(V=2):
    '''
    Builds the infersent model using either GloVe or fastText

    V selects the pretrained encoder: 1 loads encoder/infersent1.pkl with
    GloVe vectors, 2 loads encoder/infersent2.pkl with fastText vectors.

    Returns the InferSent model with weights loaded and the word-vector path
    set; the vocabulary is not built here.

    Raises ValueError when V is neither 1 nor 2.
    '''
    # Validate first: an unsupported version used to leave W2V_PATH unbound
    # and only failed after the (slow) model construction.
    if V == 2:
        W2V_PATH = 'fastText/crawl-300d-2M.vec'
    elif V == 1:
        W2V_PATH = 'GloVe/glove.840B.300d.txt'
    else:
        raise ValueError('Unsupported InferSent version %r; expected 1 or 2' % (V,))

    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent.set_w2v_path(W2V_PATH)

    return infersent
    def __init__(self):
        """Load the version-1 InferSent model and store it on ``self.model``.

        Weight and word-vector paths are resolved relative to
        ``get_project_root()``. The vocabulary is built from the 100000 most
        frequent words.
        """
        version = 1
        weights_file = get_project_root() / Path("encoder/infersent%s.pkl" % version)

        encoder = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        })
        encoder.load_state_dict(torch.load(weights_file))

        # Version 1 uses GloVe vectors, any other version the fastText file.
        if version == 1:
            relative_vectors = 'GloVe/glove.840B.300d.txt'
        else:
            relative_vectors = '../fastText/crawl-300d-2M.vec'
        vectors_file = get_project_root() / Path(relative_vectors)
        encoder.set_w2v_path(vectors_file)

        # Load the word vectors for the K most frequent words.
        encoder.build_vocab_k_words(K=100000)

        self.model = encoder
예제 #9
0
def Start_chatbot():
    """Load the InferSent encoder and the question/answer knowledge base.

    Questions and answers are read line by line from ``../data/questions.txt``
    and ``../data/answers.txt``; line ``i`` of one file belongs to line ``i``
    of the other. Surplus answer lines are ignored.

    Returns:
        tuple: ``(model, qa_pairs, embeddings)`` where ``qa_pairs`` maps each
        question to its answer and ``embeddings`` maps each question to
        ``model.encode([question])[0]``.

    Raises:
        IndexError: if there are fewer answers than questions.
    """
    model_version = 1
    MODEL_PATH = "../InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # Keep the model on the CPU; flip to True to move it to the GPU.
    use_cuda = False
    model = model.cuda() if use_cuda else model

    W2V_PATH = '../data/glove.6B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)

    model.build_vocab_k_words(K=570000)

    with open('../data/questions.txt') as f:
        questions = [line.strip() for line in f]

    with open('../data/answers.txt') as f:
        answers = [line.strip() for line in f]

    # Fail before the expensive encoding loop rather than part-way through it.
    if len(answers) < len(questions):
        raise IndexError(
            'answers.txt has %d lines but questions.txt has %d'
            % (len(answers), len(questions)))

    # Named qa_pairs rather than `dict` so the builtin is not shadowed.
    qa_pairs = {}
    embeddings = {}
    for question, answer in zip(questions, answers):
        qa_pairs[question] = answer
        embeddings[question] = model.encode([question])[0]

    return model, qa_pairs, embeddings
def generate_embeddings(df):
    """Embed every context sentence and every question of ``df`` with InferSent.

    The unique values of ``df["context"]`` are joined and split into sentences
    with TextBlob. The InferSent vocabulary is built from those sentences
    only. Each sentence and each entry of ``df["question"]`` is then encoded
    on its own.

    Returns:
        dict mapping each sentence/question string to the result of
        ``infersent.encode([text], tokenize=True)``.
    """
    paras = list(df["context"].drop_duplicates().reset_index(drop=True))

    print("Paragraph count:", len(paras))

    sentences = [item.raw for item in TextBlob(" ".join(paras)).sentences]

    infersent = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    })
    infersent.load_state_dict(torch.load(infersent_pretrained_path))
    infersent.set_w2v_path(glove_path)

    print("Building Infersent vocabulary")
    infersent.build_vocab(sentences, tokenize=True)

    dict_embeddings = {}

    def embed_each(texts):
        # One encode call per text; a repeated text overwrites its entry.
        for text in texts:
            dict_embeddings[text] = infersent.encode([text], tokenize=True)

    print("Building sentence embeddings")
    print("Sentence count:", len(sentences))
    embed_each(sentences)

    print("Building question embeddings")
    questions = df["question"].tolist()
    print("Questions count:", len(questions))
    embed_each(questions)

    return dict_embeddings
from InferSent.models import InferSent

# Demo: load InferSent version V and encode a handful of sentences.
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
# The fastText path is fixed here and does not depend on V.
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

sentences = [
    "I am an engineer now.", "You can be an engineer.",
    "Building stuff is very fun.", "Stuff breaks often too though."
]
# The vocabulary is built from the demo sentences only.
infersent.build_vocab(sentences, tokenize=True)

embeddings = infersent.encode(sentences, tokenize=True)

# This sentence is not part of `sentences`, so its words may be missing from
# the vocabulary built above.
infersent.visualize('A man plays an instrument.', tokenize=True)

# Second name for the same embeddings object (no copy is made).
encoded_sentences = embeddings


# greedy decoder
예제 #12
0
def process(channel):
    """Classify the topics of every not-yet-visited transcript of ``channel``.

    For each pickle under ``../files/CableNews/<channel>/`` that is not listed
    in ``<channel>_visit.p``, candidate sentences are extracted, embedded with
    InferSent and classified. The two highest-scoring topics per entry are
    written to ``processed_data/<channel>/<counter>.p``.

    Args:
        channel: channel name used to build the input and output paths.

    Notes:
        * The visited list is written back only with probability 0.3 per
          file, so the most recent entries may be missing after an
          interruption.
        * The input files are unpickled; pickle can execute arbitrary code,
          so only trusted files must be placed in the input directory.
    """
    # Load the Classifier
    tf.reset_default_graph()
    NN = classifer()
    NN.load('nn-classifier-v2')

    # Load the sentence embedder
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'InferSent/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    all_files = glob.glob('../files/CableNews/%s/*.p' % channel)
    visit_path = '%s_visit.p' % (channel)
    with open(visit_path, 'rb') as handle:
        read_files = pickle.load(handle)
    counter = len(read_files)
    # Set mirror of read_files for O(1) membership tests; the list itself is
    # what gets pickled.
    visited = set(read_files)

    for file in tqdm(all_files):
        if file in visited:
            continue
        read_files.append(file)
        visited.add(file)
        # Persist the visited list only occasionally to limit disk writes.
        if np.random.rand() < 0.3:
            with open(visit_path, 'wb') as handle:
                pickle.dump(read_files, handle)

        with open(file, 'rb') as handle:
            res = pickle.load(handle)

        all_text, all_keys = _collect_sentences(res, tokenizer)
        if len(all_text) == 0:
            continue

        # Calculate the embedding
        all_embed = model.encode(all_text,
                                 bsize=128,
                                 tokenize=True,
                                 verbose=False)
        all_predictions = NN.predict(all_embed)[0]

        results = _summarise_topics(all_keys, all_predictions, res)
        with open('processed_data/%s/%d.p' % (channel, counter),
                  'wb') as handle:
            pickle.dump(results, handle)
        counter += 1


def _collect_sentences(res, tokenizer):
    """Return ``(sentences, keys)`` for the usable entries of one transcript.

    ``keys[i]`` is the key of ``res`` that ``sentences[i]`` was taken from.
    """
    all_text = []
    all_keys = []
    prev_text = ""
    for key in res:
        meta_data = res[key][0]  # First in the list
        if len(meta_data['text']) < 10:
            continue

        # Make sure we drop the duplicates: consecutive entries that start
        # with the same 10 characters are treated as the same text.
        current_text = meta_data['text'][:10]
        if current_text == prev_text:
            continue
        prev_text = current_text

        text = tokenizer.tokenize(meta_data['text'])
        if len(text) <= 2:
            continue
        # Drop the first sentence, then drop super small and super large
        # sentences (keep 31..49 whitespace-separated tokens).
        sentences = [s for s in text[1:] if 30 < len(s.split()) < 50]
        if not sentences:
            continue
        all_text.extend(sentences)
        all_keys.extend([key] * len(sentences))
    return all_text, all_keys


def _summarise_topics(all_keys, all_predictions, res):
    """Sum the per-sentence predictions per key and keep the top two topics.

    Each key's entry is computed from that key's own accumulated
    probabilities. The earlier inline loop wrote the previous key's totals
    under the current key, so the first key got zeros and the last key's
    totals were lost.
    """
    num_topics = 13
    totals = {}
    for row_index, key in enumerate(all_keys):
        if key not in totals:
            totals[key] = np.zeros(num_topics)
        totals[key] += all_predictions[row_index, :]

    results = {}
    for key, total_prob in totals.items():
        # Merge the probabilities and take top 2; Ticks is the module-level
        # topic label array defined elsewhere in the file.
        Topics = Ticks[np.flip(np.argsort(total_prob)[-2:])]
        Probs = np.flip(np.sort(total_prob)[-2:]) * 100
        results[key] = {
            'Topics': list(Topics),
            'Probs': list(Probs),
            'gender': res[key][0]['gender'],
            'persons': res[key][0]['persons'],
            'locations': res[key][0]['locations']
        }
    return results
# Notebook export: load the version-1 InferSent weights at import time.
model_pkl = '../InferSent/encoder/infersent1.pkl'
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1
}
infer_sent_model = InferSent(params_model)
infer_sent_model.load_state_dict(torch.load(model_pkl))

# In[111]:

# glove_w2v_loc is defined elsewhere in the file; the vocabulary is limited
# to the 100000 most frequent words.
infer_sent_model.set_w2v_path(glove_w2v_loc)
infer_sent_model.build_vocab_k_words(K=100000)

# infer_sent_model.to(torch.device("cuda:0"))

# In[112]:

# Smoke test from the notebook; the result is discarded when run as a script.
infer_sent_model.encode(["This man is playing computer games"], tokenize=True)

# In[113]:


def get_embedding_for_context(ctx):
    if not isinstance(ctx, list):
        #       print("ctx is not list")
        ctx = [ctx]
예제 #14
0
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1
}
# Build the encoder from params_model and load the pretrained weights.
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

# CUDA
# NOTE(review): hard-coded to True, so .cuda() is always called -- confirm
# this script is only run on hosts with a GPU.
use_cuda = True
model = model.cuda() if use_cuda else model

# If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
W2V_PATH = 'GloVe/glove.840B.300d.txt'
model.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
print('Load glove')
model.build_vocab_k_words(K=2000000)

# load sentence
# `dis` is the directory whose entries are listed as patients below; `res` is
# presumably the output directory -- it is not used in the visible code.
dis = '/home/shl183/nlp4note/classified_txt/discharge-sep'
res = '/home/shl183/nlp4note/infersent'
patients = os.listdir(dis)
# exist = os.listdir(res)
# with open('./tmp.pkl','wb') as f:
#     pickle.dump(exist,f)
for patient in patients:
    # if patient in exist:
    #     continue
예제 #15
0
        return json.JSONEncoder.default(self, obj)


if __name__ == "__main__":
    # Load InferSent model
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(PATH_TO_W2V)

    params_senteval['infersent'] = model.cuda()

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    # transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
    #                   'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
    #                   'SICKEntailment', 'SICKRelatedness', 'STSBenchmark']
    transfer_tasks = ['AmenitySimilarEvents']
    results = se.eval(transfer_tasks)
    print(results)

    if not os.path.exists(PATH_TO_RESULTS):
        os.mkdir(PATH_TO_RESULTS)

    with open(os.path.join(PATH_TO_RESULTS, 'infersent.json'),
예제 #16
0
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': VERSION
}
# Pick the word-vector file that matches the InferSent version:
# GloVe for version 1, fastText otherwise.
if VERSION == 1:
    W2V = 'C:/users/georg/Desktop/GloVe/glove.840B.300d.txt'
else:
    W2V = 'C:/Users/georg/Desktop/fastText/crawl-300d-2M.vec'
VOCAB_SIZE = 100000
NUM_STEPS = 300

# set up model
model = InferSent(PARAMS).to(DEVICE)
model.load_state_dict(torch.load(WEIGHTS))
model.set_w2v_path(W2V)
# NOTE(review): upstream InferSent's build_vocab_k_words stores the vectors on
# model.word_vec and returns None -- confirm the InferSent in use returns the
# word -> vector dict, otherwise word2vec.items() below fails.
word2vec = model.build_vocab_k_words(K=VOCAB_SIZE)

# setup the NN-classifer
# A 1-nearest-neighbour classifier fitted on (vector, word) pairs, so that
# predicting on a vector returns the word of the closest stored vector.
vec2word = KNeighborsClassifier(n_neighbors=1)
vecs = []
words = []
for key, val in word2vec.items():
    # Keep only entries whose vector has exactly shape (300,).
    if val.shape == (300, ):
        vecs.append(val)
        words.append(key)
X = np.vstack(vecs)
y = np.array(words)
vec2word.fit(X, y)