Example #1
0
class UniversalSentenceEncoder:
    """Sentence-similarity scorer backed by a pretrained InferSent encoder.

    The constructor loads the InferSent checkpoint and the matching word
    vectors from paths relative to the working directory, then builds a
    vocabulary from the 100000 most frequent words.
    """

    def __init__(self):
        super().__init__()
        model_version = 1
        MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
        W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'

        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version
        }
        self.model = InferSent(params_model)
        # Load the weights onto the CPU first so the checkpoint can be read
        # on hosts without a GPU; the model is moved afterwards if possible.
        self.model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
        self.model.eval()
        # Only use the GPU when one is actually present instead of
        # unconditionally calling .cuda(), which raises on CPU-only hosts.
        use_cuda = torch.cuda.is_available()
        self.model = self.model.cuda() if use_cuda else self.model
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def semantic_sim(self, sents1, sents2):
        """Return one similarity score per aligned sentence pair.

        Both inputs are encoded with ``tokenize=False``, the embeddings are
        L2-normalised row by row, and the score for pair ``i`` is
        ``1 - arccos(cos_sim_i)``, so scores lie in ``[1 - pi, 1]``.

        Args:
            sents1: sentences passed to the encoder (first side of each pair).
            sents2: sentences passed to the encoder (second side); must yield
                the same number of rows as ``sents1``.

        Returns:
            A NumPy array with one score per pair. An all-zero embedding
            produces NaN because of the division by its norm.
        """
        embed1 = torch.tensor(self.model.encode(sents1, tokenize=False))
        embed2 = torch.tensor(self.model.encode(sents2, tokenize=False))
        sts_encode1 = embed1 / torch.norm(embed1, p=2, dim=1, keepdim=True)
        sts_encode2 = embed2 / torch.norm(embed2, p=2, dim=1, keepdim=True)
        cosine_similarities = torch.sum(sts_encode1 * sts_encode2, dim=1)
        # Rounding can push the cosine slightly outside [-1, 1], where acos
        # is undefined.
        clip_cosine_similarities = torch.clamp(cosine_similarities, -1.0, 1.0)
        scores = 1.0 - torch.acos(clip_cosine_similarities)
        return scores.cpu().numpy()
Example #2
0
    def compute_intent_vectors(self, sentences):
        """Return one averaged InferSent embedding per intent.

        A fresh InferSent v1 model is loaded from ``./InferSent`` on every
        call, ``sentences`` is grouped through ``self.get_utterances_dict``
        and each group is encoded and averaged along axis 0.

        Args:
            sentences: input forwarded unchanged to
                ``self.get_utterances_dict``.

        Returns:
            dict mapping each key of the utterances dict to the mean of the
            embeddings of its values.
        """
        # TODO IMPLEMENT CACHING!
        from InferSent.models import InferSent
        infersent_folder = Path('./InferSent')
        infersent_path = Path(infersent_folder / 'encoder' / 'infersent1.pkl')
        MODEL_PARAMETERS = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        W2V_PATH = infersent_folder / 'GloVe' / 'glove.840B.300d.txt'

        model = InferSent(MODEL_PARAMETERS)
        # Map to CPU while loading so the CPU fallback below really works;
        # the model is moved to the GPU afterwards when one is available.
        model.load_state_dict(torch.load(infersent_path, map_location='cpu'))
        if torch.cuda.is_available():
            model.cuda()
        model.set_w2v_path(W2V_PATH)
        model.build_vocab_k_words(K=100000)

        utterances_dict = self.get_utterances_dict(sentences)
        total = len(utterances_dict)

        vectors = {}
        # The loop variable is named `utterances` so it no longer shadows the
        # `sentences` parameter.
        for position, (intent, utterances) in enumerate(
                utterances_dict.items(), start=1):
            embeddings = model.encode(utterances)
            vectors[intent] = np.mean(embeddings, axis=0)
            # Report progress only once the intent has actually been encoded.
            LOGGER.info('{}/{} done'.format(position, total))

        return vectors
Example #3
0
class _InferSent:
    """Thin wrapper around a pretrained InferSent encoder.

    The checkpoint and word-vector file are read from paths relative to the
    working directory when the wrapper is constructed.
    """

    def __init__(self):
        from InferSent.models import InferSent
        import torch

        version = 1
        encoder = InferSent({
            'bsize': 256,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version
        })
        encoder.load_state_dict(
            torch.load('encoder/infersent%s.pkl' % version))
        # NOTE(review): a version-1 checkpoint is paired with fastText
        # vectors here; upstream InferSent pairs version 1 with GloVe --
        # confirm this combination is intentional.
        encoder.set_w2v_path('fastText/crawl-300d-2M.vec')
        self.infersent = encoder

    def build_vocab(self, queries):
        """Build the encoder vocabulary from ``queries`` (tokenized)."""
        self.infersent.build_vocab(queries, tokenize=True)

    def update_vocab(self, text):
        """Extend the encoder vocabulary with words from ``text`` (tokenized)."""
        self.infersent.update_vocab(text, tokenize=True)

    def predict(self, text):
        """Return the encoder output for ``text``; the vocabulary is not updated."""
        encoded = self.infersent.encode(text, tokenize=True)
        return encoded
def inferSent():
    """Encode the module-level ``dataset`` with InferSent v2 and return the embeddings.

    Loads the checkpoint from ``encoder/infersent2.pkl``, builds the
    vocabulary from ``dataset``, encodes it in batches of 64 and visualises
    one randomly chosen entry.

    Returns:
        Whatever ``InferSent.encode`` returns for ``dataset``.
    """
    # Kept from the original; tokenize=True presumably relies on NLTK -- confirm.
    import nltk
    # nltk.download('punkt')
    from InferSent.models import InferSent
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    MODEL_PATH = 'encoder/infersent2.pkl'
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0,
        'version': 2
    }
    infersent = InferSent(params_model).to(device)
    # Map the checkpoint onto the selected device so loading also works when
    # the weights were saved from a different device.
    infersent.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    print('set w2v')

    infersent.build_vocab(dataset, tokenize=True)
    embeddings = infersent.encode(dataset, bsize=64, tokenize=True)
    # `randint` comes from the enclosing module. If it is random.randint the
    # upper bound is inclusive and len(dataset) would index past the end, so
    # clamp the draw to the last valid position (a no-op for an exclusive
    # randint such as numpy's).
    idx = min(randint(0, len(dataset)), len(dataset) - 1)
    _, _ = infersent.visualize(dataset[idx])
    print('done')
    return embeddings
    def __init__(self,
                 bsize=64,
                 word_emb_dim=300,
                 enc_lstm_dim=2048,
                 pool_type='max',
                 dpout_model=0.0,
                 version=2,
                 model_path='../infersent/infersent2.pkl',
                 path_to_w2v='../fasttext/crawl-300d-2M.vec',
                 use_cuda=True):
        """Load a pretrained InferSent encoder and keep its hyper-parameters.

        The six hyper-parameters are stored on the instance and forwarded to
        ``InferSent``; weights come from ``model_path`` and the word-vector
        file is registered from ``path_to_w2v``. With ``use_cuda`` true the
        model is moved to the GPU via ``.cuda()``.
        """
        self.bsize = bsize
        self.word_emb_dim = word_emb_dim
        self.enc_lstm_dim = enc_lstm_dim
        self.pool_type = pool_type
        self.dpout_model = dpout_model
        self.version = version

        encoder_config = {
            'bsize': bsize,
            'word_emb_dim': word_emb_dim,
            'enc_lstm_dim': enc_lstm_dim,
            'pool_type': pool_type,
            'dpout_model': dpout_model,
            'version': version
        }
        encoder = InferSent(encoder_config)
        encoder.load_state_dict(torch.load(model_path))
        encoder.set_w2v_path(path_to_w2v)

        self.model = encoder.cuda() if use_cuda else encoder

        # Set once at construction; not read anywhere in this method.
        self.first_call = True
Example #6
0
def _load_pretrained_model(verbose=True):
    """Build an InferSent encoder from the module-level configuration.

    Uses ``_PARAMS_MODEL``, ``_MODEL_PATH``, ``_W2V_PATH`` and
    ``_K_WORDS_VOCAB``; when ``verbose`` is true the checkpoint path is
    printed first.

    Returns:
        The encoder with weights loaded and vocabulary built.
    """
    if verbose:
        print(f">>> Loading pretrained model from {_MODEL_PATH}")

    encoder = InferSent(_PARAMS_MODEL)
    weights = torch.load(_MODEL_PATH)
    encoder.load_state_dict(weights)

    encoder.set_w2v_path(_W2V_PATH)
    encoder.build_vocab_k_words(K=_K_WORDS_VOCAB)
    return encoder
Example #7
0
def get_infersent(V=2):
    '''
    Builds the infersent model using either GloVe or fastText

    V selects the release: 1 pairs the infersent1 checkpoint with GloVe
    vectors, 2 pairs infersent2 with fastText vectors. Paths are relative
    to the working directory. The vocabulary is NOT built here.

    Raises ValueError for any other V. Previously such a value left
    W2V_PATH unbound and failed later with an unrelated error.
    '''
    w2v_paths = {
        1: 'GloVe/glove.840B.300d.txt',
        2: 'fastText/crawl-300d-2M.vec',
    }
    # Validate before touching the disk so the caller gets a clear message.
    if V not in w2v_paths:
        raise ValueError(
            'Unsupported InferSent version %r (expected 1 or 2)' % (V,))

    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    W2V_PATH = w2v_paths[V]

    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent.set_w2v_path(W2V_PATH)

    return infersent
    def __init__(self):
        """Load InferSent v1 with its word vectors and store it on ``self.model``.

        The checkpoint and word-vector paths are resolved against
        ``get_project_root()``; the vocabulary holds the 100000 first words
        of the word-vector file as selected by ``build_vocab_k_words``.
        """
        model_version = 1
        checkpoint = get_project_root() / Path(
            "encoder/infersent%s.pkl" % model_version)

        encoder = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version
        })
        encoder.load_state_dict(torch.load(checkpoint))

        # Word-vector file matching the encoder version.
        if model_version == 1:
            relative_w2v = 'GloVe/glove.840B.300d.txt'
        else:
            relative_w2v = '../fastText/crawl-300d-2M.vec'
        encoder.set_w2v_path(get_project_root() / Path(relative_w2v))

        # Build the vocabulary of word vectors.
        encoder.build_vocab_k_words(K=100000)

        self.model = encoder
0
def Start_chatbot():
    """Load the encoder and the question/answer data for the chatbot.

    Reads ``../data/questions.txt`` and ``../data/answers.txt`` line by line
    (stripped), pairs line ``i`` of one file with line ``i`` of the other and
    encodes every question individually.

    Returns:
        tuple ``(model, qa_pairs, embeddings)`` where ``qa_pairs`` maps each
        question to its answer and ``embeddings`` maps each question to the
        first row of ``model.encode([question])``.

    Raises:
        IndexError: if the answers file has fewer lines than the questions
            file.
    """
    model_version = 1
    MODEL_PATH = "../InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    use_cuda = False
    model = model.cuda() if use_cuda else model

    W2V_PATH = '../data/glove.6B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)

    model.build_vocab_k_words(K=570000)

    with open('../data/questions.txt') as f:
        questions = [line.strip() for line in f.readlines()]

    with open('../data/answers.txt') as f:
        answers = [line.strip() for line in f.readlines()]

    # Named `qa_pairs` rather than `dict` so the builtin is not shadowed.
    qa_pairs = {}
    embeddings = {}
    for index, question in enumerate(questions):
        qa_pairs[question] = answers[index]
        embeddings[question] = model.encode([question])[0]

    return model, qa_pairs, embeddings
def generate_embeddings(df):
    """Encode every context sentence and every question of ``df`` with InferSent.

    The unique values of ``df["context"]`` are joined with spaces and split
    into sentences with TextBlob; the vocabulary is built from those
    sentences only. Each sentence and each entry of ``df["question"]`` is
    then encoded on its own.

    Returns:
        dict mapping each sentence/question string to the result of
        ``infersent.encode([text], tokenize=True)``.
    """
    unique_contexts = df["context"].drop_duplicates().reset_index(drop=True)
    paras = list(unique_contexts)

    print("Paragraph count:", len(paras))

    sentences = [item.raw for item in TextBlob(" ".join(paras)).sentences]

    # NOTE(review): version 2 is configured while the word vectors come from
    # a variable named `glove_path` -- confirm the vectors match the model.
    infersent = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2
    })
    infersent.load_state_dict(torch.load(infersent_pretrained_path))
    infersent.set_w2v_path(glove_path)

    print("Building Infersent vocabulary")
    infersent.build_vocab(sentences, tokenize=True)

    dict_embeddings = {}

    print("Building sentence embeddings")
    print("Sentence count:", len(sentences))
    for sentence in sentences:
        dict_embeddings[sentence] = infersent.encode([sentence],
                                                     tokenize=True)

    print("Building question embeddings")
    questions = df["question"].tolist()
    print("Questions count:", len(questions))
    for question in questions:
        dict_embeddings[question] = infersent.encode([question],
                                                     tokenize=True)

    return dict_embeddings
import torch

from InferSent.models import InferSent

# Encoder release; the checkpoint file name is derived from it.
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
W2V_PATH = 'fastText/crawl-300d-2M.vec'

params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}

# Build the encoder, restore its weights and register the word vectors.
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)

# Demo sentences; the vocabulary is built from exactly these.
sentences = [
    "I am an engineer now.",
    "You can be an engineer.",
    "Building stuff is very fun.",
    "Stuff breaks often too though.",
]
infersent.build_vocab(sentences, tokenize=True)

embeddings = infersent.encode(sentences, tokenize=True)

# Visualise an additional sentence that is not part of `sentences`.
infersent.visualize('A man plays an instrument.', tokenize=True)

# Alias kept for code that refers to the embeddings under this name.
encoded_sentences = embeddings
Example #12
0
def _top_two_topics(total_prob):
    """Return the two highest-scoring topic labels and their percentages, best first."""
    topics = Ticks[np.flip(np.argsort(total_prob)[-2:])]
    probs = np.flip(np.sort(total_prob)[-2:]) * 100
    return list(topics), list(probs)


def process(channel):
    """Classify the transcripts of ``channel`` and pickle per-file results.

    For every not-yet-visited pickle under ``../files/CableNews/<channel>/``
    the sentences of each entry are filtered, embedded with InferSent and
    classified; the class probabilities of all sentences that belong to the
    same key are summed and the two best topics are stored together with the
    entry's ``gender``, ``persons`` and ``locations`` metadata.

    Args:
        channel: channel name used to build the input glob, the visit-log
            file name and the output directory.

    Side effects:
        Updates ``<channel>_visit.p`` (best effort, roughly 30% of the
        iterations) and writes ``processed_data/<channel>/<n>.p``.

    NOTE: all inputs are read with ``pickle.load``, which executes arbitrary
    code from the file -- only run this on trusted data.
    """
    # Load the Classifier
    tf.reset_default_graph()
    NN = classifer()
    NN.load('nn-classifier-v2')

    # Load the sentence embedder
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # infersent1 -> GloVe word vectors, infersent2 -> fastText word vectors.
    W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'InferSent/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    all_files = glob.glob('../files/CableNews/%s/*.p' % channel)
    visit_path = '%s_visit.p' % (channel)
    with open(visit_path, 'rb') as handle:
        read_files = pickle.load(handle)
    counter = len(read_files)

    for file in tqdm(all_files):
        if file in read_files:
            continue
        read_files.append(file)
        # The visit log is only persisted on a random ~30% of iterations.
        if np.random.rand() < 0.3:
            with open(visit_path, 'wb') as handle:
                pickle.dump(read_files, handle)

        with open(file, 'rb') as handle:
            res = pickle.load(handle)
        results = {}
        prev_text = ""
        all_text = []
        all_keys = []
        for key in res.keys():
            meta_data = res[key][0]  # First in the list
            if len(meta_data['text']) < 10:
                continue

            # Drop consecutive duplicates, detected by the first 10 characters.
            current_text = meta_data['text'][:10]
            if current_text == prev_text:
                continue
            prev_text = current_text

            text = tokenizer.tokenize(meta_data['text'])
            if len(text) <= 2:
                continue
            # Drop the first sentence
            text = text[1:]
            # Keep only sentences with more than 30 and fewer than 50 words.
            sentences = [s for s in text if 30 < len(s.split()) < 50]
            if len(sentences) == 0:
                continue
            all_text.extend(sentences)
            all_keys.extend([key] * len(sentences))
        if len(all_text) == 0:
            continue

        # Calculate the embeddings and the class probabilities.
        all_embed = model.encode(all_text,
                                 bsize=128,
                                 tokenize=True,
                                 verbose=False)
        all_predictions = NN.predict(all_embed)[0]

        # Sum the probabilities of all sentences that share a key. The
        # previous implementation emitted a key's result when the key was
        # first seen, using the totals accumulated for the PREVIOUS key
        # (zeros for the first key) and never flushing the last key.
        totals = {}
        for current_key, prediction in zip(all_keys, all_predictions):
            if current_key in totals:
                totals[current_key] += prediction
            else:
                totals[current_key] = np.array(prediction, dtype=float)

        # Merge the probabilities and take the top 2 per key.
        for current_key, total_prob in totals.items():
            topics, probs = _top_two_topics(total_prob)
            meta_data = res[current_key][0]
            results[current_key] = {
                'Topics': topics,
                'Probs': probs,
                'gender': meta_data['gender'],
                'persons': meta_data['persons'],
                'locations': meta_data['locations']
            }

        out_path = 'processed_data/%s/%d.p' % (channel, counter)
        with open(out_path, 'wb') as handle:
            pickle.dump(results, handle)
        counter += 1
from InferSent.models import InferSent

# In[110]:

# Checkpoint of the version-1 encoder, relative to the working directory.
model_pkl = '../InferSent/encoder/infersent1.pkl'
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1
}

# Instantiate the encoder and restore its weights.
infer_sent_model = InferSent(params_model)
infer_sent_model.load_state_dict(torch.load(model_pkl))

# In[111]:

# Register the word vectors (`glove_w2v_loc` is defined elsewhere) and keep
# the first 100000 words as vocabulary.
infer_sent_model.set_w2v_path(glove_w2v_loc)
infer_sent_model.build_vocab_k_words(K=100000)

# infer_sent_model.to(torch.device("cuda:0"))

# In[112]:

# Smoke test: encode a single sentence.
infer_sent_model.encode(["This man is playing computer games"], tokenize=True)

# In[113]:

Example #14
0
import pickle
from nltk.tokenize import sent_tokenize
# Load Model
from InferSent.models import InferSent

# Version-1 InferSent checkpoint, relative to the working directory.
MODEL_PATH = "encoder/infersent1.pkl"
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1
}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

# Move the encoder to the GPU when requested.
use_cuda = True
if use_cuda:
    model = model.cuda()

# infersent1 is paired with GloVe word vectors.
W2V_PATH = 'GloVe/glove.840B.300d.txt'
model.set_w2v_path(W2V_PATH)

# Load the word vectors of the first K words of the file.
print('Load glove')
model.build_vocab_k_words(K=2000000)

# Directory holding the sentences to load.
dis = '/home/shl183/nlp4note/classified_txt/discharge-sep'
class SentEnc():
    """Cache-backed sentence encoder built on a pretrained InferSent model.

    Text is encoded by feeding word vectors straight into the InferSent
    network; strings that look like numbers get a hand-crafted sinusoidal
    encoding instead (only when the module-level ``REG`` flag is truthy).
    Encodings are pre-computed with :meth:`cache_sentences` and looked up
    with ``encoder[sentence]``.
    """

    def __init__(self,
                 model_path,
                 w2v_path,
                 vocab_size=100000,
                 device='cpu',
                 hp=False):
        """Load the encoder weights and the word vectors.

        Args:
            model_path: InferSent checkpoint passed to ``torch.load``.
            w2v_path: word-vector file passed to ``load_WE``.
            vocab_size: number of word vectors requested from ``load_WE``.
            device: device the model and its inputs are placed on.
            hp: when true the model and its inputs are cast to half precision.
        """
        self.vocab = set()
        self.device = device
        self.hp = hp

        self.tokenizer = word_tokenize
        self.vocab_size = vocab_size
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.sent_model = InferSent(params_model).to(self.device).eval()
        self.sent_model.load_state_dict(
            torch.load(model_path, map_location=device))
        if hp:
            self.sent_model = self.sent_model.half()
        # Word vectors are loaded directly instead of through
        # InferSent.set_w2v_path / build_vocab.
        self.init_vocab(w2v_path)
        self.cache = dict()

    def init_vocab(self, w2v_path):
        """Load the word vectors and the begin/end-of-sentence tokens."""
        self.w2v, self.bs, self.es = load_WE(w2v_path, self.vocab_size)
        self.vocab.update(self.w2v.keys())

    def prune_sent(self, sent):
        """Tokenize ``sent`` and drop every token without a word vector.

        Returns the surviving tokens joined by single spaces; ``None`` is
        treated as an empty sentence.
        """
        if sent is None:
            sent_tok = []
        else:
            sent_tok = self.tokenizer(sent)
        sent_tok = [x for x in sent_tok if x in self.w2v]
        sent_rec = ' '.join(sent_tok)
        return sent_rec

    def get_number_encoding(self, num):
        """Return a 4096-dim float32 sinusoidal encoding of a non-negative number.

        The integer digits and two decimal digits are each encoded over 256
        dimensions; the remaining dimensions are zero padding.
        """
        d = 256
        device = 'cpu'
        # Two decimal digits are kept (scale 1e2).
        # NOTE(review): leading zeros of the fraction are lost by int(), so
        # 0.05 and 0.5 both yield the fractional digits "5" -- confirm
        # whether that is intended before changing it, since it alters the
        # produced encodings.
        a = int(num)
        b = int((num - a) * 1e2)

        a = str(a)[::-1]
        b = str(b)

        J = torch.arange(0, d)
        J_even = (J % 2 == 0).float().to(device)
        J = J.float().to(device)
        Ia = torch.arange(0, len(a)).float().to(device)
        Ib = (torch.arange(0, len(b)) + 1).float().to(device)
        A = torch.FloatTensor([float(x) for x in a]).to(device)
        B = torch.FloatTensor([float(x) for x in b]).to(device)

        # Repeat digits and positions along the encoding dimension so they
        # broadcast against J: resulting shape is (num_digits, d).
        A = torch.cat([A.view(1, -1)] * d, dim=0).T
        Ia = torch.cat([Ia.view(1, -1)] * d, dim=0).T

        B = torch.cat([B.view(1, -1)] * d, dim=0).T
        Ib = torch.cat([Ib.view(1, -1)] * d, dim=0).T

        resA = A * (2**Ia) / 10 * (J_even * torch.sin(Ia / (10000**(J / d))) +
                                   (1 - J_even) * torch.cos(Ia / (10000**(
                                       (J - 1) / d))))
        resB = B * (2.0**(-Ib)) / 10 * (
            J_even * torch.sin(-Ib / (10000**(J / d))) +
            (1 - J_even) * torch.cos(-Ib / (10000**((J - 1) / d))))

        res = torch.sum(resA, axis=0) + torch.sum(resB, axis=0)
        res = res / (len(a) + len(b))
        res = res.numpy()
        if d < 4096:
            res = np.append(res, np.zeros(4096 - d)).astype('float32')
        return res

    def is_number(self, string):
        """Return the absolute numeric value of ``string`` or ``None``.

        Accepted forms (after stripping whitespace): a signed decimal with
        digits on both sides of the point, or a signed integer -- optionally
        containing commas -- of fewer than 5 characters. Input that matches
        the comma pattern but holds no digits (e.g. ``","``) yields ``None``
        instead of raising ``ValueError``.
        """
        string = string.strip()
        if re.match(r'^[-+]?\d+\.\d+$', string):
            return abs(float(string))
        if re.match(r'^[-+]?\d+$', string) and len(string) < 5:
            return abs(int(string))
        if re.match(r'^[-+]?[\d,]+$', string) and len(string) < 5:
            try:
                return abs(int(string.replace(',', '')))
            except ValueError:
                # Only commas (and maybe a sign) were present.
                return None
        return None

    def get_text_encoding(self, sent):
        """Encode a pruned, space-separated sentence with the InferSent network.

        Every token of ``sent`` must have a word vector (see
        :meth:`prune_sent`); begin/end-of-sentence vectors are added around
        the tokens.

        Returns:
            The first row of the network output as a NumPy array.
        """
        sent_tok = sent.split()
        s = np.array([self.w2v[self.bs]] + [self.w2v[x] for x in sent_tok] +
                     [self.w2v[self.es]])
        l = len(s)
        # (sequence length, batch of 1, word-vector dimension)
        s = s.reshape([l, 1, 300])
        s = torch.from_numpy(s).float()
        if self.hp:
            s = s.half()
        s = s.to(self.device)
        l = np.array([l], dtype='int64')

        with torch.no_grad():
            res = self.sent_model.forward((s, l))
            # Extract the vector before the sanity check: the diagnostic
            # print below used to reference `v` before it was assigned.
            v = res.cpu().detach().numpy()[0]
            if res.shape[1] != 4096:
                print(v, s)
            del res
            return v

    def cache_sentences(self, sentences):
        """Pre-compute and cache the encoding of every entry of ``sentences``.

        ``None`` entries are treated as empty strings. When ``REG`` is
        truthy, entries recognised by :meth:`is_number` are cached under
        their original string with a numeric encoding; everything else is
        pruned and cached under the pruned text.
        """
        self.sent_cache = set()
        self.num_cache = set()
        for s in sentences:
            if s is None:
                s = ''
            num = self.is_number(s) if REG else None
            if num is not None:
                self.num_cache.add((num, s))
            else:
                self.sent_cache.add(self.prune_sent(s))
        print(f'initialize {len(self.sent_cache)} text sentences...')
        for s in tqdm.tqdm(self.sent_cache):
            self.cache[s] = self.get_text_encoding(s)
        print(f'initialize {len(self.num_cache)} numeric sentences...')
        self.num_cache = list(self.num_cache)
        for n, n_str in tqdm.tqdm(self.num_cache):
            self.cache[n_str] = self.get_number_encoding(n)

    def __getitem__(self, sent):
        """Return the cached encoding of ``sent``.

        Raises:
            KeyError: if the sentence was not passed to
                :meth:`cache_sentences` beforehand.
        """
        if sent is None:
            sent = ''
        # Numeric strings are cached under their original text, everything
        # else under its pruned form.
        if REG and self.is_number(sent) is not None:
            return self.cache[sent]
        return self.cache[self.prune_sent(sent)]
Example #16
0
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': VERSION
}
# Pick the word-vector file that belongs to the configured encoder version.
if VERSION == 1:
    W2V = 'C:/users/georg/Desktop/GloVe/glove.840B.300d.txt'
else:
    W2V = 'C:/Users/georg/Desktop/fastText/crawl-300d-2M.vec'
VOCAB_SIZE = 100000
NUM_STEPS = 300

# set up model
model = InferSent(PARAMS).to(DEVICE)
model.load_state_dict(torch.load(WEIGHTS))
model.set_w2v_path(W2V)
word2vec = model.build_vocab_k_words(K=VOCAB_SIZE)
if word2vec is None:
    # Upstream InferSent stores the word-vector table on `model.word_vec`
    # and returns None from build_vocab_k_words; fall back to the attribute
    # so the loop below does not fail on `None.items()`. A patched
    # InferSent that does return the table keeps working unchanged.
    word2vec = model.word_vec

# Set up the nearest-neighbour classifier that maps a vector back to a word.
vec2word = KNeighborsClassifier(n_neighbors=1)
vecs = []
words = []
for key, val in word2vec.items():
    # Skip entries whose vector is not a plain 300-dim array.
    if val.shape == (300, ):
        vecs.append(val)
        words.append(key)
X = np.vstack(vecs)
y = np.array(words)
vec2word.fit(X, y)