def inferSent():
    """Encode the module-level ``dataset`` with a pretrained InferSent v2 model.

    The model weights are read from ``encoder/infersent2.pkl`` and the word
    vectors from ``fastText/crawl-300d-2M.vec`` (both relative to the current
    working directory). The vocabulary is built from ``dataset`` itself, one
    randomly chosen entry is passed to ``visualize``, and the encodings of the
    whole dataset are returned.

    Relies on ``dataset`` and ``randint`` being defined at module level.

    Returns:
        Whatever ``InferSent.encode`` returns for ``dataset``.
    """
    # NOTE(review): nltk is not referenced directly in this function; the
    # import is kept so the original import-time behaviour is unchanged.
    import nltk
    from InferSent.models import InferSent
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    MODEL_PATH = 'encoder/infersent2.pkl'
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0,
        'version': 2
    }
    infersent = InferSent(params_model).to(device)
    # map_location makes the checkpoint land on the device selected above,
    # so loading also works on hosts without CUDA.
    infersent.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    print('set w2v')

    infersent.build_vocab(dataset, tokenize=True)
    embeddings = infersent.encode(dataset, bsize=64, tokenize=True)
    # random.randint includes its upper bound, so the raw draw may equal
    # len(dataset); clamp it to the last valid index. (With an
    # exclusive-upper-bound randint the clamp is a no-op.)
    idx = min(randint(0, len(dataset)), len(dataset) - 1)
    _, _ = infersent.visualize(dataset[idx])
    print('done')
    return embeddings
Beispiel #2
0
    def compute_intent_vectors(self, sentences):
        """Return one averaged InferSent embedding per intent.

        A version-1 InferSent encoder is loaded from ``./InferSent`` on every
        call (weights from ``encoder/infersent1.pkl``, word vectors from
        ``GloVe/glove.840B.300d.txt``) and moved to the GPU when CUDA is
        available. ``sentences`` is grouped by ``self.get_utterances_dict``;
        each group is encoded and its embeddings are averaged along axis 0.

        Args:
            sentences: Passed unchanged to ``self.get_utterances_dict``.

        Returns:
            dict mapping each key of the utterances dict to the mean of the
            embeddings of its utterances.
        """
        # TODO IMPLEMENT CACHING!
        from InferSent.models import InferSent

        base_dir = Path('./InferSent')
        weights_file = Path(base_dir / 'encoder' / 'infersent1.pkl')
        glove_file = base_dir / 'GloVe' / 'glove.840B.300d.txt'

        encoder = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1,
        })
        encoder.load_state_dict(torch.load(weights_file))
        if torch.cuda.is_available():
            encoder.cuda()
        encoder.set_w2v_path(glove_file)
        encoder.build_vocab_k_words(K=100000)

        utterances_by_intent = self.get_utterances_dict(sentences)
        total = len(utterances_by_intent.items())

        vectors = {}
        for position, (intent, utterances) in enumerate(
                utterances_by_intent.items(), start=1):
            # The progress line is emitted before the group is encoded.
            LOGGER.info('{}/{} done'.format(position, total))
            vectors[intent] = np.mean(encoder.encode(utterances), axis=0)

        return vectors
Beispiel #3
0
class _InferSent:
    def __init__(self):
        from InferSent.models import InferSent
        import torch
        V = 1
        MODEL_PATH = 'encoder/infersent%s.pkl' % V
        params_model = {
            'bsize': 256,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': V
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(torch.load(MODEL_PATH))
        W2V_PATH = 'fastText/crawl-300d-2M.vec'
        self.infersent.set_w2v_path(W2V_PATH)

    def build_vocab(self, queries):
        self.infersent.build_vocab(queries, tokenize=True)

    def update_vocab(self, text):
        self.infersent.update_vocab(text, tokenize=True)

    def predict(self, text):
        # self.update_vocab(text)
        return self.infersent.encode(text, tokenize=True)
Beispiel #4
0
class UniversalSentenceEncoder:
    """Sentence-similarity scorer backed by a pretrained InferSent model.

    Despite its name, the class wraps an ``InferSent`` encoder (stored in
    ``self.model``) and scores sentence pairs by angular similarity.
    """

    def __init__(self):
        """Load InferSent v1, move it to the GPU if one exists, build vocab.

        Paths are relative to the current working directory. The vocabulary
        is limited to the 100000 most frequent words of the vector file.
        """
        super().__init__()
        model_version = 1
        MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
        W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'

        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version
        }
        self.model = InferSent(params_model)
        # Load onto the CPU first so the checkpoint can be read on hosts
        # without CUDA; the model is moved to the GPU below when possible.
        self.model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
        self.model.eval()
        # Previously CUDA was forced unconditionally, which raises on
        # CPU-only machines.
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def semantic_sim(self, sents1, sents2):
        """Score each pair ``(sents1[i], sents2[i])`` by angular similarity.

        Both inputs are encoded with ``tokenize=False``, L2-normalised per
        row, and compared by cosine similarity. The score is
        ``1 - arccos(cos)`` with the cosine clamped to ``[-1, 1]``, so
        identical directions score 1.0 and orthogonal ones ``1 - pi/2``.
        A row whose embedding is all zeros yields NaN (division by a zero
        norm).

        Args:
            sents1: Sentences for the left side of each pair.
            sents2: Sentences for the right side; must encode to the same
                number of rows as ``sents1``.

        Returns:
            1-D numpy array with one score per pair.
        """
        embed1 = self.model.encode(sents1, tokenize=False)
        embed2 = self.model.encode(sents2, tokenize=False)
        embed1 = torch.tensor(embed1)
        embed2 = torch.tensor(embed2)
        sts_encode1 = embed1 / torch.norm(embed1, p=2, dim=1, keepdim=True)
        sts_encode2 = embed2 / torch.norm(embed2, p=2, dim=1, keepdim=True)
        cosine_similarities = torch.sum(sts_encode1 * sts_encode2, dim=1)
        # Clamp guards acos against values just outside [-1, 1] caused by
        # floating-point rounding.
        clip_cosine_similarities = torch.clamp(cosine_similarities, -1.0, 1.0)
        scores = 1.0 - torch.acos(clip_cosine_similarities)
        return scores.cpu().numpy()
def generate_embeddings(df):
    """Encode every context sentence and every question in ``df``.

    The unique values of ``df["context"]`` are joined and split into
    sentences with TextBlob; those sentences seed the InferSent vocabulary.
    Each sentence and each entry of ``df["question"]`` is then encoded on
    its own (one ``encode`` call per text).

    Relies on the module-level names ``infersent_pretrained_path`` and
    ``glove_path``.

    Args:
        df: Table with ``"context"`` and ``"question"`` columns.

    Returns:
        dict mapping each sentence / question string to the result of
        ``encode([text], tokenize=True)``. A question identical to a
        sentence (or to an earlier question) overwrites the earlier entry.
    """
    unique_paras = list(df["context"].drop_duplicates().reset_index(drop=True))

    print("Paragraph count:", len(unique_paras))

    sentences = [
        item.raw for item in TextBlob(" ".join(unique_paras)).sentences
    ]

    # NOTE(review): 'version': 2 is combined with a path named glove_path;
    # confirm the word vectors match the model version.
    infersent = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    })
    infersent.load_state_dict(torch.load(infersent_pretrained_path))
    infersent.set_w2v_path(glove_path)

    print("Building Infersent vocabulary")
    infersent.build_vocab(sentences, tokenize=True)

    dict_embeddings = {}

    print("Building sentence embeddings")
    print("Sentence count:", len(sentences))
    for sentence in sentences:
        dict_embeddings[sentence] = infersent.encode([sentence],
                                                     tokenize=True)

    print("Building question embeddings")
    questions = df["question"].tolist()
    print("Questions count:", len(questions))
    for question in questions:
        dict_embeddings[question] = infersent.encode([question],
                                                     tokenize=True)

    return dict_embeddings
Beispiel #6
0
def Start_chatbot():
    """Load InferSent v1 plus the question/answer files for the chatbot.

    Questions and answers are read line by line from
    ``../data/questions.txt`` and ``../data/answers.txt`` and paired by line
    number. Every question is encoded individually.

    Returns:
        Tuple ``(model, answers_by_question, embeddings)`` where
        ``answers_by_question`` maps a question to the answer on the same
        line and ``embeddings`` maps a question to the first row of
        ``model.encode([question])``.

    Raises:
        IndexError: if the answers file has fewer lines than the questions
            file.
    """
    model_version = 1
    weights_path = "../InferSent/encoder/infersent%s.pkl" % model_version
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    model.load_state_dict(torch.load(weights_path))

    # The model stays on the CPU unless this flag is flipped.
    use_cuda = False
    if use_cuda:
        model = model.cuda()

    if model_version == 1:
        vectors_path = '../data/glove.6B.300d.txt'
    else:
        vectors_path = '../dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(vectors_path)

    model.build_vocab_k_words(K=570000)

    with open('../data/questions.txt') as handle:
        questions = [line.strip() for line in handle]

    with open('../data/answers.txt') as handle:
        answers = [line.strip() for line in handle]

    answers_by_question = {}
    embeddings = {}
    for index, question in enumerate(questions):
        answers_by_question[question] = answers[index]
        embeddings[question] = model.encode([question])[0]

    return model, answers_by_question, embeddings
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
# Build the encoder from the parameter dict above and load its weights.
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
# Register the word-vector file the encoder reads its vocabulary from.
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

# Sample sentences: used both to build the vocabulary and as encoder input.
sentences = [
    "I am an engineer now.", "You can be an engineer.",
    "Building stuff is very fun.", "Stuff breaks often too though."
]
infersent.build_vocab(sentences, tokenize=True)

embeddings = infersent.encode(sentences, tokenize=True)

# NOTE(review): this sentence is not among `sentences`, so some of its words
# may be missing from the vocabulary built above -- confirm visualize() copes.
infersent.visualize('A man plays an instrument.', tokenize=True)

# Alias under which the embeddings are passed to the decoding step below.
encoded_sentences = embeddings


# greedy decoder
def greedy_decoder(data):
    """Return, for each row of ``data``, the index of its largest value.

    Args:
        data: Iterable of rows, each accepted by ``np.argmax``.

    Returns:
        list with one ``np.argmax`` result per row, in input order.
    """
    return list(map(np.argmax, data))


# Run greedy_decoder over the sentence embeddings and print the per-sentence
# argmax indices.
# NOTE(review): the inputs are embedding vectors, while greedy_decoder's own
# comment speaks of probabilities -- confirm the indices are meaningful.
result = greedy_decoder(encoded_sentences)
print(result)
Beispiel #8
0
def _top_two_topics(total_prob, meta_data):
    """Build one result record from a key's summed class probabilities."""
    return {
        'Topics': list(Ticks[np.flip(np.argsort(total_prob)[-2:])]),
        'Probs': list(np.flip(np.sort(total_prob)[-2:]) * 100),
        'gender': meta_data['gender'],
        'persons': meta_data['persons'],
        'locations': meta_data['locations']
    }


def process(channel):
    """Classify the not-yet-visited transcript pickles of one channel.

    For every unvisited ``../files/CableNews/<channel>/*.p`` file the entry
    texts are split into sentences, sentences of 31-49 whitespace-separated
    words are embedded with InferSent v1 and classified, and the class
    probabilities of all sentences belonging to the same key are summed.
    The two highest-scoring topics per key (sums multiplied by 100) are
    written, together with the entry's gender/persons/locations, to
    ``processed_data/<channel>/<counter>.p``.

    The list of visited files is read from ``<channel>_visit.p`` and is
    re-written on roughly 30% of newly visited files.

    Relies on the module-level names ``tf``, ``classifer``, ``InferSent``,
    ``torch``, ``nltk``, ``glob``, ``pickle``, ``tqdm``, ``np`` and ``Ticks``.

    Args:
        channel: Channel name used to build the input and output paths.
    """
    # Load the Classifier
    tf.reset_default_graph()
    NN = classifer()
    NN.load('nn-classifier-v2')

    # Load the sentence embedder
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'InferSent/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    visit_path = '%s_visit.p' % (channel)
    all_files = glob.glob('../files/CableNews/%s/*.p' % channel)
    # NOTE(review): pickle executes arbitrary code on load; these files are
    # assumed to be produced by this project only.
    with open(visit_path, 'rb') as fh:
        read_files = pickle.load(fh)
    counter = len(read_files)

    for file in tqdm(all_files):
        if file in read_files:
            continue
        read_files.append(file)
        # Best-effort checkpoint of the visited list on ~30% of the files.
        if np.random.rand() < 0.3:
            with open(visit_path, 'wb') as fh:
                pickle.dump(read_files, fh)

        with open(file, 'rb') as fh:
            res = pickle.load(fh)
        prev_text = ""
        all_text = []
        all_keys = []
        for key in res.keys():
            meta_data = res[key][0]  # First in the list
            if len(meta_data['text']) < 10:
                continue

            # Drop consecutive duplicates: entries whose first 10 characters
            # match the previously kept entry are skipped.
            current_text = meta_data['text'][:10]
            if current_text == prev_text:
                continue
            prev_text = current_text

            text = tokenizer.tokenize(meta_data['text'])
            if len(text) <= 2:
                continue
            # Drop the first sentence
            text = text[1:]
            # Keep only sentences with more than 30 and fewer than 50 words.
            sentences = [s for s in text if 30 < len(s.split()) < 50]
            if not sentences:
                continue
            all_text.extend(sentences)
            all_keys.extend([key] * len(sentences))
        if not all_text:
            continue

        # Calculate the embeddings and class probabilities for all kept
        # sentences of this file in one go.
        all_embed = model.encode(all_text,
                                 bsize=128,
                                 tokenize=True,
                                 verbose=False)
        all_predictions = NN.predict(all_embed)[0]

        # Sum the probabilities per key. Each key's record is built from its
        # OWN totals; previously a record was built from the preceding key's
        # totals (zeros for the first key) and the last key's totals were
        # never written.
        totals = {}
        for key, probs in zip(all_keys, all_predictions):
            if key in totals:
                totals[key] += probs
            else:
                totals[key] = np.array(probs, dtype=float)

        results = {
            key: _top_two_topics(total_prob, res[key][0])
            for key, total_prob in totals.items()
        }
        with open('processed_data/%s/%d.p' % (channel, counter), 'wb') as fh:
            pickle.dump(results, fh)
        counter += 1
    'dpout_model': 0.0,
    'version': 1
}
# Build the encoder from the parameter dict above and load its weights from
# the file named by model_pkl.
infer_sent_model = InferSent(params_model)
infer_sent_model.load_state_dict(torch.load(model_pkl))

# In[111]:

# Register the word-vector file and load the 100000 most frequent words.
infer_sent_model.set_w2v_path(glove_w2v_loc)
infer_sent_model.build_vocab_k_words(K=100000)

# GPU placement is disabled; the model stays where torch.load put it.
# infer_sent_model.to(torch.device("cuda:0"))

# In[112]:

# Smoke test: encode a single sentence (the result is not stored).
infer_sent_model.encode(["This man is playing computer games"], tokenize=True)

# In[113]:


def get_embedding_for_context(ctx):
    """Encode ``ctx`` with the module-level ``infer_sent_model``.

    Args:
        ctx: Either a list of texts or a single value; anything that is not
            a ``list`` is wrapped in a one-element list first.

    Returns:
        The result of ``infer_sent_model.encode(..., tokenize=True)``.
    """
    batch = ctx if isinstance(ctx, list) else [ctx]
    return infer_sent_model.encode(batch, tokenize=True)


# In[114]:

from sklearn.metrics.pairwise import cosine_similarity
Beispiel #10
0
    notes = os.listdir('{}/{}'.format(dis, patient))
    for note in notes:
        tps = os.listdir('{}/{}/{}'.format(dis, patient, note))
        if os.path.exists('{}/{}/{}'.format(res, patient, note)):
            shutil.rmtree('{}/{}/{}'.format(res, patient, note))
        os.makedirs('{}/{}/{}'.format(res, patient, note))
        for tp in tps:
            print('{}/{}/{}/{}'.format(dis, patient, note, tp))
            with open('{}/{}/{}/{}'.format(dis, patient, note, tp), 'r') as f:
                sents = f.read()
            t_sents = sent_tokenize(sents)
            #                 print(len(t_sent))
            #             time.sleep(0.4)
            val_sent = []
            with open('{}/{}/{}/{}pkl'.format(res, patient, note, tp[:-3]),
                      'wb') as f:
                for sent in t_sents:
                    # print(sent)
                    length = len(sent.split())
                    if length < 10:
                        continue
                    val_sent.append(sent)
                #print(val_sent)
                if val_sent == []:
                    continue
                embedding = model.encode(val_sent,
                                         bsize=128,
                                         tokenize=False,
                                         verbose=True)
                pickle.dump(embedding, f)
Beispiel #11
0
    from InferSent.models import InferSent
    model_version = 1
    dirname = os.path.dirname(__file__)
    MODEL_PATH = os.path.join(
        dirname, "InferSent/encoder/infersent%s.pkl" % model_version)
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    W2V_PATH = os.path.join(dirname, 'InferSent/embs/glove.840B.300d.txt')
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)

    for file_name in files:
        with open(file_name) as istr:
            data = map(str.strip, istr)
            data = list(data)
        output_file = output_dir / file_name.with_suffix('.emb.tsv').name
        embeddings = model.encode(data, tokenize=True)
        with open(output_file, "w") as ostr:
            writer = csv.writer(ostr, delimiter="\t")
            for sent, emb in zip(data, embeddings):
                _ = writer.writerow([sent, " ".join(map(str, emb.tolist()))])
Beispiel #12
0
# Unique context paragraphs of the training set.
paras = list(train_df["contexts"].drop_duplicates().reset_index(drop= True))
# NOTE(review): `blob` is not referenced again in the visible code; the
# sentences come from get_all_sentences instead -- confirm it is still needed.
blob = TextBlob(" ".join(paras))
sentences = get_all_sentences(train_df['sentences'])
# Seed the InferSent vocabulary with the training sentences.
infersent.build_vocab(sentences, tokenize=True)


# # Build Embeddings

# In[15]:


# Sentence embeddings: each sentence is encoded on its own and the first
# (only) row of the result is stored under the sentence text.
dict_embeddings = {}
for i in range(len(sentences)):
    dict_embeddings[sentences[i]] = infersent.encode([sentences[i]], tokenize=True)[0]
  
# Question embeddings: stored in the same dict, so a question whose text
# equals a sentence overwrites that sentence's entry.
questions = list(train_df["questions"])    
for i in range(len(questions)):
    dict_embeddings[questions[i]] = infersent.encode([questions[i]], tokenize=True)[0]
    


# # Save Embeddings

# In[156]:


# TODO: persist dict_embeddings to disk so the embeddings do not have to be
# recomputed on every run; this saves computation time.
Beispiel #13
0
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)

    # infersent.build_vocab(sentences, tokenize=True)
    infersent.build_vocab_k_words(K=100000)

    embeddings = infersent.encode(sentences,
                                  bsize=128,
                                  tokenize=False,
                                  verbose=True)
    print('nb sentences encoded : {0}'.format(len(embeddings)))
    #### End Paste

    parsedQs = []
    with open(questions, "r+") as f:
        for q in f.readlines():
            parsedQs.append(preprocessQs(q))
    # print(parsedQs)s

    qEmbeddings = infersent.encode(parsedQs,
                                   bsize=128,
                                   tokenize=False,
                                   verbose=True)
    print('nb sentences encoded : {0}'.format(len(embeddings)))