Example #1
    def compute_intent_vectors(self, sentences):
        # TODO IMPLEMENT CACHING!
        from InferSent.models import InferSent
        infersent_folder = Path('./InferSent')
        infersent_path = Path(infersent_folder / 'encoder' / 'infersent1.pkl')
        MODEL_PARAMETERS = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        W2V_PATH = infersent_folder / 'GloVe' / 'glove.840B.300d.txt'

        # Load the pre-trained InferSent encoder and its GloVe vocabulary
        model = InferSent(MODEL_PARAMETERS)
        model.load_state_dict(torch.load(infersent_path))
        if torch.cuda.is_available():
            model.cuda()
        model.set_w2v_path(W2V_PATH)
        model.build_vocab_k_words(K=100000)

        utterances_dict = self.get_utterances_dict(sentences)

        vectors = {}
        for i, (intent, utterances) in enumerate(utterances_dict.items()):
            embeddings = model.encode(utterances)
            # Represent each intent as the mean of its utterance embeddings
            vectors[intent] = np.mean(embeddings, axis=0)
            LOGGER.info('{}/{} done'.format(i + 1, len(utterances_dict)))

        return vectors
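
The returned dictionary maps each intent to the mean embedding of its utterances, which supports nearest-intent classification of new sentences. A minimal sketch, assuming cosine similarity; the classify_intent helper and its signature are hypothetical, not part of the original class:

    def classify_intent(self, sentence, model, intent_vectors):
        # Hypothetical helper: pick the intent whose mean embedding is
        # closest to the sentence embedding under cosine similarity
        query = model.encode([sentence])[0]
        query = query / np.linalg.norm(query)
        return max(intent_vectors,
                   key=lambda intent: np.dot(
                       query,
                       intent_vectors[intent] / np.linalg.norm(intent_vectors[intent])))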
Example #2
class UniversalSentenceEncoder:
    def __init__(self):
        super().__init__()
        model_version = 1
        MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
        W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'

        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(MODEL_PATH))
        self.model.eval()
        use_cuda = torch.cuda.is_available()  # fall back to CPU when no GPU is present
        self.model = self.model.cuda() if use_cuda else self.model
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def semantic_sim(self, sents1, sents2):
        embed1 = self.model.encode(sents1, tokenize=False)
        embed2 = self.model.encode(sents2, tokenize=False)
        embed1 = torch.tensor(embed1)
        embed2 = torch.tensor(embed2)
        # L2-normalise both sets of embeddings before computing similarities
        sts_encode1 = embed1 / torch.norm(embed1, p=2, dim=1, keepdim=True)
        sts_encode2 = embed2 / torch.norm(embed2, p=2, dim=1, keepdim=True)
        cosine_similarities = torch.sum(sts_encode1 * sts_encode2, dim=1)
        clip_cosine_similarities = torch.clamp(cosine_similarities, -1.0, 1.0)
        # Angular similarity: larger values mean more similar sentences
        scores = 1.0 - torch.acos(clip_cosine_similarities)
        return scores.cpu().numpy()
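
Usage is pairwise: sents1 and sents2 must have the same length, and each position yields one angular-similarity score. A minimal sketch (constructing the encoder loads the model weights and GloVe vectors, which takes a while):

encoder = UniversalSentenceEncoder()
scores = encoder.semantic_sim(
    ['A man is playing a guitar.'],
    ['Someone is playing an instrument.'])
print(scores)  # one score per sentence pair; higher means more similar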
Example #3

    def __init__(self,
                 bsize=64,
                 word_emb_dim=300,
                 enc_lstm_dim=2048,
                 pool_type='max',
                 dpout_model=0.0,
                 version=2,
                 model_path='../infersent/infersent2.pkl',
                 path_to_w2v='../fasttext/crawl-300d-2M.vec',
                 use_cuda=True):
        self.version = version
        self.dpout_model = dpout_model
        self.pool_type = pool_type
        self.enc_lstm_dim = enc_lstm_dim
        self.word_emb_dim = word_emb_dim
        self.bsize = bsize
        model = InferSent({
            'bsize': bsize,
            'word_emb_dim': word_emb_dim,
            'enc_lstm_dim': enc_lstm_dim,
            'pool_type': pool_type,
            'dpout_model': dpout_model,
            'version': version
        })
        model.load_state_dict(torch.load(model_path))
        model.set_w2v_path(path_to_w2v)

        if use_cuda and torch.cuda.is_available():
            self.model = model.cuda()
        else:
            self.model = model

        self.first_call = True
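
The first_call flag suggests the vocabulary is built lazily from the first batch of sentences, as SentEval-style InferSent wrappers commonly do. A minimal sketch of what the companion encode method might look like (the method and its lazy-vocabulary behaviour are assumptions, not part of the original snippet):

    def encode(self, sentences):
        # Assumed behaviour: build the vocabulary on the first call,
        # then only extend it with unseen words afterwards
        if self.first_call:
            self.model.build_vocab(sentences, tokenize=True)
            self.first_call = False
        else:
            self.model.update_vocab(sentences, tokenize=True)
        return self.model.encode(sentences, bsize=self.bsize, tokenize=True)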
Example #4
def Start_chatbot():
    model_version = 1
    MODEL_PATH = "../InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    use_cuda = False
    model = model.cuda() if use_cuda else model

    W2V_PATH = '../data/glove.6B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)

    model.build_vocab_k_words(K=570000)

    # `dict` is a Python built-in name, so use a descriptive one instead
    qa_dict = {}
    embeddings = {}

    with open('../data/questions.txt') as f:
        content = f.readlines()
    questions = [x.strip() for x in content]

    with open('../data/answers.txt') as f:
        content = f.readlines()
    answers = [x.strip() for x in content]

    # Pair each question with its answer and pre-compute question embeddings
    for question, answer in zip(questions, answers):
        qa_dict[question] = answer
        embeddings[question] = model.encode([question])[0]

    return model, qa_dict, embeddings
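
A chatbot loop built on these return values would embed the user's question and reply with the answer attached to the most similar stored question. A minimal sketch, assuming cosine similarity (answer_query and the sample question are hypothetical):

import numpy as np

def answer_query(query, model, qa_dict, embeddings):
    # Hypothetical helper: return the answer of the nearest stored question
    query_vec = model.encode([query])[0]
    query_vec = query_vec / np.linalg.norm(query_vec)
    best = max(embeddings,
               key=lambda q: np.dot(query_vec,
                                    embeddings[q] / np.linalg.norm(embeddings[q])))
    return qa_dict[best]

model, qa_dict, embeddings = Start_chatbot()
print(answer_query('How do I reset my password?', model, qa_dict, embeddings))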
Example #5
def process(channel):
    # Load the Classifier
    tf.reset_default_graph()
    NN = classifer()
    NN.load('nn-classifier-v2')

    # Load the sentence embedder
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'InferSent/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    all_files = glob.glob('../files/CableNews/%s/*.p' % channel)
    read_files = pickle.load(open('%s_visit.p' % (channel), 'rb'))
    counter = len(read_files)

    for file in tqdm(all_files):
        if file in read_files:
            continue
        else:
            read_files.append(file)
            # Periodically checkpoint the list of visited files
            if np.random.rand() < 0.3:
                pickle.dump(read_files, open('%s_visit.p' % (channel), 'wb'))

        res = pickle.load(open(file, 'rb'))
        results = {}
        prev_text = ""
        all_text = []
        all_keys = []
        for key in res.keys():
            meta_data = res[key][0]  # First in the list
            if len(meta_data['text']) < 10:
                continue

            # Drop duplicates: consecutive texts must differ
            # (compared on their first 10 characters)
            current_text = meta_data['text'][:10]
            if current_text == prev_text:
                continue
            else:
                prev_text = current_text

            text = tokenizer.tokenize(meta_data['text'])
            if len(text) <= 2:
                continue
            # Drop the first sentence
            text = text[1:]
            # Drop very short and very long sentences
            sentences = [s for s in text if 30 < len(s.split()) < 50]
            if len(sentences) == 0:
                continue
            # Queue the sentences for batched encoding below
            all_text.extend(sentences)
            all_keys.extend([key] * len(sentences))
        if len(all_text) == 0:
            continue
        all_embed = model.encode(all_text,
                                 bsize=128,
                                 tokenize=True,
                                 verbose=False)
        # Per-sentence topic probabilities; merged per key and top-2 taken below
        all_predictions = NN.predict(all_embed)[0]
        # Accumulate per-sentence probabilities for each key; once all of a
        # key's sentences have been seen, store its two most probable topics.
        def flush(key):
            Topics = Ticks[np.flip(np.argsort(total_prob[:, 0])[-2:])]
            Probs = np.flip(np.sort(total_prob[:, 0])[-2:]) * 100
            results[key] = {
                'Topics': list(Topics),
                'Probs': list(Probs),
                'gender': res[key][0]['gender'],
                'persons': res[key][0]['persons'],
                'locations': res[key][0]['locations']
            }

        prev_key = None
        total_prob = np.zeros((13, 1))
        for key_counter, current_key in enumerate(all_keys):
            if current_key != prev_key:
                if prev_key is not None:
                    flush(prev_key)
                prev_key = current_key
                total_prob = np.zeros((13, 1))
            total_prob[:, 0] += all_predictions[key_counter, :]
        # Flush the final key's accumulated probabilities
        if prev_key is not None:
            flush(prev_key)
        pickle.dump(results,
                    open('processed_data/%s/%d.p' % (channel, counter), 'wb'))
        counter += 1
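
classifer and Ticks come from elsewhere in this repository; judging by the indexing above, Ticks must be an array of the 13 topic labels the classifier predicts. A hypothetical placeholder with the right shape:

import numpy as np

# Hypothetical: the real topic labels are defined elsewhere in the repo;
# only the length (13, matching total_prob) matters for the indexing above.
Ticks = np.array(['topic_%d' % i for i in range(13)])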
Example #6
MODEL_PATH = "encoder/infersent1.pkl"
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1
}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

# CUDA
use_cuda = True
model = model.cuda() if use_cuda else model

# If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
W2V_PATH = 'GloVe/glove.840B.300d.txt'
model.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
print('Loading GloVe vocabulary')
model.build_vocab_k_words(K=2000000)

# Load the discharge-summary sentences
dis = '/home/shl183/nlp4note/classified_txt/discharge-sep'
res = '/home/shl183/nlp4note/infersent'
patients = os.listdir(dis)
# exist = os.listdir(res)
# with open('./tmp.pkl','wb') as f:
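
The snippet is cut off just before the processing loop. A minimal sketch of how it might continue, encoding each patient's notes and saving the vectors (the per-patient file layout and output names are assumptions):

import numpy as np

# Assumption: each entry in `patients` is a directory of plain-text note files
for patient in patients:
    patient_dir = os.path.join(dis, patient)
    for note_file in os.listdir(patient_dir):
        with open(os.path.join(patient_dir, note_file)) as f:
            sentences = [line.strip() for line in f if line.strip()]
        if not sentences:
            continue
        vectors = model.encode(sentences, tokenize=True)
        np.save(os.path.join(res, '%s_%s.npy' % (patient, note_file)), vectors)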
Example #7
if __name__ == "__main__":
    # Load the InferSent model; V, MODEL_PATH, PATH_TO_W2V, PATH_TO_RESULTS
    # and params_senteval are module-level constants defined earlier in the script.
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(PATH_TO_W2V)

    params_senteval['infersent'] = model.cuda()

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    # transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
    #                   'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
    #                   'SICKEntailment', 'SICKRelatedness', 'STSBenchmark']
    transfer_tasks = ['AmenitySimilarEvents']
    results = se.eval(transfer_tasks)
    print(results)

    if not os.path.exists(PATH_TO_RESULTS):
        os.mkdir(PATH_TO_RESULTS)

    with open(os.path.join(PATH_TO_RESULTS, 'infersent.json'),
              'w') as out_file:
        json.dump(results, out_file, cls=NumpyEncoder)
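
prepare and batcher are the standard SentEval hooks; in the official SentEval InferSent example they look roughly like the sketch below (this repository's versions may differ):

def prepare(params, samples):
    # Build the InferSent vocabulary from all of the task's sentences up front
    params.infersent.build_vocab([' '.join(s) for s in samples], tokenize=False)

def batcher(params, batch):
    # SentEval supplies tokenised sentences; re-join and encode them
    sentences = [' '.join(s) for s in batch]
    return params.infersent.encode(sentences, bsize=params.batch_size, tokenize=False)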