Esempio n. 1
0
class _InferSent:
    """Thin wrapper around a pretrained InferSent sentence encoder.

    The wrapper builds the encoder, loads its weights, registers the
    word-vector file and then forwards vocabulary / encoding calls to it,
    always with ``tokenize=True``.

    Args:
        version: InferSent model version, stored under the ``'version'`` key
            of the model parameters. Defaults to ``1``.
        model_path: Path of the pickled state dict. When ``None`` it is
            derived from ``version`` as ``'encoder/infersent<version>.pkl'``.
        w2v_path: Path of the word-vector file handed to ``set_w2v_path``.

    NOTE(review): the InferSent README pairs version 1 with GloVe vectors
    and version 2 with fastText vectors. The defaults below (version 1 with
    a fastText path) are kept unchanged from the original code -- confirm
    that this combination is intended.
    """

    def __init__(self, version=1, model_path=None,
                 w2v_path='fastText/crawl-300d-2M.vec'):
        # Imported inside __init__, so these dependencies are only needed
        # when an instance is actually created.
        from InferSent.models import InferSent
        import torch

        if model_path is None:
            model_path = 'encoder/infersent%s.pkl' % version

        params_model = {
            'bsize': 256,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        }
        self.infersent = InferSent(params_model)
        # The encoder is never moved to another device in this class, so map
        # the stored tensors to the CPU; this also lets a checkpoint saved on
        # a GPU load on a machine without CUDA.
        state_dict = torch.load(model_path, map_location='cpu')
        self.infersent.load_state_dict(state_dict)
        self.infersent.set_w2v_path(w2v_path)

    def build_vocab(self, queries):
        """Build the encoder vocabulary from ``queries`` (tokenized)."""
        self.infersent.build_vocab(queries, tokenize=True)

    def update_vocab(self, text):
        """Extend the encoder vocabulary with ``text`` (tokenized)."""
        self.infersent.update_vocab(text, tokenize=True)

    def predict(self, text):
        """Return ``self.infersent.encode(text, tokenize=True)``.

        The vocabulary is not updated here; call ``build_vocab`` or
        ``update_vocab`` beforehand.
        """
        return self.infersent.encode(text, tokenize=True)
def inferSent():
    """Encode the module-level ``dataset`` with a pretrained InferSent model.

    Builds a version-2 encoder on CUDA when available (otherwise on the
    CPU), loads the weights from ``encoder/infersent2.pkl``, registers the
    word vectors at ``fastText/crawl-300d-2M.vec``, builds the vocabulary
    from ``dataset``, encodes ``dataset`` with a batch size of 64 and calls
    ``visualize`` on one randomly chosen item.

    ``dataset`` is not a parameter: it must be defined at module level
    before this function is called.

    Returns:
        Whatever ``infersent.encode`` returns for ``dataset``.
    """
    import random

    # NOTE(review): nltk is not referenced directly below; it is presumably
    # imported so that a missing installation fails here rather than during
    # tokenization (which may also need ``nltk.download('punkt')``) --
    # confirm.
    import nltk  # noqa: F401
    from InferSent.models import InferSent
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_path = 'encoder/infersent2.pkl'
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0,
        'version': 2,
    }
    infersent = InferSent(params_model).to(device)
    # Map the stored tensors onto the device chosen above so a checkpoint
    # saved on a GPU still loads on a CPU-only machine.
    infersent.load_state_dict(torch.load(model_path, map_location=device))
    w2v_path = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(w2v_path)
    print('set w2v')

    infersent.build_vocab(dataset, tokenize=True)
    embeddings = infersent.encode(dataset, bsize=64, tokenize=True)

    # randrange excludes the upper bound, so the index is always valid;
    # an inclusive randint(0, len(dataset)) could yield len(dataset).
    idx = random.randrange(len(dataset))
    infersent.visualize(dataset[idx])
    print('done')
    return embeddings
def generate_embeddings(df):
    """Map every context sentence and every question in ``df`` to an embedding.

    The distinct values of ``df["context"]`` are joined with single spaces,
    split into sentences with ``TextBlob`` and used to build the InferSent
    vocabulary. Each sentence, and then each entry of ``df["question"]``,
    is encoded on its own.

    Args:
        df: Object exposing ``"context"`` and ``"question"`` columns.

    Returns:
        A dict keyed by the sentence / question text; each value is the
        result of ``encode`` called with that single text in a list.
        A question identical to a sentence overwrites that sentence's entry.
    """
    unique_paras = df["context"].drop_duplicates().reset_index(drop=True)
    paras = list(unique_paras)
    print("Paragraph count:", len(paras))

    # All paragraphs are glued into one text and re-split into sentences.
    joined_text = " ".join(paras)
    sentences = [sent.raw for sent in TextBlob(joined_text).sentences]

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2,
    })
    encoder.load_state_dict(torch.load(infersent_pretrained_path))
    encoder.set_w2v_path(glove_path)

    print("Building Infersent vocabulary")
    encoder.build_vocab(sentences, tokenize=True)

    def embed(text):
        # Each text is encoded by itself, wrapped in a one-item list.
        return encoder.encode([text], tokenize=True)

    embeddings_by_text = {}

    print("Building sentence embeddings")
    print("Sentence count:", len(sentences))
    for sentence in sentences:
        embeddings_by_text[sentence] = embed(sentence)

    print("Building question embeddings")
    questions = df["question"].tolist()
    print("Questions count:", len(questions))
    for question in questions:
        embeddings_by_text[question] = embed(question)

    return embeddings_by_text
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
# Build the encoder from `params_model` and load the weights stored at
# MODEL_PATH. MODEL_PATH is expected to be defined earlier in the script
# (its definition is not visible in this part of the file).
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
# Register the fastText word-vector file with the encoder.
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

# Small demo corpus; it is used both to build the vocabulary and as the
# input that gets encoded.
sentences = [
    "I am an engineer now.", "You can be an engineer.",
    "Building stuff is very fun.", "Stuff breaks often too though."
]
infersent.build_vocab(sentences, tokenize=True)

# Encode the demo corpus (presumably one embedding per sentence -- TODO
# confirm the returned shape against the InferSent documentation).
embeddings = infersent.encode(sentences, tokenize=True)

# NOTE(review): this sentence is not part of `sentences`, so some of its
# words may be missing from the vocabulary built above -- confirm that
# `visualize` copes with that.
infersent.visualize('A man plays an instrument.', tokenize=True)

# Second name bound to the same object as `embeddings` (no copy is made).
encoded_sentences = embeddings


# greedy decoder
def greedy_decoder(data):
    """Return, for each row of ``data``, the index ``np.argmax`` picks.

    Args:
        data: Iterable of rows; every row is passed to ``np.argmax`` as is.

    Returns:
        A list with one index per row, in row order.
    """
    return list(map(np.argmax, data))


# decode sequence