def calcule_eucl(text, question):
    blob = TextBlob("".join(text))
    sentences = [item.raw for item in blob.sentences]

    V = 2
    MODEL_PATH = 'InferSent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'InferSent/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab(sentences, tokenize=True)

    dict_embeddings = {}
    for i in range(len(sentences)):
        dict_embeddings[sentences[i]] = infersent.encode([sentences[i]],
                                                         tokenize=True)
    # Encode the question once, outside the sentence loop
    encode_question = infersent.encode([question], tokenize=True)
    eucl = eucl_sim(dict_embeddings, encode_question)

    return sentences, eucl
def embed_sent(datafile):
    sentences = []
    with open(datafile, 'r') as f:
        i = 0
        for line in f:
            line = line.replace('\n', '')
            sentences.append(line)
            i += 1
            if i == 455820:
                break
    V = 1
    MODEL_PATH = 'infersent%s.pkl' % V
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'GloVe/glove.840B.300d.txt'
    infersent.set_w2v_path(W2V_PATH)

    infersent.build_vocab(sentences, tokenize=True)

    embeddings = infersent.encode(sentences, tokenize=True)

    np.savetxt("../../wiki-split/Data/Infersent_vectors/complex_sent", embeddings)
Example #3
class InferSentFeatures:
    def __init__(self, lang_enc_dir, sentences):
        sys.path.insert(0, os.path.join(lang_enc_dir, 'InferSent/'))
        from models import InferSent

        version = 1
        MODEL_PATH = os.path.join(
            lang_enc_dir, 'InferSent/encoder/infersent%s.pkl' % version)
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(MODEL_PATH))

        W2V_PATH = os.path.join(lang_enc_dir, 'glove/glove.6B.300d.txt')
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab(sentences, tokenize=True)

    def generate_embeddings(self, sentences):
        embeddings = self.model.encode(sentences, tokenize=True)
        return embeddings
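
A minimal usage sketch for this class; the directory name and sentences below are assumptions, not part of the original snippet:

# Hypothetical usage of InferSentFeatures. lang_enc_dir is assumed to contain the
# InferSent repo (with encoder/infersent1.pkl) and glove/glove.6B.300d.txt.
sentences = ["A man is playing a guitar.", "A child runs through the park."]
extractor = InferSentFeatures(lang_enc_dir="language_encoders", sentences=sentences)
embeddings = extractor.generate_embeddings(sentences)
print(embeddings.shape)  # expected: (2, 4096)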
Example #4
def create_embeddings(infer_path, data_path, em_type):
    yt_titles = yt.get_yt_titles()
    with open("data/whtitles", "r") as f:
        wh_titles = [line.rstrip('\n') for line in f]

    if em_type == "yt":  # Youtube
        save_f = os.path.join(data_path, "yt_embed")
        titles = yt_titles
    elif em_type == "wh":  # Wikihow
        save_f = os.path.join(data_path, "wh_embed")
        titles = wh_titles
    else:
        raise "Unknown embedding type: {}".format(em_type)

    nltk.download('punkt')
    V = 1
    MODEL_PATH = os.path.join(infer_path, 'encoder/infersent%s.pkl' % V)
    params_model = {
        'bsize': 256,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent = infersent.cuda()

    W2V_PATH = os.path.join(infer_path, 'GloVe/glove.840B.300d.txt')
    infersent.set_w2v_path(W2V_PATH)

    infersent.build_vocab(yt_titles + wh_titles, tokenize=True)
    embed = infersent.encode(titles, tokenize=True)
    np.save(save_f, embed)
Example #5
class Encoder2:
    ''' Encoder based on InferSent '''

    WORD_VECTORS_FILE = 'crawl-300d-2M.vec'
    MODEL_FILE = 'infersent2.pkl'

    def __init__(self, word_vectors_dir, models_dir):
        word_vectors = os.path.join(word_vectors_dir, self.WORD_VECTORS_FILE)
        model_file = os.path.join(models_dir, self.MODEL_FILE)

        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 2
        }

        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(model_file))
        self.model.set_w2v_path(word_vectors)

    def start(self, texts):
        texts_list = texts.values.tolist()
        self.model.build_vocab(texts_list, tokenize=True)

    def close(self):
        pass

    def encode(self, texts_batch):
        texts_batch_list = texts_batch.values.tolist()
        texts_batch_vec = self.model.encode(texts_batch_list, tokenize=True)

        return texts_batch_vec
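
For reference, a small usage sketch, assuming a pandas Series of texts and the standard InferSent v2 checkpoint and fastText vectors in the directories named below:

# Hypothetical usage of Encoder2; the directory names are assumptions.
import pandas as pd

texts = pd.Series(["The cat sits on the mat.", "Dogs are loyal animals."])
encoder = Encoder2(word_vectors_dir="fastText", models_dir="encoder")
encoder.start(texts)             # builds the vocabulary once over all texts
vectors = encoder.encode(texts)  # expected shape: (2, 4096)
encoder.close()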
Example #6
def infersent_embed_posts(posts, max_sent_cnt, embed_dim, data_fold_path):
    model_path = data_fold_path + 'word_sent_embed/infersent2.pickle'
    word_emb_path = data_fold_path + 'word_sent_embed/fasttext.vec'
    posts_arr = np.zeros((len(posts), max_sent_cnt, embed_dim))

    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    model.set_w2v_path(word_emb_path)

    all_sents = []
    for sens in posts:
        all_sents.extend(sens)

    model.build_vocab(all_sents, tokenize=False)

    for ind, sentences in enumerate(posts):
        embeddings = model.encode(sentences, tokenize=False, verbose=False)
        l = min(max_sent_cnt, len(sentences))
        posts_arr[ind, :l, :] = embeddings[:l]

    return posts_arr
Example #7
  def getSentenceVector(doc, model_params: dict = {}, encoder = "distilbert", model_name = 'distilbert-base-nli-mean-tokens' ):
  
    sp = spacy.load('en_core_web_sm')
    tokenized = sp(doc)
    sentences = []
    for token in tokenized.sents:
      sentences.append(token.text)

    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta', 'bart']:
      # Use encoder for mapping tokens to embeddings
      word_embedding_model = models.Transformer(model_name, 
                  tokenizer_args= model_params['tokenizer_args'] if 'tokenizer_args' in model_params else {})
      # Apply mean pooling to get one fixed sized sentence vector
      pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                     pooling_mode_mean_tokens=True,
                                     pooling_mode_cls_token=False,
                                     pooling_mode_max_tokens=False)
      model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   
      sentence_embeddings = model.encode(sentences)
    

    elif encoder == 'use':
      #!pip install embedding-as-service
      from embedding_as_service.text.encode import Encoder
      en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
      sentence_embeddings = en.encode(texts=sentences)


    elif encoder == 'infersent':
      import nltk
      nltk.download('punkt')
      from models import InferSent
      params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                      'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
      infersent = InferSent(params_model)
      W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
      infersent.set_w2v_path(W2V_PATH)
      infersent.build_vocab(sentences, tokenize=True)
      sentence_embeddings = infersent.encode(sentences, tokenize=True)


    elif encoder == 'sent2vec':
      import sent2vec
      model = sent2vec.Sent2vecModel()
      model.load_model('drive/My Drive/torontobooks_unigram.bin') 
      sentence_embeddings = model.embed_sentences(sentences)
   

    elif encoder == 'laser':
      from laserembeddings import Laser
      laser = Laser()  ## Also used for multilingual sentence embeddings
      sentence_embeddings = laser.embed_sentences(sentences, lang='en') 
  
  
    else:
      raise ValueError('Invalid or unavailable encoder: {}'.format(encoder))
  
    return list(zip(sentences, sentence_embeddings))
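
A short usage sketch with the default distilbert encoder, assuming the function is available at module level and that spaCy's en_core_web_sm model and the sentence-transformers package are installed:

# Hypothetical usage of getSentenceVector; the input text is illustrative.
doc_text = "InferSent encodes whole sentences. Transformer models can do the same."
pairs = getSentenceVector(doc_text)  # list of (sentence, embedding) tuples
for sent, emb in pairs:
    print(sent, emb.shape)
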
def infersent_embeddings():
    train_data_list = []
    test_data_list = []
    sys.path.append(
        '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master')
    # Load model
    from models import InferSent
    model_version = 1
    MODEL_PATH = "/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model
    # If infersent1 -> use GloVe vectors. If infersent2 -> use fastText vectors.
    W2V_PATH = '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/glove.840B.300d-003.txt' if model_version == 1 else '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)
    train_data_list = model.encode(final_train['text'].tolist(),
                                   bsize=128,
                                   tokenize=False,
                                   verbose=True)
    print('nb sentences encoded : {0}'.format(len(train_data_list)))
    test_data_list = model.encode(final_test['text'].tolist(),
                                  bsize=128,
                                  tokenize=False,
                                  verbose=True)
    print('nb sentences encoded : {0}'.format(len(test_data_list)))
    return train_data_list, test_data_list
Example #9
def infersent_flat_embed_posts(posts, embed_dim, data_fold_path):
    model_path = data_fold_path + 'word_sent_embed/infersent2.pickle'
    word_emb_path = data_fold_path + 'word_sent_embed/fasttext.vec'

    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    model.set_w2v_path(word_emb_path)

    model.build_vocab(posts, tokenize=False)
    return model.encode(posts, tokenize=False, verbose=False)
Example #10
class Infersent:
    def __init__(self):

        V = 2
        MODEL_PATH = 'encoder/infersent%s.pkl' % V
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': V
        }

        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(torch.load(MODEL_PATH))
        self.infersent.set_w2v_path('fastText/crawl-300d-2M.vec')

    def get(self, sentences):
        self.infersent.build_vocab(sentences, tokenize=True)

        return self.infersent.encode(sentences, tokenize=True)
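
A minimal usage sketch, assuming encoder/infersent2.pkl and fastText/crawl-300d-2M.vec exist relative to the working directory, as in the class above:

# Hypothetical usage of the Infersent wrapper class.
wrapper = Infersent()
sentences = ["A man is playing a guitar.", "A woman is reading a book."]
vectors = wrapper.get(sentences)  # expected shape: (2, 4096)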
Example #11
def embed_dataset(dataset_path, infersent_path, force_cpu=False):
    """
    To make this work, first run ./get_infersent.sh
    """
    MODEL_PATH = infersent_path / "encoder/infersent1.pkl"
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
    model = InferSent(params_model)
    if force_cpu:
        model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
    else:
        model.load_state_dict(torch.load(MODEL_PATH))
        model.cuda()

    W2V_PATH = infersent_path / 'GloVe/glove.840B.300d.txt'
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)

    csv_data = read_csv(dataset_path / 'train.csv')
    csv_data = csv_data[1:]  # skip header
    data = defaultdict(list)

    for irow, row in enumerate(csv_data):
        if 'snips' in str(dataset_path):
            utterance, labels, delexicalised, intent = row
        else:
            raise TypeError(
                "Unknown dataset type. Implement your own first. See the "
                "README")
        data[intent].append(utterance)

    vectors = {}
    for i, (intent, sentences) in enumerate(data.items()):
        print('{}/{} done'.format(i, len(data.items())))
        embeddings = model.encode(sentences)
        avg_embedding = np.mean(embeddings, axis=0)
        vectors[intent] = avg_embedding

    return vectors
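
A usage sketch under the assumptions in the docstring (a snips-style dataset directory containing train.csv and a local InferSent checkout); the paths below and the helper imports used above are assumed to be in place:

# Hypothetical usage of embed_dataset; paths are assumptions.
from pathlib import Path

intent_vectors = embed_dataset(Path("data/snips"), Path("InferSent"), force_cpu=True)
for intent, avg_embedding in intent_vectors.items():
    print(intent, avg_embedding.shape)  # each intent maps to a 4096-dim mean embedding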
Example #12
def no_stopwords():
    infersent2 = InferSent(params_model)
    infersent2.load_state_dict(torch.load(MODEL_PATH))
    infersent2.set_w2v_path(W2V_PATH)
    use_cuda = True
    infersent2 = infersent2.cuda() if use_cuda else infersent2
    pdss = pd.DataFrame(columns=['embds', 'set', 'catg'])
    start = time.time()
    global current_idx
    for x in range(3):
        crix = current_idx
        abss, catg, sets, crix = get_batch_from_dataframe(crix)
        for index in range(len(abss)):
            doc = nlp(abss[index])
            strs_after_stop_arr = []
            for token in doc:
                if not token.is_stop:
                    strs_after_stop_arr.append(token.text)

            abss[index] = ' '.join(strs_after_stop_arr)

        if x == 0:
            infersent2.build_vocab(abss, tokenize=True)
        else:
            infersent2.update_vocab(abss, tokenize=True)

        embed = infersent2.encode(abss, tokenize=True)
        df2 = pd.DataFrame({
            'embds': embed.tolist(),
            'set': sets,
            'catg': catg
        })
        pdss = pd.concat([pdss, df2], ignore_index=True)

        current_idx = crix
    end = time.time() - start
    print("Time without stopwords", end)
    pdss.to_csv("/home/psrivastava/Intern_Summer/data/embeds_no_stopwords.csv")
Example #13
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}

model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = 'final_text_vectors.txt'
model.set_w2v_path(W2V_PATH)
model.build_vocab(sentences, tokenize=True)  #build_vocab_k_words(K=100000)

embeddings = model.encode(
    sentences,
    tokenize=True)  #(sentences, bsize=168, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

sen_vec = preprocessing.normalize(embeddings)
sen_vec = Variable(torch.from_numpy(sen_vec))
#sen_vec = nn.Linear(4096,300)
model = net()
n = (1, 300)
nparray = np.zeros(n)
for i in sen_vec:
    out = model(i)
    out = out.data.numpy()
    #print(out)
    nparray = np.append(nparray, [out], axis=0)
nparray = np.delete(nparray, 0, axis=0)
Example #14
    'dpout_model': 0.0,
    'version': model_version
}
model = InferSent(hyperparameters)
model.load_state_dict(torch.load(MODEL_PATH))
use_cuda = False
model = model.cuda() if use_cuda else model
W2V_PATH = 'GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)
model.build_vocab_k_words(K=10000)  #10000

train_df = pandas.read_csv('/home/stefan/projects/disaster_tweets/train.csv')

tweets = train_df.text.to_list()

embeddings = model.encode(tweets)
# train_df['embedding'] = [np.zeros(4096) for i in range(train_df.shape[0])]

embeddings_list = [embeddings[x] for x in range(embeddings.shape[0])]

# train_df['embedding'] = embeddings_list
# print(train_df.head())
# for i in range(train_df.shape[0]):
# # for i in range(10):
# 	tweet_text = train_df['text'][i]
# 	tweet_embedding = model.encode(tweet_text)

# train_df.to_csv('train_w_embeddings.csv')

train_dict = {}
for i in range(train_df.shape[0]):
    model.set_w2v_path(args.w2v_path)

    # Ensure directory
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    # Read files and extract features
    for fpath in args.files:
        print('Reading file {}'.format(fpath))
        sents = []
        with open(fpath) as f:
            for line in f:
                line = line.strip()
                assert line, 'Empty line in {}'.format(fpath)
                sents.append(line)

        # Set output file name
        out_name = os.path.join(
            args.out_dir, "{}.embs.npy".format(os.path.basename(fpath)))

        # Build vocab
        print('Building vocabulary')
        model.build_vocab(sents, args.tokenize)

        # Get embeddings
        embs = model.encode(sents, tokenize=args.tokenize,
                            verbose=True, bsize=args.batch_size)

        print('Saving to {}'.format(out_name))
        np.save(out_name, embs)
    header=None)
relational.columns = ["vocab"]

culture.vocab = culture.vocab.apply(lambda x: re.sub(',', '_', x))
demographic.vocab = demographic.vocab.apply(lambda x: re.sub(',', '_', x))
relational.vocab = relational.vocab.apply(lambda x: re.sub(',', '_', x))
##################################################
##################################################
##################################################
##################################################
##################################################

#generating semantic embeddings for the inq terms
d = {'terms': culture.vocab}
culture_df = pd.DataFrame(d)
culture_embeddings = model.encode(culture_df['terms'], verbose=True)
d = {'terms': demographic.vocab}
demographic_df = pd.DataFrame(d)
demographic_embeddings = model.encode(demographic_df['terms'], verbose=True)
d = {'terms': relational.vocab}
relational_df = pd.DataFrame(d)
relational_embeddings = model.encode(relational_df['terms'], verbose=True)

print("Dictionaries embeddings generated!")

#generating embeddings
embeddings = model.encode(df.text, verbose=True)
print('documents encoded : {0}'.format(len(embeddings)))

try:
    np.savez_compressed(
Example #17
class LCPR_I:
    def __init__(self):
        self.filename = "LCP/lcpr_i.sav"
        self.cmudict = cmudict.dict()
        self.wnlp = WonderlicNLP()
        self.embeddings_index = {}
        self.wiki_top10 = [
            word[0].split()[0]
            for word in pd.read_csv("LCP/wiki_top10.csv").values
        ][:10001]
        self.infersent_model_path = 'LCP/infersent%s.pkl' % 1
        self.infersent_model_params = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(self.infersent_model_params)
        self.model = RandomForestRegressor(n_estimators=100)

    #InferSent setup (boilerplate code from InferSent's repository):
    def initialize_infersent(self, sentences):
        print("INITIALIZING INFERSENT...", datetime.now().strftime("%H:%M:%S"))
        self.infersent.load_state_dict(torch.load(self.infersent_model_path))
        w2v_path = 'LCP/glove.42B.300d.txt'
        self.infersent.set_w2v_path(w2v_path)
        self.infersent.build_vocab(sentences, tokenize=True)
        print("INFERSENT READY!", datetime.now().strftime("%H:%M:%S"))

    def infersent_embedding(self, sentence):
        return self.infersent.encode(sentence, tokenize=True)

    # GloVe setup:
    def initialize_glove(self):
        print("INITIALIZING GLOVE...", datetime.now().strftime("%H:%M:%S"))
        f = open('LCP/glove.42B.300d.txt', encoding="utf8")
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()
        print("GLOVE READY!", datetime.now().strftime("%H:%M:%S"))

    def glove_embedding(self, word):
        embedding = [
            emb for emb in self.embeddings_index[str(word).lower()]
        ] if str(word).lower() in self.embeddings_index.keys() else [
            -1 for i in range(300)
        ]
        return embedding

    # Used to find the index of the word in the sentence
    def find_word_pos(self, word, tokens):
        lemmatizer = WordNetLemmatizer()
        search_tokens = [lemmatizer.lemmatize(word) for word in tokens]
        if word in tokens:
            return tokens.index(word)
        elif word in search_tokens:
            return search_tokens.index(word)
        else:
            return None

    def extract_features(self, data):
        features = defaultdict(list)
        for id in tqdm(data.index, desc="PROCESSING DATA"):
            raw_token = "null" if str(data.loc[id]["token"]) == "nan" else str(
                data.loc[id]["token"])
            token = raw_token.lower()
            sent = data.loc[id]["sentence"]
            mrc_features = self.wnlp.get_mrc_features(token)
            glove = self.glove_embedding(token)
            infersent = self.infersent_embedding([sent])[0]

            # Sentence InferSent embedding:
            for i in range(1, 4097):
                features[f"infersent{i}"].append(infersent[i - 1])

            # Word GloVe embedding:
            for i in range(1, 301):
                features[f"glove{i}"].append(glove[i - 1])

            # MRC features:
            features["word_length"].append(mrc_features["Nlet"])
            features["syl_count"].append(mrc_features["Nsyl"])
            features["brown_freq"].append(mrc_features["Brown-freq"])
            features["familiarity"].append(mrc_features["Fam"])
            features["concreteness"].append(mrc_features["Conc"])
            features["imagability"].append(mrc_features["Imag"])
            features["meaningfulness_c"].append(mrc_features["Meanc"])
            features["meaningfulness_p"].append(mrc_features["Meanp"])
            features["age_of_aquisition"].append(mrc_features["AOA"])

            features["wiki_freq"].append(int(token in self.wiki_top10))

        return features

    def fit(self, train_data, train_labels):
        print("TRAINING...", datetime.now().strftime("%H:%M:%S"))
        self.initialize_glove()
        self.initialize_infersent(train_data["sentence"])
        features = self.extract_features(train_data)
        self.model.fit(pd.DataFrame(features), train_labels)
        print("TRAINING DONE!", datetime.now().strftime("%H:%M:%S"))

    def to_likert(self, prediction):
        if prediction >= 0 and prediction < 0.2:
            return 1
        elif prediction >= 0.2 and prediction < 0.4:
            return 2
        elif prediction >= 0.4 and prediction < 0.6:
            return 3
        elif prediction >= 0.6 and prediction < 0.8:
            return 4
        else:
            return 5

    def predict(self, test_data, development=False):
        print("LOOKING INTO THE ORB...", datetime.now().strftime("%H:%M:%S"))
        self.infersent.update_vocab(test_data["sentence"], tokenize=True)
        tokens = test_data["token"]
        predictions = self.model.predict(
            pd.DataFrame(self.extract_features(test_data)))
        if not development:
            for i in range(len(predictions)):
                print(
                    f"{tokens[i]} is a {self.to_likert(predictions[i])} on the Likert scale."
                )
        return predictions

    def score(self, train_data, train_labels):
        print("SCORING MODEL...", datetime.now().strftime("%H:%M:%S"))
        return self.model.score(
            pd.DataFrame(self.extract_features(train_data)), train_labels)

    def metrics(self, test_data, test_labels):
        labels_pred = self.predict(test_data, True)
        mae = mean_absolute_error(test_labels, labels_pred)
        rmse = math.sqrt(mean_squared_error(test_labels, labels_pred))
        print("MAE:", mae)
        print("RMSE:", rmse)

    def save(self):
        pickle.dump([self.model, self.embeddings_index, self.infersent],
                    open(self.filename, "wb"))

    def load(self):
        data = pickle.load(open(self.filename, "rb"))
        self.model = data[0]
        self.embeddings_index = data[1]
        self.infersent = data[2]
Example #18
print(len(sentences))

# In[7]:

sentences[:5]

# ## Encode sentences

# In[8]:

# gpu mode : >> 1000 sentences/s
# cpu mode : ~100 sentences/s

# In[9]:

embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

# ## Visualization

# In[10]:

np.linalg.norm(model.encode(['the cat eats.']))

# In[11]:


def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

 temp = clean_debian(temp)
 if temp == '':
     cnt += 1
     print('NULL')
     continue
 temp = obj.replace_tokens(temp)
 if flag == 0:
     t = temp
     flag = 1
     continue
 t = t.strip()
 print(str(t))
 print('---------------------------------------')
 # calculate sentence embedding for the body and average it into a 4096-sized vector
 if t != '':
     embedding = infermodel.encode(str(t), bsize=1, tokenize=False, verbose=True)
     sent_vec = []
     numw = 0
     for w in embedding:
         try:
             if numw == 0:
                 sent_vec = w
             else:
                 sent_vec = np.add(sent_vec, w)
             numw += 1
         except:
             pass
     v = np.asarray(sent_vec) / numw
     print(v.shape)
     print(v)
     v = np.transpose(v)
Example #20
class InferSentEmbeddings(EmbeddingBaseClass, FlairDocumentEmbeddings):
    """
    Class to infer the InferSent embeddings to flair sentences. cf.
    `here <https://github.com/facebookresearch/InferSent>`_
    """
    def __init__(self, version=1):
        super().__init__()

        self.version = version
        if version == 1:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                            'word_embeddings',
                                            'glove.840B.300d',
                                            'glove.840B.300d.txt')
        if version == 2:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                            'word_embeddings', 'crawl-300d-2M',
                                            'crawl-300d-2M.vec')

        self.MODEL_PATH = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                       'word_embeddings',
                                       'infersent%s' % version,
                                       'infersent%s.pkl' % version)

        # Set up logger
        logging.basicConfig(format='%(asctime)s : %(message)s',
                            level=logging.DEBUG)

        # Load InferSent model
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version
        }

        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(self.MODEL_PATH))
        self.model.set_w2v_path(self.PATH_TO_W2V)

        self._embedding_length: int = params_model['enc_lstm_dim']

        self.name = f"{self.__class__.__name__ }_v{self.version}"
        self.static_embeddings = True

    @property
    def embedding_length(self) -> int:
        return self._embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]):
        everything_embedded: bool = True
        infersent_sentences = []

        for sentence in sentences:
            if self.name not in sentence._embeddings.keys():
                everything_embedded = False

        if not everything_embedded:
            for sentence in sentences:
                infersent_sentences.append(sentence.to_tokenized_string())

            self.model.build_vocab(infersent_sentences, tokenize=False)
            self.model.update_vocab(infersent_sentences, tokenize=False)
            embeddings = self.model.encode(infersent_sentences, tokenize=False)

            for sentence, sentence_embedding in zip(sentences, embeddings):
                sentence.set_embedding(self.name,
                                       torch.tensor(sentence_embedding))
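
A small usage sketch with flair, assuming NLP_MODELS_PATH points at the pretrained files referenced above:

# Hypothetical usage of InferSentEmbeddings with flair Sentence objects.
from flair.data import Sentence

embedder = InferSentEmbeddings(version=1)
sentence = Sentence("A man is playing a guitar .")
embedder.embed([sentence])       # embed() is inherited from the flair embeddings base class
print(sentence.embedding.shape)  # expected: torch.Size([4096])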
Example #21
    'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
print('Load our pre-trained model (in encoder/)')

# Set word vector path for the model
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)
print('Set word vector path for the model')

# Build the vocabulary of word vectors (i.e. keep only those needed)
infersent.build_vocab(all_captions, tokenize=True)
print('Build the vocabulary of word vectors')

# Start encoding captions
caption2id = {}
f = open('pascal-sentences-dataset/text_features.txt', 'w+')
for caption in all_captions:
    current_feature = list(
        infersent.encode([caption], tokenize=True).squeeze())
    if not caption in caption2id:
        caption2id[caption] = 'caption_' + str(len(caption2id))
    current_feature = [str(feature) for feature in current_feature]
    current_feature_str = ' '.join(current_feature)
    f.write('%s %s\n' % (caption2id[caption], current_feature_str))
f.close()

with open('pascal-sentences-dataset/caption2id.json', 'w') as outfile:
    json.dump(caption2id, outfile)
import pandas as pd
import spacy
import nltk
import numpy as np
import torch
from models import InferSent
df = pd.read_csv("/home/psrivastava/Intern_Summer/data/new_output.csv")
abs_arr = df.loc[:4, 'clean_text']  # .ix was removed from pandas; .loc is equivalent here
nlp = spacy.load("en_core_web_sm")
MODEL_PATH = "/home/psrivastava/Intern_Summer/infersent/encoder/infersent2.pkl"
W2V_PATH = "/home/psrivastava/Intern_Summer/infersent/fastText/crawl-300d-2M.vec"
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)

for index in range(len(abs_arr)):
    doc = nlp(abs_arr[index])
    strs_after_stop_arr = []
    for token in doc:
        if not token.is_stop:
            strs_after_stop_arr.append(token.text)

    abs_arr[index] = ' '.join(strs_after_stop_arr)

infersent.build_vocab(abs_arr)  # these sentences are actually abstracts of different papers
print(infersent.encode(abs_arr)[0][:])
Example #23
if params.encoder_path and params.encoder_type == 'InferSent':
    
    params_model = {'bsize': params.batch_size, 'word_emb_dim': params.word_emb_dim,
                    'enc_lstm_dim': params.enc_lstm_dim, 'pool_type': params.pool_type,
                    'dpout_model': params.dpout_model, 'version': params.model_version}
    encoder = InferSent(params_model)
    encoder.load_state_dict(torch.load(params.encoder_path))
    encoder.set_w2v_path(params.vector_rep)
    
    if params.vocab_samples.isdigit() :
        print("Build vocab from K samples")
        encoder.build_vocab_k_words(K=int(params.vocab_samples))
    else:
        print("Build vocab from full file")
        encoder.build_vocab(K=params.vocab_samples)

    print("========TEST encoder=======")
    print(encoder.encode(['the cat eats.']))
    
    encoder.to(device)
    
    


# model config
config_nli_model = {
    'n_words'        :  len(word_vec)         ,
    'word_emb_dim'   :  params.word_emb_dim   ,
    'enc_lstm_dim'   :  params.enc_lstm_dim   ,
    'n_enc_layers'   :  params.n_enc_layers   ,
    'dpout_model'    :  params.dpout_model    ,
    'dpout_fc'       :  params.dpout_fc       ,
    'fc_dim'         :  params.fc_dim         ,
Example #24
    'Question': [],
    'Answer': [],
    'Question_Emb': [],
    'Answer_Emb': [],
    'Label': [],
    'Cosine_Dist': [],
    'Euclidean_Dist': [],
    'Predicted_label_Cos': [],
    'Predicted_label_Euc': []
}

pred_labels_cos = []
pred_labels_euc = []
for i_q, this_q in enumerate(quetsions):

    embeddings_q = infersent.encode([this_q], tokenize=True, verbose=False)
    dist_cos_group = []
    dist_euc_group = []

    for i_a, this_a in enumerate(answers[i_q]):

        print(f'Question  {i_q: <10} Answer {i_a: <10} is done!')

        embeddings_a = infersent.encode([this_a], tokenize=True, verbose=False)

        # calculate the distances
        this_dist_cos = distance.cosine(embeddings_q, embeddings_a)
        this_dist_euc = distance.euclidean(embeddings_q, embeddings_a)
        dist_cos_group.append(this_dist_cos)
        dist_euc_group.append(this_dist_euc)
Example #25
from models import InferSent

V = 1
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

W2V_PATH = 'dataset/GloVe/glove.840B.300d.txt' \
    if V == 1 else 'dataset/fastText/crawl-300d-2M-subword.vec'
# NOTE: the next line unconditionally overrides the version-dependent choice above
W2V_PATH = 'dataset/fastText/crawl-300d-2M-subword.vec'

infersent.set_w2v_path(W2V_PATH)

infersent.build_vocab(sentences, tokenize=True)

embeddings = infersent.encode(sentences, tokenize=True)

# infersent.visualize('A man plays an instrument.', tokenize=True)

print(embeddings.shape)

joblib.dump(embeddings, os.path.join(args.file_path, 'data/embeddings.pkl'))
with open(ORI_PATH) as f:
    ori = f.read()
    ori = ori.replace('[[[[Premise]]]]: ',
                      '').replace('>>>>[[[[Hypothesis]]]]:', '')
    ori = ori.replace('[[', '').replace(']]', '')
    ori = ori.splitlines()

params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)
infersent.build_vocab_k_words(K)

adv_emb = infersent.encode(adv, tokenize=True)
ori_emb = infersent.encode(ori, tokenize=True)

result = [cos_sim(i, j) for i, j in zip(adv_emb, ori_emb)]
with open('../results/InferSent.txt', 'w') as f:
    f.write('\n'.join([str(i) for i in result]))

result = [distance(i, j) for i, j in zip(adv_emb, ori_emb)]
with open('../results/InferSent_distance.txt', 'w') as f:
    f.write('\n'.join([str(i) for i in result]))
Example #27
        model.load_state_dict(torch.load(MODEL_PATH))
        # Keep it on CPU or put it on GPU
        use_cuda = True
        model = model.cuda() if use_cuda else model
        W2V_PATH = '/home1/InferSent/oov_train_model.vec'
        model.set_w2v_path(W2V_PATH)
        # Load embeddings of K most frequent words
        # model.build_vocab_k_words(K=100000)
        model.build_vocab_k_words(K=2051129)  # Extract embedding word .

        # Load test sentences

        train_test = pd.read_csv('/home1/InferSent/testset.csv', header=None, delimiter=",", encoding='UTF-8')
        source_s = train_test[0][1:]
        target_s = train_test[1][1:]
        embeddings_source = model.encode(source_s, bsize=128, tokenize=False, verbose=True)
        print('nb source_s encoded : {0}'.format(len(embeddings_source)))
        embeddings_target = model.encode(target_s, bsize=128, tokenize=False, verbose=True)
        print('nb target_s encoded : {0}'.format(len(embeddings_target)))
        np.save('embeddings_source.npy', embeddings_source)
        np.save('embeddings_target.npy', embeddings_target)

    if args.cosine == True:
        source_np = np.load('embeddings_source.npy')
        target_np = np.load('embeddings_target.npy')
        print('Success vector load')
        # Load for checking the vector name.
        train_test = pd.read_csv('/home1/InferSent/testset.csv', header=None, delimiter=",", encoding='UTF-8')
        # ground-truth dataset
        source_s = train_test[0][1:]
        target_s = train_test[1][1:]
Example #28
def infer(inputs):
    radius = 0.09
    nlp = spacy.load("en_core_web_sm")
    sentences = []
    locations = []

    import json
    pass_in = json.loads(inputs)

    for call in pass_in:
        sentences.append(call['transcript'])
        locations.append((call['latitude'], call['longitude']))

    from models import InferSent
    V = 2
    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)

    ## The old, bag of filtered words, implementation follows
    # for i, sentence in enumerate(sentences):
    #     sentences[i] = nlp(' '.join([str(t) for t in nlp(sentence) if t.pos_ in ['NOUN', 'PROPN', 'ADJ']]))
    #
    # sentences_matrix = np.vstack([x.vector / norm(x.vector) for x in sentences])
    # ling_compatibility = np.matmul(sentences_matrix, np.transpose(sentences_matrix))
    # print(ling_compatibility)

    infersent.build_vocab(sentences, tokenize=True)
    embeddings = infersent.encode(sentences, tokenize=True)
    embeddings = embeddings / np.linalg.norm(
        embeddings, ord=2, axis=1, keepdims=True)

    ling_compatibility = np.matmul(embeddings, np.transpose(embeddings))

    #print(ling_compatibility)

    def intersection_area(d, r):
        if d == 0:  # the circles are the same
            return np.pi * r**2
        if d >= 2 * r:  # The circles don't overlap at all.
            return 0

        r2, d2 = r**2, d**2
        alpha = np.arccos(d2 / (2 * d * r))
        wow = 2 * r2 * alpha - r2 * np.sin(2 * alpha)
        return wow

    geo_compatibility = np.zeros((len(locations), len(locations)))
    for i in range(len(locations)):
        for k in range(i, len(locations)):
            geo_compatibility[i][k] = intersection_area(
                math.sqrt((locations[i][0] - locations[k][0])**2 +
                          (locations[i][1] - locations[k][1])**2),
                radius) / (math.pi * (2**2))

    from sklearn.cluster import KMeans
    total = np.multiply(ling_compatibility, geo_compatibility)
    #print(total.shape)
    #for i in range(len(locations)):
    #    for k in range(len(locations)):
    #        if i != k and total[i][k] > 0.65:
    #            print(str(i) + " and " + str(k) + " are the same incident")
    kmeany = KMeans(init='k-means++').fit(total)
    labels = kmeany.labels_.tolist()

    mapper = {}
    for call, label in enumerate(labels):
        mapper[call] = label

    class Analysis:
        def __init__(self, sentence):
            self.sentence = sentence
            self.nlpped = nlp(sentence)

            self.nouns = [
                str(t.lemma_) for t in self.nlpped if
                (t.pos_ in ['PROPN', 'NOUN'] and t.lemma_ not in ['I', 'help'])
            ]

            self.verbs = [
                str(t.lemma_) for t in self.nlpped if
                (t.pos_ in ['VERB', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
                 and t.lemma_ not in [
                     'be', 'have', 'do', 'say', 'go', 'get', 'make', 'know',
                     'think', 'take', 'help', 'may', 'fear', 'see', 'stop',
                     'reach', 'seem', 'hope', 'want', 'would', 'cause', 'let',
                     'like', 'will'
                 ])
            ]

    analyses = []
    for sentence in sentences:
        analyses.append(Analysis(sentence))
    d = []
    for n in set(mapper.values()):
        nouns = []
        for k in mapper.keys():
            if mapper[k] == n:
                nouns += analyses[k].nouns

        noun_counter = Counter(nouns)

        verbs = []
        for k in mapper.keys():
            if mapper[k] == n:
                verbs += analyses[k].verbs

        verb_counter = Counter(verbs)

        calls = []
        for k in mapper.keys():
            if mapper[k] == n:
                call = {
                    'transcript': sentences[k],
                    'file': pass_in[k]['file'],
                    'lat': locations[k][0],
                    'lon': locations[k][1],
                    'id': pass_in[k]['id']
                }

                calls.append(call)

        blah = [x[0] for x in verb_counter.most_common(3) if x[1] > 1
                ] + [x[0] for x in noun_counter.most_common(3) if x[1] > 1]
        if len(blah) == 0:
            blah = [x[0] for x in verb_counter.most_common(1)
                    ] + [x[0] for x in noun_counter.most_common(1)]

        d.append({'name': ' '.join(blah), 'calls': calls})

    return json.dumps(d)
ARGS = PARSER.parse_args()
question = ARGS.question
sentences = [question]
#### Load Facebook's InferSent (download the files from the internet)
infersent = InferSent({'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1})
infersent.load_state_dict(torch.load('/Users/petermyers/Desktop/Other/data/InferSent/encoder/infersent1.pkl'))
infersent.set_w2v_path('/Users/petermyers/Desktop/Other/data/GloVe/glove.840B.300d.txt')

# Extract the most relevant Wikipedia page
#### Wikipedia recommends 10 pages
wikipedia_pages = wikipedia.search(question)
sentences = sentences + wikipedia_pages
#### Convert sentences to numbers
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True, verbose=False)
#### Choose the most relevant pages
distances = pdist(np.array(embeddings), metric='euclidean')
sentence_similarity_matrix = squareform(distances)
most_relevant_pages = np.argsort(sentence_similarity_matrix[0][1:])
#### Extract the content on the most relevant page (tries multiple pages in case of failure)
for page in most_relevant_pages:
    try:
        content_on_the_page = wikipedia.page(wikipedia_pages[page]).content
        break
    except:
        pass

# Find and print the most relevant sentences
#### Split the content into sentences
sents = nltk.sent_tokenize(content_on_the_page)
Example #30
def extract_answer_IFST(story_data, question_and_ans_data, story_ids,
                        model_version, Vocab_Size):
    """ (1) get answer, then modify self.question_and_ans_data by add the answer to it. 
        (2) for each story id, extract its question, then look up in story_data, find the best sentence"""
    import re
    import pandas as pd

    import torch
    import numpy as np
    from models import InferSent

    #sentence_list=build_vocabulary(story_data)
    W2V_PATH = 'dataset/GloVe/glove.840B.300d.txt' if model_version == 1 else 'dataset/fastText/crawl-300d-2M.vec'
    MODEL_PATH = 'encoder/infersent%s.pkl' % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(W2V_PATH)
    if model_version == 3:
        sentence_list = build_vocabulary(story_data)
        model.build_vocab(sentence_list)
    else:
        model.build_vocab_k_words(K=Vocab_Size)

    for story_id in story_ids:
        story = story_data.loc[lambda df: df.story_id == story_id,
                               'story'].values[0]
        question_ids = question_and_ans_data.loc[
            lambda df: df.story_id == story_id, 'question_id']

        for question_id in question_ids:
            # get the question and answer
            question = question_and_ans_data.loc[
                lambda df: df.question_id == question_id, 'question'].values[0]
            if 'answer' in question_and_ans_data:
                answer = question_and_ans_data.loc[
                    lambda df: df.question_id == question_id,
                    'answer'].values[0]

            # InferSent's encode expects a list of sentences, so wrap the question string in a list
            question_encoded = model.encode([
                str(question_and_ans_data.loc[question_and_ans_data.index[
                    question_and_ans_data['question_id'] == question_id][0],
                    'question'])
            ])[0]

            ans = []
            for sent in story.sents:
                #sim = sent.similarity(question)
                sim = cosine(question_encoded, model.encode([str(sent)])[0])

                ans.append({
                    'question_id': question_id,
                    'answer_pred': sent,
                    'similarity': sim
                })

            ans = pd.DataFrame(ans).reindex(
                ['question_id', 'answer_pred', 'similarity'], axis=1)
            ans.sort_values(by=['similarity'], ascending=False, inplace=True)

            question_and_ans_data.loc[lambda df: df.question_id == question_id,
                                      'answer_pred'] = str(
                                          ans.iloc[0]['answer_pred']).replace(
                                              '\n', ' ')  #.text

    #question_and_ans_data['answer_pred'] = question_and_ans_data['answer_pred'].apply(TextBlob)

    return question_and_ans_data