Example #1
def create_embeddings(infer_path, data_path, em_type):
    yt_titles = yt.get_yt_titles()
    with open("data/whtitles", "r") as f:
        wh_titles = [line.rstrip('\n') for line in f]

    if em_type == "yt":  # Youtube
        save_f = os.path.join(data_path, "yt_embed")
        titles = yt_titles
    elif em_type == "wh":  # Wikihow
        save_f = os.path.join(data_path, "wh_embed")
        titles = wh_titles
    else:
        raise "Unknown embedding type: {}".format(em_type)

    nltk.download('punkt')
    V = 1
    MODEL_PATH = os.path.join(infer_path, 'encoder/infersent%s.pkl' % V)
    params_model = {
        'bsize': 256,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent = infersent.cuda()

    W2V_PATH = os.path.join(infer_path, 'GloVe/glove.840B.300d.txt')
    infersent.set_w2v_path(W2V_PATH)

    infersent.build_vocab(yt_titles + wh_titles, tokenize=True)
    embed = infersent.encode(titles, tokenize=True)
    np.save(save_f, embed)
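
# A minimal usage sketch for create_embeddings (illustrative only; it assumes the
# InferSent repo checkout lives in "InferSent/", that the yt helper module and the
# data/whtitles file from the surrounding project are available, and that a GPU is
# present since the model is moved to CUDA):
#
#   create_embeddings(infer_path="InferSent", data_path="data", em_type="yt")
#
# The call writes a (num_titles, 4096) float32 array to data/yt_embed.npy.
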
def load_inferSent(sentences):
    logger.info('load InferSent')
    V = 2
    MODEL_PATH = 'Infersent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    if torch.cuda.is_available():
        infersent.cuda()

    # set word vector
    if V == 1:
        W2V_PATH = 'Infersent/Glove/glove.840B.300d.txt'
        logger.warning('Use Glove Embedding')
    elif V == 2:
        W2V_PATH = 'Infersent/fastText/crawl-300d-2M.vec'
        logger.warning('Use fastText Embedding')
    else:
        raise NotImplementedError
    infersent.set_w2v_path(W2V_PATH)

    # build vocab
    infersent.build_vocab(sentences, tokenize=True)

    return infersent
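
# Example usage of load_inferSent (a sketch; the sentences are illustrative):
#
#   sentences = ["A man plays an instrument.", "The weather is nice today."]
#   model = load_inferSent(sentences)
#   embeddings = model.encode(sentences, tokenize=True)  # shape: (2, 4096)
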
Example #3
class InferSentFeatures:
    def __init__(self, lang_enc_dir, sentences):
        sys.path.insert(0, os.path.join(lang_enc_dir, 'InferSent/'))
        from models import InferSent

        version = 1
        MODEL_PATH = os.path.join(
            lang_enc_dir, 'InferSent/encoder/infersent%s.pkl' % version)
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version
        }
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(MODEL_PATH))

        W2V_PATH = os.path.join(lang_enc_dir, 'glove/glove.6B.300d.txt')
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab(sentences, tokenize=True)

    def generate_embeddings(self, sentences):
        embeddings = self.model.encode(sentences, tokenize=True)
        return embeddings
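
# A usage sketch for InferSentFeatures (lang_enc_dir is a hypothetical directory
# expected to contain both the InferSent/ repo and the glove/ word vectors):
#
#   corpus = ["The quick brown fox jumps over the lazy dog."]
#   feats = InferSentFeatures("/path/to/lang_encoders", corpus)
#   vectors = feats.generate_embeddings(corpus)  # (len(corpus), 4096) numpy array
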
def embed_sent(datafile):
    sentences = []
    with open(datafile, 'r') as f:
        i = 0
        for line in f:
            line = line.replace('\n', '')
            sentences.append(line)
            i += 1
            if i == 455820:
                break
    V = 1
    MODEL_PATH = 'infersent%s.pkl' % V
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'GloVe/glove.840B.300d.txt'
    infersent.set_w2v_path(W2V_PATH)

    infersent.build_vocab(sentences, tokenize=True)

    embeddings = infersent.encode(sentences, tokenize=True)

    np.savetxt("../../wiki-split/Data/Infersent_vectors/complex_sent", embeddings)
Example #5
class Encoder2:
    ''' Encoder based on InferSent '''

    WORD_VECTORS_FILE = 'crawl-300d-2M.vec'
    MODEL_FILE = 'infersent2.pkl'

    def __init__(self, word_vectors_dir, models_dir):
        word_vectors = os.path.join(word_vectors_dir, self.WORD_VECTORS_FILE)
        model_file = os.path.join(models_dir, self.MODEL_FILE)

        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 2
        }

        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(model_file))
        self.model.set_w2v_path(word_vectors)

    def start(self, texts):
        texts_list = texts.values.tolist()
        self.model.build_vocab(texts_list, tokenize=True)

    def close(self):
        pass

    def encode(self, texts_batch):
        texts_batch_list = texts_batch.values.tolist()
        texts_batch_vec = self.model.encode(texts_batch_list, tokenize=True)

        return texts_batch_vec
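
# Encoder2 expects pandas Series inputs, since it calls .values.tolist(); a sketch
# (the directory arguments are hypothetical):
#
#   import pandas as pd
#   enc = Encoder2(word_vectors_dir="vectors", models_dir="models")
#   texts = pd.Series(["First sentence.", "Second sentence."])
#   enc.start(texts)          # builds the vocabulary once
#   vecs = enc.encode(texts)  # (2, 4096) numpy array
#   enc.close()
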
Example #6
def infersent_embed_posts(posts, max_sent_cnt, embed_dim, data_fold_path):
    model_path = data_fold_path + 'word_sent_embed/infersent2.pickle'
    word_emb_path = data_fold_path + 'word_sent_embed/fasttext.vec'
    posts_arr = np.zeros((len(posts), max_sent_cnt, embed_dim))

    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    model.set_w2v_path(word_emb_path)

    all_sents = []
    for sens in posts:
        all_sents.extend(sens)

    model.build_vocab(all_sents, tokenize=False)

    for ind, sentences in enumerate(posts):
        embeddings = model.encode(sentences, tokenize=False, verbose=False)
        l = min(max_sent_cnt, len(sentences))
        posts_arr[ind, :l, :] = embeddings[:l]

    return posts_arr
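
# infersent_embed_posts expects `posts` as a list of posts, each post being a list
# of sentence strings (already whitespace-tokenized, since tokenize=False). It
# returns an array of shape (len(posts), max_sent_cnt, embed_dim), zero-padded for
# posts with fewer than max_sent_cnt sentences. With the parameters above the
# encoder output is 4096-dimensional (2 * enc_lstm_dim from max pooling), so
# embed_dim should be 4096. A sketch with a hypothetical data folder:
#
#   arr = infersent_embed_posts(posts, max_sent_cnt=10, embed_dim=4096,
#                               data_fold_path="data/")
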
Example #7
    def prepare(model_path: str,
                word_vecs: str,
                out_path: str,
                sentences: Union[str, List[str]] = None,
                max_vocab: int = 0):
        """
        this method is for adapting the vocabulary,
        :param model_path: unadapted model state
        :param word_vecs: word vectors
        :param out_path: where to store the state
        :param sentences: training sentences for scanning the vocabulary
        :param max_vocab: maximum vocabulary size (optional)
        :return:
        """
        assert bool(sentences) != bool(
            max_vocab), 'Either sentences or max_vocab should be given'

        model = InferSent(config=MODEL_CONF)
        log.info(f"Loading state from {out_path}")

        model.load_state_dict(torch.load(model_path))
        log.info(f"Loading word vecs from {out_path}")
        model.set_w2v_path(word_vecs)
        if sentences:
            if type(sentences) is not list:
                sentences = list(read_lines(sentences))
            log.info("Building vocabulary from sentences")
            model.build_vocab(sentences, tokenize=True)
        if max_vocab:
            log.info(f"Pruning vocabulary to top {max_vocab} types")
            model.build_vocab_k_words(K=max_vocab)
        log.info(f"Saving at {out_path}")

        state = SentenceEncoder._get_state(model)
        torch.save(state, out_path)
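
# A hedged usage sketch for prepare (SentenceEncoder, MODEL_CONF and the file
# names below come from the surrounding project and are illustrative only; note
# that exactly one of sentences / max_vocab may be given):
#
#   SentenceEncoder.prepare(model_path="infersent2.pkl",
#                           word_vecs="crawl-300d-2M.vec",
#                           out_path="infersent2.adapted.pkl",
#                           max_vocab=100000)
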
def calcule_eucl(text, question):
    blob = TextBlob("".join(text))
    sentences = [item.raw for item in blob.sentences]

    V = 2
    MODEL_PATH = 'InferSent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'InferSent/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab(sentences, tokenize=True)

    dict_embeddings = {}
    for i in range(len(sentences)):
        dict_embeddings[sentences[i]] = infersent.encode([sentences[i]],
                                                         tokenize=True)
    # Encode the question once, after the sentence loop
    encode_question = infersent.encode([question], tokenize=True)
    eucl = eucl_sim(dict_embeddings, encode_question)

    return sentences, eucl
Example #9
  def getSentenceVector(doc, model_params: dict = {}, encoder="distilbert", model_name='distilbert-base-nli-mean-tokens'):
  
    sp = spacy.load('en_core_web_sm')
    tokenized = sp(doc)
    sentences = []
    for token in tokenized.sents:
      sentences.append(token.text)

    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta', 'bart']:
      # Use encoder for mapping tokens to embeddings
      word_embedding_model = models.Transformer(model_name, 
                  tokenizer_args= model_params['tokenizer_args'] if 'tokenizer_args' in model_params else {})
      # Apply mean pooling to get one fixed sized sentence vector
      pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                     pooling_mode_mean_tokens=True,
                                     pooling_mode_cls_token=False,
                                     pooling_mode_max_tokens=False)
      model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   
      sentence_embeddings = model.encode(sentences)
    

    elif encoder == 'use':
      #!pip install embedding-as-service
      from embedding_as_service.text.encode import Encoder
      en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
      sentence_embeddings = en.encode(texts=sentences)


    elif encoder == 'infersent':
      import nltk
      nltk.download('punkt')
      from models import InferSent
      params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                      'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
      infersent = InferSent(params_model)
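      # Note: as written, this encoder runs with randomly initialized weights; the
      # pretrained state (e.g. infersent2.pkl) would normally be loaded here with
      # infersent.load_state_dict(torch.load(...)) before encoding.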
      W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
      infersent.set_w2v_path(W2V_PATH)
      infersent.build_vocab(sentences, tokenize=True)
      sentence_embeddings = infersent.encode(sentences, tokenize=True)


    elif encoder == 'sent2vec':
      import sent2vec
      model = sent2vec.Sent2vecModel()
      model.load_model('drive/My Drive/torontobooks_unigram.bin') 
      sentence_embeddings = model.embed_sentences(sentences)
   

    elif encoder == 'laser':
      from laserembeddings import Laser
      laser = Laser()  ## Also used for multilingual sentence embeddings
      sentence_embeddings = laser.embed_sentences(sentences, lang='en') 
  
  
    else:
      raise ValueError('Invalid encoder {} or encoder Unavailable.'.format(encoder))  
  
    return list(zip(sentences, sentence_embeddings))
Example #10
def infersent_flat_embed_posts(posts, embed_dim, data_fold_path):
    model_path = data_fold_path + 'word_sent_embed/infersent2.pickle'
    word_emb_path = data_fold_path + 'word_sent_embed/fasttext.vec'

    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 2
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    model.set_w2v_path(word_emb_path)

    model.build_vocab(posts, tokenize=False)
    return model.encode(posts, tokenize=False, verbose=False)
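
# Unlike infersent_embed_posts above, this flat variant treats `posts` as a flat
# list of sentence strings and returns a single (len(posts), 4096) array with no
# per-post padding. A sketch (data_fold_path is hypothetical):
#
#   embs = infersent_flat_embed_posts(["first post text", "second post text"],
#                                     embed_dim=4096, data_fold_path="data/")
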
Example #11
class Infersent:
    def __init__(self):

        V = 2
        MODEL_PATH = 'encoder/infersent%s.pkl' % V
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': V
        }

        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(torch.load(MODEL_PATH))
        self.infersent.set_w2v_path('fastText/crawl-300d-2M.vec')

    def get(self, sentences):
        self.infersent.build_vocab(sentences, tokenize=True)

        return self.infersent.encode(sentences, tokenize=True)
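
# A usage sketch for the Infersent wrapper above (relies on the relative paths in
# __init__, i.e. encoder/infersent2.pkl and fastText/crawl-300d-2M.vec):
#
#   wrapper = Infersent()
#   vectors = wrapper.get(["How do I reset my password?",
#                          "Where can I change my password?"])
#   # vectors is a (2, 4096) numpy array; cosine similarity between the rows gives
#   # a rough semantic-similarity score for the two questions.
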
Example #12
def no_stopwords():
    infersent2 = InferSent(params_model)
    infersent2.load_state_dict(torch.load(MODEL_PATH))
    infersent2.set_w2v_path(W2V_PATH)
    use_cuda = True
    infersent2 = infersent2.cuda() if use_cuda else infersent2
    pdss = pd.DataFrame(columns=['embds', 'set', 'catg'])
    start = time.time()
    global current_idx
    for x in range(3):
        crix = current_idx
        abss, catg, sets, crix = get_batch_from_dataframe(crix)
        for index in range(len(abss)):
            doc = nlp(abss[index])
            strs_after_stop_arr = []
            for token in doc:
                if not token.is_stop:
                    strs_after_stop_arr.append(token.text)

            abss[index] = ' '.join(strs_after_stop_arr)

        if x == 0:
            infersent2.build_vocab(abss, tokenize=True)
        else:
            infersent2.update_vocab(abss, tokenize=True)

        embed = infersent2.encode(abss, tokenize=True)
        df2 = pd.DataFrame({
            'embds': embed.tolist(),
            'set': sets,
            'catg': catg
        })
        pdss = pdss.append(df2, ignore_index=True)

        current_idx = crix
    end = time.time() - start
    print("Time without stopwords", end)
    pdss.to_csv("/home/psrivastava/Intern_Summer/data/embeds_no_stopwords.csv")
Example #13
# For Load encoder
encoder = None
if params.encoder_path and params.encoder_type == 'InferSent':
    
    params_model = {
        'bsize': params.batch_size,
        'word_emb_dim': params.word_emb_dim,
        'enc_lstm_dim': params.enc_lstm_dim,
        'pool_type': params.pool_type,
        'dpout_model': params.dpout_model,
        'version': params.model_version
    }
    encoder = InferSent(params_model)
    encoder.load_state_dict(torch.load(params.encoder_path))
    encoder.set_w2v_path(params.vector_rep)
    
    if params.vocab_samples.isdigit():
        print("Build vocab from K samples")
        encoder.build_vocab_k_words(K=int(params.vocab_samples))
    else:
        print("Build vocab from full file")
        encoder.build_vocab(K=params.vocab_samples)

    print("========TEST encoder=======")
    print(encoder.encode(['the cat eats.']))
    
    encoder.to(device)
    
    


# model config
config_nli_model = {
    'n_words'        :  len(word_vec)         ,
    'word_emb_dim'   :  params.word_emb_dim   ,
    'enc_lstm_dim'   :  params.enc_lstm_dim   ,
    'n_enc_layers'   :  params.n_enc_layers   ,
    model.set_w2v_path(args.w2v_path)

    # Ensure directory
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    # Read files and extract features
    for fpath in args.files:
        print('Reading file {}'.format(fpath))
        sents = []
        with open(fpath) as f:
            for line in f:
                line = line.strip()
                assert line, 'Empty line in {}'.format(fpath)
                sents.append(line)

        # Set output file name
        out_name = os.path.join(
            args.out_dir, "{}.embs.npy".format(os.path.basename(fpath)))

        # Build vocab
        print('Building vocabulary')
        model.build_vocab(sents, args.tokenize)

        # Get embeddings
        embs = model.encode(sents, tokenize=args.tokenize,
                            verbose=True, bsize=args.batch_size)

        print('Saving to {}'.format(out_name))
        np.save(out_name, embs)
Example #15
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))


# In[19]:


W2V_PATH = 'Documents/FastText/crawl-300d-2M.vec/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)


# In[20]:


infersent.build_vocab(train_doc, tokenize=True)


# In[21]:


embeddings = infersent.encode(train_doc, tokenize=True)


# In[22]:


infersent.visualize('A man plays an instrument.', tokenize=True)


# In[31]:
Example #16
    for word in f:
        english_long.add(word.strip())

df = pd.read_csv(
    "../../../models_storage/word_embeddings_data/ocr_text_with_tags_10000.csv"
)
df = df[df.text.isna() == False]  #filtering out rows with NA's for text
#df = df[:50] #take this line out if it works
#df.text = df.text.apply(lambda x: x[:10000] if len(x) > 10000 else x)

# Create useful lists using above functions:
stop_words_list = stopwords_make()
punctstr = punctstr_make()
unicode_list = unicode_make()

model.build_vocab(df.text)
print("Vocabulary loading complete!")


#writing function for common cosine similarity
def doc_words_cosine(i, t):
    emb = embeddings[i]
    if t == 'culture':
        word_vec_avg = np.sum(culture_embeddings, axis=0) / len(culture)
    elif t == 'demographic':
        word_vec_avg = np.sum(demographic_embeddings,
                              axis=0) / len(demographic)
    elif t == 'relational':
        word_vec_avg = np.sum(relational_embeddings, axis=0) / len(relational)
    return absolute(dot(emb, word_vec_avg) / (norm(emb) * norm(word_vec_avg)))
Example #17
def answer_the_question():
    print("***********************************************")

    #print(request.form)

    print("***********************************************")

    input_info = request.form['cont']
    #print(request.args['data'])
    question = request.form['question']
    #question = 'where are you.'
    #input_info = [['I am here.'],['f**k ooff']]
    print("___________________________________________________________")
    print(question)
    print(len(input_info))
    print("_________________________________________________________________")
    MODEL_PATH = 'encoder/infersent1.pkl'
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 1
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'Glove/glove/glove.42B.300d.txt'
    infersent.set_w2v_path(W2V_PATH)

    sentences = []
    sentences.append(convert(question))
    li = input_info.split('.')
    for k in li:
        if len(k) > 4:
            k = convert(k)
            sentences.append(k)
    print(
        "_____________________________________________________________________________"
    )
    print(len(sentences))
    print(
        '__________________________________________________________________________'
    )
    infersent.build_vocab(sentences, tokenize=True)

    dict_embeddings = {}
    for i in range(len(sentences)):
        try:
            dict_embeddings[sentences[i]] = infersent.encode([sentences[i]],
                                                             tokenize=True)
        except:
            continue

    # print(dict_embeddings[sentences[i]])
    li_of_dis = []
    for a2 in sentences:
        try:
            li_of_dis.append(
                spatial.distance.cosine(dict_embeddings[sentences[0]],
                                        dict_embeddings[a2]))
        except:
            li_of_dis.append(1.00)
    mini_d = 1
    x = 0
    print(li_of_dis)

    for i in range(1, len(li_of_dis)):
        if (li_of_dis[i] < mini_d and li_of_dis[i] > 0.05):
            mini_d = li_of_dis[i]
            x = i

    ans_s = sentences[x]
    print(
        "oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo"
    )
    if (x + 3 < len(sentences)):
        ans_s = ' ' + sentences[x + 1] + ' ' + sentences[
            x + 2] + ' ' + sentences[x + 3]

    return jsonify(ans=ans_s)
Example #18
    raise NotImplementedError
infersent.set_w2v_path(W2V_PATH)

# read data
refs = []
with open(args.golden, 'r') as f:
    for line in f:
        refs.append(line[:-1])

hyps = []
with open(args.generated, 'r') as f:
    for line in f:
        hyps.append(line[:-1])

# build vocab
infersent.build_vocab(refs+hyps, tokenize=True)

# get embeddings
refs_embeds = infersent.encode(refs, tokenize=True)
hyps_embeds = infersent.encode(hyps, tokenize=True)

# compute cosine similarity
refs_norm = np.linalg.norm(refs_embeds, ord=2, axis=1)
hyps_norm = np.linalg.norm(hyps_embeds, ord=2, axis=1)  

cosine = np.sum((refs_embeds*hyps_embeds), axis=1)/refs_norm/hyps_norm

if args.output_file is not None:
    with open(args.output_file, 'a') as f:
        print(json.dumps({'embedding_cosin':float(np.mean(cosine))}), file=f)
else:
Example #19
class LCPR_I:
    def __init__(self):
        self.filename = "LCP/lcpr_i.sav"
        self.cmudict = cmudict.dict()
        self.wnlp = WonderlicNLP()
        self.embeddings_index = {}
        self.wiki_top10 = [
            word[0].split()[0]
            for word in pd.read_csv("LCP/wiki_top10.csv").values
        ][:10001]
        self.infersent_model_path = 'LCP/infersent%s.pkl' % 1
        self.infersent_model_params = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(self.infersent_model_params)
        self.model = RandomForestRegressor(n_estimators=100)

    #InferSent setup (boilerplate code from InferSent's repository):
    def initialize_infersent(self, sentences):
        print("INITIALIZING INFERSENT...", datetime.now().strftime("%H:%M:%S"))
        self.infersent.load_state_dict(torch.load(self.infersent_model_path))
        w2v_path = 'LCP/glove.42B.300d.txt'
        self.infersent.set_w2v_path(w2v_path)
        self.infersent.build_vocab(sentences, tokenize=True)
        print("INFERSENT READY!", datetime.now().strftime("%H:%M:%S"))

    def infersent_embedding(self, sentence):
        return self.infersent.encode(sentence, tokenize=True)

    # GloVe setup:
    def initialize_glove(self):
        print("INITIALIZING GLOVE...", datetime.now().strftime("%H:%M:%S"))
        f = open('LCP/glove.42B.300d.txt', encoding="utf8")
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()
        print("GLOVE READY!", datetime.now().strftime("%H:%M:%S"))

    def glove_embedding(self, word):
        embedding = [
            emb for emb in self.embeddings_index[str(word).lower()]
        ] if str(word).lower() in self.embeddings_index.keys() else [
            -1 for i in range(300)
        ]
        return embedding

    def find_word_pos(self, word, tokens):
        lemmatizer = WordNetLemmatizer()
        search_tokens = [lemmatizer.lemmatize(word) for word in tokens]
        if word in tokens:
            return tokens.index(word)
        elif word in search_tokens:
            return search_tokens.index(word)
        else:
            return None

    # Used to find the index of the word in the sentence
    def extract_features(self, data):
        features = defaultdict(list)
        for id in tqdm(data.index, desc="PROCESSING DATA"):
            raw_token = "null" if str(data.loc[id]["token"]) == "nan" else str(
                data.loc[id]["token"])
            token = raw_token.lower()
            sent = data.loc[id]["sentence"]
            mrc_features = self.wnlp.get_mrc_features(token)
            glove = self.glove_embedding(token)
            infersent = self.infersent_embedding([sent])[0]

            # Sentence InferSent embedding:
            for i in range(1, 4097):
                features[f"infersent{i}"].append(infersent[i - 1])

            # Word GloVe embedding:
            for i in range(1, 301):
                features[f"glove{i}"].append(glove[i - 1])

            # MRC features:
            features["word_length"].append(mrc_features["Nlet"])
            features["syl_count"].append(mrc_features["Nsyl"])
            features["brown_freq"].append(mrc_features["Brown-freq"])
            features["familiarity"].append(mrc_features["Fam"])
            features["concreteness"].append(mrc_features["Conc"])
            features["imagability"].append(mrc_features["Imag"])
            features["meaningfulness_c"].append(mrc_features["Meanc"])
            features["meaningfulness_p"].append(mrc_features["Meanp"])
            features["age_of_aquisition"].append(mrc_features["AOA"])

            features["wiki_freq"].append(int(token in self.wiki_top10))

        return features

    def fit(self, train_data, train_labels):
        print("TRAINING...", datetime.now().strftime("%H:%M:%S"))
        self.initialize_glove()
        self.initialize_infersent(train_data["sentence"])
        features = self.extract_features(train_data)
        self.model.fit(pd.DataFrame(features), train_labels)
        print("TRAINING DONE!", datetime.now().strftime("%H:%M:%S"))

    def to_likert(self, prediction):
        if prediction >= 0 and prediction < 0.2:
            return 1
        elif prediction >= 0.2 and prediction < 0.4:
            return 2
        elif prediction >= 0.4 and prediction < 0.6:
            return 3
        elif prediction >= 0.6 and prediction < 0.8:
            return 4
        else:
            return 5

    def predict(self, test_data, development=False):
        print("LOOKING INTO THE ORB...", datetime.now().strftime("%H:%M:%S"))
        self.infersent.update_vocab(test_data["sentence"], tokenize=True)
        tokens = test_data["token"]
        predictions = self.model.predict(
            pd.DataFrame(self.extract_features(test_data)))
        if not development:
            for i in range(len(predictions)):
                print(
                    f"{tokens[i]} is a {self.to_likert(predictions[i])} on the Likert scale."
                )
        return predictions

    def score(self, train_data, train_labels):
        print("SCORING MODEL...", datetime.now().strftime("%H:%M:%S"))
        return self.model.score(
            pd.DataFrame(self.extract_features(train_data)), train_labels)

    def metrics(self, test_data, test_labels):
        labels_pred = self.predict(test_data, True)
        mae = mean_absolute_error(test_labels, labels_pred)
        rmse = math.sqrt(mean_squared_error(test_labels, labels_pred))
        print("MAE:", mae)
        print("RMSE:", rmse)

    def save(self):
        pickle.dump([self.model, self.embeddings_index, self.infersent],
                    open(self.filename, "wb"))

    def load(self):
        data = pickle.load(open(self.filename, "rb"))
        self.model = data[0]
        self.embeddings_index = data[1]
        self.infersent = data[2]
Example #20
def main():

    # Dictionary for Final Rankings.
    ranking = dict()

    print("\n CSI 4107 - Microblog information retrieval system \n")

    print("\n Importing Query Files and Documents... \n")

    # Load the tweet list.
    # {'34952194402811904': 'Save BBC World Service from Savage Cuts http://www.petitionbuzz.com/petitions/savews', ...}
    tweets_dict = importTweets()

    # Load the list of queries.
    # {1: ['bbc', 'world', 'servic', 'staff', 'cut'], ...}
    queries_dict = importQuery()

    print("\n Importing Done! \n")

    print("\n Initializing InferSent Model... \n")

    # Initialize InferSent Model.
    infersent = InferSent(params_model)

    # Load Infersent v1 Model Encoder.
    infersent.load_state_dict(torch.load(MODEL_PATH))

    # Use GPU Mode
    infersent = infersent.cuda() if USE_CUDA else infersent

    # Load Pre-trained GloVe Model.
    infersent.set_w2v_path(W2V_PATH)

    print("\n InferSent Initialization Done! \n")

    print("\n Building Vocabulary from Tweets... \n")

    # Deconstruct the dictionary of Documents to Document ID, and Document Contents.
    tweets = list(tweets_dict.values())
    tweet_ids = list(tweets_dict.keys())

    # Deconstruct the dictionary of Queries to Query Contents, since we can replicate Query ID.
    queries = list(queries_dict.values())

    # Build the Infersent Vocabulary based on all the Documents' Contents.
    infersent.build_vocab(tweets, tokenize=False)

    print("\n Vocabulary Completed! \n")

    print("\n Building Document & Query Vectors... \n")

    doc_embeddings = infersent.encode(tweets,
                                      bsize=128,
                                      tokenize=False,
                                      verbose=True)
    query_embeddings = infersent.encode(queries,
                                        bsize=128,
                                        tokenize=False,
                                        verbose=True)

    print("\n Building Document & Query Vectors Done! \n")

    print("\n Retrieval and Ranking... \n")

    dranking = dict()

    for query_id in range(len(queries)):
        print(dranking)
        # Encoded array starts at 0 for first chronological document.
        current_document = 0

        # Calculate the Cosine Similarity between the current Query, and corpus of Documents.
        for tweet_id in tweet_ids:
            # Cosine similarity for this (tweet, query) pair
            dranking[tweet_id] = cosine(doc_embeddings[current_document],
                                        query_embeddings[query_id])
            current_document += 1

        # Put the ranking of Documents in Descending order into ranking.
        ranking[query_id + 1] = {
            k: v
            for k, v in sorted(dranking.items(),
                               key=lambda dranking: dranking[1],
                               reverse=True)[:1000]
        }

        # Create the resulting file.
        print("Query " + str(query_id) + " Done.")
        dranking.clear()

    resultFileCreation(ranking)

    print("\n Retrieval and Ranking Done! \n")
Example #21
def extract_answer_IFST(story_data, question_and_ans_data, story_ids,
                        model_version, Vocab_Size):
    """ (1) get answer, then modify self.question_and_ans_data by add the answer to it. 
        (2) for each story id, extract its question, then look up in story_data, find the best sentence"""
    import re
    import pandas as pd

    import torch
    import numpy as np
    from models import InferSent

    #sentence_list=build_vocabulary(story_data)
    W2V_PATH = 'dataset/GloVe/glove.840B.300d.txt' if model_version == 1 else 'dataset/fastText/crawl-300d-2M.vec'
    MODEL_PATH = 'encoder/infersent%s.pkl' % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(W2V_PATH)
    if model_version == 3:
        sentence_list = build_vocabulary(story_data)
        model.build_vocab(sentence_list)
    else:
        model.build_vocab_k_words(K=Vocab_Size)

    for story_id in story_ids:
        story = story_data.loc[lambda df: df.story_id == story_id,
                               'story'].values[0]
        question_ids = question_and_ans_data.loc[
            lambda df: df.story_id == story_id, 'question_id']

        for question_id in question_ids:
            # get the question and answer
            question = question_and_ans_data.loc[
                lambda df: df.question_id == question_id, 'question'].values[0]
            if 'answer' in question_and_ans_data:
                answer = question_and_ans_data.loc[
                    lambda df: df.question_id == question_id,
                    'answer'].values[0]

            # encode() expects a list of sentences; take the single resulting row
            question_encoded = model.encode([str(question)])[0]

            ans = []
            for sent in story.sents:
                #sim = sent.similarity(question)
                sim = cosine(question_encoded, model.encode([str(sent)])[0])

                ans.append({
                    'question_id': question_id,
                    'answer_pred': sent,
                    'similarity': sim
                })

            ans = pd.DataFrame(ans).reindex(
                ['question_id', 'answer_pred', 'similarity'], axis=1)
            ans.sort_values(by=['similarity'], ascending=False, inplace=True)

            question_and_ans_data.loc[lambda df: df.question_id == question_id,
                                      'answer_pred'] = str(
                                          ans.iloc[0]['answer_pred']).replace(
                                              '\n', ' ')  #.text

    #question_and_ans_data['answer_pred'] = question_and_ans_data['answer_pred'].apply(TextBlob)

    return question_and_ans_data
Example #22
class InferSentEmbeddings(EmbeddingBaseClass, FlairDocumentEmbeddings):
    """
    Class to infer the InferSent embeddings to flair sentences. cf.
    `here <https://github.com/facebookresearch/InferSent>`_
    """
    def __init__(self, version=1):
        super().__init__()

        self.version = version
        if version == 1:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                            'word_embeddings',
                                            'glove.840B.300d',
                                            'glove.840B.300d.txt')
        if version == 2:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                            'word_embeddings', 'crawl-300d-2M',
                                            'crawl-300d-2M.vec')

        self.MODEL_PATH = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                       'word_embeddings',
                                       'infersent%s' % version,
                                       'infersent%s.pkl' % version)

        # Set up logger
        logging.basicConfig(format='%(asctime)s : %(message)s',
                            level=logging.DEBUG)

        # Load InferSent model
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version
        }

        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(self.MODEL_PATH))
        self.model.set_w2v_path(self.PATH_TO_W2V)

        self._embedding_length: int = 2 * params_model['enc_lstm_dim']  # max-pooled BiLSTM output is 2 * enc_lstm_dim = 4096

        self.name = f"{self.__class__.__name__ }_v{self.version}"
        self.static_embeddings = True

    @property
    def embedding_length(self) -> int:
        return self._embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]):
        everything_embedded: bool = True
        infersent_sentences = []

        for sentence in sentences:
            if self.name not in sentence._embeddings.keys():
                everything_embedded = False

        if not everything_embedded:
            for sentence in sentences:
                infersent_sentences.append(sentence.to_tokenized_string())

            self.model.build_vocab(infersent_sentences, tokenize=False)
            self.model.update_vocab(infersent_sentences, tokenize=False)
            embeddings = self.model.encode(infersent_sentences, tokenize=False)

            for sentence, sentence_embedding in zip(sentences, embeddings):
                sentence.set_embedding(self.name,
                                       torch.tensor(sentence_embedding))
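
# A usage sketch, assuming FlairDocumentEmbeddings is flair's DocumentEmbeddings
# base class and NLP_MODELS_PATH points at the pretrained files described above:
#
#   from flair.data import Sentence
#   embedder = InferSentEmbeddings(version=2)
#   sentence = Sentence("A man plays an instrument.")
#   embedder.embed([sentence])          # dispatches to _add_embeddings_internal
#   vector = sentence.get_embedding()   # torch tensor with the InferSent encoding
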
Example #23
# text_unpacked_short = [t[:2000] for t in text_unpacked]
# df.text_unpacked = [' '.join(t) for t in text_unpacked_short]


def shorten_text(x):
    t = ast.literal_eval(x)
    if len(t) > 2000:
        return ' '.join(t[:2000])
    else:
        return ' '.join(t)


df['text_unpacked'] = df.text.apply(shorten_text)

#df.text_unpacked = df.text.apply(lambda x: ' '.join(ast.literal_eval(x)))
model.build_vocab(df.text_unpacked)
print("Vocabulary loading complete!")


#writing function for common cosine similarity
def doc_words_cosine(i, t):
    emb = embeddings[i]
    if t == 'culture':
        word_vec_avg = np.sum(culture_embeddings, axis=0) / len(culture)
    elif t == 'demographic':
        word_vec_avg = np.sum(demographic_embeddings,
                              axis=0) / len(demographic)
    elif t == 'relational':
        word_vec_avg = np.sum(relational_embeddings, axis=0) / len(relational)
    return absolute(dot(emb, word_vec_avg) / (norm(emb) * norm(word_vec_avg)))
Example #24
V = 1
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}

model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = 'final_text_vectors.txt'
model.set_w2v_path(W2V_PATH)
model.build_vocab(sentences, tokenize=True)  #build_vocab_k_words(K=100000)

embeddings = model.encode(
    sentences,
    tokenize=True)  #(sentences, bsize=168, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

sen_vec = preprocessing.normalize(embeddings)
sen_vec = Variable(torch.from_numpy(sen_vec))
#sen_vec = nn.Linear(4096,300)
model = net()
n = (1, 300)
nparray = np.zeros(n)
for i in sen_vec:
    out = model(i)
    out = out.data.numpy()
Example #25
def getDocumentEmbedding(doc, model_params: dict = {}, encoder = 'xlnet', model_name = 'xlnet-base-uncased'):
  #model = SentenceTransformer(model_name, model_params)
  #sentence_embedding = model.encode(doc)

  ## Word tokenizer
  from spacy.lang.en import English
  nlp = English()
  # Create a Tokenizer with the default settings for English including punctuation rules and exceptions
  tokenizer = nlp.Defaults.create_tokenizer(nlp)
  tokens = tokenizer("This is a sentence")
  if len(tokens) > getMaxLength(encoder):
    warnings.warn("The input sequence length exceeds the maximum limit.", Warning)



  if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta', 'bart', 'finbert']:
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   
    sentence_embeddings = model.encode(doc)
    

  elif encoder == 'use':
    #!pip install embedding-as-service
    from embedding_as_service.text.encode import Encoder
    en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
    sentence_embeddings = en.encode(texts=doc)


  elif encoder == 'infersent':
    import nltk
    nltk.download('punkt')
    from models import InferSent
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
    infersent = InferSent(params_model)
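    # Note: as written, this encoder runs with randomly initialized weights; the
    # pretrained state (e.g. infersent2.pkl) would normally be loaded here with
    # infersent.load_state_dict(torch.load(...)) before encoding.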
    W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab(doc, tokenize=True)
    sentence_embeddings = infersent.encode(doc, tokenize=True)


  elif encoder == 'sent2vec':
    import sent2vec
    model = sent2vec.Sent2vecModel()
    model.load_model('drive/My Drive/torontobooks_unigram.bin') 
    sentence_embeddings = model.embed_sentences(doc)



  elif encoder == 'laser':
    from laserembeddings import Laser
    laser = Laser()  ## Also used for multilingual sentence embeddings
    sentence_embeddings = laser.embed_sentences(doc, lang='en')


  return sentence_embeddings
Example #26
def infer(inputs):
    radius = 0.09
    nlp = spacy.load("en_core_web_sm")
    sentences = []
    locations = []

    import json
    pass_in = json.loads(inputs)

    for call in pass_in:
        sentences.append(call['transcript'])
        locations.append((call['latitude'], call['longitude']))

    from models import InferSent
    V = 2
    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)

    ## The old, bag of filtered words, implementation follows
    # for i, sentence in enumerate(sentences):
    #     sentences[i] = nlp(' '.join([str(t) for t in nlp(sentence) if t.pos_ in ['NOUN', 'PROPN', 'ADJ']]))
    #
    # sentences_matrix = np.vstack([x.vector / norm(x.vector) for x in sentences])
    # ling_compatibility = np.matmul(sentences_matrix, np.transpose(sentences_matrix))
    # print(ling_compatibility)

    infersent.build_vocab(sentences, tokenize=True)
    embeddings = infersent.encode(sentences, tokenize=True)
    embeddings = embeddings / np.linalg.norm(
        embeddings, ord=2, axis=1, keepdims=True)

    ling_compatibility = np.matmul(embeddings, np.transpose(embeddings))

    #print(ling_compatibility)

    def intersection_area(d, r):
        if d == 0:  # the circles are the same
            return np.pi * r**2
        if d >= 2 * r:  # The circles don't overlap at all.
            return 0

        r2, d2 = r**2, d**2
        alpha = np.arccos(d2 / (2 * d * r))
        wow = 2 * r2 * alpha - r2 * np.sin(2 * alpha)
        return wow

    geo_compatibility = np.zeros((len(locations), len(locations)))
    for i in range(len(locations)):
        for k in range(i, len(locations)):
            geo_compatibility[i][k] = intersection_area(
                math.sqrt((locations[i][0] - locations[k][0])**2 +
                          (locations[i][1] - locations[k][1])**2),
                radius) / (math.pi * (2**2))

    from sklearn.cluster import KMeans
    total = np.multiply(ling_compatibility, geo_compatibility)
    #print(total.shape)
    #for i in range(len(locations)):
    #    for k in range(len(locations)):
    #        if i != k and total[i][k] > 0.65:
    #            print(str(i) + " and " + str(k) + " are the same incident")
    kmeany = KMeans(init='k-means++').fit(total)
    labels = kmeany.labels_.tolist()

    mapper = {}
    for call, label in enumerate(labels):
        mapper[call] = label

    class Analysis:
        def __init__(self, sentence):
            self.sentence = sentence
            self.nlpped = nlp(sentence)

            self.nouns = [
                str(t.lemma_) for t in self.nlpped if
                (t.pos_ in ['PROPN', 'NOUN'] and t.lemma_ not in ['I', 'help'])
            ]

            self.verbs = [
                str(t.lemma_) for t in self.nlpped if
                (t.pos_ in ['VERB', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
                 and t.lemma_ not in [
                     'be', 'have', 'do', 'say', 'go', 'get', 'make', 'know',
                     'think', 'take', 'help', 'may', 'fear', 'see', 'stop',
                     'reach', 'seem', 'hope', 'want', 'would', 'cause', 'let',
                     'like', 'will'
                 ])
            ]

    analyses = []
    for sentence in sentences:
        analyses.append(Analysis(sentence))
    d = []
    for n in set(mapper.values()):
        nouns = []
        for k in mapper.keys():
            if mapper[k] == n:
                nouns += analyses[k].nouns

        noun_counter = Counter(nouns)

        verbs = []
        for k in mapper.keys():
            if mapper[k] == n:
                verbs += analyses[k].verbs

        verb_counter = Counter(verbs)

        calls = []
        for k in mapper.keys():
            if mapper[k] == n:
                call = {
                    'transcript': sentences[k],
                    'file': pass_in[k]['file'],
                    'lat': locations[k][0],
                    'lon': locations[k][1],
                    'id': pass_in[k]['id']
                }

                calls.append(call)

        blah = [x[0] for x in verb_counter.most_common(3) if x[1] > 1
                ] + [x[0] for x in noun_counter.most_common(3) if x[1] > 1]
        if len(blah) == 0:
            blah = [x[0] for x in verb_counter.most_common(1)
                    ] + [x[0] for x in noun_counter.most_common(1)]

        d.append({'name': ' '.join(blah), 'calls': calls})

    return json.dumps(d)
import pandas as pd
import spacy
import nltk
import numpy as np
import torch
from models import InferSent
df = pd.read_csv("/home/psrivastava/Intern_Summer/data/new_output.csv")
abs_arr = df.loc[:4, 'clean_text']  # .ix is removed in recent pandas; .loc is the label-based equivalent
nlp = spacy.load("en_core_web_sm")
MODEL_PATH = "/home/psrivastava/Intern_Summer/infersent/encoder/infersent2.pkl"
W2V_PATH = "/home/psrivastava/Intern_Summer/infersent/fastText/crawl-300d-2M.vec"
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)

for index in range(len(abs_arr)):
    doc = nlp(abs_arr[index])
    strs_after_stop_arr = []
    for token in doc:
        if not token.is_stop:
            strs_after_stop_arr.append(token.text)

    abs_arr[index] = ' '.join(strs_after_stop_arr)
    
infersent.build_vocab(abs_arr)  # The "sentences" here are actually abstracts of different papers
print(infersent.encode(abs_arr)[0][:])
Example #28
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
print('Load our pre-trained model (in encoder/)')

# Set word vector path for the model
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)
print('Set word vector path for the model')

# Build the vocabulary of word vectors (i.e keep only those needed)
infersent.build_vocab(all_captions, tokenize=True)
print('Build the vocabulary of word vectors')

# Start encoding captions
caption2id = {}
f = open('pascal-sentences-dataset/text_features.txt', 'w+')
for caption in all_captions:
    current_feature = list(
        infersent.encode([caption], tokenize=True).squeeze())
    if not caption in caption2id:
        caption2id[caption] = 'caption_' + str(len(caption2id))
    current_feature = [str(feature) for feature in current_feature]
    current_feature_str = ' '.join(current_feature)
    f.write('%s %s\n' % (caption2id[caption], current_feature_str))
f.close()
PARSER.add_argument('--question', metavar='string', required=True, help="The question you want answered")
ARGS = PARSER.parse_args()
question = ARGS.question
sentences = [question]
#### Load Facebook's InferSent (download the files from the internet)
infersent = InferSent({'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1})
infersent.load_state_dict(torch.load('/Users/petermyers/Desktop/Other/data/InferSent/encoder/infersent1.pkl'))
infersent.set_w2v_path('/Users/petermyers/Desktop/Other/data/GloVe/glove.840B.300d.txt')

# Extract the most relevant Wikipedia page
#### Wikipedia recommends 10 pages
wikipedia_pages = wikipedia.search(question)
sentences = sentences + wikipedia_pages
#### Convert sentences to numbers
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True, verbose=False)
#### Choose the most relevant pages
distances = pdist(np.array(embeddings), metric='euclidean')
sentence_similarity_matrix = squareform(distances)
most_relevant_pages = np.argsort(sentence_similarity_matrix[0][1:])
#### Extract the content on the most relevant page (tries multiple pages in case of failure)
for page in most_relevant_pages:
    try:
        content_on_the_page = wikipedia.page(wikipedia_pages[page]).content
        break
    except:
        pass

# Find and print the most relevant sentences
#### Split the content into sentences
from models import InferSent
import torch
import pandas as pd
from textblob import TextBlob

df = pd.read_csv('data/train.csv')

blob = TextBlob(" ".join(df['context'].drop_duplicates().reset_index(drop=True))) # Droping all dupliacte context from the dataframe
sentences = [item.raw for item in blob.sentences]

MODEL_PATH = 'models/infersent_untrained.pkl'
GLOVE_PATH = 'data/glove.840B.300d.txt'
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
model.set_w2v_path(GLOVE_PATH)
model.build_vocab(sentences, tokenize=True)

torch.save(model, 'models/infersent_trained.pt')