Example #1
import fasttext
import numpy as np
from gensim.models import FastText


def load_test():
    model = fasttext.load_model("/home/zhoutong/nlp/data/cc.en.300.bin")
    vec1 = model.get_word_vector("china")
    vec2 = model.get_word_vector("america")
    similarity(vec1,vec2)

    sen_vec1 = model.get_sentence_vector("I come from china")
    sen_vec2 = model.get_sentence_vector("I am chinese")
    # Average the word vectors by hand, for comparison with get_sentence_vector
    np.mean([model.get_word_vector(i) for i in ["I", "am", "chinese"]], axis=0)
    similarity(sen_vec1,sen_vec2)

    gensim_model = FastText.load_fasttext_format('/home/zhoutong/nlp/data/cc.en.300.bin') # 10min
    gensim_model.most_similar('teacher')
    gensim_model.similarity('teacher', 'teaches')
    gensim_model.init_sims(replace=True)
    gensim_model.save('/home/zhoutong/nlp/data/cc.en.300.bin.gensim')
    gensim_model_new = FastText.load('/home/zhoutong/nlp/data/cc.en.300.bin.gensim',mmap='r')
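
The `similarity` helper called above is not part of the snippet; a minimal sketch, assuming it is plain cosine similarity over the NumPy vectors returned by fastText:

def similarity(v1, v2):
    # Assumed helper (not from the original source): cosine similarity of two dense vectors.
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))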
import argparse
Example #3
    def __init__(self, model_path, normalized=True):
        self.k = 'normal' if normalized else '_text'
        from gensim.models.wrappers import FastText
        self.model = FastText.load(model_path)
        self.zero = np.zeros(self.model.vector_size)
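
A possible companion method, shown only to illustrate how `self.zero` could serve as a fallback for tokens the model cannot embed (the method name `vector` is hypothetical, not from the original class):

    def vector(self, token):
        # Hypothetical helper: return the fastText vector for a token,
        # or the pre-built zero vector if the lookup fails.
        try:
            return self.model[token]
        except KeyError:
            return self.zero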
Example #4
import pandas as pd

from gensim.models import Word2Vec, FastText
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# idfs = pd.read_csv("datasets/idf_scores.csv")
ft_counts = pd.read_csv("datasets/ft_counts_0.9.csv")
ft_neighs = pd.read_csv("datasets/ft_neighbours_0.9.csv")
wv_counts = pd.read_csv("datasets/wv_counts_0.9.csv")
# wv_neighs = pd.read_csv("datasets/wv_neighbours_0.9.csv")

wv_model = Word2Vec.load('datasets/wv_model')
ft_model = FastText.load("datasets/ft_model")

wv_model.most_similar("electronic", topn=10)
top = ft_model.most_similar("electronic", topn=100)

df = pd.DataFrame(columns=('tag_name', 'sim', 'count', 'neighbours_count', 'is_neighbour_count'))

for tag, sim in top:
    tag_row = ft_counts[ft_counts.tag_name == tag]
    count = tag_row["count"].values[0]
    incount = tag_row["is_neighbour_count"].values[0]
    nhcount = tag_row["neighbour_count"].values[0]

    df.loc[len(df)] = [tag, sim, count, nhcount, incount]
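
The scipy clustering helpers imported at the top are unused in this excerpt; a hypothetical continuation showing how the retrieved tag vectors could feed them (a sketch, not the original script's logic):

import numpy as np

# Sketch: hierarchical clustering of the 100 neighbour tags by cosine distance.
tags = [t for t, _ in top]
vectors = np.array([ft_model[t] for t in tags])
Z = linkage(vectors, method="average", metric="cosine")
clusters = fcluster(Z, t=0.5, criterion="distance")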
Example #5
    def precompute_similarity(self, model_path, option, feature_size):
        all_note_contents = self.all_note_contents.Contents.to_list()
        all_note_nids = self.all_note_contents.NoteID.to_list()
        similarity_matrix = {}

        if option == 'ft_word_emd+sif':
            #'cleaned_data/ft_model_incr'
            # Load pretrained FastText embeddings
            model = FastText.load(model_path)
            logging.info('[Preprocessor] Using model: %s', str(model))

            similarity_matrix = {}
            nlp = spacy.load("en_core_web_sm")
            # all_note_contents was re-bound to a plain list above, so reuse the
            # lists already extracted from the DataFrame instead of indexing it
            note_ids = all_note_nids
            contents = all_note_contents
            data_words = [[token.text for token in nlp(content)]
                          for note_id, content in zip(note_ids, contents)]
            post_tokens = pd.DataFrame(data={
                'NoteID': note_ids,
                'Tokens': data_words
            }).set_index('NoteID')

            sentence_list = []
            note_ids_lookup = []
            for note_id, post in post_tokens.iterrows():
                word_list = []
                for word in post.values[0]:
                    word_emb = model[word]
                    word_list.append(Word(word, word_emb))
                if len(word_list) > 0:  # did we find any words (not an empty set)
                    sentence_list.append(Sentence(word_list))
                    note_ids_lookup.append(note_id)

            # Encode all posts at once, mirroring the BERT branch below
            sentence_embs = {}
            sentence_vectors = sentence_to_vec(sentence_list, feature_size)
            if len(sentence_vectors) == len(sentence_list):
                for i in range(len(sentence_vectors)):
                    # map: note_id -> vector
                    sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]

            # Compute post-wise cosine similarities
            for note_id1, emb1 in sentence_embs.items():
                for note_id2, emb2 in sentence_embs.items():
                    if note_id1 != note_id2 and (
                            note_id2, note_id1) not in similarity_matrix:
                        # apply l2-distance
                        #utils.l2_sim()
                        # apply cosine distance
                        sim = utils.cosine_sim(emb1[0], emb2[0])
                        similarity_matrix[(note_id1, note_id2)] = sim
                        similarity_matrix[(note_id2, note_id1)] = sim

            return similarity_matrix

        elif option == 'bert_word_emb+sif':
            # for BERT
            import torch
            from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
            from keras.preprocessing.sequence import pad_sequences

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                      max_len=128)
            MAX_LEN = 512
            tokenized_texts_list = []
            indexed_tokens_list = []
            attention_masks = []

            for text in all_note_contents:  # already a plain list of content strings
                marked_text = "[CLS] " + text + " [SEP]"
                tokenized_text = tokenizer.tokenize(marked_text)
                tokenized_texts_list.append(tokenized_text)
                indexed_tokens_list.append(
                    tokenizer.convert_tokens_to_ids(tokenized_text))

            input_ids_list = pad_sequences(indexed_tokens_list,
                                           maxlen=MAX_LEN,
                                           dtype="long",
                                           truncating="post",
                                           padding="post")
            for seq in input_ids_list:
                seq_mask = [int(float(i > 0)) for i in seq]
                attention_masks.append(seq_mask)

            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor(input_ids_list)
            masks_tensor = torch.tensor(attention_masks)

            # Load pre-trained model (weights)
            model = BertModel.from_pretrained('bert-base-uncased')

            # Put the model in "evaluation" mode, meaning feed-forward operation.
            model.eval()

            with torch.no_grad():
                encoded_layers, _ = model(tokens_tensor, attention_mask=masks_tensor)

            emb_layers = encoded_layers[-4:]
            sum_layers = torch.stack(emb_layers,
                                     dim=0).sum(dim=0)  # 434*512*768
            sentence_word_embs = {}
            for i in range(len(tokenized_texts_list)):
                # un-padded token embeddings for post i, keyed by its note id
                sentence_word_embs[all_note_nids[i]] = \
                    sum_layers[i][:len(tokenized_texts_list[i])]

            # Keep a look-up dictionary [note id] --> tokenized text
            tokenized_texts_ = {
                nid: tokenized_texts_list[i]
                for i, nid in enumerate(all_note_nids)
            }

            embedding_size = feature_size  # Set the shape of the sentence/post embeddings
            sentence_list = []
            note_ids_lookup = []
            for note_id in all_note_nids:
                #print(note_id)
                word_list = []
                for j in range(len(sentence_word_embs[note_id])):
                    word_emb = sentence_word_embs[note_id][j]
                    # Add here if to use only keywords
                    word_text = tokenized_texts_[note_id][j]
                    word_list.append(Word(word_text, word_emb.numpy()))
                if len(word_list) > 0:
                    sentence_list.append(Sentence(word_list))
                    note_ids_lookup.append(
                        note_id
                    )  # in case there are some posts of 0 length, thus not included in this

            # Encode sentences/posts with embeddings
            sentence_embs = {}
            sentence_vectors = sentence_to_vec(
                sentence_list,
                embedding_size)  # all vectors converted together
            if len(sentence_vectors) == len(sentence_list):
                for i in range(len(sentence_vectors)):
                    # map: note_id -> vector
                    sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]

            # Compute post-wise cosine similarities
            for note_id1, emb1 in sentence_embs.items():
                for note_id2, emb2 in sentence_embs.items():
                    if note_id1 != note_id2 and (
                            note_id2, note_id1) not in similarity_matrix:
                        # apply l2-distance
                        #utils.l2_sim()
                        # apply cosine distance
                        sim = utils.cosine_sim(emb1[0], emb2[0])
                        similarity_matrix[(note_id1, note_id2)] = sim
                        similarity_matrix[(note_id2, note_id1)] = sim

            return similarity_matrix, sentence_embs

        elif option == 'sentence_emb':
            import tensorflow as tf
            import tensorflow_hub as hub

            embed = hub.load(model_path)

            logging.info(
                '[Preprocessor] using model: universal-sentence-encoder-1')
            sentence_embs = {}
            sentence_vectors = embed(all_note_contents)
            if len(sentence_vectors) == len(all_note_contents):
                for i in range(len(sentence_vectors)):
                    # map: note_id -> vector
                    sentence_embs[
                        all_note_nids[i]] = sentence_vectors[i].numpy()

            #corr = np.inner(sentence_vectors, sentence_vectors)
            #cosine_similarities = tf.reduce_sum(tf.multiply(sentence_vectors, sentence_vectors), axis=1)
            #clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
            #sim_scores = 1.0 - tf.acos(clip_cosine_similarities)

            #print(sim_scores)
            #for i, sims in enumerate(sim_scores):
            #    for j, sim in enumerate(sims):
            ##        note_id1 = all_note_nids[i]
            #        note_id2 = all_note_nids[j]
            #        if not note_id1==note_id2:
            #            similarity_matrix[(note_id1, note_id2)] = sim

            # Compute post-wise cosine similarities
            for note_id1, emb1 in sentence_embs.items():
                for note_id2, emb2 in sentence_embs.items():
                    if note_id1 != note_id2 and (
                            note_id2, note_id1) not in similarity_matrix:
                        # apply l2-distance
                        #utils.l2_sim()
                        # apply cosine distance
                        sim = utils.cosine_sim(emb1, emb2)
                        similarity_matrix[(note_id1, note_id2)] = sim
                        similarity_matrix[(note_id2, note_id1)] = sim

            return similarity_matrix, sentence_embs

        elif option == 'tfidf+lsi':
            logging.info(
                '[Preprocessor] using TFIDF vectors, LSI for dimension reduction'
            )
            data_words, note_ids, id2word, corpus = utils.preprocess(
                self.all_note_contents, 10, ['NOUN', 'VERB'], STOP_WORDS,
                'tokens_phrases')
            #self.post_bows = pd.DataFrame(data={'NoteID':note_ids,'BoW':data_words}).set_index('NoteID')
            logging.debug('[Preprocessor] - %d non-empty posts', len(corpus))
            logging.debug('[Preprocessor] - %s extracted %d tokens/phrases',
                          'tokens_phrases', len(id2word))
            tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)

            word2id = {v: k for k, v in id2word.items()}
            tfidf_corpus = [[(word2id[pair[0]], pair[1])
                             for pair in post.items()]
                            for post in tfidf_matrix]

            model = LsiModel(tfidf_corpus,
                             num_topics=feature_size,
                             id2word=id2word)

            sentence_embs = {}
            for i, post_tfidf in enumerate(tfidf_corpus):
                note_id = note_ids[i]
                if note_id not in sentence_embs:
                    post_repr = model[post_tfidf]
                    # Keep the LSI vector only when it has the full dimensionality;
                    # shorter representations are stored empty and skipped below.
                    if len(post_repr) == feature_size:
                        sentence_embs[note_id] = np.array(
                            [p[1] for p in post_repr])
                    else:
                        sentence_embs[note_id] = np.array([])

            # Compute post-wise cosine similarities
            for note_id1, emb1 in sentence_embs.items():
                for note_id2, emb2 in sentence_embs.items():
                    if note_id1 != note_id2 and (
                            note_id2, note_id1) not in similarity_matrix:
                        if len(emb1) and len(emb2):
                            # apply l2-distance
                            #utils.l2_sim()
                            # apply cosine distance
                            sim = utils.cosine_sim(emb1, emb2)
                            similarity_matrix[(note_id1, note_id2)] = sim
                            similarity_matrix[(note_id2, note_id1)] = sim

            return similarity_matrix, sentence_embs
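
The branches above rely on project helpers defined elsewhere: `Word` and `Sentence` are simple containers fed to `sentence_to_vec` (presumably an SIF-style sentence embedding in the spirit of Arora et al. 2017), and `utils.cosine_sim` compares two vectors. A minimal stand-in for the containers and the similarity function, so the structure is readable in isolation (assumed implementations, not the project's own):

import numpy as np

class Word:
    # Container pairing a token string with its embedding vector.
    def __init__(self, text, vector):
        self.text = text
        self.vector = vector

class Sentence:
    # Container holding the Word objects of one post/sentence.
    def __init__(self, word_list):
        self.word_list = word_list

def cosine_sim(v1, v2):
    # utils.cosine_sim is assumed to be ordinary cosine similarity.
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))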
Example #6
from gensim.models import KeyedVectors, FastText

model_w2v = KeyedVectors.load_word2vec_format(
    "/media/philippy/SSD/pretrained_wiki_de_w2v")

# %%
# model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=5)

# %%
print(model.most_similar('gut', topn=50))

# %%
model = KeyedVectors.load("/media/philippy/SSD/vectors.txt")

# %%
word_vectors = KeyedVectors.load_word2vec_format('/media/philippy/SSD/vectors.txt',
                                                 binary=False)

# %%

# %%

model = FastText.load('/media/philippy/SSD/wiki.de.vec')

# %%
print(model.most_similar('volk', topn=50))

# %%
model = KeyedVectors.load_word2vec_format('/media/philippy/SSD/wiki.de.vec')

# %%
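The cells above mix two distinct gensim loading paths; a short sketch of the usual split, reusing the paths from above (the `.kv` file name is made up):

# %%
from gensim.models import KeyedVectors

# Text .vec files (word2vec/fastText text format) go through load_word2vec_format ...
kv = KeyedVectors.load_word2vec_format('/media/philippy/SSD/wiki.de.vec', binary=False)
# ... while save()/load() round-trip gensim's own native format (optionally memory-mapped).
kv.save('/media/philippy/SSD/wiki.de.kv')
kv2 = KeyedVectors.load('/media/philippy/SSD/wiki.de.kv', mmap='r')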
Example #7
import gensim
import os
import logging
import itertools

from gensim.models.word2vec import Text8Corpus
from gensim.models.wrappers import FastText

MODEL_FILE = './phonmodels/model4'
TEXT8_FILE = './fil9_phon'
QUIZ_FILE = './questions-words-phon.txt'

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

if os.path.isfile(MODEL_FILE):
    model = FastText.load(MODEL_FILE)

else:
    corpus = Text8Corpus(TEXT8_FILE)
    # TODO: increase size and window *separately*
    model = FastText.train('./fasttext', corpus_file=TEXT8_FILE, size=300, window=10)
    model.save(MODEL_FILE)

model.accuracy(QUIZ_FILE)
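
`gensim.models.wrappers.FastText` shells out to the external fastText binary (here expected at `./fasttext`) and was removed in gensim 4; a rough native-gensim equivalent, sketched with gensim 4.x parameter names:

from gensim.models import FastText as GensimFastText

# Train fastText vectors in-process instead of via the external binary.
model = GensimFastText(vector_size=300, window=10)
model.build_vocab(corpus_file=TEXT8_FILE)
model.train(corpus_file=TEXT8_FILE,
            total_words=model.corpus_total_words,
            epochs=model.epochs)
model.save(MODEL_FILE)
# Analogy evaluation replaces the deprecated accuracy() call.
model.wv.evaluate_word_analogies(QUIZ_FILE)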