import fasttext
import numpy as np
from gensim.models.wrappers import FastText


def load_test():
    # Word vectors from the official fastText .bin model
    model = fasttext.load_model("/home/zhoutong/nlp/data/cc.en.300.bin")
    vec1 = model.get_word_vector("china")
    vec2 = model.get_word_vector("america")
    similarity(vec1, vec2)

    # Sentence vectors (fastText averages normalized word vectors internally)
    sen_vec1 = model.get_sentence_vector("I come from china")
    sen_vec2 = model.get_sentence_vector("I am chinese")
    # Rough manual average of the three word vectors, for comparison
    np.sum([model.get_word_vector(i) for i in ["I", "am", "chinese"]], axis=0) / 3
    similarity(sen_vec1, sen_vec2)

    # The same .bin loaded through gensim's wrapper (takes ~10 min)
    gensim_model = FastText.load_fasttext_format('/home/zhoutong/nlp/data/cc.en.300.bin')
    gensim_model.most_similar('teacher')
    gensim_model.similarity('teacher', 'teaches')
    # L2-normalize the vectors in place, then persist in gensim's native format
    gensim_model.init_sims(replace=True)
    gensim_model.save('/home/zhoutong/nlp/data/cc.en.300.bin.gensim')
    gensim_model_new = FastText.load('/home/zhoutong/nlp/data/cc.en.300.bin.gensim', mmap='r')
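# The calls above use a similarity() helper that is not defined in this snippet.
# A minimal sketch of what it presumably computes (cosine similarity between two
# vectors); the name and body are assumptions, not the original implementation.
import numpy as np


def similarity(v1, v2):
    v1, v2 = np.asarray(v1), np.asarray(v2)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))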
import argparse
def __init__(self, model_path, normalized=True):
    self.k = 'normal' if normalized else '_text'
    from gensim.models.wrappers import FastText
    self.model = FastText.load(model_path)
    # Zero vector used as a fallback for tokens without an embedding
    self.zero = np.zeros(self.model.vector_size)
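# Sketch of how this wrapper might look a word up, falling back to self.zero for
# out-of-vocabulary tokens. The method name and body are illustrative assumptions;
# they are not part of the original class.
def get_vector(self, word):
    try:
        return self.model[word]
    except KeyError:
        return self.zero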
import pandas as pd
from gensim.models import Word2Vec, FastText
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# idfs = pd.read_csv("datasets/idf_scores.csv")
ft_counts = pd.read_csv("datasets/ft_counts_0.9.csv")
ft_neighs = pd.read_csv("datasets/ft_neighbours_0.9.csv")
wv_counts = pd.read_csv("datasets/wv_counts_0.9.csv")
# wv_neighs = pd.read_csv("datasets/wv_neighbours_0.9.csv")

wv_model = Word2Vec.load('datasets/wv_model')
ft_model = FastText.load("datasets/ft_model")

wv_model.most_similar("electronic", topn=10)
top = ft_model.most_similar("electronic", topn=100)

# Collect per-tag statistics for the 100 nearest neighbours of "electronic"
df = pd.DataFrame(columns=('tag_name', 'sim', 'count', 'neighbours_count', 'is_neighbour_count'))
for i, row in enumerate(top):
    tag = row[0]
    sim = row[1]
    count = ft_counts[ft_counts.tag_name == tag]["count"].values[0]
    incount = ft_counts[ft_counts.tag_name == tag]["is_neighbour_count"].values[0]
    nhcount = ft_counts[ft_counts.tag_name == tag]["neighbour_count"].values[0]
    df.loc[len(df)] = [tag, sim, count, nhcount, incount]
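# The scipy.cluster.hierarchy and sklearn imports above are unused in this
# snippet. A plausible continuation -- hierarchically clustering the embeddings
# of the retrieved tags -- is sketched below under that assumption; the variable
# names and parameters are illustrative, not the original analysis.
import numpy as np
from scipy.spatial.distance import pdist

tags = [row[0] for row in top]
vectors = np.array([ft_model.wv[t] for t in tags])

Z = linkage(vectors, method='ward')                 # agglomerative clustering of tag vectors
c, _ = cophenet(Z, pdist(vectors))                  # cophenetic correlation as a sanity check
labels = fcluster(Z, t=10, criterion='maxclust')    # cut the dendrogram into 10 flat clusters
dendrogram(Z, labels=tags)                          # visualise the hierarchy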
def precompute_similarity(self, model_path, option, feature_size):
    all_note_contents = self.all_note_contents.Contents.to_list()
    all_note_nids = self.all_note_contents.NoteID.to_list()
    similarity_matrix = {}

    if option == 'ft_word_emd+sif':  # 'cleaned_data/ft_model_incr'
        # Load pretrained FastText embeddings
        model = FastText.load(model_path)
        logging.info('[Preprocessor] Using model: %s', str(model))
        similarity_matrix = {}
        nlp = spacy.load("en_core_web_sm")

        note_ids = self.all_note_contents['NoteID'].values
        contents = self.all_note_contents['Contents'].values
        data_words = [[token.text for token in nlp(content)]
                      for note_id, content in zip(note_ids, contents)]
        post_tokens = pd.DataFrame(data={
            'NoteID': note_ids,
            'Tokens': data_words
        }).set_index('NoteID')

        sentence_list = []
        sentence_embs = {}
        for note_id, post in post_tokens.iterrows():
            word_list = []
            for word in post.values[0]:
                word_emd = model[word]
                word_list.append(Word(word, word_emd))
            if len(word_list) > 0:  # did we find any words (not an empty set)
                sentence_list.append(Sentence(word_list))
            sentence_embs[note_id] = sentence_to_vec(sentence_list, feature_size)

        # Compute post-wise cosine similarities
        for note_id1, emb1 in sentence_embs.items():
            for note_id2, emb2 in sentence_embs.items():
                if note_id1 != note_id2 and (note_id2, note_id1) not in similarity_matrix:
                    # apply l2-distance
                    #utils.l2_sim()
                    # apply cosine distance
                    sim = utils.cosine_sim(emb1[0], emb2[0])
                    similarity_matrix[(note_id1, note_id2)] = sim
                    similarity_matrix[(note_id2, note_id1)] = sim
        return similarity_matrix

    elif option == 'bert_word_emb+sif':
        # for BERT
        import torch
        from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
        from keras.preprocessing.sequence import pad_sequences

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_len=128)
        MAX_LEN = 512
        tokenized_texts_list = []
        indexed_tokens_list = []
        attention_masks = []
        for text in self.all_note_contents.Contents.values:
            marked_text = "[CLS] " + text + " [SEP]"
            tokenized_text = tokenizer.tokenize(marked_text)
            tokenized_texts_list.append(tokenized_text)
            indexed_tokens_list.append(tokenizer.convert_tokens_to_ids(tokenized_text))

        input_ids_list = pad_sequences(indexed_tokens_list, maxlen=MAX_LEN,
                                       dtype="long", truncating="post", padding="post")
        for seq in input_ids_list:
            seq_mask = [int(float(i > 0)) for i in seq]
            attention_masks.append(seq_mask)

        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor(input_ids_list)
        segments_tensors = torch.tensor(attention_masks)
        # Load pre-trained model (weights)
        model = BertModel.from_pretrained('bert-base-uncased')
        # Put the model in "evaluation" mode, meaning feed-forward operation.
        model.eval()
        with torch.no_grad():
            # segments_tensors holds the attention masks built above, so pass it
            # as attention_mask rather than as token_type_ids
            encoded_layers, _ = model(tokens_tensor, attention_mask=segments_tensors)

        # Sum the last four encoder layers to get one vector per token
        emb_layers = encoded_layers[-4:]
        sum_layers = torch.stack(emb_layers, dim=0).sum(dim=0)  # 434*512*768

        nids = all_note_nids  # note ids aligned with tokenized_texts_list
        sentence_word_embs = {}
        for i in range(len(tokenized_texts_list)):
            sentence_word_embs[nids[i]] = sum_layers[i][:len(tokenized_texts_list[i])]
        # Keep a look-up dictionary [note id] --> tokenized text
        tokenized_texts_ = {nid: tokenized_texts_list[i] for i, nid in enumerate(nids)}

        embedding_size = feature_size  # Set the shape of the sentence/post embeddings
        sentence_list = []
        note_ids_lookup = []
        for note_id in nids:
            #print(note_id)
            word_list = []
            for j in range(len(sentence_word_embs[note_id])):
                word_emb = sentence_word_embs[note_id][j]
                # Add here if to use only keywords
                word_text = tokenized_texts_[note_id][j]
                word_list.append(Word(word_text, word_emb.numpy()))
            if len(word_list) > 0:
                sentence_list.append(Sentence(word_list))
                # in case there are some posts of 0 length, thus not included in this
                note_ids_lookup.append(note_id)

        # Encode sentences/posts with embeddings
        sentence_embs = {}
        sentence_vectors = sentence_to_vec(sentence_list, embedding_size)  # all vectors converted together
        if len(sentence_vectors) == len(sentence_list):
            for i in range(len(sentence_vectors)):
                # map: note_id -> vector
                sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]

        # Compute post-wise cosine similarities
        for note_id1, emb1 in sentence_embs.items():
            for note_id2, emb2 in sentence_embs.items():
                if note_id1 != note_id2 and (note_id2, note_id1) not in similarity_matrix:
                    # apply l2-distance
                    #utils.l2_sim()
                    # apply cosine distance
                    sim = utils.cosine_sim(emb1[0], emb2[0])
                    similarity_matrix[(note_id1, note_id2)] = sim
                    similarity_matrix[(note_id2, note_id1)] = sim
        return similarity_matrix, sentence_embs

    elif option == 'sentence_emb':
        import tensorflow as tf
        import tensorflow_hub as hub

        embed = hub.load(model_path)
        logging.info('[Preprocessor] using model: universal-sentence-encoder-1')

        sentence_embs = {}
        sentence_vectors = embed(all_note_contents)
        if len(sentence_vectors) == len(all_note_contents):
            for i in range(len(sentence_vectors)):
                # map: note_id -> vector
                sentence_embs[all_note_nids[i]] = sentence_vectors[i].numpy()

        #corr = np.inner(sentence_vectors, sentence_vectors)
        #cosine_similarities = tf.reduce_sum(tf.multiply(sentence_vectors, sentence_vectors), axis=1)
        #clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
        #sim_scores = 1.0 - tf.acos(clip_cosine_similarities)
        #print(sim_scores)
        #for i, sims in enumerate(sim_scores):
        #    for j, sim in enumerate(sims):
        #        note_id1 = all_note_nids[i]
        #        note_id2 = all_note_nids[j]
        #        if not note_id1 == note_id2:
        #            similarity_matrix[(note_id1, note_id2)] = sim

        # Compute post-wise cosine similarities
        for note_id1, emb1 in sentence_embs.items():
            for note_id2, emb2 in sentence_embs.items():
                if note_id1 != note_id2 and (note_id2, note_id1) not in similarity_matrix:
                    # apply l2-distance
                    #utils.l2_sim()
                    # apply cosine distance
                    sim = utils.cosine_sim(emb1, emb2)
                    similarity_matrix[(note_id1, note_id2)] = sim
                    similarity_matrix[(note_id2, note_id1)] = sim
        return similarity_matrix, sentence_embs

    elif option == 'tfidf+lsi':
        logging.info('[Preprocessor] using TFIDF vectors, LSI for dimension reduction')
        data_words, note_ids, id2word, corpus = utils.preprocess(
            self.all_note_contents, 10, ['NOUN', 'VERB'], STOP_WORDS, 'tokens_phrases')
        #self.post_bows = pd.DataFrame(data={'NoteID': note_ids, 'BoW': data_words}).set_index('NoteID')
        logging.debug('[Preprocessor] - %d non-empty posts', len(corpus))
        logging.debug('[Preprocessor] - %s extracted %d tokens/phrases', 'tokens_phrases', len(id2word))

        tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)
        word2id = {v: k for k, v in id2word.items()}
        tfidf_corpus = [[(word2id[pair[0]], pair[1]) for pair in post.items()]
                        for post in tfidf_matrix]
        model = LsiModel(tfidf_corpus, num_topics=feature_size, id2word=id2word)

        sentence_embs = {}
        for i, post_tfidf in enumerate(tfidf_corpus):
            note_id = note_ids[i]
            if note_id not in sentence_embs:
                post_repr = model[post_tfidf]
                #print(post_repr)
                #print(i)
                sentence_embs[note_id] = np.array(
                    [p[1] for p in post_repr if len(post_repr) == feature_size])

        # Compute post-wise cosine similarities
        for note_id1, emb1 in sentence_embs.items():
            for note_id2, emb2 in sentence_embs.items():
                if note_id1 != note_id2 and (note_id2, note_id1) not in similarity_matrix:
                    if len(emb1) and len(emb2):
                        # apply l2-distance
                        #utils.l2_sim()
                        # apply cosine distance
                        sim = utils.cosine_sim(emb1, emb2)
                        similarity_matrix[(note_id1, note_id2)] = sim
                        similarity_matrix[(note_id2, note_id1)] = sim
        return similarity_matrix, sentence_embs
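# The method above relies on helpers defined elsewhere in the project: Word,
# Sentence, sentence_to_vec (a SIF-style sentence embedding) and utils.cosine_sim.
# Minimal sketches of plausible implementations follow; the exact signatures,
# attributes and weighting are assumptions, not the project's actual code.
import numpy as np
from sklearn.decomposition import TruncatedSVD


def cosine_sim(v1, v2):
    # Stand-in for utils.cosine_sim: cosine similarity between two 1-D vectors
    v1, v2 = np.asarray(v1), np.asarray(v2)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))


class Word:
    def __init__(self, text, vector):
        self.text = text
        self.vector = vector


class Sentence:
    def __init__(self, word_list):
        self.word_list = word_list


def sentence_to_vec(sentence_list, embedding_size):
    # Unweighted variant of the SIF idea: average the word vectors of each
    # sentence (each of dimension embedding_size), then remove the projection
    # onto the first principal component of the sentence matrix.
    vectors = [np.mean([w.vector for w in s.word_list], axis=0) for s in sentence_list]
    X = np.stack(vectors)
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    u = svd.components_[0]  # first principal component (unit norm)
    return [v - u * np.dot(u, v) for v in X]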
from gensim.models import KeyedVectors, FastText

model_w2v = KeyedVectors.load_word2vec_format("/media/philippy/SSD/pretrained_wiki_de_w2v")
# %%
# model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=5)
# %%
print(model.most_similar('gut', topn=50))
# %%
model = KeyedVectors.load("/media/philippy/SSD/vectors.txt")
# %%
# vectors.txt is in the plain word2vec text format, so use load_word2vec_format
word_vectors = KeyedVectors.load_word2vec_format('/media/philippy/SSD/vectors.txt', binary=False)
# %%
# %%
model = FastText.load('/media/philippy/SSD/wiki.de.vec')
# %%
print(model.most_similar('volk', topn=50))
# %%
model = KeyedVectors.load_word2vec_format('/media/philippy/SSD/wiki.de.vec')
# %%
import gensim
import os
import logging
import itertools
from gensim.models.word2vec import Text8Corpus
from gensim.models.wrappers import FastText

MODEL_FILE = './phonmodels/model4'
TEXT8_FILE = './fil9_phon'
QUIZ_FILE = './questions-words-phon.txt'

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

if os.path.isfile(MODEL_FILE):
    model = FastText.load(MODEL_FILE)
else:
    corpus = Text8Corpus(TEXT8_FILE)
    # TODO: increase size and window *separately*
    # './fasttext' is the path to the fastText binary required by the gensim wrapper
    model = FastText.train('./fasttext', corpus_file=TEXT8_FILE, size=300, window=10)
    model.save(MODEL_FILE)

# Evaluate on the word-analogy questions
model.accuracy(QUIZ_FILE)