def load(self, model_name='kyubyong', df_data=None):
    '''
    Available pre-trained models: lwvlib, kyubyong, wikicorpus
    '''
    # Assumes gensim, os and multiprocessing are imported at module level and
    # that the WikiDumpToCorpus helper is available for the 'wikicorpus' option.
    if model_name == 'lwvlib':
        # http://dl.turkunlp.org/finnish-embeddings/
        # https://turkunlp.org/finnish_nlp.html --> Finnish internet parsebank
        # http://bionlp-www.utu.fi/wv_demo/ --> online interface
        # https://github.com/fginter/wvlib_light/tree/3471a8db66883769c4e5398806876d5be3e3df24 --> library
        import lwvlib
        self.wv = lwvlib.load("finnish_4B_parsebank_skgram.bin")
        # self.wv = lwvlib.load("finnish_s24_skgram.bin")  # ,10000,500000)
    elif model_name == 'kyubyong':
        # https://github.com/Kyubyong/wordvectors
        self.wv = gensim.models.Word2Vec.load(
            'kyubyong_fin_word2vec.bin').wv
    elif model_name == 'wikicorpus':
        wiki_corpus = WikiDumpToCorpus('fiwiki-latest-pages-articles.xml.bz2')
        if os.path.exists('wikicorpus_word2vec.bin'):
            # Load a previously trained model.
            self.wv = gensim.models.Word2Vec.load(
                'wikicorpus_word2vec.bin').wv
        elif wiki_corpus.check_corpus():
            # Train a new model.
            # model = gensim.models.Word2Vec(['testi'], size=200, window=10, min_count=5,
            #                                workers=multiprocessing.cpu_count(), iter=2000)
            if df_data is not None and not df_data.empty:
                # sentences = []
                # for i, row in df_data.iterrows():
                #     sentence = gensim.utils.simple_preprocess(row['Nimi'])
                #     if len(sentence) > 0:
                #         sentences.append(sentence)
                # model.build_vocab(sentences)

                def train_func(train_data):
                    model = gensim.models.Word2Vec(
                        gensim.utils.simple_preprocess(train_data),
                        size=200, window=10, min_count=5,
                        workers=multiprocessing.cpu_count(), iter=1)
                    model.save("wikicorpus_word2vec.model")
                    model.wv.save_word2vec_format(
                        "wikicorpus_word2vec.bin", binary=True)
                    # data = gensim.utils.simple_preprocess(train_data)
                    # model.train(data, total_examples=len(data), epochs=100)

                wiki_corpus.open_corpus(train_func)
        elif os.path.exists('fiwiki-latest-pages-articles.xml.bz2'):
            # Build the plain-text corpus from the Wikipedia dump first.
            wiki_corpus.make_corpus()
        else:
            print("Error in 'wikicorpus'")
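# Minimal usage sketch (assumption, not part of the original code): the lwvlib
# branch of load() above used directly, without the surrounding class.  The
# model file name and the query word "kissa" are placeholders; lwvlib.load()
# and .nearest() are the same calls used in the other snippets.
import lwvlib

wv = lwvlib.load("finnish_4B_parsebank_skgram.bin", 10000, 500000)
neighbours = wv.nearest("kissa", 10)   # list of (similarity, word), or None if OOV
if neighbours is None:
    print("Out of vocabulary")
else:
    for similarity, word in neighbours:
        print(word, similarity)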
import numpy as np

import lwvlib


def read_pretrained_vectors(vocabulary, vector_model, max_words=1000000):
    print("Loading pretrained word embeddings from " + vector_model)
    model = lwvlib.load(vector_model, max_words, max_words)
    vectors = np.zeros((len(vocabulary), model.vectors.shape[1]), np.float64)
    for word, idx in vocabulary.items():
        if idx in (0, 1, 2):
            continue  # mask, unk and end
        midx = model.get(word)
        if midx is not None:
            vectors[idx] = model.vectors[midx]  # TODO: normalize?
    return vectors
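# Hypothetical usage sketch: the vocabulary dict below (indices 0-2 reserved
# for mask/unk/end, as the function assumes) and the .bin file name are
# invented for illustration.
vocab = {"<MASK>": 0, "<UNK>": 1, "<END>": 2, "kissa": 3, "koira": 4}
emb = read_pretrained_vectors(vocab, "finnish_4B_parsebank_skgram.bin",
                              max_words=100000)
print(emb.shape)  # (len(vocab), embedding dimension of the .bin model)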
def read_vocab(self, fname, vsize=10000):
    """ Read the vocabulary from a wvlib model. """
    model = lwvlib.load(fname, vsize, vsize)
    self.words = model.words
    self.words.insert(0, "<MASK>")
    self.words.insert(1, "<UNK>")
    self.shape = (len(self.words), model.vectors.shape[1])
    self.word_to_dim = dict((w, i) for i, w in enumerate(self.words))
    self.vectors = np.zeros((vsize + 2, model.vectors.shape[1]), np.float64)
    for i, row in enumerate(model.vectors):
        self.vectors[i + 2] = row
    # Take a random in-vocabulary vector for <UNK>.
    # TODO: an average of something instead?
    self.vectors[1] = self.vectors[np.random.randint(2, len(self.words))]
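# Hypothetical usage sketch: Vocab is an invented minimal holder class; only
# read_vocab() above comes from the original code, and numpy/lwvlib are assumed
# to be imported as in the surrounding file.
class Vocab:
    pass

Vocab.read_vocab = read_vocab   # attach the method defined above

v = Vocab()
v.read_vocab("finnish_4B_parsebank_skgram.bin", vsize=50000)
print(v.shape)                  # (vsize + 2, dim) when the model has >= vsize words
print(v.word_to_dim["<UNK>"])   # 1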
def main(args):
    print("Reading", args.char_model, "model", file=sys.stderr)
    model = lwvlib.load(args.char_model)
    vocabulary = read_vocab(args.vocab)
    # TODO: normalize?
    # all_features: the feature strings themselves
    all_features = []
    # data: the vector matrix
    data = np.zeros((len(vocabulary), model.vsize), np.float32)
    for feature, wcount in vocabulary:
        vector = np.zeros(model.vsize, np.float32)
        norm = 0
        postag, feats = feature.split("|", 1)
        for feat in feats.split("|"):
            vidx = model.get(postag + "|" + feat)
            if vidx is not None:
                vector = np.add(vector, model.vectors[vidx])
                norm += 1
        if norm == 0:
            print("No vector for feature", feature, "with pos tag", postag,
                  file=sys.stderr)
            continue
        vector = np.divide(vector, norm)  # average over the individual feature vectors
        data[len(all_features), :] = vector
        all_features.append(feature)
    # Reshape data to len(all_features) rows.
    data = data[:len(all_features), :]
    print("Original vector model:", len(model.words), model.vsize, file=sys.stderr)
    print("New vector model:", data.shape, file=sys.stderr)
    assert len(all_features) == data.shape[0]
    save_txt(all_features, data, args.output)
def replace_embeddings(host, donor, out):
    """
    Replaces donor embeddings in host, writes to out.
    host:  hnmt/bnas Embeddings submodel file name in
    donor: word2vec .bin file name
    out:   hnmt/bnas Embeddings submodel file name out
    """
    with open(host, "rb") as hostf:
        host_config = pickle.load(hostf)
        host_data = pickle.load(hostf)
    host_matrix = host_data[("w", )]
    print("...{} loaded".format(host), file=sys.stderr, flush=True)

    donor_model = lwvlib.load(donor)
    donor_matrix = donor_model.vectors.astype(host_matrix.dtype)
    assert host_matrix.shape[1] == donor_matrix.shape[1], (
        host_matrix.shape[1], donor_matrix.shape[1])
    print("...{} loaded".format(donor), file=sys.stderr, flush=True)
    print("Sum before exchange", np.sum(host_data[("w", )]),
          file=sys.stderr, flush=True)

    # Replace
    for w, host_dim in host_config["src_encoder"].index.items():
        donor_dim = donor_model.get(w)
        if donor_dim is None:
            print("Cannot map {}".format(w), file=sys.stderr, flush=True)
            continue
        host_matrix[host_dim] = donor_matrix[donor_dim]
    # Done replacing

    with open(out, "wb") as outf:
        pickle.dump(host_config, outf)
        pickle.dump(host_data, outf)
    print("Sum after exchange", np.sum(host_data[("w", )]),
          file=sys.stderr, flush=True)
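# Hypothetical usage sketch: the file names are placeholders; only
# replace_embeddings() above comes from the original code, and pickle, sys,
# numpy and lwvlib are assumed to be imported as in the surrounding file.
if __name__ == "__main__":
    replace_embeddings(
        host="src_embeddings.pickle",              # pickled hnmt/bnas Embeddings submodel
        donor="finnish_4B_parsebank_skgram.bin",   # word2vec .bin with a matching dimension
        out="src_embeddings.replaced.pickle")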
        random.shuffle(shuffle)
        data_matrix = data_matrix[shuffle]
        class_indices = class_indices[shuffle].astype(np.int32)
        return class_indices, data_matrix


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Sclassifier')
    parser.add_argument('--embeddings', help='embeddings file')
    parser.add_argument('--edim', type=int, default=200000,
                        help='Embedding dimension')
    parser.add_argument('--tr-data', help='training data')
    parser.add_argument('--max-seq-len', type=int, default=30,
                        help='maximum sequence length')
    args = parser.parse_args()

    # Source of embeddings
    wv_model = lwvlib.load(args.embeddings, args.edim, args.edim)
    d = Data(wv_model)
    with open(args.tr_data) as tr_f:
        class_indices, data_matrix = d.read(tr_f, args)
    class_indices_1hot = keras.utils.to_categorical(class_indices)

    inp_seq = Input(shape=(args.max_seq_len,), name="words", dtype='int32')
    inp_embeddings = Embedding(*wv_model.vectors.shape,
                               input_length=args.max_seq_len,
                               mask_zero=False,
                               weights=[wv_model.vectors])
    inp_embeddings.trainable = False
    text_src = inp_embeddings(inp_seq)
    # gru1_out = GRU(100, name="gru1")(text_src)
    cnn1_out = Conv1D(100, 2, padding="same")(text_src)
"""Matrix words x dimensions, returns""" vectors = sklearn.preprocessing.normalize(vectors) top_n = np.argpartition(-np.abs(vectors), max_count, 0) top_per_dim = (top_n.T)[:, :max_count] #dims x max_count for dim in range(top_per_dim.shape[0]): print(dim, " ", end=' ') for w in top_per_dim[dim]: print(model.words[w], end=' ') print() if __name__ == "__main__": parser = argparse.ArgumentParser(description='ICA postprocessing for w2v') parser.add_argument('--w2v-model', required=True, action="store", help='W2V model') parser.add_argument('--ica-model', required=True, action="store", help='ICA model') parser.add_argument('--max-rank', type=int, default=200000, help='Max rank default %(default)d') args = parser.parse_args() model = lwvlib.load(args.w2v_model, args.max_rank, args.max_rank) ica_matrix = np.load(args.ica_model) dimension_extremes(ica_matrix, 15, model)
parser.add_argument("--max-rank-mem", type=int, default=None, help='Max vectors in memory') parser.add_argument("--max-rank", type=int, default=None, help='Max vectors total') parser.add_argument("--interactive", default=False, action="store_true", help='Read interactive') args = parser.parse_args() models = [ lwvlib.load(m, max_rank_mem=args.max_rank_mem, max_rank=args.max_rank) for m in args.models ] while True: if args.interactive: w = input("> ") else: w = sys.stdin.readline().strip() if not w: break nearest = [m.nearest(w, 200) for m in models] if any(n == None for n in nearest): print("Out of vocab") continue for nn in zip(*nearest): for sim, neighb in nn:
import finmeter, random, lwvlib, json, re
from uralicNLP import semfi
from uralicNLP import uralicApi
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import numpy as np
from numpy.random import choice
from markovchain import JsonStorage
from markovchain.text import MarkovText, ReplyMode

wv = lwvlib.load("fin-word2vec-lemma.bin", 10000, 500000)
vowels = ['a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö']
markov = MarkovText.from_file('kalevala_and_others_markov.json')
fallback_markov = MarkovText.from_file('fallback_markov.json')


def count_syllables(verse):
    ''' Count the syllables in a verse. '''
    tokenizer = RegexpTokenizer(r'\w+')
    n_syllables = 0
    for word in tokenizer.tokenize(verse):
        try:
            n_syllables += len(finmeter.hyphenate(word).split("-"))
        except Exception as e:
            pass
            # print(e)
            # print(verse)
            # print("Error here: count_syllables")
import sys, json, gzip

import lwvlib
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def check_nearest_words(vector, vector_i, wv):
    wv_vectors = wv.vectors
    wv_words = wv.words
    sims = cosine_similarity(np.array(vector).reshape(1, -1), wv_vectors)[0]
    indexes = np.argsort(sims)[::-1]
    for index in indexes[:10]:
        print(wv_words[index])
    print()
    # print(vector)
    # print(wv.nearest(vector))


if __name__ == "__main__":
    wv = lwvlib.load("/home/ginter/w2v/pb34_wf_200_v2.bin", max_rank=400000)
    for line in sys.stdin:
        vectors = json.loads(line)
        for vector_i, vector in enumerate(vectors):
            check_nearest_words(vector, vector_i, wv)
def get_sents(inp):
    ordered = []  # (major, minor)
    sents = {}    # (major, minor) -> Sentiment()
    for line in inp:
        line = line.strip().lower()
        if not line:
            continue
        if line.startswith("## "):
            minor = line.split()[1]
            ordered.append((major, minor))
        elif line.startswith("# "):
            major = line.split()[1]
        else:
            words = set(commaRe.split(line))
            sents[(major, minor)] = Sentiment(major, minor, words)
    return ordered, sents


if __name__ == "__main__":
    wv = lwvlib.load("../w2v/pb34_wf_200_v2_skgram.bin", 1000000, 1000000)
    ordered, sents = get_sents(sys.stdin)
    for s in sents.values():
        s.expand(wv, 100)
    m = None
    for (major, minor) in ordered:
        if major != m:
            print("#", major)
            print()
            m = major
        print(sents[(major, minor)])
        print()
import lwvlib

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Train')
    parser.add_argument('--orig', required=True,
                        help='A (.bin) embedding file before training.')
    parser.add_argument(
        '--trained', required=True,
        help='A (.bin) embedding file after training '
             '(produced by tools/extract_embeddings).')
    args = parser.parse_args()

    orig_m = lwvlib.load(args.orig)
    trained_m = lwvlib.load(args.trained)
    df = orig_m.vectors - trained_m.vectors
    print(df)
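# Sketch (assumption, not in the original script): a small helper that
# summarizes the difference matrix by listing the words whose vectors moved
# the most during training.  Could be called right after computing `df` above.
import numpy as np

def most_changed_words(words, diff, n=20):
    """Return the n words with the largest L2 change between the two models."""
    shift = np.linalg.norm(diff, axis=1)   # per-word L2 norm of the change
    top = np.argsort(shift)[::-1][:n]      # indices of the n largest shifts
    return [(words[i], float(shift[i])) for i in top]

# Example call (hypothetical): most_changed_words(orig_m.words, df)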