def __init__(self, sentences=None, size=300, alpha=0.025, window=8, min_count=5,
             sample=0, seed=1, workers=1, min_alpha=0.0001, dm=1, hs=1, negative=0,
             dm_mean=0, train_words=True, train_lbls=True, **kwargs):
    """
    Initialize the model from an iterable of `sentences`. Each sentence is a
    LabeledSentence object that will be used for training.

    The `sentences` iterable can be simply a list of LabeledSentence elements, but for
    larger corpora, consider an iterable that streams the sentences directly from
    disk/network.

    If you don't supply `sentences`, the model is left uninitialized -- use this if you
    plan to initialize it in some other way.

    `dm` defines the training algorithm. By default (`dm=1`), distributed memory is used.
    Otherwise, distributed bag of words (`dbow`) is employed.

    `size` is the dimensionality of the feature vectors.

    `window` is the maximum distance between the current and predicted word within a sentence.

    `alpha` is the initial learning rate (will linearly drop to zero as training progresses).

    `seed` = seed for the random number generator.

    `min_count` = ignore all words with total frequency lower than this.

    `sample` = threshold for configuring which higher-frequency words are randomly
    downsampled; default is 0 (off), useful value is 1e-5.

    `workers` = use this many worker threads to train the model (=faster training with
    multicore machines).

    `hs` = if 1 (default), hierarchical softmax will be used for model training (else set to 0).

    `negative` = if > 0, negative sampling will be used; the int for negative specifies how
    many "noise words" should be drawn (usually between 5-20).

    `dm_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean.
    Only applies when dm is used.

    """
    Word2Vec.__init__(self, size=size, alpha=alpha, window=window, min_count=min_count,
                      sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
                      sg=(1 + dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean, **kwargs)
    self.train_words = train_words
    self.train_lbls = train_lbls
    if sentences is not None:
        self.build_vocab(sentences)
        self.train(sentences)
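# A minimal usage sketch for the constructor above (hypothetical corpus), assuming it
# belongs to the old-style gensim Doc2Vec class where each training item is a
# LabeledSentence; the names and parameters below are illustrative only.
from gensim.models.doc2vec import LabeledSentence

docs = [
    LabeledSentence(words=['nice', 'weather', 'today'], labels=['SENT_0']),
    LabeledSentence(words=['heavy', 'rain', 'expected', 'tomorrow'], labels=['SENT_1']),
]
# model = Doc2Vec(docs, size=100, window=8, min_count=1, workers=2)
# model['SENT_0']  # in old gensim, label vectors are queried like word vectors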
def gene():
    modelpre = Word2Vec.load('corpus/pretrain40.model')
    modelfield = Word2Vec.load('corpus/fieldtrained40.model')
    modelmerged = Word2Vec.load('corpus/mergedtrained40iter1.model')
    xlist = []
    ylist = []
    zlist = []
    labellist = []
    upperline = 0.016
    floor = 0.008  # 0.01 0.013
    upperlinefield = 0.06
    floorfield = 0.02
    upperlinepre = 0.019
    floorpre = 0.018
    with open('corpus/word2pic2.txt') as fp:
        for row in fp:
            word = unicode(row[:-1])
            x = ((modelmerged.similarity(word, u"好") + modelmerged.similarity(word, u"快乐")
                  + modelmerged.similarity(word, u"开心")) / 3.0
                 - (modelmerged.similarity(word, u"坏") + modelmerged.similarity(word, u"悲伤")) / 2.0)
            y = ((modelfield.similarity(word, u"好") + modelfield.similarity(word, u"快乐")
                  + modelfield.similarity(word, u"开心")) / 3.0
                 - (modelfield.similarity(word, u"坏") + modelfield.similarity(word, u"悲伤")) / 2.0)
            z = ((modelpre.similarity(word, u"好") + modelpre.similarity(word, u"快乐")
                  + modelpre.similarity(word, u"开心")) / 3.0
                 - (modelpre.similarity(word, u"坏") + modelpre.similarity(word, u"悲伤")) / 2.0)
            labellist.append(word)
            # xlist.append(x - (upperline + floor) / 2.0)
            xlist.append(x - 0.016)
            ylist.append(y - (upperlinefield + floorfield) / 2.0)
            zlist.append(z - (upperlinepre + floorpre) / 2.0)
    # with open('corpus/word2picxyz.txt', 'w') as fp:
    #     pickle.dump(labellist, xlist, ylist, zlist, fp)
    return labellist, xlist, ylist, zlist
def dis(vectorsize):
    # print model.similarity("今天","在")
    model = Word2Vec.load('corpus/mergedtrained' + str(vectorsize) + 'iter1' + '.model')
    modelfield = Word2Vec.load('corpus/fieldtrained' + str(vectorsize) + '.model')
    print model.similarity(u"分手", u"好")
    print model.similarity(u"分手", u"坏")
    print modelfield.similarity(u"分手", u"好")
    print modelfield.similarity(u"分手", u"坏")
def load_word2vec(w2v):
    if isinstance(w2v, str):
        print("Loading word vectors from '%s'..." % w2v, flush=True)
        try:
            w2v = Word2Vec.load_word2vec_format(w2v)
        except ValueError:
            w2v = Word2Vec.load_word2vec_format(w2v, binary=True)
    return w2v
def intersect(vectorsize):
    model = Word2Vec.load('corpus/fieldtrained' + str(vectorsize) + '.model')
    # setwordwindow(vectorsize)
    print 'finish load'
    Word2Vec.intersect_word2vec_format(model, 'corpus/initindex' + str(vectorsize), binary=False)
    print 'finish intersect'
    model.save('corpus/merged' + str(vectorsize) + '.model')
    model.save_word2vec_format('corpus/merged' + str(vectorsize), binary=False)
    print 'finish save'
def get_model(model_num, model_names):
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        # C binary format
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)
    return model
def train(self, sentences, total_examples=None, total_words=None, epochs=None,
          start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0):
    self.neg_labels = []
    if self.negative > 0:
        # precompute negative labels optimization for pure-python training
        self.neg_labels = zeros(self.negative + 1)
        self.neg_labels[0] = 1.
    Word2Vec.train(self, sentences, total_examples=self.corpus_count, epochs=self.iter,
                   start_alpha=self.alpha, end_alpha=self.min_alpha)
    self.get_vocab_word_vecs()
def main():
    # te()
    # teword()
    # intersect(40)
    # setwordwindow(40)
    # Word2Vec.load_word2vec_format('corpus/initindex40', binary=False)
    modelpre = Word2Vec.load('corpus/pretrain40.model')
    modelfield = Word2Vec.load('corpus/fieldtrained40.model')
    modelmerged = Word2Vec.load('corpus/mergedtrained40iter1.model')
    print 'finish load'
    classify(modelpre, modelfield, modelmerged, 40)
def teword():
    # model = Word2Vec.load_word2vec_format('vectorseg.bin', binary=False)
    # sim = model.most_similar(positive=[u'好', u'开心'], negative=[u'下雨'], topn=2)
    # print sim
    documents = [u"今天 天气 真是 好 啊", u"明天 就要 下雨 了,伐 开心"]
    model = Word2Vec(documents, size=20, window=5, min_count=1)
    sim = model.most_similar(positive=[u"好"], topn=2)
    # model.save('./tmp/tevec')
    print sim
    model = Word2Vec.load_word2vec_format('vectorseg.bin', binary=False)
    Word2Vec.intersect_word2vec_format(model, 'fieldvec.bin', binary=False)
    # the original call referenced undefined `sentences` and `alpha`; reuse the toy
    # corpus and the model's own learning rate here
    Word2Vec.train_batch_sg(model, documents, alpha=model.alpha, work=None)
def load_external(self, model_file_name):
    """
    load a word2vec model from the file specified
    :param model_file_name: name of the model file
    :return:
    """
    self.model = Word2Vec.load(model_file_name)
def main():
    industry = sys.argv[1]
    vocab_file = "../data/" + industry + "/embed_vocab"
    model_file = "../data/" + industry + "/user_model"

    # load vocab list
    with open(vocab_file) as f:
        vocab_list = map(str.strip, f.readlines())

    # load model
    model = Word2Vec.load(model_file)

    # build vocab index dict
    vob_index_dict = {}
    for i, vob in enumerate(vocab_list):
        vob_index_dict[vob] = i

    # calc vocab dist
    logging.info("calculating vocab dist matrix")
    dm = get_vocab_dist_matrix(vocab_list, model)

    # get company domain list dict
    comp_domain_file = "../data/" + industry + "/company_file"
    comp_dict = get_comp_dict(comp_domain_file)
    logging.info("company dict generated : " + str(comp_dict.keys()))

    # drop domains that are not in the vocab list
    filter_company_by_vocab(comp_dict, vocab_list)

    # filter company domain by uv : default uv > 100
    filter_action_by_uv(comp_dict, 100)

    # calc dist between companies
    res_file = "../data/" + industry + "/company_dist"
    calc_company_dist(res_file, comp_dict, dm, vob_index_dict)
def __init__(self):
    '''
    Training parameters:
    '''
    self.w2v_dim = 100
    self.num_feature = 400
    self.batch_size = 16
    self.num_epoch = 30

    # self.w2v_model = Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
    self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
    self.index2word_set = set(self.w2v_model.index2word)

    # self.bigram = None
    # self.trigram = None
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')

    print('Build model...')
    self.model = Sequential()
    self.model.add(Dropout(0.2, input_shape=(self.num_feature,)))
    self.model.add(Dense(3, input_dim=self.num_feature, init='orthogonal'))
    self.model.add(Activation('softmax'))
    self.model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode="categorical")
    print('Model has been built!')
def return_data(data_type, embed_dim=50):
    """Return the data specified by the inputted `data_type`.

    This function is built to allow for easier calls for the data from
    scripts external to this one.

    Args:
    ----
        data_type: str
        embed_dim (optional): int

    Return: varied
    """
    if data_type == "word_embedding":
        embedding_fp = 'data/word_embeddings/glove.6B.{}d.txt'.format(embed_dim)
        wrd_embedding = Word2Vec.load_word2vec_format(embedding_fp, binary=False)
        return wrd_embedding
    elif data_type == "articles":
        body_fp = 'data/articles/twenty_newsgroups/bodies.pkl'
        headline_fp = 'data/articles/twenty_newsgroups/headlines.pkl'
        with open(body_fp, 'rb') as f:
            bodies = pickle.load(f)
        with open(headline_fp, 'rb') as f:
            headlines = pickle.load(f)
        return bodies, headlines
    else:
        raise Exception('Invalid data type requested!')
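# Hypothetical usage of return_data above; it only runs if the GloVe file for the
# requested embed_dim and the pickled article files exist at the paths hard-coded
# inside the function.
if __name__ == '__main__':
    wrd_embedding = return_data("word_embedding", embed_dim=50)
    bodies, headlines = return_data("articles")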
def get_init_data(model_file, ark_file):
    from gensim.models.word2vec import Word2Vec

    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)

    files = [resource_filename('twitter_dm', 'data/identity_dictionaries/identity/' + x)
             for x in resource_listdir('twitter_dm', 'data/identity_dictionaries/identity/')]
    files += [resource_filename('twitter_dm', 'data/identity_dictionaries/non_identity_words/' + x)
              for x in resource_listdir('twitter_dm', 'data/identity_dictionaries/non_identity_words/')]
    all_dictionaries = Dictionaries(list_of_files=files)

    twit_sets = []
    stopwords = get_stopwords()
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(None)

    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
            (tw_distant_supervision_identity_dat.tot > v)].term.values)
        twit_id = {t for t in twit_id
                   if t not in stopwords and t.replace(" person", "") not in stopwords}
        twit_sets.append([twit_id, "twit_identities_" + str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES, "expert_non"])
    twit_sets.append([stopwords, "stopword"])

    return model, all_dictionaries, ark_clusters, [t[0] for t in twit_sets], [t[1] for t in twit_sets]
def load_vectors():
    print("loading word2vec vectors...")
    t0 = time()
    model = Word2Vec.load_word2vec_format(
        '/Volumes/Seagate Backup Plus Drive/MacFilesThatICantFit/GoogleNews-vectors-negative300.bin',
        binary=True)
    loadTime = time() - t0
    print("word2vec vectors loaded in %0.3f seconds" % loadTime)
    print()

    # done "training" the model; we can do the following to trim unneeded memory
    t0 = time()
    print("trimming model memory...")
    model.init_sims(replace=True)
    trimTime = time() - t0
    print("trimmed memory in %0.3f seconds" % trimTime)
    print()

    vec = model['hello']
    print('type of vector')
    print(type(vec))
    print('vector')
    print(vec)
    sys.exit(1)

    return model
def return_data(data_type, embed_dim=50):
    """Return the data specified by the inputted `data_type`.

    This function is built to allow for easier calls for the data from
    scripts external to this one.

    Args:
    ----
        data_type: str
        embed_dim (optional): int

    Return: varied
    """
    if data_type == "word_embedding":
        embedding_fp = 'data/word_embeddings/glove.6B.{}d.txt'.format(embed_dim)
        wrd_embedding = Word2Vec.load_word2vec_format(embedding_fp, binary=False)
        return wrd_embedding
    elif data_type == "reviews":
        reviews_fp = 'work/reviews/amazon/filtered_tokenized_reviews.pkl'
        ratios_fp = 'work/reviews/amazon/filtered_ratios.npy'
        with open(reviews_fp, 'rb') as f:
            reviews = pickle.load(f)
        ratios = np.load(ratios_fp)
        return reviews, ratios
    else:
        raise Exception('Invalid data type requested!')
def __init__(self, *args, **kwargs):
    '''
    Computes various measures of central tendency of a document.
    For Z_X scores, the raw word tokens are summed over the partition function.
    For I_X scores, the same statistics are computed over the similarity of all
    word pairs for words with top 10% Z values. This will precompute the partition
    function if it doesn't exist.
    '''
    cfg_embed = kwargs["embedding"]
    cfg_score = kwargs["score"]

    f_w2v = os.path.join(
        cfg_embed["output_data_directory"],
        cfg_embed["w2v_embedding"]["f_db"],
    )
    f_partition_function = os.path.join(
        cfg_embed["output_data_directory"],
        cfg_score["document_log_probability"]["f_partition_function"],
    )

    if not os.path.exists(f_partition_function):
        self.create_partition_function(f_w2v, f_partition_function)

    self.Z = self.load_partition_function(f_partition_function)
    self.scores = []

    val = cfg_score["document_log_probability"]["intra_document_cutoff"]
    self.intra_document_cutoff = float(val)

    self.model = Word2Vec.load(f_w2v)
def initialize(self):
    sys.stdout.write("Metric initialization\n")

    sys.stdout.write("1 - Word2vec model")
    self.model = Word2Vec.load(model_path)
    sys.stdout.write("...loaded\n")

    sys.stdout.write("2 - Stop words")
    self.stop_words = [line.strip('\n') for line in open(stop_words_path)]
    sys.stdout.write("...loaded\n")

    sys.stdout.write("3 - Word-Averages model: ")
    self.wordAverages = defaultdict()
    for i in self.files_list:
        sys.stdout.write(str(i) + " - ")
        sys.stdout.flush()
        tweetsFile = tweets_path + str(i) + ".csv"
        wAvgsFile = wAvgs_path + str(i) + ".csv"
        tweets = []
        values = []
        with open(tweetsFile, 'r') as f1:
            tweets = f1.readlines()
            f1.close()
        with open(wAvgsFile, 'r') as f2:
            reader = csv.reader(f2)
            for r in reader:
                values.append(np.array([float(v) for v in r]))
            f2.close()
        for j in range(len(tweets)):
            self.wordAverages[tweets[j].strip('\n')] = values[j]
    sys.stdout.write("loaded\n")
def create_partition_function(self, f_w2v, f_h5):
    print "Building the partition function"

    # Load the model from disk
    M = Word2Vec.load(f_w2v)
    words = M.index2word

    ZT = []
    INPUT_ITR = tqdm.tqdm(words)

    # Compute the partition function for each word
    for w in INPUT_ITR:
        UE = self.energy(M.syn0, M[w])
        z = compute_partition_stats(UE)
        ZT.append(z)

    # Save the partition function to disk
    # (special care needed for h5py unicode strings)
    dt = h5py.special_dtype(vlen=unicode)

    with h5py.File(f_h5, 'w') as h5:
        h5.create_dataset("words", (len(words),),
                          dtype=dt,
                          data=[w.encode('utf8') for w in words])
        h5.attrs['vocab_N'] = len(words)
        h5['Z'] = ZT
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    # imdb_w2v.train(words)
    train_vecs = buildWordVector(words, n_dim, imdb_w2v)
    # print train_vecs.shape
    return train_vecs
def build_word_graph(model_fname, limiar=0.2):
    """
    Builds a word graph weighted by the similarity between words according to the model.
    :param model_fname: name of the file with the saved word2vec model
    :return: graph object
    """
    m = Word2Vec.load(model_fname)
    g = Graph()
    freq = g.new_vertex_property("int")
    weight = g.new_edge_property("float")
    i = 0
    vdict = {}
    for w1, w2 in combinations(m.vocab.keys(), 2):
        if w1 == '' or w2 == '':
            continue
        # print(w1, w2)
        v1 = g.add_vertex() if w1 not in vdict else vdict[w1]
        vdict[w1] = v1
        freq[v1] = m.vocab[w1].count
        v2 = g.add_vertex() if w2 not in vdict else vdict[w2]
        vdict[w2] = v2
        freq[v2] = m.vocab[w2].count
        sim = m.similarity(w1, w2)
        if sim > 0.1:
            e = g.add_edge(v1, v2)
            weight[e] = sim
        if i > 10000:
            break
        i += 1
    g.vertex_properties['freq'] = freq
    g.edge_properties['sim'] = weight
    return g
def __init__(self, *args, **kwargs):
    super(generic_document_score, self).__init__(*args, **kwargs)

    f_w2v = os.path.join(
        kwargs["embedding"]["output_data_directory"],
        kwargs["embedding"]["w2v_embedding"]["f_db"],
    )

    # Load the model from disk
    self.M = Word2Vec.load(f_w2v)
    self.shape = self.M.syn0.shape

    # Build the dictionary
    vocab_n = self.shape[0]
    self.word2index = dict(zip(self.M.index2word, range(vocab_n)))

    # Set parallel option (currently does nothing)
    self._PARALLEL = kwargs["_PARALLEL"]

    # Load the negative weights
    if "negative_weights" in kwargs:
        neg_W = kwargs["negative_weights"]
        self.neg_W = dict((k, float(v)) for k, v in neg_W.items())
    else:
        self.neg_W = {}
def from_word2vec_model(cls, word2vec_model):
    """
    WARNING: `gensim` is required to use this function!

    Load a word2vec vector model.

    :param word2vec_model: path to word2vec model or a fitted word2vec model
    :return: a `Vectors` object
    """
    try:
        import gensim
        # gensim version hack
        if int(gensim.__version__.split('.')[0]) < 1:
            from gensim.models.word2vec import Word2Vec as Word2VecLoader
        else:
            from gensim.models import KeyedVectors as Word2VecLoader
    except ImportError as ex:
        logging.error('Gensim is required to use this method!')
        raise ex

    if isinstance(word2vec_model, str):
        model = Word2VecLoader.load_word2vec_format(word2vec_model,
                                                    binary=word2vec_model.endswith('bin'))
    else:
        model = word2vec_model

    vocab = model.vocab.keys()
    vectors = {}
    dims = len(model[next(iter(vocab))])  # vector dimensionality
    dimension_names = ['f%02d' % i for i in range(dims)]
    for word in vocab:
        vectors[word] = zip(dimension_names, model[word])
    return Vectors(vectors)
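# Hypothetical usage of from_word2vec_model above; the path and the surrounding
# Vectors class are assumptions taken from the snippet, not a verified API.
# vecs = Vectors.from_word2vec_model('embeddings/news.bin')   # binary format inferred from the '.bin' suffix
# vecs = Vectors.from_word2vec_model(trained_gensim_model)    # or pass an already-fitted model object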
def vectorize(model_file, dictionary_file, corpus_file):
    seterr(all='raise')  # don't ignore numpy errors

    # load model from given file
    model = Word2Vec.load(model_file)
    dictionary = corpora.Dictionary().load(dictionary_file)
    corpus = corpora.MmCorpus(corpus_file)
    tfidf = models.TfidfModel(corpus)

    d = corpora.Dictionary()
    d = d.load(dictionary_file)
    corpus = corpora.MmCorpus(corpus_file)
    tf = models.TfidfModel(corpus)

    vectorize = []
    for doc_no, tdoc in enumerate(tf[corpus]):
        tdoc.sort(key=lambda kv: kv[1], reverse=True)
        if doc_no % 100 == 0:
            logger.info("PROGRESS: vectorizing user #%i of %i" % (doc_no, len(corpus)))
        words_per_user = 8
        word_vecs = []
        for wordid, measure in tdoc:
            word = d[wordid]
            if word in model:
                word_vecs.append(model[word])
                print word
            if len(word_vecs) >= words_per_user:
                break
        if len(word_vecs) == words_per_user:
            avg = matutils.unitvec(array(word_vecs).mean(axis=0)).astype(REAL)
            vectorize.append(avg)
            # print [word for word, measure in model.most_similar_from_array(avg, topn=5)]
    return vectorize
def __init__(self, word2vec_path=""):
    self.sentence = []
    self.tfidf_sparse = []
    self.bi_set = [-1 for i in range(1000000)]
    self.tfidf_model_dict = {}
    if word2vec_path != "":
        self.word2vec_model = Word2Vec.load(word2vec_path)
def term_expansion(fpath, terms, knn):
    '''Expand term list by creating list of nearest neighbors in provided embeddings
    representation. This is usually very noisy and there is a fuzzy distinction between
    semantic similarity and "relatedness". Bacteria names, for example, often neighbor
    diseases caused by those organisms.
    '''
    model = Word2Vec.load(fpath)
    model.init_sims()
    nbrs = NearestNeighbors(n_neighbors=knn + 1, algorithm='ball_tree', metric='l2')
    nbrs.fit(model.syn0norm)

    expansion = []
    for phrase in terms:
        # space replaced with underscore in PMC/PubMed embeddings
        phrase = phrase.replace(" ", "_")
        if phrase not in model.vocab:
            continue
        idx = model.vocab[phrase].index
        vec = model.syn0norm[idx]
        _, indices = nbrs.kneighbors(vec)
        neighbors = [model.index2word[j] for j in indices.flatten()]
        neighbors.remove(phrase)
        expansion += neighbors

    # transform words back to whitespace separators
    return map(lambda x: x.replace("_", " "), expansion)
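# Hypothetical call to term_expansion above: expand two seed terms with their five
# nearest neighbours from an embedding file; the model path and terms are illustrative only.
# expanded = list(term_expansion('embeddings/pubmed_w2v.model',
#                                ['escherichia coli', 'staphylococcus aureus'],
#                                knn=5))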
def wordEmbedding():
    """
    This code is from http://vene.ro/blog/word-movers-distance-in-python.html
    """
    if not os.path.exists("data/embed.dat"):
        print ("Caching word embeddings in memmapped format...")
        from gensim.models.word2vec import Word2Vec
        wv = Word2Vec.load_word2vec_format(
            "/home/medialab/NLP_data/GoogleNews-vectors-negative300.bin.gz", binary=True)
        fp = numpy.memmap("data/embed.dat", dtype=numpy.double, mode='w+', shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        with open("data/embed.vocab", "w") as f:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                print >> f, unidecode(w)
            pass
        del fp, wv

    W = numpy.memmap("data/embed.dat", dtype=numpy.double, mode="r", shape=(3000000, 300))
    with open("data/embed.vocab") as f:
        vocab_list = map(str.strip, f.readlines())
    vocab_dict = {w: k for k, w in enumerate(vocab_list)}
    return W, vocab_dict
def query_word_similarity(model_file, word1, word2):
    seterr(all='raise')  # don't ignore numpy errors

    # load model from given file
    model = Word2Vec.load(model_file + '.model')
    similarity = model.similarity(word1, word2)
    logging.info("similarity of '%s' and '%s' is %f" % (word1, word2, similarity))
def __init__(self, tokenWeights=True, extraFeatures=True,
             EXTRA_WEIGHTS_LABELS=['bleuScore', 'similarityScore',
                                   'wordMoversDistance', 'crossUnigramsRatio']):
    self.words = {}
    self.words2 = {}  # hypothesis words
    self.wordId = 0
    self.wordId2 = 0  # hypothesis
    self.extraFeatures = {}  # for our new features
    self.docId = 0
    self.documents = {}
    self.tokenWeights = tokenWeights
    self.extraFeatures = extraFeatures
    self.EXTRA_WEIGHTS_LABELS = EXTRA_WEIGHTS_LABELS

    #####################
    if not os.path.exists("data/embed.dat"):
        print("Caching word embeddings in memmapped format...")
        # from gensim import models
        from gensim.models.word2vec import Word2Vec
        wv = Word2Vec.load_word2vec_format("data/GoogleNews-vectors-negative300.bin.gz", binary=True)
        wv.init_sims(replace=True)  # recommended new step?
        fp = np.memmap("data/embed.dat", dtype=np.double, mode='w+', shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        with open("data/embed.vocab", "w") as f:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                f.write(w.encode('utf-8'))
                f.write('\n'.encode('utf-8'))
                # print(w, file=f)
            pass
        del wv

    self.W = np.memmap("data/embed.dat", dtype=np.double, mode="r", shape=(3000000, 300))
    with open("data/embed.vocab") as f:
        self.vocab_list = map(str.strip, f.readlines())
    self.vocab_dict = {w: k for k, w in enumerate(self.vocab_list)}
def __init__(self, *args, **kwargs):
    super(affinity_mapping, self).__init__(*args, **kwargs)

    # Load the model from disk
    self.M = Word2Vec.load(kwargs["f_w2v"])
    self.shape = self.M.syn0.shape

    # Set parallel option
    self._PARALLEL = ast.literal_eval(kwargs["_PARALLEL"])
    self.damping = float(kwargs["damping"])

    if not os.path.exists(kwargs["f_affinity"]):
        h5 = h5py.File(kwargs["f_affinity"], 'w')
        h5.close()

    self.h5 = h5py.File(kwargs["f_affinity"], 'r+')

    global damping, M
    damping = self.damping
    M = self.M

    self.vocab_n = len(M.index2word)
    M.word2index = dict([(w, i) for w, i in zip(M.index2word, range(self.vocab_n))])

    # Increment this as we find more clusters
    self.cluster_n = 0
'''
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
# extract a fixed test set
x_test, y_test = data_helpers.load_data_and_labels(FLAGS.positive_test_file, FLAGS.negative_test_file)
max_document_length = max([len(x.split(" ")) for x in x_text])
print('max_document_length')
print(max_document_length)
'''

'''
vectorize with word2vec
'''
# load the model
model_path = './word2vec/word2vec_wx'
model = Word2Vec.load(model_path)
x_tmp = []
out_vocabulary_file = open('/Users/xiamin/Desktop/TextCNN/mail_data/out_of_vocabulary.txt', 'a')
line_num = 0
for line in x_text:
    line_num += 1
    line = line.decode('utf-8')
    words = line.split(" ")
    vector_line = []
    i = 0
    for word in words:
        if model.wv.__contains__(word):
            i = i + 1
            word_vec = model.wv[word]  # numpy array
    pool = Pool()
    ret = pool.map(f, parms)
    pool.close()
    pool.join()
    return ret


# -- run it in parallel...
# sentences = parallel_run(tokenize_document, enumerate(reviews_texts))
sentences = [tokenize_document(txt) for txt in enumerate(reviews_texts)]

# build a default w2v model...
w2v = Word2Vec(sentences=sentences, size=100, alpha=0.025, window=4,
               min_count=2, sample=1e-5, workers=4, negative=10)


def tokens_to_mean_vec(tokens, w2v):
    '''
    Takes a list of tokens and a Word2Vec model and finds the
    mean word vector of that list.
    '''
    vec = []
    for w in tokens:
        try:
            vec.append(w2v[w])
        except KeyError:
import mysql.connector

db = mysql.connector.connect(host="localhost", database="nsf",
                             user="******", password="******")
cursor = db.cursor()
cursor.execute("select AwardTitle, AbstractNarration from Award limit 100")

stop = set(stopwords.words('english'))
stop.add(',')
stop.add('.')

result = cursor.fetchall()
proc = []
for r in result:
    proc.append([w.lower() for w in sent_tokenize(r[0].replace("<br/>", "\n")) +
                 sent_tokenize(r[1].replace("<br/>", "\n")) if w.lower() not in stop])

print(proc)
exit()

model = Word2Vec(proc, size=100, window=5, min_count=5, workers=4)
model.save("nsf_w2v_model_2")
similar = model.wv.most_similar(['data', 'science'], [])
print(similar)
# -*- coding:utf-8 -*-
import logging
from gensim.models.word2vec import LineSentence, Word2Vec
import sys

reload(sys)
sys.setdefaultencoding("utf-8")

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# raw_sentences = ["the quick brown fox jumps over the lazy dogs", "yoyoyo you go home now to sleep"]
sentences = LineSentence("../dataset/train_questions_with_evidence.txt")
model = Word2Vec(sentences, min_count=1, iter=1000)
model.train(sentences, total_examples=model.corpus_count, epochs=1000)
model.save("../model/w2v.mod")

model_loaded = Word2Vec.load("../model/w2v.mod")
sim = model_loaded.wv.most_similar(positive=[u'酒精'])
for s in sim:
    print s[0]
def generate_word2vec_file(x: pd.DataFrame, y: pd.Series, description: str, feature_column: str,
                           timer: Timer = None,
                           feature_size: int = 100,
                           window_context: int = 5,
                           min_word_count: int = 5,
                           sample: float = 0.001,
                           iterations: int = 5,
                           ):
    """
    generate features using word2vec
    :param x:
    :param y:
    :param description:
    :param feature_column:
    :param timer:
    :param feature_size:
    :param window_context:
    :param min_word_count:
    :param sample:
    :param iterations:
    :return:
    """
    log.info("generating word2vec")
    log.debug(f'{x.head()}')

    wpt = WordPunctTokenizer()
    if timer:
        timer.start_timer(TOKENIZE_TIME_MIN)
    documents = [wpt.tokenize(review) for review in x.array]
    if timer:
        timer.end_timer(TOKENIZE_TIME_MIN)

    if timer:
        timer.start_timer(VECTORIZE_TIME_MIN)
    # TODO: add configuration for pre-trained or train
    # if x.shape[0] <= 50:
    w2v_model = Word2Vec(documents,
                         size=int(feature_size),
                         window=int(window_context),
                         min_count=int(min_word_count),
                         sample=sample,
                         iter=int(iterations)
                         )
    # else:
    #     log.info("Downloading pre-trained word2vec")
    #     w2v_model = api.load("word2vec-google-news-300")
    if timer:
        timer.end_timer(VECTORIZE_TIME_MIN)

    model_file = f"{MODEL_DIR}/{description}-{len(x)}-{feature_size}.model"
    log.info(f'Writing model file: {model_file}')
    if timer:
        timer.start_timer(MODEL_SAVE_TIME_MIN)
    w2v_model.save(model_file)
    if timer:
        timer.end_timer(MODEL_SAVE_TIME_MIN)

    feature_df = get_feature_df(w2v_model, x)
    return write_to_file(feature_df, y, feature_column, description, include_lda=False)
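# Hypothetical call to generate_word2vec_file above; `df` and its columns are illustrative,
# and the helpers it relies on (Timer, MODEL_DIR, get_feature_df, write_to_file, the *_TIME_MIN
# constants) come from the surrounding project, so this only shows the expected inputs.
# out_path = generate_word2vec_file(df['review_text'], df['label'],
#                                   description='reviews', feature_column='w2v',
#                                   feature_size=100, window_context=5)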
    for word in words:
        lower_case_word = word.lower()
        temp_sentence.append(lower_case_word)  # keep original formats
    Text_Data.append(temp_sentence)

print('the length of text data is ', len(Text_Data))

# Create CBOW model
# word2vec_model = gensim.models.Word2Vec(Text_Data, min_count=1, size=100, window=2)
# size : Dimensionality of the word vectors.

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

corpus = api.load('text8')
print("training word2vec model now...")
word2vec_model = Word2Vec(corpus)
print("word2vec training completed.")

word2vec_features = []
for sentence_index in range(len(Text_Data)):
    sentence = Text_Data[sentence_index]
    sentence_vector = []
    for word_index in range(len(sentence)):
        word = sentence[word_index]
        word_vector = word2vec_model.wv[word]
        sentence_vector.extend(word_vector)
    # for class 'ES'
    if Tags_List[sentence_index] == 'ES':
import os
import shlex
import re
import gensim
from pyemd import emd
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cosine
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.cross_validation import train_test_split

if not os.path.exists("/home/avinash/PycharmProjects/data_local_model/embed.dat"):
    print("Caching word embeddings in memmapped format...")
    from gensim.models.word2vec import Word2Vec
    wv = Word2Vec.load(
        "/home/caniz/Devel/cognostics/difficulty_measure/Shared/word2vec_models/size600_min50_window5_bigram/size600_min50_window5_bigram")
    fp = np.memmap("/home/avinash/PycharmProjects/data_local_model/embed.dat",
                   dtype=np.double, mode='w+', shape=wv.syn0.shape)
    fp[:] = wv.syn0[:]
    with open("/home/avinash/PycharmProjects/data_local_model/embed.vocab", "w") as f:
        for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
            print(w, file=f)
    del fp, wv

W = np.memmap("/home/avinash/PycharmProjects/data_local_model/embed.dat",
              dtype=np.double, mode="r", shape=(300000, 300))
with open("/home/avinash/PycharmProjects/data_local_model/embed.vocab") as f:
    vocab_list = map(str.strip, f.readlines())
vocab_dict = {w: k for k, w in enumerate(vocab_list)}
def get_predict_vecs(words):
    n_dim = 500
    imdb_w2v = Word2Vec.load('sentiment_analysis/w2v_model.pkl')
    train_vecs = build_sentence_vector_ave(words, n_dim, imdb_w2v)
    return train_vecs
def get_glove(W2V_DIM=50):
    glove_file = 'data/glove_data/glove.6B.' + str(W2V_DIM) + 'd.txt'
    glove = Word2Vec.load_word2vec_format(glove_file)
    return glove
parser.add_argument('--w2v', default='all.norm-sz100-w10-cb0-it1-min100.w2v',
                    nargs='?', help='Path to the word2vec model.')
parser.add_argument('--seed', default=228, type=int, nargs='?', help='Random seed.')
args = vars(parser.parse_args())

RANDOM_SEED = args['seed']
random.seed(RANDOM_SEED)

w2v = Word2Vec.load_word2vec_format(args['w2v'], binary=True, unicode_errors='ignore')
w2v.init_sims(replace=True)
print('Using %d word2vec dimensions from "%s".' % (w2v.layer1_size, args['w2v']))


def read_subsumptions(filename):
    subsumptions = []

    with codecs.open(filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            subsumptions.append((row[0], row[1]))
import torch  # needed for torch.tensor below
import torch.nn as nn
import matplotlib.pyplot as plt
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import time

from model import ASTNN

TRAINING_SET_SIZE = 30000
VALIDATION_SET_SIZE = 10000
TEST_SET_SIZE = 10000

print('Reading data...')
w2v = Word2Vec.load('./data/c/w2v_128').wv
embeddings = torch.tensor(np.vstack([w2v.vectors, [0] * 128]))
programs = pd.read_pickle('./data/c/id_code_label_ast_(index_tree).pkl')

training_set = programs[:TRAINING_SET_SIZE]
validation_set = programs[TRAINING_SET_SIZE:TRAINING_SET_SIZE + VALIDATION_SET_SIZE]
test_set = programs[TRAINING_SET_SIZE + VALIDATION_SET_SIZE:
                    TRAINING_SET_SIZE + VALIDATION_SET_SIZE + TEST_SET_SIZE]


def get_batch(dataset, i, batch_size):
    return dataset.iloc[i:i + batch_size]
def train_w2v(c):
    w2v = Word2Vec(c, size=100)
    return w2v
def __init__(self):
    self.data = pd.read_csv("./best_film.csv")
    self.annoy = AnnoyIndex(100, 'angular')
    self.annoy.load('annoy_for_films.ann')
    self.model = Word2Vec.load('w2v_for_films')
    self.lem = WordNetLemmatizer()
# sample(frac=1) -> shuffle data
corpus = pd.concat([train_df.text, test_df.text]).sample(frac=1)
# print(corpus.head())

# put it in Word2Vec
# adjust size/iter; sg=1 uses skip-gram (good for low-frequency words), sg=0 uses CBOW;
# window: use +-n surrounding tokens to predict
# train
# model = Word2Vec(corpus, size=250, iter=10, sg=0, window=3, min_count=7, max_vocab_size=None,
#                  workers=3, min_alpha=0.0001, hs=0, negative=5, batch_words=10000)
model.save('word2vec_min7.model')
# window = 3 word2vec
# window = 2 word2vec_window

# load
model = Word2Vec.load('word2vec_min7.model')


# check the trained model
def most_similar(w2v_model, words, topn=10):
    similar_df = pd.DataFrame()
    for word in words:
        try:
            similar_words = pd.DataFrame(w2v_model.wv.most_similar(word, topn=topn),
                                         columns=[word, 'cos'])
            similar_df = pd.concat([similar_df, similar_words], axis=1)
        except:
            print(word, "not found in Word2Vec model!")
    return similar_df
# First download the pretrained gensim model:
# $ wget http://public.shiroyagi.s3.amazonaws.com/latest-ja-word2vec-gensim-model.zip
# $ unzip latest-ja-word2vec-gensim-model.zip
from gensim.models.word2vec import Word2Vec

model = Word2Vec.load('word2vec.gensim.model')

some_words = ['ブランド', 'バッグ', 'アルマーニ']
for w in some_words:
    print(model.wv.most_similar(w))
import os
import pickle
import utils
import numpy as np
from gensim.models.word2vec import Word2Vec

model = Word2Vec.load('w2vmodel_run/word2vec_model_cleaned_data')

thermometers = [
    'democrats', 'republicans', 'protestants', 'catholics', 'jews',
    'blacks', 'whites', 'southerners', 'big business', 'labor unions',
    'liberals', 'conservatives', 'military', 'policemen', 'black militants',
    'civil rights leaders', 'chicanos hispanics', 'democratic party',
    'middle class people', 'people on welfare', 'political independents',
    'political parties', 'poor people', 'republican party',
    'women right activist', 'young people', 'asian americans', 'congress',
    'environmentalists', 'anti abortionists', 'federal government',
    'illegal aliens', 'christian fundamentalists', 'radical students',
    'farmers', 'feminists', 'evangelical groups', 'elderly',
    'supreme court', 'women'
]

# Word2Vec size
word2Vec_dimension = 100

# Data directory for cases
data_dir = 'data'
case_dir = os.path.join(data_dir, 'clean_Mar_20')  # sub-directory containing our cases
maj_dir = 'maj'
def load_w2v_model(modelpath):
    print("Loading model ", modelpath)
    return Word2Vec.load(modelpath)
def loadModelfromFile(self, modelFilePath):
    '''
    Load an existing model from disk.
    Training can continue with the loaded model (needs more testing).
    '''
    return Word2Vec.load(modelFilePath)
def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).
    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline())
        vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=layer1_size)
        result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * layer1_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have newline, some don't)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word))
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line).split()
                if len(parts) != layer1_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)" % (line_no))
                word, weights = parts[0], map(REAL, parts[1:])
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = weights

    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    result.init_sims(norm_only)
    return result
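# Sketch of calling the loader above (file names are hypothetical): load C-format vectors
# together with the optional vocabulary file written by the C tool's -save-vocab flag.
# Querying similarities works afterwards, but further training does not, as the docstring notes.
# model = Word2Vec.load_word2vec_format('vectors.bin', fvocab='vocab.txt', binary=True)
# model.most_similar('king', topn=5)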
batch_size = CommonUtilities.get_param_value_int("batch_size", sys.argv, batch_size)
logging.info('batch_size:{0}'.format(batch_size))

lstm_hidden_size = 50
lstm_hidden_size = CommonUtilities.get_param_value_int("lstm_hidden_size", sys.argv, lstm_hidden_size)
logging.info('lstm_hidden_size:{0}'.format(lstm_hidden_size))

if cmd == "train":
    logging.info("==========================")
    logging.info("======== TRAINING ========")
    logging.info("==========================")

    embeddings_vec_size = embeddings_size
    if embeddings_model_type == "w2v":
        logging.info("Loading w2v model..")
        if word2vec_load_bin:
            embeddings_model = Word2Vec.load_word2vec_format(embeddings_model_file, binary=True)  # use this for google vectors
        else:
            embeddings_model = Word2Vec.load(embeddings_model_file)
        embeddings_vec_size = embeddings_model.syn0.shape[1]
    elif embeddings_model_type == "rand":
        embeddings_model = None
    else:
        raise Exception("embeddings_model_type=%s is not yet supported!" % embeddings_model_type)

    # train data
    input_data_fileslist_train = [data_tac2014_train, data_tac2015_train, data_tac2014_eval]
    train_data_files = ""
    # train_data_files = CommonUtilities.get_param_value("train_data_files", sys.argv, default=train_data_files)
from newsParser import parseSNU
from db import loadSNU, connectDB

host = 'localhost'
chunk = 100

if __name__ == '__main__':
    build_hannanum.build()
    hannanum = Hannanum()
    print('Load Parser Done!')

    filedir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'w2v')
    os.chdir(filedir)
    w2v = Word2Vec.load('model.model')
    print('Load Word2Vector Done!')

    _, _, _, _, vecDB, *_ = connectDB(host)
    try:
        begin = vecDB['metadata'].find_one({'type': 'snu'})['idx']
    except:
        begin = 0

    while True:
        wv = []
        li = loadSNU(chunk, begin)
        for i in li:
            wv += parseSNU(i, (hannanum, w2v))
        begin += len(li)
        if wv:
        try:
            cursor.execute(query_insert_word,
                           (morph.parse(word)[0].normal_form, id_article))
            connection.commit()
        except Exception:
            print(file)
            continue
        g = g + 1
        print("Article " + str(n_files) + ": " + str(g) + " из " + str(len(list_words)))
    n_files = n_files + 1
    print("Обработано: " + str(n_files) + " из " + str(len(files)))

model = Word2Vec.load('ruwiki.word2vec.model')
country_list = [item[0] for item in model.most_similar('швеция')]
stop_words = get_stop_words('ru')

cursor.execute(
    "SELECT word,count(*) as c FROM stats GROUP BY word ORDER BY c desc LIMIT 100"
)
for row in cursor:
    if (row[0] not in stop_words) and (re.compile('\d+').match(row[0]) is None):
        if row[0] in country_list:
            # print(row[0] + ": " + str(row[1]))
            president = model.most_similar(positive=[row[0], 'путин'],
                                           negative=['россия'], topn=1)
            print(row[0] + ": " + president[0][0])  # most_similar returns (word, score) pairs
    tokenized_twt = [j for j in tokenized_twt if j not in string.punctuation]
    stop_words = stopwords.words('russian')
    stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
    tokenized_twt = [j for j in tokenized_twt if j not in stop_words]
    tokenized_twt = [j.replace("«", "").replace("»", "") for j in tokenized_twt]
    tokens = [stemmer.stem(t) for t in tokenized_twt if not t.startswith('@')]
    # print(tokens)
    tokenized_corpus.append(tokens)

vector_size = 512
window_size = 10

word2vec = Word2Vec(sentences=tokenized_corpus,
                    size=vector_size,
                    window=window_size,
                    negative=20,
                    iter=50,
                    seed=1000,
                    workers=4)

model = load_model('model.h5')
stemmer = SnowballStemmer("russian")

cv = nltk.word_tokenize(sys.argv[1:][0])
cv = [j for j in cv if j not in string.punctuation]
stop_words = stopwords.words('russian')
stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
cv = [j for j in cv if j not in stop_words]
cv = [j.replace("«", "").replace("»", "") for j in cv]

vecs = []
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))
logging.info("using optimization %s" % FAST_VERSION)

# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 2:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
infile = sys.argv[1]

from gensim.models.word2vec import Word2Vec  # avoid referencing __main__ in pickle

seterr(all='raise')  # don't ignore numpy errors

# model = Word2Vec(LineSentence(infile), size=200, min_count=5, workers=4)
model = Word2Vec(Text8Corpus(infile), size=200, min_count=5, workers=1)

if len(sys.argv) > 3:
    outfile = sys.argv[3]
    model.save(outfile + '.model')
    model.save_word2vec_format(outfile + '.model.bin', binary=True)
    model.save_word2vec_format(outfile + '.model.txt', binary=False)

if len(sys.argv) > 2:
    questions_file = sys.argv[2]
    model.accuracy(sys.argv[2])

logging.info("finished running %s" % program)
def build_dataset(train_data_path, test_data_path, save_wv_model_path, testOnly=True, toCSV=True):
    '''
    Data loading + preprocessing
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :return: training data, test data, merged data
    '''
    # 1. load the data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {},test data size {}'.format(len(train_df), len(test_df)))

    # 2. drop rows with empty values
    train_df.dropna(subset=['Question', 'Dialogue', 'Report'], how='any', inplace=True)
    test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)

    # 3. multi-process, batch preprocessing
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)

    # 4. merge the training and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
    test_df['merged'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))

    # 5. save the preprocessed training and test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)
    if toCSV:
        train_df.to_csv(train_seg_path, index=None, header=True)
        test_df.to_csv(test_seg_path, index=None, header=True)

    # 6. save the merged data
    merged_df.to_csv(merger_seg_path, index=None, header=False)

    if osp.exists(save_wv_model_path):
        wv_model = Word2Vec.load(save_wv_model_path)
    else:
        # 7. train the word vectors
        print('start build w2v model')
        wv_model = Word2Vec(LineSentence(merger_seg_path),
                            size=embedding_dim,
                            negative=5,
                            workers=8,
                            iter=wv_train_epochs,
                            window=3,
                            min_count=5)

    # 8. separate data and labels
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

    # 9. add start/end tokens, fill unknown words with oov, pad to a fixed length
    # use the vocab produced by gensim training
    vocab = wv_model.wv.vocab

    # process the training-set X
    # find a suitable maximum length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # process the test-set X with the same maximum length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # process the training-set Y
    # find a suitable maximum length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(lambda x: pad_proc(x, train_y_max_len, vocab))

    # 10. save the padded, oov-processed data and labels
    if toCSV:
        train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
        train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
        test_df['X'].to_csv(test_x_pad_path, index=None, header=False)

    if testOnly:
        print("No retraining! Test only...")
        return train_df['X'], train_df['Y'], test_df['X'], wv_model
    else:
        # 11. retrain the word vectors on the padded data
        print('start retrain w2v model')
        wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
        wv_model.train(LineSentence(train_x_pad_path), epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)
        print('1/3')
        wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
        wv_model.train(LineSentence(train_y_pad_path), epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)
        print('2/3')
        wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
        wv_model.train(LineSentence(test_x_pad_path), epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)

        # save the word-vector model
        wv_model.save(save_wv_model_path)
        # or load wv_model
        # wv_model = Word2Vec.load(save_wv_model_path)
        print('finish retrain w2v model')
        print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))

        return train_df['X'], train_df['Y'], test_df['X'], wv_model
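# A hypothetical invocation of build_dataset above; the CSV paths are illustrative, and the
# module-level settings it relies on (embedding_dim, wv_train_epochs, the *_seg_path and
# *_pad_path variables, parallelize, pad_proc, get_max_len) must be defined by the
# surrounding project.
# train_X, train_Y, test_X, wv_model = build_dataset('data/train.csv', 'data/test.csv',
#                                                    'data/wv/word2vec.model', testOnly=True)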
def input_transform(string):
    words = jieba.lcut(string)
    words = np.array(words).reshape(1, -1)
    model = Word2Vec.load('lstm_data/Word2vec_model.pkl')
    _, _, combined = create_dictionaries(model, words)
    return combined
            score.append(1)
        else:
            score.append(0)
            dd.append({'y_test': y_test[i], 'first': label[0], 'second': label[1]})
    acc = sum(score) / len(y_test)
    return acc


data = pd.read_excel('./data/doc_set_final_version3.xlsx')
data['token'] = data.token.apply(lambda x: literal_eval(x))
X_data = data[['token', 'new_small_class']]
target_big = data.new_class.tolist()
target_small = data.new_small_class.tolist()

w2v_model_name = './model/word_embedding/Word2vec1(base_token).model'
word_vectorizer = Word2Vec.load(w2v_model_name)
word_vectorizer.wv.vectors.shape
word_index = word_vectorizer.wv.index2word
EMBEDDING_DIM = word_vectorizer.trainables.layer1_size
word_index

tfidf = TfidfVectorizer(analyzer=lambda x: x, vocabulary=word_index)
tfidf.fit(data['token'])
max_idf = max(tfidf.idf_)
word2weight = defaultdict(lambda: max_idf,
                          [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

train_X, test_X, train_y, test_y = train_test_split(X_data, target_big, test_size=0.3,
import copy

import numpy as np
from gensim.models.word2vec import Word2Vec
from keras import Input, Model
from keras.models import Sequential
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Activation

from m1 import BOW

maxlen = 100          # fixed length of each sentence (truncate or pad)
batch_size = 64
embedding_dim = 300
epochs = 10

comments = [['same', 'coffee', 'shop', 'my', 'memory', 'of', 'it', 'is'], []]

# train the word vectors ---------------------------------------
w2v_model = Word2Vec(comments, size=embedding_dim, min_count=5, workers=10)

# build the embedding lookup
bow = BOW(comments, min_count=5, maxlen=maxlen)  # `comments` is already a plain list, so no .tolist()
vocab_size = len(bow.word2idx)

embedding_matrix = np.zeros((vocab_size + 1, 300))
for key, value in bow.word2idx.items():
    if key in w2v_model.wv.vocab:  # the vocabulary of a trained Word2Vec instance lives in w2v_model.wv.vocab
        embedding_matrix[value] = w2v_model.wv[key]
    else:
        embedding_matrix[value] = [0] * embedding_dim

# build the dataset -------------------------------------
X = copy.deepcopy(bow.doc2num[:159571])

# split into training and validation sets 4:1
from gensim.models.word2vec import Word2Vec

model = Word2Vec.load(
    '/Users/pavel/PycharmProjects/NeuralNetworkWithTenser/src/wordrecognition/savemodal/word2vec_modal'
)
print(model.wv.most_similar(positive=['woman', 'king'], topn=5))
def load_model(path):
    w2v = Word2vecEmbedder()
    w2v._model = Word2Vec.load(path)
    logging.info("loaded word2vec model from: " + path)
    w2v.is_fitted = True
    return w2v