print("Création d'index ...") #sauvegarder dans un dossier os.mkdir(documentsDict) for f in documents: out_file=open(join(documentsDict,f), 'w') out_file.write(str(documents[f])) out_file.close() out_file=open(doc_freqTerm, 'w') out_file.write(str(doc_freq)) out_file.close() print("corpus traité avec succès \n") #2 apprentissage print("Apprentissage ...") sentences = LineSentence(corpusAsSentences) model = Word2Vec(sentences, size=dimConcept, window=win, min_count=minc, workers=4) # lancer la génération du vocabulaire model.save_word2vec_format(index+'/word2vec'+str(dimConcept)+'_win'+str(win)+'_min'+str(minc)+'.txt', fvocab=None, binary=False) print("vocabulaire ok \n") #3 representation des documents en vecteurs print("Documents to vectors ...") os.mkdir(matDoc) for f in listdir(collection): # ici lire tte la collection doc=documents[f] tdoc=0 # taille du document à calculer : tdoc=sum(doc.values()) vec_doc=numpy.zeros(dimConcept) mat_doc={} for word in doc: if(word in model.vocab):
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import codecs

filename = 'html.txt'
sentences = LineSentence(filename)
model = Word2Vec(sentences, size=128, window=5, min_count=5, workers=4)
model.save('word_embedding_128')

items = model.most_similar('中国')
for item in items:
    print(item[0], item[1])
print(model.similarity('男人', '女人'))

filename = 'wikizhword.text'
f = codecs.open(filename, 'r', encoding='utf-8')
line = 20
for _ in range(line):
    print(f.readline())

# sentences = LineSentence(f)
# model = Word2Vec(sentences, size=128, window=5, min_count=5, workers=4)
# model.save('word_embedding_128')
#
# model = Word2Vec.load('word_embedding_128')
# items = model.most_similar('中国')
# for item in items:
#     print(item[0], item[1])
import os
import sys

root_path = "/home/ubuntu/answerbot-tool/src"
sys.path.append(root_path)

from gensim.models.word2vec import Word2Vec, LineSentence
from utils.time_utils import get_current_time

corpus_fpath = '../_1_preprocessing/corpus.txt'

print 'start time : ', get_current_time()
sentences = LineSentence(corpus_fpath)
print "begin training..."
# size is the dimensionality of the feature vectors.
# window is the maximum distance between the current and predicted word within a sentence.
# min_count = ignore all words with total frequency lower than this.
# workers = use this many worker threads to train the model (=faster training with multicore machines).
model = Word2Vec(sentences, size=200, window=5, min_count=0, workers=4, iter=100)
model.save('model')
print 'end time : ', get_current_time()
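# Hedged follow-up sketch (not part of the original script): reload the model saved above
# and query it. The probe words 'exception' and 'error' are assumptions; substitute terms
# that actually occur in corpus.txt.
from gensim.models.word2vec import Word2Vec

reloaded = Word2Vec.load('model')
print(reloaded.wv.most_similar('exception', topn=5))
print(reloaded.wv.similarity('error', 'exception'))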
import os
import sys
import logging
import multiprocessing

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # train word2vec model
    inp = r'..\dataset\ChnSentiCorp_htl_ba_6000\6000_all_cut.txt'
    output1 = 'word2vec.model'
    output2 = 'word2vec.vector'

    # size: dimensionality of the generated word vectors;
    # min_count: prunes the vocabulary; words occurring fewer than min_count times are dropped (default 5);
    # window: maximum context distance; skip-gram and CBOW predict from a sliding window (default 5,
    #         values in [5, 10] are recommended for typical corpora).
    sentences = LineSentence(inp)
    model = Word2Vec(sentences, size=300, min_count=5, window=5,
                     workers=multiprocessing.cpu_count(), iter=1)
    # the constructor above already builds the vocabulary and trains once; train on the token
    # iterator (not the raw file path) for the additional epochs
    model.train(sentences, total_examples=model.corpus_count, epochs=50)
    model.save('Word2vec_model.pkl')
    model.wv.save_word2vec_format('Word2vec_model.vector', binary=False)
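# Hedged sketch of the explicit two-step pattern the original code seemed to aim for:
# construct the model without a corpus, then build the vocabulary and train separately.
# Path and parameter values mirror the snippet above; everything else is an assumption.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence(r'..\dataset\ChnSentiCorp_htl_ba_6000\6000_all_cut.txt')
model = Word2Vec(size=300, min_count=5, window=5, workers=4)  # no corpus passed: nothing is trained yet
model.build_vocab(sentences)                                  # single pass over the corpus to collect the vocabulary
model.train(sentences, total_examples=model.corpus_count, epochs=50)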
config_pattern = "size{}window{}sg{}min_count{}negative{}iter{}" config_str = config_pattern.format(args.size, args.window, args.sg, args.min_count, args.negative, args.iter) outputfile1 = outputpath + config_str + ".model" outputfile2 = outputpath + config_str + ".vector" ############### end of config ################# logging.basicConfig(filename=config_str + '.log', filemode='w', format='%(asctime)s: %(levelname)s: %(message)s') logging.root.setLevel(level=logging.INFO) logger = logging.getLogger() logger.info("running train process in custom: %s" % args.train) model = Word2Vec( LineSentence(inpputfile), size=args.size, window=args.window, min_count=args. min_count, # with 0.35 billion corpus, #3000 can retain 9228 unique words workers=args.workers, # multiprocessing.cpu_count() #sample=args.sample, sg=args.sg, #hs=args.hs, negative=args. negative, # follow tensorflow's word2vec_optimized.py num_neg_samples 25 iter=args.iter) # trim unneeded model memory = use(much) less RAM # model.init_sims(replace=True) model.save(outputfile1)
def word2vec_training(text_file):
    sentences = LineSentence(text_file)
    model = Word2Vec(sentences, size=300, window=5, min_count=1, workers=16)
    model.wv.save("merge_with_unk.kv")
    # model.wv.save_word2vec_format("merge_with_unk_vector.txt", binary=False)
    return model
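# Hedged follow-up sketch: the KeyedVectors file saved by word2vec_training() can be
# reloaded later without retraining the full model.
from gensim.models import KeyedVectors

wv = KeyedVectors.load("merge_with_unk.kv")
print(wv.most_similar("unk"))  # 'unk' is an assumed vocabulary entry; replace with a real token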
def stool_simulator(total_epoch, special_epoch_count, restricted_vocab_name):
    # corpus_file = '/Users/zzcoolj/Code/GoW/data/training data/Wikipedia-Dumps_en_20170420_prep/AA/wiki_01.txt'
    corpus_file = 'input/enwiki-1G.txt'
    xlsx_path = 'output/test1G-vocab50000-stool-iter' + str(total_epoch) + '-first' + str(special_epoch_count) \
                + 'EpochInitial-' + str(restricted_vocab_name) + '.xlsx'
    df = pd.DataFrame(columns=[
        # word embeddings file name
        'file name',
        # wordsim353
        'wordsim353_Pearson correlation', 'Pearson pvalue', 'Spearman correlation', 'Spearman pvalue',
        'Ratio of pairs with OOV',
        # simlex999
        'simlex999_Pearson correlation', 'Pearson pvalue', 'Spearman correlation', 'Spearman pvalue',
        'Ratio of pairs with OOV',
        # MTURK-771
        'MTURK771_Pearson correlation', 'Pearson pvalue', 'Spearman correlation', 'Spearman pvalue',
        'Ratio of pairs with OOV',
        # questions-words
        'sem_acc', '#sem', 'syn_acc', '#syn', 'total_acc', '#total'
    ])
    line_number_in_xlsx = 0

    lr = 0.025
    alphas = alpha_splitter(start=lr, epochs=total_epoch)
    print('alphas', alphas)

    # special starting epochs (final notIn)
    restricted_vocab = read_file_to_dict('../word_embeddings_evaluator/data/distinct-tokens/'
                                         + str(restricted_vocab_name) + '.txt')
    restricted_type = 1
    params = {
        'alpha': lr,
        'min_alpha': alphas[special_epoch_count],
        'size': 200,
        'window': 5,
        'iter': special_epoch_count,
        'max_vocab_size': 50000,
        'sample': 1e-4,
        'sg': 1,  # 1 for skip-gram
        'hs': 0,  # If 0, and negative is non-zero, negative sampling will be used.
        'negative': 5,
        'workers': 3,
        'restricted_vocab': restricted_vocab,  # [modified] ATTENTION: It must be a dictionary, not a list!
        'restricted_type': restricted_type  # [modified] 0: train_batch_sg_original; 1: train_batch_sg_in; 2: train_batch_sg_notIn
    }
    print('special epochs half', special_epoch_count)
    gs_model = Word2Vec(LineSentence(corpus_file), **params)
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(special_epoch_count) + '-half')

    # special starting epochs (final in)
    print('special epochs entire', special_epoch_count)
    gs_model.restricted_type = 2
    gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                   start_alpha=lr, end_alpha=alphas[special_epoch_count])
    line_number_in_xlsx += 1
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(special_epoch_count) + '-entire')

    # original ending epochs
    print('roof epochs')
    gs_model.restricted_type = 0
    gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count,
                   epochs=total_epoch - special_epoch_count,
                   start_alpha=alphas[special_epoch_count], end_alpha=alphas[-1])
    line_number_in_xlsx += 1
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(total_epoch))

    writer = pd.ExcelWriter(xlsx_path)
    df.to_excel(writer, 'Sheet1')
    writer.save()
def train_d2v_model(infile, embedding_file):
    model = gensim.models.Word2Vec(LineSentence(infile), size=200, window=5, min_count=5)
    model.save(embedding_file)
parser.add_argument(
    "-accuracy",
    help="Use questions from file ACCURACY to evaluate the model")

args = parser.parse_args()

if args.cbow == 0:
    skipgram = 1
    if not args.alpha:
        args.alpha = 0.025
else:
    skipgram = 0
    if not args.alpha:
        args.alpha = 0.05

corpus = LineSentence(args.train)

model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads,
                 window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram,
                 hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)

if args.output:
import pandas as pd
import numpy as np
import os

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == "__main__":
    print("Loading data...")
    data = pd.concat([
        pd.read_csv("/data/SO_data/downvoter/wv_train_processed_data.csv").body,
        pd.read_csv("/data/SO_data/downvoter/wv_val_processed_data.csv").body
    ])
    print(data.shape)

    # save data to a one-line-per-doc file
    np.savetxt("data/wdocfile.txt", data.values, fmt="%s")
    tagged_data = LineSentence("data/wdocfile.txt")

    max_epochs = 50
    alpha = 0.025
    model_file = "./final/word_model.w2v"

    model = Word2Vec(size=50, alpha=alpha, min_alpha=0.01, min_count=25, window=30, workers=16)

    print("Building the vocabulary...")
    model.build_vocab(tagged_data)
def bow(filepath):
    for rev in LineSentence(filepath):
        yield tri_dictionary.doc2bow(rev)
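# Hedged sketch of how tri_dictionary (defined elsewhere in the original code) could be
# built from the same one-review-per-line file before calling bow(); the path is an
# assumption for illustration.
from gensim.corpora import Dictionary
from gensim.models.word2vec import LineSentence

tri_dictionary = Dictionary(LineSentence('trigram_reviews.txt'))
for bow_vec in bow('trigram_reviews.txt'):
    print(bow_vec)  # a list of (token_id, count) pairs per review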
def transform(self, sentencesPath, savePath):
    """
    Use the trained phrase models to transform sentences.
    :param sentencesPath: path of the input text file, one sentence per line
    :param savePath: path of the transformed text file, written one sentence per line
    """
    with codecs.open(savePath, mode="w", encoding="utf-8") as fr:
        sentences = TxtIter(sentences=codecs.open(sentencesPath, mode="r", encoding="utf-8"),
                            ngrams=self.phrasers)
        lines = []
        for line in sentences:
            lines.append(" ".join(line) + "\n")
            if len(lines) > 500000:
                fr.writelines(lines)
                lines = []
        fr.writelines(lines)

        logger.info("delete all phrasers to save memory")
        for i in self.phrasers:
            del i
        del self.phrasers
        self.phrasers = None
        gc.collect()


if __name__ == "__main__":
    ls = LineSentence("E:/a.txt")
    for i in ls:
        print(i)
# print("Please use python wiki_preprocess.py output_path") # exit() # output_path = sys.argv[1] logging.info("start") begin = time() dir_path = sys.argv[1] output_path = sys.argv[2] for root, dirs, files in os.walk(dir_path): for filename in files: file_path = root + '/' + filename logging.info(filename) ls_pageid = find_category_page(file_path) if len(ls_pageid) == 0: continue # ls_pg_text_clean = extract_pages(ls_pageid) extract_pages(ls_pageid) model = gensim.models.Word2Vec(LineSentence('/tmp/test.txt'), size=200, window=5, min_count=2, workers=multiprocessing.cpu_count()) model.wv.save_word2vec_format( complete_dir_path(output_path) + filename[:-4] + ".w2v_org", complete_dir_path(output_path) + filename[:-4] + ".vocab", binary=False) end = time() load_duration = end - begin logging.info("Total procesing time: %d seconds" % (end - begin))
"negative": [5], "sample": [0], "sg": [1], "size": [25], "window": [5], "workers": [cpu_count() - 1] } SENTENCES = "/home/ikram/workplace/projects/Islam-360/embedding/w2v/translation_sentences.txt" for index, param in enumerate(ParameterGrid(PARAMS)): file_name = "" for key, value in param.items(): file_name += f"{key}={value}|" print(f"Training: {file_name}") file = LineSentence(SENTENCES) model = gensim.models.Word2Vec(file, **param) predication = [] with open('../urdu_similar_words.csv') as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') for row in csv_reader: try: if model.wv.similarity(row[0], row[1]) > 0.7: predication.append(1) else: predication.append(0) except KeyError: continue data = [1] * len(predication)
from gensim.models.word2vec import LineSentence
from ms2vec.ms2vec import MultiSense2Vec

corpus = LineSentence("../enwiki_cleaning.txt")
model = MultiSense2Vec(corpus, sg=1, negative=5, workers=8, iter=5, window=5, min_count=10,
                       min_sense_count=1000, max_sense_num=3, size=300, np_value=-0.5,
                       cv2zero=True, use_all_window=True, seed=0)

# print(model.wv.index2word)
print(model.most_similar("mouse"))

model_name = "npmssg_m0.5_enwiki_sense_10_neg_5_min_1000"
model.save(model_name)
model.wv.save_word2vec_format(model_name + ".bin", binary=True)
def _train_phrase_detection_model(self, input_filepath, output_filepath):
    sentences = LineSentence(input_filepath)
    model = Phraser(Phrases(sentences))
    self._save_sentences(sentences, model, output_filepath)
    return model
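# Hedged illustration (toy data, not from the original project) of what the Phraser above
# learns: frequent adjacent token pairs in the training sentences are merged with '_' when
# a tokenized sentence is transformed.
from gensim.models.phrases import Phrases, Phraser

toy_sentences = [['new', 'york', 'city'], ['new', 'york', 'times']] * 10
toy_model = Phraser(Phrases(toy_sentences, min_count=1, threshold=0.1))
print(toy_model[['new', 'york', 'is', 'large']])  # typically ['new_york', 'is', 'large'] with these toy settings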
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time   : 2018/4/17 14:25
@Author : Junya Lu
@Site   :
"""
import warnings
from gensim.models.word2vec import LineSentence
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.models import word2vec
from string import punctuation

# LineSentence('G:\project3\\Data\\train\\genes\\genes_one_line_space.txt')
# model = word2vec.Word2Vec(LineSentence('G:\project3\\Data\\train\\genes\\genes_one_line_space.txt'))
# model = word2vec.Word2Vec(LineSentence('G:\project3\\Data\\train\\terms\\terms.txt'))

model = word2vec.Word2Vec(LineSentence('genes0504.txt'), min_count=0)
print(model['C1Q'])
print('vocabulary size', len(model.wv.vocab))
vocab = list(model.wv.vocab.keys())
print(vocab[:10])

# model.save('G:\project3\\Data\\train\\w2v.model')
# model.wv.save_word2vec_format('G:\project3\\Data\\train\\vector_genes0504.txt')
# print(model.similarity('dogs', 'you'))
# print(model.similar_by_vector('dogs'))
# print(model['you'])
    # MTURK-771
    'MTURK771_Pearson correlation', 'Pearson pvalue', 'Spearman correlation', 'Spearman pvalue',
    'Ratio of pairs with OOV',
    # questions-words
    'sem_acc', '#sem', 'syn_acc', '#syn', 'total_acc', '#total'
])

for i in range(5):
    params = {
        'alpha': 0.025,
        'min_alpha': 0.0001,
        'size': 200,
        'window': 5,
        'iter': 5,
        'max_vocab_size': 50000,
        'sample': 1e-4,
        'sg': 1,  # 1 for skip-gram
        'hs': 0,  # If 0, and negative is non-zero, negative sampling will be used.
        'negative': 5,
        'workers': 3,
        'restricted_vocab': None,  # [modified] ATTENTION: It must be a dictionary, not a list!
        'restricted_type': 0  # [modified] 0: train_batch_sg_original; 1: train_batch_sg_in; 2: train_batch_sg_notIn
    }
    gs_model = Word2Vec(LineSentence(corpus_file), **params)
    df.loc[i] = evaluate(gs_model.wv, str(i))

writer = pd.ExcelWriter(xlsx_path)
df.to_excel(writer, 'Sheet1')
writer.save()
with open("res.txt", "w", encoding="utf8") as resultFile: lines = file.readlines() for line in lines: intermediate = word_tokenize(line) words_literals = [word for word in intermediate if word.isalpha()] stop_words = set(stopwords.words('english')) clear_tokens = [w for w in words_literals if w not in stop_words] if not clear_tokens: continue resultFile.write(' '.join(clear_tokens)) resultFile.write('\n') sentences = LineSentence("res.txt") model = gensim.models.Word2Vec(sentences, min_count=5, size=300, workers=4, window=10, sg=1, negative=5) print('Similar for "Marfa"') print(model.wv.most_similar(positive=['Marfa'])) print('\nSimilar for "Petersburg"') print(model.wv.most_similar(positive=['Petersburg'])) model.wv.save_word2vec_format("IDIOT_preproc.model")
def iteration_simulator(total_epoch, special_epoch_count, restricted_vocab_name, jumps):
    # corpus_file = '/Users/zzcoolj/Code/GoW/data/training data/Wikipedia-Dumps_en_20170420_prep/AA/wiki_01.txt'
    corpus_file = 'input/enwiki-1G.txt'
    xlsx_path = 'output/test1G-vocab50000-original-iter' + str(total_epoch) + '-last' + str(special_epoch_count) \
                + 'EpochInitial-' + str(restricted_vocab_name) + '-jump' + ''.join(str(x) for x in jumps) + '.xlsx'
    df = pd.DataFrame(columns=[
        # word embeddings file name
        'file name',
        # wordsim353
        'wordsim353_Pearson correlation', 'Pearson pvalue', 'Spearman correlation', 'Spearman pvalue',
        'Ratio of pairs with OOV',
        # simlex999
        'simlex999_Pearson correlation', 'Pearson pvalue', 'Spearman correlation', 'Spearman pvalue',
        'Ratio of pairs with OOV',
        # MTURK-771
        'MTURK771_Pearson correlation', 'Pearson pvalue', 'Spearman correlation', 'Spearman pvalue',
        'Ratio of pairs with OOV',
        # questions-words
        'sem_acc', '#sem', 'syn_acc', '#syn', 'total_acc', '#total'
    ])
    line_number_in_xlsx = 0

    # epoch 0
    lr = 0.025
    alphas = alpha_splitter(start=lr, epochs=total_epoch)
    print('alphas', alphas)
    min_alpha = alphas[1]
    restricted_vocab = read_file_to_dict('../word_embeddings_evaluator/data/distinct-tokens/'
                                         + str(restricted_vocab_name) + '.txt')
    restricted_type = 0
    params = {
        'alpha': lr,
        'min_alpha': min_alpha,
        'size': 200,
        'window': 5,
        'iter': 0,  # TODO NOW
        'max_vocab_size': 50000,
        'sample': 1e-4,
        'sg': 1,  # 1 for skip-gram
        'hs': 0,  # If 0, and negative is non-zero, negative sampling will be used.
        'negative': 5,
        'workers': 3,
        'restricted_vocab': restricted_vocab,  # [modified] ATTENTION: It must be a dictionary, not a list!
        'restricted_type': restricted_type  # [modified] 0: train_batch_sg_original; 1: train_batch_sg_in; 2: train_batch_sg_notIn
    }
    print('cur_epoch', 0)
    gs_model = Word2Vec(LineSentence(corpus_file), **params)
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch0')
    gs_model.epochs = 1  # TODO NOW

    # # epoch 0.5
    # gs_model.restricted_type = 2
    # gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
    #                start_alpha=lr, end_alpha=min_alpha)
    # df.loc[1] = evaluate(gs_model.wv, 'X-iter0.5')

    # epoch 1+
    # gs_model.restricted_type = 0
    for cur_epoch in range(1, total_epoch - special_epoch_count):
        print('cur_epoch', cur_epoch)
        start_alpha = alphas[cur_epoch]
        end_alpha = alphas[cur_epoch + 1]
        print('start_alpha', start_alpha)
        print('end_alpha', end_alpha)
        gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                       start_alpha=start_alpha, end_alpha=end_alpha)
        line_number_in_xlsx += 1
        df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(cur_epoch))

    # # save common base model
    # write_to_pickle(gs_model, xlsx_path.split('.xlsx')[0] + '-base')

    for special_epoch in range(total_epoch - special_epoch_count, total_epoch):
        print('special epoch', special_epoch)
        start_alpha = alphas[special_epoch]
        end_alpha = alphas[special_epoch + 1]
        print('start_alpha', start_alpha)
        print('end_alpha', end_alpha)

        # final special epochs 0.5
        gs_model.restricted_type = 1
        gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                       start_alpha=start_alpha, end_alpha=end_alpha)
        line_number_in_xlsx += 1
        df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(special_epoch) + '-half')

        # final special epochs final
        if special_epoch not in jumps:
            gs_model.restricted_type = 2
            gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                           start_alpha=start_alpha, end_alpha=end_alpha)
            line_number_in_xlsx += 1
            df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(special_epoch) + '-entire')

    # # baseline (final original word2vec epochs)
    # gs_model_base = read_pickle(xlsx_path.split('.xlsx')[0] + '-base')
    # gs_model_base.restricted_type = 0
    # for baseline_epoch in range(total_epoch - special_epoch_count, total_epoch):
    #     print('baseline epoch', baseline_epoch)
    #     start_alpha = alphas[baseline_epoch]
    #     end_alpha = alphas[baseline_epoch + 1]
    #     print('start_alpha', start_alpha)
    #     print('end_alpha', end_alpha)
    #     gs_model_base.train(LineSentence(corpus_file), total_examples=gs_model_base.corpus_count,
    #                         epochs=gs_model_base.iter, start_alpha=start_alpha, end_alpha=end_alpha)
    #     line_number_in_xlsx += 1
    #     df.loc[line_number_in_xlsx] = evaluate(gs_model_base.wv, 'epoch' + str(baseline_epoch) + '-baseline')

    writer = pd.ExcelWriter(xlsx_path)
    df.to_excel(writer, 'Sheet1')
    writer.save()
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('./word2vec_train.txt')  # LineSentence: treats each line of the file as one sentence

# model = Word2Vec(sentences, size=100, window=3, min_count=1, iter=1000)
# model.save('Basic_word2vec.model')

model = Word2Vec.load('Basic_word2vec.model')
print(model.wv.most_similar('Korea', topn=10))
print(len(model.wv.vocab))
score, predictions = model.wv.evaluate_word_analogies('./')
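# Small hedged illustration of the comment above: LineSentence yields one whitespace-split
# token list per line of the file. The file name is made up for the example.
with open('tiny_corpus.txt', 'w', encoding='utf-8') as f:
    f.write('I like Korea\nKorea likes me\n')

for tokens in LineSentence('tiny_corpus.txt'):
    print(tokens)  # ['I', 'like', 'Korea'] then ['Korea', 'likes', 'me']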
def train_w2v():
    with open('./data/reduced_zhwiki.txt', 'r', encoding='utf8') as f:
        # use gensim's Word2Vec class to generate the word vectors
        model = Word2Vec(LineSentence(f), sg=0, size=192, window=5, min_count=5, workers=4)
        model.save('./data/zhwiki_news.word2vec')
def train_w():
    sentences = LineSentence('../file/after_fenci.txt')
    model = Word2Vec(sentences, size=128)
    model.save('../file/tarining')
executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
do = delayed(partial(tokenize_sentence_corpus, corpus_out_path))
tasks = (do(i, batch) for i, batch in enumerate(partitions))
executor(tasks)

# process_texts(documents_path, year='2020', court='01', corpus_out_path=unigram_sentences_path,
#               batch_size=8, n_jobs=2, debug=True)

stop_words = get_custom_stop_words()

pruned_words, counters, total_words = Phrases.learn_vocab(
    sentences=LineSentence(unigram_sentences_path),
    max_vocab_size=800000000,
    common_terms=stop_words,
    progress_per=100)

counters = sorted(counters.items(), key=lambda key_value: key_value[1], reverse=True)

count = 0
for key, value in counters:
    count += 1
    print(any2unicode(key), value)
print(count)

bigram_model = Phrases(LineSentence(unigram_sentences_path),
# model generation and save
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing as mp

en_fname = './data/prepro_en_wiki.txt'
model_fname = './model/taekeun/en.bin'

model = Word2Vec(LineSentence(en_fname), size=300, workers=mp.cpu_count(), sg=1)
model.save(model_fname)
import os
import codecs
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import pandas as pd
import settings

trigram_sentences = LineSentence(os.path.join(settings.DATA_PATH, 'trigram_sentences.txt'))
word2vec_filepath = os.path.join(settings.DATA_PATH, 'word2vec_model')

if 0 == 1:
    text2vec = Word2Vec(trigram_sentences, size=100, window=5, min_count=20, sg=1, workers=4)
    text2vec.save(word2vec_filepath)
    for i in range(1, 12):
        text2vec.train(trigram_sentences)
        text2vec.save(word2vec_filepath)

text2vec = Word2Vec.load(word2vec_filepath)
text2vec.init_sims()

print('{} training epochs so far.'.format(text2vec.train_count))
print('{:,} terms in the text2vec vocabulary.'.format(len(text2vec.vocab)))
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing

model = Word2Vec(
    LineSentence('data/simple.reg.txt'),
    size=400,
    window=5,
    min_count=5,
    workers=multiprocessing.cpu_count() - 2,
)

outp1 = 'data/simple.zh.text.model'
outp2 = 'data/simple.zh.text.vector'
model.save(outp1)
model.wv.save_word2vec_format(outp2)
# coding: utf-8
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence
import logging

inFile = 'corpus.txt'
outFile = 'output_demoModel.out'
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

model = word2vec.Word2Vec(LineSentence(inFile), size=100, window=3, min_count=1)
print model.wv[u'理']
def nlp_preprocess(filepath_dict: dict, col: str, df=None, verbose: bool = True,
                   overwrite_interim: bool = True) -> pd.DataFrame:
    def clean_doc(corpus):
        '''generator function to read in docs from the file, and substitute and remove substrings'''
        for doc in corpus:
            yield au_tu.remove_substrings(
                au_tu.clean_tokens(
                    doc,
                    tokens=to_replace_dict,
                    whole_words_only=whole_words_only,
                    ignore_case=ignore_case,
                ),
                to_remove_list=to_remove_list,
                whole_words_only=whole_words_only,
                ignore_case=ignore_case)

    def tokenize_entities(parsed_doc):
        txt = parsed_doc.text
        for ent in parsed_doc.ents:
            txt = txt[:ent.start_char] + ent.text.replace(' ', '_') + txt[ent.end_char:]
        return txt

    def cleaned_doc_corpus(corpus):
        '''generator function to use spaCy to parse docs, clean docs, tokenize named entities, and yield documents'''
        for parsed_doc in nlp.pipe(clean_doc(corpus), batch_size=nlp_batch_size, n_threads=nlp_n_threads):
            yield tokenize_entities(parsed_doc)

    def punct_space_more(token):
        '''helper function to eliminate tokens that are pure punctuation, whitespace, digits, or only 1 character'''
        return (
            token.is_punct
            or token.is_space
            or token.is_digit
            or token.text == "'s"
            or token.lemma_ == '-PRON-'
            # or token.lemma_ == 'say'
            # or token.lemma_ == 'tell'
            # or token.lemma_ == 'be'
            or len(token.text) <= 1)

    def line_doc(filename):
        '''generator function to read in docs from the file, un-escape the original line breaks, and do additional cleaning'''
        def hyp_to_us(doc):
            return re.sub(r'\b-\b', '_', doc)

        def remove_punct(doc):
            # keep: alphanumeric (w), spaces (s), single quote, underscore
            return re.sub(r'[^\w\s\'_]+', '', doc)

        # with codecs.open(filename, encoding='utf_8') as f:
        with smart_open(filename) as f:
            for doc in f:
                yield remove_punct(hyp_to_us(doc.decode())).replace('\\n', '\n')

    def lemmatized_sentence_corpus(filename):
        '''generator function to use spaCy to parse docs, lemmatize the text, and yield sentences'''
        for parsed_doc in nlp.pipe(line_doc(filename), batch_size=nlp_batch_size, n_threads=nlp_n_threads):
            for sent in parsed_doc.sents:
                yield ' '.join([token.lemma_ for token in sent if not punct_space_more(token)])

    if verbose:
        logger.info(f'Working on text from: {col}')

    # # debug - only getting from the sample dataframe here
    # df_phrased = df.loc[df[col].notnull(), ['tfa_master_uid', 'app_year', col]].sample(n=50).copy()
    df_phrased = df.loc[df[col].notnull(), ['tfa_master_uid', 'app_year', col]].copy()

    nlp = spacy.load('en', disable=[])

    # clean text and tokenize entities
    if verbose:
        logger.info('Cleaning docs...')
    df_phrased[col] = list(cleaned_doc_corpus(df_phrased[col].values))
    # remove 'the_' from NER tokens
    df_phrased[col] = df_phrased[col].apply(
        lambda x: ' '.join([re.sub('^the_', 'the ', y) for y in x.split()]))
    if verbose:
        logger.info('\tDone.')

    # create & open a new file in write mode
    if verbose:
        logger.info('Saving documents, one per line...')
    doc_count = 0
    with codecs.open(filepath_dict['doc_txt_filepath'], 'w', encoding='utf_8') as doc_txt_file:
        for doc in df_phrased[[col]].apply(lambda x: ' '.join(x), axis=1).tolist():
            # write the doc as a line in the new file,
            # escaping newline characters in the original doc text
            doc_txt_file.write(doc.replace('\n', '\\n') + '\n')
            doc_count += 1
    if verbose:
        logger.info(f"Text from {doc_count:,} docs written to: {filepath_dict['doc_txt_filepath']}")

    nlp = spacy.load('en', disable=['ner'])

    # lemmatize and save sentences
    if overwrite_interim:
        if verbose:
            logger.info(f"Processing documents into unigram sentences: {filepath_dict['unigram_sentences_filepath']}")
        # with codecs.open(filepath_dict['unigram_sentences_filepath'], 'w', encoding='utf_8') as f:
        with smart_open(filepath_dict['unigram_sentences_filepath'], 'w') as f:
            for sentence in lemmatized_sentence_corpus(filepath_dict['doc_txt_filepath']):
                f.write(sentence + '\n')
        if verbose:
            logger.info('Done.')

    unigram_sentences = LineSentence(filepath_dict['unigram_sentences_filepath'])

    if verbose:
        logger.info('Unigram examples:')
        for unigram_sentence in it.islice(unigram_sentences, 10, 20):
            logger.info(u' '.join(unigram_sentence))
        logger.info('=' * 30)

    if verbose:
        logger.info('Finding bigram phrases')

    # create the bigram model
    bigram = Phrases(unigram_sentences,
                     min_count=phrase_min_count,
                     threshold=phrase_threshold,
                     max_vocab_size=phrase_max_vocab_size,
                     progress_per=phrase_progress_per,
                     scoring=phrase_scoring,
                     common_terms=phrase_common_terms)
    bigram_model = Phraser(bigram)
    bigram_model.save(filepath_dict['bigram_model_filepath'])

    if verbose:
        logger.info(f"Saving bigram phrased sentences: {filepath_dict['bigram_sentences_filepath']}")

    # save bigram sentences
    with codecs.open(filepath_dict['bigram_sentences_filepath'], 'w', encoding='utf_8') as f:
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')

    bigram_sentences = LineSentence(filepath_dict['bigram_sentences_filepath'])

    if verbose:
        logger.info('Bigram examples:')
        for bigram_sentence in it.islice(bigram_sentences, 10, 20):
            logger.info(u' '.join(bigram_sentence))
        logger.info('=' * 30)

    if verbose:
        logger.info('Finding trigram phrases')

    # create the trigram model
    trigram = Phrases(bigram_sentences,
                      min_count=phrase_min_count,
                      threshold=phrase_threshold,
                      max_vocab_size=phrase_max_vocab_size,
                      progress_per=phrase_progress_per,
                      scoring=phrase_scoring,
                      common_terms=phrase_common_terms)
    trigram_model = Phraser(trigram)
    trigram_model.save(filepath_dict['trigram_model_filepath'])

    if verbose:
        logger.info(f"Saving trigram phrased sentences: {filepath_dict['trigram_sentences_filepath']}")

    # create trigram sentences
    with codecs.open(filepath_dict['trigram_sentences_filepath'], 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')

    trigram_sentences = LineSentence(filepath_dict['trigram_sentences_filepath'])

    if verbose:
        logger.info('Trigram examples:')
        for trigram_sentence in it.islice(trigram_sentences, 10, 20):
            logger.info(u' '.join(trigram_sentence))
        logger.info('=' * 30)

    if verbose:
        logger.info(f"Saving phrased docs using saved models: {filepath_dict['trigram_docs_filepath']}")

    # using saved models, write transformed text out to a new file, one doc per line
    with codecs.open(filepath_dict['trigram_docs_filepath'], 'w', encoding='utf_8') as f:
        for parsed_doc in nlp.pipe(line_doc(filepath_dict['doc_txt_filepath']),
                                   batch_size=nlp_batch_size, n_threads=nlp_n_threads):
            # remove punctuation and whitespace
            unigram_doc = [token.lemma_ for token in parsed_doc if not punct_space_more(token)]

            # apply the first-order and second-order phrase models
            bigram_doc = bigram_model[unigram_doc]
            trigram_doc = trigram_model[bigram_doc]

            # remove any remaining stopwords
            trigram_doc = [term for term in trigram_doc if term not in nlp.Defaults.stop_words]

            # extended stop word list (name unified; the original mixed stop_words_extend / stop_words_extended)
            stop_words_extended = [
                'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be',
                'know', 'good', 'go', 'get', 'do', 'done', 'try',
                'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot',
                'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even',
                'also', 'may', 'take', 'come'
            ]
            trigram_doc = [term for term in trigram_doc if term not in stop_words_extended]

            # write the transformed doc as a line in the new file
            trigram_doc = ' '.join(trigram_doc)
            f.write(trigram_doc + '\n')

    if verbose:
        logger.info('Done.')

    # put the text back in the dataframe
    trigram_docs = LineSentence(filepath_dict['trigram_docs_filepath'])
    if len([doc for doc in trigram_docs]) == df_phrased.shape[0]:
        for i, doc in enumerate(trigram_docs):
            df_phrased.iloc[i, df_phrased.columns.get_loc(col)] = ' '.join(doc)
    else:
        raise ValueError('Different number of processed and original documents')

    # save dataframe
    if verbose:
        logger.info('Saving NLP processed data: {}'.format(filepath_dict['filepath_out']))
    df_phrased.to_csv(filepath_dict['filepath_out'])

    return df_phrased
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models.word2vec import LineSentence

# EXP_HOME = "F:/MyWorks/Thesis Works/Crowdsource_Knowledge_Base/DeepGenQR/experiment"
EXP_HOME = "C:/My MSc/ThesisWorks/BigData_Code_Search/DeepGenQR/experiment"
csv_file = EXP_HOME + "/stackoverflow/eclipse/eclipse-qa.csv"

CUSTOM_FILTERS = [
    lambda x: x.lower(), strip_multiple_whitespaces, strip_punctuation,
    remove_stopwords, strip_non_alphanum
]

sentences = LineSentence(open(csv_file, 'r'), max_sentence_length=100000, limit=None)
pre_processed = list()
for sentence in sentences:
    # print(' '.join(sentence))
    temp = ' '.join(sentence)
    pp_sentence = preprocess_string(temp, CUSTOM_FILTERS)
    # print(pp_sentence)
    pre_processed.append(' '.join(pp_sentence))

# saving the pre-processed text to the file
myFile = open(pp_raw_code, 'w')
for line in pre_processed:
    myFile.write("%s\n" % line)
print("Corpus preprocessed successfully!")
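# Hedged example of what CUSTOM_FILTERS does to a single string; the sample text is invented.
sample = "How do I read a File in Java?"
print(preprocess_string(sample, CUSTOM_FILTERS))  # e.g. ['read', 'file', 'java']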
class NP2vec:
    """
    Initialize the np2vec model, train it, save it and load it.
    """

    def is_marked(self, s):
        """
        Check if a string is marked.

        Args:
            s (str): string to check
        """
        return len(s) > 0 and s[-1] == self.mark_char

    def __init__(
            self,
            corpus,
            corpus_format='txt',
            mark_char='_',
            word_embedding_type='word2vec',
            sg=0,
            size=100,
            window=10,
            alpha=0.025,
            min_alpha=0.0001,
            min_count=5,
            sample=1e-5,
            workers=20,
            hs=0,
            negative=25,
            cbow_mean=1,
            iter=15,
            min_n=3,
            max_n=6,
            word_ngrams=1):
        """
        Initialize np2vec model and train it.

        Args:
            corpus (str): path to the corpus.
            corpus_format (str {json,txt,conll2000}): format of the input marked corpus; txt and
                json formats are supported. For json format, the file should contain an iterable
                of sentences. Each sentence is a list of terms (unicode strings) that will be
                used for training.
            mark_char (char): special character that marks NP's suffix.
            word_embedding_type (str {word2vec,fasttext}): word embedding model type; word2vec
                and fasttext are supported.
            np2vec_model_file (str): path to the file where the trained np2vec model has to be
                stored.
            binary (bool): boolean indicating whether the model is stored in binary format; if
                word_embedding_type is fasttext and word_ngrams is 1, binary should be set to True.
            sg (int {0,1}): model training hyperparameter, skip-gram. Defines the training
                algorithm. If 1, skip-gram is used; otherwise, CBOW is employed.
            size (int): model training hyperparameter, size of the feature vectors.
            window (int): model training hyperparameter, maximum distance between the current and
                predicted word within a sentence.
            alpha (float): model training hyperparameter. The initial learning rate.
            min_alpha (float): model training hyperparameter. Learning rate will linearly drop to
                `min_alpha` as training progresses.
            min_count (int): model training hyperparameter, ignore all words with total frequency
                lower than this.
            sample (float): model training hyperparameter, threshold for configuring which
                higher-frequency words are randomly downsampled; useful range is (0, 1e-5).
            workers (int): model training hyperparameter, number of worker threads.
            hs (int {0,1}): model training hyperparameter, hierarchical softmax. If set to 1,
                hierarchical softmax will be used for model training. If set to 0, and `negative`
                is non-zero, negative sampling will be used.
            negative (int): model training hyperparameter, negative sampling. If > 0, negative
                sampling will be used; the int for negative specifies how many "noise words"
                should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
            cbow_mean (int {0,1}): model training hyperparameter. If 0, use the sum of the context
                word vectors. If 1, use the mean; only applies when cbow is used.
            iter (int): model training hyperparameter, number of iterations.
            min_n (int): fasttext training hyperparameter. Min length of char ngrams to be used
                for training word representations.
            max_n (int): fasttext training hyperparameter. Max length of char ngrams to be used
                for training word representations. Set `max_n` to be lesser than `min_n` to avoid
                char ngrams being used.
            word_ngrams (int {0,1}): fasttext training hyperparameter. If 1, enriches word vectors
                with subword (ngram) information. If 0, this is equivalent to word2vec training.
        """

        self.mark_char = mark_char
        self.word_embedding_type = word_embedding_type
        self.sg = sg
        self.size = size
        self.window = window
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.min_count = min_count
        self.sample = sample
        self.workers = workers
        self.hs = hs
        self.negative = negative
        self.cbow_mean = cbow_mean
        self.iter = iter
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams

        if corpus_format == 'txt':
            self._sentences = LineSentence(corpus)
        elif corpus_format == 'json':
            with open(corpus) as json_data:
                self._sentences = json.load(json_data)
        elif corpus_format == 'conll2000':
            try:
                self._sentences = list()
                for chunked_sent in conll2000.chunked_sents(corpus):
                    tokens = list()
                    for chunk in chunked_sent:
                        if hasattr(chunk, '_label') and chunk._label == 'NP':
                            s = ''
                            for w in chunk:
                                s += w[0] + self.mark_char
                            tokens.append(s)
                        else:
                            if isinstance(chunk, nltk.Tree):
                                for w in chunk:
                                    tokens.append(w[0])
                            else:
                                tokens.append(chunk[0])
                    self._sentences.append(tokens)
            except Exception:
                print('Conll2000 dataset is missing from NLTK. See downloading details in the '
                      'README file')
        else:
            logger.error('invalid corpus format: ' + corpus_format)
            sys.exit(0)

        if word_embedding_type == 'fasttext' and word_ngrams == 1:
            # remove the marking character at the end for subword fasttext model training
            for i, sentence in enumerate(self._sentences):
                self._sentences[i] = [
                    w[:-1] if self.is_marked(w) else w for w in sentence]

        logger.info('training np2vec model')
        self._train()

    def _train(self):
        """
        Train the np2vec model.
        """
        if self.word_embedding_type == 'word2vec':
            self.model = Word2Vec(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter)
        elif self.word_embedding_type == 'fasttext':
            self.model = FastText(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter,  # fixed: was `iter=iter`, which passed the builtin instead of the hyperparameter
                min_n=self.min_n,
                max_n=self.max_n,
                word_ngrams=self.word_ngrams)
        else:
            logger.error('invalid word embedding type: ' + self.word_embedding_type)
            sys.exit(0)

    def save(self, np2vec_model_file='np2vec.model', binary=False):
        """
        Save the np2vec model.

        Args:
            np2vec_model_file (str): the file where the np2vec model is stored
            binary (bool): boolean indicating whether the np2vec model is stored in binary format
        """
        if self.word_embedding_type == 'fasttext' and self.word_ngrams == 1:
            if not binary:
                logger.error(
                    "if word_embedding_type is fasttext and word_ngrams is 1, "
                    "binary should be set to True.")
                sys.exit(0)
            # not relevant to prune fasttext subword model
            self.model.save(np2vec_model_file)
        else:
            # prune non-NP terms
            logger.info('pruning np2vec model')
            total_vec = 0
            vector_size = self.model.vector_size
            for word in self.model.wv.vocab.keys():
                if self.is_marked(word):
                    total_vec += 1
            logger.info(
                "storing %sx%s projection weights for NP's into %s"
                % (total_vec, vector_size, np2vec_model_file))
            with utils.smart_open(np2vec_model_file, 'wb') as fout:
                fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
                # store NP vectors in sorted order: most frequent NP's at the top
                for word, vocab in sorted(
                        iteritems(self.model.wv.vocab), key=lambda item: -item[1].count):
                    if self.is_marked(word):
                        embedding_vec = self.model.wv.syn0[vocab.index]
                        if binary:
                            fout.write(utils.to_utf8(word) + b" " + embedding_vec.tostring())
                        else:
                            fout.write(utils.to_utf8(
                                "%s %s\n" % (word, ' '.join("%f" % val for val in embedding_vec))))

    @classmethod
    def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
        """
        Load the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {0,1}): If 1, the np2vec model to load uses word vectors with subword
                (ngram) information.

        Returns:
            np2vec model to load
        """
        if word_ngrams == 0:
            return KeyedVectors.load_word2vec_format(np2vec_model_file, binary=binary)
        elif word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        else:
            logger.error("invalid value for 'word_ngrams'")
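# Hedged usage sketch for the class above; the corpus path and model file name are
# assumptions, not part of the original code.
np2vec = NP2vec('marked_corpus.txt', corpus_format='txt', mark_char='_',
                word_embedding_type='word2vec', size=100, iter=5)
np2vec.save('np2vec.model', binary=False)

# In the word2vec case, load() returns gensim KeyedVectors holding only the marked NP terms.
np_vectors = NP2vec.load('np2vec.model', binary=False, word_ngrams=0)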