class CowReader(object):
    # `root` (the corpus base directory) is defined elsewhere in the original script.
    dirs = ['nlcow14ax01', 'nlcow14ax02', 'nlcow14ax03', 'nlcow14ax04',
            'nlcow14ax05', 'nlcow14ax06', 'nlcow14ax07']
    vvb = '/vol/tensusers/fkarsdorp/vvb.tokenized.txt'

    def __iter__(self):
        for directory in CowReader.dirs:
            with codecs.open(os.path.join(CowReader.root, directory, directory + ".xml"),
                             encoding='utf-8') as infile:
                sentence = []
                for line in infile:
                    if line.startswith('<s'):
                        continue
                    elif line.startswith('</s>'):
                        yield sentence
                        sentence = []
                    else:
                        word, pos, lemma = line.strip().split('\t')
                        if pos not in ('$.', 'punc'):
                            sentence.append(word.lower())
        with codecs.open(CowReader.vvb, encoding='utf-8') as vvb:
            for sentence in vvb:
                yield list(tokenize(sentence, lowercase=True))

sentences = CowReader()
model = Word2Vec(sentences, size=300, window=10, min_count=10, workers=20)
model.save("/vol/tensusers/fkarsdorp/cow-vvb.w2v")
KoreanTokenizedTerms = []
for KoreanTokenizedDocument in KoreanTokenizedDocuments:
    KoreanTokenizedTerms.append([
        term[0] for term in KoreanTokenizedDocument
        if term[1] in ('Noun', 'Adjective', 'Verb')
    ])
# print(KoreanTokenizedSentences[:5])
print(KoreanTokenizedTerms)

model = Word2Vec(sentences=KoreanTokenizedTerms, size=64, sg=1, window=10,
                 min_count=1, seed=42, workers=8)
model.save('KoreanWord2Vec.w2v')

print(u"==================================")
print(u"삼성 Similarity Words:")
print(u"==================================")
for word in model.most_similar(positive=[u'삼성'], negative=[], topn=30):
    print("==> " + str(word))
print("\n")
def labelizeTweets(tweets, label_type):
    labelized = []
    for i, v in tqdm(enumerate(tweets)):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')
x_train_zemberek = labelizeTweets(x_train_zemberek, 'TRAIN')
x_test_zemberek = labelizeTweets(x_test_zemberek, 'TEST')

# Candidate dimensionalities; only n_dim = 75 is used below.
n_dims = [50, 75, 100, 125, 150, 175, 200, 225, 250, 275, 300]
n_dim = 75

tweet_w2v = Word2Vec(size=n_dim, min_count=3, hs=1, window=7, iter=75, sg=0)
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.train([x.words for x in tqdm(x_train)],
                total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)

tweet_w2v_zemberek = Word2Vec(size=n_dim, min_count=3, hs=1, window=7, iter=75, sg=0)
tweet_w2v_zemberek.build_vocab([x.words for x in tqdm(x_train_zemberek)])
tweet_w2v_zemberek.train([x.words for x in tqdm(x_train_zemberek)],
                         total_examples=tweet_w2v_zemberek.corpus_count,
                         epochs=tweet_w2v_zemberek.iter)
rg.set_model(model=lfr_params)
g = rg.lfr_model()
graph_path = "./outputs/lfr_synthetic_n1000.gml"
nx.write_gml(g, graph_path)

# Find the embedding of the graph
temp_adjlist_file = "./temp/graph.adjlist"
embedding_file = "./outputs/output.embedding"
nx.write_edgelist(g, temp_adjlist_file)

dwg = dw.load_edgelist(temp_adjlist_file, undirected=True)
walks = dw.build_deepwalk_corpus(dwg, num_paths=dw_params['n'],
                                 path_length=dw_params['l'], alpha=0)
model = Word2Vec(walks, size=dw_params['d'], window=dw_params['w'],
                 min_count=0, sg=1, hs=1, workers=dw_params['workers'])
model.wv.save_word2vec_format(embedding_file)

comdetect = CommunityDetection(embedding_file, graph_path, params={'directed': False})
score = comdetect.evaluate(num_of_communities=kmeans_num_of_communities)
print("Score: {}".format(score))
from nltk.corpus import stopwords

stoplist = set(stopwords.words('english'))

if __name__ == '__main__':
    data = pd.read_csv('codeforces_problems_csv/data.csv')
    X_data = list(data['problem_text'])

    if not os.path.exists('w2v_problem_data.bin'):
        sentences = [line for text in X_data for line in clean(text)]
        # for i in range(len(sentences)):
        #     sentences[i] = get_lemmatized_tokens(' '.join(sentences[i]))
        model = Word2Vec(sentences, workers=4, size=200, min_count=50,
                         window=10, sample=1e-3)
        model.save('w2v_problem_data.bin')
    else:
        model = Word2Vec.load('w2v_problem_data.bin')

    # 'ways' is a very common word in dp problems
    print(model.most_similar('ways'))
    print(len(model.wv.vocab.values()))

    X = model[model.wv.vocab]

    # visualize the data
    tsne = TSNE(n_components=2)
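    # The snippet above breaks off right after constructing the TSNE object.
    # A hedged sketch of how the visualization is typically completed
    # (the matplotlib import and scatter call are assumptions, not the
    # original author's code):
    import matplotlib.pyplot as plt

    X_tsne = tsne.fit_transform(X)  # project the 200-d vectors to 2-d
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], s=2)
    plt.show()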
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec

print("hello world")
# `workers` was passed without a value in the original, which is a syntax
# error; 4 is an arbitrary choice.
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
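# Hedged usage sketch (not in the original): querying the toy model trained on
# common_texts. With min_count=1 every token in the corpus is in the vocabulary,
# and 'computer' appears in common_texts.
print(model.wv.most_similar('computer', topn=3))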
sns.set(style='white', context='notebook', palette='deep')
train = pd.read_csv('nlp_test/Train/Train_DataSet.csv')
label = pd.read_csv('nlp_test/Train/Train_DataSet_Label.csv')
test = pd.read_csv('nlp_test/Test/Test_DataSet.csv')
test_title = test['title']

# Merge the texts and the labels into one dataset
train = pd.merge(train, label, on='id')
train['title'] = train['title'].apply(lambda x: str(x))
train['words'] = train['title'].apply(lambda x: jieba.lcut(x))

# Feature engineering
x = train['words']
y = train['label']
w2v = Word2Vec(size=100, min_count=5, window=5)
w2v.build_vocab(x)
w2v.train(x, total_examples=w2v.corpus_count, epochs=w2v.iter)

# # Get the vector for a sentence
# def total_vec(words):
#     vec = np.zeros(300).reshape(1, 300)
#     for word in words:
#         try:
#             vec += w2v.wv[word].reshape(1, 300)
#         except KeyError:
#             continue
#     return vec
#
# train_vec = np.concatenate([total_vec(words) for words in x])
for file in files:
    with open(root + '/' + file, 'r') as f:
        text += f.read()

# each.translate(translator)      -> strip special characters
# x.lower()                       -> lowercase
# if x.lower() not in stop_words  -> remove stopwords
clean = [[x.lower() for x in each.translate(translator).split()
          if x.lower() not in stop_words]
         for each in text.split('.\n')]
print(clean)
print("------------------------------------------------------------")

# window size 20, minimum count 7, skip-gram, 10000 training iterations
model = Word2Vec(clean, window=20, min_count=7, sg=1, iter=10000)
print(list(model.wv.vocab.keys()))
print("vocab length : %d" % len(model.wv.vocab))

# Find words with similar meanings
# print(model.wv.most_similar("good"))

from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

X = model.wv[model.wv.vocab]
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
#%%
data = 'total_results.p'
total_results = pickle.load(open(data, "rb"))

#%%
### initialize model and build vocabulary
n_dim = 300
window = 5
downsampling = 0.001
seed = 1
num_workers = os.cpu_count() - 2  ## not sure if this is a good idea
min_count = 30

imf_w2v = Word2Vec(sg=1, seed=seed, workers=num_workers, size=n_dim,
                   min_count=min_count, window=window, sample=downsampling)

## build the vocabulary
imf_w2v.build_vocab(total_results)

#%%
## train w2v model
corpus_count = imf_w2v.corpus_count
overall_start_time = time.time()
for i in range(200):
    start_time = time.time()
    iteration = 10
    print('running', i + 1, '-', (i + 1) * iteration)
    if gensim.__version__[0] == '1':
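# The loop above is cut off at the version check. A hedged sketch of how the
# branch is plausibly completed (an assumption: gensim 1.x train() took no
# `epochs` argument, while later versions require one):
        imf_w2v.train(total_results, total_examples=corpus_count)
    else:
        imf_w2v.train(total_results, total_examples=corpus_count,
                      epochs=iteration)
    print('iteration took', time.time() - start_time, 'seconds')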
#wiki = WikiCorpus('data/enwiki-20170101-pages-articles-multistream.xml.bz2', lemmatize=False)
#tfidf = TfidfModel(wiki)
# save for persistence
#wiki.save('wiki.corpus')
#tfidf.save('wiki.tfidf.model')

# word2vec
sentences = []

# Streaming corpus reader (defined but unused below; the file is read
# eagerly instead).
class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

file_path = "enwiki-latest-pages-articles1.xml"
with open(file_path, "r", buffering=1) as f:
    for line in f:
        # Word2Vec expects token lists, not raw strings; appending the raw
        # line would make gensim treat each character as a word.
        sentences.append(line.split())

params = {'size': 200, 'window': 10, 'min_count': 10,
          'workers': max(1, multiprocessing.cpu_count() - 1),
          'sample': 1E-3}
word2vec = Word2Vec(sentences, **params)
word2vec.save('wiki.word2vec.model')
print(word2vec.wv['configuration'])
import logging
import re
import nltk
from gensim.models.word2vec import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

sentences = LineSentence('txt/source.txt')
model = Word2Vec(sentences, min_count=3, size=10, window=300)
word_vectors = model.wv
word_vectors.save('model')
model = KeyedVectors.load('model')
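# Hedged usage sketch (not in the original): once reloaded, the KeyedVectors
# object answers similarity queries directly. 'word' is a placeholder token;
# guard against out-of-vocabulary lookups.
query = 'word'
if query in model.vocab:
    print(model.most_similar(query, topn=5))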
# load all data sentences
with open('data/structured/sentence_neg_handled/train/pos.pkl', 'rb') as fi:
    sentence_train_pos = dill.load(fi)
with open('data/structured/sentence_neg_handled/train/neg.pkl', 'rb') as fi:
    sentence_train_neg = dill.load(fi)

# join the data
all_data = sentence_train_pos + sentence_train_neg

size = 300
window = 7
min_count = 15
model_word2vec = Word2Vec(all_data, size=size, window=window,
                          min_count=min_count, workers=4)

# filename to save (or load)
fname = "model/model{}_{}_{}.pkl".format(size, window, min_count)

# check model
print(model_word2vec.wv['good'])

# save the file in fname
model_word2vec.save(fname)
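# Hedged sketch (not in the original): the "(or load)" in the comment above
# suggests the round trip; a saved full model is restored with Word2Vec.load.
reloaded = Word2Vec.load(fname)
print(reloaded.wv.most_similar('good', topn=5))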
from gensim.models.word2vec import Word2Vec
import pandas as pd

# Train word vectors
# Load the data: English question pairs — each row holds English question 1,
# its Spanish translation, English question 2, its Spanish translation, and
# the match label.
english_spa = pd.read_csv('/home/moon/work/tianchi/data/cikm_english_train_20180516.txt',
                          sep='\t', header=None)
english_spa.columns = ['eng_qura1', 'spa_qura1', 'eng_qura2', 'spa_qura2', 'label']

# Spanish question 1
english_spa['spa_qura_list_1'] = english_spa['spa_qura1'].apply(lambda x: x.split(' '))
# Spanish question 2
english_spa['spa_qura_list_2'] = english_spa['spa_qura2'].apply(lambda x: x.split(' '))

spa_list = list(english_spa['spa_qura_list_1'])
spa_list.extend(list(english_spa['spa_qura_list_2']))

model = Word2Vec(spa_list, sg=1, size=30, window=5, min_count=1,
                 negative=3, sample=0.001, hs=1, workers=8)
model.save("./w2v.mod")
FILTER_ENGLISH = True
# Name for output w2v model file
OUTPUT_MODEL_FILE = "w2v_yelp_100_alpha_0.025_window_4"
PICKLED_DATA = "/home/alfredo/deep-nlp/data/reviews.pickle."
NUM_PARTITIONS = 2

# Use all data
reviews_texts, _, _, _, _ = get_reviews_data(range(1, NUM_PARTITIONS), PICKLED_DATA)

# Each review will be considered a sentence
sentences = []
for num, text in enumerate(reviews_texts):
    if num % 10000 == 0:
        print("%d out of %d reviews read" % (num, len(reviews_texts)))
    if FILTER_ENGLISH:
        if detect_language(text) == u"english":
            sentences.append(tokenize_text(text))
    else:
        sentences.append(text)

# Build a w2v model
w2v = Word2Vec(sentences=sentences, size=100, alpha=0.025, window=4,
               min_count=2, sample=1e-5, workers=4, negative=10)
w2v.save(OUTPUT_MODEL_FILE)
def labelizeTweets(tweets, label_type):
    labelized = []
    for i, v in tqdm(enumerate(tweets)):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')
# print(x_train[0])

tweet_w2v = Word2Vec(size=200, min_count=10)  # can change size
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.train([x.words for x in tqdm(x_train)],
                total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)

vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))

def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
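# buildWordVector is cut off above. A hedged sketch of the standard tf-idf
# weighted averaging that this pattern usually completes with (an assumption,
# not the original author's exact code):
        try:
            vec += tweet_w2v.wv[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError:  # word missing from the w2v or tf-idf vocabulary
            continue
    if count != 0:
        vec /= count
    return vec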
def labelize_tweets_ug(tweets, prefix):
    result = []
    for i, t in enumerate(tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

# Gather all tweet data: the train and test text columns are concatenated
all_x = pd.concat([x_train, x_test])
all_x_w2v = labelize_tweets_ug(all_x, 'all')

# Train word2vec on the tweet tokens with the CBOW method (sg=0);
# window=2 is the distance between the current and the predicted word
# within a sentence, size=100 is the dimensionality of the feature vectors
cores = multiprocessing.cpu_count()
model_ug_cbow = Word2Vec(sg=0, size=100, negative=5, window=2, min_count=2,
                         workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])

# Train the embedding
for epoch in range(30):
    model_ug_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]),
                        total_examples=len(all_x_w2v), epochs=1)
    model_ug_cbow.alpha -= 0.002
    model_ug_cbow.min_alpha = model_ug_cbow.alpha

# Next, the skip-gram model
model_sg = Word2Vec(sg=1,
all_docs.labels.iloc[1]
all_docs.doc_words[4][:52]
print(all_docs.doc_words[6])

# %%
import multiprocessing
import sys
import gensim
from gensim.models.word2vec import Word2Vec

workers = multiprocessing.cpu_count()
print('number of cpu: {}'.format(workers))
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise."

# %%
word_model = Word2Vec(all_docs.doc_words, min_count=2, size=300, window=5,
                      workers=workers, iter=100)

# %%
from UtilWordEmbedding import MeanEmbeddingVectorizer

mean_vec_tr = MeanEmbeddingVectorizer(word_model)
doc_vec = mean_vec_tr.transform(all_docs.doc_words)

# %%
word_model.most_similar('submit')
# %%
# -*- coding: utf-8 -*-
import logging
import sys

from gensim.models.word2vec import Word2Vec, LineSentence

logging.basicConfig(level=logging.INFO)

model = Word2Vec(LineSentence(sys.argv[1]), sg=1)
model.save(sys.argv[2])
# data import and cleaning
test_solution = pd.read_csv("test_with_solutions.csv")
data_train = pd.read_csv("train.csv")
corpus = corpus_creation(data_train['Comment'])

# gensim word2vec model
vector_size = 512
window_size = 10

# Create Word2Vec
word2vec = Word2Vec(sentences=corpus, size=vector_size, window=window_size,
                    negative=20, iter=50, seed=1000,
                    workers=multiprocessing.cpu_count())

# Train subset size (0 < size < len(tokenized_corpus))
train_size = 3900
# test len(corpus - train)
test_size = 47

# Compute average and max tweet length
avg_length = 0.0
max_length = 0
for comment in corpus:
    if len(comment) > max_length:
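# The loop above is truncated. A hedged completion following the stated intent
# of computing the average and maximum comment length (an assumption):
        max_length = len(comment)
    avg_length += float(len(comment))
avg_length /= len(corpus)
print("Average length: {:.2f}, max length: {}".format(avg_length, max_length))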
""" corpus = [preprocessing(x) for x in corpus] X_train = [preprocessing(x) for x in X_train] X_test = [preprocessing(x) for x in X_test] # print(corpus) # print(X_train) """ 训练NLP模型 有了这些干净的数据集,我们可以做我们的NLP模型了。 先用最简单的Word2Vec """ model = Word2Vec(corpus, size=128, window=5, min_count=5, workers=4) # print(model['ok']) """ 用NLP模型表达我们的数据 接着,我们可以用这个坐标,来表示之前干干净净的数据。 但是有个问题。我们的vec是基于每个单词的,怎么办呢? 由于文本本身的量很小,我们可以把所有的单词的vector拿过来取个平均值 """ # 先拿到全部的vocabulary vocab = model.wv.vocab def get_vector(word_list): # 得到任意text的vector # 建立一个全是0的array res = np.zeros([128])
def word2vec_train(combined):
    model = Word2Vec(min_count=n_exposures, window=window_size)
    model.build_vocab(combined)  # input: list of token lists
    model.train(combined, total_examples=model.corpus_count, epochs=model.iter)
    model.save('./Word2vec_model.pkl')
    model.wv.save_word2vec_format("./word2vec.model", binary=True)
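# Hedged usage sketch (not in the original): word2vec_train reads the
# hyperparameters n_exposures and window_size from module scope, so they must
# exist before the call; the toy corpus and values are illustrative only.
n_exposures = 1   # assumed: minimum word frequency
window_size = 7   # assumed: context window
word2vec_train([['hello', 'world'], ['hello', 'gensim']])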
from gensim.models.word2vec import Word2Vec
import gzip

f = gzip.open(r'D:\Wikipedia\OpenSubtitles2018.en.gz')
data = f.readlines()
for i in range(len(data)):
    # Decode and tokenize; Word2Vec expects lists of tokens, not raw strings
    data[i] = data[i][:-1].decode("utf-8").split()

model = Word2Vec(data, size=300, min_count=5, window=10, workers=7)
model.save("model-subtitles")
print('done')
from argparse import ArgumentParser
from os import environ
from pathlib import Path

from gensim.models.word2vec import Word2Vec, LineSentence

# make_corpus and save_w2v_vectors_file are project-local helpers defined
# elsewhere in the original script.


def main():
    arg_parser = ArgumentParser(description='Script to train Word2Vec.')
    arg_parser.add_argument('-i', '--fasta-file')
    arg_parser.add_argument('-o', '--model-file')
    arg_parser.add_argument('-c', '--corpus-file')
    arg_parser.add_argument('-v', '--word-vectors-file')
    arg_parser.add_argument('-u', '--context-vectors-file')
    arg_parser.add_argument('-n', '--ngram-size', type=int, default=3)
    arg_parser.add_argument('-s', '--vector-size', type=int, default=100)
    arg_parser.add_argument('-w', '--window-size', type=int, default=5)
    arg_parser.add_argument('-t', '--num-threads', type=int, default=3)
    arg_parser.add_argument('-r', '--random-seed', type=int, default=None)
    arg_parser.add_argument('-k', '--num-iterations', type=int, default=5)
    args = arg_parser.parse_args()

    fasta_file = args.fasta_file
    model_file = args.model_file
    corpus_file = args.corpus_file
    word_vectors_file = args.word_vectors_file
    context_vectors_file = args.context_vectors_file
    ngram_size = args.ngram_size
    random_seed = args.random_seed
    vector_size = args.vector_size
    window_size = args.window_size
    num_threads = args.num_threads
    num_iterations = args.num_iterations

    if not any([fasta_file, corpus_file]):
        print('Error: Please specify either a FASTA file or corpus file.')
        arg_parser.print_help()
        return
    if fasta_file and not Path(fasta_file).exists():
        print('FASTA file not found: {}'.format(fasta_file))
        return
    if not corpus_file:
        corpus_file = 'corpus.txt'
    elif not fasta_file and not Path(corpus_file).exists():
        print('Corpus file not found: {}'.format(corpus_file))
        return

    if random_seed:
        print('Random-Seed-Mode: Setting number of threads to 1')
        num_threads = 1
        python_hash_seed = environ.get('PYTHONHASHSEED', None)
        if python_hash_seed is None or python_hash_seed == 'random':
            print('Random-Seed-Mode: Global PYTHONHASHSEED needs to be set')
            return
    else:
        random_seed = 42

    if fasta_file:
        make_corpus(fasta_file, corpus_file, ngram_size)
    if not any([model_file, word_vectors_file, context_vectors_file]):
        return

    model = Word2Vec(
        LineSentence(corpus_file),
        size=vector_size,
        window=window_size,
        min_count=2,
        sg=1,
        # hs=0,
        # negative=5,
        # ns_exponent=0.75,  # requires gensim 3.5
        # cbow_mean=1,
        # sample=0.001,
        iter=num_iterations,
        # alpha=0.025,
        # min_alpha=0.0001,
        # batch_words=10000,
        # null_word=0,
        # trim_rule=None,
        # compute_loss=False,
        # sorted_vocab=1,
        # max_vocab_size=None,
        # max_final_vocab=None,  # requires gensim 3.5
        seed=random_seed,
        workers=num_threads,
        # callbacks=()
    )

    if model_file:
        model.save(model_file)
    if word_vectors_file:
        save_w2v_vectors_file(word_vectors_file, model.wv.vocab, model.wv.vectors)
    if context_vectors_file:
        has_syn1 = hasattr(model, 'syn1')        # hierarchical softmax
        has_syn1neg = hasattr(model, 'syn1neg')  # negative sampling
        if has_syn1 and has_syn1neg:
            context_vectors_file_1 = context_vectors_file + '.hs'
            context_vectors_file_2 = context_vectors_file + '.ns'
            save_w2v_vectors_file(context_vectors_file_1, model.wv.vocab, model.syn1)
            save_w2v_vectors_file(context_vectors_file_2, model.wv.vocab, model.syn1neg)
        elif has_syn1:
            save_w2v_vectors_file(context_vectors_file, model.wv.vocab, model.syn1)
        elif has_syn1neg:
            save_w2v_vectors_file(context_vectors_file, model.wv.vocab, model.syn1neg)
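# Hedged usage sketch (not in the original; the script filename is
# hypothetical). PYTHONHASHSEED must be pinned for reproducible runs:
#   PYTHONHASHSEED=0 python train_w2v.py -i proteins.fasta -o model.w2v \
#       -n 3 -s 100 -w 5 -r 42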
def train_model(inpath, outpath):
    model = Word2Vec(LineSentence(inpath), workers=cpu_count())
    model.save(outpath)
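# Hedged usage sketch (not in the original): LineSentence expects one
# whitespace-tokenized sentence per line; the imports below are the ones the
# function assumes, and the paths are illustrative.
from multiprocessing import cpu_count
from gensim.models.word2vec import Word2Vec, LineSentence

train_model('corpus.txt', 'w2v.model')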
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')

print('Loading Data...')
train, test = import_tag(datasets=data_locations)
combined = list(train.values()) + list(test.values())

print('Tokenising...')
combined = tokenizer(combined)

print('Training a Word2vec model...')
model = Word2Vec(size=vocab_dim, min_count=n_exposures, window=window_size,
                 workers=cpu_count(), iter=n_iterations)
model.build_vocab(combined)
model.train(combined, total_examples=model.corpus_count, epochs=model.iter)
# model.save('vectorizer.w2v')

print('Transform the Data...')
index_dict, word_vectors, train, test = create_dictionaries(train=train,
                                                            test=test,
                                                            model=model)

print('Setting up Arrays for Keras Embedding Layer...')
n_symbols = len(index_dict) + 1  # adding 1 to account for 0th index
embedding_weights = np.zeros((n_symbols, vocab_dim))
for word, index in index_dict.items():
    embedding_weights[index, :] = word_vectors[word]
def build_dataset(train_data_path, test_data_path):
    '''
    Load and preprocess the data.
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :return: training data, test data, and the merged data
    '''
    # 1. Load the data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {},test data size {}'.format(
        len(train_df), len(test_df)))

    # 2. Drop empty values
    train_df.dropna(subset=['Report'], inplace=True)
    train_df.fillna('', inplace=True)
    test_df.fillna('', inplace=True)

    # 3. Multi-threaded batch preprocessing
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)

    # 4. Merge the training and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(
        lambda x: ' '.join(x), axis=1)
    test_df['merged'] = test_df[['Question', 'Dialogue']].apply(
        lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))

    # 5. Save the preprocessed training and test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)
    train_df.to_csv(train_seg_path, index=None, header=False)
    test_df.to_csv(test_seg_path, index=None, header=False)

    # 6. Save the merged data
    merged_df.to_csv(merger_seg_path, index=None, header=False)

    # 7. Train the word vectors
    print('start build w2v model')
    wv_model = Word2Vec(LineSentence(merger_seg_path),
                        size=embedding_dim,
                        sg=1,
                        workers=cores,
                        iter=wv_train_epochs,
                        window=5,
                        min_count=5)

    # 8. Separate the data and the labels
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(
        lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question', 'Dialogue']].apply(
        lambda x: ' '.join(x), axis=1)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        train_df['X'], train_df['Report'],
        test_size=0.002,  # 80K * 0.002
    )
    X_train.to_csv(train_x_seg_path, index=None, header=False)
    y_train.to_csv(train_y_seg_path, index=None, header=False)
    X_val.to_csv(val_x_seg_path, index=None, header=False)
    y_val.to_csv(val_y_seg_path, index=None, header=False)

    test_df['X'].to_csv(test_x_seg_path, index=None, header=False)

    # 9. Pad with start/end markers, fill unknown words with oov, pad to length
    # Use the vocab produced by the gensim training
    vocab = wv_model.wv.vocab

    # Process the training X: find an appropriate maximum length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))

    # Process the test X with the same maximum length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # Process the training Y: find an appropriate maximum length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(
        lambda x: pad_proc(x, train_y_max_len, vocab))

    # 10. Save the padded, oov-handled data and labels
    train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
    train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
    test_df['X'].to_csv(test_x_pad_path, index=None, header=False)
    # print('train_x_max_len:{} ,train_y_max_len:{}'.format(X_max_len, train_y_max_len))

    # 11. Retrain the word vectors (commented out in the original)
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)

    # Save the word-vector model
    wv_model.save(save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))

    # 12. Update the vocab
    vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
    reverse_vocab = {index: word
                     for index, word in enumerate(wv_model.wv.index2word)}

    # Save the dictionaries
    save_dict(vocab_path, vocab)
    save_dict(reverse_vocab_path, reverse_vocab)

    # 13. Save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(embedding_matrix_path, embedding_matrix)

    # 14. Convert the datasets from words to indices,
    # e.g. [<START> 方向机 重 ...] -> [32800, 403, 986, 246, 231, ...]
    vocab = Vocab()
    train_ids_x = train_df['X'].apply(
        lambda x: transform_data(x, vocab.word2id))
    train_ids_y = train_df['Y'].apply(
        lambda x: transform_data(x, vocab.word2id))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab.word2id))

    # 15. Convert the index lists to numpy arrays,
    # e.g. [32800, 403, 986, 246, 231] -> array([[32800, 403, 986, ...]])
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())

    # Save the arrays
    np.save(train_x_path, train_X)
    np.save(train_y_path, train_Y)
    np.save(test_x_path, test_X)

    return train_X, train_Y, test_X
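# Hedged usage sketch (not in the original); the paths are placeholders for
# the project's own data files, and all output paths must be configured first.
train_X, train_Y, test_X = build_dataset('data/train.csv', 'data/test.csv')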
cleaned_train.close()
cleaned_test.close()

for category in categories:
    lines = open('data/cleaned/cleaned_' + category + '_train.txt', 'r').readlines()
    cleaned_docs_in_sent = open('data/cleaned/cleaned_' + category + '_train_in_sent.txt', 'w')
    for line in lines:
        sentences = nltk.sent_tokenize(line)
        for s in sentences:
            cleaned_docs_in_sent.write(re.sub(r'\.+$', '', s.strip()) + '\n')
    cleaned_docs_in_sent.close()

seeds = [1, 123, 888, 1234, 8888]
entertainment_model, ideas_model, world_model, us_model, politics_model, all_model = [], [], [], [], [], []
for i in range(5):
    entertainment_model += [Word2Vec(LineSentence('data/cleaned/cleaned_Entertainment_train_in_sent.txt'),
                                     seed=seeds[i], size=300, window=5, min_count=5, workers=4)]
    ideas_model += [Word2Vec(LineSentence('data/cleaned/cleaned_Ideas_train_in_sent.txt'),
                             seed=seeds[i], size=300, window=5, min_count=5, workers=4)]
    world_model += [Word2Vec(LineSentence('data/cleaned/cleaned_World_train_in_sent.txt'),
                             seed=seeds[i], size=300, window=5, min_count=5, workers=4)]
    us_model += [Word2Vec(LineSentence('data/cleaned/cleaned_US_train_in_sent.txt'),
                          seed=seeds[i], size=300, window=5, min_count=5, workers=4)]
    politics_model += [Word2Vec(LineSentence('data/cleaned/cleaned_Politics_train_in_sent.txt'),
                                seed=seeds[i], size=300, window=5, min_count=5, workers=4)]
    # os.system('cat data/cleaned/cleaned_Entertainment_train_in_sent.txt data/cleaned/cleaned_Ideas_train_in_sent.txt data/cleaned/cleaned_World_train_in_sent.txt '
    #           + 'data/cleaned/cleaned_US_train_in_sent.txt data/cleaned/cleaned_Politics_train_in_sent.txt > data/cleaned/cleaned_all_train_in_sent.txt')
    all_model += [Word2Vec(LineSentence('data/cleaned/cleaned_all_train_in_sent.txt'),
                           seed=seeds[i], size=300, window=5, min_count=5, workers=4)]

vocab = list(set(world_model[0].vocab.keys())
             .union(us_model[0].vocab.keys())
             .union(politics_model[0].vocab.keys())
             .union(all_model[0].vocab.keys())
             .union(entertainment_model[0].vocab.keys())
             .union(ideas_model[0].vocab.keys()))
indices_of_vocab = dict({vocab[i]: i for i in range(len(vocab))})
count_of_vocab = np.zeros(len(vocab), dtype='int32')

for category in categories:
    lines = open('data/cleaned/cleaned_' + category + '_train.txt', 'r').readlines()
    for line in lines:
# Shuffle the dataset
index = [i for i in range(len(all_data))]
np.random.shuffle(index)
x = np.array(all_data)[index]
y = y[index]

# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=20)
print("data preprocessing completed")

# word2vec
imdb_w2v = Word2Vec(size=N_DIM, min_count=MIN_COUNT)
imdb_w2v.build_vocab(all_data)
imdb_w2v.train(all_data, total_examples=len(all_data), epochs=w2v_EPOCH)
print("word2vec completed")

# word2vec post-processing: build the embedding matrix and word-to-index map
n_symbols = len(imdb_w2v.wv.vocab.keys()) + 1
embedding_weights = np.zeros((n_symbols, N_DIM))  # match the w2v dimensionality
idx = 1
word2idx_dic = {}
for w in imdb_w2v.wv.vocab.keys():
    embedding_weights[idx, :] = imdb_w2v[w]
    word2idx_dic[w] = idx
    idx = idx + 1
# print(embedding_weights[0, :])
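# Hedged sketch (not in the original): embedding_weights and word2idx_dic are
# typically wired into a Keras Embedding layer like this; the import paths and
# the frozen-weights choice are assumptions.
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import Constant

embedding_layer = Embedding(input_dim=n_symbols,
                            output_dim=N_DIM,
                            # row 0 stays all-zero for the padding index
                            embeddings_initializer=Constant(embedding_weights),
                            trainable=False)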
size = 0
sentences = []
pre = open('sentences.txt', 'w', encoding='utf-8')
for index in range(1, 600):
    path = ('judgments-%s.json' % index)
    data = json.load(open(path, encoding="utf8"))['items']
    for j in data:
        text = j['textContent'].replace("-\n", "").lower()
        text = re.sub(r'<[^>]*>', "", text)
        for sentence in sent_tokenize(text, language='polish'):
            pre.write(re.sub(r'\s+', ' ', sentence).strip() + "\n")
        size += len(text)
        if size >= 1000000000:
            break
    if size >= 1000000000:
        break
pre.close()
print(size)

sentences = LineSentence('sentences.txt')
bigram = Phraser(Phrases(sentences))
bigram.save("bigram")
print("1")

sentence_stream = [bigram[sentence] for sentence in sentences]
trigram = Phraser(Phrases(sentence_stream))
trigram.save("trigram")
print("2")

model = Word2Vec([trigram[bigram[sentence]] for sentence in sentence_stream],
                 window=5, size=300, sg=0, workers=12, min_count=3)
model.save("model")
print("processed")
review_part.shape

import warnings
warnings.filterwarnings("ignore")

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def split_sentences(review):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]  # clean each sentence
    return sentences

sentences = sum(review_part.apply(split_sentences), [])
print('{} reviews -> {} sentences'.format(len(review_part), len(sentences)))

sentences_list = []
for line in sentences:
    sentences_list.append(nltk.word_tokenize(line))

num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
model_name = 'Word_Vector.model'

from gensim.models.word2vec import Word2Vec

model = Word2Vec(sentences_list, workers=num_workers, size=num_features,
                 min_count=min_word_count, window=context)
model.init_sims(replace=True)
model.save(os.path.join('..', 'models', model_name))