def train(self):
    logging.info(' train tfidf model ... ')
    self.tfidf = models.TfidfModel(self.corpus, normalize=True)

    logging.info(' train word2vec model ... ')
    self.w2v = models.Word2Vec(min_count=2,
                               window=2,
                               size=300,
                               sample=6e-5,
                               alpha=0.03,
                               min_alpha=0.0007,
                               negative=15,
                               workers=4,
                               iter=7)
    # the vocabulary must be built before train() when no corpus is passed
    # to the constructor
    self.w2v.build_vocab(self.data)
    self.w2v.train(self.data,
                   total_examples=self.w2v.corpus_count,
                   epochs=15,
                   report_delay=1)

    logging.info(' train fasttext model ... ')
    self.fast = models.FastText(self.data,
                                size=300,
                                window=3,
                                min_count=1,
                                iter=10,
                                min_n=3,
                                max_n=6,
                                word_ngrams=2)
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : iterable of iterables of str
        Can be simply a list of lists of tokens, but for larger corpora,
        consider an iterable that streams the sentences directly from
        disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`,
        :class:`~gensim.models.word2vec.Text8Corpus` or
        :class:`~gensim.models.word2vec.LineSentence` in
        :mod:`~gensim.models.word2vec` module for such examples.

    Returns
    -------
    :class:`~gensim.sklearn_api.ftmodel.FTTransformer`
        The trained model.

    """
    self.gensim_model = models.FastText(
        sentences=X, sg=self.sg, hs=self.hs, vector_size=self.vector_size,
        alpha=self.alpha, window=self.window, min_count=self.min_count,
        max_vocab_size=self.max_vocab_size, word_ngrams=self.word_ngrams,
        sample=self.sample, seed=self.seed, workers=self.workers,
        min_alpha=self.min_alpha, negative=self.negative,
        ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
        hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word,
        min_n=self.min_n, max_n=self.max_n, sorted_vocab=self.sorted_vocab,
        bucket=self.bucket, trim_rule=self.trim_rule,
        batch_words=self.batch_words
    )
    return self
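# A minimal, hypothetical sketch of what fit() above builds: gensim's FastText
# trained directly on a tiny tokenized corpus, using the gensim 4 style
# parameter names (vector_size, epochs) that the fit() body assumes. The toy
# sentences are made up for illustration only.
from gensim import models

sentences = [["hello", "world"], ["machine", "learning", "is", "fun"]]
ft = models.FastText(sentences=sentences, vector_size=32, window=3,
                     min_count=1, epochs=5)
print(ft.wv["hello"].shape)   # (32,)
print(ft.wv["helo"].shape)    # out-of-vocabulary word, composed from char n-grams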
def trainer(self):
    '''
    @description: Train tfidf, word2vec, fasttext and autoencoder
    @param {type} None
    @return: None
    '''
    logger.info('train tfidf')
    count_vect = TfidfVectorizer(stop_words=self.stopWords,
                                 max_df=0.4,
                                 min_df=0.001,
                                 ngram_range=(1, 2))
    self.tfidf = count_vect.fit(self.data["text"])

    logger.info('train word2vec')
    self.data['text'] = self.data["text"].apply(lambda x: x.split(' '))
    self.w2v = models.Word2Vec(min_count=2,
                               window=5,
                               size=300,
                               sample=6e-5,
                               alpha=0.03,
                               min_alpha=0.0007,
                               negative=15,
                               workers=4,
                               iter=30,
                               max_vocab_size=50000)
    self.w2v.build_vocab(self.data["text"])
    self.w2v.train(self.data["text"],
                   total_examples=self.w2v.corpus_count,
                   epochs=15,
                   report_delay=1)

    logger.info('train fast')
    # train the FastText word vectors
    self.fast = models.FastText(
        self.data["text"],
        size=300,        # vector dimensionality
        window=3,        # sliding window size
        alpha=0.03,
        min_count=2,     # prune words below this frequency; raising it shrinks the vocabulary
        iter=30,         # number of training epochs
        max_n=3,
        word_ngrams=2,
        max_vocab_size=50000)

    logger.info('train lda')
    self.id2word = gensim.corpora.Dictionary(self.data.text)
    corpus = [self.id2word.doc2bow(text) for text in self.data.text]
    self.LDAmodel = LdaMulticore(corpus=corpus,
                                 id2word=self.id2word,
                                 num_topics=30,
                                 workers=2,
                                 chunksize=4000,
                                 passes=7,
                                 alpha='asymmetric')

    logger.info('train autoencoder')
    self.ae.train(self.data)
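# A hypothetical, self-contained sketch of the LDA step in trainer() above, so
# the Dictionary -> doc2bow -> LdaMulticore pipeline and the usual inspection
# calls can be seen end to end; the toy documents are made up for illustration.
import gensim
from gensim.models import LdaMulticore

docs = [["machine", "learning", "is", "fun"],
        ["python", "is", "fun"],
        ["reading", "books", "is", "fun"]]
id2word = gensim.corpora.Dictionary(docs)
corpus = [id2word.doc2bow(text) for text in docs]
lda = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=2,
                   workers=2, passes=7, alpha='asymmetric')
print(lda.show_topics(num_topics=2, num_words=5))          # top words per topic
print(lda.get_document_topics(id2word.doc2bow(docs[0])))   # topic mix of one doc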
def main():
    import sys
    from pprint import pprint

    corpus_path = sys.argv[1]
    corpus = Corpus(corpus_path)

    w2v = m.Word2Vec(sentences=corpus)
    print()
    print("Word2Vec vectors:")
    pprint(w2v.wv.vectors)
    print("=" * 80)
    w2v.save(f"{corpus_path}.w2v")

    ft = m.FastText(sentences=corpus)
    print()
    print("FastText vectors:")
    pprint(ft.wv.vectors)
    print()
    ft.save(f"{corpus_path}.ft")
def train_model(self, training_sample: TrainingSample):
    self.training_sample = training_sample
    self.documents = training_sample.get_documents()
    self.dictionary = training_sample.dictionary
    self.corpus = training_sample.corpus

    print('\nfastText model: Training the model...')
    start_time = time.time()
    self.model = models.FastText(self.documents,
                                 sg=1,
                                 hs=1,
                                 size=100,
                                 alpha=0.025,
                                 window=5,
                                 min_count=3,
                                 workers=3,
                                 min_alpha=0.0001,
                                 negative=10,
                                 cbow_mean=1,
                                 iter=10,
                                 min_n=3,
                                 max_n=6,
                                 sorted_vocab=0)
    print('Learning time:', round((time.time() - start_time), 3), 's')

    print('\nFastText model: Building ft matrix...')
    start_time = time.time()
    # construct a term similarity matrix over the dictionary
    self.similarity_matrix = self.model.wv.similarity_matrix(
        self.training_sample.dictionary)
    print('FastText matrix time:', round((time.time() - start_time), 3), 's')
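# A hypothetical, self-contained sketch of what a term similarity matrix like
# the one built in train_model() is typically used for: soft-cosine similarity
# queries between bag-of-words documents. Assumes gensim 3.x, where
# wv.similarity_matrix() and SoftCosineSimilarity are available; the toy
# documents are made up for illustration.
from gensim import corpora, models
from gensim.similarities import SoftCosineSimilarity

docs = [["cat", "sat", "on", "the", "mat"],
        ["dog", "sat", "on", "the", "rug"],
        ["python", "is", "a", "language"]]
dictionary = corpora.Dictionary(docs)
bow_corpus = [dictionary.doc2bow(d) for d in docs]
ft = models.FastText(docs, size=50, min_count=1, iter=10)
sim_matrix = ft.wv.similarity_matrix(dictionary)
index = SoftCosineSimilarity(bow_corpus, sim_matrix, num_best=3)
print(index[dictionary.doc2bow(["dog", "on", "mat"])])  # [(doc_id, score), ...]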
data = [
    'I love machine learning',
    'I like reading books',
    'Python is beautiful',
    'R is horrible.',
    'Machine learning is cool!',
    'I really like NLP'
]

# pre-process our text
text = [re.sub(r'([^\s\w]|_)+', '', sentence) for sentence in data]
text = [sentence.lower().split() for sentence in text]

# train a Word2Vec model on our data
word_model = models.Word2Vec(text, size=50, min_count=1, iter=100)
print(word_model.wv.vectors)
print(word_model.wv.vocab)
print(len(word_model.wv.vocab))
print(word_model.wv['like'])
word_model.wv.most_similar('python')

# train a FastText model on our data
word_model = models.FastText(text, size=50, min_count=1, iter=100)
print(word_model.wv.vectors)
print(word_model.wv.vocab)
print(word_model.wv['like'])
word_model.wv.most_similar('python')

word_vectors = list(word_model.wv.vectors)
labels = list(word_model.wv.vocab)
kutils.viz_vectors(word_vectors, labels)
kutils.viz_vectors_corr(word_vectors, labels)
kutils.viz_vectors_lower_dim(word_vectors, labels)
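# A small, hypothetical follow-up illustrating the practical difference between
# the two models trained above: Word2Vec can only look up words seen during
# training, while FastText composes vectors for unseen words from character
# n-grams (gensim 3.x API, matching the size/iter parameters used above).
w2v_model = models.Word2Vec(text, size=50, min_count=1, iter=100)
ft_model = models.FastText(text, size=50, min_count=1, iter=100)

print(ft_model.wv['pythonic'])      # unseen word: vector built from char n-grams
try:
    print(w2v_model.wv['pythonic'])
except KeyError:
    print("'pythonic' is out of vocabulary for Word2Vec")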
def get_embedding_pro(df_raw,
                      sentence_id,
                      word_id,
                      emb_size=128,
                      window=10,
                      dropna=False,
                      n_jobs=4,
                      method='skipgram',
                      hs=0,
                      negative=10,
                      epoch=10,
                      return_model=False,
                      embedding_type='fasttext',
                      slide_window=1):
    """
    Now, set min_count=1 to avoid OOV... How to deal with oov in a more appropriate way...

    Parameters
    ----------
    df_raw: DataFrame containing columns named sentence_id, word_id and 'pt_d'
    sentence_id: like user ID, will be coerced into str
    word_id: like item ID, will be coerced into str
    emb_size: default 128
    dropna: default False, nans will be filled with 'NULL_zhangqibot'.
        If True, nans will all be dropped.
    n_jobs: number of CPUs to use, 4 as default
    method: 'sg'/'skipgram' or 'cbow'
        sg : {0, 1}, optional
            Training algorithm: 1 for skip-gram; otherwise CBOW.
    hs : {0, 1}, optional
        If 1, hierarchical softmax will be used for model training.
        If 0, and `negative` is non-zero, negative sampling will be used.
    negative : int, optional
        If > 0, negative sampling will be used, the int for negative specifies
        how many "noise words" should be drawn (usually between 5-20).
        If set to 0, no negative sampling is used.
    epoch: iter : int, optional, default 10
        Number of iterations (epochs) over the corpus.
    return_model: default False
    embedding_type: 'fasttext' or 'w2v'

    Example
    -------
    def run_w2v(sentence_id, word_id, emb_size=128):
        res_dict = w2v_pro(datalog, sentence_id=sentence_id, word_id=word_id,
                           emb_size=emb_size, dropna=False, n_jobs=-1,
                           method='cbow', hs=0, negative=10, epoch=10,
                           return_model=False)
        Cache.cache_data(res_dict,
                         nm_marker=f'EMB_DICT_W2V_CBOW_10EPOCH_{sentence_id}_{word_id}')

    sentence_id = 'user_id'
    for word_id in tqdm(['creative_id', 'ad_id', 'product_id', 'advertiser_id']):
        run_w2v(sentence_id, word_id, emb_size=128)
    run_w2v(sentence_id, word_id='product_category', emb_size=8)
    run_w2v(sentence_id, word_id='industry', emb_size=64)
    """
    if method.lower() in ['sg', 'skipgram']:
        sg = 1
    elif method.lower() in ['cbow']:
        sg = 0
    else:
        raise NotImplementedError
    list_col_nm = f'{sentence_id}__{word_id}_list'
    if (n_jobs is None) or (n_jobs <= 0):
        n_jobs = multiprocessing.cpu_count()
    print(f"========== W2V: {sentence_id} {word_id} ==========")
    df = df_raw[[sentence_id, word_id, 'pt_d']].copy()
    if df[sentence_id].isnull().sum() > 0:
        print("NaNs exist in sentence_id column!!")
    if dropna:
        df = df.dropna(subset=[sentence_id, word_id])
    else:
        df[word_id] = df[word_id].fillna(-1).astype(int).astype(str)
        df[sentence_id] = df[sentence_id].fillna(-1).astype(int).astype(str)
    df['pt_d_last'] = df['pt_d'] + slide_window
    fe = df.groupby([sentence_id, 'pt_d_last'
                     ])[word_id].apply(lambda x: list(x)).reset_index()
    fe.columns = [sentence_id, 'pt_d', list_col_nm]
    df = df.merge(fe, on=[sentence_id, 'pt_d'], how='left')
    df[list_col_nm] = df[list_col_nm].map(lambda x: x
                                          if isinstance(x, list) else [])
    # append the current row's own word
    df[word_id + '_add'] = df[word_id].map(lambda x: [x])
    df[list_col_nm] = df[list_col_nm] + df[word_id + '_add']
    sentences = df[list_col_nm].values.tolist()
    all_words_vocabulary = df[word_id].unique().tolist()
    del df[list_col_nm], df['pt_d_last'], df[word_id + '_add']
    gc.collect()
    if embedding_type == 'w2v':
        model = Word2Vec(
            sentences,
            size=emb_size,
            window=window,
            workers=n_jobs,
            min_count=1,  # minimum word frequency; min_count > 1 would create OOV words
            sg=sg,  # 1 for skip-gram; otherwise CBOW.
            hs=hs,  # If 1, hierarchical softmax will be used for model training
            negative=negative,  # number of "noise words" drawn for negative sampling
            iter=epoch,
            seed=0)
    else:
        model = models.FastText(sentences,
                                size=emb_size,
                                window=window,
                                workers=n_jobs,
                                seed=0,
                                sg=sg,
                                iter=epoch)
    # get word embedding matrix
    emb_dict = {}
    for word_i in all_words_vocabulary:
        if word_i in model.wv:
            emb_dict[word_i] = model.wv[word_i]
        else:
            emb_dict[word_i] = np.zeros(emb_size)
    # get sentence embedding matrix
    emb_matrix = []
    for seq in sentences:
        vec = []
        for w in seq:
            if w in model.wv:
                vec.append(model.wv[w])
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)
    emb_matrix = np.array(emb_matrix)
    emb_cols = []
    for i in range(emb_size):
        df[f'EMB_{embedding_type}_{sentence_id}_{word_id}_{slide_window}_emb_{i}'] = emb_matrix[:, i]
        emb_cols.append(
            f'EMB_{embedding_type}_{sentence_id}_{word_id}_{slide_window}_emb_{i}'
        )
    if not return_model:
        model = None
    return {
        "word_emb_dict": emb_dict,
        "sentence_emb_df": df[emb_cols],
        'model': model
    }
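# A hypothetical toy call of get_embedding_pro() above, just to show the shape
# of the expected input DataFrame (sentence_id, word_id and 'pt_d' columns) and
# of the returned dict. The 'w2v' branch is used here because the FastText
# branch keeps gensim's default min_count=5, which would prune such a tiny
# vocabulary to nothing.
import pandas as pd

toy_log = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2, 2],
    'ad_id':   [10, 11, 12, 10, 11, 13],
    'pt_d':    [1, 1, 2, 1, 2, 2],      # day partition used with slide_window
})
res = get_embedding_pro(toy_log, sentence_id='user_id', word_id='ad_id',
                        emb_size=8, window=5, epoch=2,
                        embedding_type='w2v', method='cbow')
print(res['word_emb_dict']['10'].shape)   # (8,); word ids are coerced to str
print(res['sentence_emb_df'].shape)       # (6, 8): one embedding per input row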
def trainer_fasttext(self, path):
    print('train fasttext')
    corpus = get_corpus(path, w2v=True)
    fast = models.FastText(corpus, size=300, window=3, min_count=2)
    return fast
def trainer(self):
    '''
    @description: Train tfidf, word2vec, fasttext and autoencoder
    @param {type} None
    @return: None
    '''
    logger.info('train tfidf')
    '''
    How to use TfidfVectorizer:
    stop_words: a list of words, e.g. ['word1', 'word2']
    max_df : maximum document frequency, e.g. with 5 documents in total, a word
             appearing in 4 of them has df=0.8; words above the threshold are filtered out
    min_df : same idea, as a lower bound
    ngram_range: tuple, (1, 2) counts both single words and two-word phrases
    fit(): Fit the vectorizer/model to the training data and save the vectorizer/model
           to a variable (returns sklearn.feature_extraction.text.TfidfVectorizer)
           accepted input:
           * ['doc1', 'doc2', ..] where each doc is already tokenized and joined by spaces; a list
           * a DataFrame column
    transform(): Use the variable output from fit() to transform validation/test data
                 (returns scipy.sparse.csr.csr_matrix)
    vocabulary_: the mapping from terms to column indices
    ref: https://stackoverflow.com/questions/53027864/what-is-the-difference-between-tfidfvectorizer-fit-transfrom-and-tfidf-transform
         https://blog.csdn.net/blmoistawinde/article/details/80816179
    '''
    count_vect = TfidfVectorizer(stop_words=self.stopWords,
                                 max_df=0.4,
                                 min_df=0.001,
                                 ngram_range=(1, 2))
    self.tfidf = count_vect.fit(self.data["text"])  # a DataFrame column is accepted directly
    '''
    Word2Vec parameter notes:
    sg=1 selects skip-gram, which is more sensitive to infrequent words; the default sg=0 is CBOW.
    size is the dimensionality of the output word vectors: too small and collisions hurt quality,
        too large and memory use and training time grow; typical values are 100-200.
    window is the maximum distance between the current and the predicted word in a sentence;
        3 means looking 3-b words back and b words ahead (b drawn randomly from 0-3).
    min_count filters the vocabulary: words with frequency below min_count are ignored, default 5 (typically 0-100).
    sample (float) - threshold for randomly downsampling higher-frequency words, useful range (0, 1e-5).
    hs=1 means hierarchical softmax is used; with the default hs=0 and negative non-zero,
        negative sampling is used instead.
    workers is the number of worker threads controlling training parallelism; it only takes effect
        with the compiled Cython routines, otherwise a single core is used.
    train: total_examples (int) - count of sentences; report_delay (float) - seconds between progress reports
    ref: https://www.jianshu.com/p/b996e7e0d0b0
    '''
    logger.info('train word2vec')
    self.data['text'] = self.data["text"].apply(
        lambda x: x.split(' '))  # [[word1, word2], [wordn, ...]]
    self.w2v = models.Word2Vec(min_count=2,
                               window=5,
                               size=300,
                               sample=6e-5,
                               alpha=0.03,
                               min_alpha=0.0007,
                               negative=15,
                               workers=4,
                               iter=30,
                               max_vocab_size=50000)
    self.w2v.build_vocab(self.data["text"])
    self.w2v.train(self.data["text"],
                   total_examples=self.w2v.corpus_count,
                   epochs=15,
                   report_delay=1)

    logger.info('train fast')
    # train the FastText word vectors
    self.fast = models.FastText(
        self.data["text"],
        size=300,        # vector dimensionality
        window=3,        # sliding window size
        alpha=0.03,
        min_count=2,     # prune words below this frequency; raising it shrinks the vocabulary
        iter=30,         # number of training epochs
        max_n=3,
        word_ngrams=2,
        max_vocab_size=50000)
    '''
    About gensim: https://zhuanlan.zhihu.com/p/37175253
    Gensim is an open-source Python toolkit for learning latent topic-vector representations
    of text, unsupervised, from raw unstructured documents.
    1. corpora.Dictionary object: the word <-> id mapping.
       Dictionary(documents=None, prune_at=2000000) encapsulates the mapping between
       normalized words and integer ids. Its main method is doc2bow, which converts a list
       of words into a bag-of-words representation: a list of 2-tuples (word_id, word_frequency).
       If documents is given, it is used to initialize the dictionary (see add_documents()).
       Input here: data.text, a nested list [[doc1_A, doc1_B], [doc2_C, doc2_D]].
    2. LdaMulticore: https://radimrehurek.com/gensim/models/ldamulticore.html
    '''
    logger.info('train lda')
    self.id2word = gensim.corpora.Dictionary(
        self.data.text)  # build the dictionary indexing the corpus vocabulary
    corpus = [self.id2word.doc2bow(text)
              for text in self.data.text]  # sparse bag-of-words vectors
    self.LDAmodel = LdaMulticore(corpus=corpus,
                                 id2word=self.id2word,
                                 num_topics=30,
                                 workers=4,
                                 chunksize=4000,
                                 passes=7,
                                 alpha='asymmetric')

    # lstm
    logger.info('train autoencoder')
    self.ae.train(self.data)
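# A small, self-contained sketch of the TfidfVectorizer fit()/transform()/vocabulary_
# behaviour described in the comments above; the toy documents are made up for
# illustration only.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["machine learning is fun", "python is fun", "reading books is fun"]
vect = TfidfVectorizer(ngram_range=(1, 2)).fit(docs)   # learns vocabulary and idf weights
matrix = vect.transform(docs)                          # scipy.sparse.csr_matrix, one row per doc
print(matrix.shape)        # (3, number of uni- and bi-grams)
print(vect.vocabulary_)    # term -> column index, e.g. {'machine': ..., 'machine learning': ...}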
df = pd.read_sql(
    'wikileaks',
    con,
)


class MySentences(object):
    def __init__(self):
        self.con = create_engine(
            'sqlite://///home/luiztheodoro/Documentos/mestrado/iri/trab_final/wikileaks.sqlite',
            echo=False)
        self.df = pd.read_sql(
            'wikileaks',
            self.con,
        )

    def __iter__(self):
        for index, row in self.df.iterrows():
            yield row['lista'].split(" ")


sentences = list(MySentences())

print("Generating FastText...")
model_ft = models.FastText(size=100)
model_ft.build_vocab(sentences)
model_ft.train(sentences=sentences,
               epochs=model_ft.epochs,
               total_examples=model_ft.corpus_count,
               total_words=model_ft.corpus_total_words)
model_ft.save(
    '/home/luiztheodoro/Documentos/mestrado/iri/trab_final/wikileaks.ft')
# turn our tokenized documents into an id <-> term dictionary
if not os.path.isfile('./dictionary.dict'):
    print('Turn our tokenized documents into an id <-> term dictionary ...', end='')
    sys.stdout.flush()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./dictionary.dict')
else:
    print('Loading id <-> term dictionary from ./dictionary.dict ...', end='')
    sys.stdout.flush()
    dictionary = corpora.Dictionary.load('./dictionary.dict')
print(' Done!')

# ignore words that appear in fewer than 2 documents or in more than 50% of documents
print("Filtering less and more frequent words ...")
dictionary.filter_extremes(no_below=2, no_above=0.5)
for i, text in enumerate(texts):
    filtered_text = []
    for w in text:
        if w in dictionary.token2id:
            filtered_text.append(w)
    texts[i] = filtered_text
del dictionary

# Learn the FastText model
print('Learning the FastText model ...', end='')
sys.stdout.flush()
fasttextmodel = models.FastText(texts, size=NUM_TOPICS, workers=8, iter=25, window=8)
fasttextmodel.save('fasttextmodel' + str(NUM_TOPICS) + '.fasttext')
print(' Done!')