from gensim.corpora import Dictionary


def filtrar_extremos(docs, max_freq=0.5, min_wordcount=2, n_top=3):
    dictionary = Dictionary(docs)
    # Drop tokens in fewer than `min_wordcount` documents or in more than `max_freq` of them.
    dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)
    # Additionally drop the `n_top` most frequent remaining tokens.
    dictionary.filter_n_most_frequent(n_top)
    _ = dictionary[0]  # force construction of the id2token mapping
    return dictionary
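A minimal sketch of calling filtrar_extremos, just to show the expected input shape (a list of token lists); the toy documents are invented for illustration:

# Toy documents, invented for illustration; real input is any list of token lists.
docs = [
    ["cat", "sat", "mat", "cat"],
    ["dog", "sat", "log", "dog"],
    ["cat", "dog", "bird", "sat"],
]
dictionary = filtrar_extremos(docs, max_freq=0.9, min_wordcount=1, n_top=1)
print(dictionary.token2id)  # surviving tokens and their (re-assigned) ids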
def preprocess_dict(self, dictionary: corpora.Dictionary) -> corpora.Dictionary:
    # TODO: decouple
    dictionary.filter_n_most_frequent(5)
    # FIXME: when NO_BELOW or NO_ABOVE change, this will not automatically recreate the dict
    dictionary.filter_extremes(no_below=self.no_below, no_above=self.no_above)
    return dictionary
def set_dictionary(self, language_processed_data: list, no_below: int, no_above: float,
                   n_most_frequent: int, dictionary_file_path):
    logging.info("---- Creating dictionary from processed data")
    dic = Dictionary(language_processed_data)
    dic.filter_n_most_frequent(n_most_frequent)
    dic.filter_extremes(no_below=no_below, no_above=no_above)
    dic.save(dictionary_file_path)
    self.dictionary = dic
    logging.info("---- Dictionary is created")
    return
from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel


def lsa(corpus, size=8):
    dic = Dictionary(corpus)
    dic.filter_extremes(no_below=5, no_above=0.8)
    dic.filter_n_most_frequent(remove_n=10)
    dic.compactify()  # reassign ids to fill the gaps left by filtering
    index_corpus = [dic.doc2bow(sent) for sent in corpus]
    tfidf = TfidfModel(index_corpus, dictionary=dic)
    normed_corpus = [tfidf[sent] for sent in index_corpus]
    lsi = LsiModel(normed_corpus, num_topics=size)
    return [[x[1] for x in lsi[sent]] for sent in normed_corpus]
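A quick smoke test for lsa on an invented corpus; note that gensim's LSI may omit near-zero topic weights, so the inner lists can come back shorter than size:

import random

random.seed(0)
vocab = ["w%d" % i for i in range(30)]
# 40 invented documents, varied enough that filter_extremes leaves tokens behind.
sentences = [random.choices(vocab, k=10) for _ in range(40)]
vectors = lsa(sentences, size=2)
print(len(vectors), [len(v) for v in vectors[:3]])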
def process_dict(train_texts, doc_len):
    dictionary = Dictionary(train_texts)
    print('dict size:', len(dictionary))
    # Remove extremes.
    no_below = int(doc_len * 0.008)
    filter_freq = int(doc_len * 0.2)
    print('no_below,filter_freq:', no_below, filter_freq)
    # Keep only words appearing in at least 0.8% of documents (no_below is an absolute count).
    dictionary.filter_extremes(no_below=no_below)
    # Remove the `filter_freq` most frequent tokens; note this is 20% of doc_len, not of the vocabulary.
    dictionary.filter_n_most_frequent(filter_freq)
    # filter_tokens(bad_ids=None, good_ids=None) could drop specific ids instead.
    return dictionary
def get_corpus_and_dict(data_path):
    print("[BLOCK] Getting corpus and dictionary files from %s" % (data_path))
    sys.stdout.flush()
    file_paths, files_list = get_lists(data_path)
    print("[BLOCK] Building dictionary with %s documents" % len(files_list))
    sys.stdout.flush()
    dictionary = Dictionary(files_list)
    print("[BLOCK] Filtering out %s (0.1)" % (int(len(dictionary) * 0.1)))
    sys.stdout.flush()
    # Drop the most frequent 10% of the vocabulary.
    dictionary.filter_n_most_frequent(int(len(dictionary) * 0.1))
    # Convert tokenized documents into a document-term matrix.
    corpus = [dictionary.doc2bow(doc) for doc in files_list]
    return corpus, dictionary
def preprocess_docs(docs):
    '''Preprocess all the documents and create dict + corpus.'''
    # Pre-process the documents.
    docs = [preprocess_doc(doc) for doc in docs]
    # Remove rare and common tokens.
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    # Remove the 5 most frequent remaining tokens.
    dictionary.filter_n_most_frequent(5)
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return corpus, docs, dictionary
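A plausible driver for preprocess_docs, feeding its outputs straight into an LDA model; raw_docs and the topic count are placeholders, not the original author's code:

from gensim.models import LdaModel

# Hypothetical driver; `raw_docs` is an invented stand-in for the caller's raw documents.
corpus, docs, dictionary = preprocess_docs(raw_docs)
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
print(lda.print_topics(5))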
def testFilterMostFrequent(self):
    d = Dictionary(self.texts)
    d.filter_n_most_frequent(4)
    expected = {0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2}
    self.assertEqual(d.dfs, expected)
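The test depends on its fixture's self.texts; to watch filter_n_most_frequent reshape dfs interactively, here is a sketch using gensim's bundled common_texts (exact ids and counts depend on the gensim version, so nothing is asserted):

from gensim.corpora import Dictionary
from gensim.test.utils import common_texts

d = Dictionary(common_texts)
print(sorted(d.dfs.items()))  # document frequency per token id, before filtering
d.filter_n_most_frequent(4)   # drop the 4 tokens with the highest document frequency
print(sorted(d.dfs.items()))  # ids are compacted, so the surviving ids are renumbered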
def create_dict(corpus, NUM_TOPICS=5, filter_n_most_freq=10):
    dictionary = Dictionary(corpus)
    dictionary.filter_extremes(no_below=round(0.1 / NUM_TOPICS * dictionary.num_docs))
    # Use the parameter instead of the original hard-coded 10.
    dictionary.filter_n_most_frequent(filter_n_most_freq)
    return dictionary
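With the default NUM_TOPICS=5 the no_below threshold comes out to round(0.02 * num_docs), and filter_extremes still applies its default no_above=0.5; a sketch on an invented corpus:

# Hypothetical corpus of token lists; with 5 topics the threshold is 2% of num_docs.
corpus = [["token", str(i % 7)] for i in range(100)]
dictionary = create_dict(corpus, NUM_TOPICS=5, filter_n_most_freq=2)
print(len(dictionary))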
    texts = [[token for token in line if not token.isnumeric()] for line in texts]
    # Remove words that are two or fewer characters long.
    texts = [[token for token in line if len(token) > 2] for line in texts]
    # Lemmatize (rather than stem, since stemming can reduce interpretability).
    lemmatizer = WordNetLemmatizer()
    texts = [[word for word in lemmatizer.lemmatize(' '.join(line), pos='v').split()] for line in texts]
    return texts


train_texts = process_texts(train_texts)
print('bigramed train_texts', len(train_texts))
dictionary = Dictionary(train_texts)
print('dict size:', len(dictionary))
# Remove extremes: words in fewer than 10 documents or in more than 10% of documents.
dictionary.filter_extremes(no_below=10, no_above=0.1)
# Filter out the 2000 most common tokens.
dictionary.filter_n_most_frequent(2000)
# filter_tokens(bad_ids=None, good_ids=None) could drop specific ids instead.
corpus = [dictionary.doc2bow(text) for text in train_texts]
print('corpus size:', len(corpus))
coherences = []

# LSI
'''
lsimodel = LsiModel(corpus=corpus, num_topics=1, id2word=dictionary)
# print(lsimodel.show_topics(num_topics=5))  # Showing only the top 5 topics
lsitopics = lsimodel.show_topics(formatted=False)
lsitopics = [[word for word, prob in topic] for topicid, topic in lsitopics]
lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=train_texts, dictionary=dictionary,
                               window_size=10).get_coherence()
print('LSI:', lsi_coherence)
coherences.append(lsi_coherence)
'''

# LDA
ldamodel = LdaModel(corpus=corpus, num_topics=1, id2word=dictionary)
    ion = []
    for w in i:
        if not w.is_stop and not w.is_punct and not w.like_num and w.text != 'I':
            ion.append(w.lemma_)
    txts.append(ion)

bigram = gensim.models.Phrases(txts)
txts = [bigram[line] for line in txts]

dictionary = Dictionary(txts)
# dictionary.filter_extremes(no_below=20, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in txts]

Counter(txts[1]).most_common(20)
len(dictionary)
dictionary.filter_n_most_frequent(2)
len(dictionary)

gensim.corpora.MmCorpus.serialize("D:/Google Drive/BAP/text_analysis/corpus.mm", corpus)  # SAVE
corpus = gensim.corpora.MmCorpus("D:/Google Drive/BAP/text_analysis/corpus.mm")  # LOAD

print(list(corpus))  # calling list() will convert any sequence to a plain Python list
print(corpus)
for doc in corpus:
    print(doc)

from gensim.models import LdaModel, LsiModel, HdpModel
import time

import jieba
from gensim.corpora import Dictionary
from gensim.models import LdaModel

time_start = time.time()

# Preprocess the data.
with open("toutiao_cat_data.txt", "r", encoding="utf-8") as f:
    # with open("test.txt", "r", encoding="utf-8") as f:
    data = []
    for line in f.readlines():
        line = line.strip()  # strip whitespace
        line = ','.join(line.split("_!_")[3:])  # split on the separator and drop the first three fields, which carry no text content
        data.append(jieba.lcut(line))

# Vectorize the text.
dictionary = Dictionary(data)  # counts how many documents each word appears in
dictionary.filter_n_most_frequent(200)  # filter out overly frequent words
corpus = [dictionary.doc2bow(text) for text in data]  # convert to bag-of-words vectors

# Train the model.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)  # 10 topics

# Get the topic-word distributions.
topic_list = lda.print_topics(20)
# print(topic_list)
for i in topic_list:
    print(i)


def pre(data):
    'Get the topic distribution of a given document.'
    print(data)
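The pre helper above is truncated; as a sketch of the step its docstring gestures at, one can get a new document's topic mixture from the trained model like this (the function name and wiring are mine, not the original author's):

def topic_distribution(text, dictionary, lda):
    # Tokenize with jieba, map into the trained dictionary's vocabulary,
    # then ask the LDA model for (topic id, probability) pairs.
    bow = dictionary.doc2bow(jieba.lcut(text))
    return lda.get_document_topics(bow)

print(topic_distribution("示例新闻标题", dictionary, lda))  # any headline string works here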
import gc
import multiprocessing

import more_itertools
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import FastText


class FasttextTfIdfTransformer:
    def __init__(self, model=None, dictionary=None, corpus_file=None, size=256, window=7,
                 min_count=4, iter=30, min_n=4, max_n=5, word_ngrams=1, no_above=0.5,
                 filter_n_most_frequent=100, do_filter_tokens=True,
                 workers=multiprocessing.cpu_count() - 1, ft_prefix="ft_", token_column=None,
                 inplace=True, store_train_data=False, skip_fit=False, skip_transform=False,
                 normalize_word_vectors=True):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.iter = iter
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams
        self.workers = workers
        self.token_column = token_column
        assert type(self.token_column) == str
        self.ft_prefix = ft_prefix
        self.skip_fit = skip_fit
        self.skip_transform = skip_transform
        self.inplace = inplace
        self.normalize_word_vectors = normalize_word_vectors
        self.store_train_data = store_train_data
        self.train = None
        self.model = model
        self.no_above = no_above
        self.word_set = None
        self.filter_n_most_frequent = filter_n_most_frequent
        self.do_filter_tokens = do_filter_tokens
        self.dictionary = dictionary
        if model is None and corpus_file is not None:
            # load_list_per_line is an external helper from the source project,
            # presumably reading one whitespace-tokenized document per line.
            self.dictionary = Dictionary(map(lambda s: s.split(), load_list_per_line(corpus_file)))
            print("Total Unique Tokens = %s" % (len(self.dictionary)))
            self.dictionary.filter_extremes(no_below=self.min_count, no_above=self.no_above, keep_n=1000000)
            self.dictionary.filter_n_most_frequent(self.filter_n_most_frequent)
            print("Total Unique Tokens after filtering = %s" % (len(self.dictionary)))
            self.word_set = set(self.dictionary.values())
            self.model = FastText(corpus_file=corpus_file, size=self.size, window=self.window,
                                  min_count=self.min_count, iter=self.iter, min_n=self.min_n,
                                  max_n=self.max_n, word_ngrams=self.word_ngrams,
                                  workers=self.workers, bucket=8000000, alpha=0.03,
                                  negative=10, ns_exponent=0.5)
        if (model is None or dictionary is None) and corpus_file is None:
            raise ValueError("No data given to initialise FastText Model")
        assert self.dictionary is not None and self.model is not None

    def fit(self, X, y='ignored'):
        gc.collect()
        if self.store_train_data:
            self.train = (X, y)
        if self.skip_fit:
            return self
        if type(X) == pd.DataFrame:
            X = X[self.token_column].values
        else:
            raise ValueError()
        assert self.dictionary is not None and self.model is not None
        self.dictionary.add_documents(X)
        dct = self.dictionary
        print("Total Unique Tokens = %s" % (len(dct)))
        dct.filter_extremes(no_below=self.min_count, no_above=self.no_above, keep_n=1000000)
        dct.filter_n_most_frequent(self.filter_n_most_frequent)
        print("Total Unique Tokens after filtering = %s" % (len(dct)))
        self.word_set = set(dct.values())
        print("FastText Modelling Started at %s" % (str(pd.datetime.now())))
        self.model.build_vocab(X, update=True)
        self.model.train(X, total_examples=self.model.corpus_count, epochs=self.model.epochs)
        print("FastText Modelling done at %s" % (str(pd.datetime.now())))
        print("FastText Vocab Length = %s, Ngrams length = %s" %
              (len(self.model.wv.vectors_ngrams), len(self.model.wv.vectors_vocab)))
        gc.collect()
        return self

    def fit_stored(self):
        X, y = self.train
        return self.fit(X, y)

    def partial_fit(self, X, y=None):
        self.fit(X, y='ignored')

    def transform_one(self, token_array):
        tokens2vec = [self.model.wv[token] if token in self.model.wv else np.full(self.size, 0)
                      for token in token_array]
        if np.sum(tokens2vec) == 0:
            return np.full(self.size, 0)
        return np.average(tokens2vec, axis=0)

    def transform(self, X, y='ignored'):
        print("Fasttext Transforms start at: %s" % (str(pd.datetime.now())))
        if self.skip_transform:
            return X
        if type(X) == pd.DataFrame:
            Input = X[self.token_column].values
        else:
            raise ValueError()
        if not self.inplace:
            X = X.copy()
        uniq_tokens = set(more_itertools.flatten(Input))
        print("Number of Unique Test Tokens for Fasttext transform %s" % len(uniq_tokens))
        if self.do_filter_tokens:
            uniq_tokens = uniq_tokens.intersection(self.word_set)
            print("Number of Unique Test Tokens after filtering for Fasttext transform %s" % len(uniq_tokens))
        empty = np.full(self.size, 0)
        token2vec = {k: self.model.wv[k] if k in self.model.wv else empty for k in uniq_tokens}
        token2vec = {k: v / np.linalg.norm(v) for k, v in token2vec.items()}

        def tokens2vec(token_array):
            empty = np.full(self.size, 0)
            if len(token_array) == 0:
                return empty
            return [token2vec[token] if token in uniq_tokens else empty for token in token_array]

        ft_vecs = list(map(tokens2vec, Input))
        # Average token vectors per document; was np.full(300, 0), which broke whenever size != 300.
        results = list(map(lambda x: np.average(x, axis=0) if np.sum(x) != 0 else np.full(self.size, 0), ft_vecs))
        text_df = pd.DataFrame(list(map(list, results)))
        text_df.columns = [self.ft_prefix + str(i) for i in range(0, self.size)]
        text_df.index = X.index
        X[list(text_df.columns)] = text_df
        gc.collect()
        print("Fasttext Transforms done at: %s" % (str(pd.datetime.now())))
        return X

    def inverse_transform(self, X, copy=None):
        raise NotImplementedError()

    def fit_transform(self, X, y='ignored'):
        self.fit(X)
        return self.transform(X)
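A hypothetical smoke test of the transformer, assuming gensim 3.x (where FastText still accepts size/iter) and an invented corpus.txt with one whitespace-tokenized document per line; every name below is illustrative:

import pandas as pd

# All data and file names here are invented for illustration.
df = pd.DataFrame({"tokens": [["fast", "text", "demo"], ["another", "tiny", "demo"]]})
transformer = FasttextTfIdfTransformer(corpus_file="corpus.txt", token_column="tokens",
                                       size=64, iter=5, min_count=1, filter_n_most_frequent=0)
df_vec = transformer.fit(df).transform(df)
print([c for c in df_vec.columns if c.startswith("ft_")][:5])  # ft_0 ... ft_4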