def train(args, output_dir):
    """Build the corpus, train the DTM, and save the model to the output dir."""
    corpus = Corpus()

    # Create the dictionary.
    dictionary = Dictionary(corpus.debates.bag_of_words)
    dictionary.filter_extremes(no_below=100)

    # Save empirical term distribution within each time step.
    term_counts = corpus2csc(
        corpus.debates.groupby('year').agg({
            'bag_of_words': 'sum'
        }).bag_of_words.apply(dictionary.doc2bow))
    save_npz(os.path.join(output_dir, 'term_counts.npz'), term_counts)

    # Train and save dtm.
    time_slices = corpus.debates.groupby('year').size()
    dtm_corpus = corpus.debates.bag_of_words.apply(dictionary.doc2bow)
    model = Dtm(args.executable,
                corpus=dtm_corpus,
                id2word=dictionary,
                num_topics=args.num_topics,
                time_slices=time_slices.values,
                rng_seed=args.random_seed)
    model.save(os.path.join(output_dir, 'dtm.gensim'))
class EnronCorpus(TextCorpus):
    def __init__(self, root_name, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
        Initialize the corpus. This scans through all the emails once, to determine
        the corpus vocabulary. (Only the first `keep_words` most frequent words that
        appear in at least `no_below` documents are kept.)
        """
        self.root_name = root_name
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
        Walk the file system, strip punctuation, normalize all numbers to be '2'.
        """
        filenames = walk_os(self.root_name)
        opened_files = gen_open(filenames)
        stripped_files = strip_punct(opened_files)
        length = 0
        for email in stripped_files:
            if len(email) > ARTICLE_MIN_CHARS:
                length += 1
                print('Iteration: %i' % length)
                yield tokenize(email)
        self.length = length  # cache corpus length
def make_item_descriptions(max_sentence_length=None):
    descriptions = pd.read_csv(os.path.join('data', 'descriptions.csv')).rename(columns={'movie': 'item'})
    texts = descriptions.description
    texts = texts.apply(lambda x: x.strip().split())

    dictionary = Dictionary(texts.values)
    dictionary.filter_extremes()
    eos_id = len(dictionary.keys())

    # to index list
    texts = texts.apply(lambda x: dictionary.doc2idx(x, unknown_word_index=eos_id))
    texts = texts.apply(lambda x: np.array([a for a in x if a != eos_id]))
    max_sentence_length = max(texts.apply(len)) if max_sentence_length is None else min(
        max(texts.apply(len)), max_sentence_length)

    # padding
    texts = texts.apply(lambda x: x[:max_sentence_length])
    texts = texts.apply(lambda x: np.pad(x, (0, max_sentence_length - len(x)), 'constant',
                                         constant_values=(0, eos_id)))

    # change types
    texts = texts.apply(lambda x: x.astype(np.int32))
    descriptions.id = descriptions.id.astype(np.int32)
    return descriptions.id.values, texts.values, len(dictionary.keys()) + 1
def build_dictionary():
    dictionary = Dictionary()
    for line in open(wiki_index.ARTICLES_FILE):
        dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
def fill_dictionary(self, prune_at=2000000):
    """
    Update dictionary from a collection of documents. Each document is a list
    of tokens = **tokenized and normalized** strings (either utf8 or unicode).

    This is a convenience wrapper for calling `doc2bow` on each document
    with `allow_update=True`, which also prunes infrequent words, keeping the
    total number of unique words <= `prune_at`. This is to save memory on very
    large inputs. To disable this pruning, set `prune_at=None`.
    """
    if self.metadata:
        dictionary = Dictionary()
        for docno, item in enumerate(self.get_texts()):
            title, document = item
            self.titles.append(title)
            # log progress & run a regular check for pruning, once every 10k docs
            if docno % 10000 == 0:
                if prune_at is not None and len(dictionary) > prune_at:
                    dictionary.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                logger.info("adding document #%i to %s", docno, dictionary)
            # update Dictionary with the document
            dictionary.doc2bow(document, allow_update=True)
        logger.info("built %s from %i documents (total %i corpus positions)",
                    dictionary, dictionary.num_docs, dictionary.num_pos)
        return dictionary
    else:
        return Dictionary(self.get_texts())
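# The docstring above describes building a Dictionary incrementally with
# doc2bow(..., allow_update=True) and pruning it periodically via filter_extremes.
# Below is a minimal, self-contained sketch of that pattern; the toy documents
# and the `prune_at` value are illustrative, not taken from the original corpus.
from gensim.corpora import Dictionary

toy_docs = [["human", "machine", "interface"],
            ["survey", "of", "user", "opinion"],
            ["eps", "user", "interface", "system"]]

prune_at = 1000  # hypothetical cap on vocabulary size
toy_dictionary = Dictionary()
for docno, document in enumerate(toy_docs):
    # prune periodically so the vocabulary never grows beyond `prune_at`
    if docno % 10000 == 0 and prune_at is not None and len(toy_dictionary) > prune_at:
        toy_dictionary.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
    # allow_update=True adds unseen tokens to the dictionary while vectorizing
    toy_dictionary.doc2bow(document, allow_update=True)

print(toy_dictionary)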
def main():
    docs = get_train('D:/ByResearch/基于文本的原油油价预测/20200615code/code/SeaNMF-master/data/wedata.txt')
    docs = [s.strip().split() for s in docs]

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=10, no_above=0.2)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # only to "load" the dictionary.
    id2word = dictionary.id2token

    PMI = []
    for i in range(2, 11):
        print(i)
        lda_model = LdaModel(corpus=corpus, id2word=id2word, iterations=100, num_topics=i)
        # Print the keywords for each topic.
        print(lda_model.print_topics())
        coherence_model_lda = CoherenceModel(model=lda_model, texts=docs,
                                             dictionary=dictionary, coherence='c_uci')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)
        del lda_model
        PMI.append(coherence_lda)
    print(PMI)
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """Create the word dictionary, BOW corpus, and TF-IDF corpus from the cables source."""
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None  # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
def create_lda_model(data_csv, num_topics):
    custom_texts = []
    # for i in range(0, 30):
    for i in range(0, len(data_csv)):
        # add context vocab to dict
        context = data_csv['Context'][i]
        # hard-coded condition to train only contexts corresponding to first 20000 questions
        # if (context[: 42] == "Agricultural production is concentrated on"):
        #     break
        context = context.lower()
        context = context.replace("\'s", '')
        context = context.replace("\'", '')
        lst_words_context = re.findall(r"[\w']+|[.,!?;]", context)
        words = [w for w in lst_words_context if w not in stop_words]  # remove stopwords
        words = [word for word in words if word.isalpha()]  # remove punctuation
        custom_texts.append(words)

    custom_dict = Dictionary(custom_texts)
    custom_dict.filter_extremes(no_below=1, no_above=0.3)
    custom_corpus = [custom_dict.doc2bow(text) for text in custom_texts]

    # Train the model on the corpus.
    lda = models.LdaModel(custom_corpus, num_topics=num_topics, id2word=custom_dict)
    return lda, custom_dict
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """
    :rtype: gensim.corpora.dictionary.Dictionary
    :param corpora:
    :param stopwords:
    :param allowed_pos:
    :param max_doc:
    :return:
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []
    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue
        print('\r', count, '/', corpus_num, end='')
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)
        # convert compound word into one token
        corpus = convert_compound(corpus)
        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)
    print('\n')

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
def create_gensim_dict_corpus(docs_raw, num_below, num_above, num_features):
    '''
    Create corpus to be used to determine optimal number of components
    using the gensim package.
    '''
    gensim_dict = Dictionary(docs_raw)
    gensim_dict.filter_extremes(no_below=num_below, no_above=num_above, keep_n=num_features)
    corpus = [gensim_dict.doc2bow(doc) for doc in docs_raw]
    return gensim_dict, corpus
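# A hypothetical call to create_gensim_dict_corpus above, assuming docs_raw is a
# list of token lists; the parameter values here are illustrative only.
docs_raw = [["topic", "modeling", "with", "gensim"],
            ["building", "a", "gensim", "dictionary"],
            ["dictionary", "and", "corpus", "for", "lda"]]
gensim_dict, corpus = create_gensim_dict_corpus(docs_raw, num_below=1,
                                                num_above=0.9, num_features=10000)
print(len(gensim_dict))  # vocabulary size after filtering
print(corpus[0])         # bag-of-words vector for the first document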
def _make_property(self, review_dict_list: list) -> tuple:
    """
    review_dict's keys are 'date', 'star', 'vote', 'name', 'title' and 'review'
    """
    reviews = OrderedDict()
    for idx, review_dict in enumerate(review_dict_list):
        review = normalize(review_dict['review'])
        reviews[idx] = review
    text_list = [[term.word for term in self._tokenizer.get_baseforms(review)]
                 for review in reviews.values()]
    dictionary = Dictionary(text_list)
    dictionary.filter_extremes(no_below=1, no_above=0.6)
    corpus = [dictionary.doc2bow(words) for words in text_list]
    return corpus, dictionary
def preprocess_text(docs):
    num_task = os.cpu_count()
    len_slices = len(docs) // num_task
    remainder_slices = len(docs) % num_task

    texts = []
    stoplist = set(stopwords.words('english'))
    wn.ensure_loaded()

    t_start = time.perf_counter()
    with ProcessPoolExecutor(max_workers=num_task) as executor:
        futures_tokenize = []
        for n in range(0, num_task):
            upper_bound = (n + 1) * len_slices
            if n == num_task - 1:
                upper_bound = (n + 1) * len_slices + remainder_slices
            print(n, upper_bound)
            futures_tokenize.append(executor.submit(preprocess_tokenize,
                                                    docs[n * len_slices:upper_bound],
                                                    stoplist))
        for future in concurrent.futures.as_completed(futures_tokenize):
            texts += future.result()
    t_stop = time.perf_counter()
    print("removed stopwords and lemmatized in {} s".format(t_stop - t_start))

    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phraser(Phrases(texts, min_count=20))
    for idx in range(len(texts)):
        for token in bigram[texts[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                texts[idx].append(token)
    print("Done bigrams")

    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=30, no_above=0.5)
    dictionary.filter_tokens(bad_ids=[dictionary.token2id["like"]])
    special_tokens = {'_pad_': 0}
    dictionary.patch_with_special_tokens(special_tokens)
    return texts, dictionary
def compute_coherence_values(self, kmin, kmax, kstep):
    """
    Compute c_v coherence for various numbers of topics

    Parameters:
    ----------
    kmin : The minimum number of topics
    kmax : Max num of topics
    kstep : The step size of the topics

    Returns:
    -------
    k_values : The number of topics used.
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    topic_list : The list of topics.
    """
    dictionary = Dictionary(self.docs)
    dictionary.filter_extremes(no_below=10, no_above=0.2)
    corpus = [dictionary.doc2bow(doc) for doc in self.docs]

    k_values = []
    coherence_values = []
    topic_list = []
    for num_topics in range(kmin, kmax + 1, kstep):
        # Progress indicator, so long runs are visible.
        print("num_topics:\t" + str(num_topics))
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        coherencemodel = CoherenceModel(model=model, texts=self.docs,
                                        dictionary=dictionary, coherence='c_v')
        coherence_lda = coherencemodel.get_coherence()
        coherence_values.append(coherence_lda)
        topic_list.append(model.show_topics(num_topics=num_topics, num_words=20,
                                            log=False, formatted=True))
        k_values.append(num_topics)
    return k_values, coherence_values, topic_list
def get_classif_perf(theta, tokens, labels, embeds, methods=['theta', 'lda', 's-bert', 'tfidf']):
    # print('Checking inputs dim for classif:', len(theta), len(labels))
    import pandas as pd
    perf = []
    if 'theta' in methods:
        X = theta
        perf.append(train_predict(X, labels))
    if 'lda' in methods:
        corpus = tokens.tolist()
        corpus = [[str(w) for w in d[0]] for d in corpus]
        dictionary = Dictionary(corpus)
        bow_corpus = [dictionary.doc2bow(x) for x in corpus]
        mod = LdaModel(bow_corpus, num_topics=theta.shape[1])
        transcorp = mod[bow_corpus]
        X = transcorp2matrix(transcorp, bow_corpus, theta.shape[1])
        perf.append(train_predict(X, labels))
    if 's-bert' in methods:
        from sklearn.decomposition import PCA
        X = PCA(n_components=theta.shape[1]).fit_transform(embeds)
        perf.append(train_predict(X, labels))
    if 'tfidf' in methods:
        corpus = tokens.tolist()
        corpus = [[str(w) for w in d[0]] for d in corpus]
        dictionary = Dictionary(corpus)
        dictionary.filter_extremes(keep_n=theta.shape[1])
        bow_corpus = [dictionary.doc2bow(x) for x in corpus]
        mod = TfidfModel(bow_corpus, dictionary=dictionary)
        corpus_tfidf = mod[bow_corpus]
        X = corpus2dense(corpus_tfidf, num_terms=theta.shape[1]).T
        perf.append(train_predict(X, labels))
    perf = pd.DataFrame(perf, index=methods)
    print('Model performance on classification:\n{}'.format(perf))
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except:
        vector_model = LsiModel(corpus=RCV1BowCorpus(), num_topics=100, id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train, train_targets=rcv1_train_target,
                           get_features=get_lsi_features, classifier="sgd")

    evaluate_classifier(clf, rcv1_test, rcv1_test_target, get_features=get_lsi_features)
class FolderCorpus(corpora.TextCorpus):
    def __init__(self, filepaths, preprocess=[], dictionary=None):
        self.filepaths = filepaths
        self.preprocess = preprocess
        self.metadata = None

        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts())
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
        self.dictionary.compactify()

    def get_texts(self):
        for path in self.filepaths:
            with codecs.open(path, encoding='utf8') as f:
                raw_text = f.read()
                raw_text = raw_text.lower()
                for filt in self.preprocess:
                    raw_text = filt(raw_text)
                text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
                yield text
def further_preprocessing_phase(temp_data_frame):
    temp_data_frame['text'] = temp_data_frame['text'].apply(
        lambda text: th.tokenize_text(text) if text is not None else '')
    # textlist = temp_data_frame['text'].to_numpy()
    textlist = temp_data_frame['text'].tolist()

    # if it raises an exception it could be because of the empty texts
    patent_dictionary = Dictionary(textlist)
    corpus = [patent_dictionary.doc2bow(text) for text in textlist]

    print('original dictionary size: ', len(patent_dictionary))

    vocab_tf = {}
    for i in corpus:
        for item, count in dict(i).items():
            if item in vocab_tf:
                vocab_tf[item] += int(count)
            else:
                vocab_tf[item] = int(count)

    remove_ids = []
    no_of_ids_below_limit = 0
    for id, count in vocab_tf.items():
        if count <= 5:
            remove_ids.append(id)
    patent_dictionary.filter_tokens(bad_ids=remove_ids)

    patent_dictionary.filter_extremes(no_below=0)
    patent_dictionary.filter_n_most_frequent(30)

    print('parsed dictionary size: ', len(patent_dictionary))

    vocabulary = list(patent_dictionary.token2id.keys())

    ids_list = []
    data_frame = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
    temp_data_frame.apply(lambda row: shrink_vocabulary(row, vocabulary, data_frame, ids_list), axis=1)
    print(len(ids_list))
    data_frame.set_index(data_frame['patent_id'], inplace=True)
    data_frame.drop(ids_list, axis=0, inplace=True)
    return data_frame
def create_LDA_model(coursesList):
    warnings.filterwarnings('ignore')
    text_clean = [doc.split(' ') for doc in coursesList['description']]
    bigrams, trigrams = create_n_grams(text_clean)
    text_clean = add_n_grams(text_clean, bigrams, trigrams)

    id2word = Dictionary(text_clean)
    id2word.filter_extremes(no_below=5, no_above=0.45)
    corpus = [id2word.doc2bow(text) for text in text_clean]

    num_topics = config.num_lda_topic
    lda_model = LDA(corpus=corpus, id2word=id2word, num_topics=num_topics,
                    random_state=42, alpha='asymmetric', passes=25)
    lda_model.save("./best_model.lda")

    coherence_model_c_v = CoherenceModel(model=lda_model, texts=text_clean,
                                         dictionary=id2word, coherence='c_v')
    c_v = coherence_model_c_v.get_coherence()

    term_topic_mat = lda_model.get_topics()
    aver_cosine_similarities = 0
    for i in range(0, (num_topics - 1)):
        cosine_similarities = linear_kernel(term_topic_mat[i].reshape(1, -1),
                                            term_topic_mat[i + 1:]).flatten()
        aver_cosine_similarities += sum(cosine_similarities)
    if num_topics != 1:
        aver_cosine_similarities = aver_cosine_similarities / (num_topics * (num_topics - 1) / 2)
    print(c_v)
    print(aver_cosine_similarities)

    create_vector_topics(lda_model, corpus, id2word, coursesList)

    visual_data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(visual_data, 'topics.html')
    return lda_model, id2word, bigrams, trigrams
def LDA_model(corpus_Quran, corpus_NT, corpus_OT):
    # run LDA on the entire set of verses from all corpora
    total_corpus = corpus_Quran + corpus_NT + corpus_OT
    dictionary = Dictionary(total_corpus)
    dictionary.filter_extremes(no_below=50, no_above=0.1)
    corpus = [dictionary.doc2bow(text) for text in total_corpus]
    lda = LdaModel(corpus, num_topics=20, id2word=dictionary, random_state=1)

    # compute document-topic probability for Quran
    # (vectorize with the same dictionary the model was trained on, so term ids match)
    corpus1 = [dictionary.doc2bow(text) for text in corpus_Quran]
    topics_Quran = lda.get_document_topics(corpus1)
    topic_dic_Quran = {}
    for doc in topics_Quran:
        for topic in doc:
            if topic[0] not in topic_dic_Quran.keys():
                topic_dic_Quran[topic[0]] = topic[1]
            else:
                topic_dic_Quran[topic[0]] += topic[1]

    # compute document-topic probability for OT
    corpus2 = [dictionary.doc2bow(text) for text in corpus_OT]
    topics_OT = lda.get_document_topics(corpus2)
    topic_dic_OT = {}
    for doc in topics_OT:
        for topic in doc:
            if topic[0] not in topic_dic_OT.keys():
                topic_dic_OT[topic[0]] = topic[1]
            else:
                topic_dic_OT[topic[0]] += topic[1]

    # compute document-topic probability for NT
    corpus3 = [dictionary.doc2bow(text) for text in corpus_NT]
    topics_NT = lda.get_document_topics(corpus3)
    topic_dic_NT = {}
    for doc in topics_NT:
        for topic in doc:
            if topic[0] not in topic_dic_NT.keys():
                topic_dic_NT[topic[0]] = topic[1]
            else:
                topic_dic_NT[topic[0]] += topic[1]

    for k, v in topic_dic_Quran.items():
        topic_dic_Quran[k] = v / len(corpus_Quran)
    for k, v in topic_dic_OT.items():
        topic_dic_OT[k] = v / len(corpus_OT)
    for k, v in topic_dic_NT.items():
        topic_dic_NT[k] = v / len(corpus_NT)
    return lda, topic_dic_Quran, topic_dic_NT, topic_dic_OT
class HNCorpus(TextCorpus):
    def __init__(self, hn_folder, dictionary=None):
        """
        Takes the HN folder of articles as input and builds the dictionary and corpus
        """
        self.hn_folder = hn_folder
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
        else:
            self.dictionary = dictionary

    def get_texts(self):
        """
        Iterate over the HN articles returning text
        """
        positions, hn_articles = 0, 0
        # ************ HN articles ************
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist):
            hn_text = open(fname).read()
            hn_articles += 1
            if LEMMATIZE:
                result = utils.lemmatize(hn_text)
                positions += len(result)
                yield result
            else:
                result = tokenize(hn_text)  # text into tokens here
                positions += len(result)
                yield result
        print(">>> finished iterating over HN corpus of %i documents with %i positions"
              % (hn_articles, positions))
        self.length = hn_articles  # cache corpus length
def training_vectorize(holder):
    # Uses BOW vectors to store features of the corpus, with a dictionary to
    # facilitate the mapping. This is an important part of the sequential
    # vectorization.

    # split the data
    holder.content = holder['content'].apply(lambda row: row.split())
    # make a dictionary
    dictionary = Dictionary(holder.content.tolist())
    # filter the dictionary
    dictionary.filter_extremes(no_above=0.8, no_below=5)
    dictionary.compactify()
    # transform the data with the dictionary
    holder["content"] = holder["content"].apply(lambda row: dictionary.doc2bow(row))
    # transform with tf-idf
    # tfidf = TfidfModel(holder["content"].tolist())
    # holder["content"] = holder["content"].apply(lambda col: tfidf[col])
    return holder, dictionary  # , tfidf
class ArchiveCorpus(corpora.TextCorpus):
    def __init__(self, datafile, preprocess=[], dictionary=None):
        self.datafile = datafile
        self.preprocess = preprocess
        self.metadata = None

        if dictionary:
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary()
            if datafile is not None:
                self.dictionary.add_documents(self.get_texts())
                self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)

    def get_texts(self):
        with utils.smart_open(self.datafile) as inputfile:
            for line in inputfile:
                for f in self.preprocess:
                    line = f(line)
                text = list(utils.tokenize(line, deacc=True, lowercase=True))
                yield text
def preprocess(tweets):
    # Get only negative ones (for this task)
    newTweets = tweets.copy()
    newTweets = remove_airline_tags(newTweets)
    newTweets.text = remove_links(newTweets.text)
    newTweets.text = lt_gt_conversion(ampersand_conversion(arrow_conversion(newTweets.text)))
    newTweets.text = with_without_conversion(newTweets.text)
    newTweets.text = hashtag_to_words(newTweets.text)
    newTweets = translate_all_emoji(newTweets)
    newTweets.text = remove_contractions(newTweets.text)
    newTweets.text = remove_punctuation(newTweets.text)
    newTweets.text = lemmatize_texts(newTweets.text)
    newTweets.text = remove_stopwords(newTweets.text)
    newTweets.text = newTweets.text.str.lower()
    texts = newTweets["text"].values

    # Tokenize and remove short words or filtered words
    tokenized_texts = []
    for text in texts:
        split_text = text.split()
        split_text = [word for word in split_text
                      if len(word) > 2 and word not in FILTERED_WORDS]
        tokenized_texts.append(split_text)

    # Create a dictionary for each word, and a bag of words
    text_dictionary = Dictionary(tokenized_texts)
    # Remove words that appear in over 50% of documents or in fewer than 5 documents,
    # then keep at most half of the remaining vocabulary
    text_dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=len(text_dictionary) // 2)
    text_corpus = [text_dictionary.doc2bow(text) for text in tokenized_texts]
    return (text_dictionary, text_corpus)
def buildDict(self):
    batchiter = BatchIterBert(self.trainDataIter, filling_last_batch=False,
                              postProcessor=xonlyBatchProcessor, batch_size=1)
    common_dictionary = Dictionary(batchiter)
    print(len(common_dictionary))
    if self.testReaderargs:
        print('update vocab from test set')
        batchiter = BatchIterBert(self.testDataIter, filling_last_batch=False,
                                  postProcessor=xonlyBatchProcessor, batch_size=1)
        common_dictionary.add_documents(batchiter)
        print(len(common_dictionary))

    common_dictionary.filter_extremes(no_below=self.dict_no_below,
                                      no_above=self.dict_no_above,
                                      keep_n=self.dict_keep_n)
    self.dictProcess = DictionaryProcess(common_dictionary)
    self.postProcessor.dictProcess = self.dictProcess
    self.vocab_dim = len(self.dictProcess)
    self.have_dict = True

    if 1:
        count_list = []
        self.trainDataIter._reset_iter()
        batchiter = BatchIterBert(self.trainDataIter, filling_last_batch=False,
                                  postProcessor=xonlyBatchProcessor, batch_size=1)
        for item in batchiter:
            current_count = sum(item)
            count_list.append(current_count)
            # print(current_count)
        print(sum(count_list) / len(count_list))
def train(docs):
    num_topics = lda_cfg("topics")
    epochs = lda_cfg("epochs")
    label = f'{datetime.now().isoformat(".", timespec="minutes")}({num_topics}-topics,{epochs}-epochs)'

    log_path = config("path.lda-log").format(label)
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    logging.basicConfig(filename=log_path,
                        format='%(asctime)s : %(levelname)s : %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=lda_cfg("word-extremes.min-count"),
                               no_above=lda_cfg("word-extremes.max-freq"))
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    model = LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics, passes=epochs,
                         eval_every=lda_cfg.dict_like.get("eval-every"),
                         chunksize=lda_cfg("chunk-size"))
    return label, model, dictionary, corpus
def evaluate(docs):
    # global docs
    # Perform function on our document
    docs = docs_preprocessor(docs)

    # Create Bigram & Trigram Models
    from gensim.models import Phrases
    if __name__ == "__main__":
        # Add bigrams and trigrams to docs; minimum count 10 means only those that appear 10 times or more.
        bigram = Phrases(docs, min_count=10)
        trigram = Phrases(bigram[docs])
        for idx in range(len(docs)):
            for token in bigram[docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    docs[idx].append(token)
            for token in trigram[docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    docs[idx].append(token)

    # Remove rare & common tokens
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=10, no_above=0.2)

    # Create dictionary and corpus required for Topic Modeling
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    print(corpus[:1])

    # Set parameters.
    num_topics = 20
    chunksize = 500
    passes = 20
    iterations = 400
    eval_every = 1

    # Make an index to word dictionary.
    temp = dictionary[0]  # only to "load" the dictionary.
    id2word = dictionary.id2token

    lda_model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
                         alpha='auto', eta='auto',
                         iterations=iterations, num_topics=num_topics,
                         passes=passes, eval_every=eval_every)
    # Print the keywords in the topics
    print(lda_model.print_topics())

    # Compute Coherence Score using c_v
    coherence_model_lda = CoherenceModel(model=lda_model, texts=docs,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Compute Coherence Score using UMass
    coherence_model_lda = CoherenceModel(model=lda_model, texts=docs,
                                         dictionary=dictionary, coherence="u_mass")
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    """
    Compute c_v coherence for various numbers of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list, coherence_values = compute_coherence_values(
        dictionary=dictionary, corpus=corpus, texts=docs, start=2, limit=40, step=6)

    # Show graph
    import matplotlib.pyplot as plt
    limit = 40
    start = 2
    step = 6
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

    return coherence_lda
from multiprocessing import Pool
from functools import partial
import math

import numpy as np
# additional imports implied by the snippet below
import pandas as pd
import gensim
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_documents

# use the newsgroup data as corpus
df = pd.read_json("https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json")
documents = df.content.tolist()
documents = preprocess_documents(documents)

# fit an LDA model, n_topic = 5
news_dictionary = Dictionary(documents)
news_dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=5000, keep_tokens=None)
corpus = [news_dictionary.doc2bow(text) for text in documents]
lda = gensim.models.LdaModel(corpus, num_topics=5, id2word=news_dictionary)
lda.show_topics()

# convert gensim corpus to a sparse document-term matrix for coherence measure
corpus_dense = gensim.matutils.corpus2csc(corpus, num_terms=len(news_dictionary.keys()))
corpus_dense = corpus_dense.astype(int)
corpus_dense = corpus_dense.transpose()
print(corpus_dense.shape)
    return total


df = pd.read_csv("data_lda_final.csv", header=[0], sep='\t')
clear = df["tokenized"].tolist()
clear = [str(i) for i in clear]
clear = trial_docs_preprocessor(clear)
docs = clear

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
print('Number of unique words in initial documents:', len(dictionary))

# Filter out words that occur in fewer than 3 documents, or in more than 70% of the documents.
dictionary.filter_extremes(no_below=3, no_above=0.70)
print('Number of unique words after removing rare and common words:', len(dictionary))

corpus = [dictionary.doc2bow(doc) for doc in docs]

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

num_topics = 30
lda_model = LdaModel(corpus=corpus, id2word=id2word, alpha='auto', eta='auto', num_topics=num_topics)
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
import os
import numpy as np
import matplotlib.pyplot as plt

num_topics = 10

f = open("./data/weibo_nof_vec.txt", "r", encoding='utf-8')
texts = [document.split() for document in f]
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_above=0.2)
corpus = [dictionary.doc2bow(text) for text in texts]
print('done with corpus.')

lda = LdaModel(corpus, id2word=dictionary, iterations=100, num_topics=10)
# print(lda.print_topics())
for i in range(0, num_topics):
    print("-----------------------------------")
    print(lda.print_topic(i))
# lda.save('lda')

topic = []
class WikiCorpus(TextCorpus):
    """
    Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.

    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word
    """

    def __init__(self, fname, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
        Initialize the corpus. This scans the corpus once, to determine its vocabulary
        (only the first `keep_words` most frequent words that appear in at least
        `no_below` documents are kept).
        """
        self.fname = fname
        if keep_words is None:
            keep_words = DEFAULT_DICT_SIZE
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
        Iterate over the dump, returning text version of each article.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        intext, positions = False, 0
        if LEMMATIZE:
            lemmatizer = utils.lemmatizer
            yielded = 0
        for _, text in _extract_pages(bz2.BZ2File(self.fname)):
            text = filter_wiki(text)
            articles_all += 1
            if len(text) > ARTICLE_MIN_CHARS:  # article redirects are pruned here
                articles += 1
                if return_raw:
                    result = text
                    yield result
                else:
                    if LEMMATIZE:
                        _ = lemmatizer.feed(text)
                        while lemmatizer.has_results():
                            _, result = lemmatizer.read()  # not necessarily the same text as entered above!
                            positions += len(result)
                            yielded += 1
                            yield result
                    else:
                        result = tokenize(text)  # text into tokens here
                        positions += len(result)
                        yield result
        if LEMMATIZE:
            logger.info("all %i articles read; waiting for lemmatizer to finish the %i remaining jobs"
                        % (articles, articles - yielded))
            while yielded < articles:
                _, result = lemmatizer.read()
                positions += len(result)
                yielded += 1
                yield result
        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                    " (total %i articles before pruning)" % (articles, positions, articles_all))
        self.length = articles  # cache corpus length
class processor(processor_base):
    """
    Pre-process text in memory. Includes utilities for cleaning, tokenization,
    and vectorization in parallel.
    """
    def __init__(self,
                 hueristic_pct_padding: float = .90,
                 append_indicators: bool = False,
                 keep_n: int = 150000,
                 padding: str = 'pre',
                 padding_maxlen: Union[int, None] = None,
                 truncating: str = 'post'):
        """
        Parameters:
        ----------
        hueristic_pct_padding: float
            This parameter is only used if `padding_maxlen` = None. A histogram
            of documents is calculated, and the maxlen is set to hueristic_pct_padding.
        append_indicators: bool
            If True, will append the tokens '_start_' and '_end_' to the beginning
            and end of your tokenized documents. This can be useful when training
            seq2seq models.
        keep_n: int = 150000
            This is the maximum size of your vocabulary (unique number of words
            allowed). Consider limiting this to a reasonable size based upon
            your corpus.
        padding : str
            'pre' or 'post', pad either before or after each sequence.
        padding_maxlen : int or None
            Maximum sequence length, longer sequences are truncated and shorter
            sequences are padded with zeros at the end. Note if this is specified,
            the `hueristic_pct_padding` is ignored.
        truncating : str
            'pre' or 'post', remove values from sequences larger than padding_maxlen
            either in the beginning or in the end of the sequence.
            See https://keras.io/preprocessing/sequence/

        Attributes:
        -----------
        vocabulary : gensim.corpora.dictionary.Dictionary
            This is a gensim object that is built after parsing all the tokens
            in your corpus.
        n_tokens : int
            The total number of tokens in the corpus. Will be less than or equal
            to keep_n.
        id2token : dict
            dict with {int: str} ex: {2: 'the', 3: 'cat'}
            this is used for decoding predictions back to tokens.
        token2id : dict
            dict with {str: int} ex: {'the': 2, 'cat': 3}
            this is used for converting tokens to integers.
        document_length_stats : pandas.DataFrame
            histogram of document lengths. Can be used to decide padding_maxlen.
        """
        super().__init__()
        self.hueristic_pct = hueristic_pct_padding
        self.append_indicators = append_indicators
        self.keep_n = keep_n
        self.padding = padding
        self.padding_maxlen = padding_maxlen
        self.truncating = truncating

        # These are placeholders for data that will be collected or calculated
        self.vocabulary = Dictionary()
        self.n_tokens = None
        self.id2token = None
        self.token2id = None
        self.document_length_histogram = Counter()
        self.document_length_stats = None
        self.doc_length_huerestic = None

        # These values are 'hardcoded' for now
        self.padding_value = 0.0
        self.padding_dtype = 'int32'
        self.start_tok = '_start_'
        self.end_tok = '_end_'
        self.keep_tokens = [self.start_tok, self.end_tok]

    def process_text(self, text: List[str]) -> List[List[str]]:
        """Combine the cleaner and tokenizer."""
        return self.__apply_tokenizer(self.__apply_cleaner(text))

    def __apply_cleaner(self, data: List[str]) -> List[str]:
        """Apply the cleaner over a list."""
        return [self.cleaner(doc) for doc in data]

    def __apply_tokenizer(self, data: List[str]) -> List[List[str]]:
        """Apply the tokenizer over a list."""
        if self.append_indicators:
            tmp = [[self.start_tok] + self.tokenizer(doc) + [self.end_tok] for doc in data]
            return tmp
        else:
            return [self.tokenizer(doc) for doc in data]

    def parallel_process_text(self, data: List[str]) -> List[List[str]]:
        """Apply cleaner -> tokenizer."""
        return apply_parallel(data, self.process_text)

    def generate_doc_length_stats(self):
        """Analyze document length statistics for padding strategy."""
        hueristic = self.hueristic_pct
        histdf = (pd.DataFrame([(a, b) for a, b in self.document_length_histogram.items()],
                               columns=['bin', 'doc_count']).sort_values(by='bin'))
        histdf['cumsum_pct'] = histdf.doc_count.cumsum() / histdf.doc_count.sum()

        self.document_length_stats = histdf
        self.doc_length_huerestic = histdf.query(f'cumsum_pct >= {hueristic}').bin.head(1).values[0]
        logging.warning(' '.join([
            "Setting maximum document length to",
            f'{self.doc_length_huerestic} based upon',
            f'hueristic of {hueristic} percentile.\n',
            'See full histogram by inspecting the',
            "`document_length_stats` attribute."
        ]))
        self.padding_maxlen = self.doc_length_huerestic

    def fit(self,
            data: List[str],
            return_tokenized_data: bool = False,
            no_below: int = 100,
            no_above: float = .9) -> Union[None, List[List[str]]]:
        """
        TODO: update docs

        Apply cleaner and tokenizer to raw data and build vocabulary.

        Parameters
        ----------
        data : List[str]
            These are raw documents, which are a list of strings. ex:
            [["The quick brown fox"], ["jumps over the lazy dog"]]
        return_tokenized_data : bool
            Return the tokenized strings. This is primarily used for debugging purposes.
        no_below : int
            See below explanation
        no_above : float
            See below explanation

        When tokenizing documents, filter tokens according to these rules:
        1. occur in less than `no_below` documents (absolute number) or
        2. occur in more than `no_above` documents (fraction of total corpus size, not absolute number).
        3. after (1) and (2), keep only the first keep_n most frequent tokens.

        Returns
        -------
        None or List[List[str]]
            if return_tokenized_data=True then will return tokenized documents,
            otherwise will not return anything.

        This method heavily leverages gensim https://radimrehurek.com/gensim/corpora/dictionary.html
        """
        now = get_time()
        logging.warning(f'....tokenizing data')
        tokenized_data = list(chain.from_iterable(self.parallel_process_text(data)))

        if not self.padding_maxlen:
            document_len_counters = apply_parallel(tokenized_data, count_len)
            for doc_counter in document_len_counters:
                self.document_length_histogram.update(doc_counter)
            self.generate_doc_length_stats()

        # chunk the data manually for corpus build and pass to build corpus method
        logging.warning(f'(1/3) done. {time_diff(now)} sec')
        logging.warning(f'....building corpus')
        now = get_time()
        corpus = build_corpus(tokenized_data)

        # Merge the corpuses from each thread together, this is like a "reduce" step
        logging.warning(f'(2/3) done. {time_diff(now)} sec')
        logging.warning(f'....consolidating corpus')
        now = get_time()
        self.vocabulary.merge_with(corpus)

        # get rid of rare tokens from corpus such that they will get the same id
        self.vocabulary.filter_extremes(no_below, no_above, self.keep_n, keep_tokens=self.keep_tokens)

        # compactify the ids for each word
        self.vocabulary.compactify()

        # Build Dictionary accounting for 0 padding, and reserve 1 for unknown and rare words
        self.token2id = dict([(k, v + 2) for k, v in self.vocabulary.token2id.items()])
        self.id2token = dict([(v, k) for k, v in self.token2id.items()])
        self.n_tokens = len(self.id2token.keys())

        # logging
        logging.warning(f'(3/3) done. {time_diff(now)} sec')
        logging.warning(f'Finished parsing {self.vocabulary.num_docs:,} documents.')

        if return_tokenized_data:
            return tokenized_data

    def token_count_pandas(self):
        """See token counts as pandas dataframe."""
        freq_df = pd.DataFrame([b for a, b in self.vocabulary.dfs.items()],
                               index=[a for a, b in self.vocabulary.dfs.items()],
                               columns=['count'])
        id2tokens = [(b, a) for a, b in self.vocabulary.token2id.items()]
        token_df = pd.DataFrame([b for a, b in id2tokens],
                                index=[a for a, b in id2tokens],
                                columns=['token'])
        return freq_df.join(token_df).sort_values('count', ascending=False)

    def fit_transform(self,
                      data: List[str],
                      no_below: int = 25,
                      no_above: float = 0.8) -> List[List[int]]:
        """
        Apply cleaner and tokenizer to raw data, build vocabulary and return a
        transformed dataset that is a List[List[int]]. This will use
        process-based-threading on all available cores.

        ex:
        >>> data = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >>> pp = preprocess(maxlen=5, no_below=0)
        >>> pp.fit_transform(data)
        # 0 padding is applied
        [[0, 2, 3, 4, 5], [6, 7, 2, 8, 9]]

        Parameters
        ----------
        data : List[str]
            These are raw documents, which are a list of strings. ex:
            [["The quick brown fox"], ["jumps over the lazy dog"]]
        no_below : int
            See below explanation
        no_above : float
            See below explanation

        When tokenizing documents, filter tokens according to these rules:
        1. occur in less than `no_below` documents (absolute number) or
        2. occur in more than `no_above` documents (fraction of total corpus size, not absolute number).
        3. after (1) and (2), keep only the first keep_n most frequent tokens.

        Returns
        -------
        numpy.array with shape (number of documents, max_len)

        This method leverages gensim https://radimrehurek.com/gensim/corpora/dictionary.html
        """
        tokdata = self.fit(data, return_tokenized_data=True, no_below=no_below, no_above=no_above)

        logging.warning(f'...fit is finished, beginning transform')
        now = get_time()
        vec_data = self.vectorize_parallel(tokdata)
        logging.warning(f'done. {time_diff(now)} sec')
        return vec_data

    def transform(self, data: List[str]) -> List[List[int]]:
        """
        Transform List of documents into List[List[int]]

        If transforming a large number of documents consider using the method
        `transform_parallel` instead.

        ex:
        >> pp = processor()
        >> pp.fit(docs)
        >> new_docs = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >> pp.transform(new_docs)
        [[1, 2, 3, 4], [5, 6, 1, 7, 8]]
        """
        return self.vectorize(self.process_text(data))

    def transform_parallel(self, data: List[str]) -> List[List[int]]:
        """
        Transform List of documents into List[List[int]]. Uses process based
        threading on all available cores. If only processing a small number of
        documents ( < 10k ) then consider using the method `transform` instead.

        ex:
        >> pp = processor()
        >> pp.fit(docs)
        >> new_docs = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >> pp.transform_parallel(new_docs)
        [[1, 2, 3, 4], [5, 6, 1, 7, 8]]
        """
        return np.vstack(apply_parallel(data, self.transform))

    def get_idx(self, token: str) -> int:
        """Get integer index from token."""
        # return the index for the token, or the out-of-vocabulary index (1) if not found
        return self.token2id.get(token, 1)

    def __vec_one_doc(self, doc: List[str]) -> List[int]:
        """
        Vectorize a single tokenized document.
        ex: ['hello', 'world']
        """
        return [self.get_idx(tok) for tok in doc]

    def vectorize(self, docs: List[List[str]]) -> List[List[int]]:
        """
        Vectorize and apply padding on a set of tokenized documents
        ex: [['hello', 'world'], ['goodbye', 'now']]
        """
        # First apply indexing on all the rows, then pad_sequences (I found this
        # faster than trying to do these steps on each row)
        return pad_sequences(list(map(self.__vec_one_doc, docs)),
                             maxlen=self.padding_maxlen,
                             dtype=self.padding_dtype,
                             padding=self.padding,
                             truncating=self.truncating,
                             value=self.padding_value)

    def vectorize_parallel(self, data: List[List[str]]) -> np.array:
        """
        Apply idx -> token mappings in parallel and apply padding.

        Arguments:
        data: List of List of strings
        """
        indexed_data = apply_parallel(data, self.vectorize)
        # concatenate list of arrays vertically
        return np.vstack(indexed_data)
dense_corpus_sent2vec = np.array(feature_vector)
target_vector = np.array(target_vector)

print()
print(dense_corpus_sent2vec.shape)
print(target_vector.shape)

print(len(docs_lemma))
print(len(docs_pos))

print('Make Dictionary')
dictionary_lemma = Dictionary(docs_lemma)
dictionary_pos = Dictionary(docs_pos)
print('Number of unique pos: %d' % len(dictionary_pos))
dictionary_lemma.filter_extremes(no_below=10, no_above=0.2,
                                 keep_tokens=trigger_words if allow_tw else None)
print('Number of unique lemma: %d' % len(dictionary_lemma))

lemma_bigrams = list()
for d in docs_lemma:
    lemma_bigram = [f"{bigram[0]}_{bigram[1]}" for bigram in list(ngrams(d, 2))]
    lemma_bigrams.append(lemma_bigram)

pos_bigrams = list()
for d in docs_pos:
    pos_bigram = [f"{bigram[0]}_{bigram[1]}" for bigram in list(ngrams(d, 2))]
def createDictionary(texts):
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=2, no_above=0.4, keep_n=1000000)
    dictionary.compactify()
    return dictionary
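# Hypothetical usage of createDictionary above. The toy corpus is chosen so that
# a few tokens survive the no_below=2 / no_above=0.4 filter; names are illustrative.
texts = [["solar", "energy", "storage"],
         ["wind", "energy", "turbine"],
         ["solar", "panel", "cost"],
         ["battery", "storage", "cost"],
         ["wind", "forecast", "model"]]
dictionary = createDictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
print(len(dictionary), bow_corpus[0])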
                    help='File name to give the dictionary upon saving')
args = parser.parse_args()

input_path = args.input_path
output_name = args.output_name
CHUNK_SIZE = args.chunk_size

# Stream in documents from path
rdr = lmd.Reader(input_path)
gnr = rdr.stream_data(get_meta=True)

# Build a dictionary out of the validation documents
dictionary = Dictionary()
docs = rdr.stream_data(threaded=True)
doc_chunks = chunks(docs, size=CHUNK_SIZE)

# Progress in chunks
for chunk in doc_chunks:
    print("Adding ", CHUNK_SIZE, " docs")
    tokenized = [[tok.lower_ for tok in doc if not tok.is_stop and tok.is_alpha]
                 for doc in tokenizer.pipe(
                     [item for item in chunk if language(item) == 'en'],
                     batch_size=CHUNK_SIZE)]
    dictionary.add_documents(tokenized)

# Keep only 2**16 most frequent tokens
dictionary.filter_extremes(keep_n=2**16)
dictionary.compactify()
dictionary.save(output_name)
trigram = Phrases(bigram[docs])
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)
    for token in trigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

# Remove rare & common tokens
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=10, no_above=0.2)

# Create dictionary and corpus required for Topic Modeling
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
print(corpus[:1])

# Train the model
print("Stage 2: Train the model.")

# Set parameters.
num_topics = 5
chunksize = 500
passes = 10  # 20
iterations = 100  # 400

# Make an index to word dictionary.
class CDS_Corpus(TextCorpus):
    def __init__(self, folder, dictionary=None):
        """
        Takes the list of txt files in a folder from Isabelle as input
        and builds the dictionary and corpus
        """
        self.folder = folder
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
        else:
            self.dictionary = dictionary

    def get_texts(self):
        """
        Iterate over the "documents" (sessions/places) returning text
        """
        filter_words = set()
        if FILTER_WORDS:
            filter_words = []
            with open(FILTER_WORDS) as f:
                for line in f:
                    filter_words.append(line.rstrip('\n'))
            filter_words = set(filter_words)
            # print "the following words will be filtered", filter_words
        filter_words_add = set()
        if FILTER_WORDS_ADD:
            filter_words_add = []
            with open(FILTER_WORDS_ADD) as f:
                for line in f:
                    filter_words_add.append(line.rstrip('\n'))
            filter_words_add = set(filter_words_add)

        positions, hn_articles = 0, 0
        fnamelist = []
        docs = 0
        for g in glob.iglob(self.folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist):
            with open(fname) as f:
                text = ""
                for line in f:
                    if line[0] != '@':
                        # sentence = re.sub('\d+', '', line.rstrip('\n').strip('\t').replace('\n', '')).split(' ')
                        sentence = tokenize(re.sub('\d+', '', line.rstrip('\n').strip('\t').replace('\n', '')))
                        for ind, word in enumerate(sentence):
                            w = word.lower().rstrip(' ').strip(' ').strip('\t')
                            sentence[ind] = w
                        if FILTER_WORDS:
                            for ind, word in enumerate(sentence):
                                if word.upper() in filter_words:
                                    sentence[ind] = ''
                        if FILTER_WORDS_ADD:
                            for ind, word in enumerate(sentence):
                                if word in filter_words_add:
                                    sentence[ind] = ''
                        text += ' '.join(sentence) + '\n'
                    else:
                        docs += 1
                        if LEMMATIZE:
                            result = lemmatizer(text)
                            positions += len(result)
                            yield result
                        else:
                            result = tokenize(text)  # text into tokens here
                            positions += len(result)
                            yield result
                        text = ""
            docs += 1
            if LEMMATIZE:
                result = lemmatizer(text)
                positions += len(result)
                yield result
            else:
                result = tokenize(text)  # text into tokens here
                positions += len(result)
                yield result
        print(">>> finished iterating over the corpus of %i documents with %i positions"
              % (docs, positions))
        self.length = docs  # cache corpus length
class WikiCorpus(TextCorpus):
    """
    Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.

    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word
    """

    def __init__(self, fname, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
        Initialize the corpus. This scans the corpus once, to determine its vocabulary
        (only the first `keep_words` most frequent words that appear in at least
        `no_below` documents are kept).
        """
        self.fname = fname
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
        Iterate over the dump, returning text version of each article.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        intext, positions = False, 0
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith(" <text"):
                intext = True
                line = line[line.find(">") + 1:]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find("</text>")  # can be on the same line as <text>
            if pos >= 0:
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filter_wiki("".join(lines))
                if len(text) > ARTICLE_MIN_CHARS:  # article redirects are pruned here
                    articles += 1
                    if return_raw:
                        result = text
                    else:
                        result = tokenize(text)  # text into tokens here
                        positions += len(result)
                    yield result
        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles before pruning)" % (articles, positions, articles_all)
        )
        self.length = articles  # cache corpus length
class WikiHNCorpus(TextCorpus):
    def __init__(self, wiki_file, hn_folder, dictionary=None, processes=None, lemmatize=utils.HAS_PATTERN):
        """
        Takes the wikipedia *articles.xml.bz2 and the HN folder of articles
        as input and builds the dictionary and corpus
        """
        global outputname
        self.lemmatize = lemmatize
        if self.lemmatize:
            print("We will lemmatize ('you were'->'be/VB')")
            self.outputname = outputname + "_lemmatized"
        else:
            print("We will only tokenize ('you were'->'you','were')")

        self.wiki_file = wiki_file
        self.hn_folder = hn_folder

        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes

        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
        else:
            self.dictionary = dictionary

    def get_texts(self):
        """
        Iterate over the Wikipedia dump and the HN articles returning text
        """
        wiki_articles, hn_articles, articles_all = 0, 0, 0
        positions, positions_all = 0, 0

        # ************ Wikipedia ************
        texts = ((text, self.lemmatize) for _, text in
                 wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
        pool = multiprocessing.Pool(self.processes)
        # chunkize, otherwise imap puts all the corpus into memory
        for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1):
            for tokens in pool.imap(wikicorpus.process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
                    wiki_articles += 1
                    positions += len(tokens)
                    yield tokens
        pool.terminate()
        print(">>> finished iterating over Wikipedia corpus of %i documents with %i positions"
              " (total %i articles, %i positions before pruning articles shorter than %i words)"
              % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))

        # ************ HN articles ************
        positions_after_wiki = positions
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist):  # TODO parallelize as Wiki
            hn_text = open(fname).read()
            if self.lemmatize:
                result = utils.lemmatize(hn_text)  # text into lemmas here
            else:
                result = tokenize(hn_text)  # text into tokens here
            articles_all += 1
            positions_all += len(result)
            if len(result) > HN_ARTICLE_MIN_WORDS:
                hn_articles += 1
                positions += len(result)
                yield result
        print(">>> finished iterating over HN corpus of %i documents with %i positions"
              % (hn_articles, positions - positions_after_wiki))
        # ************ /HN articles ************

        self.length = wiki_articles + hn_articles  # cache corpus length