def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1,
                       keep_n=None):
    """Lemmatize, filter, and convert raw corpora into a gensim Dictionary.

    :rtype: gensim.corpora.dictionary.Dictionary
    :param corpora: dict mapping an original id to its raw text
    :param stopwords: collection of words to drop
    :param allowed_pos: regex of POS tags accepted by gensim's lemmatize()
    :param max_doc: maximum number of documents to process
    :param no_above: drop terms appearing in more than this fraction of documents
    :param no_below: drop terms appearing in fewer than this many documents
    :param keep_n: keep at most this many terms (None keeps all)
    :return: Dictionary with .corpus, .id2token, and .corpus_id2orig_id attached
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)
    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)
    return dictionary
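# clean_text and convert_compound are project helpers that are not defined in this file.
# A minimal sketch of plausible implementations follows, assuming clean_text only strips
# markup and non-letter characters and convert_compound joins adjacent tokens found in a
# predefined compound list (COMPOUNDS below is a hypothetical example); the real helpers
# may behave differently.
import re

COMPOUNDS = {('machine', 'learning'), ('new', 'york')}  # hypothetical compound list


def clean_text(text):
    """Drop characters that are irrelevant for lemmatization (sketch)."""
    text = re.sub(r'<[^>]+>', ' ', text)      # remove HTML-like tags
    text = re.sub(r'[^A-Za-z\s]', ' ', text)  # keep letters and whitespace only
    return re.sub(r'\s+', ' ', text).strip()


def convert_compound(tokens):
    """Merge adjacent tokens that form a known compound into a single token (sketch)."""
    merged = []
    i = 0
    while i < len(tokens):
        if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) in COMPOUNDS:
            merged.append(tokens[i] + '_' + tokens[i + 1])
            i += 2
        else:
            merged.append(tokens[i])
            i += 1
    return merged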
count = 0
doc_num = len(wikis)
new_wikis = []
keywords = []

for keyword, wiki in wikis.items():
    count += 1
    print '\r', count, '/', doc_num,

    text = wiki['text']
    cleaned = clean_text(text)  # delete irrelevant characters
    wiki = []
    tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
    for token in tokens:
        word, pos = token.split('/')
        wiki.append(word)

    # convert compound word into one token
    wiki = convert_compound(wiki)

    # filter stop words, long words, and non-english words
    # FIXME: this allows non-english characters to be stored
    wiki = [w for w in wiki if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
    new_wikis.append(wiki)
    keywords.append(keyword)
print '\n'

logging.info('Saving wiki corpus...')
enpickle(new_wikis, 'data/processed/wikis.pkl')
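# enpickle is a project-specific helper that is not shown here; a minimal sketch, assuming
# it simply pickles the object to the given path and creates the target directory if
# needed (the actual helper may differ).
import os
import pickle


def enpickle(obj, path):
    """Serialize obj to path with pickle (sketch of the assumed helper)."""
    directory = os.path.dirname(path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)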
print '\r', count, '/', doc_num,

text = document['text'] + (' ' + index) * title_weight  # incorporate title information by repeating the title title_weight times
from_name = document['from']
date = document['date']

cleaned = clean_text(text)  # delete irrelevant characters
document = []
tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
for token in tokens:
    word, pos = token.split('/')
    document.append(word)

# convert compound word into one token
document = convert_compound(document)

# filter stop words, long words, and non-english words
document = [w for w in document if w not in stop_words and 2 <= len(w) <= 15 and w.islower()]
new_documents.append(document)
titles.append(index)
froms.append(from_name)
dates.append(date)

print '\n'

logging.info('create dictionary and corpus...')
dictionary = corpora.Dictionary(new_documents)
dictionary.docid2title = titles
dictionary.docid2from = froms
dictionary.docid2date = dates
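# A possible continuation (illustrative, not from the original code): turn new_documents
# into a bag-of-words corpus with the dictionary built above and fit a topic model;
# num_topics=50 is an arbitrary choice for the sketch.
from gensim import models

bow_corpus = [dictionary.doc2bow(document) for document in new_documents]
lda = models.LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=50)

# the metadata attached to the dictionary can then be used to label results, e.g.
# dictionary.docid2title[doc_id] gives the title of the document at position doc_id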
for keyword, wiki in wikis.items():
    count += 1
    print '\r', count, '/', doc_num,

    text = wiki['text']
    cleaned = clean_text(text)  # delete irrelevant characters
    wiki = []
    tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
    for token in tokens:
        word, pos = token.split('/')
        wiki.append(word)

    # convert compound word into one token
    wiki = convert_compound(wiki)

    # filter stop words, long words, and non-english words
    # FIXME: it allows non-english characters to be stored
    wiki = [w for w in wiki if w not in stop_words and 2 <= len(w) <= 15 and w.islower()]
    new_wikis.append(wiki)
    keywords.append(keyword)
print '\n'

enpickle(new_wikis, 'data/txt/processed_wiki.pkl')

logging.info('create dictionary and corpus...')
dictionary = corpora.Dictionary(new_wikis)
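# A possible downstream step (illustrative, not from the original code): assuming enpickle
# wraps pickle.dump, the saved token lists can be reloaded later and turned into a
# bag-of-words corpus with the dictionary built above.
import pickle

with open('data/txt/processed_wiki.pkl', 'rb') as f:
    loaded_wikis = pickle.load(f)

dictionary.filter_extremes(no_below=1, no_above=0.5)  # prune very rare/ubiquitous terms
dictionary.compactify()
wiki_bow = [dictionary.doc2bow(wiki) for wiki in loaded_wikis]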