def process_article(args):
    """Clean and tokenize one article, skipping unwanted titles.

    Args:
        args: A ``(title, text, number)`` tuple, passed as ONE argument.
            The tuple is unpacked in the body instead of in the signature,
            because tuple parameters are a syntax error on Python 3; callers
            still invoke ``process_article((title, text, number))``.
            ``number`` is accepted but not used.

    Returns:
        ``("", [])`` when the title starts with a digit or contains
        ``" list"``, ``"List"`` or ``"disambiguation"``; otherwise
        ``(title.encode('utf8'), tokens)``, where ``tokens`` is the result of
        ``ourutils.simple_preprocess`` applied to the text after
        ``ourwikicorpus.filter_wiki``.

    NOTE(review): this source contains a second ``process_article``
    definition; if both live in the same module, the later one replaces this
    one at import time -- confirm which variant is intended.
    """
    title, text, _number = args

    # re.match anchors at the beginning, so only a *leading* digit counts.
    starts_with_digit = re.match(r'\d', title) is not None
    # Plain substring tests; equivalent to the literal-pattern re.search calls.
    has_skip_marker = any(
        marker in title for marker in (' list', 'disambiguation', 'List')
    )
    if starts_with_digit or has_skip_marker:
        return "", []

    text = ourwikicorpus.filter_wiki(text)
    return title.encode('utf8'), ourutils.simple_preprocess(text)
def process_article(args):
    """Clean and tokenize one article without any title filtering.

    Args:
        args: A ``(title, text, number)`` tuple, passed as ONE argument.
            The tuple is unpacked in the body instead of in the signature,
            because tuple parameters are a syntax error on Python 3; callers
            still invoke ``process_article((title, text, number))``.
            ``number`` is accepted but not used.

    Returns:
        ``(title.encode('utf8'), tokens)``, where ``tokens`` is the result of
        ``ourutils.simple_preprocess`` applied to the text after
        ``ourwikicorpus.filter_wiki``.

    NOTE(review): this source contains another ``process_article``
    definition that filters titles; if both live in the same module, only
    the later definition is effective -- confirm which variant is intended.
    """
    title, text, _number = args
    cleaned = ourwikicorpus.filter_wiki(text)
    return title.encode('utf8'), ourutils.simple_preprocess(cleaned)