Example #1
0
def process_article(article):
    """Filter one article by its title and preprocess its text.

    ``article`` is a single ``(title, text, number)`` tuple; ``number`` is
    unpacked but not used here.

    Returns ``("", [])`` when the title starts with a digit or contains
    ``' list'``, ``'List'`` or ``'disambiguation'``. Otherwise returns the
    UTF-8 encoded title together with the result of
    ``ourutils.simple_preprocess`` applied to the filtered text.
    """
    # Unpack inside the body: tuple parameters in a signature were removed
    # by PEP 3113 and are a SyntaxError on Python 3. Callers still pass one
    # 3-tuple positionally, exactly as before.
    title, text, _number = article

    starts_with_digit = re.match(r'\d', title) is not None
    # Plain substring tests are equivalent to re.search with these literal
    # patterns.
    has_skip_marker = any(
        marker in title for marker in (' list', 'disambiguation', 'List')
    )
    if starts_with_digit or has_skip_marker:
        return "", []

    # NOTE(review): filter_wiki presumably strips wiki markup -- confirm
    # against ourwikicorpus.
    text = ourwikicorpus.filter_wiki(text)
    return title.encode('utf8'), ourutils.simple_preprocess(text)
Example #2
0
def process_article(article):
    """Preprocess the text of one article.

    ``article`` is a single ``(title, text, number)`` tuple; ``number`` is
    unpacked but not used here.

    Returns the UTF-8 encoded title together with the result of
    ``ourutils.simple_preprocess`` applied to the filtered text.
    """
    # Unpack inside the body: tuple parameters in a signature were removed
    # by PEP 3113 and are a SyntaxError on Python 3. Callers still pass one
    # 3-tuple positionally, exactly as before.
    title, text, _number = article

    # NOTE(review): filter_wiki presumably strips wiki markup -- confirm
    # against ourwikicorpus.
    text = ourwikicorpus.filter_wiki(text)
    return title.encode('utf8'), ourutils.simple_preprocess(text)