def dewiki(text):
    text = remove_double_curly(text)
    text = remove_double_brackets(text)
    text = wtp.parse(text).plain_text()  # wiki markup to plain text
    text = htt(text)  # strip any HTML
    text = text.replace('\\n', ' ')  # replace newlines
    text = re.sub(r'\[\[', ' ', text)  # remove remnant opening brackets
    text = re.sub(r'\]\]', ' ', text)  # remove remnant closing brackets
    text = re.sub(r'\s+', ' ', text)  # collapse excess whitespace
    return text
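These snippets lean on a few imports and helper functions defined elsewhere in the script. The following is a minimal sketch of what they might look like, assuming wtp is wikitextparser and htt is html2text's converter; the regex bodies of remove_double_curly and remove_double_brackets are my own guesses at plausible implementations, not necessarily the originals.

import re
import wikitextparser as wtp
from html2text import html2text as htt

def remove_double_curly(text):
    # strip {{...}} template markup; loop so nested templates are peeled away (assumed implementation)
    while '{{' in text:
        cleaned = re.sub(r'\{\{[^{}]*\}\}', '', text)
        if cleaned == text:  # no change means remaining braces are unbalanced
            break
        text = cleaned
    return text

def remove_double_brackets(text):
    # keep only the display text of [[target|display]] links (assumed implementation)
    return re.sub(r'\[\[(?:[^\[\]|]*\|)*([^\[\]|]*)\]\]', r'\1', text)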
def analyze_chunk(text):
    try:
        if '<redirect title="' in text:  # this is not the main article
            return None
        else:
            title = text.split('<title>')[1].split('</title>')[0]
            if ':' in title:  # this is a talk, category, or other (not a real article)
                return None
            title = htt(title).strip()
            serial = text.split('<id>')[1].split('</id>')[0]
            content = text.split('</text')[0].split('<text')[1].split('>', maxsplit=1)[1]
            content = dewiki(content)
            return {'title': title, 'text': content, 'id': serial}
    except:
        return None
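For context, analyze_chunk expects the raw XML of a single &lt;page&gt; element from the Wikipedia dump. A minimal driver might look like the sketch below; the function name and the naive in-memory chunking on &lt;page&gt; tags are illustrative only (in practice the multi-gigabyte dump is read and chunked incrementally), but it shows the contract: one page's XML in, one title/text/id record out.

import json

def process_dump_text(xml_text, outfile='articles.json'):
    # hypothetical driver: split the dump text into <page>...</page> chunks and analyze each
    articles = []
    for chunk in xml_text.split('<page>')[1:]:
        chunk = chunk.split('</page>')[0]
        doc = analyze_chunk(chunk)
        if doc:
            articles.append(doc)
    with open(outfile, 'w', encoding='utf-8') as f:
        json.dump(articles, f)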
def dewiki(text):
    text = remove_simple_links(text)
    text = remove_pictures(text)
    text = remove_audio(text)
    text = remove_compound_links(text)
    text = remove_references(text)
    text = remove_citations(text)
    text = remove_categories(text)
    text = remove_all_links(text)
    text = remove_urls(text)
    text = remove_wikitables(text)  # TODO preserve this data somehow
    text = wtp.parse(text).plain_text()  # wiki to plaintext whatever is left
    text = htt(text)  # de-HTML text
    #text = re.sub(r'\]\]', ' ', text)  # remove any remnant brackets
    text = text.replace('\\n', ' ')  # replace newlines
    text = re.sub(r'\s+', ' ', text)  # replace excess whitespace
    return text
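The remove_* helpers are small regex passes defined elsewhere in the script. Their exact patterns aren't shown here, but each one is along the lines of the sketch below; these two patterns are my assumptions about plausible implementations of remove_references and remove_categories, not the original code.

def remove_references(text):
    # drop self-closing <ref .../> tags and <ref>...</ref> blocks (assumed pattern)
    text = re.sub(r'<ref[^<>]*/>', '', text)
    text = re.sub(r'<ref[^<>]*>.*?</ref>', '', text, flags=re.DOTALL)
    return text

def remove_categories(text):
    # drop [[Category:...]] markers (assumed pattern)
    return re.sub(r'\[\[Category:[^\[\]]*\]\]', '', text)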
def dewiki(text):
    text = text.replace('\\n', ' ')  # replace newlines
    text = re.sub(r'\s+', ' ', text)  # replace excess whitespace
    text = remove_audio(text)
    text = remove_references(text)
    text = remove_citations(text)
    text = remove_categories(text)
    text = remove_simple_links(text)
    text = remove_compound_links(text)
    text = remove_pictures(text)
    text = remove_all_links(text)
    text = remove_urls(text)
    # TODO handle class="sortable wikitable" and class="wikitable"
    text = wtp.parse(text).plain_text()  # wiki to plaintext whatever is left
    text = htt(text)  # de-HTML text
    text = re.sub(r'\]\]', ' ', text)  # remove any remnant brackets
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace again after the removals
    return text
def analyze_chunk(text):
    try:
        if '<redirect title="' in text:  # this is not the main article
            return None
        if '(disambiguation)' in text:  # this is not an article
            return None
        else:
            title = text.split('<title>')[1].split('</title>')[0]
            title = htt(title)
            if ':' in title:  # most articles with : in them are not articles we care about
                return None
            serial = text.split('<id>')[1].split('</id>')[0]
            content = text.split('</text')[0].split('<text')[1].split('>', maxsplit=1)[1]
            content = dewiki(content)
            return {'title': title.strip(), 'text': content.strip(), 'id': serial.strip()}
    except Exception as oops:
        print(oops)
        return None
def dewiki(text):
    text = wtp.parse(text).plain_text()  # wiki to plaintext
    text = htt(text)  # remove any HTML
    text = text.replace('\\n', ' ')  # replace newlines
    text = re.sub(r'\s+', ' ', text)  # replace excess whitespace
    return text
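As a quick sanity check, this version of dewiki can be exercised on a small snippet of wikitext. The sample string and the expected output below are illustrative, not taken from the dump.

sample = "'''Anarchism''' is a [[political philosophy]] that questions [[authority]]."
print(dewiki(sample))
# expected: bold markup and link brackets stripped, roughly:
# "Anarchism is a political philosophy that questions authority."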