def filter_wiki(raw, image='', file='', category=''):
    """
    Filter out wiki mark-up from `raw`, leaving only text.

    `raw` is either unicode or utf-8 encoded string.

    `image`, `file` and `category` are optional namespace names. Each
    non-empty value is interpolated into a regular expression that is
    compiled and stored in a module-level global (`RE_P7_`, `RE_P8_`,
    `RE_P15_`, `RE_P14_`). The values are inserted verbatim, without
    escaping, so any regex metacharacters they contain are interpreted
    as regex syntax.

    Returns the result of `remove_markup` applied to the decoded text.
    """
    # NOTE(review): these globals are presumably read by remove_markup();
    # that function is not visible here -- confirm before relying on it.
    global RE_P7_
    global RE_P8_
    global RE_P14_
    global RE_P15_

    # Raw string literals keep sequences such as `\[` and `\|` from being
    # treated as (invalid) string escapes; the compiled patterns are the same.
    if image:
        # keep description of images
        RE_P7_ = re.compile(
            r'\n\[\[%s(.*?)(\|.*?)*\|(.*?)\]\]' % image, re.UNICODE)
    if file:
        # keep description of files
        RE_P8_ = re.compile(
            r'\n\[\[%s(.*?)(\|.*?)*\|(.*?)\]\]' % file, re.UNICODE)
    if image and file:
        RE_P15_ = re.compile(
            r'\[\[(%s:|%s)[^]]*(\]\])' % (image, file), re.UNICODE)
    if category:
        # categories
        RE_P14_ = re.compile(r'\[\[%s:[^][]*\]\]' % category, re.UNICODE)

    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    # turn HTML entities into the characters they stand for
    # (e.g. a non-breaking space entity --> '\xa0')
    text = utils.decode_htmlentities(text)
    return remove_markup(text)
def filter_wiki(raw):
    """
    Strip wiki mark-up from `raw` and return the remaining plain text.

    `raw` may be a unicode string or a utf-8 encoded byte string; bytes
    that cannot be decoded are ignored.
    """
    # The markup parsing is imperfect but good enough for our purposes;
    # contributions to improving this code are welcome :)
    decoded = utils.to_unicode(raw, 'utf8', errors='ignore')
    # HTML entities are replaced by their characters before the markup is
    # removed (e.g. a non-breaking space entity --> '\xa0').
    return remove_markup(utils.decode_htmlentities(decoded))
def articles(data):
    """
    Build one MediaObj per match of `regex.recommended` found in `data`.

    Each match must provide the named groups 'plot', 'title', 'image',
    'id' and 'type'. Returns the list of populated MediaObj instances in
    match order.
    """
    # NOTE(review): `.decode` is called on the 'plot' group, so the matched
    # values are presumably byte strings (Python 2 `str`) -- confirm.
    found = []
    for match in re.finditer(regex.recommended, data):
        media = MediaObj()
        media.plot = decode_htmlentities(
            match.group('plot').decode('iso-8859-1'))
        media.title = match.group('title')
        media.thumbnail = match.group('image').encode('ascii', 'ignore')

        # State identifiers
        media.key = Key()
        media.key.id = int(match.group('id'))
        media.key.type = match.group('type').encode('ascii', 'ignore')

        found.append(media)
    return found
def articles(data):
    """
    Collect the entries matched by `regex.recommended` in `data`.

    Every match is turned into a MediaObj carrying the plot, title and
    thumbnail, plus a Key holding the entry's id and type. The objects are
    returned as a list, in the order the matches occur.
    """
    def build(hit):
        # Populate a single MediaObj from the named groups of one match.
        # NOTE(review): the 'plot' group is decoded from iso-8859-1, which
        # suggests the input is a byte string -- confirm against callers.
        entry = MediaObj()
        raw_plot = hit.group('plot')
        entry.plot = decode_htmlentities(raw_plot.decode('iso-8859-1'))
        entry.title = hit.group('title')
        entry.thumbnail = hit.group('image').encode('ascii', 'ignore')
        # State identifiers
        entry.key = Key()
        entry.key.id = int(hit.group('id'))
        entry.key.type = hit.group('type').encode('ascii', 'ignore')
        return entry

    return [build(hit) for hit in re.finditer(regex.recommended, data)]