Esempio n. 1
0
def filter_wiki(raw, image='', file='', category=''):
    """
    Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode
    or utf-8 encoded string.

    Each non-empty optional argument is interpolated into a regular expression
    that replaces one of the module-level patterns before the markup is removed:

    - `image` rebinds `RE_P7_` (keeps the description of images),
    - `file` rebinds `RE_P8_` (keeps the description of files),
    - `image` and `file` together rebind `RE_P15_`,
    - `category` rebinds `RE_P14_` (categories).

    The values are inserted into the patterns verbatim, so any regex
    metacharacters they contain are interpreted as pattern syntax. Because the
    patterns are module globals, a rebinding stays in effect after this call
    returns.

    Returns whatever `remove_markup` yields for the decoded text.
    """
    global RE_P7_
    global RE_P8_
    global RE_P14_
    global RE_P15_
    # Raw strings keep the regex escapes (\[, \|, \]) intact without relying on
    # Python passing unknown string escapes through; the compiled patterns are
    # the same as before.
    if image:
        RE_P7_ = re.compile(r'\n\[\[%s(.*?)(\|.*?)*\|(.*?)\]\]' % image,
                            re.UNICODE)  # keep description of images
    if file:
        RE_P8_ = re.compile(r'\n\[\[%s(.*?)(\|.*?)*\|(.*?)\]\]' % file,
                            re.UNICODE)  # keep description of files
    if image and file:
        RE_P15_ = re.compile(r'\[\[(%s:|%s)[^]]*(\]\])' % (image, file),
                             re.UNICODE)
    if category:
        RE_P14_ = re.compile(r'\[\[%s:[^][]*\]\]' % category,
                             re.UNICODE)  # categories
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    # HTML entities are turned into their unicode characters (e.g. '\xa0')
    text = utils.decode_htmlentities(text)
    return remove_markup(text)
Esempio n. 2
0
def filter_wiki(raw):
    """
    Strip wiki mark-up from `raw` and return the remaining text.

    `raw` may be unicode or a utf-8 encoded string.
    """
    # The markup parsing is approximate, yet good enough for our purposes;
    # improvements to it are welcome.
    as_unicode = utils.to_unicode(raw, 'utf8', errors='ignore')
    # HTML entities are turned into their unicode characters (e.g. '\xa0').
    without_entities = utils.decode_htmlentities(as_unicode)
    return remove_markup(without_entities)
Esempio n. 3
0
 def articles(data):
     """
     Build one MediaObj per match of `regex.recommended` found in `data`.

     Returns the list of built items, in match order.
     """
     found = []
     for match in re.finditer(regex.recommended, data):
         media = MediaObj()
         # The plot is decoded from iso-8859-1 before its HTML entities
         # are resolved.
         media.plot = decode_htmlentities(
             match.group('plot').decode('iso-8859-1'))
         media.title = match.group('title')
         # Characters that cannot be encoded as ASCII are dropped.
         media.thumbnail = match.group('image').encode('ascii', 'ignore')
         # State identifiers
         media.key = Key()
         media.key.id = int(match.group('id'))
         media.key.type = match.group('type').encode('ascii', 'ignore')
         found.append(media)
     return found
Esempio n. 4
0
 def articles(data):
     """
     Collect the entries matched by `regex.recommended` in `data`.

     Every match is turned into a MediaObj; the objects are returned as a
     list in the order the matches occur.
     """
     def _to_media(hit):
         # Populate a single MediaObj from the named groups of one match.
         entry = MediaObj()
         raw_plot = hit.group('plot').decode('iso-8859-1')
         entry.plot = decode_htmlentities(raw_plot)
         entry.title = hit.group('title')
         # Non-ASCII characters are discarded by the 'ignore' handler.
         entry.thumbnail = hit.group('image').encode('ascii', 'ignore')
         # State identifiers
         entry.key = Key()
         entry.key.id = int(hit.group('id'))
         entry.key.type = hit.group('type').encode('ascii', 'ignore')
         return entry

     return [_to_media(hit) for hit in re.finditer(regex.recommended, data)]