Example #1
    def createElementFromHTML(cls, html, encoding='utf-8'):
        '''
            createElementFromHTML - Creates an element from a string of HTML.

                If this could create multiple root-level elements (children are okay),
                  you must use #createElementsFromHTML which returns a list of elements created.

            @param html <str> - Some html data

            @param encoding <str> - Encoding to use for document

            @raises MultipleRootNodeException - If given html would produce multiple root-level elements (use #createElementsFromHTML instead)

            @return AdvancedTag - A single AdvancedTag

            NOTE: If there is text outside the root tag, it will be lost.
              Use createBlocksFromHTML instead if you need to retain both text and tags.

              Also, if you are just appending to an existing tag, use AdvancedTag.appendInnerHTML
        '''

        parser = cls(encoding=encoding)

        html = stripIEConditionals(html)
        try:
            HTMLParser.feed(parser, html)
        except MultipleRootNodeException:
            raise MultipleRootNodeException('Multiple nodes passed to createElementFromHTML method. Use #createElementsFromHTML instead to get a list of AdvancedTag elements.')

        rootNode = parser.getRoot()
        rootNode.remove()

        return rootNode
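A minimal usage sketch for the classmethod above. The AdvancedTag/MultipleRootNodeException names suggest this comes from the AdvancedHTMLParser package, but treat the import path and attribute names here as assumptions:

    from AdvancedHTMLParser import AdvancedHTMLParser

    tag = AdvancedHTMLParser.createElementFromHTML('<div id="note">Hello <b>world</b></div>')
    print(tag.tagName)             # div
    print(tag.getAttribute('id'))  # note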
Example #2
    def feed(self, token):
        ttype, tvalue, tstart, tend, tline = token
        self.line = tline

        # Handle whitespace
        (prev_row, prev_col) = self.lastPos
        (cur_row, cur_col) = tstart
        (end_row, end_col) = tend

        assert cur_row >= prev_row, "Unexpected jump in row"
        self.lastPos = (end_row, end_col)

        # are we now on a new line?
        if cur_row > prev_row:
            self._appendRows(cur_row - prev_row)

        # are we on a multiline statement?
        if end_row > cur_row:
            self._appendRows(end_row - cur_row)

        # interpret jumps on the same line as a single space
        if cur_row == prev_row and cur_col > prev_col:
            HTMLParser.feed(self, ' ')

        HTMLParser.feed(self, tvalue)
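For context, a small self-contained demonstration (Python 3 shown; an illustration, not code from the project above) of the 5-tuples produced by Python's tokenize module, which feed() unpacks as (ttype, tvalue, tstart, tend, tline). The triple-quoted string is a single token spanning two rows, which is the case the end_row > cur_row branch handles:

    import io
    import tokenize

    source = 'x = 1\ns = """two\nlines"""\n'
    for ttype, tvalue, tstart, tend, tline in tokenize.generate_tokens(io.StringIO(source).readline):
        # tstart/tend are (row, column) pairs, rows starting at 1
        print(tokenize.tok_name[ttype], repr(tvalue), tstart, tend)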
Example #3
    def feed(self, token):
        ttype, tvalue, tstart, tend, tline = token
        self.line = tline

        # Handle whitespace
        (prev_row, prev_col) = self.lastPos
        (cur_row, cur_col) = tstart
        (end_row, end_col) = tend

        assert cur_row >= prev_row, "Unexpected jump in row"
        self.lastPos = (end_row, end_col)

        # are we now on a new line?
        if cur_row > prev_row:
            self._appendRows(cur_row - prev_row)

        # are we on a multiline statement?
        if end_row > cur_row:
            self._appendRows(end_row - cur_row)

        # interpret jumps on the same line as a single space
        if cur_row == prev_row and cur_col > prev_col:
            HTMLParser.feed(self, ' ')

        if ttype != tokenize.COMMENT:
            HTMLParser.feed(self, tvalue)
        else:
            # comments go directly into the output since they
            # are not a part of the HTML
            self._appendString(tvalue)
Example #4
 def feed(self, data):
     """
     Main method for purifying HTML (overridden)
     """
     self.reset_purified()
     HTMLParser.feed(self, data)
     return self.html()
Example #5
    def feed(self, token):
        ttype, tvalue, tstart, tend, tline = token
        self.line = tline

        # Handle whitespace
        (prev_row, prev_col) = self.lastPos
        (cur_row, cur_col) = tstart
        (end_row, end_col) = tend

        assert cur_row >= prev_row, "Unexpected jump in row"
        self.lastPos = (end_row, end_col)

        # are we now on a new line?
        if cur_row > prev_row:
            self._appendRows(cur_row - prev_row)

        # are we on a multiline statement?
        if end_row > cur_row:
            self._appendRows(end_row - cur_row)

        # interpret jumps on the same line as a single space
        if cur_row == prev_row and cur_col > prev_col:
            HTMLParser.feed(self, ' ')

        if ttype != tokenize.COMMENT:
            HTMLParser.feed(self, tvalue)
        else:
            # comments go directly into the output since they
            # are not a part of the HTML
            self._appendString(tvalue)
Example #6
 def feed(self, data):
   from HTMLParser import HTMLParser
   data_with_br = data.replace("\n", "<br/>")
   HTMLParser.feed(self, data_with_br)
   if len(self.current_line) > 0:
     self.lines.append(self.current_line)
     self.current_line = ''
Example #7
	def feed(self, data):
		try:
			HTMLParser.feed(self, data)
			return self._topic_list
		except Exception, e:
			log(unicode(traceback.format_exc()))
			return None
Example #8
    def feed(self, data):
        no_cc = u'no closed captioning available'

        if u'<html' in data.lower():
            raise CaptionReadSyntaxError(u'SAMI File seems to be an HTML file.')
        elif no_cc in data.lower():
            raise CaptionReadSyntaxError(u'SAMI File contains "%s"' % no_cc)

        # try to find style tag in SAMI
        try:
            # prevent a BS4 error on huge SAMI files with unclosed tags
            index = data.lower().find(u"</head>")

            self.styles = self._css_parse(
                BeautifulSoup(data[:index]).find(u'style').get_text())
        except AttributeError:
            self.styles = {}

        # fix erroneous italics tags
        data = data.replace(u'<i/>', u'<i>')

        # fix awkward tags found in some SAMIs
        data = data.replace(u';>', u'>')
        try:
            HTMLParser.feed(self, data)
        except HTMLParseError as e:
            raise CaptionReadSyntaxError(e)

        # close any tags that remain in the queue
        while self.queue != deque([]):
            closing_tag = self.queue.pop()
            self.sami += u"</%s>" % closing_tag

        return self.sami, self.styles, self.langs
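The closing loop above drains whatever tags are still open; a standalone illustration with a hypothetical leftover queue (note that `while self.queue:` is the idiomatic spelling of `while self.queue != deque([])`):

    from collections import deque

    queue = deque([u'sync', u'p', u'i'])   # tags opened but never closed
    sami = u''
    while queue:                           # same test as: queue != deque([])
        sami += u"</%s>" % queue.pop()
    print(sami)                            # </i></p></sync>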
Example #9
def htmlstrip(html):
    #html = html.strip()
    #html = html.strip("http://")
    html = html.replace(u"<!面页章节尾部广告>", "")

    re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>',
                           re.I)  # script blocks
    re_script_1 = re.compile(r'<script type="text/javascript">.+</script>',
                             re.I)
    re_script_2 = re.compile(r'<script>.+</script>', re.I)
    re_script_3 = re.compile(r'<script&nbsp;type="text/javascript.+</script>',
                             re.I)
    re_comment = re.compile(r'<!--.+//-->', re.I)
    re_iframe = re.compile(r'<iframe.+</iframe>', re.I)
    html = re_script.sub('', html)  # strip script blocks
    html = re_script_1.sub('', html)  #strip script
    html = re_script_2.sub('', html)
    html = re_script_3.sub('', html)
    html = re_comment.sub('', html)
    html = re_iframe.sub('', html)

    html = html.replace('&nbsp;&nbsp;&nbsp;&nbsp;', '')
    html = html.replace('<br />', '\n')
    html = html.replace('<br>', '\n')
    html = html.replace('<br/>', '\n')
    html = html.replace('\n\n\n\n', '\n\n')
    #soup = BeautifulSoup(html, fromEncoding = "utf-8")
    #html = soup.prettify()

    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
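The snippet above monkey-patches handle_data on a bare parser, which works against Python 2's HTMLParser module. A minimal Python 3 sketch of the same tag-stripping idea, done by subclassing instead (names here are illustrative):

    from html.parser import HTMLParser

    class TagStripper(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.chunks = []

        def handle_data(self, data):
            self.chunks.append(data)   # collect only the text between tags

    def strip_tags(html):
        parser = TagStripper()
        parser.feed(html)
        parser.close()
        return ''.join(parser.chunks)

    print(strip_tags('<p>Hello <b>world</b>!</p>'))   # Hello world!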
Example #10
 def feed(self, data):
     from HTMLParser import HTMLParser
     data_with_br = data.replace("\n", "<br/>")
     HTMLParser.feed(self, data_with_br)
     if len(self.current_line) > 0:
         self.lines.append(self.current_line)
         self.current_line = ''
Example #11
    def feed(self, data):
        no_cc = u'no closed captioning available'

        if u'<html' in data.lower():
            raise CaptionReadSyntaxError(
                u'SAMI File seems to be an HTML file.')
        elif no_cc in data.lower():
            raise CaptionReadSyntaxError(u'SAMI File contains "%s"' % no_cc)

        # try to find style tag in SAMI
        try:
            # prevent a BS4 error on huge SAMI files with unclosed tags
            index = data.lower().find(u"</head>")

            self.styles = self._css_parse(
                BeautifulSoup(data[:index]).find(u'style').get_text())
        except AttributeError:
            self.styles = {}

        # fix erroneous italics tags
        data = data.replace(u'<i/>', u'<i>')

        # fix awkward tags found in some SAMIs
        data = data.replace(u';>', u'>')
        try:
            HTMLParser.feed(self, data)
        except HTMLParseError as e:
            raise CaptionReadSyntaxError(e)

        # close any tags that remain in the queue
        while self.queue != deque([]):
            closing_tag = self.queue.pop()
            self.sami += u"</%s>" % closing_tag

        return self.sami, self.styles, self.langs
Example #12
def htmlstrip(html):
    # html = html.strip()
    # html = html.strip("http://")
    html = html.replace(u"<!面页章节尾部广告>", "")

    re_script = re.compile("<\s*script[^>]*>[^<]*<\s*/\s*script\s*>", re.I)  # Script
    re_script_1 = re.compile(r'<script type="text/javascript">.+</script>', re.I)
    re_script_2 = re.compile(r"<script>.+</script>", re.I)
    re_script_3 = re.compile(r'<script&nbsp;type="text/javascript.+</script>', re.I)
    re_comment = re.compile(r"<!--.+//-->", re.I)
    re_iframe = re.compile(r"<iframe.+</iframe>", re.I)
    html = re_script.sub("", html)  # 去掉SCRIPT
    html = re_script_1.sub("", html)  # strip script
    html = re_script_2.sub("", html)
    html = re_script_3.sub("", html)
    html = re_comment.sub("", html)
    html = re_iframe.sub("", html)

    html = html.replace("&nbsp;&nbsp;&nbsp;&nbsp;", "")
    html = html.replace("<br />", "\n")
    html = html.replace("<br>", "\n")
    html = html.replace("<br/>", "\n")
    html = html.replace("\n\n\n\n", "\n\n")
    # soup = BeautifulSoup(html, fromEncoding = "utf-8")
    # html = soup.prettify()

    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return "".join(result)
Example #13
    def createElementFromHTML(cls, html, encoding='utf-8'):
        '''
            createElementFromHTML - Creates an element from a string of HTML.

                If this could create multiple root-level elements (children are okay),
                  you must use #createElementsFromHTML which returns a list of elements created.

            @param html <str> - Some html data

            @param encoding <str> - Encoding to use for document

            @raises MultipleRootNodeException - If given html would produce multiple root-level elements (use #createElementsFromHTML instead)

            @return AdvancedTag - A single AdvancedTag

            NOTE: If there is text outside the root tag, it will be lost.
              Use createBlocksFromHTML instead if you need to retain both text and tags.

              Also, if you are just appending to an existing tag, use AdvancedTag.appendInnerHTML
        '''

        parser = cls(encoding=encoding)

        html = stripIEConditionals(html)
        try:
            HTMLParser.feed(parser, html)
        except MultipleRootNodeException:
            raise MultipleRootNodeException(
                'Multiple nodes passed to createElementFromHTML method. Use #createElementsFromHTML instead to get a list of AdvancedTag elements.'
            )

        rootNode = parser.getRoot()
        rootNode.remove()

        return rootNode
Example #14
    def feed(self, data):
        logger.debug("\nStart CourseHTMLParser")
        logger.debug(u"周 节")

        HTMLParser.feed(self, data)

        return table_content  # note: table_content is not defined here; presumably a global filled in by the tag handlers
Example #15
class HTMLDocParser(TextParser):
    """Concrete HTML parser"""
    def __init__(self, data, config=[]):
        super(HTMLDocParser, self).__init__(data, config)
        self.__parsedData = ""
        self.__nativeHTMLParser = HTMLParser()
        self.__collect = False

    def parse(self):
        self.__nativeHTMLParser.handle_data = self.__handle_data
        self.__nativeHTMLParser.handle_starttag = self.__handle_starttag
        self.__nativeHTMLParser.feed(self._data)

        return self.__parsedData

    def __handle_starttag(self, tag, attrs):
        if 'acceptedTags' not in self._config:
            raise LookupError('Config does not contain HTML tag whitelist')
        if tag in self._config['acceptedTags']:
            self.__collect = True

    def __handle_data(self, data):
        if self.__collect:
            self.__parsedData += data + "\n"
            self.__collect = False
Example #16
 def strip_tags(self, html):
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     return ''.join(result)
Example #17
    def feed(self, raw_data):
        assert isinstance(raw_data, unicode), "feed data must be unicode!"
        data = raw_data.strip()

        # cut out <pre> and <tt> block tag areas
        data = block_re.sub(self._pre_cut_out, data)
        data = inline_re.sub(self._pre_cut_out, data)

        # Delete whitespace from html code
        data = strip_html(data)

        if self.debugging:
            print "_" * 79
            print "raw data:"
            print repr(raw_data)
            print " -" * 40
            print "cleaned data:"
            print data
            print "-" * 79
#            print clean_data.replace(">", ">\n")
#            print "-"*79

        HTMLParser.feed(self, data)

        return self.root
Example #18
def strip_tags(html):
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ' '.join(result)
Example #19
def cmd_genpot(config, options):
    """Generate the gettext pot file"""

    os.chdir(config.source_dir)

    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)

    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                file_path = os.path.relpath(os.path.join(root, file_name),
                                            config.source_dir)
                python_files.append(file_path)
    python_files.sort()

    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    f = open(pot_file, 'w')
    f.write('#: activity/activity.info:2\n')
    f.write('msgid "%s"\n' % escaped_name)
    f.write('msgstr ""\n')
    if config.summary is not None:
        escaped_summary = _po_escape(config.summary)
        f.write('#: activity/activity.info:3\n')
        f.write('msgid "%s"\n' % escaped_summary)
        f.write('msgstr ""\n')

    if config.description is not None:
        parser = HTMLParser()
        strings = []
        parser.handle_data = strings.append
        parser.feed(config.description)

        for s in strings:
            s = s.strip()
            if s:
                f.write('#: activity/activity.info:4\n')
                f.write('msgid "%s"\n' % _po_escape(s))
                f.write('msgstr ""\n')
    f.close()

    args = [
        'xgettext', '--join-existing', '--language=Python', '--keyword=_',
        '--add-comments=TRANS:',
        '--output=%s' % pot_file
    ]

    args += python_files
    retcode = subprocess.call(args)
    if retcode:
        print 'ERROR - xgettext failed with return code %i.' % retcode
Example #20
    def crawl_item(self, url):
        self.__data = {}

        for i in range(1, self.__retryMax):
            self.output_log("crawling " + url + " ... retry:" + str(i))
            tmpCont = self.request_url(url)
            if not tmpCont:
                continue
            if tmpCont.readline() == 'no data':
                self.output_log("---\t no data")
                return

            tmpSoup = self.parse_web_page(tmpCont.read())
            bbCode = tmpSoup.find(id='bbcode_content')
            try:
                self.__data['img'] = re.compile(r'\[img\](.*)\[\/img\]').findall(bbCode.prettify())[0]
            except:
                self.__data['img'] = ''
            try:
                self.__data['quality'] = re.compile(r'(\d)').findall(tmpSoup.find(id='item_detail').find('h2')['class'][0])[0]
            except:
                self.__data['quality'] = ''
            try:
                self.__data['name'] = tmpSoup.find(id='item_detail').find('strong').text
            except:
                self.__data['name'] = ''
            try:
                self.__data['id'] = re.compile(r'ID:([0-9]*)').findall(tmpSoup.find(id='item_detail').find('span').text)[0]
            except:
                self.__data['id'] = ''
            try:
                self.__data['qnumber'] = tmpSoup.find(id='item_detail').find(id='ilv').text
            except:
                self.__data['qnumber'] = ''
            try:
                self.__data['position'] = tmpSoup.find(id='item_detail').find('table').find('table').find('th').text
            except:
                self.__data['position'] = ''
            try:
                self.__data['html'] = tmpSoup.find(id='main').find_all('div')[1].prettify()
            except:
                self.__data['html'] = ''
            try:
                # strip html tags from the attribute block
                parser = HTMLParser()
                tmpList = []
                parser.handle_data = tmpList.append
                parser.feed(tmpSoup.find(id='item_detail').find(id='_dps').prettify().strip("\n"))
                parser.close()
                self.__data['attribute'] = ''.join(tmpList)
            except:
                self.__data['attribute'] = ''
            # delete temporary variables
            del parser, tmpList, tmpSoup, bbCode, tmpCont

            if not self.__data:
                continue

            return self.save_to_db(self.__data)
Example #21
def stripTags(html):
    html = html.strip().strip('\r').strip('\n').strip(u' ')
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #22
 def feed(self, page):
     self.urls = []  #Cleaning old urls
     self.page = page
     try:
         html = unicode(str(page.html).decode('utf-8'))
     except:
         html = unicode(page.html, errors='ignore')
     HTMLParser.feed(self, html)
Example #23
def html_parser(content):
    content = content.strip()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(content)
    parser.close()
    return ''.join(result)
Example #24
def html_parser(content):
    content = content.strip()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(content)
    parser.close()
    return ''.join(result)
Example #25
    def feed(self, data):
        """Goes through the string and throws out handle events as it comes across
        tags.

        :param data:
        """
        HTMLParser.feed(self, data)
        self.addLabelFromConstructed()
Example #26
def stripTags(html):
    html = html.strip().strip('\r').strip('\n').strip(u' ')
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #27
def parse(data):
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(data)
    data= "".join(result)
    parser.close()
    return data
Example #28
 def feed(self, text):
     # reset all previous values
     #self.reset()
     self.__init__()
     # remove scripts and spans, because they are malformed
     text = re.compile("<script([^>]*)>.*?</script>", re.DOTALL).sub("", text)
     text = re.compile("<span([^>]*)>.*?</span>", re.DOTALL).sub("", text)
     HTMLParser.feed(self, text)
Example #29
 def feed(self, data):
     try:
         if self.encoding is not None:
             data = data.decode(self.encoding)
         HTMLParser.feed(self, data)
     except UnicodeDecodeError:
         print "BADSEED!"
         print self.encoding
         print self.rootname
Example #30
 def feed(self, tags, data):
     ret = -1
     self.tags = []  # reset tag lists
     self.taglistcnt = 0
     self.ret = []
     ret = self.set_tags(tags)
     if ret > 0:
         HTMLParser.feed(self, data)
     return ret
Example #31
 def feed(self, data):
     HTMLParser.feed(self, data)
     tmp = SafeHTMLParser.multi_replace(data)
     tmp = SafeHTMLParser.uriregex1.sub(
             r'<a href="\1">\1</a>',
             unicode(tmp, 'utf-8', 'replace'))
     tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
     tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
     self.raw += tmp
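For orientation, a generic illustration of the linkify step performed by uriregex1 above; the real SafeHTMLParser patterns are project-specific, so this simple stand-in regex is an assumption and will not cover every URL form:

    import re

    simple_uri = re.compile(r'(https?://[^\s<>"]+)')
    text = 'see https://example.com/page for details'
    # wrap each bare URL in an anchor tag, as the snippet does via uriregex1
    print(simple_uri.sub(r'<a href="\1">\1</a>', text))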
Example #32
    def feed(self, url, contents):
        contents = self._script_pattern.sub('', contents)
        try:
            HTMLParser.feed(self, contents)
        except UnicodeDecodeError, err:
            sys.stderr.write((u'Unicode decoding problem when processing contents of "%s".\n' % url).encode('utf-8'))
            sys.stderr.write(repr(err) + '\n')
            raise err
Example #33
def strip_tags(htmlStr):
    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
Example #34
    def feed(self, data):
        """

        """

        self.struct.clear()
        HTMLParser.feed(self, data)

        return self.struct.outmost
Example #35
def strip_tags(file_path):
    input_file = open(file_path)
    raw_data = input_file.read()
    plain_text = []
    parser = HTMLParser()
    parser.handle_data = plain_text.append
    parser.feed(raw_data)
    parser.close()
    return ''.join(plain_text)
Example #36
 def strip_tags(self, htmlStr):
     htmlStr = htmlStr.strip()
     htmlStr = htmlStr.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(htmlStr)
     parser.close()
     return ''.join(result)
Example #37
    def feed(self, data):
        """

        """

        self.structure.clear()
        HTMLParser.feed(self, normalize_string(data))

        return self.structure.outmost
Example #38
def strip_tags(file_path):
    input_file = open(file_path)
    raw_data = input_file.read()
    plain_text = []
    parser = HTMLParser()
    parser.handle_data = plain_text.append
    parser.feed(raw_data)
    parser.close()
    return ''.join(plain_text)
Example #39
 def test_download_report_with_html_format(self, client, report):
     r_format = client.list_report_formats(name="HTML").data[0]
     response = client.download_report(uuid=report["@id"],
                                       format_uuid=r_format["@id"])
     assert isinstance(response, six.string_types)
     parser = HTMLParser()
     parser.feed(response)
     parser.close()
     assert parser
Example #40
def strip_tags(html):
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
Example #41
def strip_tags(html):
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
Example #42
    def feed(self, data):
        """

        """

        self.struct.clear()
        HTMLParser.feed(self, data)

        return self.struct.outmost
Example #43
def saxify(html, handler, validate=False):
    if validate:
        validator = HTMLParser()
        # This will raise an exception if it cannot process the html
        validator.feed(html)
        validator.close()
    parser = Html2SaxParser(handler)
    parser.feed(html)
    parser.close()
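A hedged usage sketch for saxify above: Html2SaxParser is assumed to come from the same project, and the handler can be any xml.sax ContentHandler.

    from xml.sax.handler import ContentHandler

    class PrintHandler(ContentHandler):
        def startElement(self, name, attrs):
            print('start: %s' % name)

        def characters(self, content):
            print('text: %r' % content)

    # validate=True runs the input through a plain HTMLParser first
    saxify(u'<p>Hello <b>world</b></p>', PrintHandler(), validate=True)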
Example #44
def strip_tags(html, length):
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)[:length]
Example #45
def cmd_genpot(config, options):
    """Generate the gettext pot file"""

    os.chdir(config.source_dir)

    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)

    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                file_path = os.path.relpath(os.path.join(root, file_name),
                                            config.source_dir)
                python_files.append(file_path)
    python_files.sort()

    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    f = open(pot_file, 'w')
    f.write('#: activity/activity.info:2\n')
    f.write('msgid "%s"\n' % escaped_name)
    f.write('msgstr ""\n')
    if config.summary is not None:
        escaped_summary = _po_escape(config.summary)
        f.write('#: activity/activity.info:3\n')
        f.write('msgid "%s"\n' % escaped_summary)
        f.write('msgstr ""\n')

    if config.description is not None:
        parser = HTMLParser()
        strings = []
        parser.handle_data = strings.append
        parser.feed(config.description)

        for s in strings:
            s = s.strip()
            if s:
                f.write('#: activity/activity.info:4\n')
                f.write('msgid "%s"\n' % _po_escape(s))
                f.write('msgstr ""\n')
    f.close()

    args = ['xgettext', '--join-existing', '--language=Python',
            '--keyword=_', '--add-comments=TRANS:', '--output=%s' % pot_file]

    args += python_files
    retcode = subprocess.call(args)
    if retcode:
        print 'ERROR - xgettext failed with return code %i.' % retcode
Example #46
def strip_tags(html):
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #47
 def strip_content(self):
     html_string = self.content.strip()
     html_string = html_string.strip('\n')
     res = []
     parser = HTMLParser()
     parser.handle_data = res.append
     parser.feed(html_string)
     parser.close()
     content_string = ''.join(res)
     return content_string.encode('utf-8')
Example #48
def html_strip(html):
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
Example #49
    def feed(self, data):
        """

        """
        if not data:
            return None
        self.structure.clear()
        HTMLParser.feed(self, clear_string(data))

        return self.structure.outmost
Example #50
 def to_text(s):
     if s is None: return None
     html = s.strip()
     html = html.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     return ''.join(result)
Example #51
def htmlstrip(html):
    html = html.strip()
    html = html.replace('</>', '')
    html = html.strip("http://")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #52
 def to_text(s):
     if s is None: return None
     html = s.strip()
     html = html.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     return ''.join(result)
Example #53
def get_text(html):
    parse = HTMLParser()

    html = html.strip().strip('\n')
    result = []
    parse.handle_data = result.append

    parse.feed(html)
    parse.close()

    return "".join(result)
Example #54
 def feed(self, data):
     try:
         quit_on_done_backup = self.quit_on_done
         path_backup = self.path
         self.reset()
         self.path = path_backup
         self.quit_on_done = quit_on_done_backup
         HTMLParser.feed(self, data)
     except HTMLParseError, msg:
         if not self.quit_on_done or "DONE PROCESSING" not in msg.msg:
             raise HTMLParseError(msg.msg, self.getpos())
Example #55
 def ultimate_rip(data):
     """Вспомогательная функция вычищения тэгов. Оставляет ничего"""
     ripper = HTMLParser()
     from types import MethodType
     ripper.handle_data = MethodType(lambda self, d: self.fed.append(d),
                                     ripper, HTMLParser)
     ripper.get_data = MethodType(lambda self: u''.join(self.fed), ripper,
                                  HTMLParser)
     ripper.fed = []
     ripper.feed(data)
     return ripper.get_data()
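Note that the three-argument form used above, MethodType(func, instance, cls), is Python 2 only. On Python 3, types.MethodType takes just (function, instance); a minimal sketch of the same monkey-patching trick there:

    import types
    from html.parser import HTMLParser

    ripper = HTMLParser()
    ripper.fed = []
    # bind the lambda to this one instance; feed() will find it as handle_data
    ripper.handle_data = types.MethodType(lambda self, d: self.fed.append(d), ripper)
    ripper.feed('<p>Hello</p>')
    print(''.join(ripper.fed))   # Hello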
Example #56
 def feed(self, data):
     try:
         data = unicode(data, 'utf-8')
     except UnicodeDecodeError:
         data = unicode(data, 'utf-8', errors='replace')
     HTMLParser.feed(self, data)
     tmp = SafeHTMLParser.replace_pre(data)
     tmp = SafeHTMLParser.uriregex1.sub(r'<a href="\1">\1</a>', tmp)
     tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
     tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
     tmp = SafeHTMLParser.replace_post(tmp)
     self.raw += tmp
Example #57
    def feed(self, data):
        """
        :type data: string
        :rtype: Root

        """
        if not data:
            return None
        self.structure.clear()
        HTMLParser.feed(self, data)

        return self.structure.outmost