Beispiel #1
0
def htmlstrip(html):
    """Strip scripts, comments, iframes and all other HTML markup from
    *html*, returning plain text with <br> tags converted to newlines.

    A site-specific ad marker comment is removed first.
    """
    html = html.replace(u"<!面页章节尾部广告>", "")  # drop chapter-footer ad marker

    # Raw strings: '\s' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).  '.+?' is non-greedy so text BETWEEN
    # two script/comment/iframe blocks is preserved instead of swallowed.
    re_script = re.compile(r"<\s*script[^>]*>[^<]*<\s*/\s*script\s*>", re.I)
    re_script_1 = re.compile(r'<script type="text/javascript">.+?</script>', re.I)
    re_script_2 = re.compile(r"<script>.+?</script>", re.I)
    re_script_3 = re.compile(r'<script&nbsp;type="text/javascript.+?</script>', re.I)
    re_comment = re.compile(r"<!--.+?//-->", re.I)
    re_iframe = re.compile(r"<iframe.+?</iframe>", re.I)
    html = re_script.sub("", html)  # strip <script> blocks
    html = re_script_1.sub("", html)
    html = re_script_2.sub("", html)
    html = re_script_3.sub("", html)
    html = re_comment.sub("", html)
    html = re_iframe.sub("", html)

    # Normalise indentation entities and <br> variants before parsing.
    html = html.replace("&nbsp;&nbsp;&nbsp;&nbsp;", "")
    html = html.replace("<br />", "\n")
    html = html.replace("<br>", "\n")
    html = html.replace("<br/>", "\n")
    html = html.replace("\n\n\n\n", "\n\n")

    result = []
    parser = HTMLParser()
    # Route every text node into the accumulator; tags are discarded.
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return "".join(result)
Beispiel #2
0
    def close(self):
        """Finish parsing.

        Delegates to HTMLParser.close(), which forces processing of all
        buffered data as if it were followed by an end-of-file mark; a
        derived class overriding close() should still call the base method.
        """
        if dbg:
            pass

        HTMLParser.close(self)
Beispiel #3
0
 def close(self):
     """Flush the final pending text run into the result, then finish."""
     if self.lastText:
         # The trailing text never saw a following gap; record it now.
         self._endGap()
     HTMLParser.close(self)
Beispiel #4
0
def htmlstrip(html):
    """Strip scripts, comments, iframes and all other HTML markup from
    *html*, returning plain text with <br> tags converted to newlines.

    A site-specific ad marker comment is removed first.
    """
    html = html.replace(u"<!面页章节尾部广告>", "")  # drop chapter-footer ad marker

    # Raw strings: '\s' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).  '.+?' is non-greedy so text BETWEEN
    # two script/comment/iframe blocks is preserved instead of swallowed.
    re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)
    re_script_1 = re.compile(r'<script type="text/javascript">.+?</script>', re.I)
    re_script_2 = re.compile(r'<script>.+?</script>', re.I)
    re_script_3 = re.compile(r'<script&nbsp;type="text/javascript.+?</script>', re.I)
    re_comment = re.compile(r'<!--.+?//-->', re.I)
    re_iframe = re.compile(r'<iframe.+?</iframe>', re.I)
    html = re_script.sub('', html)  # strip <script> blocks
    html = re_script_1.sub('', html)
    html = re_script_2.sub('', html)
    html = re_script_3.sub('', html)
    html = re_comment.sub('', html)
    html = re_iframe.sub('', html)

    # Normalise indentation entities and <br> variants before parsing.
    html = html.replace('&nbsp;&nbsp;&nbsp;&nbsp;', '')
    html = html.replace('<br />', '\n')
    html = html.replace('<br>', '\n')
    html = html.replace('<br/>', '\n')
    html = html.replace('\n\n\n\n', '\n\n')

    result = []
    parser = HTMLParser()
    # Route every text node into the accumulator; tags are discarded.
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Beispiel #5
0
 def strip_tags(self, html):
     """Return the text content of *html* with all tags removed."""
     pieces = []
     stripper = HTMLParser()
     # Route every text node straight into the accumulator list.
     stripper.handle_data = pieces.append
     stripper.feed(html)
     stripper.close()
     return ''.join(pieces)
Beispiel #6
0
 def close(self):
     """Emit end-element events for every tag still open, then finish."""
     # Unwind the tag stack so the SAX handler sees a balanced document.
     while self._stack:
         self._handler.endElementNS((None, self._stack.pop()), None)
     HTMLParser.close(self)
Beispiel #7
0
def strip_tags(html):
    """Strip markup from *html*, joining the text nodes with spaces."""
    fragments = []
    tag_filter = HTMLParser()
    # Every data chunk the parser sees lands in `fragments`.
    tag_filter.handle_data = fragments.append
    tag_filter.feed(html)
    tag_filter.close()
    # Note: chunks are glued with a single space, not concatenated.
    return ' '.join(fragments)
Beispiel #8
0
    def close(self):
        """Finish parsing: flush the final section, validate the title
        expectation, and build self.parse_result.

        Warnings accumulated during parsing (unfinished entries, missing
        or unexpected titles) are attached to the ParseResult.
        """
        HTMLParser.close(self)

        self._OnSectionBoundary()

        if self._processing_entry is not None:
            # BUG FIX: the original referenced the undefined name `parser`
            # here, raising NameError whenever this warning fired.
            self._warnings.append(
                'Finished parsing while still processing a <%s>' %
                self._processing_entry._tag)

        if self._expect_title:
            if not self._title_entry:
                self._warnings.append('Expected a title')
                title, title_attributes = '', {}
            else:
                title, title_attributes = (self._title_entry.name,
                                           self._title_entry.attributes)
        else:
            if self._title_entry:
                self._warnings.append('Found unexpected title "%s"' %
                                      self._title_entry.name)
            title, title_attributes = None, None

        self.parse_result = ParseResult(title, title_attributes,
                                        self._sections, self._warnings)
Beispiel #9
0
    def crawl_item(self, url):
        """Crawl the item page at *url*, extract its fields into
        self.__data, and persist them via save_to_db().

        Retries up to self.__retryMax times on empty responses.  Each
        field extraction is deliberately best-effort: any parse failure
        leaves that field as ''.  Returns None when the server reports
        'no data' for this item.
        """
        self.__data = {}

        for i in range(1, self.__retryMax):
            self.output_log("crawling " + url + " ... retry:" + str(i))
            tmpCont = self.request_url(url)
            if not tmpCont :
                continue
            # First response line signals whether the item exists at all.
            if tmpCont.readline() == 'no data':
                self.output_log("---\t no data")
                return

            tmpSoup = self.parse_web_page(tmpCont.read())
            bbCode = tmpSoup.find(id='bbcode_content')
            # Image URL embedded as BB-code [img]...[/img].
            try :
                self.__data['img'] = re.compile(r'\[img\](.*)\[\/img\]').findall(bbCode.prettify())[0]
            except:
                self.__data['img'] =  ''
            # Quality digit encoded in the <h2> CSS class name.
            try :
                self.__data['quality'] = re.compile(r'(\d)').findall(tmpSoup.find(id='item_detail').find('h2')['class'][0])[0]
            except:
                self.__data['quality'] =  ''
            try :
                self.__data['name'] = tmpSoup.find(id='item_detail').find('strong').text
            except:
                self.__data['name'] =  ''
            # Numeric item id parsed out of the "ID:<digits>" span text.
            try :
                self.__data['id'] = re.compile(r'ID:([0-9]*)').findall(tmpSoup.find(id='item_detail').find('span').text)[0]
            except:
                self.__data['id'] =  ''
            try :
                self.__data['qnumber'] = tmpSoup.find(id='item_detail').find(id='ilv').text
            except:
                self.__data['qnumber'] =  ''
            try :
                self.__data['position'] = tmpSoup.find(id='item_detail').find('table').find('table').find('th').text
            except:
                self.__data['position'] =  ''
            try :
                self.__data['html'] = tmpSoup.find(id='main').find_all('div')[1].prettify()
            except:
                self.__data['html'] =  ''
            try :
                """ strip html tag """
                parser = HTMLParser()
                tmpList = []
                parser.handle_data = tmpList.append
                parser.feed(tmpSoup.find(id='item_detail').find(id='_dps').prettify().strip("\n"))
                parser.close()
                self.__data['attribute'] = ''.join(tmpList)
            except:
                self.__data['attribute'] = ''
            """ del temporary variables"""
            del(parser,tmpList,tmpSoup,bbCode,tmpCont)

            if not self.__data:
                continue

            return self.save_to_db(self.__data)
Beispiel #10
0
 def close(self):
     """Finish parsing and write the UTF-8 encoded output to stdout."""
     HTMLParser.close(self)
     self._commit_block()
     encoded = self.output_buffer.encode('utf-8')
     # Python 3's stdout exposes a binary .buffer; Python 2's does not.
     target = getattr(sys.stdout, 'buffer', sys.stdout)
     target.write(encoded)
Beispiel #11
0
def parse(data):
    """Return the text content of *data* with HTML tags removed."""
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(data)
    # close() BEFORE joining: close() flushes any text still buffered by
    # the parser, which the original discarded by joining first.
    parser.close()
    return "".join(result)
 def close(self):
     """Record the trailing text run, if any, then finish parsing."""
     if self.lastText:
         # Last text chunk was never followed by a gap; close it out.
         self._endGap()
     HTMLParser.close(self)
Beispiel #13
0
def html_parser(content):
    """Strip HTML tags from *content* and return the plain text."""
    text = content.strip()
    chunks = []
    extractor = HTMLParser()
    # Text nodes accumulate in `chunks`; markup is dropped.
    extractor.handle_data = chunks.append
    extractor.feed(text)
    extractor.close()
    return ''.join(chunks)
Beispiel #14
0
 def close(self):
     """Fill in the last bit of result, then delegate to the base close."""
     if self.lastText:
         self._endGap()  # flush the final, gap-less text run
     HTMLParser.close(self)
Beispiel #15
0
 def close(self):
     """Commit the final block and emit the buffered output as UTF-8."""
     HTMLParser.close(self)
     self._commit_block()
     payload = self.output_buffer.encode('utf-8')
     if hasattr(sys.stdout, 'buffer'):
         # Python 3: write raw bytes through the binary layer.
         sys.stdout.buffer.write(payload)
     else:
         sys.stdout.write(payload)
def stripTags(html):
    """Trim surrounding whitespace and remove all HTML tags from *html*."""
    cleaned = html.strip().strip('\r').strip('\n').strip(u' ')
    parts = []
    remover = HTMLParser()
    remover.handle_data = parts.append
    remover.feed(cleaned)
    remover.close()
    return ''.join(parts)
Beispiel #17
0
def html_parser(content):
    """Return the text of *content* with HTML markup filtered out."""
    src = content.strip()
    collected = []
    p = HTMLParser()
    p.handle_data = collected.append
    p.feed(src)
    p.close()
    return ''.join(collected)
Beispiel #18
0
 def close(self):
     """Flush the in-progress item into the results list, then finish."""
     if self.item is not None:
         self.results.append(self.item)
     self.item = None
     HTMLParser.close(self)
Beispiel #19
0
def stripTags(html):
    """Strip edge whitespace, then all HTML tags, from *html*."""
    text = html.strip().strip('\r').strip('\n').strip(u' ')
    data_chunks = []
    tag_stripper = HTMLParser()
    tag_stripper.handle_data = data_chunks.append
    tag_stripper.feed(text)
    tag_stripper.close()
    return ''.join(data_chunks)
Beispiel #20
0
 def strip_tags(self, htmlStr):
     """Strip HTML tags from *htmlStr* and return the plain text."""
     source = htmlStr.strip().strip("\n")
     texts = []
     machine = HTMLParser()
     machine.handle_data = texts.append
     machine.feed(source)
     machine.close()
     return ''.join(texts)
Beispiel #21
0
        def close(self):
            """Finish parsing and return the collected text, or None when
            too large a fraction of the input produced errors."""
            HTMLParser.close(self)
            text = self.__buf.getvalue()
            self.__buf.close()

            # Accept the result unless errors dominate: tolerate up to one
            # error per three processed items.
            acceptable = (self.__ignore_errors or self.__errors == 0
                          or self.__processed > 3*self.__errors)
            return text if acceptable else None
Beispiel #22
0
	def close(self, *args, **kwargs):
		"""Finish parsing and return the generated buffer.

		Raises Exception when tags or stack frames are still open.
		"""
		HTMLParser.close(self, *args, **kwargs)
		if self.tree:
			raise Exception("Unclosed tags: %s" % repr(self.tree))
		if self.stack:
			raise Exception("Unclosed stack: %s" % repr(self.stack))
		# Terminate the generated function body.
		self.buffr += "return ctx;"
		self.buffr += "}"
		return self.buffr
Beispiel #23
0
def strip_tags(htmlStr):
    """Return *htmlStr* with all HTML tags removed."""
    body = htmlStr.strip().strip("\n")
    gathered = []
    walker = HTMLParser()
    # The parser hands us each text node; tags never reach the output.
    walker.handle_data = gathered.append
    walker.feed(body)
    walker.close()
    return ''.join(gathered)
Beispiel #24
0
def strip_tags(html, length):
    """Strip HTML tags from *html* and truncate the text to *length* chars."""
    content = html.strip().strip("\n")
    out = []
    sieve = HTMLParser()
    sieve.handle_data = out.append
    sieve.feed(content)
    sieve.close()
    # Truncate after joining so the limit applies to the visible text.
    return ''.join(out)[:length]
def strip_tags(file_path):
    """Read the file at *file_path* and return its text with HTML tags
    removed.

    Uses a context manager so the file handle is always closed (the
    original leaked the handle by never calling close()).
    """
    with open(file_path) as input_file:
        raw_data = input_file.read()
    plain_text = []
    parser = HTMLParser()
    parser.handle_data = plain_text.append
    parser.feed(raw_data)
    parser.close()
    return ''.join(plain_text)
Beispiel #26
0
def saxify(html, handler, validate=False):
    """Feed *html* through Html2SaxParser, emitting SAX events on *handler*.

    When *validate* is true the markup is first run through a plain
    HTMLParser, which raises if it cannot process the input.
    """
    if validate:
        checker = HTMLParser()
        checker.feed(html)
        checker.close()
    sax_source = Html2SaxParser(handler)
    sax_source.feed(html)
    sax_source.close()
Beispiel #27
0
 def test_download_report_with_html_format(self, client, report):
     """Download a report as HTML and check that it parses cleanly."""
     html_format = client.list_report_formats(name="HTML").data[0]
     response = client.download_report(uuid=report["@id"],
                                       format_uuid=html_format["@id"])
     assert isinstance(response, six.string_types)
     # Feeding the payload through HTMLParser raises on malformed markup.
     checker = HTMLParser()
     checker.feed(response)
     checker.close()
     assert checker
Beispiel #28
0
    def close(self):
        """Finish parsing; if a wrap-parse context is still open, rewrite a
        synthetic end tag for it and return that rewritten remainder."""
        result = ''
        if self._wb_parse_context:
            result = self.rewrite('</' + self._wb_parse_context + '>')
            self._wb_parse_context = None

        HTMLParser.close(self)
        return result
Beispiel #29
0
	def output(self):
		"""Finish parsing and return the accumulated output.

		Raises ValueError if a tag was left unmatched.
		"""
		HTMLParser.close(self)

		if self.__parse_depth != 0:
			# Raising a plain string is a TypeError at runtime on every
			# modern Python; raise a real exception instead.
			raise ValueError('unmatched tag (%s)' % self.__parse_tag)

		self.__consume()

		return self.__output
Beispiel #30
0
def strip_tags(html):
    """Drop every HTML tag in *html* and return the concatenated text."""
    trimmed = html.strip().strip("\n")
    buf = []
    hp = HTMLParser()
    hp.handle_data = buf.append
    hp.feed(trimmed)
    hp.close()
    return "".join(buf)
Beispiel #31
0
def strip_tags(html):
    """Remove HTML markup from *html*, keeping only the text content."""
    document = html.strip().strip("\n")
    text_parts = []
    scanner = HTMLParser()
    # Each text node the scanner encounters is appended verbatim.
    scanner.handle_data = text_parts.append
    scanner.feed(document)
    scanner.close()
    return "".join(text_parts)
Beispiel #32
0
def strip_tags(file_path):
    """Return the text content of the HTML file at *file_path*.

    The file is opened with a context manager so the handle is always
    closed (the original never closed it).
    """
    with open(file_path) as input_file:
        raw_data = input_file.read()
    plain_text = []
    parser = HTMLParser()
    parser.handle_data = plain_text.append
    parser.feed(raw_data)
    parser.close()
    return ''.join(plain_text)
Beispiel #33
0
 def strip_content(self):
     """Return self.content with HTML tags removed, UTF-8 encoded."""
     text = self.content.strip().strip('\n')
     fragments = []
     parser = HTMLParser()
     parser.handle_data = fragments.append
     parser.feed(text)
     parser.close()
     # Encode at the boundary; callers receive UTF-8 bytes.
     return ''.join(fragments).encode('utf-8')
Beispiel #34
0
def htmlstrip(html):
    """Strip tags from *html*; also drop empty '</>' tags and a leading
    'http://' prefix."""
    html = html.strip()
    html = html.replace('</>', '')
    # BUG FIX: str.strip("http://") removes any of the characters
    # h/t/p/:// from BOTH ends rather than the literal prefix; remove the
    # intended prefix explicitly instead.
    if html.startswith("http://"):
        html = html[len("http://"):]
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Beispiel #35
0
 def to_text(s):
     """Return *s* stripped of HTML tags, or None when *s* is None."""
     # Identity check is the idiomatic (and safe) None test.
     if s is None:
         return None
     html = s.strip()
     html = html.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     return ''.join(result)
Beispiel #36
0
 def to_text(s):
     """Strip HTML tags from *s*; returns None if *s* is None."""
     # `is None` instead of `None == s`: identity is the correct None test.
     if s is None:
         return None
     html = s.strip()
     html = html.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     return ''.join(result)
Beispiel #37
0
def htmlstrip(html):
    """Strip tags from *html*, drop empty '</>' tags, and remove a leading
    'http://' prefix."""
    html = html.strip()
    html = html.replace('</>', '');
    # BUG FIX: strip("http://") removed the character set h/t/p/:// from
    # both ends, not the prefix.  Remove the literal prefix instead.
    if html.startswith("http://"):
        html = html[len("http://"):]
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Beispiel #38
0
def strip_tags(html):
    """Strip HTML tags from *html* and return the remaining text.

    Imports HTMLParser compatibly: the `HTMLParser` module is Python 2
    only; on Python 3 the class lives in `html.parser`.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    # Collect every data chunk the parser sees; tags are discarded.
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def html_strip(html):
    """Strip HTML tags from *html* and return the remaining text.

    Uses a version-compatible HTMLParser import: the `HTMLParser` module
    exists only on Python 2; Python 3 provides `html.parser`.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
Beispiel #40
0
def get_text(html):
    """Return the text content of *html* with markup removed."""
    stripper = HTMLParser()
    cleaned = html.strip().strip('\n')
    texts = []
    # All character data flows into `texts`; markup is ignored.
    stripper.handle_data = texts.append
    stripper.feed(cleaned)
    stripper.close()
    return "".join(texts)
Beispiel #41
0
    def _internal_close(self):
        """Close any outstanding wrap-parse context, then finish parsing."""
        if self._wb_parse_context:
            # Synthesize the missing end tag so downstream handlers see it.
            self.feed('</' + self._wb_parse_context + '>')
            self._wb_parse_context = None

        try:
            HTMLParser.close(self)
        except HTMLParseError:  # pragma: no cover
            # HTMLParseError is only ever raised by the Python 2.6 parser.
            pass
Beispiel #42
0
def get_text(html):
    """Strip all markup from *html*, returning only the text nodes."""
    extractor = HTMLParser()

    normalized = html.strip().strip('\n')
    pieces = []
    extractor.handle_data = pieces.append

    extractor.feed(normalized)
    extractor.close()

    return "".join(pieces)
Beispiel #43
0
    def _internal_close(self):
        """Flush a pending wrap-parse context as an end tag, then close."""
        ctx = self._wb_parse_context
        if ctx:
            self.feed('</' + ctx + '>')
            self._wb_parse_context = None

        try:
            HTMLParser.close(self)
        except HTMLParseError:  # pragma: no cover
            # Only the Python 2.6 parser ever raises this.
            pass
Beispiel #44
0
def parse_html(html):
    """Strip HTML tags from *html* and remove ALL whitespace from the text."""
    source = html.strip().strip("\n")
    chunks = []
    parser = HTMLParser()
    parser.handle_data = chunks.append
    parser.feed(source)
    parser.close()
    joined = ''.join(chunks)
    # split() drops every whitespace run; rejoining with '' removes them.
    return ''.join(joined.split())
Beispiel #45
0
 def cleanHtmlTag(self, html):
     """Strip HTML tags from *html*, then run the secondary cleanup pass."""
     text = html.strip().strip("\n")
     chunks = []
     parser = HTMLParser()
     parser.handle_data = chunks.append
     parser.feed(text)
     parser.close()
     # Hand the tag-free text to the follow-up cleaner.
     return self.cleanHtmlAgain(''.join(chunks))
Beispiel #46
0
def strip_tags(html):
    """Best-effort HTML tag stripper: returns '' if anything goes wrong."""
    try:
        html = html.strip()
        result = []
        parse = HTMLParser()
        parse.handle_data = result.append
        parse.feed(html)
        parse.close()
        return "".join(result)
    except Exception as e:
        # print(e) is valid on both Python 2 and 3, unlike `print e`.
        print(e)
        return ''
 def strip_tags(self, htmlStr):
     """Strip <style> blocks and all HTML tags from *htmlStr* after
     unescaping HTML entities."""
     htmlStr = htmlStr.strip()
     htmlStr = htmlStr.strip("\n")
     # Remove <style>...</style> blocks (raw string: '\s' is an invalid
     # escape in a plain literal on Python 3.12+).
     re_style = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)
     htmlStr = re_style.sub('', htmlStr)
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     # HTMLParser.unescape() was removed in Python 3.9; prefer
     # html.unescape with a fallback for Python 2.
     try:
         from html import unescape
     except ImportError:
         unescape = parser.unescape
     htmlStr = unescape(htmlStr)
     parser.feed(htmlStr)
     parser.close()
     return ''.join(result)
Beispiel #48
0
def strip_tags(text):
    """
    Strip HTML markup from *text*.

    The HTMLParser import is version-compatible: the `HTMLParser` module
    is Python 2 only; on Python 3 the class lives in `html.parser`.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    text = text.strip()
    text = text.strip('\n')
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(text)
    parse.close()
    return ''.join(result)
Beispiel #49
0
def strip_tags(text):
    """
    Strip HTML markup from *text*.

    Works on both Python 2 (HTMLParser module) and Python 3 (html.parser).
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    text = text.strip()
    text = text.strip('\n')
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(text)
    parse.close()
    return ''.join(result)
Beispiel #50
0
 def strip_tags(self, htmlStr):
     '''
     Filter HTML tags out of *htmlStr* using HTMLParser.
     :param htmlStr: the markup to strip
     '''
     source = htmlStr.strip().strip("\n")
     collected = []
     tag_parser = HTMLParser()
     # Only character data reaches the accumulator; tags are dropped.
     tag_parser.handle_data = collected.append
     tag_parser.feed(source)
     tag_parser.close()
     return ''.join(collected)
def transform_html_text(html):
    """Strip tags from *html* and normalize all whitespace to single
    spaces."""
    source = html.strip().strip("\n")
    chunks = []
    parser = HTMLParser()
    parser.handle_data = chunks.append
    parser.feed(source)
    parser.close()
    text = " ".join(chunks)
    # Fold newlines/tabs into spaces, then squeeze repeated spaces.
    text = text.replace('\n', ' ').replace('\t', ' ')
    collapsed, _count = re.subn('  +', ' ', text)
    return collapsed
Beispiel #52
0
def strip_tags1(htmlStr):
    '''
    Filter HTML tags from *htmlStr* using HTMLParser.
    :param htmlStr: the markup to strip
    '''
    document = htmlStr.strip().strip("\n")
    texts = []
    tag_eater = HTMLParser()
    tag_eater.handle_data = texts.append
    tag_eater.feed(document)
    tag_eater.close()
    return ''.join(texts)
Beispiel #53
0
def strip_tags(html):
    """
    Filter HTML tags from *html*, returning the plain text.

    Example: strip_tags("<font color=red>hello</font>") -> "hello"

    The HTMLParser import is version-compatible: the `HTMLParser`
    module is Python 2 only; Python 3 provides `html.parser`.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Beispiel #54
0
    def strip_tags(self,htmlStr):
        """Remove control characters, UTF-8 BOM markers and HTML tags from
        *htmlStr*, returning the plain text.

        NOTE(review): written for Python 2 — re.sub(codecs.BOM_UTF8, ...)
        uses a bytes pattern and would fail against a Python 3 str;
        confirm the runtime before porting.
        """
        # Strip ASCII control characters (except \t, \n, \r).
        htmlStr = re.sub("[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f\\x7f]","",htmlStr)
        # Strip a UTF-8 byte-order mark, both as raw BOM bytes and as the
        # escaped \xef\xbb\xbf text form.
        htmlStr = re.sub(codecs.BOM_UTF8,"",htmlStr)
        htmlStr = re.sub("\\xef\\xbb\\xbf","",htmlStr)
        
        htmlStr = htmlStr.strip()
        htmlStr = htmlStr.strip("\n")

        # Collect text nodes; HTML tags are dropped by the parser.
        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(htmlStr)
        parser.close()
        return  ''.join(result)
Beispiel #55
0
def strip_tags(html):
    """
    Filter HTML tags from *html* and return the plain text.

    Example: strip_tags("<font color=red>hello</font>") -> "hello"
    """
    cleaned = html.strip().strip("\n")
    data_nodes = []
    filterer = HTMLParser()
    filterer.handle_data = data_nodes.append
    filterer.feed(cleaned)
    filterer.close()
    return ''.join(data_nodes)
Beispiel #56
0
def send_message():
    """Translate the POSTed chat message to English and SMS it to every
    member of the logged-in team via Twilio.

    Reads `userMessage` from the request form and the team id from the
    session; returns a JSON confirmation payload.
    """
    content = request.form.get('userMessage')


    # The text to translate
    text = content
    # The target language
    target = 'en'

    # Translates text
    translation = translate_client.translate(
        text,
        target_language=target)

    print(u'Text: {}'.format(text))
    print(u'Translation: {}'.format(translation['translatedText']))

    translated_text = translation['translatedText']

    # The translation API returns HTML-escaped text; unescape it before
    # sending.  NOTE(review): close() is called before unescape() — unescape
    # does not depend on parser state, so this is harmless but likely
    # unintended.
    h = HTMLParser()
    h.close()
    content = h.unescape(translated_text)

    print content

    team_id = session['id']
    teammates = db.session.query(Athlete).filter(Athlete.team_id == team_id).all()

    # create list of phone numbers for team
    phone_list = []
    for person in teammates:
        phone_list.append(person.a_phone)

    #  Cycle through whole team phone list
    print phone_list
    for number in phone_list:
        message = client.messages.create(
            to=number,
            from_=twilio_phone,
            body=content)

        print(message.sid)

    confirmation = "Message sent"

    return jsonify(message=confirmation)
Beispiel #57
0
 def close(self, *args, **kwargs):
     """Finish compilation: flush pending data, verify every tag and
     stack frame was closed, and return the concatenated output buffers.

     The '_r' (return) buffer is appended last so the generated code ends
     with its `return` statement.  Raises Exception on unbalanced input.
     """
     # A final None tells handle_data to flush whatever it is holding.
     self.handle_data(None)
     HTMLParser.close(self, *args, **kwargs)
     if len(self.stack):
         raise Exception("%s unclosed stack: %s" %
                         (self.name, repr(self.stack)))
     if len(self.tree):
         raise Exception("%s unclosed tags: %s" %
                         (self.name, repr(self.tree_names)))
     # Terminate the generated function body in the return buffer.
     self.buffers['_r'] += "return _c.$t;"
     self.buffers['_r'] += "};"
     # Pull the return buffer out so it can be emitted after all others.
     buffer_r = self.buffers['_r']
     del (self.buffers['_r'])
     final_buffer = ""
     for key, buff in self.buffers.iteritems():
         final_buffer += buff
     final_buffer += buffer_r
     return final_buffer
Beispiel #58
0
    def strip_tags_parser(self, html):
        """
        Remove HTML tags from *html* using HTMLParser, joining the text
        nodes with '$'.

        Example: strip_tags_parser("<font color=red>hello</font>")

        The HTMLParser import is version-compatible: the `HTMLParser`
        module exists only on Python 2; Python 3 provides `html.parser`.

        :return: String
        """
        try:
            from HTMLParser import HTMLParser  # Python 2
        except ImportError:
            from html.parser import HTMLParser  # Python 3
        html = html.strip('\n')
        html = html.strip('\t')
        html = html.strip(' ')
        html = html.strip()

        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(html)
        parser.close()
        # NOTE: text chunks are deliberately joined with '$'.
        return '$'.join(result)
Beispiel #59
0
def strip_tags(html):
    """
    Strip HTML tags from *html*, truncating the result to 2048 characters.

    Example: strip_tags("<font color=red>hello</font>") -> "hello"

    Imports HTMLParser compatibly: the `HTMLParser` module is Python 2
    only; on Python 3 the class lives in `html.parser`.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()

    r = ''.join(result)
    # Cap the output length for downstream storage limits.
    if len(r) >= 2048:
        r = r[:2048]
    return r