def handle_data(self, data):
     """ 处理文本元素 """
     HTMLParser.handle_data(self, data)
     for line in data.split(";"):
         (key, _, value) = line.partition("=")
         if key == "TRANSLATED_TEXT":
             print value
             break
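
The snippet above overrides handle_data in an HTMLParser subclass and scans each text node for TRANSLATED_TEXT=value pairs. As a hedged sketch, the same pattern in Python 3, where the class lives in html.parser (the sample payload is made up):

from html.parser import HTMLParser

class TranslationExtractor(HTMLParser):
    """Collect the value of every TRANSLATED_TEXT=... pair seen in text nodes."""

    def __init__(self):
        super().__init__()
        self.translated = []

    def handle_data(self, data):
        for line in data.split(";"):
            key, _, value = line.partition("=")
            if key == "TRANSLATED_TEXT":
                self.translated.append(value)

parser = TranslationExtractor()
parser.feed("<div>TRANSLATED_TEXT=hello;OTHER=1</div>")
print(parser.translated)  # ['hello']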
Example #3
 def handle_data(self, data):
     HTMLParser.handle_data(self, data)
     for i in range(0,self.taglistcnt):
         if self.tags[i][2][0]:
             self.ret[i]+='%s'%data
             for el in self.tags[i][1]:#reset the get flag
                 el[1]=False
             self.tags[i][2][0] = False 
Example #4
	def handle_data(self, data):
		HTMLParser.handle_data(self, data)
		print data
		l = re.findall(r'\b[\w,.]+?\b', data)
		if len(l) != 0:
			print l
		for i in l:
			if i in dic:
				if not (tmp[1] in dic[i]):
					dic[i].append(tmp[1])
			else:
				dic[i] = [tmp[1]]
Example #5
    def handle_data(self, data):
        HTMLParser.handle_data(self, data)
        if str_address in data:
            self.dicts[self.index]['server'] = data[data.find(':') + 1:]

        elif str_port in data:
            self.dicts[self.index]['server_port'] = data[data.find(':') + 1:]
        elif str_password in data:
            self.dicts[self.index]['password'] = data[data.find(':') + 1:]
        elif str_crypt_method in data:
            self.dicts[self.index]['method'] = data[data.find(':') + 1:]
            self.index += 1
Example #6
def strip_tags(html):
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ' '.join(result)
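
Most of the examples below rely on the same trick: assigning result.append to the instance's handle_data attribute shadows the class's no-op method, so every text node the parser sees lands in the list. A minimal Python 3 sketch of the idiom (note that convert_charrefs=True, the default there, also decodes character references into the text):

from html.parser import HTMLParser

def strip_tags(html):
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append  # instance attribute shadows the method
    parser.feed(html)
    parser.close()
    return ''.join(result)

print(strip_tags('<p>Hello <b>world</b>!</p>'))  # Hello world!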
Example #7
def htmlstrip(html):
    #html = html.strip()
    #html = html.strip("http://")
    html = html.replace(u"<!面页章节尾部广告>", "")

    re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>',
                           re.I)  #Script
    re_script_1 = re.compile(r'<script type="text/javascript">.+</script>',
                             re.I)
    re_script_2 = re.compile(r'<script>.+</script>', re.I)
    re_script_3 = re.compile(r'<script&nbsp;type="text/javascript.+</script>',
                             re.I)
    re_comment = re.compile(r'<!--.+//-->', re.I)
    re_iframe = re.compile(r'<iframe.+</iframe>', re.I)
    html = re_script.sub('', html)  # strip <script> blocks
    html = re_script_1.sub('', html)  #strip script
    html = re_script_2.sub('', html)
    html = re_script_3.sub('', html)
    html = re_comment.sub('', html)
    html = re_iframe.sub('', html)

    html = html.replace('&nbsp;&nbsp;&nbsp;&nbsp;', '')
    html = html.replace('<br />', '\n')
    html = html.replace('<br>', '\n')
    html = html.replace('<br/>', '\n')
    html = html.replace('\n\n\n\n', '\n\n')
    #soup = BeautifulSoup(html, fromEncoding = "utf-8")
    #html = soup.prettify()

    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
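
A caution on the regex pass above: with re.I alone, '.' does not match newlines, so a pattern like r'<script>.+</script>' misses multi-line scripts, and the greedy '.+' can swallow everything between the first and last script tag on a single line. A hedged single-pattern alternative (same pre-parse stripping idea, just tightened):

import re

RE_SCRIPT = re.compile(r'<script\b[^>]*>.*?</script>', re.I | re.S)

def drop_scripts(html):
    # Non-greedy plus DOTALL: removes each script block, multi-line included.
    return RE_SCRIPT.sub('', html)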
Example #9
 def strip_tags(self, html):
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     return ''.join(result)
def Action(url,ext='pdf',output='.'):

	lpdf = []
	urlpdf = []
	namepdf = []

	#domain
	index = url.rfind('/')
	domain = url[0:index+1]

	print domain

	request = urllib2.Request(url)
	response = urllib2.urlopen(request)

	#content
	content = response.read()

	#resource
	mode = r'<a[^>]+>[^<]+\.pdf[^>]+a>'
	lpdf = re.compile(mode).findall(content)
	parserurl = HTMLParser()
	parsername = HTMLParser()
	print lpdf
	for x in lpdf:
		sta = x.find("href=\"")+6
		end = x.find("\"",sta+1)
		urlpdf.append(x[sta:end])
		sta = x.find(">")+1;
		end = x.find("pdf",sta);
		namepdf.append(x[sta:end+3])
	#print len(namepdf),len(urlpdf)
	for i in range(len(urlpdf)):
		tmp = []
		parserurl.handle_data = tmp.append
		parserurl.feed(urlpdf[i])
		urlpdf[i] = '&'.join(tmp)
	parserurl.close()
	for i in range(len(namepdf)):
		tmp = []
		parsername.handle_data = tmp.append
		parsername.feed(namepdf[i])
		namepdf[i] = '&'.join(tmp)
	for i in range(len(urlpdf)):
		print urlpdf[i]
		print namepdf[i]
		urllib.urlretrieve(urlpdf[i],output + unicode(namepdf[i],"utf8"))
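
The Action crawler above slices href values out of the anchor markup by hand and assumes each link resolves against the page's directory. A hedged Python 3 sketch of the download step using urljoin, which handles relative and absolute hrefs alike (download_pdf is a hypothetical helper, not part of the original):

from urllib.parse import urljoin
from urllib.request import urlretrieve

def download_pdf(page_url, href, output_dir='.'):
    absolute = urljoin(page_url, href)  # resolves relative hrefs correctly
    name = absolute.rsplit('/', 1)[-1]  # last path segment as the file name
    urlretrieve(absolute, '%s/%s' % (output_dir, name))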
def cmd_genpot(config, options):
    """Generate the gettext pot file"""

    os.chdir(config.source_dir)

    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)

    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                file_path = os.path.relpath(os.path.join(root, file_name),
                                            config.source_dir)
                python_files.append(file_path)
    python_files.sort()

    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    f = open(pot_file, 'w')
    f.write('#: activity/activity.info:2\n')
    f.write('msgid "%s"\n' % escaped_name)
    f.write('msgstr ""\n')
    if config.summary is not None:
        escaped_summary = _po_escape(config.summary)
        f.write('#: activity/activity.info:3\n')
        f.write('msgid "%s"\n' % escaped_summary)
        f.write('msgstr ""\n')

    if config.description is not None:
        parser = HTMLParser()
        strings = []
        parser.handle_data = strings.append
        parser.feed(config.description)

        for s in strings:
            s = s.strip()
            if s:
                f.write('#: activity/activity.info:4\n')
                f.write('msgid "%s"\n' % _po_escape(s))
                f.write('msgstr ""\n')
    f.close()

    args = [
        'xgettext', '--join-existing', '--language=Python', '--keyword=_',
        '--add-comments=TRANS:',
        '--output=%s' % pot_file
    ]

    args += python_files
    retcode = subprocess.call(args)
    if retcode:
        print 'ERROR - xgettext failed with return code %i.' % retcode
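
For reference, the stub that cmd_genpot writes before invoking xgettext looks roughly like this ("My Activity" is an illustrative placeholder, not a real value from the source):

#: activity/activity.info:2
msgid "My Activity"
msgstr ""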
Example #12
    def crawl_item(self, url):
        self.__data = {}

        for i in range(1, self.__retryMax):
            self.output_log("crawling " + url + " ... retry:" + str(i))
            tmpCont = self.request_url(url)
            if not tmpCont :
                continue
            if tmpCont.readline() == 'no data':
                self.output_log("---\t no data")
                return

            tmpSoup = self.parse_web_page(tmpCont.read())
            bbCode = tmpSoup.find(id='bbcode_content')
            try :
                self.__data['img'] = re.compile(r'\[img\](.*)\[\/img\]').findall(bbCode.prettify())[0]
            except:
                self.__data['img'] =  ''
            try :
                self.__data['quality'] = re.compile(r'(\d)').findall(tmpSoup.find(id='item_detail').find('h2')['class'][0])[0]
            except:
                self.__data['quality'] =  ''
            try :
                self.__data['name'] = tmpSoup.find(id='item_detail').find('strong').text
            except:
                self.__data['name'] =  ''
            try :
                self.__data['id'] = re.compile(r'ID:([0-9]*)').findall(tmpSoup.find(id='item_detail').find('span').text)[0]
            except:
                self.__data['id'] =  ''
            try :
                self.__data['qnumber'] = tmpSoup.find(id='item_detail').find(id='ilv').text
            except:
                self.__data['qnumber'] =  ''
            try :
                self.__data['position'] = tmpSoup.find(id='item_detail').find('table').find('table').find('th').text
            except:
                self.__data['position'] =  ''
            try :
                self.__data['html'] = tmpSoup.find(id='main').find_all('div')[1].prettify()
            except:
                self.__data['html'] =  ''
            try :
                """ strip html tag """
                parser = HTMLParser()
                tmpList = []
                parser.handle_data = tmpList.append
                parser.feed(tmpSoup.find(id='item_detail').find(id='_dps').prettify().strip("\n"))
                parser.close()
                self.__data['attribute'] = ''.join(tmpList)
            except:
                self.__data['attribute'] = ''
            """ del temporary variables"""
            del parser, tmpList, tmpSoup, bbCode, tmpCont

            if not self.__data:
                continue

            return self.save_to_db(self.__data)
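
The chain of bare try/except blocks above swallows every error, including KeyboardInterrupt. A hedged sketch of a small helper that keeps the fall-back-to-empty behaviour while catching only the lookup errors such attribute chains actually raise (safe_extract is hypothetical, not part of the original):

def safe_extract(func, default=''):
    """Run func(); on an expected lookup failure, return default instead."""
    try:
        return func()
    except (AttributeError, IndexError, KeyError, TypeError):
        return default

# e.g.:
# data['name'] = safe_extract(lambda: soup.find(id='item_detail').find('strong').text)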
Example #13
def stripTags(html):
    html = html.strip().strip('\r').strip('\n').strip(u' ')
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #14
def parse(data):
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(data)
    parser.close()  # close before joining so any buffered text is flushed too
    return "".join(result)
Example #15
def html_parser(content):
    content = content.strip()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(content)
    parser.close()
    return ''.join(result)
Example #18
def strip_tags(htmlStr):
    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
Example #19
def strip_tags(html):
    html=html.strip()
    html=html.strip("\n")
    result=[]
    parse=HTMLParser()
    parse.handle_data=result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
Example #20
def strip_tags(html):
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
def strip_tags(file_path):
    input_file = open(file_path)
    raw_data = input_file.read()
    plain_text = []
    parser = HTMLParser()
    parser.handle_data = plain_text.append
    parser.feed(raw_data)
    parser.close()
    return ''.join(plain_text)
    def handle_data(self, data):
        if self.string_to_find in data.lower():
            if self.currentHtmFile not in self.text_found_in:
                self.text_found_in[self.currentHtmFile] = 1
            else:
                self.text_found_in[self.currentHtmFile] += 1

            if dbg: print "found in %s" % (self.currentHtmFile, )
        return HTMLParser.handle_data(self, data)
Example #24
def strip_tags(html, length):
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)[:length]
 def handle_data(self, data):
     tmp_data = data
     if self.start_tag not in ['script', 'link']:
         if not self.is_flag:
             if self.str_start_tag == self.get_starttag_text():
                 self.str_start_tag = ''
             else:
                 self.is_flag = True
         if self.is_flag:
             self.str_start_tag = self.get_starttag_text()
             self.is_flag = False
         str_starttag_text = '%s%s' % (self.str_start_tag, tmp_data)
         LST_SUB.append(str_starttag_text)
         regex = re.compile(u'(\\b[a-zA-Zа-яА-Я]{6}\\b)', re.U)
         tmp_data = regex.sub(r'\1&trade;', tmp_data)
         str_modify_text = '%s%s' % (self.str_start_tag, tmp_data)
         LST_SUB_MODIFY.append(str_modify_text)
     HTMLParser.handle_data(self, tmp_data)
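
The regex in this example appends a &trade; entity to every six-letter Latin or Cyrillic word. Isolated, the substitution looks like this (a small self-contained check):

import re

regex = re.compile(u'(\\b[a-zA-Zа-яА-Я]{6}\\b)', re.U)
print(regex.sub(r'\1&trade;', u'Python parser'))  # Python&trade; parser&trade;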
Example #26
 def strip_tags(self,htmlStr):
     htmlStr = htmlStr.strip()
     htmlStr = htmlStr.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(htmlStr)
     parser.close()
     return  ''.join(result)
Example #28
def htmlstrip(html):
    html = html.strip()
    html = html.replace('</>', '')
    # str.strip("http://") would only trim the characters h, t, p, :, / from
    # both ends; remove the scheme prefix explicitly instead.
    if html.startswith("http://"):
        html = html[len("http://"):]
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #29
 def strip_content(self):
     html_string = self.content.strip()
     html_string = html_string.strip('\n')
     res = []
     parser = HTMLParser()
     parser.handle_data = res.append
     parser.feed(html_string)
     parser.close()
     content_string = ''.join(res)
     return content_string.encode('utf-8')
Example #30
def strip_tags(html):
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #32
 def to_text(s):
     if s is None: return None
     html = s.strip()
     html = html.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     return ''.join(result)
def html_strip(html):
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
Example #35
def parse_html(html):
    html=html.strip()
    html=html.strip("\n")
    result=[]
    parse=HTMLParser()
    parse.handle_data=result.append
    parse.feed(html)
    parse.close()
    text = ''.join(result)  # avoid shadowing the built-in str
    return ''.join(text.split())  # drop all whitespace
Example #36
def get_text(html) :
  parse = HTMLParser()

  html = html.strip().strip('\n')
  result = []
  parse.handle_data = result.append

  parse.feed(html)
  parse.close()

  return "".join(result)
Example #37
def get_text(html):
    parse = HTMLParser()

    html = html.strip().strip('\n')
    result = []
    parse.handle_data = result.append

    parse.feed(html)
    parse.close()

    return "".join(result)
Example #38
 def cleanHtmlTag(self,html):
     html = html.strip()
     html = html.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     res = ''.join(result)
     res = self.cleanHtmlAgain(res)
     return res
Example #39
 def ultimate_rip(data):
     """Вспомогательная функция вычищения тэгов. Оставляет ничего"""
     ripper = HTMLParser()
     from types import MethodType
     ripper.handle_data = MethodType(lambda self, d: self.fed.append(d),
                                     ripper, HTMLParser)
     ripper.get_data = MethodType(lambda self: u''.join(self.fed), ripper,
                                  HTMLParser)
     ripper.fed = []
     ripper.feed(data)
     return ripper.get_data()
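
The MethodType calls above use the Python 2 three-argument form (function, instance, class). In Python 3, types.MethodType takes only (function, instance), and for this use a plain attribute assignment is enough; a hedged sketch:

from html.parser import HTMLParser

def ultimate_rip(data):
    ripper = HTMLParser()
    ripper.fed = []
    ripper.handle_data = ripper.fed.append  # no MethodType needed
    ripper.feed(data)
    ripper.close()
    return ''.join(ripper.fed)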
Example #40
def strip_tags(html):
    try:
        html = html.strip()
        result = []
        parse = HTMLParser()
        parse.handle_data = result.append
        parse.feed(html)
        parse.close()
        return "".join(result)
    except Exception as e:
        print e
        return ''
Example #41
    def strip_tags(self, html):
        if not html:
            return ""

        from HTMLParser import HTMLParser
        html=html.strip()
        html=html.strip("\n")
        result=[]
        parse=HTMLParser()
        parse.handle_data=result.append
        parse.feed(html)
        parse.close()
        return "".join(result)
Example #42
def strip_tags(text):
    """
    去除html标记
    """
    from HTMLParser import HTMLParser
    text = text.strip()
    text = text.strip('\n')
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(text)
    parse.close()
    return ''.join(result)
def transform_html_text(html):
    html=html.strip()
    html=html.strip("\n")
    result=[]
    parse=HTMLParser()
    parse.handle_data=result.append
    parse.feed(html)
    parse.close()
    text = " ".join(result)
    text = text.replace('\n',' ')
    text = text.replace('\t',' ')
    result, number = re.subn('  +', ' ', text)
    return result
Example #45
	def _content_to_list(self, content):
	
		try:
			parser = HTMLParser()
			parser.fed = []
			parser.handle_data = lambda d: parser.fed.append(d)
			
			parser.feed(content)
			return (''.join(parser.fed).encode('ascii', 'ignore')
					.translate(string.maketrans("",""), string.punctuation).lower().split())
		
		except:
			return list()
 def strip_tags(self, htmlStr):
     htmlStr = htmlStr.strip()
     htmlStr = htmlStr.strip("\n")
     # strip <style> tags
     re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)
     htmlStr = re_style.sub('', htmlStr)
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     htmlStr = parser.unescape(htmlStr)
     parser.feed(htmlStr)
     parser.close()
     return ''.join(result)
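
Note the order of operations above: unescape() runs before feed(), so escaped markup such as &lt;b&gt; is turned back into a live tag and then stripped as markup, losing text the author escaped on purpose. Feeding first and letting the parser decode character references avoids that. A small Python 3 illustration of the hazard:

from html import unescape
from html.parser import HTMLParser

def strip_after_unescape(s):
    out = []
    p = HTMLParser()
    p.handle_data = out.append
    p.feed(unescape(s))  # escaped markup becomes live tags before parsing
    p.close()
    return ''.join(out)

print(strip_after_unescape('&lt;b&gt;not a tag&lt;/b&gt;'))  # not a tag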
Example #48
 def strip_tags(self,htmlStr):
     '''
     Filter out HTML tags using HTMLParser.
     :param htmlStr:
     '''
     htmlStr = htmlStr.strip()
     htmlStr = htmlStr.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(htmlStr)
     parser.close()
     return  ''.join(result)
Example #49
def strip_tags(html):
    """
    Python中过滤HTML标签的函数
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #50
    def strip_tags(self,htmlStr):
        htmlStr = re.sub("[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f\\x7f]","",htmlStr)
        htmlStr = re.sub(codecs.BOM_UTF8,"",htmlStr)
        htmlStr = re.sub("\\xef\\xbb\\xbf","",htmlStr)
        
        htmlStr = htmlStr.strip()
        htmlStr = htmlStr.strip("\n")

        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(htmlStr)
        parser.close()
        return  ''.join(result)
Example #51
def strip_tags(html):
    """
    Python中过滤HTML标签的函数
    """
    if html is None:
        return ''
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #52
def strip_tags1(htmlStr):
    '''
    Filter out HTML tags using HTMLParser.
    :param htmlStr:
    '''

    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
Example #53
 def get_post_summary(self, html=''):
     """利用html返回一串纯文本"""
     from HTMLParser import HTMLParser
     html = html.strip()
     html = html.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     result = "<br/>".join(result)
     if len(result) > 200:
         result = result[0:200]
     return result
Example #54
def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #55
 def strip_tags(self, html):
     """
     Python中过滤HTML标签的函数
     >>> str_text=strip_tags("<font color=red>hello</font>")
     >>> print str_text
     hello
     """
     html = html.strip()
     html = html.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     return ''.join(result)
    def handle_data(self, data):

        result = self.regexp_for_search_compiled.search(data.lower())
        if result is not None:
            #  re.finditer(pattern, string) returns an iterator over MatchObject objects.
            MatchObjects = self.regexp_for_search_compiled.finditer(
                data.lower())

            for thisMatchObject in MatchObjects:
                if dbg: print thisMatchObject.start(), thisMatchObject.end()

                if self.currentHtmFile not in self.text_found_in:
                    self.text_found_in[self.currentHtmFile] = 1
                else:
                    self.text_found_in[self.currentHtmFile] += 1

            # if dbg: print "found in %s" % (self.currentHtmFile,)

        return HTMLParser.handle_data(self, data)
Example #57
def strip_tags(html):
    """
    Python strip html tags
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()

    r = ''.join(result)
    if len(r) >= 2048:
        r = r[:2048]
    return r
Example #58
    def strip_tags_parser(self, html):
        """
        去除文本中的HTML标签.用到了HTMLParser
        使用示例:
        str_text=strip_tags("<font color=red>hello</font>")

        :return: String
        """
        from HTMLParser import HTMLParser
        html = html.strip('\n')
        html = html.strip('\t')
        html = html.strip(' ')
        html = html.strip()

        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(html)
        parser.close()
        return '$'.join(result)
Example #59
def prepareString(string, length=0, strip_html=True):
    string = string.strip()

    if strip_html:
        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(string)
        parser.close()
        string = ''.join(result)
    else:
        string = html2safehtml(string,
                               valid_tags=("b", "a", "i", "br", "ul", "li",
                                           "strong"))

    if length > 0:
        string = string[0:length]
        string += "..."

    string = cgi.escape(string)

    return string
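
One Python 3 caveat for the last snippet: cgi.escape was deprecated in 3.2 and removed in 3.8; html.escape is the replacement (and it also escapes quotes by default):

import html

safe = html.escape('<b>5 > 3 & "quoted"</b>')
print(safe)  # &lt;b&gt;5 &gt; 3 &amp; &quot;quoted&quot;&lt;/b&gt;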