def handle_data(self, data):
    """Handle a text node: forward it to the base parser, then scan the
    text for a ``TRANSLATED_TEXT=value`` pair and print the value.

    The payload is expected to be ``;``-separated ``key=value`` pairs.
    """
    HTMLParser.handle_data(self, data)
    for line in data.split(";"):
        (key, _, value) = line.partition("=")
        if key == "TRANSLATED_TEXT":
            # print() call form behaves identically on Python 2 and 3
            # for a single argument (fixes the Py2-only print statement).
            print(value)
            break
def handle_data(self, data):
    """Handle a text node: delegate to the base parser, then look for the
    ``TRANSLATED_TEXT`` key among ``;``-separated ``key=value`` pairs and
    print its value (stopping at the first hit).
    """
    HTMLParser.handle_data(self, data)
    for line in data.split(';'):
        (key, _, value) = line.partition('=')
        if key == 'TRANSLATED_TEXT':
            # Single-argument print() works the same on Python 2 and 3;
            # replaces the Python-2-only print statement.
            print(value)
            break
def handle_data(self, data):
    # Accumulate this text node's content for every tag-pattern slot that
    # is currently "armed".
    # NOTE(review): the structure of self.tags is not visible here; it
    # appears that tags[i][2][0] is a "capture now" flag set elsewhere and
    # tags[i][1] holds per-attribute [name, matched] pairs — confirm
    # against the start-tag handler.
    HTMLParser.handle_data(self, data)
    for i in range(0,self.taglistcnt):
        if self.tags[i][2][0]:
            # Append the raw text to the collected output for slot i.
            self.ret[i]+='%s'%data
            for el in self.tags[i][1]:#reset the get flag
                el[1]=False
            # Disarm the slot until the next matching start tag.
            self.tags[i][2][0] = False
def handle_data(self, data):
    """Index the words of a text node into the module-level inverted
    index ``dic`` (word -> list of document ids), using ``tmp[1]`` as the
    id of the current document.

    Fixes: Python-2-only ``dict.has_key`` replaced with ``in``; print
    statements replaced with single-argument print() calls (identical
    output on both Python versions).
    """
    HTMLParser.handle_data(self, data)
    print(data)  # debug trace of the raw text node
    words = re.findall(r'\b[\w,.]+?\b', data)
    if words:
        print(words)
        for word in words:
            if word in dic:
                # Record each document at most once per word.
                if tmp[1] not in dic[word]:
                    dic[word].append(tmp[1])
            else:
                dic[word] = [tmp[1]]
def handle_data(self, data):
    # Parse one "key: value" line of a proxy-server listing into the
    # current record self.dicts[self.index]; the module-level marker
    # strings (str_address, str_port, str_password, str_crypt_method)
    # decide which field this text node carries.  The value is everything
    # after the first ':'.
    HTMLParser.handle_data(self, data)
    if str_address in data:
        self.dicts[self.index]['server'] = data[data.find(':') + 1:]
    elif str_port in data:
        self.dicts[self.index]['server_port'] = data[data.find(':') + 1:]
    elif str_password in data:
        self.dicts[self.index]['password'] = data[data.find(':') + 1:]
    elif str_crypt_method in data:
        self.dicts[self.index]['method'] = data[data.find(':') + 1:]
        # The crypt method is evidently the last field of a record, so
        # only here do we advance to the next entry.
        self.index += 1
def strip_tags(html):
    """Drop all HTML markup from *html*, joining the text nodes with
    single spaces."""
    texts = []
    scanner = HTMLParser()
    scanner.handle_data = texts.append
    scanner.feed(html)
    scanner.close()
    return ' '.join(texts)
def htmlstrip(html):
    # Return the plain text of an HTML chapter page: removes a
    # site-specific ad placeholder, <script> blocks, HTML comments and
    # iframes, converts <br> variants to newlines, then strips all
    # remaining tags.
    #html = html.strip()
    #html = html.strip("http://")
    # Site-specific ad marker (the Chinese text is the literal marker
    # emitted by the site; do not translate).
    html = html.replace(u"<!面页章节尾部广告>", "")
    re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I) #Script
    re_script_1 = re.compile(r'<script type="text/javascript">.+</script>', re.I)
    re_script_2 = re.compile(r'<script>.+</script>', re.I)
    re_script_3 = re.compile(r'<script type="text/javascript.+</script>', re.I)
    re_comment = re.compile(r'<!--.+//-->', re.I)
    re_iframe = re.compile(r'<iframe.+</iframe>', re.I)
    html = re_script.sub('', html) # strip SCRIPT blocks
    html = re_script_1.sub('', html) #strip script
    html = re_script_2.sub('', html)
    html = re_script_3.sub('', html)
    html = re_comment.sub('', html)
    html = re_iframe.sub('', html)
    # Removes every occurrence of this space-like character entirely —
    # presumably intentional for CJK prose; confirm the exact character.
    html = html.replace(' ', '')
    # Normalise line breaks before the tag stripping below.
    html = html.replace('<br />', '\n')
    html = html.replace('<br>', '\n')
    html = html.replace('<br/>', '\n')
    html = html.replace('\n\n\n\n', '\n\n')
    #soup = BeautifulSoup(html, fromEncoding = "utf-8")
    #html = soup.prettify()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def htmlstrip(html):
    # Plain-text extractor for a chapter page (duplicate of the other
    # htmlstrip, formatted with double quotes): strips a site-specific ad
    # marker, scripts, comments and iframes, turns <br> variants into
    # newlines, then removes all remaining tags.
    # html = html.strip()
    # html = html.strip("http://")
    # The Chinese text is the site's literal ad-marker tag; keep verbatim.
    html = html.replace(u"<!面页章节尾部广告>", "")
    re_script = re.compile("<\s*script[^>]*>[^<]*<\s*/\s*script\s*>", re.I)  # Script
    re_script_1 = re.compile(r'<script type="text/javascript">.+</script>', re.I)
    re_script_2 = re.compile(r"<script>.+</script>", re.I)
    re_script_3 = re.compile(r'<script type="text/javascript.+</script>', re.I)
    re_comment = re.compile(r"<!--.+//-->", re.I)
    re_iframe = re.compile(r"<iframe.+</iframe>", re.I)
    html = re_script.sub("", html)  # strip SCRIPT blocks
    html = re_script_1.sub("", html)  # strip script
    html = re_script_2.sub("", html)
    html = re_script_3.sub("", html)
    html = re_comment.sub("", html)
    html = re_iframe.sub("", html)
    # Removes every occurrence of this space-like character — presumably
    # intentional for CJK prose; confirm the exact character.
    html = html.replace(" ", "")
    html = html.replace("<br />", "\n")
    html = html.replace("<br>", "\n")
    html = html.replace("<br/>", "\n")
    html = html.replace("\n\n\n\n", "\n\n")
    # soup = BeautifulSoup(html, fromEncoding = "utf-8")
    # html = soup.prettify()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return "".join(result)
def strip_tags(self, html):
    """Return *html* with every tag removed, keeping only text nodes."""
    fragments = []
    extractor = HTMLParser()
    extractor.handle_data = fragments.append
    extractor.feed(html)
    extractor.close()
    return ''.join(fragments)
def Action(url,ext='pdf',output='.'):
    # Download every PDF linked from *url* into the *output* directory.
    # NOTE(review): Python-2-only code (urllib2, print statements,
    # unicode()).  `ext` is accepted but never used — the ".pdf" pattern
    # is hard-coded; `domain` is computed but unused.
    lpdf = []
    urlpdf = []
    namepdf = []
    #domain
    index = url.rfind('/')
    domain = url[0:index+1];
    print domain
    request = urllib2.Request(url);
    response = urllib2.urlopen(request);
    #content
    content = response.read()
    #resource
    # Anchors whose visible text ends in ".pdf".
    mode = '<a[^>]+>[^<]+.pdf[^>]+a>'
    lpdf = re.compile(mode).findall(content)
    parserurl = HTMLParser()
    parsername = HTMLParser()
    print lpdf
    for x in lpdf:
        # href="..." target of the anchor.
        sta = x.find("href=\"")+6
        end = x.find("\"",sta+1)
        urlpdf.append(x[sta:end])
        # Visible link text up to and including "pdf".
        sta = x.find(">")+1;
        end = x.find("pdf",sta);
        namepdf.append(x[sta:end+3])
    #print len(namepdf),len(urlpdf)
    # Un-escape entities in the extracted URLs ('&'.join restores the
    # '&' separators the parser split on).
    for i in range(len(urlpdf)):
        tmp = []
        parserurl.handle_data = tmp.append
        parserurl.feed(urlpdf[i])
        urlpdf[i] = '&'.join(tmp);
        # NOTE(review): close() inside the loop closes the parser after
        # the first iteration — later feed() calls hit a closed parser.
        parserurl.close()
    for i in range(len(namepdf)):
        tmp = []
        parsername.handle_data = tmp.append
        parsername.feed(namepdf[i])
        namepdf[i] = '&'.join(tmp);
    for i in range(len(urlpdf)):
        print urlpdf[i]
        print namepdf[i]
        urllib.urlretrieve(urlpdf[i],output + unicode(namepdf[i],"utf8"))
def cmd_genpot(config, options):
    """Generate the gettext pot file"""
    # Collects every .py file under config.source_dir, seeds the .pot
    # with the activity name / summary / description (so those msgids
    # appear exactly once), then lets xgettext merge the code strings in.
    os.chdir(config.source_dir)
    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)
    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                # Paths are stored relative to the source dir so the
                # .pot references stay stable across checkouts.
                file_path = os.path.relpath(os.path.join(root, file_name), config.source_dir)
                python_files.append(file_path)
    python_files.sort()
    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    f = open(pot_file, 'w')
    f.write('#: activity/activity.info:2\n')
    f.write('msgid "%s"\n' % escaped_name)
    f.write('msgstr ""\n')
    if config.summary is not None:
        escaped_summary = _po_escape(config.summary)
        f.write('#: activity/activity.info:3\n')
        f.write('msgid "%s"\n' % escaped_summary)
        f.write('msgstr ""\n')
    if config.description is not None:
        # The description may contain markup; harvest only its text
        # nodes and emit each non-blank one as its own msgid.
        parser = HTMLParser()
        strings = []
        parser.handle_data = strings.append
        parser.feed(config.description)
        for s in strings:
            s = s.strip()
            if s:
                f.write('#: activity/activity.info:4\n')
                f.write('msgid "%s"\n' % _po_escape(s))
                f.write('msgstr ""\n')
    f.close()
    args = [ 'xgettext', '--join-existing', '--language=Python',
        '--keyword=_', '--add-comments=TRANS:', '--output=%s' % pot_file ]
    args += python_files
    retcode = subprocess.call(args)
    if retcode:
        # Python-2 print statement (legacy code).
        print 'ERROR - xgettext failed with return code %i.' % retcode
def crawl_item(self, url):
    # Crawl one item page: scrape image bbcode, quality, name, id, item
    # level, slot position, raw html and attribute text into self.__data,
    # then persist via save_to_db.  Retries up to self.__retryMax times;
    # a page that answers "no data" aborts silently.
    self.__data = {}
    for i in range(1, self.__retryMax):
        self.output_log("crawling " + url + " ... retry:" + str(i))
        tmpCont = self.request_url(url)
        if not tmpCont :
            continue
        if tmpCont.readline() == 'no data':
            self.output_log("---\t no data")
            return
        tmpSoup = self.parse_web_page(tmpCont.read())
        bbCode = tmpSoup.find(id='bbcode_content')
        # Every field below is best-effort: a missing node (or any other
        # error — the excepts are deliberately bare) falls back to ''.
        try :
            self.__data['img'] = re.compile(r'\[img\](.*)\[\/img\]').findall(bbCode.prettify())[0]
        except:
            self.__data['img'] = ''
        try :
            # Quality is encoded as a digit in the <h2> class name.
            self.__data['quality'] = re.compile(r'(\d)').findall(tmpSoup.find(id='item_detail').find('h2')['class'][0])[0]
        except:
            self.__data['quality'] = ''
        try :
            self.__data['name'] = tmpSoup.find(id='item_detail').find('strong').text
        except:
            self.__data['name'] = ''
        try :
            self.__data['id'] = re.compile(r'ID:([0-9]*)').findall(tmpSoup.find(id='item_detail').find('span').text)[0]
        except:
            self.__data['id'] = ''
        try :
            self.__data['qnumber'] = tmpSoup.find(id='item_detail').find(id='ilv').text
        except:
            self.__data['qnumber'] = ''
        try :
            self.__data['position'] = tmpSoup.find(id='item_detail').find('table').find('table').find('th').text
        except:
            self.__data['position'] = ''
        try :
            self.__data['html'] = tmpSoup.find(id='main').find_all('div')[1].prettify()
        except:
            self.__data['html'] = ''
        try :
            """ strip html tag """
            parser = HTMLParser()
            tmpList = []
            parser.handle_data = tmpList.append
            parser.feed(tmpSoup.find(id='item_detail').find(id='_dps').prettify().strip("\n"))
            parser.close()
            self.__data['attribute'] = ''.join(tmpList)
        except:
            self.__data['attribute'] = ''
        """ del temporary variables"""
        del(parser,tmpList,tmpSoup,bbCode,tmpCont)
        if not self.__data:
            continue
        return self.save_to_db(self.__data)
def stripTags(html):
    """Trim surrounding whitespace (including full-width spaces) from
    *html*, then return its text with all tags removed."""
    html = html.strip().strip('\r').strip('\n').strip(u'　')
    texts = []
    tag_eater = HTMLParser()
    tag_eater.handle_data = texts.append
    tag_eater.feed(html)
    tag_eater.close()
    return ''.join(texts)
def parse(data):
    """Feed *data* through an HTMLParser and return only its character
    data, concatenated."""
    collected = []
    hp = HTMLParser()
    hp.handle_data = collected.append
    hp.feed(data)
    text = "".join(collected)
    hp.close()
    return text
def html_parser(content):
    """Strip surrounding whitespace and all HTML tags from *content*."""
    stripped = content.strip()
    out = []
    hp = HTMLParser()
    hp.handle_data = out.append
    hp.feed(stripped)
    hp.close()
    return ''.join(out)
def strip_tags(htmlStr):
    """Remove every HTML tag from *htmlStr* and return the bare text."""
    text_parts = []
    hp = HTMLParser()
    hp.handle_data = text_parts.append
    hp.feed(htmlStr.strip().strip("\n"))
    hp.close()
    return ''.join(text_parts)
def strip_tags(html):
    """Return the text nodes of *html* concatenated, markup removed."""
    prepared = html.strip().strip("\n")
    buf = []
    extractor = HTMLParser()
    extractor.handle_data = buf.append
    extractor.feed(prepared)
    extractor.close()
    return "".join(buf)
def strip_tags(html):
    """Delete every HTML tag from *html*, keeping only character data."""
    collected = []
    remover = HTMLParser()
    remover.handle_data = collected.append
    remover.feed(html.strip().strip("\n"))
    remover.close()
    return "".join(collected)
def strip_tags(file_path):
    """Read the file at *file_path* and return its contents with all
    HTML markup removed.

    Fix: the input file is now closed deterministically via a context
    manager (the original leaked the file handle).
    """
    with open(file_path) as input_file:
        raw_data = input_file.read()
    plain_text = []
    parser = HTMLParser()
    parser.handle_data = plain_text.append
    parser.feed(raw_data)
    parser.close()
    return ''.join(plain_text)
def handle_data(self, data):
    """Count, per source file, how many text nodes contain the search
    string (case-insensitive), recording hits in self.text_found_in.

    Fixes: Py2-only print statement replaced with print(); dict
    membership tested with ``not in`` instead of scanning ``.keys()``.
    """
    if self.string_to_find in data.lower():
        if self.currentHtmFile not in self.text_found_in:
            self.text_found_in[self.currentHtmFile] = 1
        else:
            self.text_found_in[self.currentHtmFile] += 1
        if dbg:
            print("found in %s" % (self.currentHtmFile, ))
    return HTMLParser.handle_data(self, data)
def strip_tags(html, length):
    """Strip HTML tags from *html* and truncate the resulting text to at
    most *length* characters."""
    body = html.strip().strip("\n")
    text_bits = []
    hp = HTMLParser()
    hp.handle_data = text_bits.append
    hp.feed(body)
    hp.close()
    return ''.join(text_bits)[:length]
def handle_data(self, data):
    # For each text node outside <script>/<link>, record the original
    # text and a modified copy into the module-level lists LST_SUB and
    # LST_SUB_MODIFY, each prefixed with the raw start-tag text; the
    # modification appends a trademark sign to every 6-letter word
    # (Latin or Cyrillic).
    tmp_data = data
    if self.start_tag not in ['script', 'link']:
        # NOTE(review): this flag dance appears intended to emit each
        # start tag's raw text only once (clearing str_start_tag when the
        # same tag is seen again) — confirm against the start-tag handler
        # that sets self.start_tag / self.is_flag.
        if not self.is_flag:
            if self.str_start_tag == self.get_starttag_text():
                self.str_start_tag = ''
            else:
                self.is_flag = True
        if self.is_flag:
            self.str_start_tag = self.get_starttag_text()
            self.is_flag = False
        str_starttag_text = '%s%s' % (self.str_start_tag, tmp_data)
        LST_SUB.append(str_starttag_text)
        # Words of exactly six letters get a '™' appended.
        regex = re.compile(u'(\\b[a-zA-Zа-яА-Я]{6}\\b)', re.U)
        tmp_data = regex.sub(r'\1™', tmp_data)
        str_modify_text = '%s%s' % (self.str_start_tag, tmp_data)
        LST_SUB_MODIFY.append(str_modify_text)
    HTMLParser.handle_data(self, tmp_data)
def strip_tags(self, htmlStr):
    """Strip every HTML tag from *htmlStr* and return the plain text."""
    cleaned = htmlStr.strip().strip("\n")
    nodes = []
    sink = HTMLParser()
    sink.handle_data = nodes.append
    sink.feed(cleaned)
    sink.close()
    return ''.join(nodes)
def cmd_genpot(config, options):
    """Generate the gettext pot file"""
    # Duplicate of the other cmd_genpot: gathers all .py files under
    # config.source_dir, seeds the .pot with activity name / summary /
    # description, then merges code strings via xgettext.
    os.chdir(config.source_dir)
    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)
    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                # Store paths relative to the source dir for stable refs.
                file_path = os.path.relpath(os.path.join(root, file_name), config.source_dir)
                python_files.append(file_path)
    python_files.sort()
    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    f = open(pot_file, 'w')
    f.write('#: activity/activity.info:2\n')
    f.write('msgid "%s"\n' % escaped_name)
    f.write('msgstr ""\n')
    if config.summary is not None:
        escaped_summary = _po_escape(config.summary)
        f.write('#: activity/activity.info:3\n')
        f.write('msgid "%s"\n' % escaped_summary)
        f.write('msgstr ""\n')
    if config.description is not None:
        # Harvest only the text nodes of the (possibly marked-up)
        # description, one msgid per non-blank string.
        parser = HTMLParser()
        strings = []
        parser.handle_data = strings.append
        parser.feed(config.description)
        for s in strings:
            s = s.strip()
            if s:
                f.write('#: activity/activity.info:4\n')
                f.write('msgid "%s"\n' % _po_escape(s))
                f.write('msgstr ""\n')
    f.close()
    args = ['xgettext', '--join-existing', '--language=Python',
        '--keyword=_', '--add-comments=TRANS:', '--output=%s' % pot_file]
    args += python_files
    retcode = subprocess.call(args)
    if retcode:
        # Python-2 print statement (legacy code).
        print 'ERROR - xgettext failed with return code %i.' % retcode
def htmlstrip(html):
    """Return the plain text of *html*: trims whitespace, drops empty
    '</>' artifacts, removes a leading 'http://' prefix, then strips all
    tags.

    Fix: the original used str.strip("http://"), which strips any of the
    characters h/t/p/:/'/' from BOTH ends of the string rather than
    removing the protocol prefix; replaced with an explicit prefix check.
    """
    html = html.strip()
    html = html.replace('</>', '')
    if html.startswith('http://'):
        html = html[len('http://'):]
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def strip_content(self):
    """Return self.content as UTF-8-encoded plain text with all HTML
    tags removed."""
    cleaned = self.content.strip().strip('\n')
    pieces = []
    hp = HTMLParser()
    hp.handle_data = pieces.append
    hp.feed(cleaned)
    hp.close()
    return ''.join(pieces).encode('utf-8')
def strip_tags(html):
    """Strip all HTML tags from *html* after trimming surrounding
    whitespace.

    Fix: the local import now falls back to ``html.parser`` so the
    function also works on Python 3, where the HTMLParser module was
    renamed.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def htmlstrip(html):
    """Plain-text extractor: trims whitespace, removes '</>' artifacts
    and a leading 'http://' prefix, then strips all remaining tags.

    Fix: str.strip("http://") strips the characters h/t/p/:/'/' from
    both ends instead of removing the prefix; replaced with an explicit
    startswith check.
    """
    html = html.strip()
    html = html.replace('</>', '')
    if html.startswith('http://'):
        html = html[len('http://'):]
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def to_text(s):
    """Convert HTML string *s* to plain text; returns None when *s* is
    None.

    Fix: identity comparison ``s is None`` replaces the Yoda-style
    ``None == s`` (PEP 8: comparisons to None use ``is``).
    """
    if s is None:
        return None
    html = s.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def to_text(s):
    """Return the plain text of HTML *s*, or None for None input.

    Fix: ``s is None`` replaces the Py-unidiomatic ``None == s`` test.
    """
    if s is None:
        return None
    html = s.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def html_strip(html):
    """Strip surrounding whitespace and all HTML tags from *html*.

    Fix: the local import falls back to ``html.parser`` so the function
    also runs on Python 3 (the module was renamed from HTMLParser).
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
def parse_html(html):
    """Strip HTML tags AND all whitespace from *html*.

    Fix: no longer shadows the builtin ``str``; the two-step
    split/join whitespace removal is collapsed into one expression.
    """
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    text = ''.join(result)
    # split() with no args splits on any whitespace run, so the join
    # removes every space/tab/newline inside the text.
    return ''.join(text.split())
def get_text(html):
    """Return the text content of *html* with every tag removed."""
    collector = HTMLParser()
    chunks = []
    collector.handle_data = chunks.append
    collector.feed(html.strip().strip('\n'))
    collector.close()
    return "".join(chunks)
def get_text(html):
    """Strip surrounding whitespace and all markup from *html*."""
    source = html.strip().strip('\n')
    fragments = []
    reader = HTMLParser()
    reader.handle_data = fragments.append
    reader.feed(source)
    reader.close()
    return "".join(fragments)
def cleanHtmlTag(self, html):
    """Strip HTML tags from *html*, then run the result through
    self.cleanHtmlAgain for a second cleaning pass."""
    trimmed = html.strip().strip("\n")
    texts = []
    hp = HTMLParser()
    hp.handle_data = texts.append
    hp.feed(trimmed)
    hp.close()
    return self.cleanHtmlAgain(''.join(texts))
def ultimate_rip(data):
    """Helper that scrubs every tag out of *data* and returns only the
    text (translated from the original Russian docstring: "helper for
    scrubbing tags — leaves nothing").

    Fix: the Python-2-only three-argument ``MethodType`` binding is
    replaced with plain attribute assignment; behavior is unchanged and
    the function now also runs on Python 3.
    """
    ripper = HTMLParser()
    ripper.fed = []
    ripper.handle_data = ripper.fed.append
    ripper.feed(data)
    return u''.join(ripper.fed)
def strip_tags(html):
    """Best-effort tag stripper: return the text of *html*, or '' when
    anything goes wrong (the error is printed, never raised).

    Fix: the Py2-only ``print e`` statement is replaced with print(e),
    which is valid on both Python 2 and 3.
    """
    try:
        html = html.strip()
        result = []
        parse = HTMLParser()
        parse.handle_data = result.append
        parse.feed(html)
        parse.close()
        return "".join(result)
    except Exception as e:
        print(e)
        return ''
def strip_tags(self, html):
    """Strip tags from *html*; empty or None input yields "".

    Fix: the local import falls back to ``html.parser`` so the method
    also works on Python 3.
    """
    if not html:
        return ""
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
def strip_tags(text):
    """Remove all HTML markup from *text* (translated from the original
    Chinese docstring: "strip HTML tags").

    Fix: the local import falls back to ``html.parser`` for Python 3
    compatibility.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    text = text.strip()
    text = text.strip('\n')
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(text)
    parse.close()
    return ''.join(result)
def transform_html_text(html):
    """Strip tags from *html* and normalise whitespace: text nodes are
    joined with spaces, newlines/tabs become spaces, and runs of spaces
    collapse to a single space."""
    nodes = []
    hp = HTMLParser()
    hp.handle_data = nodes.append
    hp.feed(html.strip().strip("\n"))
    hp.close()
    flattened = " ".join(nodes).replace('\n', ' ').replace('\t', ' ')
    collapsed, _ = re.subn(' +', ' ', flattened)
    return collapsed
def _content_to_list(self, content):
    # Tokenise an HTML document into a list of lowercase,
    # punctuation-free ASCII words.
    # NOTE(review): Python-2-only — the two-argument
    # str.translate(table, deletechars) form and string.maketrans("","")
    # do not exist on Python 3, where the bare except below would make
    # this always return [].
    try:
        parser = HTMLParser()
        parser.fed = []
        parser.handle_data = lambda d: parser.fed.append(d)
        parser.feed(content)
        return (''.join(parser.fed).encode('ascii', 'ignore')
                .translate(string.maketrans("",""), string.punctuation).lower().split())
    except:
        # Deliberate best-effort: any parse/encode failure yields an
        # empty word list rather than propagating.
        return list()
def strip_tags(self, htmlStr):
    # Strip <style> blocks, un-escape HTML entities, then remove all
    # remaining tags and return the text.
    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    # Remove <style>...</style> blocks (the original comment here was
    # mojibake for "remove style tags").
    re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)
    htmlStr = re_style.sub('', htmlStr)
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    # NOTE(review): HTMLParser.unescape() was deprecated and removed in
    # Python 3.9 — this code targets Python 2 / early Python 3.
    htmlStr = parser.unescape(htmlStr)
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
def _content_to_list(self, content):
    # Duplicate of the other _content_to_list: HTML -> list of lowercase
    # punctuation-free ASCII words.
    # NOTE(review): Python-2-only — string.maketrans / two-argument
    # str.translate were removed in Python 3, where the bare except
    # makes this always return [].
    try:
        parser = HTMLParser()
        parser.fed = []
        parser.handle_data = lambda d: parser.fed.append(d)
        parser.feed(content)
        return (''.join(parser.fed).encode('ascii', 'ignore').translate(
            string.maketrans("", ""), string.punctuation).lower().split())
    except:
        # Best-effort by design: swallow any failure and return no words.
        return list()
def strip_tags(self, htmlStr):
    """Filter HTML tags out of *htmlStr* using HTMLParser.

    :param htmlStr: markup to clean
    :return: the concatenated text nodes
    """
    source = htmlStr.strip().strip("\n")
    text_nodes = []
    hp = HTMLParser()
    hp.handle_data = text_nodes.append
    hp.feed(source)
    hp.close()
    return ''.join(text_nodes)
def strip_tags(html):
    """Filter HTML tags out of *html* and return the plain text
    (translated from the original Chinese docstring).

    Example: strip_tags("<font color=red>hello</font>") -> "hello"

    Fix: the local import falls back to ``html.parser`` for Python 3
    compatibility.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def strip_tags(self,htmlStr):
    # Remove control characters and any UTF-8 BOM (raw bytes or the
    # escaped \xef\xbb\xbf form), then strip all HTML tags and return
    # the text.
    # NOTE(review): Python-2-only — codecs.BOM_UTF8 is a bytes object,
    # so re.sub(codecs.BOM_UTF8, ...) against a str raises TypeError on
    # Python 3.
    htmlStr = re.sub("[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f\\x7f]","",htmlStr)
    htmlStr = re.sub(codecs.BOM_UTF8,"",htmlStr)
    htmlStr = re.sub("\\xef\\xbb\\xbf","",htmlStr)
    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
def strip_tags(html):
    """Filter HTML markup out of *html*; ``None`` input yields ''."""
    if html is None:
        return ''
    cleaned = html.strip().strip("\n")
    texts = []
    hp = HTMLParser()
    hp.handle_data = texts.append
    hp.feed(cleaned)
    hp.close()
    return ''.join(texts)
def strip_tags1(htmlStr):
    """Filter HTML tags out of *htmlStr* via HTMLParser.

    :param htmlStr: markup to clean
    """
    prepared = htmlStr.strip().strip("\n")
    pieces = []
    cleaner = HTMLParser()
    cleaner.handle_data = pieces.append
    cleaner.feed(prepared)
    cleaner.close()
    return ''.join(pieces)
def get_post_summary(self, html=''):
    """Return a plain-text summary of *html* (translated from the
    original Chinese docstring): the text nodes joined with "<br/>",
    truncated to 200 characters.

    Fix: the local import falls back to ``html.parser`` for Python 3
    compatibility.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    result = "<br/>".join(result)
    if len(result) > 200:
        result = result[0:200]
    return result
def strip_tags(html):
    """Filter HTML markup out of *html* and return the plain text.

    Example: strip_tags("<font color=red>hello</font>") -> "hello"
    """
    cleaned = html.strip().strip("\n")
    texts = []
    hp = HTMLParser()
    hp.handle_data = texts.append
    hp.feed(cleaned)
    hp.close()
    return ''.join(texts)
def strip_tags(self, html):
    """Filter HTML markup out of *html* and return the plain text.

    Example: strip_tags("<font color=red>hello</font>") -> "hello"
    """
    body = html.strip().strip("\n")
    fragments = []
    sink = HTMLParser()
    sink.handle_data = fragments.append
    sink.feed(body)
    sink.close()
    return ''.join(fragments)
def handle_data(self, data):
    """For every (case-insensitive) match of the precompiled search
    regex in this text node, bump the per-file hit counter in
    self.text_found_in.

    Fixes: Py2-only print statement replaced with print(); dict
    membership tested with ``not in`` instead of scanning ``.keys()``.
    """
    lowered = data.lower()
    if self.regexp_for_search_compiled.search(lowered) is not None:
        # finditer() yields one match object per occurrence.
        for match in self.regexp_for_search_compiled.finditer(lowered):
            if dbg:
                print(match.start(), match.end())
            if self.currentHtmFile not in self.text_found_in:
                self.text_found_in[self.currentHtmFile] = 1
            else:
                self.text_found_in[self.currentHtmFile] += 1
    # if dbg: print("found in %s" % (self.currentHtmFile,))
    return HTMLParser.handle_data(self, data)
def strip_tags(html):
    """Strip HTML tags from *html* and cap the result at 2048 chars.

    Example: strip_tags("<font color=red>hello</font>") -> "hello"

    Fix: the local import falls back to ``html.parser`` for Python 3
    compatibility.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    r = ''.join(result)
    if len(r) >= 2048:
        r = r[:2048]
    return r
def strip_tags_parser(self, html):
    """Strip HTML tags from *html*, joining the text nodes with '$'
    (translated from the original Chinese docstring).

    Example: "<font color=red>hello</font>" -> "hello"

    Fix: the local import falls back to ``html.parser`` for Python 3
    compatibility.

    :return: str
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip('\n')
    html = html.strip('\t')
    html = html.strip(' ')
    html = html.strip()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return '$'.join(result)
def prepareString(string, lenght=0, strip_html=True):
    # Prepare user-supplied markup for display: either strip ALL tags
    # (strip_html=True) or sanitise to a small tag whitelist via the
    # project helper html2safehtml; optionally truncate to `lenght`
    # characters with an "..." suffix, then HTML-escape the result.
    # NOTE(review): the parameter `string` shadows the stdlib module
    # name, `lenght` is a historical typo kept for keyword callers, and
    # cgi.escape was removed in Python 3.8 — legacy Python 2 code.
    string = string.strip()
    if (strip_html):
        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(string)
        parser.close()
        string = ''.join(result)
    else:
        string = html2safehtml(string, valid_tags=("b", "a", "i", "br", "ul", "li", "strong"))
    if lenght > 0:
        string = string[0:lenght]
        string += "..."
    string = cgi.escape(string)
    return string