def parse_page(title, text, proxy=None):
    """Return the page as ``title``, a blank line, then the body text.

    The body is not taken from ``text`` directly: ``r_content`` captures a
    URL, that URL is fetched through ``tools.get_url`` and the response is
    formatted with ``tools.format_html_text``.

    title -- fallback title, used when ``r_title`` does not match ``text``
             or the matched title has no '/'-separated second segment.
    text  -- page source searched with ``r_title`` and ``r_content``.
    proxy -- passed through unchanged to ``tools.get_url``.

    Returns ``title + '\\r\\n\\r\\n' + body``; ``body`` is '' when
    ``r_content`` does not match.

    NOTE(review): this function was recovered from a copy whose line breaks
    were lost.  The nesting below (UTF-8 conversion applied to the fallback
    title as well, ``else`` paired with the ``r_content`` check) is the most
    plausible reading -- confirm against the upstream file.
    """
    match = r_title.search(text)
    if match:
        segments = match.group(1).strip().split('/')
        # The second '/'-separated segment is the wanted title.  Indexing
        # [1] unconditionally raised IndexError for titles without a '/',
        # so in that case the caller-supplied title is kept instead.
        if len(segments) > 1:
            title = segments[1]
    title = tools.to_utf8(title)

    match = r_content.search(text)
    if match:
        url = match.group(1)
        body = tools.get_url(url, proxy).strip()
        # The fetched resource may be a one-statement script of the form
        # document.write('...'); keep only the quoted payload.
        prefix = "document.write('"
        suffix = "');"
        if body.startswith(prefix) and body.endswith(suffix):
            body = body[len(prefix):-len(suffix)]
        body = tools.format_html_text(body)
    else:
        body = ''
    return title + '\r\n' * 2 + body
def parse_page(title, text, proxy=None):
    """Return the page as ``title``, a blank line, then the body text.

    title -- fallback title, used when ``r_title`` does not match ``text``.
    text  -- page source; searched with ``r_title`` and ``r_content`` and
             handed to ``tools.get_encoding`` together with ``r_meta``.
    proxy -- not used by this function.

    Returns ``title + '\\r\\n\\r\\n' + body``; ``body`` is '' when
    ``r_content`` does not match.

    NOTE(review): recovered from a copy whose line breaks were lost; the
    UTF-8 conversion is assumed to apply to the fallback title as well --
    confirm against the upstream file.
    """
    encoding = tools.get_encoding(r_meta, text)

    title_match = r_title.search(text)
    if title_match:
        title = title_match.group(1).strip()
    title = tools.to_utf8(title, encoding)

    content_match = r_content.search(text)
    if content_match:
        body = tools.format_html_text(content_match.group(1), encoding)
    else:
        body = ''
    return title + '\r\n' * 2 + body
def parse_page(title, text, proxy=None):
    """Extract heading and body from ``text`` and join them with a blank line.

    title -- heading to fall back on when ``r_title`` finds nothing.
    text  -- page source inspected by ``r_title``, ``r_content`` and
             ``tools.get_encoding(r_meta, ...)``.
    proxy -- not used by this function.

    The result is ``heading + '\\r\\n' * 2 + content`` where ``content`` is
    the formatted first group of ``r_content``, or '' if there is no match.

    NOTE(review): the original line structure was lost; it is assumed that
    ``tools.to_utf8`` also runs on the fallback heading -- please verify.
    """
    charset = tools.get_encoding(r_meta, text)

    found = r_title.search(text)
    heading = found.group(1).strip() if found else title
    heading = tools.to_utf8(heading, charset)

    content = ''
    found = r_content.search(text)
    if found:
        content = tools.format_html_text(found.group(1), charset)
    return heading + '\r\n' * 2 + content
def parse_page(title, text, proxy=None):
    """Return the page as ``title``, a blank line, then the body text.

    title -- fallback title, used when ``r_title`` does not match ``text``.
    text  -- page source searched with ``r_title`` and ``r_content``.
    proxy -- not used by this function.

    Returns ``title + '\\r\\n\\r\\n' + body``; ``body`` is '' when
    ``r_content`` does not match.

    NOTE(review): recovered from a copy whose line breaks were lost; the
    UTF-8 conversion is assumed to apply to the fallback title as well --
    confirm against the upstream file.
    """
    # No encoding is detected here: to_utf8 / format_html_text are called
    # without an explicit encoding and therefore use their own defaults.
    hit = r_title.search(text)
    if hit:
        title = hit.group(1).strip()
    title = tools.to_utf8(title)

    body = ''
    hit = r_content.search(text)
    if hit:
        raw_html = hit.group(1)
        body = tools.format_html_text(raw_html)
    return title + '\r\n' * 2 + body
def parse_page(title, text, proxy=None):
    """Return the page as ``title``, a blank line, then the body text.

    title -- fallback title, used when ``r_title`` does not match ``text``.
    text  -- page source searched with ``r_title`` and ``r_content``.
    proxy -- not used by this function.

    Returns ``title + '\\r\\n\\r\\n' + body``; ``body`` is '' when
    ``r_content`` does not match.

    NOTE(review): recovered from a copy whose line breaks were lost; the
    UTF-8 conversion is assumed to apply to the fallback title as well --
    confirm against the upstream file.
    """
    matched = r_title.search(text)
    if matched:
        # Trim surrounding whitespace captured by the pattern so the
        # heading line does not carry stray blanks or newlines.
        title = matched.group(1).strip()
    # No explicit encoding is passed, so the helpers use their defaults.
    title = tools.to_utf8(title)

    matched = r_content.search(text)
    if matched:
        body = tools.format_html_text(matched.group(1))
    else:
        body = ''
    return title + '\r\n' * 2 + body