def fixLinks(text, parser): d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('a'): e = PyQuery(element) href = e.attr('href') if not abs_url_regex.search(href): new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href) new_href = re.sub(r'/index\.html$', '/', new_href) new_href = re.sub(r'index\.html$', '.', new_href) e.attr('href', new_href) print "\t", href, "=>", new_href for element in d('link'): e = PyQuery(element) href = e.attr('href') if href is not None and not abs_url_regex.search(href): new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href) new_href = re.sub(r'\?v=.*$', '', href) e.attr('href', new_href) print "\t", href, "removed v =>", new_href for element in d('script'): e = PyQuery(element) href = e.attr('src') if href is not None and not abs_url_regex.search(href): new_href = re.sub(r'\?v=.*$', '', href) e.attr('src', new_href) print "\t", href, "removed v =>", new_href if parser == 'html': return "<!DOCTYPE html>" + d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def fixLinks(text, parser): d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('a'): e = PyQuery(element) href = e.attr('href') if not abs_url_regex.search(href): new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href) new_href = re.sub(r'/index\.html$', '/', new_href) e.attr('href', new_href) print "\t", href, "=>", new_href link_element_types = [ 'script', 'meta', 'a', 'link', 'img', 'amp-img' ] link_attributes = ['href', 'content', 'src', 'url'] for element_type in link_element_types: for element in d(element_type): e = PyQuery(element) for a in link_attributes: old_a = e.attr(a) if old_a: new_a = old_a.replace(arguments['--domain'], arguments['--target-domain']) new_a = re.sub(r'^[a-z]+://', '//', new_a) e.attr(a, new_a) print "\t", old_a, "=>", new_a if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def fix_meta_image_links(text, parser): filetext = text.decode('utf8') td_regex = re.compile(target_domain + '|') assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>" d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')), parser=parser) for share_class in [ 'meta[property="og:image"], meta[name="twitter:image"]' ]: print "share_class : ", share_class for element in d(share_class): e = PyQuery(element) print "element : ", e href = e.attr('content') print "href : ", href print "domain : ", domain content_target_domain = target_domain.replace( "/static", "") print "target_domain : ", content_target_domain new_href = re.sub(domain, content_target_domain, href) e.attr('content', new_href) print "\t", href, "=>", new_href if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def extract(self):
    """Build a ContentItem from the article page.

    Strips HTML comments, takes the #rightdiv1 container as the body, and
    prepends the first slideshow image (if any) scraped from the page's
    inline 'leftsmallimgurl[1]="..."' JavaScript assignment.
    """
    # Drop HTML comments before parsing.
    self.html = re.sub('<!--.*?-->', '', self.html)
    doc = PyQuery(self.html)
    content_node = doc('div#rightdiv1')
    content_node.remove('span.white12')  # strip boilerplate styling spans
    item = ContentItem()
    content_node = content_node.__unicode__()
    img_all = []
    # The lead image exists only as a JS assignment, not in the DOM, so it
    # is pulled out of the raw markup with a regex.
    img='leftsmallimgurl\[1\]\=\"(.*?)\"\;'
    ob = re.compile(img)
    imgs = ob.findall(doc.__unicode__())
    if not imgs:
        image=''
    else:
        # Inject the image at the top of the content, padded with <br>.
        image='<br/><img src="'+imgs[0]+'"/><br/>'
        img_all.append(self.getRealURI(imgs[0]))
    content_node=image+content_node
    item['image_urls'] = img_all
    item['title'] = self.title = doc('h1').text()
    item['content'] = self.content = content_node
    item['release_time'] = ''
    # item['release_switch_time'] = self.release_switch_time = time.time()
    item['source'] = u"瑞丽服饰网"
    item['author'] = ''
    item['pic_url'] = ''
    self.title = item['title']
    self.content = item['content']
    return item
def fixLinks(text, parser):
    """Clean relative anchor hrefs for static hosting and repair doubled
    image extensions (".jpgg" / ".jpgpg" / ".jpgjpg") inside <img srcset>
    values. Returns the serialized document as UTF-8 bytes."""
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    # Relative <a> hrefs: rss feed gets a .rss name, index.html becomes "/".
    for anchor in d('a'):
        node = PyQuery(anchor)
        href = node.attr('href')
        if abs_url_regex.search(href):
            continue
        fixed = re.sub(r'rss/index\.html$', 'rss/index.rss', href)
        fixed = re.sub(r'/index\.html$', '/', fixed)
        node.attr('href', fixed)
        print("\t", href, "=>", fixed)
    # Repair mangled .jpg extensions inside srcset attributes.
    for img in d("img"):
        node = PyQuery(img)
        attr_name = "srcset"
        print("img:", node)
        attr = node.attr(attr_name)
        if not attr:
            continue
        new_attr = attr
        for broken in (r"\.jpgg ", r"\.jpgpg ", r"\.jpgjpg "):
            new_attr = re.sub(broken, ".jpg ", new_attr)
        node.attr(attr_name, new_attr)  # upsert element attribute
        print("\t", attr, "=>", new_attr)
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def fixLinks(text, parser): d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('link'): e = PyQuery(element) href = e.attr('href') if href: if href.find(domain) > -1: new_href = href.split(domain)[-1] new_href = '{}{}'.format(target_domain, new_href) e.attr('href', new_href) print "\t", "fixed link ", href, "=> ", new_href for element in d('a'): e = PyQuery(element) href = e.attr('href') if href: if href.find(domain) > -1: new_href = href.split(domain)[-1] e.attr('href', new_href) print "\t", "Fixed ", href, "=> ", new_href if href and not abs_url_regex.search(href): new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href) new_href = re.sub(r'/index\.html$', '/', new_href) e.attr('href', new_href) print "\t", href, "=>", new_href if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def fixLinks(text, parser): if text == '': return '' d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('a, link'): e = PyQuery(element) href = e.attr('href') if href is None: continue if (not abs_url_regex.search(href)) or ('/rss/' in href): new_href = re.sub(r"/([\w-]+)$", r"/\1.html", href) new_href = re.sub(r"^([\w-]+)$", r"\1.html", new_href) if href != new_href: e.attr('href', new_href) print "\t", href, "=>", new_href href = e.attr('href') if bad_url_regex.search(href): new_href = re.sub(r'(.+)\.[0-9]{1,2}$', r'\1', href) e.attr('href', new_href) print "\t FIX! ", href, "=>", new_href if parser == 'html': return "<!DOCTYPE html>\n<html>" + d.html( method='html').encode('utf8') + "</html>" return "<!DOCTYPE html>\n<html>" + d.__unicode__().encode( 'utf8') + "</html>"
def extract(self):
    """Build a ContentItem from a pconline.com.cn article page.

    Selects the div.art_con body via XPath, strips pagination and footer
    chrome, collects non-GIF image URLs, and parses the release date
    (YYYY年MM月DD日) out of the article header block.
    """
    item = ContentItem()
    self.html = re.sub('<!--.*?-->', '', self.html)  # drop HTML comments
    content_node = self.hxs.select("//div[@class = 'art_con']").extract()
    content_node = PyQuery(content_node[0])
    # Remove in-article navigation / promo blocks before serializing.
    content_node.remove('div[class = "pconline_page"]')
    content_node.remove('div[class = "pc3g"]')
    content_node.remove('div[class = "pageTips"]')
    content_node.remove('div[class = "art_nav_box mt10"]')
    content_node.remove('div[class = "art_bottom"]')
    content_node.remove('div[class = "art_con_top"]')
    # Collect article images, skipping GIFs (usually spacers/ads).
    item['image_urls'] = [self.getRealURI(img.get('src')) for img in content_node('img') if not img.get('src').endswith('.gif')]
    item['title'] = self.title = self.hxs.select("//h1/text()").extract()[0]
    if not item['title']:
        # Fallback layout: title lives in a specific labelled div.
        item['title'] = self.title = self.hxs.select("//div[@id = 'UC_newsInfoDetail_lbl_newsTitle']/text()").extract()[0]
    item['content'] = self.content = content_node.__unicode__()
    # The header div was removed from content_node above, so re-select it
    # from the original document to read the release date.
    release_time = self.hxs.select("//div[@class = 'art_con_top']").extract()[0]
    doc_t = PyQuery(release_time)
    release_time = doc_t('span').text()
    p = re.compile(u'20\d\d年\d\d月\d\d日')
    #item['release_time'] = self.release_time = doc('div[class="art_con_top"]').find('span').eq(0).text()
    item['release_time'] = self.release_time = p.search(release_time).group()
    item['source'] = u'pconline'
    item['author'] = ''
    item['pic_url'] = ''
    return item
def fixLinks(text, parser):
    """Normalize relative anchor hrefs (rss feed name, trailing
    index.html) and fix doubled ".png" extensions in <img srcset>
    attributes. Returns the serialized document as text."""
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for anchor in d('a'):
        node = PyQuery(anchor)
        href = node.attr('href')
        if abs_url_regex.search(href):
            continue
        fixed = re.sub(r'rss/index\.html$', 'rss/index.rss', href)
        fixed = re.sub(r'/index\.html$', '/', fixed)
        node.attr('href', fixed)
        print("\t", href, "=>", fixed)
    for img in d('img'):
        node = PyQuery(img)
        srcset = node.attr('srcset')
        if srcset is None:
            continue
        new_srcset = srcset
        for broken in (r'\.pngg', r'\.pngng', r'\.pngpng'):
            new_srcset = re.sub(broken, '.png', new_srcset)
        node.attr('srcset', new_srcset)
        if srcset != new_srcset:
            print("\t", srcset, "=>", new_srcset)
    if parser == 'html':
        return d.html(method='html')
    return d.__unicode__()
def fixLinks(text, parser): d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('a, link'): e = PyQuery(element) href = e.attr('href') if href is None: continue print '// Drop queryString in included src' print 'from: ', href result = urlparse(href) if result.scheme == 'https': href = href elif result.scheme == '': href = result.path + (('#' + result.fragment) if result.fragment != '' else '') print 'to: ', href new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href) if not abs_url_regex.search(href): new_href = re.sub(r'/index\.html$', '/', new_href) if href != new_href: e.attr('href', new_href) print "\t", href, "=>", new_href if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def extract(self):
    """Build a ContentItem from a Tiexue (铁血网) forum thread page.

    Picks the original post body (skipping a quoted "fromposty" block when
    present), collects non-GIF image URLs, clears the low-contrast
    #FCFCCC inline styling, and records the poster as the author.
    """
    item = ContentItem()
    self.html = re.sub('<!--.*?-->', '', self.html)  # drop HTML comments
    tz_title=self.hxs.select("//h1/text()").extract()
    content=self.hxs.select("//ul[@class='content']/li/div").extract()
    tz_content=''
    for con in content:
        if "fromposty" in con:
            # A forwarded/quoted post is present: the real body is div #3.
            tz_content=self.hxs.select("//ul[@class='content']/li/div")[2].extract()
            break
        else:
            tz_content=self.hxs.select("//ul[@class='content']/li/div")[1].extract()
    # NOTE(review): release_time is selected here but never written into
    # the item below — verify whether that is intentional.
    release_time=self.hxs.select("//div[@class='gray']/text()").extract()
    # Images are scraped from the raw markup with a src="..." regex.
    imgs=PyQuery(tz_content)
    ob=re.compile('src="(.*?)"')
    imgs=ob.findall(imgs.__unicode__())
    img_all=[]
    for img in imgs:
        if ".gif" in img:
            continue
        if ".GIF" in img:
            continue
        else:
            img_all.append(self.getRealURI(img))
    author=self.hxs.select("//td[@class='bbsname']/b/span/a/text()").extract()
    tz_content = PyQuery(tz_content)
    # Clear the near-invisible #FCFCCC styling used to hide text.
    cont_div = tz_content('div[style = "color:#FCFCCC"]')
    for cont in cont_div:
        cont_div.eq(cont_div.index(cont)).removeAttr('style')
    tz_content = tz_content.__unicode__()
    item['image_urls'] = img_all
    item['title'] = self.title = tz_title[0].strip()
    item['content'] = self.content = tz_content
    item['release_time'] = ''
    item['source'] = u"铁血网"
    item['author'] = author[0]
    item['pic_url'] = ''
    # item['release_switch_time'] = self.release_switch_time = time.mktime(time.strptime(self.release_time,u'%Y-%m-%d %H:%M'))
    return item
def fixLinks(text): d = PyQuery(text, parser='html') for element in d('a'): e = PyQuery(element) href = e.attr('href') if not abs_url_regex.search(href): new_href = re.sub(r'/index\.html$', '/', href) e.attr('href', new_href) print "\t", href, "=>", new_href return d.__unicode__().encode('utf8')
def fixLinks(text, parser):
    """Normalize relative anchor hrefs (rss feed name, trailing
    index.html) and return the serialized document as UTF-8 bytes."""
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for anchor in d('a'):
        node = PyQuery(anchor)
        href = node.attr('href')
        if abs_url_regex.search(href):
            continue
        fixed = re.sub(r'rss/index\.html$', 'rss/index.rss', href)
        fixed = re.sub(r'/index\.html$', '/', fixed)
        node.attr('href', fixed)
        print( "\t", href, "=>", fixed)
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def fixLinks(text, parser): d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('a'): e = PyQuery(element) href = e.attr('href') if not abs_url_regex.search(href): new_href = re.sub(r'/index\.html$', '/', href) new_href = re.sub(r'index.html', '/', new_href) e.attr('href', new_href) print "\t", href, "=>", new_href if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def fixLinks(text, parser): d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('a, link'): e = PyQuery(element) href = e.attr('href') if href is None: continue if (not abs_url_regex.search(href)) or ('/rss/' in href): new_href = re.sub(r'rss/$', 'feed.rss', href) new_href = re.sub(r'index\.html$', '', new_href) new_href = re.sub(r'index\.html\#$', '', new_href) e.attr('href', new_href) print "\t", href, "=>", new_href if parser == 'html': return "<!DOCTYPE html>\n<html>" + d.html( method='html').encode('utf8') + "</html>" elif parser == 'xml': return "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + d.__unicode__( ).encode('utf8') return "<!DOCTYPE html>\n<html>" + d.__unicode__().encode( 'utf8') + "</html>"
def fix_share_links(text,parser): td_regex = re.compile(target_domain + '|' ) assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>" d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for share_class in ['.icon-twitter','.icon-facebook','.icon-google-plus']: for element in d(share_class): e = PyQuery(element) href = e.attr('href') new_href = re.sub(domain, target_domain, href) e.attr('href', new_href) print "\t", href, "=>", new_href if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def fix_links(text, parser): # remove superfluous "index.html" from relative hyperlinks found in text abs_url_regex = re.compile(r'^(?:[a-z]+:)?//', flags=re.IGNORECASE) d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('a'): e = PyQuery(element) href = e.attr('href') if not abs_url_regex.search(href): new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href) new_href = re.sub(r'/index\.html$', '/', new_href) e.attr('href', new_href) print "\t", href, "=>", new_href if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def extract(self):
    """Build a ContentItem from a Sina (新浪) article page.

    Handles several page layouts by falling back across body containers
    (div.kb_zw, then div.zw_text) and title locations. Scripts, styles,
    iframes, a fixed-size ad div, and inputs are stripped; non-GIF images
    are collected and padded with <br> tags.
    """
    self.html = re.sub('<!--.*?-->', '', self.html)  # drop HTML comments
    doc = PyQuery(self.html)
    content_node = doc('div.kb_zw')
    if not content_node:
        # content_node = doc('div.zw_text')
        content_node = PyQuery(self.hxs.select("//div[@class = 'zw_text']").extract()[0])
    content_node.remove('script')
    content_node.remove('style')
    content_node.remove('iframe')
    # Fixed-size inline ad container, matched by its exact style string.
    content_node.remove('div[style = "float:left; width:303px; height:250px; display:inline; margin:10px 10px 10px 10px;"]')
    content_node.remove('input')
    item = ContentItem()
    # Title fallbacks across layouts.
    item['title'] = self.title = doc('td[align = "center"]')('b').text()
    if item['title'] == None:
        item['title'] = self.title = doc('div.zw_bt').text()
    if item['title'] == None:
        item['title'] = self.title = doc('h1.zw_title').text()
    item['release_time'] = ''
    item['source'] = u"新浪"
    item['author'] = ''
    item['pic_url'] = ''
    imgs = content_node('img')
    image_urls = []
    for img in imgs:
        # NOTE(review): the ".gif" membership test runs before the
        # missing-src check below, so an <img> without src would raise
        # here — verify ordering against real pages.
        if ".gif" in img.get('src'):
            continue
        if not img.get('src'):
            continue
        else:
            # Pad images with line breaks for readability.
            imgs.eq(imgs.index(img)).before('<br>')
            imgs.eq(imgs.index(img)).append('<br>')
            image_urls.append(self.getRealURI(img.get('src')))
    item['image_urls'] = image_urls
    content = content_node.__unicode__()
    item['content'] = self.content = content
    return item
def fix_share_links(text, parser): filetext = text.decode('utf8') td_regex = re.compile(target_domain + '|') assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>" d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')), parser=parser) for share_class in ['.share_links a']: for element in d(share_class): e = PyQuery(element) href = e.attr('href') new_href = re.sub(domain, target_domain, href) e.attr('href', new_href) print "\t", href, "=>", new_href if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def extract(self):
    """Build a ContentItem from a 17173 forum thread's first post.

    Takes the .firstTopic body, strips ratings/ads/scripts, collects
    non-GIF images (padded with <br>), clears inline styles on body divs,
    and extracts the post time with a 20xx... regex.
    """
    self.html = re.sub('<!--.*?-->', '', self.html)  # drop HTML comments
    doc = PyQuery(self.html)
    content_node = doc('.firstTopic')('div')
    content_node.remove('script')
    content_node.remove('.rate')
    content_node.remove('.affixContent')
    content_node.remove('.thread_gold')
    item = ContentItem()
    imgs = content_node('.p14')('img')
    img_all = []
    for img in imgs:
        if ".gif" in img.get('src'):
            continue
        else:
            # Pad images with line breaks for readability.
            imgs.eq(imgs.index(img)).append('<br>')
            imgs.eq(imgs.index(img)).before('<br>')
            img_all.append(self.getRealURI(img.get('src')))
    item['image_urls'] = img_all
    item['title'] = self.title = doc('#thread_title').text()
    content = content_node('.p14').__unicode__()
    content = PyQuery(content)
    # Clear inline style attributes on the body divs.
    del_style = content('div')
    for d in del_style:
        if d.get('style'):
            # NOTE(review): pyquery's .attr is normally invoked as a call or
            # via attribute access; subscript assignment here may not clear
            # the style as intended — verify against the pyquery version used.
            del_style.eq(del_style.index(d)).attr['style'] = ''
    content.remove('dl.rate_list')
    content.remove('span[style = "font-size:12px"]')
    content.remove('dl.rate')
    item['content'] = self.content = content.__unicode__()
    release_time=doc('.firstTopic')('.postTime').text()
    ob=re.compile(u'20\d\d.*\d\d')
    release_time=ob.findall(release_time)
    item['release_time'] = release_time[0]
    # item['release_switch_time'] = self.release_switch_time = time.mktime(time.strptime(release_time[0],u'%Y-%m-%d %H:%M:%S'))
    item['source'] = u"17173论坛"
    item['author'] = doc('.th1').eq(0).text()
    item['pic_url'] = ''
    return item
def fix_meta_url_links(text, parser): filetext = text.decode('utf8') td_regex = re.compile(target_domain + '|') assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>" d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')), parser=parser) for share_class in [ 'meta[property="og:url"], meta[name="twitter:url"]', 'meta[property="og:url"]', 'meta[name="twitter:url"]' ]: for element in d(share_class): e = PyQuery(element) href = e.attr('content') new_href = re.sub(domain, target_domain, href) e.attr('content', new_href) print "\t meta fixed", href, "=>", new_href if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def fix_href_links(text, parser, page_slug):
    """Prefix relative anchor hrefs with REMOTE_PATH, resolving in-page
    fragment links against the page's own slug (recomputed from the parsed
    document; the page_slug argument is overwritten). Returns UTF-8 bytes."""
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    page_slug = find_page_slug(d)
    for anchor in d('a'):
        node = PyQuery(anchor)
        href = node.attr('href')
        #print("\thref", href)
        if href is None:  # no href means it's a named anchor in the text
            continue
        if abs_url_regex.search(href):
            continue
        fixed = re.sub(r'rss/index\.html$', 'rss/index.rss', href)
        fixed = re.sub(r'/index\.html$', '/', fixed)
        if fixed.find('#') > -1:
            print("\t\tfound an internal link: ", fixed)
            fixed = page_slug + fixed
        node.attr('href', REMOTE_PATH + fixed)
        print("\t", href, "=>", node.attr('href'))
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def fixLinks(text, parser): d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('a, link'): e = PyQuery(element) href = e.attr('href') if href is None: continue new_href = re.sub(r'(rss/index\.html)|((?<!\.)rss/?)$', 'rss/index.rss', href) if not abs_url_regex.search(href): new_href = re.sub(r'/index\.html$', '/', new_href) if href != new_href: e.attr('href', new_href) print "\t", href, "=>", new_href if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def fixLinks(text, parser): d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('a'): e = PyQuery(element) href = e.attr('href') print href if href is None: continue new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href) if not abs_url_regex.search(href): new_href = re.sub(r'/index\.html$', '/', new_href) if href != new_href: e.attr('href', new_href) print "\t", href, "=>", new_href # remove ?v=XXXXXXXXX in css for element in d('link'): e = PyQuery(element) href = e.attr('href') if href is None: continue if re.match(r'http://fonts',href) is not None: continue new_href = re.sub(r'\?.*', '',href) if href != new_href: e.attr('href',new_href) print "\t", href, "=>", new_href # remove ?v=XXXXXXXXX in js for element in d('script'): e = PyQuery(element) src = e.attr('src') if src is None: continue new_src = re.sub(r'\?.*', '',src) if src != new_src: e.attr('src',new_src) print "\t", src, "=>", new_src ################### if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def fixLinks(text, parser): #extremely lazy implementation - beware. text = text.replace('pngg', 'png') text = text.replace('pngng', 'png') text = text.replace('pngpng', 'png') text = text.replace('PNGG', 'PNG') text = text.replace('PNGNG', 'PNG') text = text.replace('PNGPNG', 'PNG') text = text.replace('jpgg', 'jpg') text = text.replace('jpgpg', 'jpg') text = text.replace('jpgjpg', 'jpg') text = text.replace('jpegg', 'jpeg') text = text.replace('jpegeg', 'jpeg') text = text.replace('jpegpeg', 'jpeg') text = text.replace('http://localhost:2368/', 'https://blog.lucaperic.com/') text = text.replace( 'https://feedly.com/i/subscription/feed/https://blog.lucaperic.com/rss/', 'https://feedly.com/i/subscription/feed/https://blog.lucaperic.com/rss/index.rss' ) text = text.replace('/author/luca/rss/', '/rss/index.rss') d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) for element in d('a'): e = PyQuery(element) href = e.attr('href') if not abs_url_regex.search(href): new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href) new_href = re.sub(r'/index\.html$', '/', new_href) e.attr('href', new_href) print "\t", href, "=>", new_href if parser == 'html': return d.html(method='html').encode('utf8') return d.__unicode__().encode('utf8')
def extract(self):
    """Build a ContentItem from a Tiexue (铁血网) article page.

    Pulls the div.text body, parses the release time (a 20xx...:xx match)
    from the byline, collects non-GIF images, and strips the
    near-invisible #f9f9f9 inline styling used to hide text.
    """
    item = ContentItem()
    self.html = re.sub('<!--.*?-->', '', self.html)  # drop HTML comments
    tz_title=self.hxs.select("//h1/text()").extract()
    tz_content=self.hxs.select("//div[@class='text']").extract()
    release_time=self.hxs.select("//div[@class='user']/ul/li/text()").extract()
    ob=re.compile(u'20\d\d.*:\d\d')
    release_time=ob.findall(release_time[0])
    imgs=self.hxs.select("//div[@class='text']/div/div/p/a/img/@src").extract()
    img_all=[]
    for img in imgs:
        if ".gif" in img:
            continue
        if ".GIF" in img:
            continue
        else:
            img_all.append(self.getRealURI(img))
    item['image_urls'] = img_all
    item['title'] = self.title = tz_title[0]
    content = tz_content[0]
    content_html = PyQuery(content)
    # Clear the near-invisible #f9f9f9 styling used to hide text.
    cont_div = content_html('div[style = "color:#f9f9f9"]')
    for cont in cont_div:
        cont_div.eq(cont_div.index(cont)).removeAttr('style')
    content_html = content_html.__unicode__()
    item['content'] = self.content = content_html
    item['release_time'] = release_time[0]
    item['source'] = u"铁血网"
    item['author'] = ''
    item['pic_url'] = ''
    # item['release_switch_time'] = self.release_switch_time = time.mktime(time.strptime(self.release_time,u'%Y-%m-%d %H:%M'))
    return item