def fix_image(self, content):
    """Point the ``<img>`` tags in *content* at local ``../images/`` files.

    The HTML is first passed through ``Match.fix_html``.  Every distinct
    ``<img ...>`` tag is then handled as follows:

    * a tag without a ``src="..."`` attribute, or whose ``src`` is empty or
      consists only of spaces, merely gets ``</img>`` appended;
    * otherwise the ``src`` value is handed to ``HtmlCreator.fix_image_src``
      and, when that returns something truthy, registered through
      ``self.image_container.add`` (presumably yielding the local file
      name -- confirm against the container).  The tag is rewritten to
      reference ``../images/<filename>`` and wrapped in a
      ``<div class="duokan-image-single">`` element.

    The tag text is used exactly as matched: a trailing ``/`` is kept and no
    ``</img>`` is appended to rewritten tags.

    :param content: HTML fragment to process.
    :return: the processed HTML.
    """
    content = Match.fix_html(content)
    handled = set()
    for tag in re.findall(r'<img[^>]*', content):
        # ``str.replace`` below rewrites every occurrence of a tag at once,
        # so an identical tag must not be processed a second time (doing so
        # would append ``</img>`` again to an already closed src-less tag).
        if tag in handled:
            continue
        handled.add(tag)

        # The pattern stops right before the closing bracket; put it back.
        img = tag + '>'

        found = re.search(r'(?<=src=").*?(?=")', img)
        src = found.group(0) if found else ''
        if src.replace(' ', '') == '':
            # Nothing to download: only close the tag.
            content = content.replace(img, img + '</img>')
            continue

        src_download = HtmlCreator.fix_image_src(src)
        filename = self.image_container.add(src_download) if src_download else ''
        local_path = '../images/{}'.format(filename)

        new_image = img.replace('"{}"'.format(src), '"{}"'.format(local_path))
        # Hard-coded URL (looks like a placeholder image -- TODO confirm);
        # could be improved by moving this into fix_html.
        new_image = new_image.replace(
            'http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif',
            local_path)
        content = content.replace(
            img, '<div class="duokan-image-single">{}</div>'.format(new_image))
    return content
def fix_image(self, content):
    """Point the ``<img>`` tags in *content* at local ``../images/`` files.

    The HTML is first passed through ``Match.fix_html``.  Every distinct
    ``<img ...>`` tag is normalised (the ``/`` of a self-closing tag is
    dropped) and then handled as follows:

    * a tag without a ``src="..."`` attribute, or whose ``src`` is empty or
      consists only of spaces, is replaced by the normalised tag followed by
      ``</img>``;
    * otherwise the ``src`` value is handed to ``HtmlCreator.fix_image_src``
      and, when that returns something truthy, registered through
      ``self.image_container.add`` (presumably yielding the local file
      name -- confirm against the container).  The tag is rewritten to
      reference ``../images/<filename>``, closed with ``</img>`` and wrapped
      in a ``<div class="duokan-image-single">`` element.

    :param content: HTML fragment to process.
    :return: the processed HTML.
    """
    content = Match.fix_html(content)
    handled = set()
    for tag in re.findall(r'<img[^>]*', content):
        # ``str.replace`` below rewrites every occurrence of a tag at once,
        # so an identical tag must not be processed a second time (doing so
        # would append ``</img>`` again to an already closed src-less tag).
        if tag in handled:
            continue
        handled.add(tag)

        # The tag exactly as it appears in ``content``.  This -- not the
        # normalised form -- is what has to be searched for, otherwise a
        # self-closing ``<img .../>`` is never found and stays untouched.
        original = tag + '>'
        # Normalised form without the self-closing slash, so that an explicit
        # ``</img>`` can be appended.
        img = (tag[:-1] if tag.endswith('/') else tag) + '>'

        found = re.search(r'(?<=src=").*?(?=")', img)
        src = found.group(0) if found else ''
        if src.replace(' ', '') == '':
            # Nothing to download: only close the tag.
            content = content.replace(original, img + '</img>')
            continue

        src_download = HtmlCreator.fix_image_src(src)
        filename = self.image_container.add(src_download) if src_download else ''
        local_path = '../images/{}'.format(filename)

        new_image = img.replace('"{}"'.format(src), '"{}"'.format(local_path))
        # Hard-coded URL (looks like a placeholder image -- TODO confirm).
        new_image = new_image.replace(
            '//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg', local_path)
        new_image += '</img>'
        content = content.replace(
            original, '<div class="duokan-image-single">{}</div>'.format(new_image))
    return content
def fix_image(self, content):
    """Rewrite the ``<img>`` tags of *content* so they use local image files.

    Steps, in order:

    1. ``content`` is cleaned with ``Match.fix_html``.
    2. Each distinct ``<img ...>`` tag is normalised by dropping the ``/`` of
       a self-closing tag.
    3. If the tag has no ``src="..."`` attribute, or the value is empty or
       only spaces, the tag is replaced by its normalised form plus
       ``</img>``.
    4. Otherwise ``HtmlCreator.fix_image_src`` is applied to the ``src``
       value; a truthy result is registered with
       ``self.image_container.add`` (presumably returning the local file
       name -- confirm against the container).  The tag then gets
       ``../images/<filename>`` as its source, a closing ``</img>`` and a
       surrounding ``<div class="duokan-image-single">``.

    :param content: HTML fragment to process.
    :return: the processed HTML.
    """
    content = Match.fix_html(content)
    processed = set()
    for matched in re.findall(r'<img[^>]*', content):
        # Each ``str.replace`` call already rewrites all occurrences of the
        # tag; visiting a duplicate again would add a second ``</img>`` to a
        # src-less tag.
        if matched in processed:
            continue
        processed.add(matched)

        # Literal text present in ``content`` (the regex leaves out ``>``).
        literal_tag = matched + '>'
        # Same tag without a self-closing slash.  The replacement must still
        # look for ``literal_tag``; searching for the normalised text would
        # never match a ``<img .../>`` and leave it unprocessed.
        if matched.endswith('/'):
            img = matched[:-1] + '>'
        else:
            img = literal_tag

        src_match = re.search(r'(?<=src=").*?(?=")', img)
        if src_match is None or src_match.group(0).replace(' ', '') == '':
            # No usable source: just close the tag.
            content = content.replace(literal_tag, img + '</img>')
            continue
        src = src_match.group(0)

        src_download = HtmlCreator.fix_image_src(src)
        if src_download:
            filename = self.image_container.add(src_download)
        else:
            filename = ''

        new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
        # Hard-coded URL (looks like a placeholder image -- TODO confirm).
        new_image = new_image.replace(
            '//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
            '../images/{}'.format(filename))
        new_image += '</img>'
        content = content.replace(
            literal_tag, '<div class="duokan-image-single">{}</div>'.format(new_image))
    return content
def worker(self, target_url):
    """Fetch *target_url*, clean the HTML and store it in ``self.content_list``.

    URLs already present in ``self.work_complete_set`` are skipped.  When
    ``Http.get_content`` returns a falsy value nothing is stored and the URL
    is not marked as complete.  For ``sinablogAuthorWorker`` instances the
    cleaning is done with ``recipe_kind='sinablog_author'``.

    :param target_url: address of the page to fetch.
    :return: always ``None``.
    """
    already_fetched = target_url in self.work_complete_set
    if already_fetched:
        # Automatically skip URLs that were fetched successfully before.
        return None

    Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
    page = Http.get_content(target_url)
    if not page:
        return None

    # Imported at call time, as in the original (possibly to avoid a circular
    # import -- TODO confirm).
    from src.worker.sinablog_worker import sinablogAuthorWorker

    fix_arguments = {'content': page}
    if isinstance(self, sinablogAuthorWorker):
        fix_arguments['recipe_kind'] = 'sinablog_author'
    # Per the original note: the <br> tags have to be fixed to avoid a stack
    # overflow.
    cleaned = Match.fix_html(**fix_arguments)

    self.content_list.append(cleaned)
    Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
    self.work_complete_set.add(target_url)
    return None
def set_dom(self, dom):
    """Reset ``self.info`` and bind the parser to a new *dom*.

    Nothing beyond the reset happens when *dom* is falsy or contains a
    ``div.answer-status`` element.  Otherwise the header, body and footer
    nodes are looked up, the body (if found) is cleaned with
    ``Match.fix_html`` and re-parsed into ``self.content``, and *dom* is
    forwarded to ``self.author_parser``.

    :param dom: parsed document node (a BeautifulSoup-style object offering
        ``select`` and ``find``), or a falsy value.
    :return: always ``None``.
    """
    self.info = {}

    if not dom or dom.select('div.answer-status'):
        return None

    self.header = dom.find('div', class_='zm-item-vote-info')
    self.body = dom.find('textarea', class_='content')
    self.footer = dom.find('div', class_='zm-meta-panel')

    if self.body:
        raw_html = self.get_tag_content(self.body)
        cleaned_html = Match.fix_html(raw_html)
        self.content = BeautifulSoup(cleaned_html, 'html.parser')

    self.author_parser.set_dom(dom)
    return None
def fix_image(self, content, recipe):
    """Point the ``<img>`` tags in *content* at local ``../images/`` files.

    The HTML is first passed through ``Match.fix_html`` together with
    *recipe*.  Every distinct ``<img ...>`` tag is then handled as follows:

    * unless *recipe* is ``Type.sinablog_author`` or
      ``Type.cnblogs_author``, the ``/`` of a self-closing tag is dropped;
    * a tag without a ``src="..."`` attribute, or whose ``src`` is empty or
      consists only of spaces, is replaced by the (normalised) tag followed
      by ``</img>``;
    * otherwise the ``src`` value is handed to ``HtmlCreator.fix_image_src``
      and the result is registered through ``self.image_container.add``
      (presumably yielding the local file name -- confirm against the
      container), with recipe-specific handling for zhihu and generic
      recipes.  The tag is rewritten to reference ``../images/<filename>``,
      post-processed per recipe and wrapped in a
      ``<div class="duokan-image-single">`` element.

    :param content: HTML fragment to process.
    :param recipe: recipe kind; compared against the members of ``Type``.
    :return: the processed HTML.
    """
    content = Match.fix_html(content=content, recipe_kind=recipe)
    handled = set()
    for tag in re.findall(r'<img[^>]*', content):
        # ``str.replace`` below rewrites every occurrence of a tag at once,
        # so an identical tag must not be processed a second time (doing so
        # would append ``</img>`` again to an already closed src-less tag).
        if tag in handled:
            continue
        handled.add(tag)

        # The tag exactly as it appears in ``content``.  This -- not the
        # normalised form -- is what has to be searched for, otherwise a
        # self-closing ``<img .../>`` whose slash was stripped is never found.
        original = tag + '>'
        keep_slash = recipe in (Type.sinablog_author, Type.cnblogs_author)
        if not keep_slash and tag[-1] == '/':
            img = tag[:-1] + '>'
        else:
            img = original

        found = re.search(r'(?<=src=").*?(?=")', img)
        src = found.group(0) if found else ''
        if src.replace(' ', '') == '':
            # Nothing to download: only close the tag.
            content = content.replace(original, img + '</img>')
            continue

        src_download = HtmlCreator.fix_image_src(src)
        if not src_download:
            filename = ''
        elif recipe in Type.zhihu and not src_download.startswith('http'):
            # Fix zhuanlan image href: keep the part before the first dot and
            # build an absolute URL from it.
            src_download = src_download.split('.')[0]
            filename = self.image_container.add(
                'https://pic2.zhimg.com/' + src_download + '_b.jpg')
        elif recipe in Type.generic:
            filename = ''  # TODO
        else:
            filename = self.image_container.add(src_download)

        local_path = '../images/{}'.format(filename)
        new_image = img.replace('"{}"'.format(src), '"{}"'.format(local_path))

        if recipe in Type.jianshu:
            # Promote ``data-original-src`` to ``src`` and rename the previous
            # ``src`` out of the way, going through a temporary name.  The
            # original author notes there should be a better way.
            new_image = new_image.replace('data-original-src', 'temppicsr')
            new_image = new_image.replace('src', 'falsesrc')
            new_image = new_image.replace('temppicsr', 'src')
            new_image += '</img>'
        elif recipe in Type.sinablog:
            # Hard-coded URL (looks like a placeholder image -- TODO
            # confirm); could be improved by moving this into fix_html.
            new_image = new_image.replace(
                'http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif',
                local_path)
        elif recipe in Type.zhihu:
            new_image = new_image.replace(
                '//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg', local_path)
            new_image += '</img>'
        elif recipe in Type.cnblogs:
            # No recipe-specific post-processing.
            pass

        content = content.replace(
            original, '<div class="duokan-image-single">{}</div>'.format(new_image))
    return content
def worker(self, target_url):
    """Fetch *target_url*, clean the HTML and store it in ``self.content_list``.

    URLs already present in ``self.work_complete_set`` are skipped.  When
    ``Http.get_content`` returns a falsy value nothing is stored and the URL
    is not marked as complete.

    :param target_url: address of the page to fetch.
    :return: always ``None``.
    """
    already_fetched = target_url in self.work_complete_set
    if already_fetched:
        # Automatically skip URLs that were fetched successfully before.
        return None

    Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
    page = Http.get_content(target_url)
    if not page:
        return None

    # Per the original note: the <br> tags have to be fixed to avoid a stack
    # overflow.
    cleaned = Match.fix_html(page)
    self.content_list.append(cleaned)

    Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
    self.work_complete_set.add(target_url)
    return None
def fix_image(self, content):
    """Point the ``<img>`` tags in *content* at local ``../images/`` files.

    The HTML is first passed through ``Match.fix_html``.  Every distinct
    ``<img ...>`` tag is normalised (the ``/`` of a self-closing tag is
    dropped) and then handled as follows:

    * a tag in which no ``src="..."`` is found, or whose value is empty or
      consists only of spaces, is replaced by the normalised tag followed by
      ``</img>``;
    * otherwise the value is handed to ``HtmlCreator.fix_image_src`` and,
      when that returns something truthy, registered through
      ``self.image_container.add`` (presumably yielding the local file
      name -- confirm against the container).  The value is replaced by
      ``../images/<filename>``, ``data-original-src`` is promoted to
      ``src``, the tag is closed with ``</img>`` and wrapped in a
      ``<div class="duokan-image-single">`` element.

    :param content: HTML fragment to process.
    :return: the processed HTML.
    """
    content = Match.fix_html(content)
    handled = set()
    for tag in re.findall(r'<img[^>]*', content):
        # ``str.replace`` below rewrites every occurrence of a tag at once,
        # so an identical tag must not be processed a second time (doing so
        # would append ``</img>`` again to an already closed src-less tag).
        if tag in handled:
            continue
        handled.add(tag)

        # The tag exactly as it appears in ``content``.  This -- not the
        # normalised form -- is what has to be searched for, otherwise a
        # self-closing ``<img .../>`` is never found and stays untouched.
        original = tag + '>'
        # Normalised form without the self-closing slash, so that an explicit
        # ``</img>`` can be appended.
        img = (tag[:-1] if tag.endswith('/') else tag) + '>'

        # Note: the look-behind also matches the tail of
        # ``data-original-src="``, so the first hit may be that attribute.
        found = re.search(r'(?<=src=").*?(?=")', img)
        src = found.group(0) if found else ''
        if src.replace(' ', '') == '':
            # Nothing to download: only close the tag.
            content = content.replace(original, img + '</img>')
            continue

        src_download = HtmlCreator.fix_image_src(src)
        filename = self.image_container.add(src_download) if src_download else ''

        new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
        # Promote ``data-original-src`` to ``src`` and rename the previous
        # ``src`` out of the way, going through a temporary name.  The
        # original author notes there should be a better way.
        new_image = new_image.replace('data-original-src', 'temppicsr')
        new_image = new_image.replace('src', 'falsesrc')
        new_image = new_image.replace('temppicsr', 'src')
        new_image += '</img>'
        content = content.replace(
            original, '<div class="duokan-image-single">{}</div>'.format(new_image))
    return content