def downLoadImg(source_url, content_html):
    """Rewrite the ``src`` of every ``<img>`` in *content_html* to an absolute URL.

    Each ``src`` value is resolved against *source_url* with the standard
    ``urljoin`` rules (protocol-relative ``//host/x``, root-relative ``/x``,
    ``./x``, ``../x`` and bare relative paths are all handled).  Only results
    that start with ``http`` are rewritten; anything else (for example
    ``data:`` URIs) is left untouched.

    All rewrites are applied to the HTML text in a single pass so that a
    ``src`` occurring more than once, or one that is a substring of another
    image URL, is never rewritten twice.

    Args:
        source_url: URL of the page *content_html* was fetched from; used as
            the base for resolving relative image URLs.
        content_html: HTML fragment or document as text.

    Returns:
        *content_html* with image ``src`` values replaced by absolute URLs.
    """
    import re
    try:
        from urllib.parse import urljoin
    except ImportError:  # Python 2 fallback
        from urlparse import urljoin

    selector = Selector(text=content_html)

    # Collect every image URL in the document and the text substitution that
    # makes it absolute.
    image_urls = []
    rewrites = {}
    for img in selector.xpath(u'descendant::img'):
        # Images may live in src or data-src; only src is read here.
        image_url_base = img.xpath(u'@src').extract_first('')
        if not image_url_base:
            continue
        src = image_url_base.strip()
        if not src:
            continue
        try:
            image_url = urljoin(source_url, src)
        except ValueError:
            # Malformed URL: leave this image as it is instead of aborting
            # the whole document.
            continue
        if not image_url.startswith(u'http'):
            # Not an HTTP(S) resource.  Register an identity mapping so the
            # single-pass substitution below consumes it whole and a shorter
            # src cannot be rewritten inside it.
            rewrites.setdefault(image_url_base, image_url_base)
            continue
        print(u'得到图片:' + image_url)
        image_urls.append({
            u'url': image_url,
        })
        rewrites[image_url_base] = image_url

    if rewrites:
        # Longest alternatives first: at any position the most specific src
        # wins, and text that has been substituted is never scanned again.
        ordered = sorted(rewrites, key=len, reverse=True)
        pattern = re.compile(u'|'.join(re.escape(key) for key in ordered))
        content_html = pattern.sub(
            lambda match: rewrites[match.group(0)], content_html)

    # TODO: downloading is disabled for now, so the collected list is
    # discarded before it is handed to the downloader.
    image_urls = []
    result_image_urls = ImageUtil.downLoadImage(image_urls)
    for item in result_image_urls:
        url = item.get(u'url', u'')
        image_url = item.get(u'image_url', u'')
        # NOTE(review): as written this replace is a no-op; it was presumably
        # meant to turn '&amp;' into '&' -- confirm before changing.
        content_html = content_html.replace(u'&', u'&')
        if url:
            # An empty key would insert image_url between every character.
            content_html = content_html.replace(url, image_url)
    return content_html