def upload(self, path):
    """Upload one local image to COS and return its URL.

    The remote name is ``self.cos_path`` (for example
    ``/news/jiemian/image/``) followed by ``path`` with every ``image\\``
    occurrence removed; the local file is ``self.local_path + path``.

    Any exception raised during an attempt is reported and the upload is
    retried, for at most 10 attempts with a 10 second pause between them.
    A response whose ``code`` is not 0 is reported and is not retried.

    :param path: image path, appended to ``self.local_path``
    :return: the ``source_url`` of the upload response, or ``''`` when the
        upload was rejected or every attempt raised
    """
    max_attempts = 10
    url = ''
    for attempt in range(1, max_attempts + 1):
        try:
            # Drop the local "image\" folder prefix to get the remote name.
            upload_name = path.replace('image\\', '')
            request = UploadFileRequest(u"crawler", self.cos_path + upload_name,
                                        self.local_path + path, insert_only=0)
            upload_file_ret = self.cos_client.upload_file(request)
            if upload_file_ret['code'] == 0:
                data = upload_file_ret['data'] or {}
                url = data['source_url']
                print(u'上传成功 ' + url)
            else:
                print(u'上传图片失败 ' + repr(upload_file_ret))
            break
        except Exception as e:
            # Report the failure instead of swallowing it; repr() keeps the
            # message ASCII-safe whatever the exception text contains.
            print(u'上传图片失败 (%d/%d) ' % (attempt, max_attempts) + repr(e))
            # No point in waiting once the last attempt has failed.
            if attempt < max_attempts:
                TimerUtil.sleep(10)
    return url
def parseResult(self, response):
    """Dispatch a fetched page to the detail parser or record why it is skipped.

    The request outcome is logged first. A 404 or 403 response only updates
    the stored status (a 403 additionally requests a new IP and pauses for
    60 seconds). Otherwise the final URL, i.e. the one after redirects,
    selects what happens next.

    :param response: response object exposing ``status``, ``url`` and
        ``meta['haoYaoShiId']``
    :return: the truthy result of ``self.parseDetail1(response)`` for
        product pages, otherwise ``None``
    """
    status = response.status
    record_id = response.meta['haoYaoShiId']
    self.logWarn(u'haoyaoshi_id: %d 请求状态%d %s' % (record_id, status, response.url))

    if status == 404:
        self.statusDao.updateStatus(record_id, self.statusDao.Status_no_source)
        return None

    if status == 403:
        self.statusDao.updateStatus(record_id, self.statusDao.Status_be_forbid)
        NetworkUtil.getNewIp()
        TimerUtil.sleep(60)
        return None

    # Choose the handling from the final URL (after any redirect).
    final_url = response.url
    if 'http://www.ehaoyao.com/product' in final_url:
        parsed = self.parseDetail1(response)
        return parsed if parsed else None

    dao = self.statusDao
    if 'http://www.ehaoyao.us/goods.php' in final_url:
        # Status change: this page does not need processing.
        new_status = dao.Status_dont_need_parse
    else:
        # Status change: no parse method exists for this URL.
        new_status = dao.Status_no_parse_method
    dao.updateStatus(record_id, new_status)
    return None
def process_item(self, item, spider):
    """Download every image referenced by ``item['image_urls']``.

    Each image is stored as ``<self.savePath>/full/<md5 of url>.jpg``. An
    image already present on disk is not fetched again. One result record
    is built per image: ``{'ok': True, 'x': {'url', 'path'}}`` on success
    and ``{'ok': False, 'x': {'url'}}`` on failure. The loop pauses 2
    seconds after each image.

    NOTE(review): the collected records are neither written back to
    ``item`` nor returned in the code visible here, so this returns
    ``None`` -- confirm whether a trailing
    ``item['image_urls'] = ...; return item`` was lost.

    :param item: mapping whose ``image_urls`` entries expose ``get('url')``
    :param spider: object providing ``logDao.info`` / ``logDao.warn``
    """
    image_urls = []
    save_dir = self.savePath + '/full'
    for image_url in item['image_urls']:
        url = image_url.get('url')
        path = 'full/' + str(EncryptUtil.md5(url)) + '.jpg'
        detail_path = self.savePath + '/' + path

        # Make sure the target directory exists.
        if not FileUtil.dirIsExist(save_dir):
            FileUtil.createDir(save_dir)

        if FileUtil.fileIsExist(detail_path):
            spider.logDao.info(u'图片已经存在本地:' + url)
            downloaded = True
        else:
            downloaded = False
            try:
                file_response = requests.get(url, timeout=10)
                if file_response.status_code == 200:
                    # The context manager closes the handle even when the
                    # write fails part-way.
                    with open(detail_path, 'wb') as image_file:
                        image_file.write(file_response.content)
                    spider.logDao.info(u'图片成功下载:' + url)
                    downloaded = True
                else:
                    spider.logDao.info(u'下载图片失败:' + url)
            except Exception as e:
                downloaded = False
                print(e)
                spider.logDao.warn(u'下载图片失败:' + url)

        # Failed downloads carry no 'path' key.
        detail = {'url': url}
        if downloaded:
            detail['path'] = path
        image_urls.append({'ok': downloaded, 'x': detail})

        # Idle for 2 seconds between images.
        TimerUtil.sleep(2)
def wait_utils_env_ok(self):
    """Block until the network check and then the service check both pass.

    Each check is polled in turn; after every failed poll the method
    sleeps 20 seconds and then logs a warning.

    :return: always ``True``, once both checks have succeeded
    """
    # Stage 1: network check, polled every 20 seconds.
    while True:
        if NetworkUtil.checkNetWork():
            break
        TimerUtil.sleep(20)
        self.logWarn(u'检测网络不可行')

    # Stage 2: server check, polled every 20 seconds.
    while True:
        if NetworkUtil.checkService():
            break
        TimerUtil.sleep(20)
        self.logWarn(u'检测服务器不可行')

    return True
def downloadSpecific(self, haoYaoShiId):
    """Fetch the instruction-sheet ("specific") data for one product.

    The endpoint is requested up to 11 times, waiting 15 seconds between
    attempts. Only a non-200 response triggers a retry; a 200 response or
    an exception ends the call immediately.

    :param haoYaoShiId: product id interpolated into the request URL
    :return: ``data.specificInfo`` of the response re-encoded as JSON text
        (``ensure_ascii=False``) when the response ``code`` is 1;
        otherwise ``''`` (wrong code, exception, or retries exhausted)
    """
    max_attempts = 11  # the original `while count <= 10` ran 11 times
    for attempt in range(1, max_attempts + 1):
        # Fetch the instruction sheet.
        try:
            specific_url = 'http://www.ehaoyao.com/meal/%s/specific?_=1508406771571' % haoYaoShiId
            # A timeout keeps a stalled connection from hanging the crawler.
            result = requests.get(specific_url, timeout=10)
            if result.status_code == 200:
                content = json.loads(result.content)
                if content.get('code') == 1:
                    specific_info = content.get('data', {}).get('specificInfo', '')
                    return json.dumps(specific_info, ensure_ascii=False)
                return ''
        except Exception as e:
            self.logWarn('downloadSpecific:' + str(e))
            return ''
        # Non-200 response: wait before retrying, but not after the last try.
        if attempt < max_attempts:
            TimerUtil.sleep(15)
    # Every attempt got a non-200 response; return the same empty value as
    # the other failure paths rather than an implicit None.
    return ''
def downLoadImage(image_url_sources):
    """Download images into an ``image`` folder beside this module.

    Each file is named ``<md5 of url>.jpg``. An image already on disk is
    not fetched again. A freshly downloaded image larger than 100 KB is
    resized in place to 600x600 at quality 80. One result record is built
    per image: ``{'ok': True, 'x': {'url', 'path', 'fileName'}}`` on
    success and ``{'ok': False, 'x': {'url'}}`` on failure. The loop
    pauses 2 seconds after each image.

    NOTE(review): the collected records are not returned in the code
    visible here, so this returns ``None`` -- confirm whether a trailing
    ``return image_urls`` was lost.

    :param image_url_sources: iterable of mappings exposing ``get('url')``
    """
    image_urls = []
    image_dir = os.path.dirname(os.path.realpath(__file__)) + u'/image'
    for image_url in image_url_sources:
        if not os.path.isdir(image_dir):
            os.mkdir(image_dir)

        url = image_url.get('url')
        print(url)
        file_name = str(EncryptUtil.md5(url)) + '.jpg'
        # NOTE(review): the backslash separator is kept on purpose; upload()
        # strips an 'image\' prefix, so callers appear to rely on it.
        detail_path = image_dir + '\\' + file_name

        if FileUtil.fileIsExist(detail_path):
            print(u'图片已经存在本地:' + url)
            downloaded = True
        else:
            downloaded = False
            try:
                file_response = requests.get(url, timeout=10)
                if file_response.status_code == 200:
                    payload = file_response.content
                    # Close the file before the resize step reopens it.
                    with open(detail_path, 'wb') as image_file:
                        image_file.write(payload)
                    # Larger than 100 KB: shrink to 600x600 at quality 80.
                    if len(payload) > 100 * 1024:
                        ImageCompressUtil().resizeImg(ori_img=detail_path,
                                                      dst_img=detail_path,
                                                      dst_w=600,
                                                      dst_h=600,
                                                      save_q=80)
                    print(u'图片成功下载,大小:' + str(len(payload) // 1024) + 'kb ' + url)
                    print(u'最终存储图片,大小:' + str(os.path.getsize(detail_path) // 1024) + 'kb ' + url)
                    downloaded = True
                else:
                    print(u'下载图片失败:' + url)
            except Exception:
                downloaded = False
                print(u'下载图片失败:' + url)

        # Failed downloads carry only the source URL.
        detail = {'url': url}
        if downloaded:
            detail['path'] = detail_path
            detail['fileName'] = file_name
        image_urls.append({'ok': downloaded, 'x': detail})

        # Idle for 2 seconds between images.
        TimerUtil.sleep(2)