Beispiel #1
0
    def process_item(self, item, spider):
        """Download every image referenced by ``item['image_urls']``.

        For each entry the image URL is hashed with ``EncryptUtil.md5`` and the
        image is stored as ``<self.savePath>/full/<hash>.jpg``. An image whose
        target file already exists is not downloaded again.

        One record per image is appended to a local list:

        * success: ``{'ok': True, 'x': {'url': url, 'path': 'full/<hash>.jpg'}}``
        * failure: ``{'ok': False, 'x': {'url': url}}``

        Args:
            item: mapping with an ``'image_urls'`` iterable; each element must
                offer ``.get('url')``.
            spider: object exposing ``logDao.info`` / ``logDao.warn``.

        NOTE(review): the visible code builds the record list but neither
        returns it nor stores it on ``item`` - confirm whether the remainder of
        this method was cut off.
        """
        image_urls = []
        for image_url in item['image_urls']:
            url = image_url.get('url')
            urlHash = EncryptUtil.md5(url)
            path = 'full/' + str(urlHash) + '.jpg'
            detailPath = self.savePath + '/' + path
            # Create the target directory on demand
            saveDir = self.savePath + '/full'
            if not FileUtil.dirIsExist(saveDir):
                FileUtil.createDir(saveDir)

            if FileUtil.fileIsExist(detailPath):
                spider.logDao.info(u'图片已经存在本地:' + url)
                image_url_new = {
                    'ok': True,
                    'x': {
                        'url': url,
                        'path': path
                    }
                }
            else:
                try:
                    fileResponse = requests.get(url, timeout=10)
                    if fileResponse.status_code == 200:
                        # Close the handle deterministically instead of
                        # relying on garbage collection to flush the data.
                        with open(detailPath, 'wb') as imageFile:
                            imageFile.write(fileResponse.content)
                        image_url_new = {
                            'ok': True,
                            'x': {
                                'url': url,
                                'path': path
                            }
                        }
                        spider.logDao.info(u'图片成功下载:' + url)
                    else:
                        spider.logDao.info(u'下载图片失败:' + url)
                        image_url_new = {
                            'ok': False,
                            'x': {
                                'url': url,
                            }
                        }
                except Exception as e:
                    # Best effort: one failing image must not abort the rest.
                    print(e)
                    spider.logDao.warn(u'下载图片失败:' + url)
                    image_url_new = {
                        'ok': False,
                        'x': {
                            'url': url,
                        }
                    }
            image_urls.append(image_url_new)
            # Idle for 2s before handling the next image
            TimerUtil.sleep(2)
Beispiel #2
0
def delImg(url):
    """Delete the locally stored image that belongs to *url*.

    The file is looked up as ``<module dir>/image\\<md5 of url>.jpg``. The
    outcome (success or the error text) is printed; a failure reported by
    ``FileUtil.delFile`` as an exception is not propagated.
    """
    image_dir = os.path.dirname(os.path.realpath(__file__)) + u'/image'
    hashed_name = str(EncryptUtil.md5(url)) + '.jpg'
    target = image_dir + '\\' + hashed_name
    try:
        FileUtil.delFile(target)
        print(u'删除图片成功:%s' % target)
    except Exception as err:
        print(u'删除图片失败:%s' % str(err))
Beispiel #3
0
def downLoadCss(styleUrls):
    """Collect the stylesheet content for every URL in *styleUrls*.

    Protocol-relative URLs (``//host/...``) are prefixed with ``http:``.
    Content is fetched through ``getStyle`` and remembered per URL hash for
    the duration of this call, so a repeated URL is fetched only once as long
    as the first fetch produced a truthy result.

    Returns:
        A list with one entry per input URL for which content is available,
        in input order.
    """
    fetched_by_hash = {}
    collected = []
    for raw_url in styleUrls:
        full_url = u'http:' + raw_url if raw_url.startswith(u'//') else raw_url
        # The hash of the normalised URL is the cache key
        key = EncryptUtil.md5(full_url)
        if key not in fetched_by_hash:
            # Not seen yet during this call: fetch it and keep truthy results
            content = getStyle(full_url)
            if content:
                fetched_by_hash[key] = content
        cached = fetched_by_hash.get(key)
        if cached:
            collected.append(cached)
    return collected
Beispiel #4
0
def getStyle(url):
    """Return the stylesheet content for *url*, backed by a JSON file cache.

    The cache file path comes from ``getFilePath(EncryptUtil.md5(url))`` and
    holds ``update_time``, ``url`` and ``styles``. A cache entry younger than
    five days is returned as is. An older entry - or one that cannot be parsed
    - is deleted and the stylesheet is downloaded again through
    ``CssUtil.downLoad``.

    Returns:
        The cached or freshly downloaded styles. A falsy download result is
        returned unchanged and is not written to the cache.
    """
    time_format = u'%Y-%m-%d %H:%M:%S'
    max_age_days = 5
    file_path = getFilePath(EncryptUtil.md5(url))

    # Look at the cached copy first
    if os.path.exists(file_path):
        styles = None
        is_fresh = False
        try:
            with open(file_path, u'r') as cache_file:
                detail = json.load(cache_file)
            update_time = datetime.datetime.strptime(
                detail[u'update_time'], time_format)
            styles = detail[u'styles']
            age_days = (datetime.datetime.now() - update_time).days
            is_fresh = age_days < max_age_days
        except (ValueError, KeyError, TypeError):
            # A truncated or malformed cache file would otherwise make every
            # later call fail; treat it like an expired entry instead.
            is_fresh = False
        if is_fresh:
            return styles
        # Expired or unreadable. The file is already closed here, so it can be
        # removed before downloading again (no recursion, so a delete that
        # does not take effect cannot loop forever).
        FileUtil.delFile(file_path)

    styles = CssUtil.downLoad(url)
    if styles:
        # Serialise before opening the file so a serialisation error cannot
        # leave a half-written cache file behind.
        payload = json.dumps({
            u'update_time': datetime.datetime.now().strftime(time_format),
            u'url': url,
            u'styles': styles,
        })
        with open(file_path, u'w') as cache_file:
            cache_file.write(payload)
    return styles
Beispiel #5
0
 def getHashCode(self, source_url):
     """Return the hash code of *source_url* as computed by ``EncryptUtil.md5``."""
     hash_code = EncryptUtil.md5(source_url)
     return hash_code
Beispiel #6
0
 def getWxArticleHashCode(self, title, wx_account, source_id):
     """Return the hash code of a WeChat article.

     The hash input is the UTF-8 encoded title, the UTF-8 encoded account and
     ``str(source_id)`` concatenated in that order. This variant is meant for
     WeChat only; use ``getHashCode`` for everything else.
     """
     title_bytes = title.encode('utf8')
     account_bytes = wx_account.encode('utf8')
     raw_key = title_bytes + account_bytes + str(source_id)
     return EncryptUtil.md5(raw_key)
Beispiel #7
0
def downLoadImage(image_url_sources):
    """Download the images in *image_url_sources* into the local ``image`` folder.

    The folder lives next to this module and is created on demand. Each image
    is stored under the name ``<md5 of url>.jpg``; a file that already exists
    is not downloaded again. A download larger than 100 KB is resized in place
    through ``ImageCompressUtil().resizeImg`` (600 x 600, quality 80).

    One record per image is appended to a local list:

    * success: ``{'ok': True, 'x': {'url', 'path', 'fileName'}}``
    * failure: ``{'ok': False, 'x': {'url'}}``

    Args:
        image_url_sources: iterable of objects offering ``.get('url')``.

    NOTE(review): the visible code builds the record list but does not return
    it - confirm whether the end of this function was cut off.
    """
    image_urls = []
    # The folder location is the same for every image, so compute it once.
    file_path = os.path.dirname(os.path.realpath(__file__)) + u'/image'
    for image_url in image_url_sources:
        if not os.path.isdir(file_path):
            os.mkdir(file_path)
        url = image_url.get('url')
        print(url)
        urlHash = EncryptUtil.md5(url)
        fileName = str(urlHash) + '.jpg'
        # NOTE(review): the Windows separator is hard-coded; delImg builds the
        # same path, so both places have to change together.
        detailPath = file_path + '\\' + fileName

        if FileUtil.fileIsExist(detailPath):
            print(u'图片已经存在本地:' + url)
            image_url_new = {
                'ok': True,
                'x': {
                    'url': url,
                    'path': detailPath,
                    'fileName': fileName
                }
            }
        else:
            try:
                fileResponse = requests.get(url, timeout=10)
                if fileResponse.status_code == 200:
                    content = fileResponse.content
                    # Close the file before resizeImg reads the same path, so
                    # all bytes are guaranteed to be on disk.
                    with open(detailPath, 'wb') as imageFile:
                        imageFile.write(content)
                    # Larger than 100 KB: shrink to 600 x 600 at quality 80
                    if len(content) > 100 * 1024:
                        ImageCompressUtil().resizeImg(ori_img=detailPath,
                                                      dst_img=detailPath,
                                                      dst_w=600,
                                                      dst_h=600,
                                                      save_q=80)
                    image_url_new = {
                        'ok': True,
                        'x': {
                            'url': url,
                            'path': detailPath,
                            'fileName': fileName
                        }
                    }
                    # Example URL:
                    # http://p0.ifengimg.com/pmop/2017/1010/E66C2599CE9403A670AD405F4CCAB271B366D7DC_size415_w1290_h692.png
                    # Floor division keeps the sizes whole numbers of KB on
                    # Python 2 and Python 3 alike.
                    print(u'图片成功下载,大小:' + str(
                        len(content) // 1024) + 'kb ' + url)
                    print(u'最终存储图片,大小:' + str(
                        os.path.getsize(detailPath) // 1024) + 'kb ' + url)
                else:
                    print(u'下载图片失败:' + url)
                    image_url_new = {
                        'ok': False,
                        'x': {
                            'url': url,
                        }
                    }
            except Exception:
                # Best effort: one failing image must not abort the rest.
                print(u'下载图片失败:' + url)
                image_url_new = {
                    'ok': False,
                    'x': {
                        'url': url,
                    }
                }
        image_urls.append(image_url_new)
        # Idle for 2s before handling the next image
        TimerUtil.sleep(2)