Beispiel #1
0
    def start_requests(self):
        """Yield one list-page request per Jiemian channel id.

        Polls every 20 seconds until the network check and then the service
        check succeed.  Afterwards page 1 of every channel id listed below is
        requested; responses are handled by ``parseArticleList``.
        """
        readiness_checks = (
            (NetworkUtil.checkNetWork, u'检测网络不可行'),
            (NetworkUtil.checkService, u'检测服务器不可行'),
        )
        for is_ready, warning in readiness_checks:
            while not is_ready():
                # Re-check every 20 seconds; the warning is logged after the wait.
                TimerUtil.sleep(20)
                self.logDao.warn(warning)

        # Channel ids in order: must-read, gadgets, product ranking, bulletins,
        # game headlines, single products, round-ups, gossip, game bulletins.
        channel_ids = ['6', '66', '73', '84', '100', '119', '120', '121', '122']
        base_url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
        for channel_id in channel_ids:
            for page_no in range(1, 2):  # only page 1 is requested
                list_url = base_url + str(page_no) + '&cid=' + str(channel_id)
                self.logDao.warn(u'进行抓取列表:' + list_url)
                request_meta = {
                    'request_type': 'jiemian_page_list',
                    'url': list_url,
                }
                yield scrapy.Request(url=list_url,
                                     meta=request_meta,
                                     callback=self.parseArticleList,
                                     dont_filter=True)
Beispiel #2
0
    def start_requests(self):
        """Request the Yicai news list page unless a crawl is already running.

        When ``getStatus()`` reports ``'running'`` the method sets
        ``isRunningStop`` and yields nothing.  Otherwise it saves the
        ``'running'`` status, polls every 20 seconds until the network and
        service checks pass, and yields one request handled by
        ``parseArticleList``.
        """
        if self.getStatus() == 'running':
            # A crawl is already in progress: flag it and skip this round.
            self.isRunningStop = True
            return
        self.saveStatus('running')

        # Wait for the network.
        while True:
            if NetworkUtil.checkNetWork():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the service.
        while True:
            if NetworkUtil.checkService():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        list_url = 'http://www.yicai.com/news/'
        self.logDao.warn(u'进行抓取列表:' + list_url)
        request_meta = {
            'request_type': 'diyicaijing_page_list',
            'url': list_url,
            'src_channel': u'第一财经',
            'sub_channel': u'新闻',
        }
        yield scrapy.Request(url=list_url,
                             meta=request_meta,
                             callback=self.parseArticleList,
                             dont_filter=True)
Beispiel #3
0
 def upload(self, path):
     """
         cos_path:/news/jiemian/image/
     :param path
     :return:
     """
     counter = 0
     url = ''
     while counter != 10:
         try:
             # 得到hash
             uploadName = path.replace('full/', '')
             request = UploadFileRequest(u"crawler",
                                         self.cos_path + uploadName,
                                         self.local_path + path,
                                         insert_only=0)
             upload_file_ret = self.cos_client.upload_file(request)
             if upload_file_ret['code'] == 0:
                 data = upload_file_ret['data'] or {}
                 url = data['source_url']
                 print u'上传成功', url
             else:
                 print u'上传图片失败', upload_file_ret
             break
         except Exception as e:
             counter += 1
             TimerUtil.sleep(10)
     return url
    def start_requests(self):
        """Yield the request for page 1 of the Sina roll feed.

        Polls every 20 seconds until the network check and then the service
        check succeed.  A random ``r`` query value is appended to the list URL
        and the response is handled by ``parseList2``.
        """
        readiness_checks = (
            (NetworkUtil.checkNetWork, u'检测网络不可行'),
            (NetworkUtil.checkService, u'检测服务器不可行'),
        )
        for is_ready, warning in readiness_checks:
            while not is_ready():
                # Re-check every 20 seconds.
                TimerUtil.sleep(20)
                self.logDao.warn(warning)

        for page_no in range(1, 2):  # only page 1 is requested
            rand_value = random.uniform(0, 1)
            feed_url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&callback=&_=1501148356254&page='
            list_url = feed_url + str(page_no) + '&r=' + str(rand_value)
            self.logDao.info(u"开始抓取列表:" + list_url)
            request_meta = {
                'request_type': 'sina_list',
                'url': list_url,
            }
            yield scrapy.Request(url=list_url,
                                 meta=request_meta,
                                 callback=self.parseList2)
Beispiel #5
0
    def start_requests(self):
        """Yield the two Sina list requests (roll interface, then mix feed).

        Polls every 20 seconds until the network check and then the service
        check succeed.  Each list URL gets a random ``r`` query value
        appended.  The roll interface is parsed by ``parseList``; the mix
        feed, used to fill gaps, is parsed by ``parseList2``.
        """
        # Wait for the network.
        while True:
            if NetworkUtil.checkNetWork():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the service.
        while True:
            if NetworkUtil.checkService():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        list_sources = (
            # Main crawl.
            ('http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=96&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=220&asc=&page=1',
             self.parseList),
            # Gap filler.
            ('http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&page=1&callback=&_=1501148356254',
             self.parseList2),
        )
        for base_url, handler in list_sources:
            rand_value = random.uniform(0, 1)
            list_url = base_url + '&r=' + str(rand_value)
            self.logDao.info(u"开始抓取列表:" + list_url)
            request_meta = {'request_type': 'sina_list', 'url': list_url}
            yield scrapy.Request(url=list_url, meta=request_meta, callback=handler)
    def start_requests(self):
        """Request the tech.qq.com scroll page unless a crawl is running.

        When ``getStatus()`` reports ``'running'`` the method sets
        ``isRunningStop`` and yields nothing.  Otherwise it saves the
        ``'running'`` status, polls every 20 seconds until the network and
        service checks pass, and yields one request handled by
        ``parseArticleList``.
        """
        if self.getStatus() == 'running':
            # A crawl is already in progress: flag it and skip this round.
            self.isRunningStop = True
            return
        self.saveStatus('running')

        readiness_checks = (
            (NetworkUtil.checkNetWork, u'检测网络不可行'),
            (NetworkUtil.checkService, u'检测服务器不可行'),
        )
        for is_ready, warning in readiness_checks:
            while not is_ready():
                # Re-check every 20 seconds.
                TimerUtil.sleep(20)
                self.logDao.warn(warning)

        list_url = 'http://tech.qq.com/l/scroll.htm'
        self.logDao.warn(u'进行抓取列表:' + list_url)
        request_meta = {
            'request_type': 'demoName_page_list',
            'url': list_url,
        }
        yield scrapy.Request(url=list_url,
                             meta=request_meta,
                             callback=self.parseArticleList,
                             dont_filter=True)
 def parseList(self, response):
     """
     Handle a Sogou account-search response and store the matching source.

     On a 302 response the account is marked ``'updateFail'``,
     ``NetworkUtil.getNewIp()`` is called and the thread sleeps 30 seconds.
     Otherwise the result list is scanned for an entry whose account id
     equals the requested one: a hit is saved through
     ``wxSourceDao.updateSource`` with status ``'last'``, a miss marks the
     account ``'none'``.

     :param response: response whose ``meta`` carries ``wx_account``
     :return: None
     """
     wx_account = response.meta['wx_account']
     # Blocked for requesting too often: the site answers with a redirect.
     if response.status == 302:
         self.logDao.warn(u'您的访问过于频繁,重新拨号')
         self.wxSourceDao.updateStatus(wx_account, 'updateFail')
         # Get a new IP and idle this thread for 30 seconds.
         NetworkUtil.getNewIp()
         TimerUtil.sleep(30)
         return

     body = EncodeUtil.toUnicode(response.body)
     self.logDao.info(u'开始解析:' + wx_account)
     selector = Selector(text=body)
     results = selector.xpath('//*[@id="main"]/div[4]/ul/li')
     self.logDao.info(u'列表长度:' + str(len(results)))
     for result in results:
         # NOTE(review): these expressions start with '//' and are therefore
         # evaluated against the whole document, not relative to `result`
         # (see Scrapy's notes on relative XPaths), so every iteration reads
         # the same first match - confirm whether './/' was intended.
         wx_name = result.xpath('//*[@id="sogou_vr_11002301_box_0"]/div/div[2]/p[1]/a/text()').extract_first()
         wx_account_ = result.xpath('//p[@class="info"]/label/text()').extract_first()
         wx_url = result.xpath('//p[@class="tit"]/a/@href').extract_first()
         if wx_account_ == wx_account:
             self.logDao.info(u'成功抓取:' + wx_account_)
             self.wxSourceDao.updateSource(wx_account, wx_name, wx_url, 'last')
             return

     # Nothing matched.  Log the requested account: the scraped value is
     # unbound when the result list is empty and may be None otherwise.
     self.logDao.info(u'没有抓到:' + wx_account)
     self.wxSourceDao.updateStatus(wx_account, 'none')
    def start_requests(self):
        """Yield a Sogou account-search request for each selected source.

        Polls every 20 seconds until the network check and then the service
        check succeed.  Sources come from ``wxSourceDao.queryEnable_special``
        restricted to the listed account.  Each one is marked ``'updating'``
        before its search request is yielded; responses are handled by
        ``parseList``.
        """
        # TODO: adding the `while` loops may be a problem; some sources end up
        # not being fetched.
        readiness_checks = (
            (NetworkUtil.checkNetWork, u'检测网络不可行'),
            (NetworkUtil.checkService, u'检测服务器不可行'),
        )
        for is_ready, warning in readiness_checks:
            while not is_ready():
                # Re-check every 20 seconds.
                TimerUtil.sleep(20)
                self.logDao.warn(warning)

        search_prefix = 'http://weixin.sogou.com/weixin?type=1&s_from=input&ie=utf8&_sug_=n&_sug_type_=&query='
        sources = self.wxSourceDao.queryEnable_special(isRandom=True, wx_accounts=['CINNO_CreateMore'])
        for source in sources:
            # Only the account id is needed from the 7-field source row.
            (_name, wx_account, _page_url, _avatar, _status, _enabled, _updated) = source
            # Mark as in progress; parseList sets the final status later.
            self.wxSourceDao.updateStatus(wx_account, 'updating')
            search_url = search_prefix + wx_account
            self.logDao.warn(u'进行抓取:' + search_url)
            request_meta = {
                'request_type': 'weixin_source',
                'url': search_url,
                'wx_account': wx_account,
                'source': source,
            }
            yield scrapy.Request(url=search_url,
                                 meta=request_meta,
                                 callback=self.parseList,
                                 dont_filter=True)
 def parseArticleList(self, response):
     body = EncodeUtil.toUnicode(response.body)
     selector = Selector(text=body)
     source_url = response.meta['source_url']
     print source_url
     title = selector.xpath('//title/text()').extract_first('').strip(u' ')
     isN = u"请输入验证码" == title
     if isN or response.status == 302:
         self.logDao.info(u'访问过多被禁止,重新拨号')
         # 获取Ip # 同时空线程30s
         NetworkUtil.getNewIp()
         TimerUtil.sleep(50)
         NetworkUtil.openWebbrowser(source_url)
     else:
         source = response.meta['source']
         wx_account = response.meta['wx_account']
         wx_account_id = response.meta['wx_account_id']
         self.logDao.info(u'开始解析列表:' + wx_account)
         # 进行解析
         articleJS = selector.xpath('//script/text()').extract()
         for js in articleJS:
             if 'var msgList = ' in js:
                 p8 = re.compile('var\s*msgList\s*=.*;')
                 matchList = p8.findall(js)
                 for match in matchList:
                     match = match.lstrip('var msgList = ').rstrip(';')
                     # 格式化
                     articles = demjson.decode(match) or {}
                     articles = articles['list'] or []
                     self.logDao.info(u'匹配到文章列表' + wx_account)
                     for article in articles:
                         app_msg_ext_info = article.get(
                             'app_msg_ext_info') or {}
                         desc = app_msg_ext_info.get('digest') or ''
                         title = app_msg_ext_info.get('title') or ''
                         # 如果存在则不抓取
                         if self.checkDao.checkExist(title, wx_account, 1):
                             self.logDao.info(u'已经存在' + wx_account + ':' +
                                              title)
                             continue
                         detailUrl = app_msg_ext_info['content_url'] or ''
                         if not detailUrl:
                             continue
                         detailUrl = "http://mp.weixin.qq.com" + detailUrl
                         detailUrl = detailUrl.replace("amp;", "")
                         self.logDao.info(u'抓取' + wx_account + ':' + title +
                                          ':' + detailUrl)
                         yield scrapy.Request(url=detailUrl,
                                              meta={
                                                  'request_type':
                                                  'weixin_detail',
                                                  'wx_account': wx_account,
                                                  "source": source,
                                                  "title": title,
                                                  'wx_account_id':
                                                  wx_account_id,
                                                  "source_url": detailUrl
                                              },
                                              callback=self.parseArticle)
Beispiel #10
0
    def start_requests(self):
        """Yield a list-page request for every WeChat source, broken ones first.

        Nothing is yielded when the current hour is 0 through 6 or when
        ``getStatus()`` reports ``'running'``.  Otherwise the ``'running'``
        status is saved and the method polls every 20 seconds until the
        network and service checks pass.  Sources whose account appears in
        the list returned by ``getBrokenAccounts()`` are requested before the
        others.  The ordered list is kept in ``self.wxSources`` and responses
        are handled by ``parseArticleList``.
        """
        # No crawling during the night hours (0-6).
        current_hour = datetime.datetime.now().hour
        if 0 <= current_hour <= 6:
            self.logDao.info(u'这个时间不爬。0-6点')
            return

        # A crawl is already in progress: skip this round.
        if self.getStatus() == 'running':
            return
        self.saveStatus('running')

        readiness_checks = (
            (NetworkUtil.checkNetWork, u'检测网络不可行'),
            (NetworkUtil.checkService, u'检测服务器不可行'),
        )
        for is_ready, warning in readiness_checks:
            while not is_ready():
                # Re-check every 20 seconds.
                TimerUtil.sleep(20)
                self.logDao.warn(warning)

        sources = self.wxSourceDao.queryWxUrl(isRandom=True)

        # Put the sources recorded as broken at the front of the queue.
        _broken_since, broken_accounts = self.getBrokenAccounts()
        prioritized = []
        remaining = []
        for source in sources:
            (_row_id, _name, account, _page_url, _avatar, _status,
             _enabled, _updated) = source
            bucket = prioritized if account in broken_accounts else remaining
            bucket.append(source)
        sources = prioritized + remaining

        self.wxSources = sources
        for source in sources:
            (row_id, _name, wx_account, page_url, _avatar, _status,
             _enabled, _updated) = source
            self.logDao.warn(u'进行抓取:' + page_url)
            request_meta = {
                'request_type': 'weixin_page_list',
                'source_url': page_url,
                'wx_account': wx_account,
                'source': source,
                'wx_account_id': row_id,
            }
            yield scrapy.Request(url=page_url,
                                 meta=request_meta,
                                 callback=self.parseArticleList,
                                 dont_filter=True)
Beispiel #11
0
    def process_item(self, item, spider):
        """Download every image referenced by ``item['image_urls']``.

        Each entry is a dict with a ``'url'`` key.  The image is stored as
        ``<savePath>/full/<md5 of url>.jpg``; a file already on disk is not
        fetched again.  One result dict per entry is collected in the form
        ``{'ok': bool, 'x': {'url': ..., 'path': ...}}``, with ``'path'``
        present only on success.  The loop idles 2 seconds after each entry.

        NOTE(review): the collected results are neither stored nor returned
        in the code visible here - confirm against the full pipeline.

        :param item: mapping with an ``'image_urls'`` list
        :param spider: running spider; only its ``logDao`` is used
        """
        image_urls = []
        for image_url in item['image_urls']:
            url = image_url.get('url')
            path = 'full/' + str(EncryptUtil.md5(url)) + '.jpg'
            detailPath = self.savePath + '/' + path
            # Make sure the target directory exists.
            saveDir = self.savePath + '/full'
            if not FileUtil.dirIsExist(saveDir):
                FileUtil.createDir(saveDir)

            downloaded = False
            if FileUtil.fileIsExist(detailPath):
                spider.logDao.info(u'图片已经存在本地:' + url)
                downloaded = True
            else:
                try:
                    fileResponse = requests.get(url, timeout=10)
                    if fileResponse.status_code == 200:
                        # `with` closes the handle even if the write fails;
                        # the bare open(...).write(...) leaked it.
                        with open(detailPath, 'wb') as imageFile:
                            imageFile.write(fileResponse.content)
                        downloaded = True
                        spider.logDao.info(u'图片成功下载:' + url)
                    else:
                        spider.logDao.info(u'下载图片失败:' + url)
                except Exception as e:
                    # Best effort: report, record the failure and carry on.
                    print(e)
                    spider.logDao.warn(u'下载图片失败:' + url)
                    downloaded = False

            details = {'url': url}
            if downloaded:
                details['path'] = path
            image_urls.append({'ok': downloaded, 'x': details})
            # Idle 2 seconds between images.
            TimerUtil.sleep(2)
Beispiel #12
0
    def start_requests(self):
        """Yield Sina roll-news list requests for pages 0 through 10.

        The network and service checks each run once; a failed check sleeps
        20 seconds and logs a warning, then the method carries on regardless.
        ``self.request_stop`` is re-read before every page.  When it is set,
        the loop ends early and ``self.request_stop_time`` is updated.  When
        all pages were issued with the flag clear, the method sleeps
        10 minutes.  Responses are handled by ``parseList``.
        """
        # while True:
        # Check the network (once; the surrounding loop is disabled).
        if not NetworkUtil.checkNetWork():
            # Wait 20 seconds after a failed check.
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
            # continue

        # Check the service (once).
        if not NetworkUtil.checkService():
            # Wait 20 seconds after a failed check.
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
            # continue

        if self.request_stop:
            # Redialling takes an unpredictable time to take effect, so wait
            # a while before retrying.
            timeSpace = time.time() - self.request_stop_time
            if timeSpace / 60 <= 2:
                # Two minutes or less since the stop: no requests yet.  With
                # `continue` disabled, the flag stays set and the page loop
                # below breaks on its first iteration.
                # continue
                pass
            else:
                self.request_stop = False

        # Start crawling.
        url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=30&spec=&type=&ch=05&k' \
              '=&offset_page=0&offset_num=0&num=60&asc=&page='

        for page in range(0, 11):
            if self.request_stop:
                self.logDao.warn(u'出现被绊或者出现网络异常,退出循环')
                # Once blocked, stop issuing requests and wait for the IP to
                # change.
                break
            r = random.uniform(0, 1)
            newUrl = url + str(page)
            newUrl += ('&r=' + str(r))
            self.logDao.info(u"开始抓取列表:" + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={'url': newUrl},
                                 callback=self.parseList)
            # Idle 2 seconds between requests.
            TimerUtil.sleep(2)

        if self.request_stop:
            # A redial is needed; record when the stop was noticed.
            self.logDao.warn(u'发送重新拨号信号,请等待2分钟会尝试重新抓取')
            self.request_stop_time = time.time()
            pass
        else:
            # A normal round finished: idle 10 minutes so requests that are
            # still outstanding are not disturbed.
            self.logDao.info(u'请求了一轮了,但是可能还有没有请求完成,睡一会10分钟')
            TimerUtil.sleep(10 * 60)
            pass
Beispiel #13
0
    def start_requests(self):
        """Yield the list requests for NetEase tech and NetEase finance.

        Polls every 20 seconds until the network check and then the service
        check succeed.  Each feed URL gets a random number appended as its
        query string; both responses are handled by ``parseArticleList``.
        """
        # Wait for the network.
        while True:
            if NetworkUtil.checkNetWork():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the service.
        while True:
            if NetworkUtil.checkService():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # (source channel, sub channel, feed URL prefix)
        channels = (
            (u'网易科技', u'科技',
             'http://tech.163.com/special/00094IHV/news_json.js?'),
            (u'网易财经', u'财经',
             'http://money.163.com/special/00251G8F/news_json.js?'),
        )
        for src_channel, sub_channel, feed_url in channels:
            list_url = feed_url + str(random.uniform(0, 1))
            self.logDao.warn(u'进行抓取列表:' + list_url)
            request_meta = {
                'request_type': 'wangyi_page_list',
                'url': list_url,
                'src_channel': src_channel,
                'sub_channel': sub_channel,
            }
            yield scrapy.Request(url=list_url,
                                 meta=request_meta,
                                 callback=self.parseArticleList,
                                 dont_filter=True)
Beispiel #14
0
    def start_requests(self):
        """Yield the list requests for Tencent tech and Tencent finance.

        Polls every 20 seconds until the network check and then the service
        check succeed.  The tech scroll page is handled by
        ``parseArticleList`` and the finance front page by
        ``parseArticleList2``.
        """
        readiness_checks = (
            (NetworkUtil.checkNetWork, u'检测网络不可行'),
            (NetworkUtil.checkService, u'检测服务器不可行'),
        )
        for is_ready, warning in readiness_checks:
            while not is_ready():
                # Re-check every 20 seconds.
                TimerUtil.sleep(20)
                self.logDao.warn(warning)

        def build_request(list_url, src_channel, sub_channel, handler):
            # Log the target, then build the request for one channel page.
            self.logDao.warn(u'进行抓取列表:' + list_url)
            return scrapy.Request(url=list_url,
                                  meta={
                                      'request_type': 'tengxun_page_list',
                                      'url': list_url,
                                      'src_channel': src_channel,
                                      'sub_channel': sub_channel,
                                  },
                                  callback=handler,
                                  dont_filter=True)

        yield build_request('http://tech.qq.com/l/scroll.htm',
                            u'腾讯科技', u'科技', self.parseArticleList)
        yield build_request('http://finance.qq.com/',
                            u'腾讯财经', u'财经要闻', self.parseArticleList2)
    def start_requests(self):
        """Yield a list-page request for each selected WeChat source.

        Polls every 20 seconds until the network check and then the service
        check succeed.  Sources come from ``wxSourceDao.queryWxUrl_special``
        restricted to the listed account; responses are handled by
        ``parseArticleList``.
        """
        # Accounts that cannot be found by search:
        # "didalive", "HIS_Technology", "ad_helper", "zhongduchongdu".
        # TODO: adding the `while` loops may be a problem; some sources may
        # end up not being fetched.
        # Wait for the network.
        while True:
            if NetworkUtil.checkNetWork():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the service.
        while True:
            if NetworkUtil.checkService():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        sources = self.wxSourceDao.queryWxUrl_special(isRandom=True,
                                                      wx_accounts=['qqtech'])
        for source in sources:
            (row_id, _name, wx_account, page_url, _avatar, _status,
             _enabled, _updated) = source
            self.logDao.warn(u'进行抓取:' + page_url)
            request_meta = {
                'request_type': 'weixin_page_list',
                'source_url': page_url,
                'wx_account': wx_account,
                'source': source,
                'wx_account_id': row_id,
            }
            yield scrapy.Request(url=page_url,
                                 meta=request_meta,
                                 callback=self.parseArticleList,
                                 dont_filter=True)
Beispiel #16
0
    def start_requests(self):
        """Yield Sohu tech feed requests for pages 1 to 3 unless already running.

        When ``getStatus()`` reports ``'running'`` the method sets
        ``isRunningStop`` and yields nothing.  Otherwise it saves the
        ``'running'`` status and polls every 20 seconds until the network and
        service checks pass.  Responses are handled by ``parseArticleList``.
        """
        if self.getStatus() == 'running':
            # A crawl is already in progress: flag it and skip this round.
            self.isRunningStop = True
            return
        self.saveStatus('running')

        readiness_checks = (
            (NetworkUtil.checkNetWork, u'检测网络不可行'),
            (NetworkUtil.checkService, u'检测服务器不可行'),
        )
        for is_ready, warning in readiness_checks:
            while not is_ready():
                # Re-check every 20 seconds.
                TimerUtil.sleep(20)
                self.logDao.warn(warning)

        src_channel = u'搜狐科技'
        sub_channel = u'科技'
        feed_url = 'http://v2.sohu.com/public-api/feed?scene=CHANNEL&sceneId=30&size=20&callback=&_=1502075449669&page='
        for page_no in range(1, 4):
            list_url = feed_url + str(page_no)
            self.logDao.warn(u'进行抓取列表:' + list_url)
            request_meta = {
                'request_type': 'sohu_page_list',
                'url': list_url,
                'src_channel': src_channel,
                'sub_channel': sub_channel,
            }
            yield scrapy.Request(url=list_url,
                                 meta=request_meta,
                                 callback=self.parseArticleList,
                                 dont_filter=True)
Beispiel #17
0
    def start_requests(self):
        """Yield one list-page request per Jiemian channel unless already running.

        When ``getStatus()`` reports ``'running'`` the method sets
        ``isRunningStop`` and yields nothing.  Otherwise it saves the
        ``'running'`` status and polls every 20 seconds until the network and
        service checks pass.  Page 1 of every channel in the table below is
        then requested, with the channel names passed along in the request
        meta.  Responses are handled by ``parseArticleList``.
        """
        if self.getStatus() == 'running':
            # A crawl is already in progress: flag it and skip this round.
            self.isRunningStop = True
            return
        self.saveStatus('running')

        # Wait for the network.
        while True:
            if NetworkUtil.checkNetWork():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the service.
        while True:
            if NetworkUtil.checkService():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # (source channel, sub channel, channel id)
        channel_table = (
            (u'界面科技', u'必读', '6'),
            (u'界面科技', u'玩物', '66'),
            (u'界面科技', u'产品榜', '73'),
            (u'界面科技', u'快报', '84'),
            (u'界面游戏', u'游戏要闻', '100'),
            (u'界面游戏', u'单品', '119'),
            (u'界面游戏', u'盘点', '120'),
            (u'界面游戏', u'花边要闻', '121'),
            (u'界面游戏', u'游戏快报', '122'),
        )
        base_url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
        for src_channel, sub_channel, cid_num in channel_table:
            for page_no in range(1, 2):  # only page 1 is requested
                list_url = base_url + str(page_no) + ('&cid=' + cid_num)
                self.logDao.warn(u'进行抓取列表:' + list_url)
                request_meta = {
                    'request_type': 'jiemian_page_list',
                    'url': list_url,
                    'src_channel': src_channel,
                    'sub_channel': sub_channel,
                }
                yield scrapy.Request(url=list_url,
                                     meta=request_meta,
                                     callback=self.parseArticleList,
                                     dont_filter=True)
Beispiel #18
0
    def start_requests(self):
        """Yield the three Sina list requests (tech roll, tech feed, finance).

        Polls every 20 seconds until the network check and then the service
        check succeed.  The two tech URLs get a random ``r`` query value
        appended; the finance front page is requested as is.  The responses
        are handled by ``parseList``, ``parseList2`` and ``parseList3``
        respectively.
        """
        readiness_checks = (
            (NetworkUtil.checkNetWork, u'检测网络不可行'),
            (NetworkUtil.checkService, u'检测服务器不可行'),
        )
        for is_ready, warning in readiness_checks:
            while not is_ready():
                # Re-check every 20 seconds.
                TimerUtil.sleep(20)
                self.logDao.warn(warning)

        def build_request(list_url, src_channel, sub_channel, handler):
            # Log the target, then build the request for one list page.
            self.logDao.info(u"开始抓取列表:" + list_url)
            return scrapy.Request(url=list_url,
                                  meta={
                                      'request_type': 'sina_list',
                                      'url': list_url,
                                      'src_channel': src_channel,
                                      'sub_channel': sub_channel,
                                  },
                                  callback=handler)

        # Main tech crawl.
        roll_url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=96&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=220&asc=&page=1'
        rand_value = random.uniform(0, 1)
        yield build_request(roll_url + ('&r=' + str(rand_value)),
                            u'新浪科技', u'科技', self.parseList)

        # Gap filler for the tech channel.
        feed_url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&page=1&callback=&_=1501148356254'
        rand_value = random.uniform(0, 1)
        yield build_request(feed_url + ('&r=' + str(rand_value)),
                            u'新浪科技', u'科技', self.parseList2)

        # Sina finance headlines.
        yield build_request('http://finance.sina.com.cn/',
                            u'新浪财经', u'要闻', self.parseList3)
Beispiel #19
0
    def parseArticle(self, response):
        body = EncodeUtil.toUnicode(response.body)
        selector = Selector(text=body)
        title = selector.xpath('//title/text()').extract_first('').strip(u' ')
        source_url = response.meta['source_url']
        wx_account = response.meta['wx_account']
        isN = u"请输入验证码" == title
        if isN or response.status == 302:
            self.logDao.info(u'访问过多被禁止,重新拨号')
            # 存起来
            self.brokenAccounts.append(wx_account)
            # 获取Ip # 同时空线程30s
            NetworkUtil.getNewIp()
            TimerUtil.sleep(80)
            NetworkUtil.openWebbrowser(source_url)
        else:
            title = response.meta['title']
            source_url = response.meta['source_url']
            wx_account_id = response.meta['wx_account_id']
            self.logDao.info(u'开始解析文章' + wx_account + ':' + title + ':' +
                             source_url)
            self.logDao.info(u'开始解析文章:' + source_url)
            # 进行解析
            post_date = selector.xpath(
                '//*[@id="post-date"]/text()').extract_first('')

            try:
                post_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.strptime(post_date, "%Y-%m-%d"))
            except Exception:
                pass

            styles = selector.xpath('//style/text()').extract()
            styles = CssUtil.compressCss(styles).replace('\'', '"').replace(
                '\\', '\\\\')
            styles = CssUtil.clearUrl(styles)
            styles = CssUtil.clearBackgroundColor(styles, ['#f3f3f3'])

            post_user = selector.xpath(
                '//*[@id="post-user"]/text()').extract_first('')
            content_html = selector.xpath('//*[@id="js_content"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # 去除内部不需要的标签
            content_items = content_html.xpath('*')
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # content_items_new = []
            # for item in content_items:
            #     itemStr = item.extract()
            #     if u'订阅微信' in itemStr:
            #         continue
            #     content_items_new.append(item)
            # content_items = content_items_new

            # 得到纯文本
            content_txt = []
            for item in content_items:
                # 文本
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # 加入
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # 组装新的内容标签
            outHtml = """<div class="rich_media_content " id="js_content">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)

            # 解析文档中的所有图片url,然后替换成标识
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # 图片可能放在src 或者data-src
                image_url = img.xpath('@src | @data-src').extract_first('')
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
            self.logDao.info(wx_account + u'得到文章:' + title + ":" + post_date +
                             ':' + post_user)
            self.logDao.info(u'得到文章:' + source_url)

            # 得到hashCode1
            hash_code = self.checkDao.getHashCode(title, wx_account, 1)

            self.saveFile(hash_code, body)

            # 去除 image 的 alt title
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # 处理提示块img的 alt title, 关注//img/@alt|//img/@title
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = ''
            contentItem['post_user'] = post_user
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 1
            contentItem['src_account_id'] = wx_account_id
            contentItem['src_channel'] = '微信公众号'
            contentItem['src_ref'] = ''
            contentItem['wx_account'] = wx_account

            return contentItem
Beispiel #20
0
    def start_requests(self):
        """Yield the initial list-page requests for the ifeng channels.

        Returns without yielding anything (after setting
        ``self.isRunningStop``) when the stored status is already
        ``'running'``. Otherwise the status is saved as ``'running'``, the
        method waits until the network check and the service check both
        pass, and then one ``scrapy.Request`` is yielded per configured list
        page. Each request carries the channel names and the default
        stylesheet URLs in ``meta`` and is routed to its own list parser.
        """
        # A crawl is already marked as running: flag it and emit no requests.
        if self.getStatus() == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')

        # Wait for the network check to pass, re-checking every 20 seconds.
        while True:
            if NetworkUtil.checkNetWork():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the service check to pass, re-checking every 20 seconds.
        while True:
            if NetworkUtil.checkService():
                break
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        responsive_css = ('http://p0.ifengimg.com/fe/responsiveDetail/'
                          'styles/pc_c38f5a0e.css')
        # One row per list page:
        # (src_channel, sub_channel, list URL, default stylesheets, parser).
        # The parser is stored by name and looked up only when its request
        # is built; every row owns a separate stylesheet list.
        list_pages = (
            (u'凤凰财经', u'电子竞技',
             'http://games.ifeng.com/listpage/17886/1/list.shtml',
             ['http://p2.ifengimg.com/a/2016/0523/esports.css',
              'http://y1.ifengimg.com/package/t_20130820__15953/css/'
              'pl_detail_v8.css'],
             'parseArticleList3'),
            (u'凤凰财经', u'产品资讯',
             'http://games.ifeng.com/listpage/27456/1/list.shtml',
             [responsive_css],
             'parseArticleList'),
            (u'凤凰财经', u'热点资讯',
             'http://games.ifeng.com/listpage/27455/1/list.shtml',
             [responsive_css],
             'parseArticleList'),
            (u'凤凰科技', u'资讯',
             'http://tech.ifeng.com/listpage/800/0/1/rtlist.shtml',
             [responsive_css],
             'parseArticleList2'),
        )

        for src_channel, sub_channel, list_url, css_urls, parser in list_pages:
            self.logDao.warn(u'进行抓取列表:' + list_url)
            request_meta = {
                'request_type': 'fenghuang_page_list',
                'url': list_url,
                'src_channel': src_channel,
                'sub_channel': sub_channel,
                'styleUrlDefault': css_urls,
            }
            yield scrapy.Request(url=list_url,
                                 meta=request_meta,
                                 callback=getattr(self, parser),
                                 dont_filter=True)