Beispiel #1
0
    def parse(self, response):
        """Yield one ``VideoItem`` per record in the JSON body of ``response``.

        The decoded body is iterated as a sequence of records. The snapshot
        date is read from ``response.meta['date']``, whose first eight
        characters are interpreted as ``YYYYMMDD``. Records whose author is
        '腾讯动漫' or '哔哩哔哩番剧' are skipped. ``coin``, ``share`` and
        ``like`` are always ``None`` because this payload is not read for
        them.

        For every remaining record the stored document with the same ``aid``
        is looked up in ``self.coll``. Its ``subChannel`` / ``channel``
        values, when present, are copied onto the item, and ``item['data']``
        is set to ``None`` when the stored ``data`` list already contains an
        entry for the snapshot date.

        Nothing is yielded for a 404 response. Any exception raised while
        parsing is logged and ends the generator; it is not re-raised.
        """
        # Local import: this block did not previously depend on ``logging``.
        import logging

        try:
            if response.status == 404:
                return
            records = json.loads(response.body)

            # The snapshot date comes from the request, not from the payload,
            # so it is identical for every record and is computed only once.
            raw_date = response.meta['date']
            date_str = '{}-{}-{}'.format(
                raw_date[:4], raw_date[4:6], raw_date[6:8])
            current_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")

            for each in records:
                author = each['name']
                # Skip these two authors before doing any further work.
                if author in ('腾讯动漫', '哔哩哔哩番剧'):
                    continue

                aid = each['aid']
                view = each['playTotal']
                favorite = each['favoritesTotal']
                danmaku = each['danmakuTotal']

                # The last five characters of 'created' are dropped before
                # parsing - presumably a fractional-second / zone suffix;
                # TODO confirm against the upstream API.
                created = datetime.datetime.strptime(
                    each['created'][0:-5], '%Y-%m-%dT%H:%M:%S')

                item = VideoItem()
                item['current_view'] = view
                item['current_favorite'] = favorite
                item['current_danmaku'] = danmaku
                item['current_coin'] = None
                item['current_share'] = None
                item['current_like'] = None
                item['current_datetime'] = current_date
                item['aid'] = aid
                item['mid'] = each['mid']
                item['pic'] = 'http:' + each['pic']
                item['author'] = author
                item['data'] = {
                    'view': view,
                    'favorite': favorite,
                    'danmaku': danmaku,
                    'coin': None,
                    'share': None,
                    'like': None,
                    'datetime': current_date,
                }
                item['title'] = each['title']
                item['subChannel'] = None
                item['datetime'] = created

                # A single lookup is enough; the result is reused below.
                stored = self.coll.find_one({'aid': aid})
                if stored is not None and 'data' in stored:
                    if 'subChannel' in stored:
                        item['subChannel'] = stored['subChannel']
                    if 'channel' in stored:
                        item['channel'] = stored['channel']
                    # Drop the data point when this date was already recorded.
                    if any(point['datetime'].strftime("%Y-%m-%d") == date_str
                           for point in stored['data']):
                        item['data'] = None

                yield item

        except Exception as error:
            # Still best-effort (nothing is re-raised), but no longer silent.
            logging.error("Failed to parse video response %s: %r",
                          getattr(response, 'url', None), error)
    def parse(self, response):
        """Yield a ``VideoItem`` for each record not yet stored for its date.

        The decoded JSON body is iterated as a sequence of records. The
        snapshot date is read from ``response.meta['date']``, whose first
        eight characters are interpreted as ``YYYYMMDD``. Records whose
        author is '腾讯动漫' or '哔哩哔哩番剧' are skipped. ``coin``,
        ``share`` and ``like`` are always ``None`` because this payload is
        not read for them.

        For every remaining record the stored document with the same ``aid``
        is looked up in ``self.coll``. Its ``subChannel`` / ``channel``
        values, when present, are copied onto the item. The item is yielded
        only when the stored ``data`` list has no entry for the snapshot
        date.

        Nothing is yielded for a 404 response. Any exception raised while
        parsing is reported through ``mailer.send`` and ``logging.error`` and
        ends the generator; it is not re-raised.
        """
        # Bound before the try block so the error handler can always format
        # it, even when the failure happens before the first item is built.
        item = None
        try:
            if response.status == 404:
                return
            records = json.loads(response.body)

            # The snapshot date comes from the request, not from the payload,
            # so it is identical for every record and is computed only once.
            raw_date = response.meta['date']
            date_str = '{}-{}-{}'.format(
                raw_date[:4], raw_date[4:6], raw_date[6:8])
            current_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")

            for each in records:
                author = each['name']
                # Skip these two authors before doing any further work.
                if author in ('腾讯动漫', '哔哩哔哩番剧'):
                    continue

                aid = each['aid']
                view = each['playTotal']
                favorite = each['favoritesTotal']
                danmaku = each['danmakuTotal']

                item = VideoItem()
                item['current_view'] = view
                item['current_favorite'] = favorite
                item['current_danmaku'] = danmaku
                item['current_coin'] = None
                item['current_share'] = None
                item['current_like'] = None
                item['current_datetime'] = current_date
                item['aid'] = aid
                item['mid'] = each['mid']
                item['pic'] = 'http:' + each['pic']
                item['author'] = author
                item['data'] = {
                    'view': view,
                    'favorite': favorite,
                    'danmaku': danmaku,
                    'coin': None,
                    'share': None,
                    'like': None,
                    'datetime': current_date,
                }
                item['title'] = each['title']
                item['subChannel'] = None
                # The raw 'created' value is stored as-is in this variant.
                item['datetime'] = each['created']

                # A single lookup is enough; the result is reused below.
                stored = self.coll.find_one({'aid': aid})
                already_recorded = False
                if stored is not None and 'data' in stored:
                    if 'subChannel' in stored:
                        item['subChannel'] = stored['subChannel']
                    if 'channel' in stored:
                        item['channel'] = stored['channel']
                    already_recorded = any(
                        point['datetime'].strftime("%Y-%m-%d") == date_str
                        for point in stored['data'])
                if not already_recorded:
                    yield item

        except Exception as error:
            # Report the failure by mail and in the log.
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(item)
            logging.error(response.url)
            logging.error(error)
  def parse(self, response):
    """Yield one ``VideoItem`` per entry of the ``data`` object in the body.

    ``self.task.crawl_count`` is incremented for every call. The decoded
    JSON body must contain a ``data`` mapping; each of its values supplies
    the ``stat`` and ``owner`` sub-objects plus ``tname``, ``title``,
    ``pubdate``, ``tid`` and ``pic``. The snapshot timestamp is
    ``datetime.utcnow()`` shifted by eight hours.

    ``item['channel']`` is resolved as follows: for the '资讯' sub-channel
    the ``tid`` decides (51, 170, 159); otherwise, or when the ``tid`` is
    none of those, a non-empty sub-channel is looked up in
    ``sub_channel_2_channel``; an empty sub-channel gives ``None``.

    ``item['object_id']`` is the second '&'-separated part of
    ``response.url`` when the URL splits into exactly two parts, else
    ``None``.

    On any exception ``self.task.crawl_failed`` is incremented. If the
    decoded body carries ``code == -404`` the failure is otherwise ignored;
    any other failure is logged. The exception is never re-raised.
    """
    # tid -> channel for the '资讯' sub-channel, which exists under several
    # channels and therefore cannot be resolved by name alone.
    news_channel_by_tid = {51: '番剧', 170: '国创', 159: '娱乐'}

    # Bound before the try block so the error handler can inspect them even
    # when decoding fails or the very first entry is malformed.
    r = None
    item = None
    try:
      self.task.crawl_count += 1
      r = json.loads(response.body)
      d = r["data"]

      # The request URL is the same for every entry, so split it only once.
      url_list = response.url.split('&')
      object_id = url_list[1] if len(url_list) == 2 else None

      for video in d.values():
        stat = video['stat']
        owner = video['owner']

        view = stat['view']
        favorite = stat['favorite']
        danmaku = stat['danmaku']
        coin = stat['coin']
        share = stat['share']
        like = stat['like']
        current_date = datetime.utcnow() + timedelta(hours=8)

        subChannel = video['tname']
        tid = video['tid']

        item = VideoItem()
        item['current_view'] = view
        item['current_favorite'] = favorite
        item['current_danmaku'] = danmaku
        item['current_coin'] = coin
        item['current_share'] = share
        item['current_like'] = like
        item['current_datetime'] = current_date
        item['aid'] = stat['aid']
        item['mid'] = owner['mid']
        item['pic'] = video['pic']
        item['author'] = owner['name']
        item['data'] = {
            'view': view,
            'favorite': favorite,
            'danmaku': danmaku,
            'coin': coin,
            'share': share,
            'like': like,
            'datetime': current_date,
        }
        item['title'] = video['title']
        item['subChannel'] = subChannel
        item['datetime'] = video['pubdate']

        # The tid-based rule must run first: '资讯' is non-empty, so testing
        # `subChannel != ''` first would never let this rule apply.
        channel = None
        if subChannel == '资讯':
          channel = news_channel_by_tid.get(tid)
        if channel is None and subChannel != '':
          channel = sub_channel_2_channel[subChannel]
        item['channel'] = channel

        item['object_id'] = object_id
        yield item

    except Exception as error:
      # Count the failure; log it unless the API reported code -404.
      self.task.crawl_failed += 1
      if isinstance(r, dict) and r.get('code') == -404:
        return

      logging.error("视频爬虫在解析时发生错误")
      logging.error(item)
      logging.error(response.url)
      logging.error(error)
    def parse(self, response):
        """Yield one ``VideoItem`` per entry of the ``data`` object in the body.

        The decoded JSON body must contain a ``data`` mapping; each of its
        values supplies the ``stat`` and ``owner`` sub-objects plus
        ``tname``, ``title``, ``pubdate``, ``tid`` and ``pic``. The snapshot
        timestamp is ``datetime.now()``.

        ``item['channel']`` is resolved as follows: for the '资讯'
        sub-channel the ``tid`` decides (51, 170, 159); otherwise, or when
        the ``tid`` is none of those, a non-empty sub-channel is looked up in
        ``sub_channel_2_channel`` with '未知' as the fallback for unknown
        names; an empty sub-channel gives ``None``.

        If an exception occurs and the decoded body carries ``code == -404``
        the failure is ignored. Any other failure is reported through
        ``mailer.send`` and ``logging.error``. The exception is never
        re-raised.
        """
        # tid -> channel for the '资讯' sub-channel, which exists under
        # several channels and therefore cannot be resolved by name alone.
        news_channel_by_tid = {51: '番剧', 170: '国创', 159: '娱乐'}

        # Bound before the try block so the error handler can inspect them
        # even when decoding fails or the very first entry is malformed.
        r = None
        item = None
        try:
            r = json.loads(response.body)
            d = r["data"]

            for video in d.values():
                stat = video['stat']
                owner = video['owner']

                view = stat['view']
                favorite = stat['favorite']
                danmaku = stat['danmaku']
                coin = stat['coin']
                share = stat['share']
                like = stat['like']
                current_date = datetime.now()

                subChannel = video['tname']
                tid = video['tid']

                item = VideoItem()
                item['current_view'] = view
                item['current_favorite'] = favorite
                item['current_danmaku'] = danmaku
                item['current_coin'] = coin
                item['current_share'] = share
                item['current_like'] = like
                item['current_datetime'] = current_date
                item['aid'] = stat['aid']
                item['mid'] = owner['mid']
                item['pic'] = video['pic']
                item['author'] = owner['name']
                item['data'] = {
                    'view': view,
                    'favorite': favorite,
                    'danmaku': danmaku,
                    'coin': coin,
                    'share': share,
                    'like': like,
                    'datetime': current_date,
                }
                item['title'] = video['title']
                item['subChannel'] = subChannel
                item['datetime'] = video['pubdate']

                # The tid-based rule must run first: '资讯' is non-empty, so
                # testing `subChannel != ''` first would never let it apply.
                channel = None
                if subChannel == '资讯':
                    channel = news_channel_by_tid.get(tid)
                if channel is None and subChannel != '':
                    if subChannel in sub_channel_2_channel:
                        channel = sub_channel_2_channel[subChannel]
                    else:
                        channel = '未知'
                item['channel'] = channel
                yield item

        except Exception as error:
            # Ignore failures for code -404; report everything else.
            if isinstance(r, dict) and r.get('code') == -404:
                return
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(item)
            logging.error(response.url)
            logging.error(error)