コード例 #1
0
    def __download_videos(self, descrs):
        """Download and store every video listed in a channel description.

        The owner channel id is read from
        ``descrs[Tab.HomePage][0]['owner_channel']['id']`` and the videos
        are taken from ``descrs[Tab.Videos]``; each entry must provide an
        ``'id'`` key.

        For every video:
          * it is skipped when ``check_exist_video`` on the cache is truthy;
          * otherwise it is loaded through ``scrappy_decorator``; on failure
            the id is passed to ``update_failed_video`` and a warning is
            logged;
          * the loaded description is turned into a record by
            ``__create_video`` and handed to ``insert_video_descr``; an
            insertion failure is logged and the loop moves on.
        """
        owner_id = descrs[Tab.HomePage][0]['owner_channel']['id']
        for short_descr in descrs[Tab.Videos]:
            vid = short_descr['id']

            # Nothing to do for videos the cache already knows about.
            if self.__cache.check_exist_video(vid):
                logging.info("such video already exist (video_id=%s)" % vid)
                continue

            # Fetch the full description (with retries).
            try:
                loaded = self.scrappy_decorator(self.__video_downloader.load,
                                                vid)
            except Exception as load_err:
                self.__cache.update_failed_video(vid)
                logging.warning(utils.CrawlerError(
                    e=load_err,
                    msg="problem with video downloading (video_id=%s)" % vid))
                continue

            # Build the record outside the try: only the insert is guarded.
            record = self.__create_video(vid, owner_id, loaded, short_descr)
            try:
                self.__cache.insert_video_descr(record)
            except Exception as insert_err:
                logging.warning(utils.CrawlerError(
                    e=insert_err,
                    msg="problem with video inserting into db (video_id=%s)"
                    % vid))
                logging.error(insert_err)
コード例 #2
0
    def process(self, channel_ids=None):
        """Crawl channels until the cache has no more channel ids to offer.

        Args:
            channel_ids: optional list of channel ids handed to
                ``__set_base_videos`` before crawling starts. ``None`` is
                treated as an empty list.

        Raises:
            utils.CrawlerError: if ``channel_ids`` is neither ``None`` nor a
                list.

        The loop asks the cache for a channel id via
        ``get_best_channel_id`` and stops when that returns ``None``.
        """
        ids = [] if channel_ids is None else channel_ids
        if not isinstance(ids, list):
            raise utils.CrawlerError("channel_ids is not list")
        logging.info("setting channel ids from arguments into cache")

        self.__set_base_videos(ids)

        next_id = self.__cache.get_best_channel_id()
        while next_id is not None:
            full_descr, is_scrappy = self.__scrappy(next_id)
            if is_scrappy:
                self.__set_neighb_channels(full_descr)

                # TODO: move to scrapper
                self.__download_videos(full_descr)

                # The result is irrelevant here: the next channel is
                # fetched from the cache whether or not the update worked.
                self.__update_channel_downloaded(next_id)

            next_id = self.__cache.get_best_channel_id()
コード例 #3
0
 def __set_base_videos(self, channel_ids):
     """Register the given channel ids in the cache as base channels.

     Delegates to ``set_base_channels`` on the cache. Any exception is
     logged with ``logging.exception`` (wrapped in ``utils.CrawlerError``)
     and is not re-raised.

     Args:
         channel_ids: iterable of channel ids; the visible caller passes a
             list.
     """
     try:
         self.__cache.set_base_channels(channel_ids)
     except Exception as e:
         # Build the message only when it is needed, and stringify each id:
         # formatting it up front inside the try meant that a non-string id
         # raised TypeError before the cache was ever called and the error
         # was then logged with msg=None.
         ch_ids_str = ','.join(str(ch_id) for ch_id in channel_ids)
         msg = "set base channels was failed (channel_ids=%s)" % ch_ids_str
         logging.exception(utils.CrawlerError(e=e, msg=msg))
コード例 #4
0
 def __update_channel_downloaded(self, channel_id):
     """Tell the cache that ``channel_id`` has been downloaded.

     Returns:
         ``True`` when ``update_channel_downloaded`` on the cache completed
         without raising, ``False`` otherwise. A failure is logged as an
         error and never propagated.
     """
     try:
         self.__cache.update_channel_downloaded(channel_id)
     except Exception as cause:
         details = self.__crash_msg % ("channel_id", channel_id)
         logging.error(utils.CrawlerError(
             e=cause, msg="problem with update channel_id. " + details))
         return False
     else:
         return True
コード例 #5
0
 def scrappy_decorator(self, fn, *args, **kwargs):
     """Call ``fn(*args, **kwargs)``, retrying when it raises.

     The call is attempted while the number of failures is below
     ``self.__max_attempts``. Every failure is logged as a warning together
     with the zero-based attempt number.

     Args:
         fn: callable to invoke.
         *args: positional arguments forwarded to ``fn``.
         **kwargs: keyword arguments forwarded to ``fn``.

     Returns:
         Whatever ``fn`` returns on its first successful call.

     Raises:
         Exception: the error raised by the last failed attempt, once all
             attempts are used up.
         RuntimeError: if no attempt was made at all because the attempt
             limit is not positive.
     """
     count = 0
     last_error = None
     while count < self.__max_attempts:
         try:
             return fn(*args, **kwargs)
         except Exception as err:
             # Python 3 unbinds the ``as`` target when the handler exits,
             # so keep a separate reference for the final re-raise.
             last_error = err
             logging.warning(
                 utils.CrawlerError(e=err,
                                    msg="problem into scrapper. retry: %d" %
                                    count))
             count += 1
     if last_error is None:
         # The loop body never ran, so there is no real error to re-raise.
         raise RuntimeError(
             "scrappy_decorator made no attempts: max attempts must be "
             "positive")
     raise last_error
コード例 #6
0
 def __set_neighb_channels(self, full_descr):
     """Store the neighbour channels found in ``full_descr`` in the cache.

     The channels are obtained from ``__get_neighb_channels(full_descr)``
     and passed to ``set_channels`` on the cache with ``scrapped=False``
     and ``valid=True``. Any exception is logged as an error (wrapped in
     ``utils.CrawlerError``) and is not re-raised.

     Args:
         full_descr: channel description forwarded to
             ``__get_neighb_channels``.
     """
     neighb_channels = None
     try:
         neighb_channels = self.__get_neighb_channels(full_descr)
         self.__cache.set_channels(neighb_channels,
                                   scrapped=False,
                                   valid=True)
     except Exception as e:
         # neighb_channels is still None when the extraction itself failed;
         # iterating it here used to raise TypeError from inside the
         # handler, so the original error was never logged.
         ch_ids_str = ','.join(
             str(ch['id']) for ch in neighb_channels or [])
         e = utils.CrawlerError(e=e,
                                msg=self.__crash_msg %
                                ("channel_ids", ch_ids_str))
         logging.error(e)