Example 1
    def crawl(self):
        cid = self.key
        channel = CHANNELS[int(cid)]
        page = 1
        pagesize = 30

        while True:
            try:
                data = api_shows(cid, page, pagesize)
                if data is not None:
                    page += 1
                else:
                    return
            except Exception:
                self.logger.warning(get_exception_info())
                continue

            if not data.get('results'):
                break
            for item in data['results']:
                try:
                    show_id = item['tid']
                    reset = (item['completed'] == 0)
                    album_data = {  # renamed from `data` to avoid shadowing the page payload
                        'channel': channel,
                        # prefer the vertical thumbnail; fall back to the horizontal one
                        'image': item.get('show_vthumburl_hd') or item.get('show_thumburl_hd'),
                        'image2': item.get('show_thumburl_hd'),
                    }
                    Scheduler.schedule(
                        AlbumCrawler.type, key=show_id, data=album_data, reset=reset)
                except Exception:
                    self.logger.warning(get_exception_info())
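
A note on the retry pattern in Example 1: when api_shows raises, the handler logs and continues without changing page, so a persistently failing page is re-requested forever. A bounded-retry helper is one way to cap that; the sketch below is illustrative and not part of the original code:

    def call_with_retries(func, args, attempts=3):
        # Try the call up to `attempts` times, re-raising the last failure
        # so the caller can decide whether to skip the page or abort.
        for i in range(attempts):
            try:
                return func(*args)
            except Exception:
                if i == attempts - 1:
                    raise

    # e.g. data = call_with_retries(api_shows, (cid, page, pagesize))
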
Example 2
    def crawl(self):
        catecode = self.data["catecode"]
        last_updated = self.data.get("updated", datetime.min)
        current_updated = datetime.max
        max_time = last_updated

        page = 1
        pagesize = 20
        while True:
            try:
                data = api_albums(catecode, page, pagesize)
                for item in data["videos"]:
                    try:
                        sid = item.get('sid')
                        detail = api_album(sid) if sid else None
                        model = self.extract_model(item, detail)

                        if sid:
                            videos = self.get_videos(sid)
                            if model['channel'] in [u'综艺']:
                                # zongyi (variety shows) are listed newest-first; restore chronological order
                                videos = list(reversed(videos))
                        else:
                            video = VideoItemModel({
                                "title": model["title"],
                                "image": model["image"],
                                "description": model["description"],
                                "time": model["time"],
                                "price": model["price"],
                                "duration": model["duration"],
                                "url": model["url"],
                            })
                            videos = [video]

                        model['videos'] = videos
                        export(model)

                        current_updated = model["time"]
                        if max_time < current_updated:
                            max_time = current_updated
                    except Exception:
                        self.logger.warning(get_exception_info())

                if current_updated < last_updated:
                    break
                if page * pagesize >= data["count"]:
                    break
            except Exception:
                self.logger.warning(get_exception_info())
            page += 1

        self.data["updated"] = max_time
Example 3
    def crawl(self):
        for spec in TOP_SPEC:
            video_set = set()
            videos = []
            count = TOP_COUNT / len(spec['sources'])  # per-source quota (integer division in Python 2)

            for src, param in spec['sources']:
                func = getattr(self, "crawl_%s" % src, None)  # None when no crawler exists for this source
                if func:
                    try:
                        titles = func(param, count)
                        for title in titles:
                            if title not in video_set:
                                video_set.add(title)
                                videos.append(title)
                    except Exception:
                        self.logger.warning(get_exception_info())

            rank = VideoRankModel({
                "source": self.data['source'],
                "title": spec["title"],
                "type": spec["type"],
                "videos": videos,
            })

            export(rank)
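
Example 3 merges several per-source top lists into one ranking, deduplicating by title while preserving order. A plausible shape for TOP_SPEC, inferred from the attribute access above (values hypothetical; each source name needs a matching crawl_<source> method):

    TOP_COUNT = 30
    TOP_SPEC = [
        {
            "title": "Top Movies",
            "type": "movie",
            # (source name, source-specific parameter) pairs
            "sources": [("youku", "movie"), ("sohu", "movie")],
        },
    ]
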
Example 4
def _service_worker(category):
    global _RUNNING_CRAWLER

    signal.signal(signal.SIGTERM, _worker_term_handler)
    start_time = time.time()

    while time.time() <= start_time + CrawlerConf._CRAWLER_PROCESS_MAX_TIME:

        item = Scheduler.fetch(category)

        _RUNNING_CRAWLER = item

        timer = Timer(item.get('timeout', CrawlerConf._CRAWLER_TIMEOUT_DEFAULT), _worker_timeout_handler)
        timer.start()

        c = Crawler.create(item["type"], item["key"], item["data"])
        if c:
            try:
                c.crawl()
                success = True
                logger.info("CRAWL SUCCEED %s" % c)
            except Exception:
                msg = get_exception_info()
                success = False
                logger.error("CRAWL FAILED %s, %s" % (c, msg))
        else:
            logger.warning("CRAWL CREATE FAILED %s" % item)
            success = False

        timer.cancel()
        _RUNNING_CRAWLER = None
        Scheduler.finish(item['type'], item['key'], c.data if c else {}, success)
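
Example 4 is the worker loop: it fetches a task, arms a watchdog timer, runs the crawler, and reports the result. A minimal in-memory sketch of the Scheduler contract it relies on (hypothetical; the real implementation is presumably backed by a persistent queue):

    import Queue  # Python 2; `queue` in Python 3

    class Scheduler(object):
        _queue = Queue.Queue()
        _data = {}

        @classmethod
        def schedule(cls, type, key, data=None, reset=False):
            cls._queue.put({"type": type, "key": key, "data": data or {}})

        @classmethod
        def fetch(cls, category):
            # `category` is ignored in this sketch
            return cls._queue.get()  # blocks until a task is available

        @classmethod
        def get_data(cls, type, key):
            return cls._data.get((type, key))

        @classmethod
        def finish(cls, type, key, data, success):
            cls._data[(type, key)] = data
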
Example 5
    def get_albums(self, cid):
        cid = "%s,0~0~0" % cid

        params = dict(PARAMS_PROTOTYPE)
        params.update(PARAMS_LIST)
        params['category_id'] = cid

        while True:
            try:
                ret = call_api("getViewObject", params)
                if not ret.albumIdList:
                    break
                # list[0] is ordered by time, list[1] is recommendation
                ids = ret.albumIdList[0]['idlist']
                if not ids:
                    break
                for album_id in ids:
                    album = ret.albumArray.get(int(album_id))
                    if album:
                        yield album
            except GeneratorExit:
                return
            except Exception:
                self.logger.warning(get_exception_info())
                continue
            params['pn'] += 1
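
Because Example 5 is a generator, it pages lazily, and a consumer can stop early; the explicit GeneratorExit handler makes that shutdown clean. A usage sketch (consumer names assumed):

    from itertools import islice

    # `crawler` is whatever object defines get_albums above
    for album in islice(crawler.get_albums(1), 10):  # stop after ten albums
        process(album)  # hypothetical consumer
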
Example 6
    def crawl(self):
        channel = self.key
        for year in range(self.data["year"], datetime.utcnow().year + 1):
            for site in SITES:
                try:
                    items = api_history(year, channel, site)
                except Exception:
                    self.logger.warning(get_exception_info())
                    continue

                for item in items:
                    try:
                        self.process_album(item)
                    except Exception:
                        self.logger.warning(get_exception_info())

        self.data["year"] = datetime.utcnow().year
Example 7
    def handle(self, *args, **options):
        if len(args) < 1:
            self.hint()
            return

        command = args[0]
        _args = []
        for arg in args[1:]:
            if arg.isdigit():
                arg = int(arg)  # coerce purely numeric strings to int
            _args.append(arg)

        func = getattr(scripts, command, None)  # None when the script name is unknown
        if not callable(func):
            self.hint()
        else:
            try:
                func(*_args)
            except SystemExit:
                print 'process exit.'
            except Exception:
                print get_exception_info()
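
Example 7 coerces purely numeric arguments to int before dispatching to a function in the scripts module. Assuming this is a Django management command named script and scripts.py defines rebuild_index (both names hypothetical), an invocation would look like:

    # python manage.py script rebuild_index 7 now
    # "7".isdigit() is True, so the handler calls scripts.rebuild_index(7, 'now')
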
Example 8
    def crawl_video(self, show_id):
        videos = []
        page = 1
        pagesize = 30
        while True:
            data = api_videos(show_id, page, pagesize)
            if not data['results']:
                break

            for item in data['results']:
                try:
                    video = VideoItemModel({
                        "title": item['title'],
                        "source_id": item['videoid'],
                        "url": "http://v.youku.com/v_show/id_%s.html" % item['videoid'],
                    })

                    jsurl = "javascript: getUrl('youku', '%s')" % item["videoid"]
                    video["stream"] = [{"url": jsurl}]

                    # TODO:
                    # ret = api_plays(item['videoid'])
                    # results = ret.get('results', {})
                    # for key, fmt in FORMATS_NORMAL:
                    #     if results.get(key):
                    #         video["stream_low"] = self.extract_stream(results[key], fmt)
                    #         break
                    # for key, fmt in FORMATS_HIGH:
                    #     if results.get(key):
                    #         video["stream"] = self.extract_stream(results[key], fmt)
                    #         break
                    # for key, fmt in FORMATS_HD:
                    #     if results.get(key):
                    #         video["stream_high"] = self.extract_stream(results[key], fmt)
                    #         break

                    videos.append(video)
                except Exception:
                    self.logger.warning(get_exception_info())

            if pagesize * page >= data['total']:
                break
            page += 1
        return videos
Example 9
    def get_video(self, album_id, video_id):
        album_id = str(album_id)
        video_id = str(video_id)

        params = dict(PARAMS_PROTOTYPE)
        params.update(PARAMS_ALBUM)

        try:
            ret = call_api("getAlbum", params,
                           [album_id, None, None, video_id, None, '1', '0'])
            if not ret.tv[0]._id or ret.tv[0]._id != video_id:
                ret = call_api("getAlbum", params,
                               [video_id, None, None, None, '1', '0'])
        except Exception:
            self.logger.warning(get_exception_info())
            return

        for i in range(ret.tv['count']):
            if ret.tv[i]._id == video_id:
                return extract_video(ret.tv[i])
        raise Exception("No video found for video_id = %s" % video_id)
Example 10
    def crawl(self):
        cid = self.data['cid']
        current_time = int(time.time())

        for album_data in self.get_albums(cid):
            try:
                album = extract_album(album_data, self.data['source'])
                if not album:
                    continue
                checkup_time = time.mktime(album['time'].timetuple())

                # can't get videos for paid items
                if (not album["price"]) and album.get('source_id'):
                    Scheduler.schedule(
                        type=AlbumCrawler.type,
                        key=album['source_id'],
                        data={"time": album["time"]},
                        reset=(current_time - checkup_time) < 86400)  # re-crawl if updated within 24h
            except Exception:
                self.logger.warning(get_exception_info())

        self.data['updated'] = current_time
Example 11
    def crawl(self):
        min_time = self.data['updated']  # update time of the newest video seen last run; lower bound for this crawl
        max_time = None  # update time of the newest video seen in this crawl

        page = 1
        while True:
            url = "http://bdzy.cc/list/?0-%s.html" % page
            hxs = load_html(url)  # fetch the page HTML and return an HtmlXPathSelector

            time = None
            for s in hxs.select("//body/.//tr[@class='row']"):  # walk the rows via XPath
                try:
                    href = s.select("td[1]/a/@href").extract()[0]
                    source_id = re.findall(r"(\d+)\.html", href)[0]  # source-site ID
                    title = clean_title(s.select("td[1]/.//text()").extract()[0])
                    region = s.select("td[2]/.//text()").extract()[0].replace(u"地区", u"")
                    category = s.select("td[3]/.//text()").extract()[0]
                    time = s.select("td[4]/.//text()").extract()[0]
                    time = datetime.strptime(time, "%Y-%m-%d")

                    if not max_time:  # the first row is the most recently updated
                        max_time = time
                    if time < min_time:  # reached data already crawled last run
                        break

                    data = {  # extra data for the detail-page crawler task
                        "title": title,
                        "time": time,
                        "category": category,
                        "region": region,
                    }

                    # Fetch the detail-page crawler's stored data; the time field tells
                    # whether the content has been updated and must be re-crawled.
                    # Empty if the task is being created for the first time.
                    lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                    lasttime = lastdata.get("time", datetime.min) if lastdata else datetime.min

                    # Create the album crawler for the detail page; the key is the source-site ID.
                    Scheduler.schedule(
                        AlbumCrawler.type,
                        source_id,
                        data,
                        reset=data["time"] > lasttime)  # force a re-crawl when updated
                except Exception:
                    self.logger.warning(get_exception_info())  # log the error and continue
                    continue

            if time and time < min_time:  # reached data already crawled last run
                break

            # total page count
            text = hxs.select("//div[@class='pages']/span/text()").extract()[0]
            page_count = int(re.findall(u"\d+/(\d+)页", text)[0])

            # past the last page
            if page >= page_count:
                break
            page += 1

        if max_time:
            self.data = {'updated': max_time}  # persist the newest time seen in this crawl
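
Example 11 leans on a load_html helper that returns an HtmlXPathSelector. A plausible implementation, assuming the old Scrapy 0.x selector API that the .select()/.extract() calls above imply (hypothetical; the real project may differ):

    import requests
    from scrapy.selector import HtmlXPathSelector  # Scrapy 0.x API

    def load_html(url):
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        return HtmlXPathSelector(text=resp.text)
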