def crawl(self):
    """Crawl one album via the "getAlbum" API and export its videos.

    Albums may be split into several "blocks"; every block after the
    first requires a follow-up API call.
    """
    album_id = self.key
    params = dict(PARAMS_PROTOTYPE)
    params.update(PARAMS_ALBUM)
    ret = call_api("getAlbum", params, [album_id, None, None, album_id, None, '1', '0'])
    # Defensive check: bail out if the API answered for a different album.
    if ret._id != album_id:
        return
    model = extract_album(ret, self.data['source'])
    videos = []
    if ret.tv['block']:
        # Snapshot the block list: `ret` is rebound inside the loop.
        block = ret.tv['block']
        for block_index in xrange(len(ret.tv['block'])):
            if block_index != 0:
                block_now = block[block_index]
                # NOTE(review): rebinds `ret`, so the `if not ret.tv['block']`
                # check below sees the LAST response, not the first one.
                ret = call_api(
                    "getAlbum", params,
                    [album_id, None, None, None, '1', '1', block_now])
            # Block 0 uses the initial response; later blocks use their own.
            videos.extend(self.extract_videos(album_id, ret.tv['other']))
    if not ret.tv['block']:
        # Album without blocks: videos come from the initial response.
        videos.extend(self.extract_videos(album_id, ret.tv['other']))
    model['videos'] = videos
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Build and export a deduplicated top-video rank per TOP_SPEC entry.

    Each spec aggregates several sources; TOP_COUNT is split evenly
    between them.  Titles are deduplicated preserving first-seen order.
    """
    for spec in TOP_SPEC:
        video_set = set()
        videos = []
        count = TOP_COUNT / len(spec['sources'])
        for src, param in spec['sources']:
            # BUGFIX: getattr() without a default raises AttributeError for
            # an unknown source, which made the `if func` guard unreachable.
            func = getattr(self, "crawl_%s" % src, None)
            if func:
                try:
                    titles = func(param, count)
                    for title in titles:
                        if title not in video_set:
                            video_set.add(title)
                            videos.append(title)
                except:
                    # Best effort: one failing source must not kill the rank.
                    self.logger.warning(get_exception_info())
        rank = VideoRankModel({
            "source": self.data['source'],
            "title": spec["title"],
            "type": spec["type"],
            "videos": videos,
        })
        export(rank)
def crawl(self):
    """Fetch one album's details from the API and export a source model.

    Bails out early when no album id (and therefore no detail) exists.
    """
    album_id = self.key
    channel = self.data["channel"]
    detail = api_album(album_id) if album_id else None
    # BUGFIX: with a falsy album id, `detail` stayed None and the field
    # accesses below raised TypeError; return early instead.
    if detail is None:
        return
    title = detail["tv_name"]
    directors = detail["director"].split(";")
    actors = detail["actor"].split(";")
    region = detail["area"]
    categories = detail["tv_cont_cats"].split(";")
    ver_image = detail["ver_high_pic"]
    hor_image = detail["hor_high_pic"]
    url = detail["s_url"]
    description = detail["tv_desc"]
    # Data model for the exported video source.
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": album_id,
        "title": title,
        "url": url,
        "directors": directors,
        "actors": actors,
        "region": region,
        "categories": categories,
        "channel": channel,
        "description": description,
        "image": ver_image,
        "image2": hor_image,
    })
    # Export the assembled data.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
    return
def crawl(self):
    """Read album details from the API and export a VideoSourceModel."""
    album_id = self.key
    channel = self.data["channel"]
    detail = api_album(album_id) if album_id else None
    # Map the raw API fields straight onto the export model.
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": album_id,
        "title": detail["tv_name"],
        "url": detail["s_url"],
        "directors": detail["director"].split(";"),
        "actors": detail["actor"].split(";"),
        "region": detail["area"],
        "categories": detail["tv_cont_cats"].split(";"),
        "channel": channel,
        "description": detail["tv_desc"],
        "image": detail["ver_high_pic"],      # vertical poster
        "image2": detail["hor_high_pic"],     # horizontal poster
    })
    # Export the assembled data.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
    return
def crawl(self):
    """Incrementally page through one category and export each album.

    Stops when items older than the previous crawl are reached or the
    page range is exhausted; saves the newest item time for next run.
    """
    catecode = self.data["catecode"]
    # Timestamp of the previous crawl; datetime.min on the first run.
    last_updated = self.data.get("updated", datetime.min)
    current_updated = datetime.max
    max_time = last_updated
    page = 1
    pagesize = 20
    while True:
        try:
            data = api_albums(catecode, page, pagesize)
            for item in data["videos"]:
                try:
                    sid = item.get('sid')
                    detail = api_album(sid) if sid else None
                    model = self.extract_model(item, detail)
                    if sid:
                        videos = self.get_videos(sid)
                        if model['channel'] in [ u'综艺' ]:
                            #reverse order for zongyi (listed newest-first)
                            videos = [video for video in reversed(videos)]
                    else:
                        # No album id: wrap the item itself as one video.
                        video = VideoItemModel({
                            "title": model["title"],
                            "image": model["image"],
                            "description": model["description"],
                            "time": model["time"],
                            "price": model["price"],
                            "duration": model["duration"],
                            "url": model["url"]
                        })
                        videos = [video]
                    model['videos'] = videos
                    export(model)
                    current_updated = model["time"]
                    if max_time < current_updated:
                        max_time = current_updated
                except:
                    # One bad item must not abort the whole page.
                    self.logger.warning(get_exception_info())
            if current_updated < last_updated:
                # Reached items older than the previous crawl; stop.
                break
            if page * pagesize >= data["count"]:
                break
        except:
            self.logger.warning(get_exception_info())
        page += 1
    # Persist the newest item time for the next incremental run.
    self.data["updated"] = max_time
def crawl(self):
    """Fetch an album (and all of its extra blocks) and export the result."""
    album_id = self.key
    params = dict(PARAMS_PROTOTYPE)
    params.update(PARAMS_ALBUM)
    ret = call_api("getAlbum", params,
                   [album_id, None, None, album_id, None, '1', '0'])
    if ret._id != album_id:
        # API answered for a different album; nothing to export.
        return
    model = extract_album(ret, self.data['source'])
    videos = []
    blocks = ret.tv['block']
    if blocks:
        for index, current_block in enumerate(blocks):
            if index != 0:
                # Every block after the first needs its own API call.
                ret = call_api("getAlbum", params,
                               [album_id, None, None, None, '1', '1',
                                current_block])
            videos.extend(self.extract_videos(album_id, ret.tv['other']))
    if not ret.tv['block']:
        # Block-less album: the latest response already holds the videos.
        videos.extend(self.extract_videos(album_id, ret.tv['other']))
    model['videos'] = videos
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Aggregate top titles from every source of each TOP_SPEC and export.

    Deduplicates titles while preserving first-seen order; each source
    contributes an equal share of TOP_COUNT.
    """
    for spec in TOP_SPEC:
        video_set = set()
        videos = []
        count = TOP_COUNT / len(spec['sources'])
        for src, param in spec['sources']:
            # BUGFIX: supply a default so a missing crawl_<src> method is
            # skipped instead of raising AttributeError outside the try.
            func = getattr(self, "crawl_%s" % src, None)
            if func:
                try:
                    titles = func(param, count)
                    for title in titles:
                        if title not in video_set:
                            video_set.add(title)
                            videos.append(title)
                except:
                    # One failing source must not abort the whole rank.
                    self.logger.warning(get_exception_info())
        rank = VideoRankModel({
            "source": self.data['source'],
            "title": spec["title"],
            "type": spec["type"],
            "videos": videos,
        })
        export(rank)
def crawl(self):
    """Crawl one media id: collect its playable videos and export the album."""
    videos = []
    mid = self.key
    url = DETAIL % mid
    detail = loadurl(url)
    # Strip all whitespace from the synopsis.
    description = detail.get('plots')
    description = ''.join(description.split())
    if self.data.get('channel') == u'鐢靛奖':
        # Movie channel: a single video with one mp4 stream.
        stream_info = detail['pinfos']['mpurls']
        videos.append(VideoItemModel({
            "title": self.data.get('title'),
            "url": MOVIE_PLAY % mid,  # web page address
            "image": self.data.get('image'),
            "description": description,
            "stream": [{
                'url': stream_info['tv'].get('url'),
                'size': stream_info['tv'].get('bits'),
                'format': 'mp4'
            }]
        }))
    else:
        # Episodic content: locate the episode list (new layout first,
        # fall back to the flat layout).
        try:
            sort = detail['pinfos'].get('sort')[0]
            episodes = detail['pinfos']['content'][sort]['fsps']
        except:
            episodes = detail['pinfos']['fsps']
        for episode in episodes:
            plots = episode.get('plots')
            plots = ''.join(plots.split())
            videos.append(VideoItemModel({
                "title": episode.get('taskname'),
                "url": PLAY_URL % (mid, episode.get('number')),  # page url
                "image": episode.get('picurl'),
                "description": plots,
                "stream": getstream(episode.get('mpurls'))
            }))
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": mid,                  # source-site id
        "title": self.data["title"],
        "url": detail.get('shareurl'),     # detail page url
        "image": self.data.get('image'),   # picture url
        "categories": self.data.get('category'),
        "channel": self.data.get('channel'),
        "region": detail.get('country'),
        "videos": videos,
        "pubtime": parse_date(detail.get('rinfo').split(' ')[0]),  # release date
        "actors": detail.get('lactor'),
        "directors": detail.get('director'),
        "description": description,
    })
    # Export the assembled data.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Export a single short video with low/mid/high quality mp4 streams."""
    duration = gettime(self.data.get('videoLength', '00:00'))

    def _stream(quality):
        # One stream entry per quality level; all share the same duration.
        return [{
            "url": self.data.get('videoURL%s' % quality),   # playable file url
            "size": self.data.get('videoSize%s' % quality),
            "format": "mp4",
            "duration": duration
        }]

    video = VideoItemModel({
        "title": self.data.get('title'),
        "url": self.data.get('videoURLMid'),  # page url
        "image": self.data.get('imgURL'),
        "description": self.data.get('desc'),
        "stream": _stream('Mid'),
        "stream_low": _stream('Low'),
        "stream_high": _stream('High')
    })
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": self.data.get('id'),     # source-site id
        "title": self.data.get("title"),
        "url": self.data.get('shareurl'),     # detail page url
        "image": self.data.get('imgURL'),     # picture url
        "channel": CHANNEL,
        "videos": [video],
        "pubtime": parse_date(self.data.get('videoPublishTime')),
        "description": self.data.get('desc'),
    })
    # Export the assembled data.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Crawl a 56.com album (long-video channels) or a single short clip."""
    type = 4
    album_id = self.key
    title = self.data['title'].encode('utf-8')
    channel = self.data.get('channel')
    # BUGFIX: `channel` is a string but .items() yields (key, value)
    # tuples, so the long-video branch could never be taken; compare
    # against the channel names instead (cf. the sibling crawler that
    # tests `channel in CHANNELS.values()`).
    if channel in LONG_VIDEO_CHANNELS.values():
        album_data = api_album(type, album_id, title)
        album_data = album_data['data']
        pubtime = album_data.get("public_time")
        pubtime = datetime.strptime(pubtime, "%Y%m%d")
        videos = [clean_video(video) for video in album_data['data']]
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": album_data.get("bpic"),
            "image2": album_data.get("mpic"),
            "url": album_data.get("web_url"),
            "actors": album_data.get("actors"),
            "directors": album_data.get("director"),
            "categories": album_data.get("tname"),
            "tags": self.data.get("tags"),
            "channel": channel,
            "region": album_data.get("zname")[0],
            "description": album_data.get("introduce"),
            "pubtime": pubtime,
            "videos": videos,
        })
    else:
        # Short video: synthesize one item with the html5 stream urls.
        video = VideoItemModel({
            "title": title,
            "description": self.data.get("description"),
            "url": "http://www.56.com/u13/v_%s.html" % album_id,
            "stream": [{"url": "http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % album_id}],
            "stream_high": [{"url": "http://vxml.56.com/html5/%s/?src=3g&res=qvga" % album_id}]
        })
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": self.data.get("bpic"),
            "image2": self.data.get("mpic"),
            "tags": self.data.get("tags"),
            "url": self.data.get("web_url"),
            "channel": channel,
            "description": self.data.get("introduce"),
            "videos": [video],
        })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Export one VideoRankModel per configured TOP_SPECS entry."""
    for spec in TOP_SPECS:
        export(VideoRankModel({
            "source": self.data['source'],
            "type": spec["type"],
            "title": spec["title"],
            # Ranked titles scraped from the spec's page.
            "videos": crawl_top(spec["url"]),
        }))
def crawl(self):
    """Crawl every configured top list and export its rank model."""
    source = self.data['source']
    for spec in TOP_SPECS:
        # Scrape the ranked titles for this list.
        titles = crawl_top(spec["url"])
        model = VideoRankModel({
            "source": source,
            "type": spec["type"],
            "title": spec["title"],
            "videos": titles,
        })
        export(model)
def crawl(self):
    """Export the titles recommended for one album url."""
    items = api_recommend(self.data['url'], 10)
    # Keep only the recommendation titles.
    related = [entry['title'] for entry in items]
    export(VideoSourceModel({
        'source': self.data['source'],
        'title': self.data['title'],
        'related': related,
    }))
def crawl(self):
    """Fetch up to 10 recommendations and export them as `related`."""
    url = self.data['url']
    title = self.data['title']
    related = []
    for item in api_recommend(url, 10):
        related.append(item['title'])
    album = VideoSourceModel({
        'source': self.data['source'],
        'title': title,
        'related': related,
    })
    export(album)
def crawl(self):
    """Crawl a Tudou album's detail payload and export the full model."""
    album_id = self.key
    detail = api_detail(album_id).get('detail')
    channel = detail.get('cats')
    title = "".join(detail.get('title').split())
    image = detail.get('img')
    play_url = detail.get('play_url')
    # The play url embeds the album key used to build the cover page url.
    url_key = re.findall(
        "http://www.tudou.com/albumplay/(.+)/.+\.html", play_url)[0]
    album_url = "http://www.tudou.com/albumcover/%s.html" % url_key
    # Anime credits voice actors; variety shows credit hosts.
    actors = detail.get('seiyuu') if channel == u"动漫" else detail.get('performer')
    directors = detail.get('host') if channel == u"综艺" else detail.get('director')
    categories = detail.get('genre')
    region = detail.get('area')[0]
    description = "".join(detail.get('desc').split())
    pubtime = detail.get('showdate')
    if pubtime:
        # Known release year.
        pubtime = datetime.strptime(str(pubtime), "%Y")
    else:
        # Unknown release time (showdate == 0) -> epoch sentinel.
        pubtime = datetime.utcfromtimestamp(0)
    videos = get_videos(album_id, url_key)
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": album_id,
        "title": title,
        "image": image,
        "url": album_url,
        "actors": actors,
        "directors": directors,
        "categories": categories,
        "channel": channel,
        "region": region,
        "description": description,
        "pubtime": pubtime,
        "videos": videos,
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Export one Tudou album: details, derived urls, and video list."""
    album_id = self.key
    detail_data = api_detail(album_id)
    detail_data = detail_data.get('detail')
    channel = detail_data.get('cats')
    raw_title = detail_data.get('title')
    title = "".join(raw_title.split())
    image = detail_data.get('img')
    play_url = detail_data.get('play_url')
    # Pull the album key out of the play url.
    keys = re.findall("http://www.tudou.com/albumplay/(.+)/.+\.html", play_url)
    url_key = keys[0]
    album_url = "http://www.tudou.com/albumcover/%s.html" % url_key
    if channel == u"动漫":
        # Anime credits voice actors.
        actors = detail_data.get('seiyuu')
    else:
        actors = detail_data.get('performer')
    if channel == u"综艺":
        # Variety shows credit hosts instead of directors.
        directors = detail_data.get('host')
    else:
        directors = detail_data.get('director')
    categories = detail_data.get('genre')
    region = detail_data.get('area')[0]
    description = "".join(detail_data.get('desc').split())
    showdate = detail_data.get('showdate')
    if showdate:
        pubtime = datetime.strptime(str(showdate), "%Y")
    else:
        # Unknown release date (showdate == 0) -> epoch sentinel.
        pubtime = datetime.utcfromtimestamp(0)
    videos = get_videos(album_id, url_key)
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": album_id,
        "title": title,
        "image": image,
        "url": album_url,
        "actors": actors,
        "directors": directors,
        "categories": categories,
        "channel": channel,
        "region": region,
        "description": description,
        "pubtime": pubtime,
        "videos": videos,
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Scrape one zyqvod album page and export it with parsed metadata."""
    album_url = "http://zyqvod.com/vod/index.asp?id=%s" % self.key
    hxs = load_html(album_url)
    urls = hxs.select("//div[@class='movievod']/li/input/@value").extract()
    videos = []
    for url in urls:
        m = re.match("qvod://(.+)", url)
        if not m:
            continue
        # qvod link payload layout: size|md5|filename
        words = m.group(1).split("|")
        size = int(words[0])
        #md5 = words[1]
        title = words[2].split(".")[0]
        videos.append(VideoItemModel({
            "title": title,
            "url": url,
            "stream": [{"url": url, "format": "qvod", "size": size}],
        }))
    # Key/value metadata rows from the detail section.
    kv = {}
    for s in hxs.select("//div[@class='videoDetail']/p"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='movievod']/p[2]/text()").extract())
    try:
        image = hxs.select("//div[@class='videoPic']/img/@src").extract()[0]
    except:
        image = None
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": self.key,
        "title": self.data["title"],
        "time": self.data.get('time'),
        "url": album_url,
        "image": image,
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "actors": split(kv.get(u'影片主演:')),
        "directors": split(kv.get(u'影片导演:')),
        "pubtime": parse_date(kv.get(u'上映年份:')),
        "description": description,
        # BUGFIX: "completed" appeared twice in this literal; the first
        # entry (self.data.get('completed')) was silently discarded, so
        # only the serialization-status value is kept here.
        "completed": not kv.get(u'连载状态:'),
    })
    export(model)
def crawl(self):
    """Crawl each Sohu top list (50 entries) and export a VideoTopModel."""
    for spec in TOP_sohu:
        entries = crawl_top(spec.get('url'), spec.get('channel'), 50)
        export(VideoTopModel({
            'source': spec['source'],
            'channel': spec['channel'],
            'priority': spec['priority'],
            'type': spec['type'],
            # Timestamp taken per list, at export time.
            'updatetime': datetime.now().isoformat(),
            'list': entries
        }))
def crawl(self):
    """Scrape a hakuzy.com detail page into a VideoSourceModel and export it."""
    album_url = "http://hakuzy.com/detail/?%s.html" % self.key
    hxs = load_html(album_url)
    videos = []
    for url in hxs.select("//td[@class='bt']/.//input[@id='copy_yah']/@value").extract():
        match = re.match("qvod://(.+)", url)
        if match is None:
            continue
        # qvod payload layout: size|md5|filename
        parts = match.group(1).split("|")
        size = int(parts[0])
        name = parts[2].split(".")[0]
        videos.append(VideoItemModel({
            "title": name,
            "url": url,
            "stream": [{"url": url, "format": "qvod", "size": size}],
        }))
    # Key/value metadata rows from the detail table.
    kv = {}
    for row in hxs.select("/html/body/table[4]/tbody/tr[1]/td[2]/table/tbody/tr"):
        texts = row.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())
    try:
        image = hxs.select("//div[@class='img']/img/@src").extract()[0]
    except:
        image = None
    model = VideoSourceModel({
        "source": SOURCE,
        "source_id": self.key,
        "title": self.data["title"],
        "time": self.data.get('time'),
        "url": album_url,
        "image": image,
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "pubtime": parse_date(kv.get(u"上映日期:")),
        "actors": split(kv.get(u"影片演员:")),
        "directors": split(kv.get(u"影片导演:")),
        # "连载" absent from the status text means the series is complete.
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
        "description": description,
    })
    export(model)
def crawl(self):
    """Page through a category and export every album, tracking update time."""
    catecode = self.data["catecode"]
    last_updated = self.data.get("updated", datetime.min)
    current_updated = datetime.max
    max_time = last_updated
    page, pagesize = 1, 20
    while True:
        try:
            data = api_albums(catecode, page, pagesize)
            for item in data["videos"]:
                try:
                    sid = item.get('sid')
                    detail = api_album(sid) if sid else None
                    model = self.extract_model(item, detail)
                    if not sid:
                        # Stand-alone item: wrap it as its own single video.
                        videos = [VideoItemModel({
                            "title": model["title"],
                            "image": model["image"],
                            "description": model["description"],
                            "time": model["time"],
                            "price": model["price"],
                            "duration": model["duration"],
                            "url": model["url"]
                        })]
                    else:
                        videos = self.get_videos(sid)
                        if model['channel'] in [u'综艺']:
                            # Variety shows arrive newest-first; reverse.
                            videos = list(reversed(videos))
                    model['videos'] = videos
                    export(model)
                    current_updated = model["time"]
                    max_time = max(max_time, current_updated)
                except:
                    # One bad item must not abort the whole page.
                    self.logger.warning(get_exception_info())
            if current_updated < last_updated:
                # Older than the previous crawl: incremental run is done.
                break
            if page * pagesize >= data["count"]:
                break
        except:
            self.logger.warning(get_exception_info())
        page += 1
    self.data["updated"] = max_time
def process_album(self, item):
    """Group an item's play links by site and export one model per site.

    Schedules a RelationCrawler follow-up when at least one model was
    exported.
    """
    sites = {}
    fangying_id = re.findall("f_(.+)\.html", item['link'])[0]
    for play in item['plays']:
        site = play['site']
        if site not in SITES:
            continue
        if play["url"].find("fangying.com") != -1:
            # Links back to fangying itself carry no playable stream.
            stream = []
        else:
            format = "thunder" if site == "thunder" else ""
            stream = [{"url": play["url"], "format": format}]
        video = VideoItemModel({
            "title": play["title"],
            "url": play["url"],
            "stream": stream,
        })
        # IDIOM: dict.has_key() is deprecated; setdefault both creates
        # the bucket and returns it in one step.
        sites.setdefault(site, []).append(dict(video))
    model = None
    for site, videos in sites.iteritems():
        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": fangying_id,
            "videos": videos,
            "title": item['title'],
            "directors": item['directors'].split("/"),
            "actors": item['performers'].split("/"),
            "description": item['description'],
            'categories': item['genres'].split("/"),
            'region': item['countries'].split("/")[0],
            'duration': parse_duration(item['duration']),
            'image': item['avatar_middle'],
            'score': float(item['douban_rating']) if item.get('douban_rating') else None,
            'url': item['link'],
            'price': 0.0,
            'pubtime': parse_pubtime(item['release_time']),
            'channel': CHANNELS.get(self.key)
        })
        export(model)
    if model:
        # At least one site exported: queue the relation crawl.
        Scheduler.schedule(RelationCrawler.type, key=fangying_id,
                           data={'title': model['title'], 'url': model['url']})
def crawl(self):
    """Crawl one show album: metadata, images, and its video list."""
    album_id = self.key
    channel = self.data.get('channel')
    if channel in CHANNELS.values():
        model = self.crawl_show(album_id)
        model['image'] = self.data.get('image')
        model['image2'] = self.data.get('image2')
        if channel:
            model['channel'] = channel
        episodes = self.crawl_video(album_id)
        if channel == u'综艺':
            # Variety shows arrive newest-first; restore natural order.
            episodes = list(reversed(episodes))
        model['videos'] = episodes
        export(model)
        self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Export a show album when its channel is one we recognize."""
    album_id = self.key
    channel = self.data.get('channel')
    # Guard clause: unknown channels are silently skipped.
    if channel not in CHANNELS.values():
        return
    model = self.crawl_show(album_id)
    model['image'] = self.data.get('image')
    model['image2'] = self.data.get('image2')
    if channel:
        model['channel'] = channel
    videos = self.crawl_video(album_id)
    if channel == u'综艺':
        # Variety shows are listed newest-first; reverse them.
        videos = [video for video in reversed(videos)]
    model['videos'] = videos
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Build one VideoItemModel with three quality streams and export it."""
    time_text = self.data.get('videoLength', '00:00')
    duration = gettime(time_text)
    item = VideoItemModel({
        "title": self.data.get('title'),
        "url": self.data.get('videoURLMid'),       # page url
        "image": self.data.get('imgURL'),
        "description": self.data.get('desc'),
        # Playable file urls per quality level; all mp4.
        "stream": [{
            "url": self.data.get('videoURLMid'),
            "size": self.data.get('videoSizeMid'),
            "format": "mp4",
            "duration": duration
        }],
        "stream_low": [{
            "url": self.data.get('videoURLLow'),
            "size": self.data.get('videoSizeLow'),
            "format": "mp4",
            "duration": duration
        }],
        "stream_high": [{
            "url": self.data.get('videoURLHigh'),
            "size": self.data.get('videoSizeHigh'),
            "format": "mp4",
            "duration": duration
        }]
    })
    album = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": self.data.get('id'),          # source-site id
        "title": self.data.get("title"),
        "url": self.data.get('shareurl'),          # detail page url
        "image": self.data.get('imgURL'),          # picture url
        "channel": CHANNEL,
        "videos": [item],
        "pubtime": parse_date(self.data.get('videoPublishTime')),
        "description": self.data.get('desc'),
    })
    # Export the assembled data.
    export(album)
    self.data['to_album_id'] = album['to_album_id']
def update_region(): conn = Connection() db = conn.content_video2 count = 1 source_videos = db.video.source.find() for source_video in source_videos: model = VideoSourceModel({ "videos": source_video['videos'], "image": source_video['image'], "related": source_video['related'], "duration": source_video['duration'], "title": source_video['title'], "comments": source_video['comments'], "source": source_video['source'], "score": source_video['score'], "actors": source_video['actors'], "price": source_video['price'], "channel": source_video['channel'], "description": source_video['description'], "tags": source_video['tags'], "deleted": source_video['deleted'], "completed": source_video['completed'], "visits": source_video['visits'], "favorites": source_video['favorites'], "authorities": source_video['authorities'], "categories": source_video['categories'], "created": source_video['created'], "url": source_video['url'], "region": source_video['region'], "directors": source_video['directors'], "pubtime": source_video['pubtime'], "time": source_video['time'], "source_id": source_video['source_id'] }) export(model) count += 1 print "count = %s" % count print "count = %s" % count print "map complete."
def update_region(): conn = Connection() db = conn.content_video2 count = 1 source_videos = db.video.source.find() for source_video in source_videos: model = VideoSourceModel({ "videos": source_video['videos'], "image": source_video['image'], "related": source_video['related'], "duration": source_video['duration'], "title": source_video['title'], "comments": source_video['comments'], "source": source_video['source'], "score": source_video['score'], "actors": source_video['actors'], "price": source_video['price'], "channel": source_video['channel'], "description": source_video['description'], "tags": source_video['tags'], "deleted": source_video['deleted'], "completed": source_video['completed'], "visits": source_video['visits'], "favorites": source_video['favorites'], "authorities": source_video['authorities'], "categories": source_video['categories'], "created": source_video['created'], "url": source_video['url'], "region": source_video['region'], "directors": source_video['directors'], "pubtime": source_video['pubtime'], "time": source_video['time'], "source_id": source_video['source_id'] }) export(model) count += 1 print "count = %s" % count print "count = %s" % count print "map complete."
def crawl(self):
    """Crawl a 56.com album for long-video channels, or one short clip."""
    type = 4
    album_id = self.key
    title = self.data['title'].encode('utf-8')
    channel = self.data.get('channel')
    # BUGFIX: .items() yields (key, value) tuples, which a channel string
    # can never equal, so the long-video branch was dead code; test
    # membership against the channel names instead.
    if channel in LONG_VIDEO_CHANNELS.values():
        album_data = api_album(type, album_id, title)
        album_data = album_data['data']
        pubtime = album_data.get("public_time")
        pubtime = datetime.strptime(pubtime, "%Y%m%d")
        videos = []
        for video in album_data['data']:
            video = clean_video(video)
            videos.append(video)
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": album_data.get("bpic"),
            "image2": album_data.get("mpic"),
            "url": album_data.get("web_url"),
            "actors": album_data.get("actors"),
            "directors": album_data.get("director"),
            "categories": album_data.get("tname"),
            "tags": self.data.get("tags"),
            "channel": channel,
            "region": album_data.get("zname")[0],
            "description": album_data.get("introduce"),
            "pubtime": pubtime,
            "videos": videos,
        })
    else:
        # Short clip: one item with low/high html5 stream urls.
        video = VideoItemModel({
            "title": title,
            "description": self.data.get("description"),
            "url": "http://www.56.com/u13/v_%s.html" % album_id,
            "stream": [{
                "url": "http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % album_id
            }],
            "stream_high": [{
                "url": "http://vxml.56.com/html5/%s/?src=3g&res=qvga" % album_id
            }]
        })
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": self.data.get("bpic"),
            "image2": self.data.get("mpic"),
            "tags": self.data.get("tags"),
            "url": self.data.get("web_url"),
            "channel": channel,
            "description": self.data.get("introduce"),
            "videos": [video],
        })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Scrape a 265zy detail page and export the album model."""
    album_url = "http://www.265zy.com/detail/?%s.html" % self.key
    hxs = load_html(album_url)
    qvod_urls = hxs.select(
        "//td[@class='bt']/.//input[@id='copy_yah']/@value").extract()
    videos = []
    for url in qvod_urls:
        match = re.match("qvod://(.+)", url)
        if not match:
            continue
        # qvod payload layout: size|md5|filename
        fields = match.group(1).split("|")
        size = int(fields[0])
        name = fields[2].split(".")[0]
        item = VideoItemModel({
            "title": name,
            "url": url,
            "stream": [{"url": url, "format": "qvod", "size": size}],
        })
        videos.append(item)
    # Key/value metadata rows from the detail table.
    kv = {}
    for row in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
        texts = row.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(
        hxs.select("//div[@class='intro']/.//text()").extract())
    try:
        image = urlparse.urljoin(
            "http://www.265zy.com/",
            hxs.select("//div[@class='img']/img/@src").extract()[0])
    except:
        image = None
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": self.key,
        "title": self.data["title"],
        "image": image,
        "url": album_url,
        "time": self.data.get('time'),
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "actors": split(kv.get(u"影片演员:")),
        "pubtime": parse_date(kv.get(u"上映日期:")),
        # "连载" absent from the status text means the series is complete.
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
        "description": description,
    })
    export(model)
def crawl(self):
    """Crawl a LeTV album plus its episode list and export the model.

    BUGFIX: the episode loop used to rebind `url`, so the exported
    model's "url" pointed at the LAST episode's play page instead of the
    album search page; episodes now use their own variable.
    """
    source_id = self.key
    album_data = api_album(source_id, pcode, version)
    album_data = album_data['body']
    title = album_data.get("nameCn")
    pubtime = album_data.get("releaseDate")
    # Release date arrives as "YYYY" or "YYYY-MM-DD"; otherwise epoch.
    if re.match("^\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y")
    elif re.match("^\d+-\d+-\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y-%m-%d")
    else:
        pubtime = datetime.utcfromtimestamp(0)
    directors = album_data.get("directory").split(" ")
    actors = album_data.get("starring").split(" ")
    desc = album_data.get("description")
    desc = "".join(desc.split())
    region = album_data.get("area")
    categories = album_data.get("subCategory").split(" ")
    tags = album_data.get("tag").split(" ")
    url = "http://so.letv.com/tv/%s.html" % source_id
    videos = []
    # Series paging parameters (begin, size, order, media flag).
    b = 1
    s = 60
    o = -1
    m = 0
    series_data = api_series(source_id, b, s, o, m, pcode, version)
    for series in series_data['body']['videoInfo']:
        id = series['id']
        mid = series['mid']
        episode_url = "http://www.letv.com/ptv/vplay/%s.html" % id
        vurl = "http://dynamic.app.m.letv.com/android/dynamic.php?mod=minfo&ctl=videofile&act=index&mmsid=%s&pcode=%s&version=%s" % (
            mid, pcode, version)
        jsurl = "javascript:getUrl('letv', '%s')" % vurl
        video = VideoItemModel({
            "title": series.get("nameCn"),
            "url": episode_url,
            "stream": [{
                "url": jsurl
            }],
            "image": series.get("picAll"),
            "duration": series.get("duration")
        })
        videos.append(video)
    model = VideoSourceModel({
        "source_id": source_id,
        "source": self.data.get('source'),
        "url": url,  # album page (see BUGFIX note above)
        "channel": self.data['channel'],
        'title': title,
        "image": self.data['image'],
        "pubtime": pubtime,
        "directors": directors,
        "actors": actors,
        "desc": desc,
        "region": region,
        "categories": categories,
        "tags": tags,
        "videos": videos
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Crawl a Tencent (v.qq.com) video or album and export the model.

    Short-video channels export a single-item model built from self.data;
    everything else fetches album data, gathers the episode list (column
    albums page by year/month) and exports the full album.
    """
    album_id = self.key
    if self.data['channel'] in SHORT_VIDEO:
        # Short video: page url is derived from characters of the id.
        url = "http://v.qq.com/page/%s/%s/%s/%s.html" % (
            album_id[0], album_id[1], album_id[-1], album_id)
        pubtime = datetime.strptime(
            self.data["pubtime"], "%Y-%m-%d %H:%M:%S")
        video = VideoItemModel({
            "title": self.data["title"],
            "url": url,
            "stream": [{
                # Stream url is resolved client-side via javascript hook.
                "url": "javascript:getUrl('tencent', '%s')" % url
            }],
            "image": self.data["image"],
            "channel": self.data["channel"],
        })
        model = VideoSourceModel({
            "source": self.data["source"],
            "source_id": album_id,
            "title": self.data["title"],
            "url": url,
            "image": self.data["image"],
            "channel": self.data["channel"],
            "pubtime": pubtime,
            "videos": [video]
        })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
    else:
        album_url = "http://v.qq.com/detail/%s/%s.html" % (
            album_id[0], album_id)
        album_data = api_album(album_id[0], album_id)
        # Trailers live under /prev/, full albums under /cover/.
        if album_data['trailer'] == 1:
            play_url = "http://v.qq.com/prev/%s/%s" % (
                album_id[0], album_id)
        else:
            play_url = "http://v.qq.com/cover/%s/%s" % (
                album_id[0], album_id)
        # Prefer the column description; fall back to the plain one.
        description = album_data.get("columndesc")
        if not description:
            description = album_data.get("desc")
        description = "".join(description.split())
        try:
            pubtime = datetime.strptime(self.data.get("pubtime"), "%Y")
        except:
            # Unparseable/missing year -> epoch sentinel.
            pubtime = datetime.utcfromtimestamp(0)
        videos = []
        columnid = album_data.get('columnid')
        rely = album_data.get('rely')
        if columnid:
            # columnid != 0: column album; episode lists are paged by
            # {year: [months]} buckets in `rely`.
            for video_dict in rely:
                for year, months in video_dict.iteritems():
                    for month in months:
                        videolist_id = "%s_%s" % (year, month)
                        videos_data = api_video(columnid, videolist_id)
                        for video in videos_data['items']:
                            time = video.get('date')
                            time = datetime.strptime(time, "%Y-%m-%d")
                            url = "http://v.qq.com/cover/%s/%s.html" % (
                                video.get('coverid')[0], video.get('coverid'))
                            video = VideoItemModel({
                                "title": video.get('sectitle'),
                                "description": video.get('breif'),
                                "url": url,
                                "stream": [{
                                    "url": "javascript:getUrl('tencent', '%s')" % url
                                }],
                                "image": video.get('snapurl'),
                                "time": time
                            })
                            videos.append(video)
        if not columnid:
            # columnid == 0, only one video
            for video in album_data['videos']:
                videos.append(clean_video(video, play_url))
        # self.data is not None: export(data)
        if self.data:
            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": album_data['columnname'] if album_data['columnname'] else self.data["title"],
                "image": self.data.get("image"),
                "url": album_url,
                "actors": self.data.get("actors"),
                "directors": self.data.get("directors"),
                "categories": self.data.get("categories"),
                "channel": self.data.get("channel"),
                "region": self.data.get("region"),
                "description": description,
                "pubtime": pubtime,
                "videos": videos,
            })
        # self.data is None: crawl web data first
        # (http://v.qq.com/cover/x/xxxxx.html), and export(data)
        else:
            hxs = load_html(play_url)
            channel = hxs.select(
                "//div[@class='mod_crumbs']/a[1]/text()").extract()[0]
            album_hxs = hxs.select(
                "//div[@class='mod_video_intro mod_video_intro_rich']")
            image = album_hxs.select("a/img/@src").extract()[0]
            title = album_hxs.select(
                "div[@class='video_title']/strong/a/text()").extract()[0]
            directors = []
            for director_hxs in album_hxs.select("//div[@itemprop='director']/a"):
                director = director_hxs.select("span/text()").extract()[0]
                directors.append(director)
            actors = []
            for actor_hxs in album_hxs.select("//div[@itemprop='actors']/a"):
                actor = actor_hxs.select("span/text()").extract()[0]
                actors.append(actor)
            region = album_hxs.select(
                "//div[@class='info_area']/span[@class='content']/a/text()").extract()[0]
            categories = []
            for categorie_hxs in album_hxs.select("//div[@class='info_category']/span[@class='content']/a"):
                categorie = categorie_hxs.select("text()").extract()[0]
                categories.append(categorie)
            pubtime = album_hxs.select(
                "//div[@class='info_years']/span[@class='content']/a/text()").extract()[0]
            # Page shows a bare year or nothing usable.
            if re.match("^\d+$", pubtime):
                pubtime = datetime.strptime(pubtime, "%Y")
            else:
                pubtime = datetime.utcfromtimestamp(0)
            # NOTE(review): this branch runs when self.data is falsy, yet
            # it still reads self.data.get('source') — confirm intent.
            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": title,
                "image": image,
                "url": album_url,
                "actors": actors,
                "directors": directors,
                "categories": categories,
                "channel": channel,
                "region": region,
                "description": description,
                "pubtime": pubtime,
                "videos": videos,
            })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Crawl a LeTV album (self.key) and its episode list, then export it.

    Fetches album metadata via api_album and the episode list via api_series,
    builds a VideoSourceModel and exports it; finally propagates the exported
    album id back into self.data['to_album_id'].
    """
    source_id = self.key
    album_data = api_album(source_id, pcode, version)['body']

    title = album_data.get("nameCn")

    # releaseDate may be "YYYY" or "YYYY-MM-DD"; anything else (including a
    # missing value, which previously crashed re.match with TypeError) falls
    # back to the Unix epoch.
    pubtime = album_data.get("releaseDate") or ""
    if re.match(r"^\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y")
    elif re.match(r"^\d+-\d+-\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y-%m-%d")
    else:
        pubtime = datetime.utcfromtimestamp(0)

    directors = album_data.get("directory").split(" ")
    actors = album_data.get("starring").split(" ")
    # collapse all whitespace runs in the description
    desc = album_data.get("description")
    desc = "".join(desc.split())
    region = album_data.get("area")
    categories = album_data.get("subCategory").split(" ")
    tags = album_data.get("tag").split(" ")

    album_url = "http://so.letv.com/tv/%s.html" % source_id

    videos = []
    # api_series paging parameters: page b=1, page size s=60, order o=-1, m=0
    b = 1
    s = 60
    o = -1
    m = 0
    series_data = api_series(source_id, b, s, o, m, pcode, version)
    for series in series_data['body']['videoInfo']:
        vid = series['id']
        mid = series['mid']
        # BUG FIX: use a dedicated variable for the per-episode page url so it
        # no longer clobbers the album url exported in the model below.
        video_url = "http://www.letv.com/ptv/vplay/%s.html" % vid
        vurl = "http://dynamic.app.m.letv.com/android/dynamic.php?mod=minfo&ctl=videofile&act=index&mmsid=%s&pcode=%s&version=%s" % (
            mid, pcode, version)
        # the player resolves the real stream url lazily through this js hook
        jsurl = "javascript:getUrl('letv', '%s')" % vurl
        video = VideoItemModel({
            "title": series.get("nameCn"),
            "url": video_url,
            "stream": [{
                "url": jsurl
            }],
            "image": series.get("picAll"),
            "duration": series.get("duration")
        })
        videos.append(video)

    model = VideoSourceModel({
        "source_id": source_id,
        "source": self.data.get('source'),
        "url": album_url,  # album page url (was overwritten by the last episode url)
        "channel": self.data['channel'],
        'title': title,
        "image": self.data['image'],
        "pubtime": pubtime,
        "directors": directors,
        "actors": actors,
        "desc": desc,
        "region": region,
        "categories": categories,
        "tags": tags,
        "videos": videos
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def process_album(self, item):
    """Group an item's play links by hosting site and export one
    VideoSourceModel per site; then schedule a RelationCrawler for the album.

    item is a parsed fangying.com album dict (link, plays, title, directors,
    performers, description, genres, countries, duration, avatar_middle,
    douban_rating, release_time).
    """
    # album id embedded in the link, e.g. ".../f_<id>.html"
    fangying_id = re.findall(r"f_(.+)\.html", item['link'])[0]

    sites = {}
    for play in item['plays']:
        site = play['site']
        if site not in SITES:
            continue
        if play["url"].find("fangying.com") != -1:
            # fangying's own pages carry no directly playable stream
            stream = []
        else:
            format = "thunder" if site == "thunder" else ""
            stream = [{"url": play["url"], "format": format}]
        video = VideoItemModel({
            "title": play["title"],
            "url": play["url"],
            "stream": stream,
        })
        # setdefault replaces the deprecated dict.has_key() check-then-append
        sites.setdefault(site, []).append(dict(video))

    model = None
    for site, videos in sites.items():
        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": fangying_id,
            "videos": videos,
            "title": item['title'],
            "directors": item['directors'].split("/"),
            "actors": item['performers'].split("/"),
            "description": item['description'],
            'categories': item['genres'].split("/"),
            'region': item['countries'].split("/")[0],
            'duration': parse_duration(item['duration']),
            'image': item['avatar_middle'],
            'score': float(item['douban_rating']) if item.get('douban_rating') else None,
            'url': item['link'],
            'price': 0.0,
            'pubtime': parse_pubtime(item['release_time']),
            'channel': CHANNELS.get(self.key)
        })
        export(model)

    # only schedule the follow-up crawl if at least one site model was exported
    if model:
        Scheduler.schedule(RelationCrawler.type, key=fangying_id, data={
            'title': model['title'],
            'url': model['url']
        })
def crawl(self):
    """Crawl a bdzy.cc album detail page and export it as a VideoSourceModel.

    self.key is the album's id on the source site; self.data supplies
    source/title/category/region metadata collected by an earlier stage.
    """
    # key is the album id on the source site
    album_id = self.key
    album_url = "http://bdzy.cc/detail/?%s.html" % album_id
    hxs = load_html(album_url)

    # Each <input> value is a bdhd:// link of the form
    # "bdhd://<size>|<md5>|<filename>|..." — one per episode.
    urls = hxs.select("//td[@class='bt']/.//li/input/@value").extract()
    videos = []
    for url in urls:
        m = re.match("bdhd://(.+)", url)
        if not m:
            continue
        words = m.group(1).split("|")
        size = int(words[0])
        # words[1] is an md5 checksum (unused)
        title = words[2].split(".")[0]
        # episode of the album
        video = VideoItemModel({
            "title": title,
            "url": url,  # no separate page url here, so reuse the play url
            "stream": [{
                "url": url,       # playable file url
                "size": size,
                "format": "bdhd"  # stream format (protocol)
            }],
        })
        videos.append(video)

    # Metadata table parsed into key/value pairs, e.g. release date, actors.
    kv = {}
    for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()

    description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())

    # Poster image may be absent; an explicit emptiness check replaces the
    # original bare except (which silently swallowed every exception).
    image_urls = hxs.select("/html/body/table[2]/tr[1]/td[1]/img/@src").extract()
    image = image_urls[0] if image_urls else None

    # data model to export
    model = VideoSourceModel({
        "source": self.data['source'],              # video source
        "source_id": album_id,                      # id on the source site
        "title": self.data["title"],
        "url": album_url,                           # album page url
        "image": image,                             # poster image url
        "time": self.data.get('time'),              # last update time on source
        "categories": [self.data.get('category')],  # category list
        "channel": self.data.get('category'),       # channel
        "region": self.data.get('region'),          # region
        "videos": videos,                           # episode array
        "pubtime": parse_date(kv.get(u"上映日期:")),  # release date
        "actors": split(kv.get(u"影片演员:")),        # actors
        # completed unless the status field says "ongoing" (u"连载")
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
        "description": description,
    })
    # export the data
    export(model)