Example #1
def clean_video(video):
    video = video['video']
    flvid = video.get('flvid')
    duration = video.get('duration')
    duration = map(int, duration.split(":"))
    if len(duration) == 2:
        duration = duration[0] * 60 + duration[1]
    elif len(duration) == 3:
        duration = duration[0] * 3600 + duration[1] * 60 + duration[2]
    video = VideoItemModel({
        "title": video.get('title'),
        "image": video.get('img'),
        "duration": duration,
        "description": video.get('introduce'),
        "url": "http://www.56.com/u13/v_%s.html" % flvid,
        "stream": ["http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % flvid],
        "stream_high": ["http://vxml.56.com/html5/%s/?src=3g&res=qvga" % flvid]
    })
    return video
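A minimal invocation sketch for reference; the wrapper shape ({'video': {...}}) and the key names come from the code above, while the concrete values are made up:

raw = {'video': {
    'flvid': '123456',                    # hypothetical id
    'title': 'some title',
    'img': 'http://example.com/img.jpg',
    'duration': '01:02:03',               # parsed to 3723 seconds
    'introduce': 'description text',
}}
item = clean_video(raw)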
Example #2
    def crawl(self):
        videos = []
        mid = self.key
        url = DETAIL % mid
        detail = loadurl(url)
        description = detail.get('plots')
        description = ''.join(description.split())
        if self.data.get('channel') == u'电影':  # movie channel
            dict_ = detail['pinfos']['mpurls']
            video = VideoItemModel({
                                    "title": self.data.get('title'),
                                    "url": MOVIE_PLAY % mid, #缃戦〉鍦板潃
                                    "image": self.data.get('image'),
                                    "description": description,
                                    "stream": [{
                                                 'url': dict_['tv'].get('url'),
                                                 'size': dict_['tv'].get('bits'),
                                                 'format': 'mp4'
                                                }]
                                    })   
            videos.append(video)
        else:
            try:
                sort = detail['pinfos'].get('sort')[0]    
                episodes = detail['pinfos']['content'][sort]['fsps']
            except:
                episodes = detail['pinfos']['fsps']

            for episode in episodes:
                plots = episode.get('plots')
                plots = ''.join(plots.split())                
                video = VideoItemModel({
                                     "title": episode.get('taskname'),
                                     "url": PLAY_URL % (mid,episode.get('number')), #缃戦〉鍦板潃
                                     "image": episode.get('picurl'),
                                     "description": plots,
                                     "stream": getstream(episode.get('mpurls'))
                                     })
                videos.append(video)           
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": mid,  # source-site ID
            "title": self.data["title"],
            "url": detail.get('shareurl'),  # detail page URL
            "image": self.data.get('image'),  # image URL
            "categories": self.data.get('category'),  # categories
            "channel": self.data.get('channel'),  # channel
            "region": detail.get('country'),  # region
            "videos": videos,  # videos in the album
            "pubtime": parse_date(detail.get('rinfo').split(' ')[0]),  # release date
            "actors": detail.get('lactor'),
            "directors": detail.get('director'),
            "description": description,
        })
        # export the data
        export(model)
        self.data['to_album_id'] = model['to_album_id']
Example #3
    def crawl(self):
        catecode = self.data["catecode"]
        last_updated = self.data.get("updated", datetime.min)
        current_updated = datetime.max
        max_time = last_updated

        page = 1
        pagesize = 20
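        # page through the category listing; stop when results are exhausted or when
        # items are older than the previous crawl (incremental update)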
        while True:
            try:
                data = api_albums(catecode, page, pagesize)
                for item in data["videos"]:
                    try:
                        sid = item.get('sid')
                        detail = api_album(sid) if sid else None
                        model = self.extract_model(item, detail)

                        if sid:
                            videos = self.get_videos(sid)
                            if model['channel'] in [u'综艺']:  # reverse order for zongyi (variety shows)
                                videos = list(reversed(videos))
                        else:
                            video = VideoItemModel({
                                "title": model["title"],
                                "image": model["image"],
                                "description": model["description"],
                                "time": model["time"],
                                "price": model["price"],
                                "duration": model["duration"],
                                "url": model["url"]
                            })
                            videos = [video]

                        model['videos'] = videos
                        export(model)

                        current_updated = model["time"]
                        if max_time < current_updated:
                            max_time = current_updated
                    except:
                        self.logger.warning(get_exception_info())

                if current_updated < last_updated:
                    break
                if page * pagesize >= data["count"]:
                    break
            except:
                self.logger.warning(get_exception_info())
            page += 1

        self.data["updated"] = max_time
Example #4
    def crawl(self):
        timestr = self.data.get('videoLength', '00:00')
        duration = gettime(timestr)
        videos = []
        video = VideoItemModel({
            "title": self.data.get('title'),
            "url": self.data.get('videoURLMid'),  # web page URL
            "image": self.data.get('imgURL'),
            "description": self.data.get('desc'),
            "stream": [{
                "url": self.data.get('videoURLMid'),  # video file playback URL
                "size": self.data.get('videoSizeMid'),
                "format": "mp4",  # video format (protocol)
                "duration": duration
            }],
            "stream_low": [{
                "url": self.data.get('videoURLLow'),
                "size": self.data.get('videoSizeLow'),
                "format": "mp4",
                "duration": duration
            }],
            "stream_high": [{
                "url": self.data.get('videoURLHigh'),
                "size": self.data.get('videoSizeHigh'),
                "format": "mp4",
                "duration": duration
            }]
        })
        videos.append(video)
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": self.data.get('id'),  # source-site ID
            "title": self.data.get("title"),
            "url": self.data.get('shareurl'),  # detail page URL
            "image": self.data.get('imgURL'),  # image URL
            "channel": CHANNEL,  # channel
            "videos": videos,  # video album
            "pubtime": parse_date(self.data.get('videoPublishTime')),  # release date
            "description": self.data.get('desc'),
        })
        # export the data
        export(model)
        self.data['to_album_id'] = model['to_album_id']
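gettime() is not defined in this example; a minimal sketch, assuming it converts the 'MM:SS' (or 'HH:MM:SS') videoLength string into seconds:

def gettime(timestr):
    # assumed helper: "MM:SS" or "HH:MM:SS" -> total seconds
    seconds = 0
    for part in timestr.split(":"):
        seconds = seconds * 60 + int(part)
    return seconds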
Example #5
def clean_video(video):
    video = video['video']
    flvid = video.get('flvid')
    duration = video.get('duration')
    duration = map(int, duration.split(":"))
    if len(duration) == 2:
        duration = duration[0] * 60 + duration[1]
    elif len(duration) == 3:
        duration = duration[0] * 3600 + duration[1] * 60 + duration[2]
    video = VideoItemModel({
                           "title": video.get('title'),
                           "image": video.get('img'),
                           "duration": duration,
                           "description": video.get('introduce'),
                           "url": "http://www.56.com/u13/v_%s.html" % flvid,
                           "stream": ["http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % flvid],
                           "stream_high": ["http://vxml.56.com/html5/%s/?src=3g&res=qvga" % flvid]
                           })
    return video
Example #6
def clean_video(video, play_url):
    url = "%s/%s.html" % (play_url, video.get("vid"))
    new_video = VideoItemModel({
        "title": video.get("tt") + video.get("secondtitle"),
        "url": url,
        "stream": [{
                   "url": "javascript:getUrl('tencent', '%s')" % url
                   }],
        "image": video.get("screenshot"),
    })
    return new_video
Example #7
    def crawl(self):
        album_url = "http://zyqvod.com/vod/index.asp?id=%s" % self.key
        hxs = load_html(album_url)

        urls = hxs.select("//div[@class='movievod']/li/input/@value").extract()
        videos = []
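        # each play link looks like "qvod://<size>|<md5>|<filename>|..."; only qvod links are kept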
        for url in urls:
            m = re.match("qvod://(.+)", url)
            if not m:
                continue
            words = m.group(1).split("|")
            size = int(words[0])
            #md5 = words[1]
            title = words[2].split(".")[0]

            videos.append(VideoItemModel({
                            "title" : title,
                            "url" : url,
                            "stream" : [{"url" : url, "format" : "qvod", "size" : size}],
                            }))

        kv = {}
        for s in hxs.select("//div[@class='videoDetail']/p"):
            texts = s.select(".//text()").extract()
            if len(texts) >= 2:
                kv[texts[0].strip()] = texts[1].strip()

        description = "\n".join(hxs.select("//div[@class='movievod']/p[2]/text()").extract())
        try:
            image = hxs.select("//div[@class='videoPic']/img/@src").extract()[0]
        except:
            image = None

        model = VideoSourceModel({
                                 "source" : self.data['source'],
                                 "source_id" : self.key,
                                 "title" : self.data["title"],
                                 "time" : self.data.get('time'),
                                 "url" : album_url,
                                 "image" : image,
                                 "completed" : self.data.get('completed'),
                                 "categories" : [self.data.get('category')],
                                 "channel" : self.data.get('category'),
                                 "region" : self.data.get('region'),
                                 "videos" : videos,
                                 "actors" : split(kv.get(u'影片主演:')),
                                 "directors" : split(kv.get(u'影片导演:')),
                                 "pubtime" : parse_date(kv.get(u'上映年份:')),
                                 "description" : description,
                                 "completed" : not kv.get(u'连载状态:'),
                                 })
        export(model)
Example #8
    def crawl_video(self, show_id):
        videos = []
        page = 1
        pagesize = 30
        while True:
            data = api_videos(show_id, page, pagesize)
            if not data['results']:
                break

            for item in data['results']:
                try:
                    video = VideoItemModel({
                        "title": item['title'],
                        "source_id": item['videoid'],
                        "url": "http://v.youku.com/v_show/id_%s.html" % item['videoid'],
                    })

                    jsurl = "javascript: getUrl('youku', '%s')" % item["videoid"]
                    video["stream"] = [{"url": jsurl}]

                    # TODO:
                    #                     ret = api_plays(item['videoid'])
                    #                     results = ret.get('results', {})
                    #                     for key, fmt in FORMATS_NORMAL:
                    #                         if results.get(key):
                    #                             video["stream_low"] = self.extract_stream(results[key], fmt)
                    #                             break
                    #                     for key, fmt in FORMATS_HIGH:
                    #                         if results.get(key):
                    #                             video["stream"] = self.extract_stream(results[key], fmt)
                    #                             break
                    #                     for key, fmt in FORMATS_HD:
                    #                         if results.get(key):
                    #                             video["stream_high"] = self.extract_stream(results[key], fmt)
                    #                             break

                    videos.append(video)
                except:
                    self.logger.warning(get_exception_info())

            if pagesize * page >= data['total']:
                break
            page += 1
        return videos
Example #9
    def extract_video(self, item):
        item = item["map"]
        video = VideoItemModel({
            "title": item.get("tv_name"),
            "image": item.get("ver_big_pic"),
            "description": item.get("tv_desc"),
            "duration": int(item.get("time_length", "0")),
            "url": item.get("url_html5", item.get("tv_url")),
            # out of date!   "time" : datetime.strptime(item.get("update_time", "1970-01-01 00:00:00")[:19], "%Y-%m-%d %H:%M:%S"),
        })

        playinfo = api_playinfo(item["tv_ver_id"])

        stream_nor = []
        stream_high = []
        stream_super = []
        stream_mobile = [{
            "url": playinfo.get("downloadurl", ""),
            "size": playinfo.get("file_size_mobile", 0),
            "format": "mp4",
        }]

        if playinfo.get("url_nor_mp4"):
            urls = playinfo["url_nor_mp4"].split(",")
            durations = playinfo["clipsDuration_nor"]
            sizes = playinfo["clipsBytes_nor"]
            stream_nor = self.extract_stream(urls, durations, sizes)
        if playinfo.get("url_high_mp4"):
            urls = playinfo["url_high_mp4"].split(",")
            durations = playinfo["clipsDuration_high"]
            sizes = playinfo["clipsBytes_high"]
            stream_high = self.extract_stream(urls, durations, sizes)
        if playinfo.get("url_super_mp4"):
            urls = playinfo["url_super_mp4"].split(",")
            durations = playinfo["clipsDuration_super"]
            sizes = playinfo["clipsBytes_super"]
            stream_super = self.extract_stream(urls, durations, sizes)

        video["stream_low"] = stream_nor
        video["stream_high"] = stream_super
        #video["stream"] = stream_high
        video["stream"] = stream_mobile
        return video
Example #10
def extract_video(video):
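    # extract the bare vid from the last res entry's URL (e.g. ".../<vid>.m3u8" or ".../<vid>.mp4")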
    m = re.match(".+/(.+).(?=m3u8|mp4)", video.res[len(video.res) - 1].vid)
    if m:
        vid = m.group(1)
    else:
        raise Exception("No vid found.")

    item = VideoItemModel({
        'url': "http://m.iqiyi.com/play.html?tvid=%s&vid=%s" % (video._id, vid),
        'title': video._n,
        'duration': int(video._dn),
        'description': video.desc,
    })

    return item
Example #11
def get_videos(album_id, url_key):
    list_videos = api_video(album_id)
    list_videos = list_videos.get('items')

    videos = []
    for item in list_videos:
        video = VideoItemModel({
            "title": item['title'],
            "url": "http://www.tudou.com/albumplay/%s/%s.html" % (url_key, item['itemCode']),
            "image": item['item_img_hd'],
            "duration": int(item['duration']),
            "stream": [{
                "url": "javascript:getUrl('tudou', '%s')" % item['vcode']
            }]
        })
        videos.append(video)
    return videos
Example #12
    def crawl(self):
        album_id = self.key
        if self.data['channel'] in SHORT_VIDEO:
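            # short-form videos: the page URL is built from characters of the album id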
            url = "http://v.qq.com/page/%s/%s/%s/%s.html" % (
                album_id[0], album_id[1], album_id[-1], album_id)
            pubtime = datetime.strptime(
                self.data["pubtime"], "%Y-%m-%d %H:%M:%S")
            video = VideoItemModel({
                "title": self.data["title"],
                "url": url,
                "stream": [{
                           "url": "javascript:getUrl('tencent', '%s')" % url
                           }],
                "image": self.data["image"],
                "channel": self.data["channel"],
            })
            model = VideoSourceModel({
                                     "source": self.data["source"],
                                     "source_id": album_id,
                                     "title": self.data["title"],
                                     "url": url,
                                     "image": self.data["image"],
                                     "channel": self.data["channel"],
                                     "pubtime": pubtime,
                                     "videos": [video]
                                     })
            export(model)
            self.data['to_album_id'] = model['to_album_id']
        else:
            album_url = "http://v.qq.com/detail/%s/%s.html" % (
                album_id[0], album_id)
            album_data = api_album(album_id[0], album_id)
            if album_data['trailer'] == 1:
                play_url = "http://v.qq.com/prev/%s/%s" % (
                    album_id[0], album_id)
            else:
                play_url = "http://v.qq.com/cover/%s/%s" % (
                    album_id[0], album_id)
            description = album_data.get("columndesc")
            if not description:
                description = album_data.get("desc")
            description = "".join(description.split())
            try:
                pubtime = datetime.strptime(self.data.get("pubtime"), "%Y")
            except:
                pubtime = datetime.utcfromtimestamp(0)

            videos = []
            columnid = album_data.get('columnid')
            rely = album_data.get('rely')
            if columnid:  # columnid != 0
                for video_dict in rely:
                    for year, months in video_dict.iteritems():
                        for month in months:
                            videolist_id = "%s_%s" % (year, month)
                            videos_data = api_video(columnid, videolist_id)
                            for video in videos_data['items']:
                                time = video.get('date')
                                time = datetime.strptime(time, "%Y-%m-%d")
                                url = "http://v.qq.com/cover/%s/%s.html" % (
                                    video.get('coverid')[0], video.get('coverid'))
                                video = VideoItemModel({
                                    "title": video.get('sectitle'),
                                    "description": video.get('breif'),
                                    "url": url,
                                    "stream": [{
                                               "url": "javascript:getUrl('tencent', '%s')" % url
                                               }],
                                    "image": video.get('snapurl'),
                                    "time": time
                                })
                                videos.append(video)
            if not columnid:  # columnid == 0, only one video
                for video in album_data['videos']:
                    videos.append(clean_video(video, play_url))

            # self.data is not None: export(data)
            if self.data:
                model = VideoSourceModel({
                    "source": self.data.get('source'),
                    "source_id": album_id,
                    "title": album_data['columnname'] if album_data['columnname'] else self.data["title"],
                    "image": self.data.get("image"),
                    "url": album_url,
                    "actors": self.data.get("actors"),
                    "directors": self.data.get("directors"),
                    "categories": self.data.get("categories"),
                    "channel": self.data.get("channel"),
                    "region": self.data.get("region"),
                    "description": description,
                    "pubtime": pubtime,
                    "videos": videos,
                })
            # self.data is None: crawl web data first
            # (http://v.qq.com/cover/x/xxxxx.html), and export(data)
            else:
                hxs = load_html(play_url)
                channel = hxs.select(
                    "//div[@class='mod_crumbs']/a[1]/text()").extract()[0]
                album_hxs = hxs.select(
                    "//div[@class='mod_video_intro mod_video_intro_rich']")
                image = album_hxs.select("a/img/@src").extract()[0]
                title = album_hxs.select(
                    "div[@class='video_title']/strong/a/text()").extract()[0]
                directors = []
                for director_hxs in album_hxs.select("//div[@itemprop='director']/a"):
                    director = director_hxs.select("span/text()").extract()[0]
                    directors.append(director)
                actors = []
                for actor_hxs in album_hxs.select("//div[@itemprop='actors']/a"):
                    actor = actor_hxs.select("span/text()").extract()[0]
                    actors.append(actor)
                region = album_hxs.select(
                    "//div[@class='info_area']/span[@class='content']/a/text()").extract()[0]
                categories = []
                for categorie_hxs in album_hxs.select("//div[@class='info_category']/span[@class='content']/a"):
                    categorie = categorie_hxs.select("text()").extract()[0]
                    categories.append(categorie)
                pubtime = album_hxs.select(
                    "//div[@class='info_years']/span[@class='content']/a/text()").extract()[0]
                if re.match("^\d+$", pubtime):
                    pubtime = datetime.strptime(pubtime, "%Y")
                else:
                    pubtime = datetime.utcfromtimestamp(0)

                model = VideoSourceModel({
                    "source": self.data.get('source'),
                    "source_id": album_id,
                    "title": title,
                    "image": image,
                    "url": album_url,
                    "actors": actors,
                    "directors": directors,
                    "categories": categories,
                    "channel": channel,
                    "region": region,
                    "description": description,
                    "pubtime": pubtime,
                    "videos": videos,
                })
            export(model)
            self.data['to_album_id'] = model['to_album_id']
Example #13
    def crawl(self):
        type = 4
        album_id = self.key
        title = self.data['title'].encode('utf-8')
        channel = self.data.get('channel')

        if channel in LONG_VIDEO_CHANNELS.values():
            album_data = api_album(type, album_id, title)
            album_data = album_data['data']
            pubtime = album_data.get("public_time")
            pubtime = datetime.strptime(pubtime, "%Y%m%d")

            videos = []
            for video in album_data['data']:
                video = clean_video(video)
                videos.append(video)

            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": title,
                "image": album_data.get("bpic"),
                "image2": album_data.get("mpic"),
                "url": album_data.get("web_url"),
                "actors": album_data.get("actors"),
                "directors": album_data.get("director"),
                "categories": album_data.get("tname"),
                "tags": self.data.get("tags"),
                "channel": channel,
                "region": album_data.get("zname")[0],
                "description": album_data.get("introduce"),
                "pubtime": pubtime,
                "videos": videos,
            })
        else:
            video = VideoItemModel({
                "title": title,
                "description": self.data.get("description"),
                "url": "http://www.56.com/u13/v_%s.html" % album_id,
                "stream": [{
                    "url": "http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % album_id
                }],
                "stream_high": [{
                    "url": "http://vxml.56.com/html5/%s/?src=3g&res=qvga" % album_id
                }]
            })
            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": title,
                "image": self.data.get("bpic"),
                "image2": self.data.get("mpic"),
                "tags": self.data.get("tags"),
                "url": self.data.get("web_url"),
                "channel": channel,
                "description": self.data.get("introduce"),
                "videos": [video],
            })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
Example #14
    def crawl(self):
        source_id = self.key
        album_data = api_album(source_id, pcode, version)
        album_data = album_data['body']
        title = album_data.get("nameCn")
        pubtime = album_data.get("releaseDate")
        if re.match("^\d+$", pubtime):
            pubtime = datetime.strptime(pubtime, "%Y")
        elif re.match("^\d+-\d+-\d+$", pubtime):
            pubtime = datetime.strptime(pubtime, "%Y-%m-%d")
        else:
            pubtime = datetime.utcfromtimestamp(0)
        directors = album_data.get("directory").split(" ")
        actors = album_data.get("starring").split(" ")
        desc = album_data.get("description")
        desc = "".join(desc.split())
        region = album_data.get("area")
        categories = album_data.get("subCategory").split(" ")
        tags = album_data.get("tag").split(" ")
        url = "http://so.letv.com/tv/%s.html" % source_id

        videos = []
        b = 1
        s = 60
        o = -1
        m = 0
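        # b/s/o/m are passed straight through to api_series; presumably paging and
        # ordering parameters (this is an assumption, the API itself is not shown here)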
        series_data = api_series(source_id, b, s, o, m, pcode, version)
        for series in series_data['body']['videoInfo']:
            id = series['id']
            mid = series['mid']
            url = "http://www.letv.com/ptv/vplay/%s.html" % id
            vurl = "http://dynamic.app.m.letv.com/android/dynamic.php?mod=minfo&ctl=videofile&act=index&mmsid=%s&pcode=%s&version=%s" % (
                mid, pcode, version)
            jsurl = "javascript:getUrl('letv', '%s')" % vurl
            video = VideoItemModel({
                "title": series.get("nameCn"),
                "url": url,
                "stream": [{
                    "url": jsurl
                }],
                "image": series.get("picAll"),
                "duration": series.get("duration")
            })
            videos.append(video)

        model = VideoSourceModel({
            "source_id": source_id,
            "source": self.data.get('source'),
            "url": url,
            "channel": self.data['channel'],
            'title': title,
            "image": self.data['image'],
            "pubtime": pubtime,
            "directors": directors,
            "actors": actors,
            "desc": desc,
            "region": region,
            "categories": categories,
            "tags": tags,
            "videos": videos
        })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
Example #15
    def crawl(self):
        album_url = "http://www.265zy.com/detail/?%s.html" % self.key
        hxs = load_html(album_url)

        urls = hxs.select(
            "//td[@class='bt']/.//input[@id='copy_yah']/@value").extract()
        videos = []
        for url in urls:
            m = re.match("qvod://(.+)", url)
            if not m:
                continue
            words = m.group(1).split("|")
            size = int(words[0])
            #md5 = words[1]
            title = words[2].split(".")[0]

            videos.append(
                VideoItemModel({
                    "title": title,
                    "url": url,
                    "stream": [{
                        "url": url,
                        "format": "qvod",
                        "size": size
                    }],
                }))

        kv = {}
        for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
            texts = s.select(".//text()").extract()
            if len(texts) >= 2:
                kv[texts[0].strip()] = texts[1].strip()

        description = "\n".join(
            hxs.select("//div[@class='intro']/.//text()").extract())
        try:
            image = urlparse.urljoin(
                "http://www.265zy.com/",
                hxs.select("//div[@class='img']/img/@src").extract()[0])
        except:
            image = None

        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": self.key,
            "title": self.data["title"],
            "image": image,
            "url": album_url,
            "time": self.data.get('time'),
            "categories": [self.data.get('category')],
            "channel": self.data.get('category'),
            "region": self.data.get('region'),
            "videos": videos,
            "actors": split(kv.get(u"影片演员:")),
            "pubtime": parse_date(kv.get(u"上映日期:")),
            "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
            "description": description,
        })
        export(model)
Example #16
    def process_album(self, item):
        sites = {}
        fangying_id = re.findall("f_(.+)\.html", item['link'])[0]

        for play in item['plays']:
            site = play['site']
            if site not in SITES:
                continue

            if play["url"].find("fangying.com") != -1:
                stream = []
            else:
                format = "thunder" if site == "thunder" else ""
                stream = [{"url": play["url"], "format": format}]

            video = VideoItemModel({
                "title": play["title"],
                "url": play["url"],
                "stream": stream,
            })

            if not sites.has_key(site):
                sites[site] = []
            sites[site].append(dict(video))

        model = None
        for site, videos in sites.iteritems():
            model = VideoSourceModel({
                "source": self.data['source'],
                "source_id": fangying_id,
                "videos": videos,
                "title": item['title'],
                "directors": item['directors'].split("/"),
                "actors": item['performers'].split("/"),
                "description": item['description'],
                'categories': item['genres'].split("/"),
                'region': item['countries'].split("/")[0],
                'duration': parse_duration(item['duration']),
                'image': item['avatar_middle'],
                'score': float(item['douban_rating']) if item.get('douban_rating') else None,
                'url': item['link'],
                'price': 0.0,
                'pubtime': parse_pubtime(item['release_time']),
                'channel': CHANNELS.get(self.key)
            })
            export(model)

        if model:
            Scheduler.schedule(RelationCrawler.type,
                               key=fangying_id,
                               data={
                                   'title': model['title'],
                                   'url': model['url']
                               })
Example #17
    def crawl(self):
        # key is the album's source-site ID
        album_id = self.key

        album_url = "http://bdzy.cc/detail/?%s.html" % album_id
        hxs = load_html(album_url)

        urls = hxs.select("//td[@class='bt']/.//li/input/@value").extract()
        videos = []
        for url in urls:
            m = re.match("bdhd://(.+)", url)
            if not m:
                continue
            words = m.group(1).split("|")
            size = int(words[0])
            #md5 = words[1]
            title = words[2].split(".")[0]

            # video episode
            video = VideoItemModel({
                            "title" : title,
                            "url" : url, # web page URL (not available here, so the playback URL is used)
                            "stream" : [
                                        {
                                         "url" : url, # video file playback URL
                                         "size" : size,
                                         "format" : "bdhd" # video format (protocol)
                                        }],
                            })

            videos.append(video)

        kv = {}
        for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
            texts = s.select(".//text()").extract()
            if len(texts) >= 2:
                kv[texts[0].strip()] = texts[1].strip()

        description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())

        try:
            image = hxs.select("/html/body/table[2]/tr[1]/td[1]/img/@src").extract()[0]
        except:
            image = None

        # data model exported for the video album
        model = VideoSourceModel({
                                 "source" : self.data['source'], # video source
                                 "source_id" : album_id, # source-site ID
                                 "title" : self.data["title"],
                                 "url" : album_url, # web page URL
                                 "image" : image, # image URL
                                 "time" : self.data.get('time'), # source-site update time
                                 "categories" : [self.data.get('category')], # categories
                                 "channel" : self.data.get('category'), # channel
                                 "region" : self.data.get('region'), # region
                                 "videos" : videos, # array of videos in the album
                                 "pubtime" : parse_date(kv.get(u"上映日期:")), # release date
                                 "actors" : split(kv.get(u"影片演员:")),
                                 "completed" : kv.get(u"影片状态:", "").find(u"连载") == -1, # completed or still serializing
                                 "description" : description,
                                 })
        # export the data
        export(model)