コード例 #1
0
ファイル: fengxing.py プロジェクト: hitflame/ContentService
    def crawl(self):
        """Load the detail record for ``self.key`` and export it as an album.

        The record comes from ``loadurl(DETAIL % self.key)``. When the channel
        in ``self.data`` equals the movie literal below, a single video is
        built from the 'tv' entry of ``pinfos.mpurls``; otherwise one video is
        built per episode. The resulting ``VideoSourceModel`` is passed to
        ``export`` and its ``to_album_id`` is stored back into ``self.data``.

        Missing 'plots' / 'rinfo' fields are tolerated (empty description,
        ``None`` pubtime) instead of raising ``AttributeError`` on ``None``.
        """
        videos = []
        mid = self.key
        detail = loadurl(DETAIL % mid)

        # Drop every whitespace character from the synopsis; a missing
        # 'plots' field yields '' rather than crashing on None.split().
        description = ''.join((detail.get('plots') or '').split())

        # NOTE(review): this literal looks like mis-decoded text (possibly the
        # Chinese word for "movie"); it is kept byte-for-byte -- confirm the
        # encoding of the real source file.
        if self.data.get('channel') == u'鐢靛奖':
            tv = detail['pinfos']['mpurls']['tv']
            video = VideoItemModel({
                "title": self.data.get('title'),
                "url": MOVIE_PLAY % mid,  # web page address
                "image": self.data.get('image'),
                "description": description,
                "stream": [{
                    'url': tv.get('url'),
                    'size': tv.get('bits'),
                    'format': 'mp4',
                }],
            })
            videos.append(video)
        else:
            pinfos = detail['pinfos']
            try:
                # Preferred layout: episodes live under the first 'sort' key.
                sort = pinfos.get('sort')[0]
                episodes = pinfos['content'][sort]['fsps']
            except (AttributeError, KeyError, IndexError, TypeError):
                # Fallback layout: episodes sit directly under 'pinfos'.
                episodes = pinfos['fsps']

            for episode in episodes:
                plots = ''.join((episode.get('plots') or '').split())
                video = VideoItemModel({
                    "title": episode.get('taskname'),
                    "url": PLAY_URL % (mid, episode.get('number')),  # web page address
                    "image": episode.get('picurl'),
                    "description": plots,
                    "stream": getstream(episode.get('mpurls')),
                })
                videos.append(video)

        # Only the part of 'rinfo' before the first space is parsed as a date.
        rinfo = detail.get('rinfo')
        pubtime = parse_date(rinfo.split(' ')[0]) if rinfo is not None else None

        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": mid,  # source-site ID
            "title": self.data["title"],
            "url": detail.get('shareurl'),  # detail page address
            "image": self.data.get('image'),  # image url
            "categories": self.data.get('category'),  # categories
            "channel": self.data.get('channel'),  # channel
            "region": detail.get('country'),  # region
            "videos": videos,  # videos of the album
            "pubtime": pubtime,  # release time
            "actors": detail.get('lactor'),
            "directors": detail.get('director'),
            "description": description,
        })
        # Export the data
        export(model)
        self.data['to_album_id'] = model['to_album_id']
コード例 #2
0
ファイル: fenghuang.py プロジェクト: hitflame/ContentService
 def crawl(self):
     """Export the single video described by ``self.data`` as a one-item album.

     Builds a ``VideoItemModel`` with mid/low/high mp4 streams that share one
     duration (``gettime`` of 'videoLength', default '00:00'), wraps it in a
     ``VideoSourceModel``, exports it and stores the model's ``to_album_id``
     back into ``self.data``.
     """
     info = self.data
     duration = gettime(info.get('videoLength', '00:00'))

     def mp4_stream(url_key, size_key):
         # One-element stream list read from the given URL / size keys.
         return [{
             "url": info.get(url_key),  # playback address of the video file
             "size": info.get(size_key),
             "format": "mp4",  # video format (protocol)
             "duration": duration
         }]

     item = VideoItemModel({
         "title": info.get('title'),
         "url": info.get('videoURLMid'),  # web page address
         "image": info.get('imgURL'),
         "description": info.get('desc'),
         "stream": mp4_stream('videoURLMid', 'videoSizeMid'),
         "stream_low": mp4_stream('videoURLLow', 'videoSizeLow'),
         "stream_high": mp4_stream('videoURLHigh', 'videoSizeHigh'),
     })

     model = VideoSourceModel({
         "source": info.get('source'),
         "source_id": info.get('id'),  # source-site ID
         "title": info.get("title"),
         "url": info.get('shareurl'),  # detail page address
         "image": info.get('imgURL'),  # image url
         "channel": CHANNEL,  # channel
         "videos": [item],  # videos of the album
         "pubtime": parse_date(info.get('videoPublishTime')),  # release time
         "description": info.get('desc'),
     })
     # Export the data
     export(model)
     self.data['to_album_id'] = model['to_album_id']
コード例 #3
0
    def crawl(self):
        """Scrape the zyqvod album page for ``self.key`` and export it.

        Collects every ``qvod://`` link on the page as a ``VideoItemModel``,
        reads the label/value paragraphs of the detail box into a dict, and
        exports a ``VideoSourceModel`` combining them with ``self.data``.
        """
        album_url = "http://zyqvod.com/vod/index.asp?id=%s" % self.key
        hxs = load_html(album_url)

        urls = hxs.select("//div[@class='movievod']/li/input/@value").extract()
        videos = []
        for url in urls:
            m = re.match("qvod://(.+)", url)
            if not m:
                continue
            # The payload is '|'-separated: field 0 is parsed as the size,
            # field 2 is the file name (field 1 is noted as md5 and unused).
            words = m.group(1).split("|")
            size = int(words[0])
            title = words[2].split(".")[0]

            videos.append(VideoItemModel({
                "title": title,
                "url": url,
                "stream": [{"url": url, "format": "qvod", "size": size}],
            }))

        # Map the first text node of each detail paragraph to the second one.
        kv = {}
        for s in hxs.select("//div[@class='videoDetail']/p"):
            texts = s.select(".//text()").extract()
            if len(texts) >= 2:
                kv[texts[0].strip()] = texts[1].strip()

        description = "\n".join(hxs.select("//div[@class='movievod']/p[2]/text()").extract())
        try:
            image = hxs.select("//div[@class='videoPic']/img/@src").extract()[0]
        except IndexError:
            # No poster on the page.
            image = None

        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": self.key,
            "title": self.data["title"],
            "time": self.data.get('time'),
            "url": album_url,
            "image": image,
            "categories": [self.data.get('category')],
            "channel": self.data.get('category'),
            "region": self.data.get('region'),
            "videos": videos,
            "actors": split(kv.get(u'影片主演:')),
            "directors": split(kv.get(u'影片导演:')),
            "pubtime": parse_date(kv.get(u'上映年份:')),
            "description": description,
            # "completed" used to appear twice in this literal; the later
            # entry always won, so only that effective value is kept.
            "completed": not kv.get(u'连载状态:'),
        })
        export(model)
コード例 #4
0
ファイル: hakuzy.py プロジェクト: LandyGuo/ContentService
    def crawl(self):
        """Scrape the hakuzy album page for ``self.key`` and export it.

        Collects every ``qvod://`` link on the page as a ``VideoItemModel``,
        reads the label/value rows of the info table into a dict, and exports
        a ``VideoSourceModel`` combining them with ``self.data``.
        """
        album_url = "http://hakuzy.com/detail/?%s.html" % self.key
        hxs = load_html(album_url)

        urls = hxs.select("//td[@class='bt']/.//input[@id='copy_yah']/@value").extract()
        videos = []
        for url in urls:
            m = re.match("qvod://(.+)", url)
            if not m:
                continue
            # The payload is '|'-separated: field 0 is parsed as the size,
            # field 2 is the file name (field 1 is noted as md5 and unused).
            words = m.group(1).split("|")
            size = int(words[0])
            title = words[2].split(".")[0]

            videos.append(VideoItemModel({
                "title": title,
                "url": url,
                "stream": [{"url": url, "format": "qvod", "size": size}],
            }))

        # Map the first text node of each info-table row to the second one.
        # NOTE(review): this XPath depends on explicit <tbody> elements being
        # present in the parsed document -- confirm against the live markup.
        kv = {}
        for s in hxs.select("/html/body/table[4]/tbody/tr[1]/td[2]/table/tbody/tr"):
            texts = s.select(".//text()").extract()
            if len(texts) >= 2:
                kv[texts[0].strip()] = texts[1].strip()

        description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())
        try:
            image = hxs.select("//div[@class='img']/img/@src").extract()[0]
        except IndexError:
            # No poster on the page.
            image = None

        model = VideoSourceModel({
            "source": SOURCE,
            "source_id": self.key,
            "title": self.data["title"],
            "time": self.data.get('time'),
            "url": album_url,
            "image": image,
            "categories": [self.data.get('category')],
            "channel": self.data.get('category'),
            "region": self.data.get('region'),
            "videos": videos,
            "pubtime": parse_date(kv.get(u"上映日期:")),
            "actors": split(kv.get(u"影片演员:")),
            "directors": split(kv.get(u"影片导演:")),
            # Completed unless the status text contains the "serialising" marker.
            "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
            "description": description,
        })
        export(model)
コード例 #5
0
ファイル: scripts.py プロジェクト: hitflame/ContentService
def import_douban():
    """Import every document of the ``douban.album`` Mongo collection.

    Each document is converted to a ``VideoSourceModel`` (channel fixed to
    movie, source ``"douban"``), ``on_import()`` is called on it and its
    title is printed.
    """
    from contentservice.utils.datetimeutil import parse_date
    from contentservice.settings import MONGO_CONN_STR
    db = MongoClient(MONGO_CONN_STR).douban

    # NOTE(review): interactive breakpoint; every run stops here until the
    # debugger is continued. Kept as-is -- confirm it is still wanted.
    pdb.set_trace()

    # Compiled once instead of on every call / every item. Raw strings keep
    # the backslashes literal without relying on unknown-escape passthrough.
    zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')
    paren_pattern = re.compile(r"\(.*\)")
    id_pattern = re.compile(r"/(\d+)/")

    def clean_title(title):
        # For titles containing CJK characters keep only the part before the
        # first space; other titles are returned unchanged.
        if zh_pattern.search(title):
            return title.split(" ")[0]
        return title

    for item in db.album.find():
        pubtime = None
        if item['pub_time']:
            # Strip any parenthesised suffix from the first pub_time entry.
            pubtime = parse_date(paren_pattern.sub("", item['pub_time'][0]))

        model = VideoSourceModel({
            "title": clean_title(item['title']),
            "categories": item['sub_category'],
            "image": item["img"],
            "related": item["related"],
            "score": item["score"],
            "actors": item["actors"],
            "region": item["area"][0] if item["area"] else None,
            "url": item["url"],
            "description": item["description"],
            "pubtime": pubtime,
            "channel": u"电影",
            "source": "douban",
            # First all-digit path segment of the URL.
            "source_id": id_pattern.findall(item['url'])[0],
        })
        model.on_import()
        # Single-argument parenthesised form prints the same on Python 2 and 3.
        print(model['title'])
コード例 #6
0
ファイル: scripts.py プロジェクト: LandyGuo/ContentService
def import_mtime():
    """Import every document of the ``mtime.album`` Mongo collection.

    Each document is normalised (scalar fields wrapped into lists, list
    description joined with newlines, ``category_id`` mapped to a channel
    name), converted to a ``VideoSourceModel`` and imported via
    ``on_import()``. Items that fail are reported and skipped; the title is
    printed only for items that were imported successfully.
    """
    from datetime import datetime
    from contentservice.models.video import VideoSourceModel
    from contentservice.settings import MONGO_CONN_STR
    from contentservice.utils.datetimeutil import parse_date
    db = MongoClient(MONGO_CONN_STR).mtime
    # NOTE(review): interactive breakpoint; every run stops here until the
    # debugger is continued. Kept as-is -- confirm it is still wanted.
    pdb.set_trace()

    for item in db.album.find():
        # A list-valued area is reduced to its first entry.
        area = item.get("area") or None
        if isinstance(area, list):
            area = area[0]
        categories = item.get("type")
        if isinstance(categories, basestring):
            categories = [categories]
        description = item.get("description")
        if isinstance(description, list):
            description = "\n".join(description)
        tags = item.get("tags")
        if isinstance(tags, basestring):
            tags = [tags]
        channel = ""
        if item.get("category_id") == "1":
            channel = u"电影"
        elif item.get("category_id") == "0":
            channel = u"电视剧"
        try:
            model = VideoSourceModel({
                "source": "mtime",
                "source_id": item["id"],
                "title": item["title"],
                "description": description,
                "tags": tags,
                "time": datetime.strptime(item["create_time"], "%Y-%m-%d %H:%M:%S"),
                "duration": item.get("duration"),
                "region": area,
                "directors": item.get("directors"),
                "score": item.get("score"),
                "actors": item.get("actors"),
                "categories": categories,
                "channel": channel,
                "url": "http://movie.mtime.com/%s/" % item["id"],
                "pubtime": parse_date(item["release_time"]),
            })
            model.on_import()
        except Exception as e:
            # Best-effort import: report the failure and go to the next item.
            # Previously the title print below still ran, raising NameError
            # on a first-item failure or printing the previous item's title.
            print(e)
            continue
        print(model["title"])
コード例 #7
0
ファイル: fenghuang.py プロジェクト: LandyGuo/ContentService
 def crawl(self):
     """Export the single video described by ``self.data`` as a one-item album.

     Builds a ``VideoItemModel`` with mid/low/high mp4 streams that share one
     duration (``gettime`` of 'videoLength', default '00:00'), wraps it in a
     ``VideoSourceModel``, exports it and stores the model's ``to_album_id``
     back into ``self.data``.
     """
     info = self.data
     duration = gettime(info.get('videoLength', '00:00'))

     def mp4_stream(url_key, size_key):
         # One-element stream list read from the given URL / size keys.
         return [{
             "url": info.get(url_key),  # playback address of the video file
             "size": info.get(size_key),
             "format": "mp4",  # video format (protocol)
             "duration": duration
         }]

     item = VideoItemModel({
         "title": info.get('title'),
         "url": info.get('videoURLMid'),  # web page address
         "image": info.get('imgURL'),
         "description": info.get('desc'),
         "stream": mp4_stream('videoURLMid', 'videoSizeMid'),
         "stream_low": mp4_stream('videoURLLow', 'videoSizeLow'),
         "stream_high": mp4_stream('videoURLHigh', 'videoSizeHigh'),
     })

     model = VideoSourceModel({
         "source": info.get('source'),
         "source_id": info.get('id'),  # source-site ID
         "title": info.get("title"),
         "url": info.get('shareurl'),  # detail page address
         "image": info.get('imgURL'),  # image url
         "channel": CHANNEL,  # channel
         "videos": [item],  # videos of the album
         "pubtime": parse_date(info.get('videoPublishTime')),  # release time
         "description": info.get('desc'),
     })
     # Export the data
     export(model)
     self.data['to_album_id'] = model['to_album_id']
コード例 #8
0
ファイル: scripts.py プロジェクト: LandyGuo/ContentService
def import_douban():
    """Import every document of the ``douban.album`` Mongo collection.

    Each document is converted to a ``VideoSourceModel`` (channel fixed to
    movie, source ``"douban"``), ``on_import()`` is called on it and its
    title is printed.
    """
    from contentservice.utils.datetimeutil import parse_date
    from contentservice.settings import MONGO_CONN_STR
    db = MongoClient(MONGO_CONN_STR).douban

    # NOTE(review): interactive breakpoint; every run stops here until the
    # debugger is continued. Kept as-is -- confirm it is still wanted.
    pdb.set_trace()

    # Compiled once instead of on every call / every item. Raw strings keep
    # the backslashes literal without relying on unknown-escape passthrough.
    zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')
    paren_pattern = re.compile(r"\(.*\)")
    id_pattern = re.compile(r"/(\d+)/")

    def clean_title(title):
        # For titles containing CJK characters keep only the part before the
        # first space; other titles are returned unchanged.
        if zh_pattern.search(title):
            return title.split(" ")[0]
        return title

    for item in db.album.find():
        pubtime = None
        if item['pub_time']:
            # Strip any parenthesised suffix from the first pub_time entry.
            pubtime = parse_date(paren_pattern.sub("", item['pub_time'][0]))

        model = VideoSourceModel({
            "title": clean_title(item['title']),
            "categories": item['sub_category'],
            "image": item["img"],
            "related": item["related"],
            "score": item["score"],
            "actors": item["actors"],
            "region": item["area"][0] if item["area"] else None,
            "url": item["url"],
            "description": item["description"],
            "pubtime": pubtime,
            "channel": u"电影",
            "source": "douban",
            # First all-digit path segment of the URL.
            "source_id": id_pattern.findall(item['url'])[0],
        })
        model.on_import()
        # Single-argument parenthesised form prints the same on Python 2 and 3.
        print(model['title'])
コード例 #9
0
ファイル: zy265.py プロジェクト: hitflame/ContentService
    def crawl(self):
        """Scrape the 265zy album page for ``self.key`` and export it.

        Collects every ``qvod://`` link on the page as a ``VideoItemModel``,
        reads the label/value rows of the info table into a dict, and exports
        a ``VideoSourceModel`` combining them with ``self.data``.
        """
        album_url = "http://www.265zy.com/detail/?%s.html" % self.key
        hxs = load_html(album_url)

        urls = hxs.select(
            "//td[@class='bt']/.//input[@id='copy_yah']/@value").extract()
        videos = []
        for url in urls:
            m = re.match("qvod://(.+)", url)
            if not m:
                continue
            # The payload is '|'-separated: field 0 is parsed as the size,
            # field 2 is the file name (field 1 is noted as md5 and unused).
            words = m.group(1).split("|")
            size = int(words[0])
            title = words[2].split(".")[0]

            videos.append(
                VideoItemModel({
                    "title": title,
                    "url": url,
                    "stream": [{
                        "url": url,
                        "format": "qvod",
                        "size": size
                    }],
                }))

        # Map the first text node of each info-table row to the second one.
        kv = {}
        for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
            texts = s.select(".//text()").extract()
            if len(texts) >= 2:
                kv[texts[0].strip()] = texts[1].strip()

        description = "\n".join(
            hxs.select("//div[@class='intro']/.//text()").extract())

        image_srcs = hxs.select("//div[@class='img']/img/@src").extract()
        try:
            # The src is resolved against the site root.
            image = urlparse.urljoin("http://www.265zy.com/", image_srcs[0])
        except IndexError:
            # No poster on the page.
            image = None

        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": self.key,
            "title": self.data["title"],
            "image": image,
            "url": album_url,
            "time": self.data.get('time'),
            "categories": [self.data.get('category')],
            "channel": self.data.get('category'),
            "region": self.data.get('region'),
            "videos": videos,
            "actors": split(kv.get(u"影片演员:")),
            "pubtime": parse_date(kv.get(u"上映日期:")),
            # Completed unless the status text contains the "serialising" marker.
            "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
            "description": description,
        })
        export(model)
コード例 #10
0
ファイル: scripts.py プロジェクト: hitflame/ContentService
def import_mtime():
    """Import every document of the ``mtime.album`` Mongo collection.

    Each document is normalised (scalar fields wrapped into lists, list
    description joined with newlines, ``category_id`` mapped to a channel
    name), converted to a ``VideoSourceModel`` and imported via
    ``on_import()``. Items that fail are reported and skipped; the title is
    printed only for items that were imported successfully.
    """
    from datetime import datetime
    from contentservice.models.video import VideoSourceModel
    from contentservice.settings import MONGO_CONN_STR
    from contentservice.utils.datetimeutil import parse_date
    db = MongoClient(MONGO_CONN_STR).mtime
    # NOTE(review): interactive breakpoint; every run stops here until the
    # debugger is continued. Kept as-is -- confirm it is still wanted.
    pdb.set_trace()

    for item in db.album.find():
        # A list-valued area is reduced to its first entry.
        area = item.get("area") or None
        if isinstance(area, list):
            area = area[0]
        categories = item.get("type")
        if isinstance(categories, basestring):
            categories = [categories]
        description = item.get("description")
        if isinstance(description, list):
            description = "\n".join(description)
        tags = item.get("tags")
        if isinstance(tags, basestring):
            tags = [tags]
        channel = ""
        if item.get("category_id") == "1":
            channel = u"电影"
        elif item.get("category_id") == "0":
            channel = u"电视剧"
        try:
            model = VideoSourceModel({
                "source": "mtime",
                "source_id": item["id"],
                "title": item["title"],
                "description": description,
                "tags": tags,
                "time": datetime.strptime(item["create_time"], "%Y-%m-%d %H:%M:%S"),
                "duration": item.get("duration"),
                "region": area,
                "directors": item.get("directors"),
                "score": item.get("score"),
                "actors": item.get("actors"),
                "categories": categories,
                "channel": channel,
                "url": "http://movie.mtime.com/%s/" % item["id"],
                "pubtime": parse_date(item["release_time"]),
            })
            model.on_import()
        except Exception as e:
            # Best-effort import: report the failure and go to the next item.
            # Previously the title print below still ran, raising NameError
            # on a first-item failure or printing the previous item's title.
            print(e)
            continue
        print(model["title"])
コード例 #11
0
    def crawl(self):
        """Scrape the bdzy album page for ``self.key`` and export it.

        Collects every ``bdhd://`` link on the page as a ``VideoItemModel``,
        reads the label/value rows of the info table into a dict, and exports
        a ``VideoSourceModel`` combining them with ``self.data``.
        """
        # key is the album's source-site ID
        album_id = self.key

        album_url = "http://bdzy.cc/detail/?%s.html" % album_id
        hxs = load_html(album_url)

        urls = hxs.select("//td[@class='bt']/.//li/input/@value").extract()
        videos = []
        for url in urls:
            m = re.match("bdhd://(.+)", url)
            if not m:
                continue
            # The payload is '|'-separated: field 0 is parsed as the size,
            # field 2 is the file name (field 1 is noted as md5 and unused).
            words = m.group(1).split("|")
            size = int(words[0])
            title = words[2].split(".")[0]

            # One episode of the album
            video = VideoItemModel({
                "title": title,
                "url": url,  # web page address (none here, so the playback address is used)
                "stream": [{
                    "url": url,  # playback address of the video file
                    "size": size,
                    "format": "bdhd",  # video format (protocol)
                }],
            })

            videos.append(video)

        # Map the first text node of each info-table row to the second one.
        kv = {}
        for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
            texts = s.select(".//text()").extract()
            if len(texts) >= 2:
                kv[texts[0].strip()] = texts[1].strip()

        description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())

        try:
            image = hxs.select("/html/body/table[2]/tr[1]/td[1]/img/@src").extract()[0]
        except IndexError:
            # No poster on the page.
            image = None

        # Data model for the exported album
        model = VideoSourceModel({
            "source": self.data['source'],  # video source
            "source_id": album_id,  # source-site ID
            "title": self.data["title"],
            "url": album_url,  # web page address
            "image": image,  # image url
            "time": self.data.get('time'),  # source-site update time
            "categories": [self.data.get('category')],  # categories
            "channel": self.data.get('category'),  # channel
            "region": self.data.get('region'),  # region
            "videos": videos,  # list of the album's videos
            "pubtime": parse_date(kv.get(u"上映日期:")),  # release time
            "actors": split(kv.get(u"影片演员:")),
            # Completed unless the status text contains the "serialising" marker.
            "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
            "description": description,
        })
        # Export the data
        export(model)