Example #1
    def firstSpider(self, seed):
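        # Parse a playlist/album page: take name and poster from the album
        # header block, derive mainId from the seed URL, estimate the page
        # count from the item total, and feed each paginated sub-URL to
        # secondSpider.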
        name = ""
        poster = ""
        maidId = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        name_p = soup.find('div', attrs={'class': 'bodanAlbum-msg-lt-pic fl'})
        if name_p is not None:
            name = name_p.find('a').get('title')
            if name_p.find('img') is not None:
                poster = name_p.find('img').get("src")

        mainId_re = re.search(r'com/(.*?)\.', seed)
        if mainId_re:
            mainId = mainId_re.group(1)

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['mainId'] = mainId
        pages = 0
        total_num_p = soup.find('li', attrs={'class': 'itemOne'})
        if total_num_p is not None:
            total_num_re = re.search(r'\d+', total_num_p.get_text())
            if total_num_re:
                total_num = total_num_re.group()
                pages = int(total_num) / 20 + 2
        if pages != 0:
            for page in range(1, pages):
                sub_seed = seed.replace('.html', '-' + str(page) + '.html')
                self.secondSpider(sub_seed)
Example #2
    def firstSpider(self, seed):
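        # Parse a v.qq.com user channel page: name from the nick span, mainId
        # from the vplus URL, then use the video count and visited_euin flag
        # to build paginated vchannelinfo API calls for secondSpider.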
        name = ""
        mainId = ""
        programNum = ''
        uniqueFlag = ''

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        name_p = soup.find("span",
                           attrs={
                               'class': 'txt',
                               'id': 'userInfoNick'
                           })
        if name_p is not None:
            name = name_p.get_text()
        self.program["name"] = spiderTool.changeName(name)

        mainId_p = re.search(r'http://v.qq.com/vplus/(.*?)/videos', seed)
        if mainId_p:
            mainId = mainId_p.group(1)
        self.program["mainId"] = mainId

        if re.search(r'共(\d+)个视频', doc):
            programNum = re.search(r'共(\d+)个视频', doc).group(1)
        if re.search(r"visited_euin : '(.*?)'", doc):
            uniqueFlag = re.search(r"visited_euin : '(.*?)'", doc).group(1)
        if programNum != '' and uniqueFlag != '':
            pages = range(1, int(programNum) / 25 + 2)
        else:
            return
        for page in pages:
            sub_url = 'http://c.v.qq.com/vchannelinfo?uin=%s&qm=1&pagenum=%s&num=25&sorttype=0&orderflag=1&callback=jQuery' % (
                uniqueFlag, page)
            self.secondSpider(sub_url)
Example #3
    def firstSpider(self, seed):
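        # Parse an iQiyi video page: read tvid/videoid from the player markup,
        # fetch the cache.video.qiyi.com JSON for the metadata fields, derive
        # mainId from the seed URL, then call secondSpider.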
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""
        shootYear = ""
        area = ""

        doc = spiderTool.getHtmlBody(seed)
        tvId = re.search(r'data-player-tvid="([^"]+?)"', doc)
        videoId = re.search(r'data-player-videoid="([^"]+?)"', doc)
        if tvId and videoId:
            newUrl = 'http://cache.video.qiyi.com/vi/%s/%s/' % (
                tvId.group(1), videoId.group(1))
            doc = spiderTool.getHtmlBody(newUrl)
        else:
            return

        try:
            json_data = json.loads(doc)
            name = json_data["shortTitle"]
            poster = json_data["apic"]
            star = json_data["ma"].replace("|", ",")
            director = json_data["d"].replace("|", ",")
            ctype = json_data["tg"].replace(" ", ",")
            area = json_data["ar"]
            intro = json_data["info"]
        except:
            return

        # special deal: normalize the "华语" area tag to u'中国'
        if re.search(r'华语'.decode('utf-8'), ''.join(area)):
            area = u'中国'

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        if re.match(r'http://www\.iqiyi\.com/v_(\w+)\.html', seed):
            mainId = re.match(r'http://www\.iqiyi\.com/v_(\w+)\.html',
                              seed).group(1)
        elif re.match(r'http://www\.iqiyi\.com/dianying/(\d{8})/', seed):
            mainId = re.match(r'http://www\.iqiyi\.com/dianying/(\d{8})/',
                              seed).group(1)
        self.program["mainId"] = mainId

        self.secondSpider()
Example #4
    def firstSpider(self, detail):
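        # Parse one Youku list item (detail is a BeautifulSoup tag): poster
        # and name from the thumbnail image, pcUrl from the link, mainId from
        # the current date plus the video id, then hand off to secondSpider.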
        name = ""
        poster = ""
        point = 0.0
        shootYear = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        pcUrl = ""
        mainId = ""
        area = ""

        div_tag = detail.find("div", attrs={"class": "p-thumb"})
        if div_tag is None:
            return
        a_tag = div_tag.find("a", attrs={"target": "_blank"})
        if a_tag is None:
            return

        img = div_tag.find("img")
        if img is not None:
            poster_P = img.get("src")
            if poster_P is not None:
                poster = poster_P
            name_P = img.get("alt")
            if name_P is not None:
                name = name_P

        pcUrl_P = a_tag.get("href")
        if pcUrl_P is not None:
            pcUrl = "http:%s" % pcUrl_P

        mainId_P = re.search(r'http://v.youku.com/v_show/id_(.+)==', pcUrl)
        if mainId_P:
            mainId = time.strftime('%Y%m%d', time.localtime(
                time.time())) + mainId_P.group(1)

        shootYear = time.strftime('%Y', time.localtime(time.time()))
        intro = name

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['point'] = point
        self.program['shootYear'] = shootYear
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['pcUrl'] = pcUrl
        self.program['mainId'] = mainId
        self.secondSpider(pcUrl, name, poster)
Example #5
    def firstSpider(self, seed):
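        # Fetch detail JSON by POSTing the movieId (seed) to postdetailurl,
        # fill name, poster, intro and the actor list from videoDetailView,
        # then call secondSpider with the inner video id.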
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""
        jsondata = {}

        postdata = {'movieId': seed}
        doc = self.http_post(self.postdetailurl, postdata,
                             self.http_post_header)
        try:
            data = json.loads(doc)
            # data = json.loads(doc, object_pairs_hook=OrderedDict)
        except:
            return

        mainId = seed

        if data.get('data') and data['data'].get('videoDetailView'):
            jsondata = data['data']['videoDetailView']

            if jsondata.get('brief'):
                intro = jsondata['brief']
            if jsondata.get('actors'):
                actor_list = jsondata['actors']
                star_list = []
                for actor in actor_list:
                    star_list.append(actor['name'])
                star = ','.join(star_list)
            if jsondata.get('title'):
                name = jsondata['title']
            if jsondata.get('cover'):
                poster = jsondata['cover']

        else:
            return

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        if jsondata.get('id'):
            self.secondSpider(jsondata.get('id'))
Example #6
    def firstSpider(self, seed):
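        # Parse a film.qq.com detail page: poster from the cover span, cast
        # from the actor block, director/type/year/area from the labelled
        # spans, intro from the detail text, mainId from the URL, then request
        # the loadplaylist feed via secondSpider.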
        poster = ""
        star = ""
        director = ""
        ctype = ""
        shootYear = ""
        intro = ""
        mainId = ""
        area = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        poster_p = soup.find("span",attrs={'class': 'a_cover'})
        if poster_p is not None:
            poster = poster_p.find("img").get("lz_src")

        starts_P = soup.find("dd", attrs={"class": "actor"})
        if starts_P is not None:
            start_list = []
            for each in starts_P.find_all('a'):
                start_list.append(each.get_text())
            star = ",".join(start_list)

        detail = soup.find("dd", attrs={"class": "type"})
        for each in detail.find_all("span",  attrs={"class": "item"}):
            if re.search("导演:".decode("utf8"),each.get_text()):
                director = each.get_text().split("导演:".decode("utf8"))[1]
            if re.search("类型:".decode("utf8"),each.get_text()):
                ctype = each.get_text().split("类型:".decode("utf8"))[1]
            if re.search("年份:".decode("utf8"),each.get_text()):
                shootYear = each.get_text().split("年份:".decode("utf8"))[1]
            if re.search("地区:".decode("utf8"),each.get_text()):
                area = each.get_text().split("地区:".decode("utf8"))[1]

        content_p = soup.find('p',  attrs={"class": "detail_all"})
        if content_p is not None:
            if re.search("简介:".decode("utf8"),content_p.get_text()):
                intro = content_p.get_text().split("简介:".decode("utf8"))[1].strip()

        if re.match(r'http://film\.qq\.com/(page|cover)/.*/(.*?).html',seed):
            mainId = re.match(r'http://film\.qq\.com/(page|cover)/.*/(.*?).html',seed).group(2)
        name = soup.find("h3",attrs={'class':'film_name'}).get_text()
        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url',poster)
        self.program['star'] = spiderTool.listStringToJson('name',star)
        self.program['director'] = spiderTool.listStringToJson('name',director)
        self.program['ctype'] = spiderTool.listStringToJson('name',ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId
        seed_sub = 'http://s.video.qq.com/loadplaylist?vkey=897_qqvideo_cpl_%s_qq' % mainId
        self.secondSpider(seed_sub)
Example #7
    def firstSpider(self, seed):
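        # Parse one Youku result card (seed is already a BeautifulSoup tag):
        # poster, name and duration from the thumbnail, pcUrl and mainId from
        # the link, a date stamp for clips published within the last day,
        # then call secondSpider.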
        name = ""
        poster = ""
        pcUrl  = ""
        star = ""
        ctype = "美女".decode("utf8")
        shootYear = ""
        intro = ""
        mainId = ""
        area = ""
        point = 0.0
        playTimes = 0

        playLength = ""
        setNum = ""

        soup = seed
        poster_p = soup.find("div",attrs={'class':'v-thumb'})
        if poster_p is not None:
            if poster_p.find('img') is not None:
                poster = poster_p.find("img").get("src")
                name = poster_p.find("img").get("alt")
            if poster_p.find('span', attrs={'class': 'v-time'}):
                playLength = poster_p.find('span', attrs={'class': 'v-time'}).get_text()

        pcUrl_p = soup.find('div', attrs={'class': 'v-link'})
        if pcUrl_p is not None:
            atag = pcUrl_p.find('a')
            if atag is not None:
                pcUrl = atag.get('href')
        mainId_p = re.search(r'http://v.youku.com/v_show/(.*?)\.html', pcUrl)
        if mainId_p:
            mainId = mainId_p.group(1)

        setNum_p = soup.find('span', attrs={'class': 'r'})
        if setNum_p is not None:
            setNum_text = setNum_p.get_text()
            if re.search(r'(\d+)分钟前'.decode('utf8'), setNum_text):
                setNum = time.strftime('%Y%m%d',time.localtime(time.time()))
            elif re.search(r'\d+小时前'.decode('utf8'), setNum_text):
                setNum = time.strftime('%Y%m%d',time.localtime(time.time()))

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url',poster)
        self.program['pcUrl'] = pcUrl
        self.program['star'] = spiderTool.listStringToJson('name',star)
        self.program['ctype'] = spiderTool.listStringToJson('name',ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId
        self.program['point'] = point
        self.program['playTimes'] = playTimes
        self.secondSpider(setNum, name, pcUrl, playLength, poster)
Example #8
    def firstSpider(self, seed):
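        # Parse a Youku self-channel page: name and poster from the avatar
        # block, intro from the channel description, mainId hard-coded for
        # this channel, then page through the fun_ajaxload video list via
        # secondSpider.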
        name = ""
        poster = ""
        star = "木下佑香".decode('utf8')
        ctype = ""
        shootYear = ""
        intro = ""
        mainId = "UMzMzODQ1Njg5Ng"
        area = ""
        point = 0.0
        playTimes = 0

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        poster_p = soup.find("div", attrs={'class': 'head-avatar'})
        if poster_p is not None:
            poster_a = poster_p.find('a')
            if poster_a is not None:
                name = poster_a.get("title")
            if poster_p.find('img') is not None:
                poster = poster_p.find("img").get("src")

        content_p = soup.find('div', attrs={"class": "userintro"})
        if content_p is not None:
            content = content_p.find('div', attrs={'class': 'desc'})
            if content is not None:
                intro = content.get_text().strip().split(
                    '自频道介绍:'.decode('utf8'))[1]

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId
        self.program['point'] = point
        self.program['playTimes'] = playTimes

        pages = 0
        total_num_p = soup.find_all('div', attrs={'class': 'title'})
        for item in total_num_p:
            if re.search(r'视频[^\\]*?\((\d+)\)'.decode('utf8'),
                         item.get_text()):
                total_num = re.search(r'\((\d+)\)', item.get_text()).group(1)
                pages = int(total_num) / 40 + 2
        if pages != 0:
            for page in range(1, pages):
                sub_seed = 'http://i.youku.com/u/UMzMzODQ1Njg5Ng==/videos/fun_ajaxload/?page_num=%d&page_order=0' % (
                    page)
                self.secondSpider(sub_seed)
Example #9
    def firstSpider(self, seed):
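        # iQiyi single-video variant of the detail spider: fetch metadata from
        # the cache.video.qiyi.com JSON, take mainId from the seed URL, and
        # append a single programSub entry instead of calling secondSpider.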
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""
        shootYear = ""

        doc = spiderTool.getHtmlBody(seed)
        tvId = re.search(r'data-player-tvid="([^"]+?)"', doc)
        videoId = re.search(r'data-player-videoid="([^"]+?)"', doc)
        if tvId and videoId:
            newUrl = 'http://cache.video.qiyi.com/vi/%s/%s/' % (tvId.group(1), videoId.group(1))
            doc = spiderTool.getHtmlBody(newUrl)
        else:
            return

        try:
            json_data = json.loads(doc)
            name = json_data["shortTitle"]
            poster = json_data["apic"]
            star = json_data["ma"].replace("|", ",")
            director = json_data["d"].replace("|", ",")
            ctype = json_data["tg"].replace(" ", ",")
            programLanguage = json_data["ar"]
            intro = json_data["info"]
        except:
            return

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url',poster)
        self.program['star'] = spiderTool.listStringToJson('name',star)
        self.program['director'] = spiderTool.listStringToJson('name',director)
        self.program['ctype'] = spiderTool.listStringToJson('name',ctype)
        self.program['programLanguage'] = programLanguage
        self.program['intro'] = intro
        if re.match(r'http://www\.iqiyi\.com/v_(\w+)\.html', seed):
            mainId = re.match(r'http://www\.iqiyi\.com/v_(\w+)\.html', seed).group(1)
        elif re.match(r'http://www\.iqiyi\.com/dianying/(\d{8})/', seed):
            mainId = re.match(r'http://www\.iqiyi\.com/dianying/(\d{8})/', seed).group(1)
        self.program["mainId"] = mainId
        self.program_sub = copy.deepcopy(PROGRAM_SUB)
        self.program_sub['setNumber'] = 1
        self.program_sub['setName'] = name
        self.program_sub['webUrl'] = seed
        self.program['programSub'].append(self.program_sub)
Example #10
    def firstSpider(self, seed):
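        # Parse one result card (seed is a BeautifulSoup tag): pcUrl and
        # mainId from the mmwu.tv link, poster from the img, caption text as
        # the name, then call secondSpider.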
        name = ""
        poster = ""
        pcUrl  = ""
        star = ""
        ctype = "美女".decode("utf8")
        shootYear = ""
        intro = ""
        mainId = ""
        area = ""
        point = 0.0
        playTimes = 0

        playLength = ""
        setNum = ""

        soup = seed
        atag = soup.find("a")
        if atag is None:
            return

        pcUrl = atag.get('href')

        poster_p = atag.find('img')
        if poster_p is not None:
            poster = poster_p.get('src')

        name_p = atag.find('p')
        if name_p is not None:
            name = "美女写真_".decode("utf8") + name_p.get_text()


        mainId_p = re.search(r'http://www\.mmwu\.tv/vod/(.*?)\.html', pcUrl)
        if mainId_p:
            mainId = mainId_p.group(1)

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url',poster)
        self.program['pcUrl'] = pcUrl
        self.program['star'] = spiderTool.listStringToJson('name',star)
        self.program['ctype'] = spiderTool.listStringToJson('name',ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId
        self.program['point'] = point
        self.program['playTimes'] = playTimes
        self.secondSpider( name, pcUrl, playLength, poster)
Example #11
    def firstSpider(self, program):
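        # Build a single-episode news record from a list item: reset
        # self.program, take poster/name from the img and pcUrl/mainId from
        # the link, then serialize the finished program into self.jsonData.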
        self.program = copy.deepcopy(BASE_CONTENT["program"])
        self.program["ptype"] = "新闻".decode("utf8")
        self.program['website'] = '优酷'.decode("utf8")
        self.program['getTime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime())
        self.program['totalSets'] = 1
        poster = ""
        name = ""
        mainId = ""
        pcUrl = ""

        poster_p = program.find("img")
        if poster_p is not None:
            poster = poster_p.get("src")
            name = poster_p.get("alt")

        pcUrl_P = program.find("a")
        if pcUrl_P is not None:
            pcUrl = "http:%s" % pcUrl_P.get("href")
        if re.search(r'id_(.*?)='.decode('utf8'), pcUrl):
            mainId = re.search(r'id_(.*?)='.decode('utf8'), pcUrl).group(1)

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['pcUrl'] = pcUrl
        self.program['mainId'] = mainId
        #print name,mainId,pcUrl,poster
        if self.program['name'] == '' or self.program['name'] is None \
                or self.program['mainId'] == '' or self.program['mainId'] is None:
            return

        json.dumps(PROGRAM_SUB)
        content = {'program': self.program}
        jsonStr = json.dumps(content)
        self.jsonData = jsonStr + "\n"
Example #12
    def firstSpider(self, seed):
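        # Parse a Tencent video detail page: poster/name from the itemprop
        # image, area, language and release year from the type panel, tags as
        # ctype, cast split into directors and stars, mainId from the URL,
        # then fetch the loadplaylist feed via secondSpider.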
        poster = ""
        star = ""
        director = ""
        ctype = ""
        shootYear = ""
        intro = ""
        mainId = ""
        area = ""
        name = ""
        programLanguage = ""
        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        poster_p = soup.find("img", attrs={'itemprop': 'image'})
        if poster_p is not None:
            if poster_p.get("src") is not None:
                poster = poster_p.get("src")
                poster = "http:%s" % poster
            if poster_p.get("alt") is not None:
                name = poster_p.get("alt")

        video_type = soup.find("div", attrs={"class": "video_type cf"})
        if video_type is not None:
            divs = video_type.find_all('div', attrs={"class": "type_item"})
            for div in divs:
                text = div.get_text()
                if re.search(r'地 区:'.decode("utf8"), text):
                    area_p = div.find('span', attrs={"class": "type_txt"})
                    if area_p is not None:
                        area = area_p.get_text()
                if re.search(r'语 言:'.decode("utf8"), text):
                    programLanguage_p = div.find('span',
                                                 attrs={"class": "type_txt"})
                    if programLanguage_p is not None:
                        programLanguage = programLanguage_p.get_text()
                if re.search(r'上映时间:'.decode("utf8"), text):
                    shootYear_p = div.find('span', attrs={"class": "type_txt"})
                    if shootYear_p is not None:
                        shootYear = shootYear_p.get_text().split("-")[0]

        ctype_P = soup.find("div", attrs={"class": "tag_list"})
        if ctype_P is not None:
            ctype_list = []
            for each in ctype_P.find_all('a'):
                ctype_list.append(each.get_text())
            ctype = ",".join(ctype_list)

        person_p = soup.find("ul", attrs={"class": "actor_list cf"})
        if person_p is not None:
            lis = person_p.find_all('li')
            director_list = []
            star_list = []
            for li in lis:
                person = ""
                star_p = li.find("span", attrs={'class': 'name'})
                if star_p is not None:
                    person = star_p.get_text()
                if person != "" and li.find("span",
                                            attrs={"class": "director"}):
                    director_list.append(person)
                elif person != "":
                    star_list.append(person)
            star = ",".join(star_list)
            director = ",".join(director_list)

        content_p = soup.find('span',
                              attrs={"class": "txt _desc_txt_lineHight"})
        if content_p is not None:
            intro = content_p.get_text().strip()

        if re.match(r'http://film\.qq\.com/(page|cover)/.*/(.*?).html', seed):
            mainId = re.match(
                r'http://film\.qq\.com/(page|cover)/.*/(.*?).html',
                seed).group(2)
        if re.match(r'https://v\.qq\.com/detail/[\w\d]/(.*?).html', seed):
            mainId = re.match(r'https://v\.qq\.com/detail/[\w\d]/(.*?).html',
                              seed).group(1)

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['programLanguage'] = programLanguage
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        seed_sub = 'https://s.video.qq.com/loadplaylist?vkey=897_qqvideo_cpl_%s_qq' % mainId
        self.secondSpider(seed_sub)
Example #13
    def firstSpider(self, seed):
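        # Parse an XML-style detail document: mainId from the videoId query
        # parameter, each field read from its tag (with a regex fallback) and
        # stripped of CDATA markers, then the videomergeinfolist node is
        # passed to secondSpider.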
        name = ""
        poster = ""
        point = 0.0
        shootYear = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        if re.search(r'videoId=(?P<pid>\d+)', seed):
            mainId = re.search(r'videoId=(?P<pid>\d+)', seed).group('pid')

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        data = soup.find('video')
        if data is None:
            return

        name_p = data.find('name')
        name_r = re.search(r'<name>(.*?)</name>', doc)
        if name_p is not None:
            name = name_p.get_text().replace('<![CDATA[',
                                             '').replace(']]>', '').strip()
        elif name_r:
            name = name_r.group(1).replace('<![CDATA[',
                                           '').replace(']]>', '').strip()

        poster_p = data.find('smallImg')
        poster_r = re.search(r'<smallImg>(.*?)</smallImg>', doc)
        if poster_p is not None:
            poster = poster_p.get_text().replace('<![CDATA[',
                                                 '').replace(']]>',
                                                             '').strip()
        elif poster_r:
            poster = poster_r.group(1).replace('<![CDATA[',
                                               '').replace(']]>', '').strip()

        shootYear_P = data.find('screenTime')
        shootYear_r = re.search(r'<screenTime>(.*?)</screenTime>', doc)
        if shootYear_P is not None:
            shootYear = shootYear_P.get_text().replace('<![CDATA[',
                                                       '').replace(']]>',
                                                                   '').strip()
        elif shootYear_r:
            shootYear = shootYear_r.group(1).replace('<![CDATA[',
                                                     '').replace(']]>',
                                                                 '').strip()

        area_P = data.find('area')
        area_r = re.search(r'<area>(.*?)</area>', doc)
        if area_P is not None:
            area = area_P.get_text().replace('<![CDATA[',
                                             '').replace(']]>', '').strip()
        elif area_r:
            area = area_r.group(1).replace('<![CDATA[',
                                           '').replace(']]>', '').strip()

        director_P = data.find('director')
        director_r = re.search(r'<director>(.*?)</director>', doc)
        if director_P is not None:
            director = director_P.get_text().replace('<![CDATA[',
                                                     '').replace(']]>',
                                                                 '').strip()
        elif director_r:
            director = director_r.group(1).replace('<![CDATA[',
                                                   '').replace(']]>',
                                                               '').strip()

        star_P = data.find('performer')
        star_r = re.search(r'<performer>(.*?)</performer>', doc)
        if star_P is not None:
            star = star_P.get_text().replace('<![CDATA[',
                                             '').replace(']]>', '').strip()
        elif star_r:
            star = star_r.group(1).replace('<![CDATA[',
                                           '').replace(']]>', '').strip()

        ctype_P = data.find('cate')
        ctype_r = re.search(r'<cate>(.*?)</cate>', doc)
        if ctype_P is not None:
            ctype = ctype_P.get_text().replace('<![CDATA[',
                                               '').replace(']]>', '').strip()
        elif ctype_r:
            ctype = ctype_r.group(1).replace('<![CDATA[',
                                             '').replace(']]>', '').strip()

        intro_P = data.find('annotation')
        intro_r = re.search(r'<annotation>(.*?)</annotation>', doc)
        if intro_P is not None:
            intro = intro_P.get_text().replace('<![CDATA[',
                                               '').replace(']]>', '').strip()
        elif intro_r:
            intro = intro_r.group(1).replace('<![CDATA[',
                                             '').replace(']]>', '').strip()

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['point'] = point
        self.program['shootYear'] = shootYear.split("-")[0]
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId
        detail_sub = data.find('videomergeinfolist')
        if detail_sub is not None:
            self.secondSpider(detail_sub)
Example #14
    def firstSpider(self, seed):
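        # Parse a le.com page: locate the album pid in the page source, fetch
        # the mobile detail JSON for the metadata fields, then request the
        # paginated videolist JSON via secondSpider.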
        name = ""
        poster = ""
        point = 0.0
        shootYear = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        pid = ""
        doc = spiderTool.getHtmlBody(seed)
        pid_P = re.search(r'pid:\s*(?P<pid>\d+),', doc)
        pid_P1 = re.search(r'pid:\s*"(?P<pid>\d+)"', doc)
        if pid_P:
            pid = pid_P.group('pid')
        elif pid_P1:
            pid = pid_P1.group('pid')
        else:
            print "not pid"

        if pid == "" or pid == "0":
            soup = BeautifulSoup(doc, from_encoding="utf8")
            more_tag = soup.find("a", attrs={'class': 'more'})
            if more_tag is not None:
                url = more_tag.get('href')
                if url is not None:
                    if re.search(r'http://www\.le\.com/tv/(?P<pid>\d+)\.html,',
                                 doc):
                        pid = re.search(
                            r'http://www\.le\.com/tv/(?P<pid>\d+)\.html,',
                            doc).group('pid')

        if pid != '' and pid != '0':
            seed = 'http://static.app.m.letv.com/android/mod/mob/ctl/album/act/detail/id/%s/pcode/010110014/version/5.2.3.mindex.html' % (
                pid)
        else:
            return

        mainId = pid
        doc = spiderTool.getHtmlBody(seed)
        json_data = json.loads(doc)
        if json_data.has_key("body"):
            if type(json_data["body"]) is types.DictionaryType:
                json_data = json_data["body"]
            else:
                return

        if json_data.has_key("nameCn"):
            name = json_data["nameCn"]

        if json_data.has_key("nameCn"):
            name = json_data["nameCn"]
        if json_data.has_key("picCollections"):
            poster_dict = json_data["picCollections"]
            if type(poster_dict) is types.DictionaryType:
                if poster_dict.has_key('400*300'):
                    poster = poster_dict['400*300']
                if poster == "":
                    for each in poster_dict:
                        if poster_dict[each] != "":
                            poster = poster_dict[each]

        if json_data.has_key("score"):
            try:
                point = float(json_data["score"])
            except:
                point = 0.0

        if json_data.has_key("releaseDate"):
            shootYear_P = json_data["releaseDate"]
            if re.search(r'(\d{4})-\d{2}-\d{2}', shootYear_P):
                shootYear = re.search(r'(\d{4})-\d{2}-\d{2}',
                                      shootYear_P).group(1)
            elif re.search(r'^\d{4}$', shootYear_P):
                shootYear = shootYear_P
        if json_data.has_key("directory"):
            director_P = json_data["directory"]
            if type(director_P) is types.UnicodeType:
                director = director_P.strip().replace(" ", ",")

        if json_data.has_key("starring"):
            star_P = json_data["starring"]
            if type(star_P) is types.UnicodeType:
                star = star_P.strip().replace(" ", ",")

        if json_data.has_key("area"):
            area_P = json_data["area"]
            if type(area_P) is types.UnicodeType:
                area = area_P.strip().replace(" ", ",")

        if json_data.has_key("subCategory"):
            ctype_P = json_data["subCategory"]
            if type(ctype_P) is types.UnicodeType:
                ctype = ctype_P.strip().replace(" ", ",")

        if json_data.has_key('description'):
            intro = json_data['description']

        if json_data.has_key('language'):
            programLanguage = json_data['language']

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['point'] = point
        self.program['shootYear'] = shootYear
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        seed_sub = "http://static.app.m.letv.com/android/mod/mob/ctl/videolist/act/detail/id/%s/vid/25520328/b/1/s/60/o/-1/m/0/pcode/010110014/version/5.2.3.mindex.html" % (
            pid)
        self.secondSpider(seed_sub)
Example #15
    def firstSpider(self, detail):
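        # Parse one site-piclist item (detail is a BeautifulSoup tag): the
        # release date gives shootYear, the thumbnail gives poster, name and
        # pcUrl, mainId combines the date with the iQiyi page id, then call
        # secondSpider.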
        name = ""
        poster = ""
        point = 0.0
        shootYear = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        pcUrl = ""
        mainId = ""
        area = ""

        public = ""
        div_public = detail.find("div", attrs={"class": "site-piclist_info"})
        if div_public is not None:
            public_P = div_public.find('div', attrs={"class": "role_info"})
            if public_P is not None:
                public = public_P.get_text().strip()

        if re.search(r'\d+-\d+-\d+', public):
            shootYear = public.split("-")[0]
        else:
            public = ""

        div_tag = detail.find("div", attrs={"class": "site-piclist_pic"})
        if div_tag is None:
            return
        a_tag = div_tag.find("a", attrs={"target": "_blank"})
        if a_tag is None:
            return

        img = a_tag.find("img")
        if img is not None:
            poster_P = img.get("src")
            if poster_P is not None:
                poster = poster_P
            name_P = img.get("alt")
            if name_P is not None:
                name = name_P

        pcUrl_P = a_tag.get("href")
        if pcUrl_P is not None:
            pcUrl = pcUrl_P

        mainId_P = re.search(r'http://www\.iqiyi\.com/(\w+)\.html', pcUrl)
        if mainId_P:
            mainId = mainId_P.group(1)

        if shootYear != "":
            mainId = public.replace('-', '') + mainId

        intro = name

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['point'] = point
        self.program['shootYear'] = shootYear
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['pcUrl'] = pcUrl
        self.program['mainId'] = mainId

        self.secondSpider(pcUrl, name, poster, public)
Example #16
    def firstSpider(self, seed):
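        # Parse an mgtv.com URL: pid and mainId from the URL, metadata from
        # the hunantv getbyid JSON, then request the full episode list via
        # secondSpider.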
        name = ""
        poster = ""
        point = 0.0
        shootYear = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        pid = ""
        pid_P = re.search(
            r'http://www.mgtv.com/\w/\d+/(?P<mainId>\d+)/\w/(?P<pid>\d+)\.html',
            seed)
        if pid_P:
            pid = pid_P.group('pid')
            mainId = pid_P.group('mainId')

        seedJson = ""
        if pid != '':
            seedJson = 'http://m.api.hunantv.com/video/getbyid?videoId=%s' % (
                pid)
        else:
            return
        if mainId == "":
            mainId = pid

        doc = spiderTool.getHtmlBody(seedJson)
        if re.search(r'<html>', doc):
            doc = spiderTool.getHtmlBody(seedJson)
        json_data = json.loads(doc.strip())

        if json_data.has_key("data"):
            if type(json_data["data"]) is types.DictionaryType:
                json_data = json_data["data"]
                if json_data.has_key("detail"):
                    json_data = json_data["detail"]
                else:
                    return
            else:
                return

        if json_data.has_key("collectionName"):
            name = json_data["collectionName"]

        if json_data.has_key("image"):
            poster = json_data["image"]

        if json_data.has_key("year"):
            shootYear = json_data["year"]

        if json_data.has_key("director"):
            director_P = json_data["director"]
            if type(director_P) is types.UnicodeType:
                director = director_P.strip().replace(" / ", ",")

        if json_data.has_key("player"):
            star_P = json_data["player"]
            if type(star_P) is types.UnicodeType:
                star = star_P.strip().replace(" / ", ",")

        if json_data.has_key("area"):
            area_P = json_data["area"]
            if type(area_P) is types.UnicodeType:
                area = area_P.strip().replace(" ", ",")

        if json_data.has_key('desc'):
            intro = json_data['desc']

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['point'] = point
        self.program['shootYear'] = shootYear
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        if json_data.has_key('typeId'):
            typeId = json_data['typeId']
        if json_data.has_key('collectionId'):
            collectionId = json_data['collectionId']

        pageNum = 20
        if json_data.has_key("totalvideocount"):
            pageNum = json_data["totalvideocount"]
        seed_sub = "http://m.api.hunantv.com/video/getListV2?videoId=%s&pageId=0&pageNum=%s" % (
            pid, pageNum)
        self.secondSpider(seed_sub, seed, pid)
Example #17
    def firstSpider(self, seed):
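        # Parse an mgtv.com page: mainId from the URL, director, star, area,
        # type and intro from the info panel, name from the episode-list API,
        # then page through that API and pass each page to secondSpider.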
        name = ""
        poster = ""
        point = 0.0
        shootYear = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        mainId_p = re.findall(r'http://www\.mgtv\.com/\w/\d+/(.*)\.html', seed)
        if mainId_p:
            mainId = mainId_p[0]

        # get
        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")
        videoinfo = soup.find('div',
                              attrs={'class': 'v-panel-info v-panel-mod'})
        if videoinfo is not None:
            pinfo_list = videoinfo.find_all('p')
            for pinfo in pinfo_list:
                pinfo_str = str(pinfo)
                # print(pinfo_str)
                if re.search(r'导演:'.decode('utf-8'), pinfo_str.decode('utf8')):
                    director = pinfo.find('a').get_text()
                    if re.search(r'暂无'.decode('utf-8'), director) or re.search(
                            r'未知'.decode('utf-8'), director):
                        director = ""
                elif re.search(r'主演:'.decode('utf-8'),
                               pinfo_str.decode('utf8')):
                    star_p = pinfo.find_all('a')
                    star_list = []
                    for li in star_p:
                        text = li.get_text()
                        if not (re.search(r'暂无'.decode('utf-8'), text)
                                or re.search(r'未知'.decode('utf-8'), text)):
                            star_list.append(text)
                    star = ','.join(star_list)
                elif re.search(r'地区:'.decode('utf-8'),
                               pinfo_str.decode('utf8')):
                    area_p = pinfo.find_all('a')
                    area_list = []
                    for li in area_p:
                        area_list.append(li.get_text())
                    area = ','.join(area_list)
                elif re.search(r'类型:'.decode('utf-8'),
                               pinfo_str.decode('utf8')):
                    ctype_p = pinfo.find_all('a')
                    ctype_list = []
                    for li in ctype_p:
                        ctype_list.append(li.get_text())
                    ctype = ','.join(ctype_list)
                elif re.search(r'简介:'.decode('utf-8'),
                               pinfo_str.decode('utf8')):
                    intro_p = pinfo.find('span', attrs={'class': 'details'})
                    if intro_p is not None:
                        intro = intro_p.get_text()

        seedJson = ""
        if mainId != '':
            seedJson = "http://pcweb.api.mgtv.com/episode/list?video_id=%s&page=0&size=40" % mainId
        else:
            return

        json_data = ""
        total_pages = 1
        current_page = 0
        doc = spiderTool.getHtmlBody(seedJson)
        try:
            data = json.loads(doc)
        except:
            # print("load json error1111!")
            return
        if data.get('data') is None:
            # print("get html error1111")
            return

        json_data = data['data']
        if json_data.get('total_page'):
            total_pages = json_data['total_page']
        if json_data.get('info'):
            name = json_data['info']['title']
            if intro == "":
                intro = json_data['info']['desc']
        # if json_data.get('current_page'):
        #     current_page = json_data['current_page']

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['point'] = point
        self.program['shootYear'] = shootYear
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        for pageNo in range(1, total_pages + 1):
            subseed = "http://pcweb.api.mgtv.com/episode/list?video_id=%s&page=%s&size=40" % (
                mainId, pageNo)
            self.secondSpider(subseed)
Example #18
    def firstSpider(self, seed):
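        # Parse a bilibili video page: cover image as poster, page title as
        # name, intro and tags from the v_info block, mainId from the URL,
        # then call secondSpider.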
        poster = ""
        star = ""
        director = ""
        ctype = ""
        shootYear = ""
        intro = ""
        mainId = ""
        area = ""
        name = ""
        programLanguage = ""
        point = -1.0
        doubanPoint = -1.0
        poster = ""
        playTimes = 0
        pcUrl = ""
        duration = ""
        alias = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        #get poster
        poster_p = soup.find('img', attrs={'class': 'cover_image'})
        if poster_p is not None:
            poster = poster_p.get('src')
            if not re.match(r'http://', poster):
                poster = "http:%s" % poster

        #get name
        page_body = soup.find('div', attrs={'class': 'b-page-body'})
        if page_body is not None:
            name_p = page_body.find('div', attrs={'class': 'v-title'})
            if name_p is not None:
                name = name_p.get_text()

        v_info = soup.find('div', attrs={'class': 'v_info'})
        if v_info is not None:
            #get intro
            intro_p = v_info.find('div', attrs={'class': 'intro'})
            if intro_p is not None:
                intro = intro_p.get_text().strip()

            #get ctype
            ctype_list = []
            ctype_p = v_info.find('div', attrs={'class': 's_tag'})
            if ctype_p is not None:
                ctype_p = ctype_p.find_all('li')
                for li in ctype_p:
                    ctype_list.append(li.get_text())
                ctype = ','.join(ctype_list)

        if re.match(r'http://www\.bilibili\.com/video/(.*)/', seed):
            mainId = re.match(r'http://www\.bilibili\.com/video/(.*)/',
                              seed).group(1)

        self.program['name'] = spiderTool.changeName(name)
        self.program["alias"] = spiderTool.changeName(alias)
        self.program["point"] = float(point)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['playTimes'] = long(playTimes)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        self.secondSpider()
Example #19
    def firstSpider(self, seed):
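        # Parse an iQiyi album page (two page layouts): poster and name from
        # either the focus list or the result picture, credits from whichever
        # detail block is present, intro from the "more" text, then page
        # through the avlist API via secondSpider.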
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        poster_p = soup.find("ul", attrs={'class': 'focus_img_list'})
        poster_p_1 = soup.find("div", attrs={"class": "result_pic pr"})
        if poster_p is not None:
            poster_p_sub = poster_p.find('li').get('style')
            if re.search(r'\((.*?)\)'.decode('utf8'), poster_p_sub):
                poster = re.search(r'\((.*?)\)'.decode('utf8'), poster_p_sub).group(1)
        elif poster_p_1 is not None:
            img_tag = poster_p_1.find("img")
            if img_tag is not None:
                poster = img_tag.get("src")
                name = img_tag.get("alt")

        name_p = soup.find('a', attrs={'class': 'white'})
        if name_p is not None:
            name = name_p.get_text()

        detail = soup.find("div", attrs={"class": "result_detail-minH"})
        detail_1 = soup.find("div", attrs={"class": "msg-hd-lt fl"})
        if detail is not None:
            for div_p in detail.find_all("div", attrs={"class": "topic_item clearfix"}):
                for each in div_p.find_all("div"):
                    a_list = []
                    for a_tag in each.find_all('a'):
                        a_list.append(a_tag.get_text())
                    a_str = ",".join(a_list)
                    if re.search("主演:".decode("utf8"),each.get_text()):
                        star = a_str
                    if re.search("导演:".decode("utf8"),each.get_text()):
                        director = a_str
                    if re.search("类型:".decode("utf8"),each.get_text()):
                        ctype = a_str
                    if re.search("语言:".decode("utf8"),each.get_text()):
                        programLanguage = a_str
                    if re.search("地区:".decode("utf8"),each.get_text()):
                        area = a_str

        elif detail_1 is not None:
            for p_tag in detail_1.find_all("p"):
                a_list = []
                for a_tag in p_tag.find_all('a'):
                    a_list.append(a_tag.get_text())
                a_str = ','.join(a_list)
                if re.search("导演:".decode("utf8"),p_tag.get_text()):
                    director = a_str
                if re.search("类型:".decode("utf8"),p_tag.get_text()):
                    ctype = a_str
                if re.search("语言:".decode("utf8"),p_tag.get_text()):
                    programLanguage = a_str
                if re.search("地区:".decode("utf8"),p_tag.get_text()):
                    area = a_str
                if re.search("主演:".decode("utf8"),p_tag.get_text()):
                    star = a_str

        content_p = soup.find('span',  attrs={"class": "showMoreText", "data-moreorless":"moreinfo", "style":"display: none;"})
        content_p_1 = soup.find('div',  attrs={"data-moreorless":"moreinfo"})
        if content_p is not None:
            if content_p.find("span"):
                content_p = content_p.find("span")
            if re.search("简介:".decode("utf8"),content_p.get_text()):
                intro = content_p.get_text().split("简介:".decode("utf8"))[1].strip()
        elif content_p_1 is not None:
            if re.search("简介:".decode("utf8"),content_p_1.get_text()):
                intro = content_p_1.get_text().split("简介:".decode("utf8"))[1].strip()


        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url',poster)
        self.program['star'] = spiderTool.listStringToJson('name',star)
        self.program['director'] = spiderTool.listStringToJson('name',director)
        self.program['ctype'] = spiderTool.listStringToJson('name',ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro

        seed_P = re.search(r'albumId:\s*(?P<albumId>\d+)[^\\]*?cid:\s*(?P<cid>\d+)'.decode('utf8'), doc)
        if seed_P:
            albumId = seed_P.group('albumId')
            cid = seed_P.group('cid')
            self.program['mainId'] = cid + "_" +albumId
            seed_sub = 'http://cache.video.iqiyi.com/jp/avlist/%s/' %(albumId)
            allNum = 0
            doc = spiderTool.getHtmlBody(seed_sub)
            try:
                json_data = json.loads(doc.split('=')[1])
                data = json_data["data"]["vlist"]
                allNum = json_data["data"]["allNum"]
            except:
                data = []
            for i in range(1,(int(allNum)/50 + 2)):
                seed_sub = 'http://cache.video.iqiyi.com/jp/avlist/%s/%s/50/' %(albumId, str(i))
                self.secondSpider(seed_sub)
Example #20
    def firstSpider(self, seed):
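        # Parse an iQiyi source page: poster and name from the cover image,
        # cast from the actor block, host/type/language from the topic items,
        # intro from the "more" text, then request the sdvlst feed via
        # secondSpider.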
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        poster_p = soup.find("img", attrs={'width': '195'})
        if poster_p is not None:
            poster = poster_p.get("src")
            name = poster_p.get("alt")

        starts_P = soup.find("dd", attrs={"class": "actor"})
        if starts_P is not None:
            start_list = []
            for each in starts_P.find_all('a'):
                start_list.append(each.get_text())
            star = ",".join(start_list)

        detail = soup.find("div", attrs={"class": "topic_item topic_item-rt"})
        if detail is not None:
            for div_p in detail.find_all("div", attrs={"class": "item"}):
                for each in div_p.find_all(True, recursive=False):
                    if re.search("主持人:".decode("utf8"), each.get_text()):
                        director = each.get_text().split(
                            "主持人:".decode("utf8"))[1]
                    if re.search("类型:".decode("utf8"), each.get_text()):
                        ctype = each.get_text().split("类型:".decode("utf8"))[1]
                    if re.search("语言:".decode("utf8"), each.get_text()):
                        programLanguage = each.get_text().split(
                            "语言:".decode("utf8"))[1]

        content_p = soup.find('span',
                              attrs={
                                  "class": "showMoreText",
                                  "data-moreorless": "moreinfo",
                                  "style": "display: none;"
                              })
        if content_p is not None:
            if content_p.find("span"):
                content_p = content_p.find("span")
            if re.search("简介:".decode("utf8"), content_p.get_text()):
                intro = content_p.get_text().split(
                    "简介:".decode("utf8"))[1].strip()

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro

        seed_P = re.search(
            r'sourceId:\s*(?P<sourceId>\d+),\s*cid:\s*(?P<cid>\d+)', doc)
        if seed_P:
            sourceId = seed_P.group('sourceId')
            cid = seed_P.group('cid')
            self.program['mainId'] = sourceId
            seed_sub = 'http://cache.video.iqiyi.com/jp/sdvlst/%s/%s/' % (
                cid, sourceId)
            self.secondSpider(seed_sub)
Example #21
    def firstSpider(self, seed):
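        # Parse a yinyuetai video page: name, poster, intro and area from the
        # Open Graph meta tags, mainId from the URL, then call secondSpider.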
        poster = ""
        star = ""
        director = ""
        ctype = ""
        shootYear = ""
        intro = ""
        mainId = ""
        area = ""
        name = ""
        programLanguage = ""
        point = -1.0
        doubanPoint = -1.0
        poster = ""
        playTimes = 0
        pcUrl = ""
        duration = ""
        alias = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")
        head = soup.find('head')

        meta_list = head.find_all('meta')
        for meta in meta_list:
            #get name
            if meta.get('property') == "og:title":
                name = meta.get('content')

            #get poster
            if meta.get('property') == "og:image":
                poster = meta.get('content')

            #get intro
            if meta.get('property') == "og:description":
                intro = meta.get('content')

            #get area
            if meta.get('property') == "og:area":
                area = meta.get('content')

        #get mainId
        if re.match(r'http://v\.yinyuetai\.com/video/(.*)', seed):
            mainId = re.match(r'http://v\.yinyuetai\.com/video/(.*)',
                              seed).group(1)

        self.program["alias"] = spiderTool.changeName(alias)
        self.program["point"] = float(point)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['playTimes'] = long(playTimes)
        self.program['intro'] = intro
        self.program['mainId'] = mainId
        self.program["name"] = spiderTool.changeName(name)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)

        self.secondSpider()
Ejemplo n.º 22
0
    def firstSpider(self, seed):
        name = ""
        pcUrl = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        point = ""
        mainId = ""
        area = ""

        if re.search(r'http://v.pptv.com/show/\w+\.html', seed):
            seed_post = seed.replace("/show/", "/page/")
        else:
            seed_post = seed

        doc = spiderTool.getHtmlBody(seed_post)

        soup = BeautifulSoup(doc, from_encoding="utf8")
        poster_P = soup.find("a", attrs={"class": "cover-a"})
        poster_P1 = soup.find("div", attrs={"class": "bd cf"})
        if poster_P is not None:
            img_tag = poster_P.find("img")
            if img_tag is not None:
                img = img_tag.get('data-src2')
                if img is not None:
                    poster = img
        elif poster_P1 is not None:
            img_tag = poster_P1.find("img")
            if img_tag is not None:
                img = img_tag.get('src')
                if img is not None:
                    poster = img

        if re.search(r'http://v.pptv.com/page/\w+\.html', seed):
            seed = seed.replace("/page/", "/show/")
        doc = spiderTool.getHtmlBody(seed)
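        # the show page embeds "pid" and "id" values (quoted or unquoted) that key the videoList and meta.json requests below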

        pid_P = re.search(r'"pid":\s*(?P<pid>\d+),'.decode('utf8'), doc)
        pid_P1 = re.search(r'"pid":\s*"(?P<pid>\d+)"'.decode('utf8'), doc)
        if pid_P:
            pid = pid_P.group('pid')
            seed_sub = 'http://v.pptv.com/show/videoList?&cb=videoList&pid=' + pid
        elif pid_P1:
            pid = pid_P1.group('pid')
            seed_sub = 'http://v.pptv.com/show/videoList?&cb=videoList&pid=' + pid
        else:
            return

        id_P = re.search(r'"id":\s*(?P<id>\d+),'.decode('utf8'), doc)
        id_P1 = re.search(r'"id":\s*"(?P<id>\d+)"'.decode('utf8'), doc)
        if id_P:
            id = id_P.group('id')
            seedData = 'http://svcdn.pptv.com/show/v1/meta.json?cid=%s&sid=%s&psize=50' % (
                id, pid)
            mainId = "dianying" + id
        elif id_P1:
            id = id_P1.group('id')
            mainId = "dianying" + id
            seedData = 'http://svcdn.pptv.com/show/v1/meta.json?cid=%s&sid=%s&psize=50' % (
                id, pid)
        else:
            return

        doc = spiderTool.getHtmlBody(seedData)
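        # the meta.json response is expected to carry title, directors, actors, ct (genre), area, score and summary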
        try:
            json_data = json.loads(doc)
        except:
            json_data = {}
        if json_data.has_key("data"):
            data = json_data["data"]
        else:
            return

        if data.has_key("title"):
            name_P = data["title"]
            if type(name_P) is types.UnicodeType:
                name = name_P

        def getListName(dict_P):
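            # join the "name"/"text" values of a list of dicts into a comma-separated string, skipping empty and 未知 (unknown) entries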
            dict_list = []
            if type(dict_P) is types.ListType:
                for each in dict_P:
                    if type(each) is not types.DictionaryType:
                        continue
                    name = ""
                    if each.has_key("name"):
                        name = each["name"]
                    elif each.has_key("text"):
                        name = each["text"]
                    if re.search(r'未知'.decode('utf8'), name) or name == '':
                        continue
                    dict_list.append(name)
            return ",".join(dict_list)

        if data.has_key("directors"):
            director = getListName(data["directors"])

        if data.has_key("actors"):
            star = getListName(data["actors"])

        if data.has_key("ct"):
            ctype = getListName(data["ct"])

        if data.has_key("area"):
            area = getListName(data["area"])

        if data.has_key("score"):
            point = data["score"]
        try:
            point = float(point)
        except:
            point = 0.0

        if data.has_key("summary"):
            intro = data["summary"]

        self.program["name"] = spiderTool.changeName(name)
        self.program['pcUrl'] = seed
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['point'] = point
        self.program['mainId'] = mainId

        self.secondSpider(seed, name)
Ejemplo n.º 23
0
    def firstSpider(self, seed):
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        name_p = soup.find('a', attrs={'class': 'white'})
        if name_p is not None:
            name = name_p.get_text()

        poster_p = soup.find("ul", attrs={'class': 'album-imgs'})
        if poster_p is not None:
            if poster_p.find('img') is not None:
                poster = poster_p.find('img').get("src")

        starts_P = soup.find("dd", attrs={"class": "actor"})
        if starts_P is not None:
            start_list = []
            for each in starts_P.find_all('a'):
                start_list.append(each.get_text())
            star = ",".join(start_list)

        detail = soup.find("div", attrs={"class": "msg-hd-lt fl"})
        if detail is not None:
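            # each <p> holds one labeled field: 导演 director, 语言 language, 配音 voice cast, 地区 area, 类型 genre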
            for p_tag in detail.find_all('p'):
                a_list = []
                for a_tag in p_tag.find_all('a'):
                    a_list.append(a_tag.get_text())
                a_str = ','.join(a_list)
                if re.search("导演:".decode("utf8"), p_tag.get_text()):
                    director = a_str
                if re.search("语言:".decode("utf8"), p_tag.get_text()):
                    programLanguage = a_str
                if re.search("配音:".decode("utf8"), p_tag.get_text()):
                    star = a_str
                if re.search("地区:".decode("utf8"), p_tag.get_text()):
                    area = a_str
                if re.search("类型:".decode("utf8"), p_tag.get_text()):
                    ctype = a_str

        content_p = soup.find('div', attrs={"data-moreorless": "lessinfo"})
        if content_p is not None:
            if re.search("简介:".decode("utf8"), content_p.get_text()):
                intro = content_p.get_text().split(
                    "简介:".decode("utf8"))[1].strip()

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro

        seed_P = re.search(
            r'sourceId:\s*(?P<sourceId>\d+),\s*cid:\s*(?P<cid>\d+)', doc)
        if seed_P:
            sourceId = seed_P.group('sourceId')
            cid = seed_P.group('cid')
            self.program['mainId'] = sourceId
            if sourceId != '0':
                seed_sub = 'http://cache.video.iqiyi.com/jp/sdvlst/%s/%s/' % (
                    cid, sourceId)
                self.secondSpider(seed_sub)
                return

        seed_P = re.search(
            r'albumId:\s*(?P<albumId>\d+),[^\\]*?cid:\s*(?P<cid>\d+)', doc)
        if seed_P:
            albumId = seed_P.group('albumId')
            cid = seed_P.group('cid')
            self.program['mainId'] = albumId
            seed_sub = 'http://cache.video.qiyi.com/jp/avlist/%s/' % (albumId)
            self.secondSpider(seed_sub)
            return
Ejemplo n.º 24
0
    def firstSpider(self, seed):
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        detail_p = soup.find("div", attrs={'class': 'k_jianjie'})
        if detail_p is not None:
            #get poster
            poster_p = detail_p.find("div", attrs={'id': 'k_jianjie-2b'})
            if poster_p is not None and poster_p.find('img') is not None:
                poster = poster_p.find('img').get('src')
                name = poster_p.find('img').get('alt')

            #get detail
            detail_1 = detail_p.find("div", attrs={'id': 'k_jianjie-3a'})
            if detail_1 is not None:
                detail_list = detail_1.find_all('ul')
                if detail_list is not None:
                    for ul in detail_list:
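                        # each <ul> is one labeled row: 状态 status (skip 预告/upcoming titles), 别名 alias, 导演 director, 演员 cast, 地区 area, 语言 language, 剧情 synopsis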
                        ul_p = str(ul)
                        if ul.find('li').get('class') == [
                                'k_jianjie-3a-1-name'
                        ] and name == "":
                            name = ul.find('li').get_text()
                        elif re.search(r'状态:'.decode('utf8'),
                                       ul_p.decode('utf8')):
                            if re.search(r'预告'.decode('utf8'),
                                         ul_p.decode('utf8')):
                                return
                        elif re.search(r'别名:'.decode('utf8'),
                                       ul_p.decode('utf8')):
                            alias_p = ul.find_all('li')
                            alias = ""
                            for li in alias_p:
                                if not re.search(r'别名:'.decode('utf8'),
                                                 (str(li)).decode('utf8')):
                                    alias = li.get_text()
                        elif re.search(r'导演:'.decode('utf8'),
                                       ul_p.decode('utf8')):
                            director_p = ul.find_all('li')
                            for li in director_p:
                                if not re.search(r'导演:'.decode('utf8'),
                                                 (str(li)).decode('utf8')):
                                    director_p = li.get_text().strip()
                                    director_p = director_p.replace('/', '')
                                    director = ','.join(director_p.split())
                        elif re.search(r'演员:'.decode('utf8'),
                                       ul_p.decode('utf8')):
                            star_p = ul.find_all('li')
                            for li in star_p:
                                if not re.search(r'演员:'.decode('utf8'),
                                                 (str(li)).decode('utf8')):
                                    star_p = li.get_text().strip()
                                    star_p = star_p.replace('/', '')
                                    star = ','.join(star_p.split())
                        elif re.search(r'地区:'.decode('utf8'),
                                       ul_p.decode('utf8')):
                            area_p = ul.find_all('li')
                            for li in area_p:
                                if not re.search(r'地区:'.decode('utf8'),
                                                 (str(li)).decode('utf8')):
                                    area = li.get_text().strip()
                        elif re.search(r'语言:'.decode('utf8'),
                                       ul_p.decode('utf8')):
                            language_p = ul.find_all('li')
                            for li in language_p:
                                if not re.search(r'语言:'.decode('utf8'),
                                                 (str(li)).decode('utf8')):
                                    programLanguage = li.get_text().strip()
                        elif re.search(r'剧情:'.decode('utf8'),
                                       ul_p.decode('utf8')):
                            intro_p = ul.find_all('li')
                            for li in intro_p:
                                if not re.search(r'剧情:'.decode('utf8'),
                                                 (str(li)).decode('utf8')):
                                    intro = li.get_text().strip()
        #get mainId
        if re.match(r'http://www\.yehetang\.com/movie/(.*)\.html', seed):
            mainId = re.match(r'http://www\.yehetang\.com/movie/(.*)\.html',
                              seed).group(1)

        if self.ctype.get(seed):
            ctype = self.ctype[seed]

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        video_p = soup.find('div', attrs={'id': 'play_1'})
        if video_p is not None:
            video_list = video_p.find_all('li')
            self.secondSpider(video_list)
Ejemplo n.º 25
0
    def firstSpider(self, seed):
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        poster_p = soup.find("img", attrs={'width': '195'})
        poster_p_1 = soup.find(
            'div', attrs={'class': 'album-picCon album-picCon-onePic'})
        poster_p_2 = soup.find('ul', attrs={'class': 'album-imgs'})
        if poster_p is not None:
            poster = poster_p.get("src")
            name = poster_p.get("alt")
        elif poster_p_1 is not None:
            img_tag = poster_p_1.find("img")
            if img_tag is not None:
                poster = img_tag.get("src")
                name = img_tag.get("alt")
        elif poster_p_2 is not None:
            poster = poster_p_2.find('img').get('src')

        if name == "":
            name_p = soup.find('div',
                               attrs={'class': 'album-playArea clearfix'})
            if name_p is not None:
                name = name_p.find('h1').get_text()

        detail = soup.find("div", attrs={"class": "result_detail-minH"})
        detail_1 = soup.find("div", attrs={"class": "msg-hd-lt fl"})
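        # the info block appears in one of two layouts: "topic_item" <div>s or plain <p> rows inside "msg-hd-lt"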
        if detail is not None:
            for div_p in detail.find_all(
                    "div", attrs={"class": "topic_item clearfix"}):
                for each in div_p.find_all("div"):
                    a_list = []
                    for a_tag in each.find_all('a'):
                        a_list.append(a_tag.get_text())
                    a_str = ",".join(a_list)
                    if re.search("主持人:".decode("utf8"), each.get_text()):
                        star = a_str
                    if re.search("类型:".decode("utf8"), each.get_text()):
                        ctype = a_str
                    if re.search("语言:".decode("utf8"), each.get_text()):
                        programLanguage = a_str
                    if re.search("地区:".decode("utf8"), each.get_text()):
                        area = a_str
        elif detail_1:
            for each in detail_1.find_all("p"):
                a_list = []
                for a_tag in each.find_all('a'):
                    a_list.append(a_tag.get_text())
                a_str = ",".join(a_list)
                if re.search("主持人:".decode("utf8"), each.get_text()):
                    star = a_str
                if re.search("类型:".decode("utf8"), each.get_text()):
                    ctype = a_str
                if re.search("语言:".decode("utf8"), each.get_text()):
                    programLanguage = a_str
                if re.search("地区:".decode("utf8"), each.get_text()):
                    area = a_str

        content_p = soup.find('span',
                              attrs={
                                  "class": "showMoreText",
                                  "data-moreorless": "moreinfo",
                                  "style": "display: none;"
                              })
        content_p_1 = soup.find("span", attrs={"class": "bigPic-b-jtxt"})
        if content_p is not None:
            if content_p.find("span"):
                content_p = content_p.find("span")
            if re.search("简介:".decode("utf8"), content_p.get_text()):
                intro = content_p.get_text().split(
                    "简介:".decode("utf8"))[1].strip()
        elif content_p_1 is not None:
            intro = content_p_1.get_text()

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro

        seed_P = re.search(
            r'sourceId:\s*(?P<sourceId>\d+),\s+cid:\s*(?P<cid>\d+)', doc)
        if seed_P:
            sourceId = seed_P.group('sourceId')
            cid = seed_P.group('cid')
            self.program['mainId'] = sourceId
            seed_sub = 'http://cache.video.iqiyi.com/jp/sdvlst/%s/%s/' % (
                cid, sourceId)
            self.secondSpider(seed_sub)
Ejemplo n.º 26
0
    def firstSpider(self, seed):
        name = ""
        poster = ""
        point = 0.0
        alias = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        poster_p = soup.find("div", attrs={'class': 'result_pic'})
        if poster_p is not None:
            poster = poster_p.find('img').get('src')

        name_p = soup.find('h1', attrs={'class': 'main_title'})
        if name_p is not None:
            name = name_p.find('a').get_text()
            point_re = re.search(r'\d+\.\d+', name_p.get_text())
            if point_re:
                point = float(point_re.group())

        detail = soup.find_all("div", attrs={"class": "topic_item clearfix"})
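        # each "topic_item" block carries either the genre tags (look_point) or <em> fields labeled 导演 director, 语言 language, 配音 voice cast, 地区 area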
        for each in detail:
            if each.find('div', attrs={"class": "look_point"}):
                ctype_p = each.find('div', attrs={"class": "look_point"})
                a_list = []
                for a_tag in ctype_p.find_all('a'):
                    a_list.append(a_tag.get_text())
                a_str = ','.join(a_list)
                ctype = a_str

            for p_tag in each.find_all('em'):
                a_list = []
                for a_tag in p_tag.find_all('a'):
                    a_list.append(a_tag.get_text())
                a_str = ','.join(a_list)
                if re.search("导演:".decode("utf8"), p_tag.get_text()):
                    director = a_str
                if re.search("语言:".decode("utf8"), p_tag.get_text()):
                    programLanguage = a_str
                if re.search("配音:".decode("utf8"), p_tag.get_text()):
                    star = a_str
                if re.search("地区:".decode("utf8"), p_tag.get_text()):
                    area = a_str

        content_p = soup.find('p', attrs={"data-movlbshowmore-ele": "whole"})
        if content_p is not None:
            intro = content_p.get_text().strip()

        mainId_P = re.search(r'http://www\.iqiyi\.com/lib/(\w+)\.html', seed)
        if mainId_P:
            mainId = mainId_P.group(1)

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId
        self.secondSpider(doc)
Ejemplo n.º 27
0
    def firstSpider(self, seed):
        poster = ""
        star = ""
        director = ""
        ctype = ""
        shootYear = ""
        intro = ""
        mainId = ""
        area = ""
        name = ""
        programLanguage = ""
        point = -1.0
        doubanPoint = -1.0
        playTimes = 0
        pcUrl = ""
        duration = ""
        alias = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")
        introPage = soup.find('div', attrs={'class': 'info-content'})
        if introPage is not None:
            #get poster
            poster_p = introPage.find('div',
                                      attrs={'class': 'bangumi-preview'})
            if poster_p is not None:
                poster = poster_p.find('img').get('src')
                if not re.match(r'http://', poster):
                    poster = "http:%s" % poster

            #get info
            info_p = introPage.find('div', attrs={'class': 'bangumi-info-r'})
            if info_p is not None:
                head = info_p.find('div', attrs={'class': 'b-head'})
                if head is not None:
                    # get name
                    name = head.find('h1').get_text()

                    #get ctype
                    ctype_list = []
                    ctype_p = head.find_all('span')
                    for span in ctype_p:
                        ctype_list.append(span.get_text())
                    ctype = ','.join(ctype_list)

                #get playtimes
                info_count = info_p.find('div', attrs={'class': 'info-count'})
                if info_count is not None:
                    #playTimes = info_count.find('em').get_text()
                    playTimes = 0

                #get actors

                #get desc
                info_desc = info_p.find('div', attrs={'class': 'info-desc'})
                if info_desc is not None:
                    intro = info_desc.get_text().strip()

        if re.match(r'http://bangumi\.bilibili\.com/anime/(.*)', seed):
            mainId = re.match(r'http://bangumi\.bilibili\.com/anime/(.*)',
                              seed).group(1)

        self.program['name'] = spiderTool.changeName(name)
        self.program["alias"] = spiderTool.changeName(alias)
        self.program["point"] = float(point)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['playTimes'] = long(playTimes)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        videoList = []
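        # keep only episodes whose link carries the v1-complete-text class (presumably fully published ones)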
        subprg_list = soup.find_all(
            'li', attrs={'class': 'v1-bangumi-list-part-child'})
        for li in subprg_list:
            if li.find('a').get('class') == ['v1-complete-text']:
                videoList.append(li)

        self.secondSpider(videoList)
Ejemplo n.º 28
0
    def firstSpider(self, seed):
        poster = ""
        star = ""
        director = ""
        ctype = ""
        shootYear = ""
        intro = ""
        mainId = ""
        area = ""
        name = ""
        programLanguage = ""
        point = -1.0
        doubanPoint = -1.0
        playTimes = 0
        pcUrl = ""
        duration = ""
        alias = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        pageInfo = soup.find('div', attrs={'id': 'pageInfo'})
        if pageInfo is not None:
            #name = pageInfo.get('data-title')
            poster = pageInfo.get('data-pic')
            #intro = pageInfo.get('data-desc')

        introPage = soup.find('div', attrs={'class': 'introduction'})
        if introPage is not None:
            #get intro
            intro_p = introPage.find('div', attrs={'class': 'desc gheight'})
            if intro_p is not None:
                intro = intro_p.find('div').get_text()

        #get platytimes
        playTimes_p = soup.find('section',
                                attrs={'class': 'clearfix wp area crumb'})
        if playTimes_p is not None:
            #playTimes = long(playTimes_p.find('span', attrs={'class': 'sp2'}).get_text())
            playTimes = 0

        if re.match(r'http://www\.acfun\.tv/v/(.*)', seed):
            mainId = re.match(r'http://www\.acfun\.tv/v/(.*)', seed).group(1)

        self.program["alias"] = spiderTool.changeName(alias)
        self.program["point"] = float(point)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['playTimes'] = long(playTimes)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        videoList = []
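        # the page state lives in an inline "pageInfo = {...}" script; parse it as JSON for the title, tag list and videoList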
        data = soup.find_all('script')
        for li in data:
            if re.search(r'pageInfo', li.get_text()):
                pageInfo = li.get_text()
                pageInfo_p = re.findall(r'pageInfo\s+=\s+(.*)', pageInfo)
                if pageInfo_p:
                    try:
                        pageInfo_json = json.loads(pageInfo_p[0])
                    except:
                        break
                    #get ctype
                    ctype_list = []
                    for tag in pageInfo_json['tagList']:
                        ctype_list.append(tag['name'])
                    ctype = ','.join(ctype_list)
                    #get name
                    name = pageInfo_json['title']
                    videoList = pageInfo_json['videoList']

        self.program["name"] = spiderTool.changeName(name)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)

        self.secondSpider(videoList)
Ejemplo n.º 29
0
    def firstSpider(self, seed):
        point = 0.0
        poster = ""
        name = ""
        shootYear = ""
        alias = ""
        area = ""
        star = ""
        director = ""
        ctype = ""
        playTimes = 0
        intro = ""
        mainId = ""
        youkushootYear = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc)

        seed_v = re.search(r'http://v\.youku\.com/v_show/id_', seed)
        seed_Re = re.search(r'http://www\.youku\.com/show_page/id', seed)
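        # play pages (v_show) and other non-album pages link back to the show/album page; follow that link so the album page is parsed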
        if seed_v:
            seed_P = soup.find('a', attrs={'class': 'desc-link'})
            if seed_P is not None:
                seed = seed_P.get('href')
                seed = "http:%s" % seed
                doc = spiderTool.getHtmlBody(seed)
                soup = BeautifulSoup(doc)
        elif not seed_Re:
            seed_P = soup.find('h1', attrs={'class': 'title'})
            if seed_P is not None:
                seed_aTag = seed_P.find('a')
                if seed_aTag is not None:
                    seed = seed_aTag.get('href')
                    seed = "http:%s" % seed
                    doc = spiderTool.getHtmlBody(seed)
                    soup = BeautifulSoup(doc)

        poster_p = soup.find("div", attrs={'class': 'p-thumb'})
        if poster_p is not None:
            poster = poster_p.find('img').get("src")

        p_base_content = soup.find('div', attrs={'class': 'p-base'})
        if p_base_content is not None:
            for li in p_base_content.find_all('li'):
                li_p = str(li)
                if li.find('span', attrs={'class': 'star-num'}) is not None:
                    point = li.find('span', attrs={
                        'class': 'star-num'
                    }).get_text()
                elif li.get('class') == ['p-row', 'p-title']:
                    name_p = re.findall(r'/a>:(.*)<span'.decode('utf8'),
                                        li_p.decode('utf8'))
                    if name_p:
                        name = name_p[0]
                    if re.search(r'<'.decode('utf8'), name):
                        name_p = re.findall(r'(.*)<span'.decode('utf8'), name)
                        if name_p:
                            name = name_p[0]
                elif li.find('span', attrs={'class': 'pub'}) is not None:
                    shootYear_P = li.find('span', attrs={'class': 'pub'})
                    if re.search(r'优酷'.decode('utf8'), li_p.decode('utf8')):
                        youkushootYear_text = re.findall(
                            r'/label>(.*)</span', str(shootYear_P))
                        if youkushootYear_text:
                            youkushootYear_text = youkushootYear_text[0]
                            youkushootYear = ''.join(
                                youkushootYear_text.split('-')[0])
                    else:
                        shootYear_text = re.findall(r'/label>(.*)</span',
                                                    str(shootYear_P))
                        if shootYear_text:
                            shootYear_text = shootYear_text[0]
                            shootYear = ''.join(shootYear_text.split('-')[0])
                elif re.search(r'<li>地区'.decode('utf8'), li_p.decode('utf8')):
                    area_p = li.get_text()
                    area_p = re.findall(r'地区:(.*)'.decode('utf8'), area_p)
                    if area_p:
                        area = area_p[0]
                        area = area.replace('/', ',')
                elif re.search(r'<li>类型'.decode('utf8'), li_p.decode('utf8')):
                    ctype_p = li.get_text()
                    ctype_p = re.findall(r'类型:(.*)'.decode('utf8'), ctype_p)
                    if ctype_p:
                        ctype = ctype_p[0]
                        ctype = ctype.replace('/', ',')
                elif re.search(r'<li>导演'.decode('utf8'), li_p.decode('utf8')):
                    director_p = li.get_text()
                    director_p = re.findall(r'导演:(.*)'.decode('utf8'),
                                            director_p)
                    if director_p:
                        director = director_p[0]
                elif li.get('class') == ['p-performer']:
                    star_list = []
                    for each in li.find_all('a'):
                        star_list.append(each.get_text())
                        star = ','.join(star_list)
                elif re.search(r'<li>总播放数'.decode('utf8'),
                               li_p.decode('utf8')):
                    playTimesStr = li.get_text()
                    playTimesStr = re.findall(r'总播放数:(.*)'.decode('utf8'),
                                              playTimesStr)
                    if playTimesStr:
                        playTimesStr = playTimesStr[0]
                        playTimes_list = re.findall(r'(\d+)', playTimesStr)
                        playTimes = long(''.join(playTimes_list))
                elif li.get('class') == ['p-row', 'p-intro']:
                    intro = li.find('span').get_text().strip()
                else:
                    continue

            if shootYear == "":
                shootYear = youkushootYear

        if re.match(r'http://list\.youku\.com/show/(id_(.+))\.html', seed):
            mainId = re.match(r'http://list\.youku\.com/show/(id_(.+))\.html',
                              seed).group(2)

        self.program["name"] = spiderTool.changeName(name)
        self.program["alias"] = spiderTool.changeName(alias)
        self.program["point"] = float(point)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['playTimes'] = long(playTimes)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        showid = ""
        showid_url = ""
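        # showid, taken from the PageConfig script, keys a JSONP "show/module" call whose HTML payload lists the reload_N episode groups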
        p_list_p = soup.find_all('script', attrs={'type': 'text/javascript'})
        if p_list_p is not None:
            for each in p_list_p:
                if re.search(r'PageConfig', str(each)):
                    showid_p = re.findall(r'showid:"(.*)", videoId', str(each))
                    if showid_p:
                        showid = showid_p[0]
                        showid_url = "http://list.youku.com/show/module?id=%s&tab=showInfo&callback=jQuery111208239648697137207" % showid

        if showid_url != "":
            sub_doc = spiderTool.getHtmlBody(showid_url)
            if re.search(r'window\.jQuery', sub_doc):
                sub_doc_p = re.search(r'\((.*)\);', sub_doc)
                sub_doc = sub_doc_p.group(1)

            try:
                data = json.loads(sub_doc)
            except:
                #print("load json error1111!")
                return
            if data.get('html') is None:
                #print("get html error1111")
                return

            sub_soup = BeautifulSoup(data['html'])
            reload_list_p = re.findall(r'id="reload_(\d+)"', data['html'])
            reload_list = list(set(reload_list_p))
            if reload_list:
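                # Python 2 cmp-style comparator: sort the reload group ids numerically, then reverse to newest-first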

                def numeric_compare(x, y):
                    x = int(x)
                    y = int(y)
                    if x > y:
                        return 1
                    elif x == y:
                        return 0
                    else:  # x<y
                        return -1

                reload_list.sort(numeric_compare)
                reload_list.reverse()
                #print(reload_list)
                for reload in reload_list:
                    sub_seed = "http://list.youku.com/show/episode?id=%s&stage=reload_%s&callback=jQuery111208239648697137207" % (
                        mainId, reload)
                    self.secondSpider(sub_seed)
Ejemplo n.º 30
0
    def firstSpider(self, seed):
        point = 0.0
        poster = ""
        name = ""
        shootYear = ""
        alias = ""
        area = ""
        star = ""
        director = ""
        ctype = ""
        playTimes = 0
        intro = ""
        mainId = ""

        doc = spiderTool.getHtmlBody(seed)
        if re.search(r'playlistId\s*=\s*"(\d+)"', doc):
            mainId = re.search(r'playlistId\s*=\s*"(\d+)"', doc).group(1)
        elif re.search(r'PLAYLIST_ID\s*=\s*"(\d+)"', doc):
            mainId = re.search(r'PLAYLIST_ID\s*=\s*"(\d+)"', doc).group(1)
        elif re.search(r'http://film\.sohu\.com/album/(\d+)\.html', seed):
            mainId = re.search(r'http://film\.sohu\.com/album/(\d+)\.html',
                               seed).group(1)
        else:
            return

        seed = "http://pl.hd.sohu.com/videolist?playlistid=%s&callback=__get_videolist" % (
            mainId)
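        # the videolist API answers with gbk-encoded JSONP; strip the __get_videolist( ... ) wrapper before json.loads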
        try:
            doc = spiderTool.getHtmlBody(seed).decode('gbk').encode('utf8')
            doc = doc.split('__get_videolist(')[1][:-2]
            data = json.loads(doc)
        except:
            return

        if data.has_key('albumName'):
            name = data['albumName']
        if data.has_key('mainActors'):
            star = ','.join(data['mainActors'])
        if data.has_key('categories'):
            ctype = ','.join(data['categories'])
        if data.has_key('publishYear'):
            shootYear = str(data['publishYear'])
        if data.has_key('albumDesc'):
            intro = data['albumDesc']
        if data.has_key('largeVerPicUrl'):
            poster = data['largeVerPicUrl']
        if data.has_key('directors'):
            director = ','.join(data['directors'])
        if data.has_key('area'):
            area = data['area']

        self.program["name"] = spiderTool.changeName(name)
        self.program["alias"] = spiderTool.changeName(alias)
        self.program["point"] = float(point)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['playTimes'] = long(playTimes)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        if data.has_key('videos'):
            videos = data['videos']
            self.secondSpider(videos)