def firstSpider(self, seed):
    """Fill ``self.program`` from an iQiyi video page, then call ``secondSpider``.

    The page at ``seed`` is scanned for its ``data-player-tvid`` and
    ``data-player-videoid`` attributes. Those two ids build a
    ``cache.video.qiyi.com`` URL whose JSON body supplies the title, poster,
    cast, director, tags, area and description.

    Args:
        seed: URL of the video page to crawl.

    Returns:
        None. Returns early, without touching ``self.program``, when the two
        player ids are not both present, or when the JSON body cannot be
        parsed or lacks one of the expected fields.
    """
    programLanguage = ""  # never assigned a real value in this method
    mainId = ""

    doc = spiderTool.getHtmlBody(seed)
    tvId = re.search(r'data-player-tvid="([^"]+?)"', doc)
    videoId = re.search(r'data-player-videoid="([^"]+?)"', doc)
    if not (tvId and videoId):
        return
    newUrl = 'http://cache.video.qiyi.com/vi/%s/%s/' % (
        tvId.group(1), videoId.group(1))
    doc = spiderTool.getHtmlBody(newUrl)

    # Only the failures this block can actually produce are treated as
    # "no data": bad JSON (ValueError/TypeError), a missing key (KeyError),
    # or a field that is not a string (AttributeError on .replace).
    try:
        json_data = json.loads(doc)
        name = json_data["shortTitle"]
        poster = json_data["apic"]
        star = json_data["ma"].replace("|", ",")
        director = json_data["d"].replace("|", ",")
        ctype = json_data["tg"].replace(" ", ",")
        area = json_data["ar"]
        intro = json_data["info"]
    except (ValueError, KeyError, TypeError, AttributeError):
        return

    # Special handling: an area containing this marker is replaced wholesale.
    if re.search(r'华语'.decode('utf-8'), ''.join(area)):
        area = u'中国'

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro

    # The first URL shape that matches the seed provides the main id.
    seed_patterns = (
        r'http://www\.iqiyi\.com/v_(\w+)\.html',
        r'http://www\.iqiyi\.com/dianying/(\d{8})/',
    )
    for pattern in seed_patterns:
        matched = re.match(pattern, seed)
        if matched:
            mainId = matched.group(1)
            break
    self.program["mainId"] = mainId

    self.secondSpider()
def firstSpider(self, detail):
    """Read one Youku list entry into ``self.program`` and call ``secondSpider``.

    ``detail`` is a parsed markup node. Its ``div.p-thumb`` child must hold
    an ``<a target="_blank">`` link, otherwise the method returns without
    doing anything. The poster and name come from the thumbnail ``<img>``,
    and the main id is today's date (``%Y%m%d``) followed by the id captured
    from the video URL.
    """
    thumb = detail.find("div", attrs={"class": "p-thumb"})
    if thumb is None:
        return
    link = thumb.find("a", attrs={"target": "_blank"})
    if link is None:
        return

    name = ""
    poster = ""
    pcUrl = ""
    mainId = ""
    # Fields this spider never fills in; they are stored with empty values.
    point = 0.0
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    area = ""

    image = thumb.find("img")
    if image is not None:
        src = image.get("src")
        alt = image.get("alt")
        if src is not None:
            poster = src
        if alt is not None:
            name = alt

    href = link.get("href")
    if href is not None:
        pcUrl = "http:%s" % href

    id_match = re.search(r'http://v.youku.com/v_show/id_(.+)==', pcUrl)
    if id_match:
        today = time.strftime('%Y%m%d', time.localtime(time.time()))
        mainId = today + id_match.group(1)

    shootYear = time.strftime('%Y', time.localtime(time.time()))
    intro = name  # the entry name doubles as the description

    self.program.update({
        "name": spiderTool.changeName(name),
        'poster': spiderTool.listStringToJson('url', poster),
        'point': point,
        'shootYear': shootYear,
        'star': spiderTool.listStringToJson('name', star),
        'director': spiderTool.listStringToJson('name', director),
        'ctype': spiderTool.listStringToJson('name', ctype),
        'programLanguage': programLanguage,
        'area': spiderTool.listStringToJson('name', area),
        'intro': intro,
        'pcUrl': pcUrl,
        'mainId': mainId,
    })

    self.secondSpider(pcUrl, name, poster)
def firstSpider(self, seed):
    """Fetch movie details by POST and store them in ``self.program``.

    ``seed`` is sent as ``movieId`` to ``self.postdetailurl``. The response
    is expected to be JSON shaped like ``{"data": {"videoDetailView": {...}}}``.

    Args:
        seed: Movie id; it is also stored as the programme's ``mainId``.

    Returns:
        None. Returns early, without touching ``self.program``, when the
        response is not valid JSON or carries no usable ``videoDetailView``
        object. ``secondSpider`` is called only when the detail view has a
        truthy ``id``.
    """
    name = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    area = ""

    postdata = {'movieId': seed}
    doc = self.http_post(self.postdetailurl, postdata, self.http_post_header)
    try:
        data = json.loads(doc)
    except (TypeError, ValueError):
        # TypeError: the body was not a string; ValueError: malformed JSON.
        return

    mainId = seed

    # Walk the nesting defensively: a missing or null "data" member used to
    # raise AttributeError here instead of taking the "nothing found" exit.
    payload = data.get('data') if isinstance(data, dict) else None
    jsondata = payload.get('videoDetailView') if isinstance(payload, dict) else None
    if not jsondata or not isinstance(jsondata, dict):
        return

    if jsondata.get('brief'):
        intro = jsondata['brief']
    if jsondata.get('actors'):
        # Entries without a usable name are skipped rather than raising.
        star = ','.join(
            actor['name'] for actor in jsondata['actors']
            if isinstance(actor, dict) and actor.get('name'))
    if jsondata.get('title'):
        name = jsondata['title']
    if jsondata.get('cover'):
        poster = jsondata['cover']

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId

    detail_id = jsondata.get('id')
    if detail_id:
        self.secondSpider(detail_id)
def firstSpider(self, seed):
    """Scrape a film.qq.com page into ``self.program`` and start ``secondSpider``.

    Extracts the poster, cast, director, type, year, area, synopsis and
    title from the page at ``seed``. The main id is the second capture of
    the seed-URL pattern and is used to build the playlist URL handed to
    ``secondSpider``.

    Page elements that are missing leave the matching field empty instead
    of raising ``AttributeError``.

    Args:
        seed: URL of the film page.
    """
    poster = ""
    star = ""
    director = ""
    ctype = ""
    shootYear = ""
    intro = ""
    mainId = ""
    area = ""
    name = ""

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    poster_p = soup.find("span", attrs={'class': 'a_cover'})
    if poster_p is not None:
        img = poster_p.find("img")
        if img is not None:
            # A missing attribute falls back to the same empty default that
            # is used when the cover element is absent.
            poster = img.get("lz_src") or ""

    starts_P = soup.find("dd", attrs={"class": "actor"})
    if starts_P is not None:
        star = ",".join(each.get_text() for each in starts_P.find_all('a'))

    detail = soup.find("dd", attrs={"class": "type"})
    if detail is not None:
        for each in detail.find_all("span", attrs={"class": "item"}):
            text = each.get_text()
            # Each item carries "<label><value>"; keep what follows the label.
            if re.search("导演:".decode("utf8"), text):
                director = text.split("导演:".decode("utf8"))[1]
            if re.search("类型:".decode("utf8"), text):
                ctype = text.split("类型:".decode("utf8"))[1]
            if re.search("年份:".decode("utf8"), text):
                shootYear = text.split("年份:".decode("utf8"))[1]
            if re.search("地区:".decode("utf8"), text):
                area = text.split("地区:".decode("utf8"))[1]

    content_p = soup.find('p', attrs={"class": "detail_all"})
    if content_p is not None:
        content_text = content_p.get_text()
        if re.search("简介:".decode("utf8"), content_text):
            intro = content_text.split("简介:".decode("utf8"))[1].strip()

    id_match = re.match(
        r'http://film\.qq\.com/(page|cover)/.*/(.*?).html', seed)
    if id_match:
        mainId = id_match.group(2)

    name_tag = soup.find("h3", attrs={'class': 'film_name'})
    if name_tag is not None:
        name = name_tag.get_text()

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId

    seed_sub = 'http://s.video.qq.com/loadplaylist?vkey=897_qqvideo_cpl_%s_qq' % mainId
    self.secondSpider(seed_sub)
def firstSpider(self, seed):
    """Turn one Youku thumbnail node into ``self.program`` and call ``secondSpider``.

    ``seed`` is used directly as a parsed markup node (it is not fetched).
    The poster, name and play length come from ``div.v-thumb``, the page
    URL from the link inside ``div.v-link``, and the main id from that URL.
    When ``span.r`` reports an age in minutes or hours, today's date
    (``%Y%m%d``) is passed on as the set number.
    """
    soup = seed

    name = ""
    poster = ""
    pcUrl = ""
    mainId = ""
    playLength = ""
    setNum = ""
    # Fixed or empty values for fields this spider does not scrape.
    star = ""
    ctype = "美女".decode("utf8")
    shootYear = ""
    intro = ""
    area = ""
    point = 0.0
    playTimes = 0

    thumb = soup.find("div", attrs={'class': 'v-thumb'})
    if thumb is not None:
        thumb_img = thumb.find('img')
        if thumb_img is not None:
            poster = thumb_img.get("src")
            name = thumb_img.get("alt")
        duration_tag = thumb.find('span', attrs={'class': 'v-time'})
        if duration_tag:
            playLength = duration_tag.get_text()

    link_box = soup.find('div', attrs={'class': 'v-link'})
    anchor = link_box.find('a') if link_box is not None else None
    if anchor is not None:
        pcUrl = anchor.get('href')

    id_match = re.search(r'http://v.youku.com/v_show/(.*?)\.html', pcUrl)
    if id_match:
        mainId = id_match.group(1)

    age_tag = soup.find('span', attrs={'class': 'r'})
    if age_tag is not None:
        age_text = age_tag.get_text()
        posted_minutes_ago = re.search(r'(\d+)分钟前'.decode('utf8'), age_text)
        if posted_minutes_ago or re.search(r'\d+小时前'.decode('utf8'), age_text):
            setNum = time.strftime('%Y%m%d', time.localtime(time.time()))

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['pcUrl'] = pcUrl
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    self.program['point'] = point
    self.program['playTimes'] = playTimes

    self.secondSpider(setNum, name, pcUrl, playLength, poster)
def firstSpider(self, seed):
    """Scrape a Youku channel home page and crawl its paged video list.

    The channel name and avatar come from ``div.head-avatar`` and the
    introduction from ``div.userintro``. The star name, main id and the
    channel id inside the paging URL are hard-coded in this method.

    The video count is read from the ``div.title`` elements. Every list
    page from 1 to ``count // 40 + 1`` is passed to ``secondSpider``.

    Args:
        seed: URL of the channel home page.
    """
    name = ""
    poster = ""
    star = "木下佑香".decode('utf8')
    ctype = ""
    shootYear = ""
    intro = ""
    mainId = "UMzMzODQ1Njg5Ng"
    area = ""
    point = 0.0
    playTimes = 0

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    poster_p = soup.find("div", attrs={'class': 'head-avatar'})
    if poster_p is not None:
        poster_a = poster_p.find('a')
        if poster_a is not None:
            name = poster_a.get("title")
        avatar = poster_p.find('img')
        if avatar is not None:
            poster = avatar.get("src")

    content_p = soup.find('div', attrs={"class": "userintro"})
    if content_p is not None:
        content = content_p.find('div', attrs={'class': 'desc'})
        if content is not None:
            parts = content.get_text().strip().split(
                '自频道介绍:'.decode('utf8'))
            # Without the label the split has one part; leave intro empty
            # rather than raising IndexError.
            if len(parts) > 1:
                intro = parts[1]

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    self.program['point'] = point
    self.program['playTimes'] = playTimes

    pages = 0
    count_pattern = r'视频[^\\]*?\((\d+)\)'.decode('utf8')
    for item in soup.find_all('div', attrs={'class': 'title'}):
        matched = re.search(count_pattern, item.get_text())
        if matched:
            # Use the number captured by this same match, so an earlier
            # "(N)" elsewhere in the text cannot be picked up by mistake.
            pages = int(matched.group(1)) // 40 + 2

    # range(1, 0) is empty, so no count found means no list page is crawled.
    for page in range(1, pages):
        sub_seed = 'http://i.youku.com/u/UMzMzODQ1Njg5Ng==/videos/fun_ajaxload/?page_num=%d&page_order=0' % (
            page)
        self.secondSpider(sub_seed)
def firstSpider(self, seed):
    """Fill ``self.program`` from an iQiyi video page and add its single episode.

    The page at ``seed`` is scanned for ``data-player-tvid`` and
    ``data-player-videoid``. Those ids build a ``cache.video.qiyi.com`` URL
    whose JSON body supplies the metadata. One ``PROGRAM_SUB`` copy (set
    number 1, named after the programme, pointing at ``seed``) is appended
    to ``self.program['programSub']``.

    Args:
        seed: URL of the video page to crawl.

    Returns:
        None. Returns early, without touching ``self.program``, when the two
        player ids are not both present, or when the JSON body cannot be
        parsed or lacks one of the expected fields.
    """
    mainId = ""

    doc = spiderTool.getHtmlBody(seed)
    tvId = re.search(r'data-player-tvid="([^"]+?)"', doc)
    videoId = re.search(r'data-player-videoid="([^"]+?)"', doc)
    if not (tvId and videoId):
        return
    newUrl = 'http://cache.video.qiyi.com/vi/%s/%s/' % (
        tvId.group(1), videoId.group(1))
    doc = spiderTool.getHtmlBody(newUrl)

    # Catch only what this block can raise: bad JSON (ValueError/TypeError),
    # a missing key (KeyError), or a non-string field (AttributeError).
    try:
        json_data = json.loads(doc)
        name = json_data["shortTitle"]
        poster = json_data["apic"]
        star = json_data["ma"].replace("|", ",")
        director = json_data["d"].replace("|", ",")
        ctype = json_data["tg"].replace(" ", ",")
        # NOTE(review): the "ar" field is stored as the programme language
        # here -- confirm that this is the intended mapping.
        programLanguage = json_data["ar"]
        intro = json_data["info"]
    except (ValueError, KeyError, TypeError, AttributeError):
        return

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['intro'] = intro

    # The first URL shape that matches the seed provides the main id.
    seed_patterns = (
        r'http://www\.iqiyi\.com/v_(\w+)\.html',
        r'http://www\.iqiyi\.com/dianying/(\d{8})/',
    )
    for pattern in seed_patterns:
        matched = re.match(pattern, seed)
        if matched:
            mainId = matched.group(1)
            break
    self.program["mainId"] = mainId

    self.program_sub = copy.deepcopy(PROGRAM_SUB)
    self.program_sub['setNumber'] = 1
    self.program_sub['setName'] = name
    self.program_sub['webUrl'] = seed
    self.program['programSub'].append(self.program_sub)
def firstSpider(self, seed):
    """Turn one mmwu.tv list node into ``self.program`` and call ``secondSpider``.

    ``seed`` is used directly as a parsed markup node. Its first ``<a>``
    supplies the page URL, the poster (nested ``<img>``) and the name
    (nested ``<p>``, prefixed with a fixed label). Returns without doing
    anything when the node has no ``<a>``.
    """
    soup = seed
    anchor = soup.find("a")
    if anchor is None:
        return

    name = ""
    poster = ""
    mainId = ""
    playLength = ""
    # Fixed or empty values for fields this spider does not scrape.
    star = ""
    ctype = "美女".decode("utf8")
    shootYear = ""
    intro = ""
    area = ""
    point = 0.0
    playTimes = 0

    pcUrl = anchor.get('href')

    cover = anchor.find('img')
    if cover is not None:
        poster = cover.get('src')

    caption = anchor.find('p')
    if caption is not None:
        name = "美女写真_".decode("utf8") + caption.get_text()

    id_match = re.search(r'http://www\.mmwu\.tv/vod/(.*?)\.html', pcUrl)
    if id_match:
        mainId = id_match.group(1)

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['pcUrl'] = pcUrl
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    self.program['point'] = point
    self.program['playTimes'] = playTimes

    self.secondSpider(name, pcUrl, playLength, poster)
def seedSpider(self):
    """Crawl every seed in ``self.seedList`` and write one JSON line per programme.

    For each seed a fresh copy of ``BASE_CONTENT["program"]`` is filled by
    ``firstSpider``, stamped with the fixed site fields and the current
    time, and given the poster from ``self.prgdata`` when one is recorded
    for that seed. Programmes with an empty name, an empty main id or no
    episodes are skipped; the rest are written to ``self.dataFile``.
    """
    for seed in self.seedList:
        self.program = copy.deepcopy(BASE_CONTENT["program"])
        self.firstSpider(seed)

        program = self.program
        program['pcUrl'] = seed
        program["ptype"] = "电视剧".decode("utf8")
        program['website'] = '芒果TV'.decode("utf8")
        program['getTime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.localtime())
        program['totalSets'] = len(program['programSub'])

        # add poster
        if self.prgdata.get(seed):
            recorded_poster = self.prgdata[seed]['poster']
            program['poster'] = spiderTool.listStringToJson(
                'url', recorded_poster)

        print("%s %s" % (program['name'], program['totalSets']))

        has_name = not (program['name'] == '' or program['name'] is None)
        has_main_id = not (program['mainId'] == ''
                           or program['mainId'] is None)
        if not has_name or not has_main_id or program['totalSets'] < 1:
            continue

        json.dumps(PROGRAM_SUB)  # result is discarded; call kept as-is
        content = {'program': program}
        serialized = json.dumps(content)
        self.dataFile.write(serialized + '\n')
def firstSpider(self, seed):
    """Scrape an album page and crawl each of its paged listings.

    The name and poster come from the ``bodanAlbum-msg-lt-pic fl`` block
    and the main id from the seed URL. The first number found in
    ``li.itemOne`` is taken as the item count. Pages 1 to ``count / 20 + 1``
    are derived from the seed by inserting ``-<page>`` before ``.html`` and
    passed to ``secondSpider``.
    """
    name = ""
    poster = ""
    main_id = ""

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    album_box = soup.find('div', attrs={'class': 'bodanAlbum-msg-lt-pic fl'})
    if album_box is not None:
        name = album_box.find('a').get('title')
        cover = album_box.find('img')
        if cover is not None:
            poster = cover.get("src")

    id_match = re.search(r'com/(.*?)\.', seed)
    if id_match:
        main_id = id_match.group(1)

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['mainId'] = main_id

    pages = 0
    counter_tag = soup.find('li', attrs={'class': 'itemOne'})
    if counter_tag is not None:
        count_match = re.search(r'\d+', counter_tag.get_text())
        if count_match:
            pages = int(count_match.group()) // 20 + 2

    if pages == 0:
        return
    for page in range(1, pages):
        suffix = '-' + str(page) + '.html'
        self.secondSpider(seed.replace('.html', suffix))
def firstSpider(self, seed):
    """Scrape a v.qq.com uploader page and crawl its paged video list.

    Stores the uploader nickname, the id taken from the seed URL and a
    fixed category in ``self.program``. The video count and the
    ``visited_euin`` value are read from the raw page. If either is
    missing the method returns; otherwise one channel-info URL per page of
    25 videos is passed to ``secondSpider``.
    """
    name = ""
    mainId = ""
    programNum = ''
    uniqueFlag = ''
    ctype = "英雄联盟".decode('utf8')

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    nick_tag = soup.find("span", attrs={'class': 'txt', 'id': 'userInfoNick'})
    if nick_tag is not None:
        name = nick_tag.get_text()
    self.program["name"] = spiderTool.changeName(name)

    id_match = re.search(r'http://v.qq.com/vplus/(.*?)/videos', seed)
    if id_match:
        mainId = id_match.group(1)
    self.program["mainId"] = mainId
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)

    count_match = re.search(r'共(\d+)个视频', doc)
    if count_match:
        programNum = count_match.group(1)
    euin_match = re.search(r"visited_euin : '(.*?)'", doc)
    if euin_match:
        uniqueFlag = euin_match.group(1)

    if programNum == '' or uniqueFlag == '':
        return

    last_page = int(programNum) // 25 + 1
    for page in range(1, last_page + 1):
        sub_url = 'http://c.v.qq.com/vchannelinfo?uin=%s&qm=1&pagenum=%s&num=25&sorttype=0&orderflag=1&callback=jQuery' % (uniqueFlag, page)
        self.secondSpider(sub_url)
def firstSpider(self, program):
    """Build a one-episode news programme from a node and store it as JSON.

    ``program`` is a parsed markup node. Its first ``<img>`` gives the
    poster and name, and its first ``<a>`` gives the page URL (prefixed
    with ``http:``). The main id is the text between ``id_`` and the next
    ``=`` in that URL. On success the serialized programme plus a newline
    is stored in ``self.jsonData``. When the name or main id is empty the
    method returns without setting it.
    """
    record = copy.deepcopy(BASE_CONTENT["program"])
    self.program = record
    record["ptype"] = "新闻".decode("utf8")
    record['website'] = '优酷'.decode("utf8")
    record['getTime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    record['totalSets'] = 1

    poster = ""
    name = ""
    mainId = ""
    pcUrl = ""

    image = program.find("img")
    if image is not None:
        poster = image.get("src")
        name = image.get("alt")

    anchor = program.find("a")
    if anchor is not None:
        pcUrl = "http:%s" % anchor.get("href")

    id_match = re.search(r'id_(.*?)='.decode('utf8'), pcUrl)
    if id_match:
        mainId = id_match.group(1)

    record["name"] = spiderTool.changeName(name)
    record['poster'] = spiderTool.listStringToJson('url', poster)
    record['pcUrl'] = pcUrl
    record['mainId'] = mainId

    name_missing = record['name'] == '' or record['name'] is None
    id_missing = record['mainId'] == '' or record['mainId'] is None
    if name_missing or id_missing:
        return

    json.dumps(PROGRAM_SUB)  # result is discarded; call kept as-is
    serialized = json.dumps({'program': record})
    self.jsonData = serialized + "\n"
def firstSpider(self, detail):
    """Read one iQiyi list entry into ``self.program`` and call ``secondSpider``.

    ``detail`` is a parsed markup node. A date-like text (``d-d-d``) in
    ``div.role_info`` provides the shoot year and a prefix for the main id.
    The picture block ``div.site-piclist_pic`` must contain an
    ``<a target="_blank">`` link, otherwise the method returns without
    doing anything.
    """
    shootYear = ""
    public = ""

    info_box = detail.find("div", attrs={"class": "site-piclist_info"})
    role_info = None
    if info_box is not None:
        role_info = info_box.find('div', attrs={"class": "role_info"})
    if role_info is not None:
        public = role_info.get_text().strip()
        if re.search(r'\d+-\d+-\d+', public):
            shootYear = public.split("-")[0]
        else:
            # Anything that is not a date is discarded.
            public = ""

    pic_box = detail.find("div", attrs={"class": "site-piclist_pic"})
    if pic_box is None:
        return
    link = pic_box.find("a", attrs={"target": "_blank"})
    if link is None:
        return

    name = ""
    poster = ""
    pcUrl = ""
    mainId = ""
    # Fields this spider never fills in; they are stored with empty values.
    point = 0.0
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    area = ""

    image = link.find("img")
    if image is not None:
        src = image.get("src")
        alt = image.get("alt")
        if src is not None:
            poster = src
        if alt is not None:
            name = alt

    href = link.get("href")
    if href is not None:
        pcUrl = href

    id_match = re.search(r'http://www\.iqiyi\.com/(\w+)\.html', pcUrl)
    if id_match:
        mainId = id_match.group(1)
        if shootYear != "":
            # Prefix the id with the publication date, dashes removed.
            mainId = public.replace('-', '') + mainId

    intro = name  # the entry name doubles as the description

    self.program.update({
        "name": spiderTool.changeName(name),
        'poster': spiderTool.listStringToJson('url', poster),
        'point': point,
        'shootYear': shootYear,
        'star': spiderTool.listStringToJson('name', star),
        'director': spiderTool.listStringToJson('name', director),
        'ctype': spiderTool.listStringToJson('name', ctype),
        'programLanguage': programLanguage,
        'area': spiderTool.listStringToJson('name', area),
        'intro': intro,
        'pcUrl': pcUrl,
        'mainId': mainId,
    })

    self.secondSpider(pcUrl, name, poster, public)
def firstSpider(self, seed):
    """Scrape an mgtv.com programme page and crawl its episode-list pages.

    The main id is taken from the seed URL. Director, cast, area, type and
    synopsis come from the ``<p>`` rows of the info panel on the page. The
    title, a fallback synopsis and the page count come from the episode-list
    JSON API. Each API page from 1 to ``total_page`` is then passed to
    ``secondSpider``.

    Args:
        seed: URL of the programme page.

    Returns:
        None. Returns early, without touching ``self.program``, when the
        seed yields no main id or when the API response is not a JSON
        object with an object-valued ``data`` member.
    """
    name = ""
    poster = ""
    point = 0.0
    shootYear = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    area = ""

    mainId_p = re.findall(r'http://www\.mgtv\.com/\w/\d+/(.*)\.html', seed)
    if not mainId_p or mainId_p[0] == '':
        # Nothing can be crawled without an id, so skip the page fetch too.
        return
    mainId = mainId_p[0]

    placeholder_markers = (r'暂无'.decode('utf-8'), r'未知'.decode('utf-8'))

    def is_placeholder(text):
        # True when the text contains one of the "none/unknown" markers.
        return any(re.search(marker, text) for marker in placeholder_markers)

    def link_texts(tag):
        # Text of every <a> inside the given row.
        return [a_tag.get_text() for a_tag in tag.find_all('a')]

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    videoinfo = soup.find('div', attrs={'class': 'v-panel-info v-panel-mod'})
    if videoinfo is not None:
        for pinfo in videoinfo.find_all('p'):
            pinfo_text = str(pinfo).decode('utf8')
            if re.search(r'导演:'.decode('utf-8'), pinfo_text):
                first_link = pinfo.find('a')
                director = first_link.get_text() if first_link is not None else ""
                if is_placeholder(director):
                    director = ""
            elif re.search(r'主演:'.decode('utf-8'), pinfo_text):
                # Drop entries that carry either placeholder marker.
                star = ','.join(
                    text for text in link_texts(pinfo)
                    if not is_placeholder(text))
            elif re.search(r'地区:'.decode('utf-8'), pinfo_text):
                area = ','.join(link_texts(pinfo))
            elif re.search(r'类型:'.decode('utf-8'), pinfo_text):
                ctype = ','.join(link_texts(pinfo))
            elif re.search(r'简介:'.decode('utf-8'), pinfo_text):
                intro_p = pinfo.find('span', attrs={'class': 'details'})
                if intro_p is not None:
                    intro = intro_p.get_text()

    seedJson = "http://pcweb.api.mgtv.com/episode/list?video_id=%s&page=0&size=40" % mainId
    doc = spiderTool.getHtmlBody(seedJson)
    try:
        data = json.loads(doc)
    except (TypeError, ValueError):
        return
    json_data = data.get('data') if isinstance(data, dict) else None
    if not isinstance(json_data, dict):
        return

    try:
        total_pages = int(json_data.get('total_page') or 1)
    except (TypeError, ValueError):
        total_pages = 1

    info = json_data.get('info')
    if isinstance(info, dict) and info:
        name = info.get('title', name)
        if intro == "":
            intro = info.get('desc', intro)

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['point'] = point
    self.program['shootYear'] = shootYear
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId

    for pageNo in range(1, total_pages + 1):
        subseed = "http://pcweb.api.mgtv.com/episode/list?video_id=%s&page=%s&size=40" % (
            mainId, pageNo)
        self.secondSpider(subseed)
def firstSpider(self, seed):
    """Resolve a le.com album id, load its detail JSON and crawl the video list.

    The album id (``pid``) is searched for in the page at ``seed``, first
    as an inline ``pid:`` value, then as an album URL when the page has an
    ``a.more`` link. The mobile detail API supplies the metadata stored in
    ``self.program``, and the video-list API URL for the same id is passed
    to ``secondSpider``.

    Args:
        seed: URL of the programme page.

    Returns:
        None. Returns early, without touching ``self.program``, when no
        usable pid is found, when the detail response is not valid JSON,
        or when its ``body`` member is not an object.
    """
    name = ""
    poster = ""
    point = 0.0
    shootYear = ""
    programLanguage = ""
    intro = ""
    pid = ""

    doc = spiderTool.getHtmlBody(seed)
    pid_match = (re.search(r'pid:\s*(?P<pid>\d+),', doc)
                 or re.search(r'pid:\s*"(?P<pid>\d+)"', doc))
    if pid_match:
        pid = pid_match.group('pid')
    else:
        print("not pid")

    if pid == "" or pid == "0":
        soup = BeautifulSoup(doc, from_encoding="utf8")
        more_tag = soup.find("a", attrs={'class': 'more'})
        if more_tag is not None:
            url = more_tag.get('href')
            if url is not None:
                # NOTE(review): the pattern is matched against the whole
                # page, not against ``url`` -- confirm which was intended.
                album_match = re.search(
                    r'http://www\.le\.com/tv/(?P<pid>\d+)\.html,', doc)
                if album_match:
                    pid = album_match.group('pid')

    # The original test used ``or`` and so could never fail; without a real
    # pid there is nothing to request.
    if pid == '' or pid == '0':
        return

    seed = 'http://static.app.m.letv.com/android/mod/mob/ctl/album/act/detail/id/%s/pcode/010110014/version/5.2.3.mindex.html' % (
        pid)
    mainId = pid

    doc = spiderTool.getHtmlBody(seed)
    try:
        json_data = json.loads(doc)
    except (TypeError, ValueError):
        return
    body = json_data.get("body") if isinstance(json_data, dict) else None
    if not isinstance(body, dict):
        return
    json_data = body

    def spaced_to_csv(key):
        # Unicode values have their spaces turned into commas; else "".
        value = json_data.get(key)
        if isinstance(value, types.UnicodeType):
            return value.strip().replace(" ", ",")
        return ""

    if "nameCn" in json_data:
        name = json_data["nameCn"]

    poster_dict = json_data.get("picCollections")
    if isinstance(poster_dict, dict):
        if '400*300' in poster_dict:
            poster = poster_dict['400*300']
        if poster == "":
            # No preferred size: fall back to a non-empty picture.
            for each in poster_dict:
                if poster_dict[each] != "":
                    poster = poster_dict[each]

    if "score" in json_data:
        try:
            point = float(json_data["score"])
        except (TypeError, ValueError):
            point = 0.0

    release = json_data.get("releaseDate")
    if isinstance(release, (str, types.UnicodeType)):
        full_date = re.search(r'(\d{4})-\d{2}-\d{2}', release)
        if full_date:
            shootYear = full_date.group(1)
        elif re.search(r'^\d{4}$', release):
            shootYear = release

    director = spaced_to_csv("directory")
    star = spaced_to_csv("starring")
    area = spaced_to_csv("area")
    ctype = spaced_to_csv("subCategory")

    if 'description' in json_data:
        intro = json_data['description']
    if 'language' in json_data:
        programLanguage = json_data['language']

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['point'] = point
    self.program['shootYear'] = shootYear
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId

    seed_sub = "http://static.app.m.letv.com/android/mod/mob/ctl/videolist/act/detail/id/%s/vid/25520328/b/1/s/60/o/-1/m/0/pcode/010110014/version/5.2.3.mindex.html" % (
        pid)
    self.secondSpider(seed_sub)
def firstSpider(self, seed):
    """Scrape an iqiyi.com ``/lib/`` page into ``self.program`` and call ``secondSpider``.

    Reads the poster, title, tags, director, language, voice cast and area
    from the page at ``seed``, plus the synopsis and the id embedded in the
    seed URL. The fetched page body is then handed to ``secondSpider``.

    Page elements that are missing leave the matching field empty instead
    of raising ``AttributeError``.

    Args:
        seed: URL of the library page.
    """
    name = ""
    poster = ""
    point = 0.0
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    poster_p = soup.find("div", attrs={'class': 'result_pic'})
    if poster_p is not None:
        img = poster_p.find('img')
        if img is not None:
            # A missing attribute falls back to the same empty default that
            # is used when the picture block is absent.
            poster = img.get('src') or ""

    name_p = soup.find('h1', attrs={'class': 'main_title'})
    if name_p is not None:
        title_link = name_p.find('a')
        if title_link is not None:
            name = title_link.get_text()
        # The score is the first decimal number in the heading. The old
        # code passed ``groups()`` (a tuple) to float(), which always raised.
        score_match = re.search(r'\d+\.\d+', name_p.get_text())
        if score_match:
            point = float(score_match.group())
    # NOTE(review): ``point`` is parsed but never written to self.program in
    # this method -- confirm whether it should be stored.

    detail = soup.find_all("div", attrs={"class": "topic_item clearfix"})
    for each in detail:
        ctype_p = each.find('div', attrs={"class": "look_point"})
        if ctype_p:
            ctype = ','.join(
                a_tag.get_text() for a_tag in ctype_p.find_all('a'))
        for p_tag in each.find_all('em'):
            a_str = ','.join(
                a_tag.get_text() for a_tag in p_tag.find_all('a'))
            label_text = p_tag.get_text()
            if re.search("导演:".decode("utf8"), label_text):
                director = a_str
            if re.search("语言:".decode("utf8"), label_text):
                programLanguage = a_str
            if re.search("配音:".decode("utf8"), label_text):
                star = a_str
            if re.search("地区:".decode("utf8"), label_text):
                area = a_str

    content_p = soup.find('p', attrs={"data-movlbshowmore-ele": "whole"})
    if content_p is not None:
        intro = content_p.get_text().strip()

    mainId_P = re.search(r'http://www\.iqiyi\.com/lib/(\w+)\.html', seed)
    if mainId_P:
        mainId = mainId_P.group(1)

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId

    self.secondSpider(doc)
def firstSpider(self, seed):
    """Scrape a pptv.com programme and pass it on to ``secondSpider``.

    The ``/page/`` variant of the seed provides the poster. The ``/show/``
    variant provides the ``pid`` and ``id`` values that address the
    metadata JSON. Title, people, categories, area, score and summary are
    copied from that JSON's ``data`` member into ``self.program``.

    Returns early, without touching ``self.program``, when ``pid`` or
    ``id`` cannot be found or the JSON has no ``data`` member.
    """
    def join_names(entries):
        # Comma-join the "name" (or else "text") of each dict in a list,
        # skipping empty values and ones carrying the "unknown" marker.
        collected = []
        if type(entries) is types.ListType:
            for entry in entries:
                if type(entry) is not types.DictionaryType:
                    continue
                label = ""
                if entry.has_key("name"):
                    label = entry["name"]
                elif entry.has_key("text"):
                    label = entry["text"]
                if re.search(r'未知'.decode('utf8'), label) or label == '':
                    continue
                collected.append(label)
        return ",".join(collected)

    name = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    point = ""
    area = ""

    # The poster lives on the "/page/" variant of the URL.
    seed_post = seed
    if re.search(r'http://v.pptv.com/show/\w+\.html', seed):
        seed_post = seed.replace("/show/", "/page/")
    doc = spiderTool.getHtmlBody(seed_post)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    cover_link = soup.find("a", attrs={"class": "cover-a"})
    cover_block = soup.find("div", attrs={"class": "bd cf"})
    if cover_link is not None:
        holder, attribute = cover_link, 'data-src2'
    elif cover_block is not None:
        holder, attribute = cover_block, 'src'
    else:
        holder, attribute = None, None
    if holder is not None:
        img_tag = holder.find("img")
        if img_tag is not None:
            source = img_tag.get(attribute)
            if source is not None:
                poster = source

    # The ids live on the "/show/" variant of the URL.
    if re.search(r'http://v.pptv.com/page/\w+\.html', seed):
        seed = seed.replace("/page/", "/show/")
    doc = spiderTool.getHtmlBody(seed)

    pid_match = (re.search(r'"pid":\s*(?P<pid>\d+),'.decode('utf8'), doc)
                 or re.search(r'"pid":\s*"(?P<pid>\d+)"'.decode('utf8'), doc))
    if not pid_match:
        return
    pid = pid_match.group('pid')
    # Built as in the original; not used further in this method.
    seed_sub = 'http://v.pptv.com/show/videoList?&cb=videoList&pid=' + pid

    id_match = (re.search(r'"id":\s*(?P<id>\d+),'.decode('utf8'), doc)
                or re.search(r'"id":\s*"(?P<id>\d+)"'.decode('utf8'), doc))
    if not id_match:
        return
    channel_id = id_match.group('id')
    seedData = 'http://svcdn.pptv.com/show/v1/meta.json?cid=%s&sid=%s&psize=50' % (
        channel_id, pid)
    mainId = "dianying" + channel_id

    doc = spiderTool.getHtmlBody(seedData)
    try:
        json_data = json.loads(doc)
    except:
        json_data = {}
    if not json_data.has_key("data"):
        return
    data = json_data["data"]

    if data.has_key("title"):
        title = data["title"]
        if type(title) is types.UnicodeType:
            name = title
    if data.has_key("directors"):
        director = join_names(data["directors"])
    if data.has_key("actors"):
        star = join_names(data["actors"])
    if data.has_key("ct"):
        ctype = join_names(data["ct"])
    if data.has_key("area"):
        area = join_names(data["area"])
    if data.has_key("score"):
        point = data["score"]
        try:
            point = float(point)
        except:
            point = 0.0
    if data.has_key("summary"):
        intro = data["summary"]

    self.program["name"] = spiderTool.changeName(name)
    self.program['pcUrl'] = seed
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['point'] = point
    self.program['mainId'] = mainId

    self.secondSpider(seed, name)
def firstSpider(self, seed):
    """Scrape a bilibili video page into ``self.program`` and call ``secondSpider``.

    Reads the cover image (prefixing ``http:`` when the address does not
    start with ``http://``), the title, the introduction and the tag list
    from the page at ``seed``. The main id is the path segment captured
    from the seed URL. The remaining fields are stored with their fixed
    defaults.
    """
    name = ""
    poster = ""
    intro = ""
    ctype = ""
    mainId = ""
    # Fields this spider never scrapes; they are stored with fixed values.
    star = ""
    director = ""
    shootYear = ""
    area = ""
    alias = ""
    point = -1.0
    playTimes = 0

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    # get poster
    cover = soup.find('img', attrs={'class': 'cover_image'})
    if cover is not None:
        poster = cover.get('src')
        if not re.match(r'http://', poster):
            poster = "http:%s" % poster

    # get name
    page_body = soup.find('div', attrs={'class': 'b-page-body'})
    title_box = None
    if page_body is not None:
        title_box = page_body.find('div', attrs={'class': 'v-title'})
    if title_box is not None:
        name = title_box.get_text()

    v_info = soup.find('div', attrs={'class': 'v_info'})
    if v_info is not None:
        # get intro
        intro_box = v_info.find('div', attrs={'class': 'intro'})
        if intro_box is not None:
            intro = intro_box.get_text().strip()
        # get ctype
        tag_box = v_info.find('div', attrs={'class': 's_tag'})
        if tag_box is not None:
            ctype = ','.join(
                [li.get_text() for li in tag_box.find_all('li')])

    id_match = re.match(r'http://www\.bilibili\.com/video/(.*)/', seed)
    if id_match:
        mainId = id_match.group(1)

    self.program.update({
        'name': spiderTool.changeName(name),
        "alias": spiderTool.changeName(alias),
        "point": float(point),
        'poster': spiderTool.listStringToJson('url', poster),
        'star': spiderTool.listStringToJson('name', star),
        'director': spiderTool.listStringToJson('name', director),
        'ctype': spiderTool.listStringToJson('name', ctype),
        'shootYear': shootYear,
        'area': spiderTool.listStringToJson('name', area),
        'playTimes': long(playTimes),
        'intro': intro,
        'mainId': mainId,
    })

    self.secondSpider()
def firstSpider(self, seed):
    """Scrape an iqiyi album page and crawl its paged episode lists.

    Reads poster, title, cast/crew labels and the introduction from the
    HTML at ``seed`` and stores them in ``self.program``.  When the page
    source exposes ``albumId`` and ``cid``, ``mainId`` is set to
    ``<cid>_<albumId>`` and ``self.secondSpider`` is called once per
    50-item page of the album's ``avlist`` feed.

    Args:
        seed: URL of the album page.
    """
    name = ""
    poster = ""
    intro = ""

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    # Poster: either the url(...) of the first focus slide's inline style,
    # or the <img> of the result picture box (which also carries a title).
    poster_p = soup.find("ul", attrs={'class': 'focus_img_list'})
    poster_p_1 = soup.find("div", attrs={"class": "result_pic pr"})
    if poster_p is not None:
        first_li = poster_p.find('li')
        style = first_li.get('style') if first_li is not None else None
        # Both the <li> and its style attribute may be absent.
        if style:
            poster_m = re.search(r'\((.*?)\)'.decode('utf8'), style)
            if poster_m:
                poster = poster_m.group(1)
    elif poster_p_1 is not None:
        img_tag = poster_p_1.find("img")
        if img_tag is not None:
            poster = img_tag.get("src") or ""
            name = img_tag.get("alt") or ""

    name_p = soup.find('a', attrs={'class': 'white'})
    if name_p is not None:
        name = name_p.get_text()

    # Each labelled node ("导演:" etc.) lists its values as <a> children.
    labels = (
        ("主演:".decode("utf8"), 'star'),
        ("导演:".decode("utf8"), 'director'),
        ("类型:".decode("utf8"), 'ctype'),
        ("语言:".decode("utf8"), 'programLanguage'),
        ("地区:".decode("utf8"), 'area'),
    )
    detail = soup.find("div", attrs={"class": "result_detail-minH"})
    detail_1 = soup.find("div", attrs={"class": "msg-hd-lt fl"})
    if detail is not None:
        nodes = []
        for div_p in detail.find_all(
                "div", attrs={"class": "topic_item clearfix"}):
            nodes.extend(div_p.find_all("div"))
    elif detail_1 is not None:
        nodes = detail_1.find_all("p")
    else:
        nodes = []

    found = {}
    for node in nodes:
        text = node.get_text()
        a_str = ",".join(a_tag.get_text() for a_tag in node.find_all('a'))
        for label, key in labels:
            if re.search(label, text):
                found[key] = a_str
    star = found.get('star', "")
    director = found.get('director', "")
    ctype = found.get('ctype', "")
    programLanguage = found.get('programLanguage', "")
    area = found.get('area', "")

    # Introduction: text after the "简介:" label.
    intro_label = "简介:".decode("utf8")
    content_p = soup.find('span', attrs={
        "class": "showMoreText",
        "data-moreorless": "moreinfo",
        "style": "display: none;"
    })
    content_p_1 = soup.find('div', attrs={"data-moreorless": "moreinfo"})
    intro_node = None
    if content_p is not None:
        inner = content_p.find("span")
        intro_node = inner if inner is not None else content_p
    elif content_p_1 is not None:
        intro_node = content_p_1
    if intro_node is not None:
        intro_text = intro_node.get_text()
        if re.search(intro_label, intro_text):
            intro = intro_text.split(intro_label)[1].strip()

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro

    seed_P = re.search(
        r'albumId:\s*(?P<albumId>\d+)[^\\]*?cid:\s*(?P<cid>\d+)'.decode('utf8'),
        doc)
    if not seed_P:
        return
    albumId = seed_P.group('albumId')
    cid = seed_P.group('cid')
    self.program['mainId'] = cid + "_" + albumId

    seed_sub = 'http://cache.video.iqiyi.com/jp/avlist/%s/' % (albumId)
    allNum = 0
    doc = spiderTool.getHtmlBody(seed_sub)
    try:
        # The feed is "var x={...}".  Split on the FIRST '=' only; the JSON
        # itself may contain '=' (e.g. inside URLs) and would be truncated.
        json_data = json.loads(doc.split('=', 1)[1])
        allNum = int(json_data["data"]["allNum"])
    except Exception:
        # Best effort: an unreadable feed still gets its first page tried.
        allNum = 0

    # 50 items per page, rounded up; always request at least page 1.
    page_count = max(1, (allNum + 49) // 50)
    for i in range(1, page_count + 1):
        seed_sub = 'http://cache.video.iqiyi.com/jp/avlist/%s/%s/50/' % (
            albumId, str(i))
        self.secondSpider(seed_sub)
def firstSpider(self, seed):
    """Scrape a yinyuetai video page into ``self.program``.

    Title, poster, introduction and area come from the page's Open Graph
    ``<meta property="og:...">`` tags; ``mainId`` comes from the URL.  All
    fields are written to ``self.program`` and ``self.secondSpider()`` is
    called afterwards.

    Args:
        seed: URL of the video page. ``mainId`` stays empty unless the URL
            starts with ``http://v.yinyuetai.com/video/``.
    """
    name = ""
    poster = ""
    intro = ""
    area = ""
    mainId = ""
    # Fields this page does not provide keep their defaults.
    star = ""
    director = ""
    ctype = ""
    shootYear = ""
    alias = ""
    point = -1.0
    playTimes = 0

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    # A page without <head> (error page, empty body) must not crash here.
    head = soup.find('head')
    meta_list = head.find_all('meta') if head is not None else []
    for meta in meta_list:
        content = meta.get('content')
        if content is None:
            # Keep the default rather than storing None in a text field.
            continue
        prop = meta.get('property')
        if prop == "og:title":
            name = content
        elif prop == "og:image":
            poster = content
        elif prop == "og:description":
            intro = content
        elif prop == "og:area":
            area = content

    # get mainId
    id_match = re.match(r'http://v\.yinyuetai\.com/video/(.*)', seed)
    if id_match:
        mainId = id_match.group(1)

    self.program["alias"] = spiderTool.changeName(alias)
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = long(playTimes)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    self.program["name"] = spiderTool.changeName(name)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.secondSpider()
def firstSpider(self, seed):
    """Scrape an iqiyi programme page keyed by ``sourceId``.

    Reads poster, title, host/type/language/area labels and the
    introduction from the HTML at ``seed`` and stores them in
    ``self.program``.  When the page source exposes ``sourceId`` and
    ``cid``, ``mainId`` is set to the source id and ``self.secondSpider``
    is called with the matching ``sdvlst`` feed URL.

    Args:
        seed: URL of the programme page.
    """
    name = ""
    poster = ""
    director = ""
    intro = ""

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    # Poster (and usually the title via alt) from the first layout found.
    poster_p = soup.find("img", attrs={'width': '195'})
    poster_p_1 = soup.find(
        'div', attrs={'class': 'album-picCon album-picCon-onePic'})
    poster_p_2 = soup.find('ul', attrs={'class': 'album-imgs'})
    if poster_p is not None:
        poster = poster_p.get("src") or ""
        # A missing alt must stay "" so the <h1> fallback below can run.
        name = poster_p.get("alt") or ""
    elif poster_p_1 is not None:
        img_tag = poster_p_1.find("img")
        if img_tag is not None:
            poster = img_tag.get("src") or ""
            name = img_tag.get("alt") or ""
    elif poster_p_2 is not None:
        img_tag = poster_p_2.find('img')
        if img_tag is not None:
            poster = img_tag.get('src') or ""

    if name == "":
        name_p = soup.find('div', attrs={'class': 'album-playArea clearfix'})
        heading = name_p.find('h1') if name_p is not None else None
        if heading is not None:
            name = heading.get_text()

    # Each labelled node ("类型:" etc.) lists its values as <a> children.
    labels = (
        ("主持人:".decode("utf8"), 'star'),
        ("类型:".decode("utf8"), 'ctype'),
        ("语言:".decode("utf8"), 'programLanguage'),
        ("地区:".decode("utf8"), 'area'),
    )
    detail = soup.find("div", attrs={"class": "result_detail-minH"})
    detail_1 = soup.find("div", attrs={"class": "msg-hd-lt fl"})
    if detail is not None:
        nodes = []
        for div_p in detail.find_all(
                "div", attrs={"class": "topic_item clearfix"}):
            nodes.extend(div_p.find_all("div"))
    elif detail_1 is not None:
        nodes = detail_1.find_all("p")
    else:
        nodes = []

    found = {}
    for node in nodes:
        text = node.get_text()
        a_str = ",".join(a_tag.get_text() for a_tag in node.find_all('a'))
        for label, key in labels:
            if re.search(label, text):
                found[key] = a_str
    star = found.get('star', "")
    ctype = found.get('ctype', "")
    programLanguage = found.get('programLanguage', "")
    area = found.get('area', "")

    # Introduction: text after "简介:" in the hidden "more" span, else the
    # whole text of the big-picture caption.
    intro_label = "简介:".decode("utf8")
    content_p = soup.find('span', attrs={
        "class": "showMoreText",
        "data-moreorless": "moreinfo",
        "style": "display: none;"
    })
    content_p_1 = soup.find("span", attrs={"class": "bigPic-b-jtxt"})
    if content_p is not None:
        inner = content_p.find("span")
        if inner is not None:
            content_p = inner
        intro_text = content_p.get_text()
        if re.search(intro_label, intro_text):
            intro = intro_text.split(intro_label)[1].strip()
    elif content_p_1 is not None:
        intro = content_p_1.get_text()

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro

    seed_P = re.search(
        r'sourceId:\s*(?P<sourceId>\d+),\s+cid:\s*(?P<cid>\d+)', doc)
    if seed_P:
        sourceId = seed_P.group('sourceId')
        cid = seed_P.group('cid')
        self.program['mainId'] = sourceId
        seed_sub = 'http://cache.video.iqiyi.com/jp/sdvlst/%s/%s/' % (
            cid, sourceId)
        self.secondSpider(seed_sub)
def firstSpider(self, seed):
    """Scrape an iqiyi page that is keyed by ``sourceId`` or ``albumId``.

    Reads title, poster, cast and the labelled detail rows from the HTML at
    ``seed`` and stores them in ``self.program``.  The episode feed is then
    chosen from the page source: a non-zero ``sourceId`` selects the
    ``sdvlst`` feed; otherwise an ``albumId`` selects the ``avlist`` feed.
    ``self.secondSpider`` is called with the chosen feed URL.

    Args:
        seed: URL of the programme page.
    """
    name = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    area = ""

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    name_p = soup.find('a', attrs={'class': 'white'})
    if name_p is not None:
        name = name_p.get_text()

    poster_p = soup.find("ul", attrs={'class': 'album-imgs'})
    if poster_p is not None:
        img_tag = poster_p.find('img')
        if img_tag is not None:
            poster = img_tag.get("src") or ""

    starts_P = soup.find("dd", attrs={"class": "actor"})
    if starts_P is not None:
        star = ",".join(each.get_text() for each in starts_P.find_all('a'))

    # Labelled <p> rows; values are the row's <a> children.  A "配音:"
    # (voice cast) row overrides the actor list collected above.
    detail = soup.find("div", attrs={"class": "msg-hd-lt fl"})
    if detail is not None:
        for p_tag in detail.find_all('p'):
            text = p_tag.get_text()
            a_str = ','.join(
                a_tag.get_text() for a_tag in p_tag.find_all('a'))
            if re.search("导演:".decode("utf8"), text):
                director = a_str
            if re.search("语言:".decode("utf8"), text):
                programLanguage = a_str
            if re.search("配音:".decode("utf8"), text):
                star = a_str
            if re.search("地区:".decode("utf8"), text):
                area = a_str
            if re.search("类型:".decode("utf8"), text):
                ctype = a_str

    # The tag name used to be 'div ' (trailing space), which never matches
    # any element, so the introduction was always left empty.
    content_p = soup.find('div', attrs={"data-moreorless": "lessinfo"})
    if content_p is not None:
        intro_label = "简介:".decode("utf8")
        intro_text = content_p.get_text()
        if re.search(intro_label, intro_text):
            intro = intro_text.split(intro_label)[1].strip()

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro

    seed_P = re.search(
        r'sourceId:\s*(?P<sourceId>\d+),\s*cid:\s*(?P<cid>\d+)', doc)
    if seed_P:
        sourceId = seed_P.group('sourceId')
        cid = seed_P.group('cid')
        self.program['mainId'] = sourceId
        # sourceId "0" falls through to the albumId lookup below.
        if sourceId != '0':
            seed_sub = 'http://cache.video.iqiyi.com/jp/sdvlst/%s/%s/' % (
                cid, sourceId)
            self.secondSpider(seed_sub)
            return

    seed_P = re.search(
        r'albumId:\s*(?P<albumId>\d+),[^\\]*?cid:\s*(?P<cid>\d+)', doc)
    if seed_P:
        albumId = seed_P.group('albumId')
        self.program['mainId'] = albumId
        seed_sub = 'http://cache.video.qiyi.com/jp/avlist/%s/' % (albumId)
        self.secondSpider(seed_sub)
        return
def firstSpider(self, seed):
    """Scrape a yehetang movie page into ``self.program``.

    Reads the poster and the labelled ``<ul>`` rows (name, status,
    director, actors, area, language, plot) of the summary box.  Returns
    without storing anything when the status row marks the title as a
    trailer ("预告").  Otherwise fills ``self.program`` and, when the
    ``play_1`` list exists, passes its ``<li>`` items to
    ``self.secondSpider``.

    Args:
        seed: URL of the movie page; also the key looked up in
            ``self.ctype`` for the programme type.
    """

    def row_value(ul, label):
        # Stripped text of the last <li> whose markup lacks the label, or
        # None when every <li> is a label cell.
        value = None
        for li in ul.find_all('li'):
            if not re.search(label, str(li).decode('utf8')):
                value = li.get_text().strip()
        return value

    name = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    detail_p = soup.find("div", attrs={'class': 'k_jianjie'})
    if detail_p is not None:
        # get poster
        poster_p = detail_p.find("div", attrs={'id': 'k_jianjie-2b'})
        img = poster_p.find('img') if poster_p is not None else None
        if img is not None:
            poster = img.get('src') or ""
            # A missing alt stays "" so the name row below can fill it.
            name = img.get('alt') or ""

        # get detail
        detail_1 = detail_p.find("div", attrs={'id': 'k_jianjie-3a'})
        rows = detail_1.find_all('ul') if detail_1 is not None else []
        for ul in rows:
            ul_p = str(ul).decode('utf8')
            # A <ul> without any <li> must not crash the class lookup.
            first_li = ul.find('li')
            first_class = None
            if first_li is not None:
                first_class = first_li.get('class')

            if first_class == ['k_jianjie-3a-1-name'] and name == "":
                name = first_li.get_text()
            elif re.search(r'状态:'.decode('utf8'), ul_p):
                if re.search(r'预告'.decode('utf8'), ul_p):
                    return
            elif re.search(r'别名:'.decode('utf8'), ul_p):
                # The alias row is recognised so it is not misread by the
                # branches below; this spider does not store the alias.
                pass
            elif re.search(r'导演:'.decode('utf8'), ul_p):
                raw = row_value(ul, r'导演:'.decode('utf8'))
                if raw is not None:
                    director = ','.join(raw.replace('/', '').split())
            elif re.search(r'演员:'.decode('utf8'), ul_p):
                raw = row_value(ul, r'演员:'.decode('utf8'))
                if raw is not None:
                    star = ','.join(raw.replace('/', '').split())
            elif re.search(r'地区:'.decode('utf8'), ul_p):
                raw = row_value(ul, r'地区:'.decode('utf8'))
                if raw is not None:
                    area = raw
            elif re.search(r'语言:'.decode('utf8'), ul_p):
                raw = row_value(ul, r'语言:'.decode('utf8'))
                if raw is not None:
                    programLanguage = raw
            elif re.search(r'剧情:'.decode('utf8'), ul_p):
                raw = row_value(ul, r'剧情:'.decode('utf8'))
                if raw is not None:
                    intro = raw

    # get mainId
    id_match = re.match(r'http://www\.yehetang\.com/movie/(.*)\.html', seed)
    if id_match:
        mainId = id_match.group(1)

    if self.ctype.get(seed):
        ctype = self.ctype[seed]

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId

    video_p = soup.find('div', attrs={'id': 'play_1'})
    if video_p is not None:
        video_list = video_p.find_all('li')
        self.secondSpider(video_list)
def firstSpider(self, seed):
    """Scrape a Tencent Video detail page into ``self.program``.

    Collects poster/title, the labelled type rows (area, language, release
    year), the tag list, the people list (directors vs. actors) and the
    description, derives ``mainId`` from the URL, stores everything in
    ``self.program`` and calls ``self.secondSpider`` with the playlist URL
    built from ``mainId``.

    Args:
        seed: URL of the detail page.
    """
    poster = star = director = ctype = shootYear = ""
    intro = mainId = area = name = programLanguage = ""

    html = spiderTool.getHtmlBody(seed)
    page = BeautifulSoup(html, from_encoding="utf8")

    # Cover image: src gets an "http:" prefix, alt doubles as the title.
    cover = page.find("img", attrs={'itemprop': 'image'})
    if cover is not None:
        src = cover.get("src")
        if src is not None:
            poster = "http:%s" % src
        alt = cover.get("alt")
        if alt is not None:
            name = alt

    # Labelled rows; the value lives in the row's "type_txt" span.
    type_box = page.find("div", attrs={"class": "video_type cf"})
    if type_box is not None:
        for item in type_box.find_all('div', attrs={"class": "type_item"}):
            row_text = item.get_text()
            value_span = item.find('span', attrs={"class": "type_txt"})
            if value_span is None:
                continue
            if re.search(r'地 区:'.decode("utf8"), row_text):
                area = value_span.get_text()
            if re.search(r'语 言:'.decode("utf8"), row_text):
                programLanguage = value_span.get_text()
            if re.search(r'上映时间:'.decode("utf8"), row_text):
                # Keep only the part before the first "-".
                shootYear = value_span.get_text().split("-")[0]

    tag_box = page.find("div", attrs={"class": "tag_list"})
    if tag_box is not None:
        ctype = ",".join([a.get_text() for a in tag_box.find_all('a')])

    # People list: an entry with a "director" span is a director,
    # every other named entry is an actor.
    people = page.find("ul", attrs={"class": "actor_list cf"})
    if people is not None:
        directors = []
        actors = []
        for entry in people.find_all('li'):
            name_span = entry.find("span", attrs={'class': 'name'})
            if name_span is None:
                continue
            person = name_span.get_text()
            if person == "":
                continue
            if entry.find("span", attrs={"class": "director"}):
                directors.append(person)
            else:
                actors.append(person)
        star = ",".join(actors)
        director = ",".join(directors)

    desc = page.find('span', attrs={"class": "txt _desc_txt_lineHight"})
    if desc is not None:
        intro = desc.get_text().strip()

    # Two URL layouts are recognised; the later match wins.
    film_hit = re.match(
        r'http://film\.qq\.com/(page|cover)/.*/(.*?).html', seed)
    if film_hit:
        mainId = film_hit.group(2)
    detail_hit = re.match(
        r'https://v\.qq\.com/detail/[\w\d]/(.*?).html', seed)
    if detail_hit:
        mainId = detail_hit.group(1)

    program = self.program
    program["name"] = spiderTool.changeName(name)
    program['poster'] = spiderTool.listStringToJson('url', poster)
    program['star'] = spiderTool.listStringToJson('name', star)
    program['director'] = spiderTool.listStringToJson('name', director)
    program['ctype'] = spiderTool.listStringToJson('name', ctype)
    program['shootYear'] = shootYear
    program['area'] = spiderTool.listStringToJson('name', area)
    program['programLanguage'] = programLanguage
    program['intro'] = intro
    program['mainId'] = mainId

    playlist_url = (
        'https://s.video.qq.com/loadplaylist?vkey=897_qqvideo_cpl_%s_qq'
        % mainId)
    self.secondSpider(playlist_url)
def firstSpider(self, seed):
    """Scrape an XML ``<video>`` record into ``self.program``.

    ``mainId`` is taken from the ``videoId=<digits>`` query parameter of
    ``seed``.  Each text field is read from the parsed ``<video>`` element
    when the parser finds the tag, otherwise from a regex over the raw
    document; CDATA markers are stripped either way.  Returns early when
    the document has no ``<video>`` element.  When a
    ``videomergeinfolist`` element exists it is handed to
    ``self.secondSpider``.

    Args:
        seed: URL of the XML record.
    """

    def read_field(video, raw_doc, tag):
        # Parsed element first, raw-text regex second, "" when neither hits.
        node = video.find(tag)
        if node is not None:
            raw = node.get_text()
        else:
            hit = re.search(r'<%s>(.*?)</%s>' % (tag, tag), raw_doc)
            if not hit:
                return ""
            raw = hit.group(1)
        return raw.replace('<![CDATA[', '').replace(']]>', '').strip()

    point = 0.0
    programLanguage = ""
    mainId = ""

    id_hit = re.search(r'videoId=(?P<pid>\d+)', seed)
    if id_hit:
        mainId = id_hit.group('pid')

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    data = soup.find('video')
    if data is None:
        return

    name = read_field(data, doc, 'name')
    poster = read_field(data, doc, 'smallImg')
    shootYear = read_field(data, doc, 'screenTime')
    area = read_field(data, doc, 'area')
    director = read_field(data, doc, 'director')
    star = read_field(data, doc, 'performer')
    ctype = read_field(data, doc, 'cate')
    intro = read_field(data, doc, 'annotation')

    record = self.program
    record["name"] = spiderTool.changeName(name)
    record['poster'] = spiderTool.listStringToJson('url', poster)
    record['point'] = point
    # Only the part of the screen time before the first "-" is kept.
    record['shootYear'] = shootYear.split("-")[0]
    record['star'] = spiderTool.listStringToJson('name', star)
    record['director'] = spiderTool.listStringToJson('name', director)
    record['ctype'] = spiderTool.listStringToJson('name', ctype)
    record['programLanguage'] = programLanguage
    record['area'] = spiderTool.listStringToJson('name', area)
    record['intro'] = intro
    record['mainId'] = mainId

    detail_sub = data.find('videomergeinfolist')
    if detail_sub is None:
        return
    self.secondSpider(detail_sub)
def firstSpider(self, seed):
    """Scrape an acfun video page into ``self.program``.

    The poster comes from the ``#pageInfo`` div and the introduction from
    the ``introduction`` block.  Title, tag list and the video list come
    from the ``pageInfo = {...}`` JSON embedded in a ``<script>`` element.
    ``self.secondSpider`` is called with the video list (an empty list when
    the JSON was not found or could not be decoded).

    Args:
        seed: URL of the video page. ``mainId`` stays empty unless the URL
            starts with ``http://www.acfun.tv/v/``.
    """
    poster = ""
    intro = ""
    mainId = ""
    name = ""
    ctype = ""
    # Fields this page does not provide keep their defaults.
    star = ""
    director = ""
    shootYear = ""
    area = ""
    alias = ""
    point = -1.0
    playTimes = 0

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    page_div = soup.find('div', attrs={'id': 'pageInfo'})
    if page_div is not None:
        # .get() returns None when the attribute is missing.
        poster = page_div.get('data-pic') or ""

    # get intro
    introPage = soup.find('div', attrs={'class': 'introduction'})
    if introPage is not None:
        intro_p = introPage.find('div', attrs={'class': 'desc gheight'})
        inner = intro_p.find('div') if intro_p is not None else None
        if inner is not None:
            intro = inner.get_text()

    # get mainId
    id_match = re.match(r'http://www\.acfun\.tv/v/(.*)', seed)
    if id_match:
        mainId = id_match.group(1)

    self.program["alias"] = spiderTool.changeName(alias)
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = long(playTimes)
    self.program['intro'] = intro
    self.program['mainId'] = mainId

    # The soup parsed above is reused; nothing has modified it, so a second
    # parse of the same document is unnecessary.
    videoList = []
    for script in soup.find_all('script'):
        if not re.match(r'.*pageInfo.*', ''.join(script)):
            continue
        pageInfo_p = re.findall(r'pageInfo\s+=\s+(.*)', script.get_text())
        if not pageInfo_p:
            continue
        try:
            pageInfo_json = json.loads(pageInfo_p[0])
        except ValueError:
            # Undecodable payload: stop scanning, keep what we have.
            break
        if not isinstance(pageInfo_json, dict):
            continue
        # get ctype
        ctype = ','.join(
            tag['name'] for tag in pageInfo_json.get('tagList') or []
            if isinstance(tag, dict) and 'name' in tag)
        # get name
        name = pageInfo_json.get('title') or ""
        videoList = pageInfo_json.get('videoList') or []

    self.program["name"] = spiderTool.changeName(name)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.secondSpider(videoList)
def firstSpider(self, seed):
    """Scrape a bilibili bangumi (series) page into ``self.program``.

    Reads poster, title, tag spans and description from the
    ``info-content`` block, takes ``mainId`` from the URL, stores every
    field in ``self.program`` and passes the episode ``<li>`` items whose
    link has class ``v1-complete-text`` to ``self.secondSpider``.

    Args:
        seed: URL of the series page. ``mainId`` stays empty unless the URL
            starts with ``http://bangumi.bilibili.com/anime/``.
    """
    poster = ""
    name = ""
    ctype = ""
    intro = ""
    mainId = ""
    # Fields this page does not provide keep their defaults.
    star = ""
    director = ""
    shootYear = ""
    area = ""
    alias = ""
    point = -1.0
    playTimes = 0

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    introPage = soup.find('div', attrs={'class': 'info-content'})
    if introPage is not None:
        # get poster
        poster_p = introPage.find('div', attrs={'class': 'bangumi-preview'})
        img = poster_p.find('img') if poster_p is not None else None
        if img is not None:
            # .get() returns None when the tag has no src attribute.
            poster = img.get('src') or ""
            # Only scheme-less URLs need a scheme; checking for "http://"
            # alone used to turn "https://..." into "http:https://...".
            if poster and not re.match(r'https?://', poster):
                poster = "http:%s" % poster

        # get info
        info_p = introPage.find('div', attrs={'class': 'bangumi-info-r'})
        if info_p is not None:
            head = info_p.find('div', attrs={'class': 'b-head'})
            if head is not None:
                # get name
                title = head.find('h1')
                if title is not None:
                    name = title.get_text()
                # get ctype: one entry per <span> of the header
                ctype = ','.join(
                    span.get_text() for span in head.find_all('span'))
            # get desc
            info_desc = info_p.find('div', attrs={'class': 'info-desc'})
            if info_desc is not None:
                intro = info_desc.get_text().strip()

    # get mainId
    id_match = re.match(r'http://bangumi\.bilibili\.com/anime/(.*)', seed)
    if id_match:
        mainId = id_match.group(1)

    self.program['name'] = spiderTool.changeName(name)
    self.program["alias"] = spiderTool.changeName(alias)
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = long(playTimes)
    self.program['intro'] = intro
    self.program['mainId'] = mainId

    videoList = []
    subprg_list = soup.find_all(
        'li', attrs={'class': 'v1-bangumi-list-part-child'})
    for li in subprg_list:
        link = li.find('a')
        # An item without a link is skipped instead of raising.
        if link is not None and link.get('class') == ['v1-complete-text']:
            videoList.append(li)
    self.secondSpider(videoList)
def firstSpider(self, seed):
    """Scrape a sohu album via its playlist JSONP feed.

    The playlist id is searched in the page source (``playlistId`` or
    ``PLAYLIST_ID``) and then in the album URL; without one the method
    returns.  The ``videolist`` feed for that id is downloaded, re-encoded
    from GBK to UTF-8, unwrapped from its ``__get_videolist(...)`` call and
    decoded; any failure there also returns.  Album fields present in the
    feed are stored in ``self.program`` and, when the feed has a
    ``videos`` entry, it is passed to ``self.secondSpider``.

    Args:
        seed: URL of the album page.
    """
    point = 0.0
    alias = ""
    playTimes = 0
    poster = name = shootYear = area = ""
    star = director = ctype = intro = ""

    page = spiderTool.getHtmlBody(seed)
    id_hit = (re.search(r'playlistId\s*=\s*"(\d+)"', page)
              or re.search(r'PLAYLIST_ID\s*=\s*"(\d+)"', page)
              or re.search(r'http://film\.sohu\.com/album/(\d+)\.html', seed))
    if not id_hit:
        return
    mainId = id_hit.group(1)

    list_url = "http://pl.hd.sohu.com/videolist?playlistid=%s&callback=__get_videolist" % (
        mainId)
    try:
        payload = spiderTool.getHtmlBody(list_url).decode('gbk').encode('utf8')
        # Drop the JSONP wrapper: text after "__get_videolist(" minus the
        # final two characters.
        data = json.loads(payload.split('__get_videolist(')[1][:-2])
    except:
        return

    if data.has_key('albumName'):
        name = data['albumName']
    if data.has_key('mainActors'):
        star = ','.join(data['mainActors'])
    if data.has_key('categories'):
        ctype = ','.join(data['categories'])
    if data.has_key('publishYear'):
        shootYear = str(data['publishYear'])
    if data.has_key('albumDesc'):
        intro = data['albumDesc']
    if data.has_key('largeVerPicUrl'):
        poster = data['largeVerPicUrl']
    if data.has_key('directors'):
        director = ','.join(data['directors'])
    if data.has_key('area'):
        area = data['area']

    record = self.program
    record["name"] = spiderTool.changeName(name)
    record["alias"] = spiderTool.changeName(alias)
    record["point"] = float(point)
    record['poster'] = spiderTool.listStringToJson('url', poster)
    record['star'] = spiderTool.listStringToJson('name', star)
    record['director'] = spiderTool.listStringToJson('name', director)
    record['ctype'] = spiderTool.listStringToJson('name', ctype)
    record['shootYear'] = shootYear
    record['area'] = spiderTool.listStringToJson('name', area)
    record['playTimes'] = long(playTimes)
    record['intro'] = intro
    record['mainId'] = mainId

    if not data.has_key('videos'):
        return
    self.secondSpider(data['videos'])
def firstSpider(self, seed):
    """Scrape a youku show page and crawl its episode-list stages.

    When ``seed`` is a play page (``v_show``) or anything other than a
    ``show_page`` URL, the link to the show page is looked up in the HTML
    and that page is fetched instead.  The ``p-base`` list items are then
    parsed for rating, title, release year, area, type, director, cast,
    play count and introduction, and the results are stored in
    ``self.program``.  Finally the ``showid`` found in the page's
    ``PageConfig`` script is used to download the show-info module, and
    ``self.secondSpider`` is called once per ``reload_<n>`` id found in it,
    highest number first.

    NOTE(review): this code targets Python 2 (``str.decode``, ``long``,
    ``list.sort`` with a comparison function).
    """
    point = 0.0
    poster = ""
    name = ""
    shootYear = ""
    alias = ""
    area = ""
    star = ""
    director = ""
    ctype = ""
    playTimes = 0
    intro = ""
    mainId = ""
    # Year taken from a "pub" row that mentions "优酷"; used only as a
    # fallback when no other "pub" row supplied shootYear.
    youkushootYear = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc)
    seed_v = re.search(r'http://v\.youku\.com/v_show/id_', seed)
    seed_Re = re.search(r'http://www\.youku\.com/show_page/id', seed)
    # Hop from the given page to the show page; the href is prefixed with
    # "http:" unconditionally.
    if seed_v:
        seed_P = soup.find('a', attrs={'class': 'desc-link'})
        if seed_P is not None:
            seed = seed_P.get('href')
            seed = "http:%s" % seed
            doc = spiderTool.getHtmlBody(seed)
            soup = BeautifulSoup(doc)
    elif not seed_Re:
        seed_P = soup.find('h1', attrs={'class': 'title'})
        if seed_P is not None:
            seed_aTag = seed_P.find('a')
            if seed_aTag is not None:
                seed = seed_aTag.get('href')
                seed = "http:%s" % seed
                doc = spiderTool.getHtmlBody(seed)
                soup = BeautifulSoup(doc)
    poster_p = soup.find("div", attrs={'class': 'p-thumb'})
    if poster_p is not None:
        # NOTE(review): raises AttributeError if the thumb has no <img>.
        poster = poster_p.find('img').get("src")
    p_base_content = soup.find('div', attrs={'class': 'p-base'})
    if p_base_content is not None:
        # Each <li> is classified by the first branch that matches, so the
        # order of the branches matters.
        for li in p_base_content.find_all('li'):
            li_p = str(li)
            if li.find('span', attrs={'class': 'star-num'}) is not None:
                # Rating text; converted with float() further down.
                point = li.find('span', attrs={
                    'class': 'star-num'
                }).get_text()
            elif li.get('class') == ['p-row', 'p-title']:
                # Title: markup between "/a>:" and the next "<span".
                name_p = re.findall(r'/a>:(.*)<span'.decode('utf8'),
                                    li_p.decode('utf8'))
                if name_p:
                    name = name_p[0]
                    # Still contains markup: cut again at the first
                    # remaining "<span".
                    if re.search(r'<'.decode('utf8'), name):
                        name_p = re.findall(r'(.*)<span'.decode('utf8'), name)
                        if name_p:
                            name = name_p[0]
            elif li.find('span', attrs={'class': 'pub'}) is not None:
                shootYear_P = li.find('span', attrs={'class': 'pub'})
                if re.search(r'优酷'.decode('utf8'), li_p.decode('utf8')):
                    youkushootYear_text = re.findall(
                        r'/label>(.*)</span', str(shootYear_P))
                    if youkushootYear_text:
                        youkushootYear_text = youkushootYear_text[0]
                        # Keep the part before the first "-".
                        youkushootYear = ''.join(
                            youkushootYear_text.split('-')[0])
                else:
                    shootYear_text = re.findall(r'/label>(.*)</span',
                                                str(shootYear_P))
                    if shootYear_text:
                        shootYear_text = shootYear_text[0]
                        shootYear = ''.join(shootYear_text.split('-')[0])
            elif re.search(r'<li>地区'.decode('utf8'), li_p.decode('utf8')):
                area_p = li.get_text()
                area_p = re.findall(r'地区:(.*)'.decode('utf8'), area_p)
                if area_p:
                    area = area_p[0]
                    area = area.replace('/', ',')
            elif re.search(r'<li>类型'.decode('utf8'), li_p.decode('utf8')):
                ctype_p = li.get_text()
                ctype_p = re.findall(r'类型:(.*)'.decode('utf8'), ctype_p)
                if ctype_p:
                    ctype = ctype_p[0]
                    ctype = ctype.replace('/', ',')
            elif re.search(r'<li>导演'.decode('utf8'), li_p.decode('utf8')):
                director_p = li.get_text()
                director_p = re.findall(r'导演:(.*)'.decode('utf8'),
                                        director_p)
                if director_p:
                    director = director_p[0]
            elif li.get('class') == ['p-performer']:
                star_list = []
                for each in li.find_all('a'):
                    star_list.append(each.get_text())
                star = ','.join(star_list)
            elif re.search(r'<li>总播放数'.decode('utf8'), li_p.decode('utf8')):
                playTimesStr = li.get_text()
                playTimesStr = re.findall(r'总播放数:(.*)'.decode('utf8'),
                                          playTimesStr)
                if playTimesStr:
                    playTimesStr = playTimesStr[0]
                    # Concatenate every digit group into one number.
                    # NOTE(review): long('') raises ValueError when the
                    # text contains no digits.
                    playTimes_list = re.findall(r'(\d+)', playTimesStr)
                    playTimes = long(''.join(playTimes_list))
            elif li.get('class') == ['p-row', 'p-intro']:
                intro = li.find('span').get_text().strip()
            else:
                continue
    if shootYear == "":
        shootYear = youkushootYear
    # mainId is the part after "id_" in a list.youku.com show URL.
    if re.match(r'http://list\.youku\.com/show/(id_(.+))\.html', seed):
        mainId = re.match(r'http://list\.youku\.com/show/(id_(.+))\.html',
                          seed).group(2)
    self.program["name"] = spiderTool.changeName(name)
    self.program["alias"] = spiderTool.changeName(alias)
    # NOTE(review): float() raises ValueError if the rating text is not
    # numeric.
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = long(playTimes)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    showid = ""
    showid_url = ""
    # Find the show id inside the script that defines PageConfig.
    p_list_p = soup.find_all('script', attrs={'type': 'text/javascript'})
    if p_list_p is not None:
        for each in p_list_p:
            if re.search(r'PageConfig', str(each)):
                showid_p = re.findall(r'showid:"(.*)", videoId', str(each))
                if showid_p:
                    showid = showid_p[0]
                    showid_url = "http://list.youku.com/show/module?id=%s&tab=showInfo&callback=jQuery111208239648697137207" % showid
    if showid_url != "":
        sub_doc = spiderTool.getHtmlBody(showid_url)
        # Unwrap the JSONP call: keep what is between "(" and ");".
        if re.search(r'window\.jQuery', sub_doc):
            sub_doc_p = re.search(r'\((.*)\);', sub_doc)
            # NOTE(review): sub_doc_p is None when the wrapper has no
            # "(...);" on a single line.
            sub_doc = sub_doc_p.group(1)
        try:
            data = json.loads(sub_doc)
        except:
            #print("load json error1111!")
            return
        if data.get('html') is None:
            #print("get html error1111")
            return
        # NOTE(review): sub_soup is never read afterwards.
        sub_soup = BeautifulSoup(data['html'])
        # Distinct stage numbers, i.e. the digits of every id="reload_<n>".
        reload_list_p = re.findall(r'id="reload_(\d+)"', data['html'])
        reload_list = list(set(reload_list_p))
        if reload_list:

            # Comparison function: orders the digit strings by their
            # integer value (Python 2 list.sort(cmp) form).
            def numeric_compare(x, y):
                x = int(x)
                y = int(y)
                if x > y:
                    return 1
                elif x == y:
                    return 0
                else:  # x<y
                    return -1

            # Ascending numeric sort, then reversed: highest stage first.
            reload_list.sort(numeric_compare)
            reload_list.reverse()
            #print(reload_list)
            for reload in reload_list:
                # The episode URL is built from mainId (taken from the
                # seed URL above), not from showid.
                sub_seed = "http://list.youku.com/show/episode?id=%s&stage=reload_%s&callback=jQuery111208239648697137207" % (
                    mainId, reload)
                self.secondSpider(sub_seed)
def firstSpider(self, seed):
    """Scrape programme metadata from a page and start the episode crawl.

    Downloads ``seed`` with ``spiderTool.getHtmlBody``, parses it with
    BeautifulSoup and stores name, poster, star, director, ctype,
    programLanguage, area and intro in ``self.program``. If the raw page
    text contains a ``sourceId: <digits>, cid: <digits>`` pair, ``mainId``
    is set to the source id and ``self.secondSpider`` is called with the
    ``cache.video.iqiyi.com`` listing URL built from both ids; otherwise
    the method returns after storing the metadata.

    :param seed: URL of the programme page to scrape.
    :return: None. Results are written to ``self.program``.
    """
    name = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    area = ""  # not extracted by this method; stored as an empty value

    # Labels that prefix the values on the page.
    host_label = "主持人:".decode("utf8")
    type_label = "类型:".decode("utf8")
    language_label = "语言:".decode("utf8")
    intro_label = "简介:".decode("utf8")

    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")

    # The poster image is located by its width attribute; its alt text is
    # used as the programme name.
    poster_p = soup.find("img", attrs={'width': '195'})
    if poster_p is not None:
        # A missing attribute yields None; keep the empty-string defaults.
        poster = poster_p.get("src") or ""
        name = poster_p.get("alt") or ""

    starts_P = soup.find("dd", attrs={"class": "actor"})
    if starts_P is not None:
        star = ",".join(a.get_text() for a in starts_P.find_all('a'))

    detail = soup.find("div", attrs={"class": "topic_item topic_item-rt"})
    if detail is not None:
        for div_p in detail.find_all("div", attrs={"class": "item"}):
            for div in div_p:
                # Children can be bare text nodes, which do not offer
                # get_text() in every bs4 release; skip those.
                if not hasattr(div, "get_text"):
                    continue
                # Inspect the child being iterated. The previous code read
                # a stale variable left over from the actor loop, so these
                # fields were never filled (or a NameError was raised when
                # the page had no actor list).
                text = div.get_text()
                if host_label in text:
                    director = text.split(host_label)[1]
                if type_label in text:
                    ctype = text.split(type_label)[1]
                if language_label in text:
                    programLanguage = text.split(language_label)[1]

    content_p = soup.find('span', attrs={
        "class": "showMoreText",
        "data-moreorless": "moreinfo",
        "style": "display: none;"
    })
    if content_p is not None:
        # Prefer the nested span when there is one.
        inner_span = content_p.find("span")
        if inner_span:
            content_p = inner_span
        content_text = content_p.get_text()
        if intro_label in content_text:
            intro = content_text.split(intro_label)[1].strip()

    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro

    # The ids needed for the episode listing are read from the raw page
    # text, not from the parsed tree.
    seed_P = re.search(
        r'sourceId:\s*(?P<sourceId>\d+),\s*cid:\s*(?P<cid>\d+)', doc)
    if not seed_P:
        # Without both ids there is no listing URL to crawl.
        return

    sourceId = seed_P.group('sourceId')
    cid = seed_P.group('cid')
    self.program['mainId'] = sourceId
    seed_sub = 'http://cache.video.iqiyi.com/jp/sdvlst/%s/%s/' % (
        cid, sourceId)
    self.secondSpider(seed_sub)