def firstSpider(self, seed):
    name = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    shootYear = ""
    doc = spiderTool.getHtmlBody(seed)
    tvId = re.search(r'data-player-tvid="([^"]+?)"', doc)
    videoId = re.search(r'data-player-videoid="([^"]+?)"', doc)
    if tvId and videoId:
        newUrl = 'http://cache.video.qiyi.com/vi/%s/%s/' % (tvId.group(1),
                                                            videoId.group(1))
        doc = spiderTool.getHtmlBody(newUrl)
    else:
        return
    try:
        json_data = json.loads(doc)
        name = json_data["shortTitle"]
        poster = json_data["apic"]
        star = json_data["ma"].replace("|", ",")
        director = json_data["d"].replace("|", ",")
        ctype = json_data["tg"].replace(" ", ",")
        programLanguage = json_data["ar"]
        intro = json_data["info"]
    except:
        return
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['intro'] = intro
    if re.match(r'http://www\.iqiyi\.com/v_(\w+)\.html', seed):
        mainId = re.match(r'http://www\.iqiyi\.com/v_(\w+)\.html',
                          seed).group(1)
    elif re.match(r'http://www\.iqiyi\.com/dianying/(\d{8})/', seed):
        mainId = re.match(r'http://www\.iqiyi\.com/dianying/(\d{8})/',
                          seed).group(1)
    self.program["mainId"] = mainId
    self.program_sub = copy.deepcopy(PROGRAM_SUB)
    self.program_sub['setNumber'] = 1
    self.program_sub['setName'] = name
    self.program_sub['webUrl'] = seed
    self.program['programSub'].append(self.program_sub)
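# The spiders in this section all lean on a shared spiderTool module
# (getHtmlBody, changeName, listStringToJson) that is not shown here. The
# block below is only a minimal, hypothetical sketch of the behaviour those
# calls appear to assume; names and semantics are inferred from usage in the
# spiders, not taken from the real implementation.
import json
import urllib2


def getHtmlBody(url, timeout=10):
    # Fetch a URL and return the response body as a str ("" on failure).
    try:
        return urllib2.urlopen(url, timeout=timeout).read()
    except Exception:
        return ""


def changeName(name):
    # Normalise a programme title, e.g. strip surrounding whitespace.
    return name.strip() if name else ""


def listStringToJson(key, value):
    # Turn a comma-separated string such as "a,b" into a JSON list like
    # [{"name": "a"}, {"name": "b"}]; empty parts are dropped.
    items = [{key: part} for part in value.split(",") if part]
    return json.dumps(items, ensure_ascii=False)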
def firstSpider(self, seed):
    name = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    poster_p = soup.find("img", attrs={'width': '195'})
    poster_p_1 = soup.find(
        'div', attrs={'class': 'album-picCon album-picCon-onePic'})
    poster_p_2 = soup.find('ul', attrs={'class': 'album-imgs'})
    if poster_p is not None:
        poster = poster_p.get("src")
        name = poster_p.get("alt")
    elif poster_p_1 is not None:
        img_tag = poster_p_1.find("img")
        if img_tag is not None:
            poster = img_tag.get("src")
            name = img_tag.get("alt")
    elif poster_p_2 is not None:
        poster = poster_p_2.find('img').get('src')
    if name == "":
        name_p = soup.find('div', attrs={'class': 'album-playArea clearfix'})
        if name_p is not None:
            name = name_p.find('h1').get_text()
    detail = soup.find("div", attrs={"class": "result_detail-minH"})
    detail_1 = soup.find("div", attrs={"class": "msg-hd-lt fl"})
    if detail is not None:
        for div_p in detail.find_all(
                "div", attrs={"class": "topic_item clearfix"}):
            for each in div_p.find_all("div"):
                a_list = []
                for a_tag in each.find_all('a'):
                    a_list.append(a_tag.get_text())
                a_str = ",".join(a_list)
                if re.search("主持人:".decode("utf8"), each.get_text()):
                    star = a_str
                if re.search("类型:".decode("utf8"), each.get_text()):
                    ctype = a_str
                if re.search("语言:".decode("utf8"), each.get_text()):
                    programLanguage = a_str
                if re.search("地区:".decode("utf8"), each.get_text()):
                    area = a_str
    elif detail_1:
        for each in detail_1.find_all("p"):
            a_list = []
            for a_tag in each.find_all('a'):
                a_list.append(a_tag.get_text())
            a_str = ",".join(a_list)
            if re.search("主持人:".decode("utf8"), each.get_text()):
                star = a_str
            if re.search("类型:".decode("utf8"), each.get_text()):
                ctype = a_str
            if re.search("语言:".decode("utf8"), each.get_text()):
                programLanguage = a_str
            if re.search("地区:".decode("utf8"), each.get_text()):
                area = a_str
    content_p = soup.find('span', attrs={
        "class": "showMoreText",
        "data-moreorless": "moreinfo",
        "style": "display: none;"
    })
    content_p_1 = soup.find("span", attrs={"class": "bigPic-b-jtxt"})
    if content_p is not None:
        if content_p.find("span"):
            content_p = content_p.find("span")
        if re.search("简介:".decode("utf8"), content_p.get_text()):
            intro = content_p.get_text().split(
                "简介:".decode("utf8"))[1].strip()
    elif content_p_1 is not None:
        intro = content_p_1.get_text()
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    seed_P = re.search(
        r'sourceId:\s*(?P<sourceId>\d+),\s+cid:\s*(?P<cid>\d+)', doc)
    if seed_P:
        sourceId = seed_P.group('sourceId')
        cid = seed_P.group('cid')
        self.program['mainId'] = sourceId
        seed_sub = 'http://cache.video.iqiyi.com/jp/sdvlst/%s/%s/' % (
            cid, sourceId)
        self.secondSpider(seed_sub)
def firstSpider(self, seed):
    name = ""
    pcUrl = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    point = ""
    mainId = ""
    area = ""
    if re.search(r'http://v.pptv.com/show/\w+\.html', seed):
        seed_post = seed.replace("/show/", "/page/")
    else:
        seed_post = seed
    doc = spiderTool.getHtmlBody(seed_post)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    poster_P = soup.find("a", attrs={"class": "cover-a"})
    poster_P1 = soup.find("div", attrs={"class": "bd cf"})
    if poster_P is not None:
        img_tag = poster_P.find("img")
        if img_tag is not None:
            img = img_tag.get('data-src2')
            if img is not None:
                poster = img
    elif poster_P1 is not None:
        img_tag = poster_P1.find("img")
        if img_tag is not None:
            img = img_tag.get('src')
            if img is not None:
                poster = img
    if re.search(r'http://v.pptv.com/page/\w+\.html', seed):
        seed = seed.replace("/page/", "/show/")
    doc = spiderTool.getHtmlBody(seed)
    pid_P = re.search(r'"pid":\s*(?P<pid>\d+),'.decode('utf8'), doc)
    if pid_P:
        pid = pid_P.group('pid')
        seed_sub = 'http://v.pptv.com/show/videoList?&cb=videoList&pid=' + pid
        mainId = pid
    else:
        return
    id_P = re.search(r'"id":\s*(?P<id>\d+),'.decode('utf8'), doc)
    if id_P:
        id = id_P.group('id')
        seed = 'http://svcdn.pptv.com/show/v1/meta.json?cid=%s&sid=%s&psize=50' % (
            id, pid)
    else:
        return
    doc = spiderTool.getHtmlBody(seed)
    json_data = json.loads(doc)
    if json_data.has_key("data"):
        data = json_data["data"]
    else:
        return
    if data.has_key("set"):
        name_P = data["set"]
        if type(name_P) is types.ListType:
            if len(name_P) == 2:
                name = name_P[0]
                pcUrl = name_P[1]

    def getListName(dict_P):
        dict_list = []
        if type(dict_P) is types.ListType:
            for each in dict_P:
                if type(each) is not types.DictionaryType:
                    continue
                name = ""
                if each.has_key("name"):
                    name = each["name"]
                elif each.has_key("text"):
                    name = each["text"]
                if re.search(r'未知'.decode('utf8'), name) or name == '':
                    continue
                dict_list.append(name)
        return ",".join(dict_list)

    if data.has_key("directors"):
        star = getListName(data["directors"])
    if data.has_key("ct"):
        ctype = getListName(data["ct"])
    if data.has_key("area"):
        area = getListName(data["area"])
    if data.has_key("score"):
        point = data["score"]
        try:
            point = float(point)
        except:
            point = 0.0
    if data.has_key("summary"):
        intro = data["summary"]
    self.program["name"] = spiderTool.changeName(name)
    self.program['pcUrl'] = pcUrl
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['point'] = point
    self.program['mainId'] = mainId
    self.secondSpider(seed_sub)
def firstSpider(self, seed):
    name = ""
    poster = ""
    point = 0.0
    shootYear = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    pid = ""
    if re.search(r'http://www\.le\.com/jilu/(?P<pid>\d+)\.html', seed):
        pid = re.search(r'http://www\.le\.com/jilu/(?P<pid>\d+)\.html',
                        seed).group('pid')
    elif re.search(r'http://www\.letv\.com/jilu/(?P<pid>\d+)\.html', seed):
        pid = re.search(r'http://www\.letv\.com/jilu/(?P<pid>\d+)\.html',
                        seed).group('pid')
    # proceed only with a usable programme id
    if pid != '' and pid != '0':
        seed = 'http://static.app.m.letv.com/android/mod/mob/ctl/album/act/detail/id/%s/pcode/010110014/version/5.2.3.mindex.html' % (
            pid)
    else:
        return
    mainId = pid
    doc = spiderTool.getHtmlBody(seed)
    try:
        json_data = json.loads(doc)
    except:
        json_data = {}
    if json_data.has_key("body"):
        if type(json_data["body"]) is types.DictionaryType:
            json_data = json_data["body"]
        else:
            return
    if json_data.has_key("nameCn"):
        name = json_data["nameCn"]
    if json_data.has_key("picCollections"):
        poster_dict = json_data["picCollections"]
        if type(poster_dict) is types.DictionaryType:
            if poster_dict.has_key('400*300'):
                poster = poster_dict['400*300']
            if poster == "":
                for each in poster_dict:
                    if poster_dict[each] != "":
                        poster = poster_dict[each]
    if self.images.has_key(pid):
        poster = self.images[pid]
    if json_data.has_key("score"):
        try:
            point = float(json_data["score"])
        except:
            point = 0.0
    if json_data.has_key("releaseDate"):
        shootYear_P = json_data["releaseDate"]
        if re.search(r'(\d{4})-\d{2}-\d{2}', shootYear_P):
            shootYear = re.search(r'(\d{4})-\d{2}-\d{2}',
                                  shootYear_P).group(1)
        elif re.search(r'^\d{4}$', shootYear_P):
            shootYear = shootYear_P
    if json_data.has_key("directory"):
        director_P = json_data["directory"]
        if type(director_P) is types.UnicodeType:
            director = director_P.strip().replace(" ", ",")
    if json_data.has_key("starring"):
        star_P = json_data["starring"]
        if type(star_P) is types.UnicodeType:
            star = star_P.strip().replace(" ", ",")
    if json_data.has_key("area"):
        area_P = json_data["area"]
        if type(area_P) is types.UnicodeType:
            area = area_P.strip().replace(" ", ",")
    if json_data.has_key("subCategory"):
        ctype_P = json_data["subCategory"]
        if type(ctype_P) is types.UnicodeType:
            ctype = ctype_P.strip().replace(" ", ",")
    if json_data.has_key('description'):
        intro = json_data['description']
    if json_data.has_key('language'):
        programLanguage = json_data['language']
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['point'] = point
    self.program['shootYear'] = shootYear
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    seed_sub = "http://static.app.m.letv.com/android/mod/mob/ctl/videolist/act/detail/id/%s/vid/25520328/b/1/s/60/o/-1/m/0/pcode/010110014/version/5.2.3.mindex.html" % (
        pid)
    self.secondSpider(seed_sub)
def firstSpider(self, seed):
    name = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    poster_p = soup.find("ul", attrs={'class': 'focus_img_list'})
    poster_p_1 = soup.find("div", attrs={"class": "result_pic pr"})
    if poster_p is not None:
        poster_p_sub = poster_p.find('li').get('style')
        if re.search(r'\((.*?)\)'.decode('utf8'), poster_p_sub):
            poster = re.search(r'\((.*?)\)'.decode('utf8'),
                               poster_p_sub).group(1)
    elif poster_p_1 is not None:
        img_tag = poster_p_1.find("img")
        if img_tag is not None:
            poster = img_tag.get("src")
            name = img_tag.get("alt")
    name_p = soup.find('a', attrs={'class': 'white'})
    if name_p is not None:
        name = name_p.get_text()
    detail = soup.find("div", attrs={"class": "result_detail-minH"})
    detail_1 = soup.find("div", attrs={"class": "msg-hd-lt fl"})
    if detail is not None:
        for div_p in detail.find_all(
                "div", attrs={"class": "topic_item clearfix"}):
            for each in div_p.find_all("div"):
                a_list = []
                for a_tag in each.find_all('a'):
                    a_list.append(a_tag.get_text())
                a_str = ",".join(a_list)
                if re.search("主演:".decode("utf8"), each.get_text()):
                    star = a_str
                if re.search("导演:".decode("utf8"), each.get_text()):
                    director = a_str
                if re.search("类型:".decode("utf8"), each.get_text()):
                    ctype = a_str
                if re.search("语言:".decode("utf8"), each.get_text()):
                    programLanguage = a_str
                if re.search("地区:".decode("utf8"), each.get_text()):
                    area = a_str
    elif detail_1 is not None:
        for p_tag in detail_1.find_all("p"):
            a_list = []
            for a_tag in p_tag.find_all('a'):
                a_list.append(a_tag.get_text())
            a_str = ','.join(a_list)
            if re.search("导演:".decode("utf8"), p_tag.get_text()):
                director = a_str
            if re.search("类型:".decode("utf8"), p_tag.get_text()):
                ctype = a_str
            if re.search("语言:".decode("utf8"), p_tag.get_text()):
                programLanguage = a_str
            if re.search("地区:".decode("utf8"), p_tag.get_text()):
                area = a_str
            if re.search("主演:".decode("utf8"), p_tag.get_text()):
                star = a_str
    content_p = soup.find('span', attrs={
        "class": "showMoreText",
        "data-moreorless": "moreinfo",
        "style": "display: none;"
    })
    content_p_1 = soup.find('div', attrs={"data-moreorless": "moreinfo"})
    if content_p is not None:
        if content_p.find("span"):
            content_p = content_p.find("span")
        if re.search("简介:".decode("utf8"), content_p.get_text()):
            intro = content_p.get_text().split(
                "简介:".decode("utf8"))[1].strip()
    elif content_p_1 is not None:
        if re.search("简介:".decode("utf8"), content_p_1.get_text()):
            intro = content_p_1.get_text().split(
                "简介:".decode("utf8"))[1].strip()
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    seed_P = re.search(
        r'albumId:\s*(?P<albumId>\d+)[^\\]*?cid:\s*(?P<cid>\d+)'.decode(
            'utf8'), doc)
    if seed_P:
        albumId = seed_P.group('albumId')
        cid = seed_P.group('cid')
        self.program['mainId'] = cid + "_" + albumId
        seed_sub = 'http://cache.video.iqiyi.com/jp/avlist/%s/' % (albumId)
        allNum = 0
        doc = spiderTool.getHtmlBody(seed_sub)
        try:
            json_data = json.loads(doc.split('=')[1])
            data = json_data["data"]["vlist"]
            allNum = json_data["data"]["allNum"]
        except:
            data = []
        for i in range(1, (int(allNum) / 50 + 2)):
            seed_sub = 'http://cache.video.iqiyi.com/jp/avlist/%s/%s/50/' % (
                albumId, str(i))
            self.secondSpider(seed_sub)
def firstSpider(self, seed):
    name = ""
    poster = ""
    point = 0.0
    shootYear = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    pid = ""
    pid_P = re.search(
        r'http://www.mgtv.com/\w/\d+/(?P<mainId>\d+)/\w/(?P<pid>\d+)\.html',
        seed)
    if pid_P:
        pid = pid_P.group('pid')
        mainId = pid_P.group('mainId')
    seedJson = ""
    if pid != '':
        seedJson = 'http://m.api.hunantv.com/video/getbyid?videoId=%s' % (pid)
    else:
        return
    if mainId == "":
        mainId = pid
    doc = spiderTool.getHtmlBody(seedJson)
    if re.search(r'<html>', doc):
        doc = spiderTool.getHtmlBody(seedJson)
    json_data = json.loads(doc.strip())
    if json_data.has_key("data"):
        if type(json_data["data"]) is types.DictionaryType:
            json_data = json_data["data"]
            if json_data.has_key("detail"):
                json_data = json_data["detail"]
            else:
                return
        else:
            return
    if json_data.has_key("collectionName"):
        name = json_data["collectionName"]
    if json_data.has_key("image"):
        poster = json_data["image"]
    if json_data.has_key("year"):
        shootYear = json_data["year"]
    if json_data.has_key("director"):
        director_P = json_data["director"]
        if type(director_P) is types.UnicodeType:
            director = director_P.strip().replace(" / ", ",")
    if json_data.has_key("player"):
        star_P = json_data["player"]
        if type(star_P) is types.UnicodeType:
            star = star_P.strip().replace(" / ", ",")
    if json_data.has_key("area"):
        area_P = json_data["area"]
        if type(area_P) is types.UnicodeType:
            area = area_P.strip().replace(" ", ",")
    if json_data.has_key('desc'):
        intro = json_data['desc']
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['point'] = point
    self.program['shootYear'] = shootYear
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    if json_data.has_key('typeId'):
        typeId = json_data['typeId']
    if json_data.has_key('collectionId'):
        collectionId = json_data['collectionId']
    pageNum = 20
    if json_data.has_key("totalvideocount"):
        pageNum = json_data["totalvideocount"]
    seed_sub = "http://m.api.hunantv.com/video/getListV2?videoId=%s&pageId=0&pageNum=%s" % (
        pid, pageNum)
    try:
        self.secondSpider(seed_sub, seed, pid)
    except:
        print "spider error: " + seed
def firstSpider(self, seed):
    point = 0.0
    poster = ""
    name = ""
    shootYear = ""
    alias = ""
    area = ""
    star = ""
    director = ""
    ctype = ""
    playTimes = 0
    intro = ""
    mainId = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    seed_Re = re.search(r'http://www\.youku\.com/show_page/id', seed)
    if not seed_Re:
        seed_P = soup.find('h1', attrs={'class': 'title'})
        if seed_P is not None:
            seed_aTag = seed_P.find('a')
            if seed_aTag is not None:
                seed = seed_aTag.get('href')
                doc = spiderTool.getHtmlBody(seed)
                soup = BeautifulSoup(doc, from_encoding="utf8")
    point_P = soup.find('span', attrs={'class': 'rating'})
    if point_P is not None:
        if point_P.find('em', attrs={'class': 'num'}) is not None:
            point = point_P.find('em', attrs={'class': 'num'}).get_text()
    poster_p = soup.find("li", attrs={'class': 'thumb'})
    if poster_p is not None:
        poster = poster_p.find('img').get("src")
    name_P = soup.find('h1', attrs={'class': 'title'})
    if name_P is not None:
        name_span = name_P.find('span', attrs={'class': 'name'})
        if name_span is not None:
            name = name_span.get_text()
            if re.search(r'\d{4}', name):
                shootYear = re.search(r'\d{4}', name).group()
    area_P = soup.find('span', attrs={'class': 'area'})
    if area_P is not None:
        area_list = []
        for each in area_P.find_all("a"):
            area_list.append(each.get_text())
        area = ",".join(area_list)
    ctype_all = soup.find_all("span", attrs={"class": "type"})
    for ctype_P in ctype_all:
        if re.search(r'类型:'.decode('utf8'), ctype_P.get_text()):
            ctype_list = []
            for each in ctype_P.find_all('a'):
                ctype_list.append(each.get_text())
            ctype = ",".join(ctype_list)
    star_P = soup.find("span", attrs={"class": "actor"})
    if star_P is not None:
        star_list = []
        for each in star_P.find_all('a'):
            star_list.append(each.get_text())
        star = ",".join(star_list)
    director_P = soup.find("span", attrs={"class": "director"})
    if director_P is not None:
        director_list = []
        for each in director_P.find_all('a'):
            director_list.append(each.get_text())
        director = ",".join(director_list)
    playTimes_P = soup.find('span', attrs={'class': 'play'})
    if playTimes_P is not None:
        playTimesStr = playTimes_P.get_text()
        playTimes_list = re.findall(r'(\d+)', playTimesStr)
        playTimes = long(''.join(playTimes_list))
    content_p = soup.find('span', attrs={
        "class": "short",
        'id': 'show_info_short'
    })
    if content_p is not None:
        intro = content_p.get_text().strip()
    if re.match(r'http://www\.youku\.com/show_page/(id_(\w+))\.html', seed):
        mainId = re.match(r'http://www\.youku\.com/show_page/id_(\w+)\.html',
                          seed).group(1)
    self.program["name"] = spiderTool.changeName(name)
    self.program["alias"] = spiderTool.changeName(alias)
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = long(playTimes)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    urlList = re.findall(
        r'<li\s+data="(reload_\d+)"\s*(class="current"){0,1}\s*>', doc)
    if urlList:
        for url in urlList:
            seed_sub = 'http://www.youku.com/show_episode/id_%s.html?dt=json&divid=%s' % (
                mainId, url[0])
            self.secondSpider(seed_sub)
    else:
        seed_sub = 'http://www.youku.com/show_episode_id_%s.html?dt=json' % mainId
        self.secondSpider(seed_sub)
def firstSpider(self, seed):
    poster = ""
    star = ""
    director = ""
    ctype = ""
    shootYear = ""
    intro = ""
    mainId = ""
    area = ""
    name = ""
    programLanguage = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    seed = ""
    seed_p = soup.find('a', attrs={'class': 'album_title'})
    if seed_p is not None:
        if seed_p.get("href") is not None:
            seed = seed_p.get("href")
    if seed == "":
        return
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    poster_p = soup.find("img", attrs={'itemprop': 'image'})
    if poster_p is not None:
        if poster_p.get("src") is not None:
            poster = poster_p.get("src")
            if not re.search(r'http', poster):
                poster = "http:%s" % poster
        if poster_p.get("alt") is not None:
            name = poster_p.get("alt")
    video_type = soup.find("div", attrs={"class": "video_type cf"})
    if video_type is not None:
        divs = video_type.find_all('div', attrs={"class": "type_item"})
        for div in divs:
            text = div.get_text()
            if re.search(r'地 区:'.decode("utf8"), text):
                area_p = div.find('span', attrs={"class": "type_txt"})
                if area_p is not None:
                    area = area_p.get_text()
            if re.search(r'语 言:'.decode("utf8"), text):
                programLanguage_p = div.find('span',
                                             attrs={"class": "type_txt"})
                if programLanguage_p is not None:
                    programLanguage = programLanguage_p.get_text()
            if re.search(r'上映时间:'.decode("utf8"), text):
                shootYear_p = div.find('span', attrs={"class": "type_txt"})
                if shootYear_p is not None:
                    shootYear = shootYear_p.get_text().split("-")[0]
    ctype_P = soup.find("div", attrs={"class": "tag_list"})
    if ctype_P is not None:
        ctype_list = []
        for each in ctype_P.find_all('a'):
            ctype_list.append(each.get_text())
        ctype = ",".join(ctype_list)
    person_p = soup.find("ul", attrs={"class": "actor_list cf"})
    if person_p is not None:
        lis = person_p.find_all('li')
        director_list = []
        star_list = []
        for li in lis:
            person = ""
            star_p = li.find("span", attrs={'class': 'name'})
            if star_p is not None:
                person = star_p.get_text()
            if person != "" and li.find("span", attrs={"class": "director"}):
                director_list.append(person)
            elif person != "":
                star_list.append(person)
        star = ",".join(star_list)
        director = ",".join(director_list)
    content_p = soup.find('span', attrs={"class": "txt _desc_txt_lineHight"})
    if content_p is not None:
        intro = content_p.get_text().strip()
    if re.match(r'http://film\.qq\.com/(page|cover)/.*/(.*?).html', seed):
        mainId = re.match(r'http://film\.qq\.com/(page|cover)/.*/(.*?).html',
                          seed).group(2)
    if re.match(r'http://v\.qq\.com/detail/[\w\d]/(.*?).html', seed):
        mainId = re.match(r'http://v\.qq\.com/detail/[\w\d]/(.*?).html',
                          seed).group(1)
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['programLanguage'] = programLanguage
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    if mainId != "":
        seed_sub = 'http://s.video.qq.com/loadplaylist?vkey=898_qqvideo_clpl_%s_qq&vtype=' % mainId
        self.secondSpider(seed_sub)