def task_types_fetch():
    retry = 5
    i = 0
    while True:
        type_url = rd.spop(config.yk_types_task)
        if type_url is None:
            print(u"yk_types_task sleeping 20sec....")
            return True
        if rd.sismember(config.yk_types_failed, type_url) or rd.sismember(
                config.yk_types_done, type_url):
            continue
        r = requests_get(url=type_url, headers=youku_home_headers, session=session)
        if r is False or r is None:
            print(u'failed task:%s' % type_url)
            rd.sadd(config.yk_types_failed, type_url)
            continue
        pages = parse_category_show(r, type_url)
        print("task_types_fetch data:", pages)
        for page in xrange(1, int(pages['pages'])):
            page_url = re.sub(u'(\.html.*)',
                              '_s_1_d_1_p_{page}.html'.format(page=page),
                              type_url)
            print("task_types_fetch for:", page_url)
            if not rd.sismember(config.yk_page_failed, page_url) and not rd.sismember(
                    config.yk_page_done, page_url):
                rd.sadd(config.yk_page_task, page_url)
        rd.sadd(config.yk_types_done, type_url)
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
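# ---------------------------------------------------------------------------
# requests_get and update_session are helpers imported from elsewhere in the
# project. A minimal sketch of the contract the workers here rely on -- the
# bodies below are assumptions for illustration, not the original code:
#
# def requests_get(url, headers=None, timeout=10, session=None, **kwargs):
#     """Return the response text, or False on any failure (callers test
#     both `is False` and `is None`)."""
#     try:
#         s = session if session is not None else requests.Session()
#         resp = s.get(url, headers=headers, timeout=timeout)
#         resp.raise_for_status()
#         return resp.text
#     except requests.exceptions.RequestException:
#         return False
#
# def update_session(proxy=None):
#     """Rebuild the shared session (fresh cookies, optionally a new proxy)
#     so the spider looks like a new client to anti-bot checks."""
# ---------------------------------------------------------------------------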
def task_video():
    """Pop a douban tv id, fetch and parse its detail page, store to mongo."""
    retry = 5
    i = 0
    while True:
        id = rd.spop(config.douban_tv_task)
        # id = rd.spop(config.douban_tv_failed)  # retry mode
        if id is None:
            print(u"task_page sleeping....20sec")
            return True
        if rd.sismember(config.doubantv_ajax_task_done, id):
            print(u"already done %s" % id)
            continue
        url = tv_url.format(id=id)
        r = requests_get(url=url, headers=douban_home_headers)
        if r is False or r is None:
            rd.sadd(config.douban_tv_failed, id)
            continue
        try:
            cb = check_block(r)
        except Exception as e:
            print("check_block:", str(e))
        if u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider has been blocked... break......")
            delay(block_wait)
            continue
        data = parse_video(r)
        piw = piwik(page_title=page_title(r), session_time=session_time,
                    origin_url=url, urlref='')
        print("piw", piw)
        if data.get("title") is None:
            rd.sadd(config.douban_tv_failed, id)
            time.sleep(task_wait)
            # update_session()
            print("------spider has been blocked...")
            continue
        data['doubanid'] = id
        print(json.dumps(data))
        mongo_r = mongo_douban_tvs.insert(data, check_keys=False)
        photostask = json.dumps({"id": id, "mongoTVID": str(mongo_r)})
        if not rd.sismember(config.douban_star_done, photostask) and not rd.sismember(
                config.douban_photos_failed, photostask):
            rd.sadd(config.douban_photos_task, photostask)
            print(photostask)
        # return True
        rd.sadd(config.douban_tv_done, id)
        # tv_after(id=id, url=url)
        print("done.. sleep %s seconds." % task_wait)
        delay()
        i += 1
        if i % max_step == 0:  # rotate the bid cookie every max_step iterations
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
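# ---------------------------------------------------------------------------
# The douban workers pace themselves with delay() and rotate the `bid`
# cookie via random_str() every max_step iterations. Hypothetical sketches
# of those helpers (the real ones live elsewhere in the project):
#
# import random
# import string
#
# def random_str(n):
#     """n random letters/digits, used as a throwaway douban `bid` cookie."""
#     return ''.join(random.choice(string.ascii_letters + string.digits)
#                    for _ in xrange(n))
#
# def delay(seconds=None):
#     """Sleep task_wait seconds by default, or longer (e.g. block_wait)
#     after the spider has been blocked."""
#     time.sleep(seconds if seconds is not None else task_wait)
# ---------------------------------------------------------------------------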
def get_category():
    """Fetch the youku category list and seed the task queues."""
    start = 1
    retry = 5
    print('get_category')
    while retry > 0:
        try:
            r = requests_get(url=category_url, headers=youku_home_headers,
                             timeout=timeout, session=session)
            page = etree.HTML(r)
            lis = page.xpath(
                u'//label[contains(text(),"分类:")]/following-sibling::ul/li')
            o = urlparse(category_url)
            host = o.scheme + '://' + o.netloc
            categories = []
            for x in xrange(1, len(lis)):  # index 0 is the "all" entry
                categories.append({
                    "name": lis[x].find('a').text,
                    'url': host + lis[x].find('a').get('href')
                })
            print("categories:", json.dumps(categories))
            if len(categories) == 0:
                update_session(proxy)
                continue
            for x in categories:
                if not rd.sismember(config.yk_category_task_done, x['url']) and not rd.sismember(
                        config.yk_category_task_failed, x['url']):
                    task_sadd = rd.sadd(config.yk_category_task, json.dumps(x))  # seed
                    re_sadd = rd.sadd(config.yk_category_url, json.dumps(x))  # seed
                    if re_sadd != 0:  # only save unseen categories
                        youku_category.insert(x, check_keys=False)  # save categories
            return True
        except requests.exceptions.ProxyError as e:
            print("ttt", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("xxx", str(e))
            update_session(proxy)
        retry -= 1
        start += 1
        if start % 20 == 0:  # refresh the session every 20 iterations
            update_session()
def go_detail_list_task():
    retry = 5
    i = 0
    while True:
        q = rd.spop(config.yk_video_detail_task)
        if q is None:
            print(u"yk_video_detail_task sleeping 20 sec....")
            return True
        detail_url = json.loads(q)
        if rd.sismember(config.yk_video_detail_done, detail_url['url']):
            print("pass", detail_url['url'])
            continue
        r = requests_get(detail_url['url'], headers=youku_home_headers)
        d = parse_detail_list_page(r, detail_url['url'])
        data = d['data']
        if data is False or data is None:
            rd.sadd(config.yk_video_detail_failed, q)
            continue
        for x in d['stars']:
            rd.sadd(config.yk_star_task, x)  # star fetch queue; the redis set dedupes
        print('detail_url done:', detail_url['url'], data)
        done = rd.sadd(config.yk_video_detail_done, detail_url['url'])  # finished
        youku_videos.insert(data, check_keys=False)  # save tv data
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
def get_detailurl_task():
    """Resolve each yk_get_detailurl_task entry to its detail_list page url."""
    retry = 5
    i = 0
    while True:
        q = rd.spop(config.yk_get_detailurl_task)
        if q is None:
            print(u"yk_get_detailurl_task sleeping 20 sec")
            return True
        to_detail_url = json.loads(q)
        headers = dict(youku_home_headers)  # copy so the shared headers aren't mutated
        headers['Referer'] = to_detail_url['Referer']
        if rd.sismember(config.yk_get_detailurl_done, q):
            print("pass")
            continue
        r = requests_get(to_detail_url['url'], headers=headers)
        print("to_detail_url", to_detail_url['url'])
        detail_url = parse_tv_show(r, to_detail_url['url'])
        print("detail_url:", detail_url)
        if detail_url is False or detail_url is None:
            rd.sadd(config.yk_get_detailurl_field, q)
            continue
        if not rd.sismember(config.yk_video_detail_done, detail_url):
            red = rd.sadd(config.yk_video_detail_task,
                          json.dumps({"url": detail_url,
                                      'Referer': to_detail_url['url']}))
            if red == 1:
                print("yes")
        rd.sadd(config.yk_get_detailurl_done, q)
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
def task_page_fetch():
    """
    Extract every tv url from each list page under every category/type.
    Url tasks must be deduplicated here.
    """
    retry = 5
    i = 0
    while True:
        page_url = rd.spop(config.yk_page_task)
        # page_url = rd.spop(config.yk_page_failed)  # retry mode
        if page_url is None:
            print(u"task_page_fetch sleeping 20sec....")
            return True
        print("page_url", page_url)
        if rd.sismember(config.yk_page_failed, page_url) or rd.sismember(
                config.yk_page_done, page_url):
            continue
        r = requests_get(url=page_url, headers=youku_home_headers, session=session)
        if r is False or r is None:  # fetch failed
            print(u'failed task:%s' % page_url)
            rd.sadd(config.yk_page_failed, page_url)
            continue
        print("done task_page_fetch:", page_url)
        data = parse_page_fetch(r, page_url)
        for x in data['yk_get_detailurl_task']:
            rd.sadd(config.yk_get_detailurl_task,
                    json.dumps(x))  # v_show-type links that go straight to a play page
        for x in data['yk_video_detail_task']:
            r_add = rd.sadd(config.yk_video_detail_task, json.dumps(x))  # detail_list_task
        rd.sadd(config.yk_page_done, page_url)
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
def task_category():
    """
    Parse the types under each category and queue the url tasks for every
    resource under each type. Url tasks must be deduplicated here.
    """
    retry = 5
    i = 0
    while True:
        category = rd.spop(config.yk_category_task)
        if category is None:
            print(u"task_category sleeping....20sec")
            return True
        category = json.loads(category)
        print(category)
        r = requests_get(url=category['url'], headers=youku_home_headers, session=session)
        if r is False or r is None:  # fetch failed
            print(u'failed task:%s' % category['url'])
            rd.sadd(config.yk_category_task_failed, category['url'])
            continue
        data = parse_category_show(r, category['url'])
        print("category and types:", json.dumps(data))
        if len(data['types']) == 0:  # the category has no sub-types
            re_sadd = rd.sadd(config.yk_types_task, category['url'])  # types url
        else:
            for ty in data['types']:
                if not rd.sismember(config.yk_types_done, data['types'][ty]) and not rd.sismember(
                        config.yk_types_failed, data['types'][ty]):
                    rd.sadd(config.yk_types_task, data['types'][ty])  # types fetch task
                    re_sadd = rd.sadd(config.yk_types_done, data['types'][ty])  # dedupe types urls for the db
                    if re_sadd == 0:  # already saved
                        continue
                    youku_video_types.insert(
                        {"name": ty, "url": data['types'][ty], "category": category['name']},
                        check_keys=False)  # save tv types
        rd.sadd(config.yk_category_task_done, category['url'])
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
def task_star():
    """Fetch douban star pages (currently draining the failed queue)."""
    retry = 5
    i = 0
    while True:
        # task = rd.spop(config.douban_star_task)
        task = rd.spop(config.douban_star_failed)  # retry mode
        if task is None:
            print(u"task_page sleeping....20sec")
            break
        if rd.sismember(config.douban_star_done, task):
            print(u"already done %s" % task)
            continue
        url = star_url.format(id=task)
        print(url)
        r = requests_get(url=url)
        if u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider has been blocked... break......")
            delay(block_wait)
            continue
        data = parse_star(r)
        if data is False or data is None or data.get("name") is None:
            rd.sadd(config.douban_star_failed, task)
            update_session()
            time.sleep(20)
            print("------spider sleeping 20 sec...")
            continue
        data['doubanid'] = task
        print(json.dumps(data))
        result = mongo_douban_stars.insert(data, check_keys=False)
        rd.sadd(config.douban_star_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result)
        i += 1
        if i % max_step == 0:  # rotate the bid cookie every max_step iterations
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
def task_star():
    """Search letv (so_url) for each star name and store the parsed result."""
    retry = 5
    i = 0
    while True:
        task = rd.spop(config.le_star_task)
        # task = u'{"7088": "石田卓也"}'
        if task is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        print(task)
        is_done = rd.sismember(config.le_star_done, task)
        if is_done:
            print("already done.")
            continue
        task_json = json.loads(task)
        url = so_url.format(wd=task_json[task_json.keys()[0]])
        r = requests_get(url=url, headers=leso_headers)
        if r is False or r is None:  # fetch failed
            print(u'failed task:%s' % url)
            rd.sadd(config.le_star_failed, task)
            continue
        data = parse_sostar(r, task_json)
        if data is False or data is None:
            rd.sadd(config.le_star_failed, task)
            continue
        mongo_id = mongo_letv_stars.insert(data, check_keys=False)
        if mongo_id:
            rd.sadd(config.le_star_done, task)
        else:
            print(mongo_id)
            rd.sadd(config.le_star_failed, task)
        print('done.')
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
def task_photos():
    """Fetch poster photos for each tv and replace its `poster` field in mongo."""
    retry = 5
    i = 0
    photos_url = u'https://movie.douban.com/subject/{id}/photos?type=R'
    while True:
        # A thread lock would have to go here if this ran multithreaded:
        # with threading.Lock():
        # task = rd.spop(config.douban_photos_task)
        task = rd.spop(config.douban_photos_failed)  # retry mode
        if task is None:
            print(u"task_page sleeping....20sec")
            return True
        if rd.sismember(config.douban_photos_done, task):
            print(u"already done %s" % task)
            continue
        T = json.loads(task)
        url = photos_url.format(id=T['id'])
        print(url)
        data = get_photos(url=url, id=T['id'])
        print("++++++++++++++++%s+++++++++++++%s++++++++++++" % (task, len(data)))
        if len(data) == 0:
            # rd.sadd(config.douban_photos_failed, task)
            continue
        print(json.dumps(data))
        # The follow-up trick: replace the poster field wholesale --
        # unset it first, then set the freshly fetched list.
        mongo_douban_tvs.update({'_id': ObjectId(T['mongoTVID'])},
                                {'$unset': {'poster': 1}}, multi=True)
        result = mongo_douban_tvs.update_one({'_id': ObjectId(T['mongoTVID'])},
                                             {'$set': {'poster': data}})
        if result.modified_count == 0:
            rd.sadd(config.douban_photos_failed, task)
        rd.sadd(config.douban_photos_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result.modified_count)
        i += 1
        if i % max_step == 0:  # rotate the bid cookie every max_step iterations
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
def parse_video(r):
    data = {}
    page = etree.HTML(r)

    year = re.search(u'<span class="year">\((\d{4})\)</span>', r)
    if year:
        data['year'] = year.group(1)
    title = re.search(u'<span property="v\:itemreviewed">(.*)</span>', r)
    if title:
        data['title'] = title.group(1)

    def extract_people(label, list_key, join_key):
        """Collect the names/doubanids linked after a label span ("编剧"/"导演"/
        "主演") and queue unseen celebrities onto douban_star_task."""
        el = page.xpath(u'//span[contains(text(),"%s")]' % label)
        if len(el) == 0:
            return
        links = el[0].getnext()
        if links is None:
            return
        data[list_key] = []
        names = ''
        for x in links.findall('a'):
            names = names + x.text + ","
            m = re.search(u'/celebrity/(\d*)/', x.get("href"))
            if m:
                doubanid = m.group(1)
                if not rd.sismember(config.douban_star_done, doubanid) and not rd.sismember(
                        config.douban_star_failed, doubanid):
                    rd.sadd(config.douban_star_task, doubanid)
            else:
                doubanid = x.get("href")
            data[list_key].append({"name": x.text, "doubanid": doubanid})
        data[join_key] = names.strip(',')

    extract_people(u"编剧", 'screenwriter_list', 'screenwriters')
    extract_people(u"导演", 'directors_list', 'directors')
    extract_people(u"主演", 'starring_list', 'starring')

    type_el = page.xpath(u'//span[@property="v:genre"]')  # genres
    if len(type_el) > 0:
        data['type'] = ','.join(x.text for x in type_el)
    producer_country_el = page.xpath(u'//span[contains(text(),"制片国家/地区:")]')
    if len(producer_country_el) > 0:
        data['producer_country'] = page.xpath(
            u'//span[contains(text(),"制片国家/地区:")]/following::text()[1]')[0]
    language_el = page.xpath(u'//span[contains(text(),"语言:")]')
    if len(language_el) > 0:
        data['language'] = page.xpath(
            u'//span[contains(text(),"语言:")]/following::text()[1]')[0]
    all_episode = page.xpath(u'//span[contains(text(),"集数:")]')
    if len(all_episode) > 0:
        data['all_episode'] = page.xpath(
            u'//span[contains(text(),"集数:")]/following::text()[1]')[0]
    episode_time = page.xpath(u'//span[contains(text(),"单集片长:")]')
    if len(episode_time) > 0:
        data['episode_time'] = page.xpath(
            u'//span[contains(text(),"单集片长:")]')[0].text
    season = page.xpath(u'//select[@id="season"]/option[@selected="selected"]')  # season number
    if len(season) > 0:
        data['season'] = season[0].text
    release_date_el = page.xpath(u'//span[@property="v:initialReleaseDate"]')  # premiere dates
    if len(release_date_el) > 0:
        data['release_date'] = '|'.join(x.text for x in release_date_el)
    duration_el = page.xpath(u'//span[@property="v:runtime"]')
    if len(duration_el) > 0:
        data['duration'] = duration_el[0].text  # runtime
    alias_al = page.xpath(u'//span[contains(text(),"又名:")]')
    if len(alias_al) > 0:
        data["alias"] = page.xpath(
            u'//span[contains(text(),"又名:")]/following::text()[1]')[0]
    IMDb_el = page.xpath(u'//span[contains(text(),"IMDb链接")]')
    if len(IMDb_el) > 0:
        data["IMDb"] = IMDb_el[0].getnext().get("href")
    rating = re.search(u'property="v\:average">(\d*\.\d*)</strong>', r)
    if rating:
        data['rating'] = rating.group(1)
    rating_sum = page.xpath(u'//span[@property="v:votes"]')
    if len(rating_sum) > 0:
        data['rating_sum'] = rating_sum[0].text
    summary_all = page.xpath(u'//span[@class="all hidden"]')
    summary = page.xpath(u'//span[@property="v:summary"]')
    if len(summary_all) > 0:
        data['summary'] = ''.join(page.xpath(u'//span[@class="all hidden"]/text()'))
    elif len(summary) > 0:
        data['summary'] = ''.join(page.xpath(u'//span[@property="v:summary"]/text()'))
    img_url = page.xpath(u'//img[@title="点击看更多海报"]')
    nbgnbg = page.xpath(u'//a[@title="点击看大图" and @class="nbgnbg"]')
    if len(img_url) > 0:
        data["img_url"] = img_url[0].get("src")
    elif len(nbgnbg) > 0:
        data["img_url"] = nbgnbg[0].get("href")
    tags = page.xpath(u'//div[@class="tags-body"]/a')
    data['tags'] = ','.join(x.text for x in tags)
    if len(data) == 0:
        print(r)
    return data
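# ---------------------------------------------------------------------------
# parse_video() takes the raw page html and returns a flat dict, queuing any
# unseen celebrities as a side effect; used as in task_video() above. The
# subject id below is illustrative only:
#
# html = requests_get(url=tv_url.format(id='123456'), headers=douban_home_headers)
# if html:
#     record = parse_video(html)
#     print(record.get('title'), record.get('rating'), record.get('tags'))
# ---------------------------------------------------------------------------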
def task_api():
    """Walk each douban ajax list url page by page and queue every tv id."""
    retry = 5
    i = 0
    while True:
        url = rd.spop(config.doubantv_ajax_task)
        origin_url = url
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        if rd.sismember(config.doubantv_ajax_task_done, url):
            print(u"already done %s" % url)
            continue
        start = 0
        while True:
            url = re.sub(u'start=(\d*)', 'start=%s' % str(start * 20), url)
            print(url)
            r = requests_get(url, headers=douban_referer_tag_headers)
            if r is False or r is None:  # fetch failed
                print(u'failed task:%s' % url)
                rd.sadd(config.doubantv_ajax_task_failed, url)
                continue
            try:
                r_data = json.loads(r)
            except Exception as e:
                rd.sadd(config.doubantv_ajax_task_failed, url)
                print(r)
                print(str(e))
                update_session()
                time.sleep(task_wait)
                print("-----spider sleeping 10 sec....")
                continue
            if len(r_data['data']) == 0:  # no more results: this seed is done
                rd.sadd(config.doubantv_ajax_task_done, origin_url)
                print("done%s" % origin_url)
                break
            for x in r_data['data']:
                if not rd.sismember(config.douban_tv_done, x['id']) and not rd.sismember(
                        config.douban_tv_failed, x['id']):
                    add_task = rd.sadd(config.douban_tv_task, x['id'])
                    if add_task == 1:
                        print("---------------join task.----%s--------------------" % x['id'])
                    else:
                        print('***********task repeat-******%s********************' % x['id'])
                rd.sadd(config.douban_tvids, x['id'])
            rd.sadd(config.doubantv_ajax_task_done, origin_url)
            print("sleep 2 seconds")
            delay()
            i += 1
            start += 1
            if i % max_step == 0:  # rotate the bid cookie every max_step iterations
                bid = random_str(10)
                session.cookies.set('bid', bid, domain='.douban.com', path='/')
                try:
                    session.get(url=ad_url.format(bid=bid),
                                headers=douban_referer_tag_headers,
                                timeout=timeout)
                except Exception as e:
                    pass
def spider_seed(tag_url=tag_url):
    """Pull the tag categories out of douban's app.js and seed the ajax task urls."""
    start = 1
    retry = 5
    while retry > 0:
        try:
            r = requests_get(url=tag_url, headers=douban_home_headers, timeout=timeout)
            appjs_url = re.search(
                u'<script type="text/javascript" src="((.*)app\.js)"></script>',
                r).group(1)
            print(appjs_url)
            appjs = requests_get(url=appjs_url, headers=douban_appjs_headers)
            jsdata = re.search(
                u'mixins\:\[f\.mixin\],data\:function\(\)\{return(.*)\},ready\:function\(\)\{window',
                appjs).group(1)
            print(jsdata)
            jsdata = re.sub(u'!', '', jsdata)
            jsdata = re.sub(u'browserHeight:document.documentElement.clientHeight', '', jsdata)
            jsdata = demjson.decode(jsdata)
            save_tags = rd.sadd(config.doubantv_tags, json.dumps(jsdata['tag_categories']))
            if save_tags == 1:
                mongo_douban_tags.insert(
                    {"tag_categories": jsdata["tag_categories"]}, check_keys=False)
            print(len(jsdata["tag_categories"][0]))
            print(len(jsdata["tag_categories"][1]))
            print(len(jsdata["tag_categories"][2]))
            print(len(jsdata["tag_categories"][3]))
            # Blank out the four "all ..." placeholder entries.
            jsdata["tag_categories"][1][0] = ""
            jsdata["tag_categories"][2][0] = ""
            jsdata["tag_categories"][3][0] = ""
            jsdata["tag_categories"][0][0] = ""
            for x in xrange(0, len(jsdata["tag_categories"][1])):  # genres ("全部类型")
                c1 = jsdata["tag_categories"][1][x]
                for x2 in xrange(0, len(jsdata["tag_categories"][2])):  # countries ("全部地区")
                    c2 = jsdata["tag_categories"][2][x2]
                    for x3 in xrange(0, len(jsdata["tag_categories"][3])):  # features ("全部特色", tag2)
                        c3 = jsdata["tag_categories"][3][x3]
                        url = ajax_list_url.format(tags=c3, genres=c1, countries=c2)
                        if not rd.sismember(config.doubantv_ajax_task_failed, url) and not rd.sismember(
                                config.doubantv_ajax_task_done, url):
                            rd.sadd(config.doubantv_ajax_task, url)
                            print(url)
                        for x0 in xrange(0, len(jsdata["tag_categories"][0])):  # forms ("全部形式", tag1)
                            c0 = jsdata["tag_categories"][0][x0]
                            c3c0 = c3 + ',' + c0
                            c3c0 = re.sub(u',$', "", c3c0)
                            c3c0 = re.sub(u'^,', "", c3c0)
                            url = ajax_list_url.format(tags=c3c0, genres=c1, countries=c2)
                            if not rd.sismember(config.doubantv_ajax_task_failed, url) and not rd.sismember(
                                    config.doubantv_ajax_task_done, url):
                                rd.sadd(config.doubantv_ajax_task, url)
                            url = ajax_list_url.format(tags=c0, genres=c1, countries=c2)
                            if not rd.sismember(config.doubantv_ajax_task_failed, url) and not rd.sismember(
                                    config.doubantv_ajax_task_done, url):
                                rd.sadd(config.doubantv_ajax_task, url)
                                print(url)
            return True
        except requests.exceptions.ProxyError as e:
            print("ttt", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("xxx", str(e))
            update_session(proxy)
        retry -= 1
        start += 1
        if start % max_step == 0:  # refresh the session every max_step iterations
            update_session()
def spider_seed(category_url=category_url):
    """Fetch the letv category list and seed the page task queues."""
    start = 1
    list_url = u'http://list.youku.com'
    retry = 5
    while retry > 0:
        try:
            r = requests_get(url=category_url, headers=leshi_headers, timeout=timeout)
            page = etree.HTML(r)
            category_el = page.xpath(
                u'//div[@class="list_box"]/div[@class="column"]/ul[@class="list_cnt"]/li')
            categories = []
            for x in category_el:
                if x.find("a") is not None:
                    categories.append({
                        "url": list_url + x.find("a").get("href"),
                        "title": x.find("a").text.replace(" ", "").replace("\n", "")
                    })
                else:
                    categories.append({
                        "url": category_url,
                        "title": x.text.replace(" ", "").replace("\n", "")
                    })
            print(json.dumps(categories))
            for x in categories:
                rd.sadd(config.le_page_task, x['url'])
                rd.sadd(config.le_page_urls, x['url'])
                urls = parse_all_url(x["url"])  # all urls under this category
                if urls is False:
                    rd.sadd(config.le_getpage_task, x["url"])  # fetching this category's urls failed
                    continue
                for xx in urls:  # walk each url and collect all urls on its page
                    rd.sadd(config.le_page_task, xx['url'])
                    rd.sadd(config.le_page_urls, xx['url'])
                    print(xx['url'])
                    print(xx['title'])
                    r = requests_get(url=xx["url"], headers=leshi_headers)
                    rr_urls = parse_all_url(r)
                    if rr_urls is False:
                        rd.sadd(config.le_getpage_task, x["url"])  # fetching this url's urls failed
                        continue
                    for xxx in rr_urls:
                        if rd.sismember(config.le_page_failed, xxx['url']):
                            continue
                        if rd.sismember(config.le_page_done, xxx['url']):
                            continue
                        rd.sadd(config.le_page_task, xxx['url'])
                        rd.sadd(config.le_page_urls, xxx['url'])
            return True
        except requests.exceptions.ProxyError as e:
            print("ttt", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("xxx", str(e))
            update_session(proxy)
        retry -= 1
        start += 1
        if start % 20 == 0:  # refresh the session every 20 iterations
            update_session()
def task_page():
    """Walk each letv list page's getLesoData ajax endpoint and store every tv."""
    retry = 5
    i = 0
    while True:
        url = rd.spop(config.le_page_task)
        # url = rd.spop(config.le_page_failed)  # retry mode
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        if rd.sismember(config.le_page_done, url):
            print(u"already done %s" % url)
            continue
        r = requests_get(url, headers=leshi_headers)
        if r is False or r is None:  # fetch failed
            print(u'failed task:%s' % url)
            rd.sadd(config.le_page_failed, url)
            continue
        m = re.search(
            u"frontUrl\: *'(http://list\.le\.com\/getLesoData([^',]+?))',", r)
        print("task_page:", url)
        if m:
            # e.g. http://list.le.com/getLesoData?from=pc&src=1&stype=1&ps=30&pn=1&ph=420001&dt=1&cg=2&or=4&stt=1&vt=180001
            ajax_url = m.group(1)
            pn = 1
            while True:
                ajax_url = re.sub(u"pn=\d*", 'pn=%s' % pn, ajax_url)
                print("ajax_url:", ajax_url)
                r = requests_get(url=ajax_url, headers=leshi_ajax_headers)
                if r is False or r is None:
                    rd.sadd(config.le_page_ajax_failed, ajax_url)
                    continue
                pn += 1
                try:
                    list_data = json.loads(r)
                except Exception as e:
                    print(str(e))
                    print(r)
                    print(ajax_url)
                    rd.sadd(config.le_page_ajax_failed, ajax_url)
                    print("continue")
                    continue
                if list_data.get("data").get("more") == False:
                    print("this url page fetch done")
                    break
                for x in list_data.get("data").get("arr"):
                    is_done = rd.sismember(config.le_tv_done, x["unique_id"])
                    if is_done:
                        print("already done!")
                        print(x['name'])
                        continue
                    # Initial cleanup: store the raw record wholesale. (An earlier
                    # version copied ~50 fields one by one -- summary, category,
                    # title, alias, language, plays_num, duration, doubanid,
                    # imgUrl, vids, episodes, nowEpisodes, videoList, ... -- and
                    # derived published_at from releaseDate, dropped because of
                    # bogus values like -28800000 / -126259200000.)
                    data = x
                    data['created_at'] = time.time()
                    data['updated_at'] = time.time()
                    data["ctime"] = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(int(x['ctime']) / 1000))  # letv ctime; exact meaning unclear
                    data["mtime"] = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(int(x['mtime']) / 1000))  # letv mtime; exact meaning unclear
                    data["images"] = [{
                        "url": x['images'][k],
                        "width": k.split('*')[0],
                        "height": k.split('*')[1]
                    } for k in x['images']]  # posters
                    data["actors"] = "".join(
                        [x['actor'][it] + "," for it in x['actor']])  # actors
                    data["directors"] = "".join(
                        [x['directory'][it] + "," for it in x['directory']])  # directors
                    starring_type = type(x['starring']).__name__
                    if starring_type != u'str':
                        for it in x['starring']:
                            if rd.sismember(config.le_star_done, json.dumps(it)):
                                continue
                            if rd.sismember(config.le_star_failed, json.dumps(it)):
                                continue
                            rd.sadd(config.le_star_task, json.dumps(it))  # leads
                        # Gotcha: Python copies mutable values by reference, so
                        # x['starring'] and data["starring"] would share the same
                        # object; build a fresh string instead.
                        data["starring"] = "".join([
                            starring[starring.keys()[0]] + ","
                            for starring in x['starring']
                        ])
                    if type(x['actor']).__name__ != u'str':
                        for it in x['actor']:
                            if rd.sismember(config.le_star_done,
                                            json.dumps({it: x['actor'][it]})):
                                continue
                            if rd.sismember(config.le_star_failed,
                                            json.dumps({it: x['actor'][it]})):
                                continue
                            print(json.dumps({it: x['actor'][it]}))
                            rd.sadd(config.le_star_task, json.dumps({it: x['actor'][it]}))
                    if type(x['directory']).__name__ != u'str':
                        for it in x['directory']:
                            if rd.sismember(config.le_star_done,
                                            json.dumps({it: x['directory'][it]})):
                                continue
                            if rd.sismember(config.le_star_failed,
                                            json.dumps({it: x['directory'][it]})):
                                continue
                            rd.sadd(config.le_star_task, json.dumps({it: x['directory'][it]}))
                    print("done!")
                    mongo_letv_tvs.insert(data, check_keys=False)
                    rd.sadd(config.le_tv_done, x['unique_id'])
        else:
            print(u'failed task:%s' % url)
            rd.sadd(config.le_page_failed, url)
            continue
        i += 1
        if i % max_step == 0:  # refresh the session every max_step iterations
            update_session()
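# ---------------------------------------------------------------------------
# How these workers could be driven (an assumed runner, not part of the
# original module): each task_* function drains its redis queue and then
# returns or sleeps, so one thread per worker is enough.
#
# import threading
#
# def run_forever(worker, pause=20):
#     while True:
#         worker()            # returns once its queue is empty
#         time.sleep(pause)   # wait for new tasks to be seeded
#
# if __name__ == '__main__':
#     for w in (task_page, task_star):
#         t = threading.Thread(target=run_forever, args=(w,))
#         t.daemon = True
#         t.start()
# ---------------------------------------------------------------------------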