def task_video():
    """Drain the TV-subject queue: fetch, parse and store each subject page.

    Repeatedly pops a subject id from the ``config.douban_tv_task`` Redis set,
    downloads ``tv_url`` for it, parses the page with ``parse_video`` and
    inserts the result into ``mongo_douban_tvs``. For each stored subject a
    photos task (a JSON string with ``id`` and ``mongoTVID``) is queued in
    ``config.douban_photos_task`` and the id is added to
    ``config.douban_tv_done``.

    Ids whose request fails, whose page has no ``title`` after parsing, or
    whose response is the anti-crawler block page are added to
    ``config.douban_tv_failed``.

    Returns:
        True once the task set is empty.
    """
    i = 0
    while True:
        # To re-run previously failed ids, pop from config.douban_tv_failed
        # instead.
        tv_id = rd.spop(config.douban_tv_task)
        if tv_id is None:
            print(u"task_page sleeping....20sec")
            return True

        # Finished ids are recorded in douban_tv_done at the end of this loop,
        # so that is the set to consult here.
        if rd.sismember(config.douban_tv_done, tv_id):
            print(u"already done%s" % tv_id)
            continue

        url = tv_url.format(id=tv_id)
        r = requests_get(url=url, headers=douban_home_headers)
        if r is False or r is None:
            rd.sadd(config.douban_tv_failed, tv_id)
            continue

        try:
            check_block(r)
        except Exception as e:
            print("check_block:", str(e))

        if u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider ben block... break......")
            # Keep the id so it is not lost while we are blocked.
            rd.sadd(config.douban_tv_failed, tv_id)
            delay(block_wait)
            continue

        data = parse_video(r)
        piw = piwik(page_title=page_title(r), session_time=session_time,
                    origin_url=url, urlref='')
        print("piw", piw)

        if data.get("title") is None:
            rd.sadd(config.douban_tv_failed, tv_id)
            time.sleep(task_wait)
            print("------spider ben block...")
            continue

        data['doubanid'] = tv_id
        print(json.dumps(data))
        mongo_r = mongo_douban_tvs.insert(data, check_keys=False)

        # Queue the poster/photos crawl for the document just inserted.
        photostask = json.dumps({"id": tv_id, "mongoTVID": str(mongo_r)})
        if (not rd.sismember(config.douban_photos_done, photostask)
                and not rd.sismember(config.douban_photos_failed, photostask)):
            rd.sadd(config.douban_photos_task, photostask)
        print(photostask)

        rd.sadd(config.douban_tv_done, tv_id)
        print("done.. sleep %s seconds." % task_wait)
        delay()

        i += 1
        if i % max_step == 0:
            # Rotate the bid cookie every max_step pages.
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
def piwik(page_title, session_time, origin_url, urlref=''):
    """Report a page view (user-behaviour beacon) to Douban's Piwik endpoint.

    Builds the tracking URL from the arguments and requests it with
    ``requests_get``, using a copy of ``douban_home_headers`` whose
    ``Referer`` is set to ``origin_url``.

    Args:
        page_title: Value for the ``action_name`` query parameter.
        session_time: Value for the ``_idts`` query parameter.
        origin_url: Page being reported; also sent as the ``Referer`` header.
        urlref: Value for the ``urlref`` query parameter.

    Returns:
        Whatever ``requests_get`` returns for the tracking request.
    """
    # NOTE(review): the values are interpolated without URL-encoding; this
    # relies on the HTTP layer to escape them - confirm for titles containing
    # '&' or '#'.
    url = (u'https://fundin.douban.com/piwik?action_name={page_title}'
           u'&idsite=100001&rec=1&r=579246&h=20&m=14&s=21'
           u'&url={origin_url}&urlref={urlref}'
           u'&_id={_id}&_idts={_idts}&_idvc=1&_idn=1&_refts=0'
           u'&_viewts={_viests}'
           u'&pdf=1&qt=0&realp=0&wma=0&dir=0&fla=0&java=0&gears=0&ag=0'
           u'&cookie=1&res=1366x768&gt_ms=1143')
    url = url.format(page_title=page_title,
                     origin_url=origin_url,
                     _id=random_str(16, True),
                     _idts=session_time,
                     _viests=int(time.time()) + 3,
                     urlref=urlref)
    # Copy before editing: douban_home_headers is shared by other requests and
    # must not pick up this call's Referer.
    headers = dict(douban_home_headers)
    headers['Referer'] = origin_url
    return requests_get(url=url, headers=headers)
def update_session(proxy=None):
    """Refresh the identity of the shared ``session``.

    Sets a new random ``bid`` cookie for ``.douban.com`` on the module-level
    ``session`` and resets the module-level ``session_time`` to the current
    Unix time.

    Args:
        proxy: Currently unused; the proxy-rotation code is disabled.
    """
    # Without this declaration the assignment below would only bind a local
    # and the module-level session_time would never change.
    global session_time

    bid = random_str(10)
    session.cookies.set('bid', bid, domain='.douban.com', path='/')
    # NOTE(review): this sets an attribute on the session's adapter mapping;
    # it probably does not configure retries - confirm, and mount an
    # HTTPAdapter(max_retries=5) if retries are actually wanted.
    session.adapters.DEFAULT_RETRIES = 5
    session_time = int(time.time())
def task_star():
    """Re-process celebrity ids from the failed-star queue.

    Repeatedly pops an id from the ``config.douban_star_failed`` Redis set,
    downloads ``star_url`` for it, parses the page with ``parse_star`` and
    inserts the result into ``mongo_douban_stars``; the id is then added to
    ``config.douban_star_done``.

    An id is put back into ``config.douban_star_failed`` when the request
    fails, when parsing yields no ``name``, or when the anti-crawler block
    page is served. The loop ends (returning None) once the set is empty.
    """
    i = 0
    while True:
        # To process fresh ids, pop from config.douban_star_task instead.
        task = rd.spop(config.douban_star_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            break

        if rd.sismember(config.douban_star_done, task):
            print(u"already done%s" % task)
            continue

        url = star_url.format(id=task)
        print(url)
        r = requests_get(url=url)

        data = None
        # A failed request (False/None) must not reach the substring test
        # below; it is handled by the generic failure branch instead.
        if r is not False and r is not None:
            if u'检测到有异常请求从你的 IP 发出' in r:
                print("------spider ben block... break......")
                # Keep the id so it is not lost while we are blocked.
                rd.sadd(config.douban_star_failed, task)
                delay(block_wait)
                continue
            data = parse_star(r)

        if not data or data.get("name") is None:
            rd.sadd(config.douban_star_failed, task)
            update_session()
            time.sleep(20)
            print("------spider ben sleep 20 sec...")
            continue

        data['doubanid'] = task
        print(json.dumps(data))
        result = mongo_douban_stars.insert(data, check_keys=False)
        rd.sadd(config.douban_star_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result)

        i += 1
        if i % max_step == 0:
            # Rotate the bid cookie every max_step pages.
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
def task_photos():
    """Re-process poster/photo tasks from the failed-photos queue.

    Each task is a JSON string with ``id`` (subject id) and ``mongoTVID``
    (``_id`` of the subject document in ``mongo_douban_tvs``). For every task
    popped from ``config.douban_photos_failed`` the photos page is fetched via
    ``get_photos`` and the result replaces the document's ``poster`` field.

    Returns:
        True once the set is empty.
    """
    i = 0
    photos_url = u'https://movie.douban.com/subject/{id}/photos?type=R'
    while True:
        # NOTE: the original author flagged that a thread lock belongs around
        # this pop; it is currently disabled.
        # To process fresh tasks, pop from config.douban_photos_task instead.
        task = rd.spop(config.douban_photos_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            return True

        if rd.sismember(config.douban_photos_done, task):
            print(u"already done%s" % task)
            continue

        job = json.loads(task)
        url = photos_url.format(id=job['id'])
        print(url)
        data = get_photos(url=url, id=job['id'])

        # Presumably get_photos can return False/None on failure (the disabled
        # legacy code checked for it); treat that like an empty result rather
        # than crashing in len().
        count = len(data) if data else 0
        print("++++++++++++++++%s+++++++++++++%s++++++++++++" % (task, count))
        if count == 0:
            # Deliberately not re-queued.
            continue
        print(json.dumps(data))

        # Drop any existing poster first, then store the fresh list.
        doc_filter = {'_id': ObjectId(job['mongoTVID'])}
        mongo_douban_tvs.update(doc_filter, {'$unset': {
            'poster': 1
        }}, multi=True)
        result = mongo_douban_tvs.update_one(doc_filter, {'$set': {
            'poster': data
        }})
        if result.modified_count == 0:
            rd.sadd(config.douban_photos_failed, task)
        # The task is marked done even when nothing was modified, so a task
        # re-added to the failed set above is skipped on its next pop.
        rd.sadd(config.douban_photos_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result.modified_count)

        i += 1
        if i % max_step == 0:
            # Rotate the bid cookie every max_step tasks.
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
def task_api():
    """Walk the paginated search API and queue the subject ids it lists.

    Pops a search URL from ``config.doubantv_ajax_task``, then requests it
    page by page (the ``start=`` parameter advances by 20 per page) until a
    page with an empty ``data`` list is returned. Every listed id is added to
    ``config.douban_tvids``; ids that are in neither ``config.douban_tv_done``
    nor ``config.douban_tv_failed`` are also added to
    ``config.douban_tv_task``.

    A page whose request fails or whose body is not the expected JSON is
    recorded in ``config.doubantv_ajax_task_failed`` and retried; after
    ``retry`` consecutive failures the URL is abandoned and the next one is
    popped. The outer loop never returns.
    """
    retry = 5  # consecutive failures tolerated for one page
    i = 0
    while True:
        url = rd.spop(config.doubantv_ajax_task)
        origin_url = url
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue

        if rd.sismember(config.doubantv_ajax_task_done, url):
            print(u"already done%s" % url)
            continue

        start = 0
        failures = 0
        while True:
            url = re.sub(r'start=(\d*)', 'start=%s' % str(start * 20), url)
            print(url)
            r = requests_get(url, headers=douban_referer_tag_headers)
            if r is False or r is None:
                # Request failed.
                print(u'filed task:%s' % url)
                rd.sadd(config.doubantv_ajax_task_failed, url)
                failures += 1
                if failures >= retry:
                    print(u"giving up after %s failures:%s" % (failures, url))
                    break
                continue

            try:
                r_data = json.loads(r)
                # A body without a 'data' list is handled like unparsable
                # JSON instead of raising out of the crawler.
                items = r_data['data']
            except (ValueError, TypeError, KeyError) as e:
                rd.sadd(config.doubantv_ajax_task_failed, url)
                print(r)
                print(str(e))
                update_session()
                time.sleep(task_wait)
                print("-----spider ben sleep 10 sec....")
                failures += 1
                if failures >= retry:
                    print(u"giving up after %s failures:%s" % (failures, url))
                    break
                continue

            failures = 0
            if len(items) == 0:
                # An empty page means the listing is exhausted.
                rd.sadd(config.doubantv_ajax_task_done, origin_url)
                print("done%s" % origin_url)
                break

            for x in items:
                tv_id = x['id']
                queued = False
                if (not rd.sismember(config.douban_tv_done, tv_id)
                        and not rd.sismember(config.douban_tv_failed, tv_id)):
                    # sadd returns 1 only when the id was not already queued.
                    queued = rd.sadd(config.douban_tv_task, tv_id) == 1
                if queued:
                    print(
                        "---------------join task.----%s--------------------"
                        % tv_id)
                else:
                    print(
                        '***********task repeat-******%s********************'
                        % tv_id)
                rd.sadd(config.douban_tvids, tv_id)

            rd.sadd(config.doubantv_ajax_task_done, origin_url)
            print("sleep 2 seconds")
            delay()
            i += 1
            start += 1
            if i % max_step == 0:
                # Rotate the bid cookie every max_step pages and hit the ad
                # counter URL with the new bid.
                bid = random_str(10)
                session.cookies.set('bid', bid, domain='.douban.com', path='/')
                try:
                    session.get(url=ad_url.format(bid=bid),
                                headers=douban_referer_tag_headers,
                                timeout=timeout)
                except Exception:
                    # Best effort only: a failed ad ping must not stop the
                    # crawl.
                    pass
tv_url = u'https://movie.douban.com/subject/{id}/' ajax_list_url = u'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags={tags}&start=0&genres={genres}&countries={countries}' verify_users_url = u'https://m.douban.com/rexxar/api/v2/movie/{id}/verify_users?start=0&count=2&ck=' star_url = u'https://movie.douban.com/celebrity/{id}/' session = requests.Session() session.adapters.DEFAULT_RETRIES = 5 # session.cookies[] = u'll="118318"; bid=JMjve9nh9Ug; __yadk_uid=rma3RP9OuF1JDekWWGEQLIVRGDlSc5wR; _vwo_uuid_v2=D4BE7289F6AA483D6B792C38D0EC9C2F1|992a80a86f70b1cd20ef12e7e7959793; ap=1; dbcl2="154152988:v2gmo0C6RvA"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.15415; ck=sy5V; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1525319368%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=74d116d143255fe8.1525244970.3.1525319368.1525256304.; _pk_ses.100001.4cf6=*; __utma=30149280.487994295.1525244966.1525256302.1525319370.3; __utmc=30149280; __utmz=30149280.1525319370.3.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.310121982.1525244970.1525256304.1525319370.3; __utmb=223695111.0.10.1525319370; __utmc=223695111; __utmz=223695111.1525319370.3.3.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.2.10.1525319370' # session.get(url=home_url, headers=douban_home_headers, timeout=10) timeout = 3 proxy = '' task_wait = 0 block_wait = 10 max_step = 2 # 线程数越多,该值就尽量调小 40 / 线程数,,,减少 block ll = 118318 bid = random_str(10) session_time = int(time.time()) ad_url = u'https://erebor.douban.com/count/?ad=195767&bid={bid}&unit=dale_movie_tag_bottom_banner&type=impression' def delay(wait=0): time.sleep(wait) def update_session(proxy=None): """ 更新session proxy: """ # if proxy != None: