def get():
    """Log in to zhihu.com and return the logged-in ``requests`` session.

    A session is created with a User-Agent from ``UserAgent.get()`` and
    proxies from ``Proxy.get()``. Up to six login attempts are made:

    * A rejected login (response field ``r`` is not ``0``) switches to
      another account via ``Account.get(account)`` and retries.
    * A ``requests.RequestException`` swaps the proxy on odd-numbered
      attempt indexes and waits 10 seconds on even-numbered ones.

    Returns:
        The session object once a login response reports ``r == 0``.

    Raises:
        AccountException: the sixth attempt's login was rejected.
        NetworkException: the sixth attempt failed with a
            ``requests.RequestException``.
    """
    max_attempts = 6
    last_attempt = max_attempts - 1

    session = requests.session()
    session.headers.update({'User-Agent': UserAgent.get()})
    session.proxies = Proxy.get()
    account = Account.get()

    for attempt in range(max_attempts):
        try:
            # The home page is fetched first; the XSRF token is read from it.
            page = session.get("http://www.zhihu.com").content
            captcha = decaptcha(session)
            form = {
                '_xsrf': PageHelper.get_xsrf(page),
                'password': account['password'],
                'remember_me': 'true',
            }
            if captcha:
                form['captcha'] = captcha

            # A username containing '@' is submitted as an e-mail address,
            # anything else as a phone number.
            # NOTE(review): both endpoint literals are masked in this source;
            # presumably they were distinct login URLs - confirm.
            username = account['username']
            if '@' in username:
                form['email'] = username
                login = '******'
            else:
                form['phone_num'] = username
                login = '******'

            result = session.post(login, form).json()
            if result['r'] == 0:
                logger.warning("Login Success")
                break

            if attempt == last_attempt:
                raise AccountException()
            logger.error('Login Failed %s, Change Account, Retry', result)
            account = Account.get(account)
        except requests.RequestException:
            if attempt == last_attempt:
                raise NetworkException()
            if attempt % 2 == 1:
                session.proxies = Proxy.get()
                logger.error("Request Failed, Change Proxy, Retry")
            else:
                # NOTE(review): the sleep is assumed to belong to this branch
                # only (the source layout was flattened) - confirm.
                logger.error("Request Failed, Retry in 10s")
                time.sleep(10)

    return session
def terminate(*msg):
    """Hand back held work, publish a final status and exit the process.

    The account reported by ``Account.get_using()`` is pushed back through
    ``rh.push_acct`` when it has a ``'username'`` key, and a non-empty
    module-level ``task`` is pushed back through ``rh.lpush_task_user``.
    The module-level ``status`` dict is then updated and published.

    Args:
        *msg: Optional. Only the first value is used: it is logged, stored
            (as ``str``) in ``status['message']`` with status ``'error'``,
            and passed to ``sys.exit``. With no arguments the status is
            ``'exit'`` and the exit code is ``0``.

    Raises:
        SystemExit: always, as the last step.
    """
    has_reason = bool(msg)

    if has_reason:
        logger.error(msg[0])
        outcome = {
            'status': 'error',
            'message': str(msg[0]),
        }
    else:
        outcome = {
            'status': 'exit',
            'message': 'Exit gracefully',
        }
    status.update(outcome)

    # NOTE(review): assumed to be logged on both paths (the source layout
    # was flattened) - confirm it is not part of the no-argument branch.
    logger.error('Terminate gracefully...')

    acct = Account.get_using()
    if 'username' in acct:
        rh.push_acct(acct)
        logger.error('return account %s', acct['username'])

    if task:
        rh.lpush_task_user(task)
        logger.error('return task %s', task)

    final_fields = {
        'finished': finished,
        'task': task,
        'account': acct,
        'update_time': int(time.time()),
    }
    status.update(final_fields)
    rh.publish_status(status)
    logger.error('Exit')

    sys.exit(msg[0] if has_reason else 0)
def start(instance_id):
    """Run the crawl loop for one worker instance.

    Publishes an initial status record, then loops forever: take a user task
    from ``rh``, skip it if already crawled, crawl its followings with
    ``FollowingsCrawler``, push the result, update the ``finished`` counters
    and sleep 1-5 seconds. The loop has no normal exit; the function ends
    only through ``terminate()`` / ``sys.exit``.

    Args:
        instance_id: Identifier of this worker; combined with the host name
            to form the ``id`` field of the published status.

    Raises:
        SystemExit: on ``RedisException`` (directly via ``sys.exit``) and,
            through ``terminate()``, on ``ResponseException``,
            ``NetworkException`` or any other ``Exception``.
    """
    global task, status
    task = ''

    # NOTE(review): the source layout was flattened; the assignment of the
    # except clauses to the inner/outer try below is reconstructed - confirm.
    try:
        logger.warning('Instance id: %s', instance_id)
        host = socket.gethostname()
        address = socket.gethostbyname(host)
        started_at = int(time.time())
        crawl_session = SessionHelper()

        status = {
            'id': '-'.join((host, str(instance_id))),
            'hostname': host,
            'ip': address,
            'finished': finished,
            'task': '',
            'status': 'init',
            'message': '',
            'account': Account.get_using(),
            'start_time': started_at,
            'update_time': int(time.time()),
        }
        rh.publish_status(status)

        while True:
            # Clear the global before fetching: terminate() re-queues any
            # non-empty task, so a failure while fetching must not push the
            # previous task back.
            task = ''
            task = rh.get_task_user()
            logger.warning('Get task: ' + task)

            if rh.is_user_crawled(task):
                logger.warning("User %s crawled, skip", task)
                continue

            progress = {
                'finished': finished,
                'task': task,
                'status': 'crawling',
                'account': Account.get_using(),
                'update_time': int(time.time()),
            }
            status.update(progress)
            rh.publish_status(status)

            try:
                user = FollowingsCrawler(crawl_session, task).get()
                logger.warning('Push result: ' + task)
                rh.push_result_user(user)
                finished['user'] += 1
                finished['followings'] += len(user['followings'])
                time.sleep(random.uniform(1, 5))
            except NotFoundException:
                # The task is dropped (not re-queued) and the loop moves on.
                logger.error("User %s not found, continue" % task)
                continue
            except ResponseException:
                terminate('Crawling response error, push back task, quit')
    except RedisException:
        # Exits directly instead of calling terminate(), which itself
        # talks to rh.
        logger.error('Redis connection error, quit')
        sys.exit('Redis Error!')
    except NetworkException:
        terminate('Network connection error, quit')
    except Exception as e:
        print(traceback.format_exc())
        terminate(e)