async def crawl_follow(self):
    """Continuously drain follower jobs from the 'now' redis queue.

    NOTE(review): a second ``crawl_follow`` coroutine is defined later in
    this file using ``self.redis_job``; the later definition wins when both
    live on the same class — confirm which one is intended.
    """
    import asyncio  # local import so this fix is self-contained

    while True:
        # redis_job_now appears to be a synchronous client elsewhere in this
        # file (see topic_finding_now), hence no await here.
        follow_dict = self.redis_job_now.fetch_job(JobType.follower.value)
        if follow_dict:
            try:
                await self.grab_follow(follow_dict)
                LOGGER.info('finish %d follow crawl ' % follow_dict['uid'])
            except TimeoutError as e:
                # fixed: was print(e) — route through the module logger
                LOGGER.warning('follow crawl timeout: %s', e)
            except Exception:
                # fixed: bare `except:` also swallowed asyncio.CancelledError,
                # preventing clean task cancellation
                LOGGER.error(traceback.format_exc())
                # back off after an unexpected failure
                # fixed: time.sleep() here blocked the entire event loop
                await asyncio.sleep(5 * 60)
async def crawl_repost(self):
    """Continuously consume repost jobs and crawl each tweet's reposts."""
    import asyncio  # local import so this fix is self-contained

    while True:
        repost_job_info = await self.redis_job.fetch_job(JobType.repost.value)
        if repost_job_info:
            try:
                await self.grab_tweet_repost(repost_job_info)
            except TimeoutError:
                # timeouts are expected; skip the job silently
                pass
            except Exception:
                # fixed: bare `except:` also swallowed asyncio.CancelledError,
                # preventing clean task cancellation
                LOGGER.error("something error")
                LOGGER.error(traceback.format_exc())
                # back off after an unexpected failure
                # fixed: time.sleep() here blocked the entire event loop
                await asyncio.sleep(5 * 60)
async def search(self):
    """Continuously consume search jobs and run each tweet search."""
    import asyncio  # local import so this fix is self-contained

    while True:
        search_job_info = await self.redis_job.fetch_job(JobType.search.value)
        if search_job_info:
            try:
                await self.search_tweet(search_job_info)
            except TimeoutError:
                # timeouts are expected; skip the job silently
                pass
            except Exception:
                # fixed: bare `except:` also swallowed asyncio.CancelledError,
                # preventing clean task cancellation
                LOGGER.error(traceback.format_exc())
                # back off after an unexpected failure
                # fixed: time.sleep() here blocked the entire event loop
                await asyncio.sleep(5 * 60)
def topic_finding_now(self):
    """Fetch one topic job from the 'now' queue and process it.

    Unlike the ``crawl_*`` coroutines this is synchronous and runs a single
    fetch per call (no loop); presumably the caller invokes it repeatedly —
    confirm at the call site.
    """
    topic_job_info = self.redis_job_now.fetch_job(JobType.topic.value)
    if topic_job_info:
        try:
            # fixed: was a bare print() of the job dict
            LOGGER.info('topic job: %s', topic_job_info)
            LOGGER.info('topic finding')
            self.search_topic_user_now(topic_job_info)
        except TimeoutError:
            # fixed: removed dead `pass` after the log call
            LOGGER.info('topic finding timeout error')
        except Exception:
            # fixed: narrowed from a bare `except:` so SystemExit and
            # KeyboardInterrupt still propagate
            LOGGER.error(traceback.format_exc())
            # back off after an unexpected failure
            sleep(5 * 60)
async def crawl_comment(self):
    """Continuously consume comment jobs and crawl each tweet's comments."""
    import asyncio  # local import so this fix is self-contained

    while True:
        comment_job_info = await self.redis_job.fetch_job(JobType.comment.value)
        if comment_job_info:
            try:
                await self.grab_tweet_comments(comment_job_info)
            except TimeoutError:
                # timeouts are expected; skip the job silently
                pass
            except Exception:
                # fixed: bare `except:` also swallowed asyncio.CancelledError,
                # preventing clean task cancellation
                LOGGER.error("something error")
                LOGGER.error(traceback.format_exc())
                # back off after an unexpected failure
                # fixed: time.sleep() here blocked the entire event loop
                await asyncio.sleep(5 * 60)
async def crawl_follow(self):
    """Continuously consume follower jobs and crawl each user's follow list.

    NOTE(review): this duplicates the name of an earlier ``crawl_follow``
    that reads from ``self.redis_job_now``; if both are methods of the same
    class, this later definition shadows the earlier one — confirm intent.
    """
    import asyncio  # local import so this fix is self-contained

    while True:
        follow_dict = await self.redis_job.fetch_job(JobType.follower.value)
        if follow_dict:
            try:
                await self.grab_follow(follow_dict)
            except TimeoutError:
                # timeouts are expected; skip the job silently
                pass
            except Exception:
                # fixed: bare `except:` also swallowed asyncio.CancelledError,
                # preventing clean task cancellation
                LOGGER.error(traceback.format_exc())
                # back off after an unexpected failure
                # fixed: time.sleep() here blocked the entire event loop
                await asyncio.sleep(5 * 60)
async def super_fan_finding(self):
    """Continuously consume super-fan jobs from the 'now' queue."""
    import asyncio  # local import so this fix is self-contained

    while True:
        # redis_job_now appears to be a synchronous client elsewhere in this
        # file (see topic_finding_now), hence no await on fetch_job.
        topic_job_info = self.redis_job_now.fetch_job(JobType.superfan.value)
        if topic_job_info:
            try:
                # fixed: was a bare print() of the job dict
                LOGGER.info('super fan job: %s', topic_job_info)
                LOGGER.info('super fan finding')
                await self.search_super_fan(topic_job_info)
            except TimeoutError:
                # fixed: removed dead `pass` after the log call
                LOGGER.info('super fan finding timeout error')
            except Exception:
                # fixed: bare `except:` also swallowed asyncio.CancelledError,
                # preventing clean task cancellation
                LOGGER.error(traceback.format_exc())
                # back off after an unexpected failure
                # fixed: time.sleep() here blocked the entire event loop
                await asyncio.sleep(5 * 60)
def main():
    """Log every configured account into Weibo and cache its cookies in redis.

    Iterates ACCOUNTS, attempts a selenium login for each, and stores the
    resulting cookies via RedisCookies; tallies successes and failures.
    """
    login_client = WeiboLogin()
    succeeded = []
    failed = []
    # the cookie names that a successful weibo.com login must have set
    required_cookies = ('SSOLoginState', 'SUBP', 'SUHB')
    for account in ACCOUNTS:
        try:
            LOGGER.info('get cookies for %s' % str(account))
            cookies = login_client.login_by_selenium(
                account['user'], account['password'])
            if cookies is not None and all(k in cookies for k in required_cookies):
                succeeded.append(account)
                RedisCookies.save_cookies(account['user'], cookies)
            else:
                failed.append(account)
        except Exception:
            LOGGER.error("get cookies failed")
            traceback.print_exc()
            failed.append(account)
    LOGGER.info("%d accounts login success" % len(succeeded))
    LOGGER.info("%d accounts login failed" % len(failed))
async def crawl_user(self):
    """Continuously consume user jobs and crawl each user's profile."""
    import asyncio  # local import so this fix is self-contained

    while True:
        user_job_info = await self.redis_job.fetch_job(JobType.user.value)
        if user_job_info:
            try:
                await self.grab_user_info(user_job_info['user_id'])
                # NOTE(review): pushing follow-up tweet/follower jobs for the
                # user was disabled (commented out) in the original — confirm
                # whether that chaining should be restored.
            except TimeoutError:
                # timeouts are expected; skip the job silently
                pass
            except Exception:
                # fixed: bare `except:` also swallowed asyncio.CancelledError,
                # preventing clean task cancellation
                LOGGER.error(traceback.format_exc())
                # back off after an unexpected failure
                # fixed: time.sleep() here blocked the entire event loop
                await asyncio.sleep(5 * 60)
async def crawl_weibo(self):
    """Continuously consume tweet jobs and crawl user tweet pages.

    Jobs whose URL points past page 200 are dropped to bound the amount of
    history crawled per user.
    """
    import asyncio  # local import so this fix is self-contained

    # hoisted out of the loop: pattern extracting (uid, page) from job URLs
    page_pattern = re.compile(r'https://weibo.cn/(\d*)\?page=(\d*)')
    while True:
        tweet_job_info = await self.redis_job.fetch_job(JobType.tweet.value)
        if tweet_job_info:
            matches = page_pattern.findall(tweet_job_info['url'])
            if matches:
                page_no = int(matches[0][1])
                if page_no > 200:
                    LOGGER.info('job passed %s' % str(tweet_job_info))
                    continue
            try:
                await self.grab_user_tweet(tweet_job_info)
            except TimeoutError:
                # timeouts are expected; skip the job silently
                pass
            except Exception:
                # fixed: bare `except:` also swallowed asyncio.CancelledError,
                # preventing clean task cancellation
                LOGGER.error(traceback.format_exc())
                # back off after an unexpected failure
                # fixed: time.sleep() here blocked the entire event loop
                await asyncio.sleep(5 * 60)
def login_by_selenium(self, weibo_user, weibo_password):
    """Log into weibo.com with Firefox/Selenium and return the session cookies.

    Fills the login form, solves the captcha through the yun_da_ma service
    when one appears, then visits a weibo.cn page so its cookies are set too.

    :param weibo_user: account user name
    :param weibo_password: account password
    :return: ``{name: value}`` cookie dict on success, ``None`` on failure

    Fixed: the browser is now released in a ``finally`` block, so a mid-login
    exception no longer leaks a Firefox process.
    """
    browser = webdriver.Firefox()
    try:
        browser.maximize_window()
        try_time = 5
        cookie_got = False
        browser.get('https://weibo.com/login.php')
        username = browser.find_element_by_id("loginname")
        username.clear()
        username.send_keys(weibo_user)
        psd = browser.find_element_by_xpath('//input[@type="password"]')
        psd.clear()
        psd.send_keys(weibo_password)
        commit_btn = browser.find_element_by_xpath(
            '//a[@node-type="submitBtn"]')
        commit_btn.click()
        # login is not instantaneous; give the page time to settle
        sleep(5)
        while try_time:
            try:
                # a verify-code box is present only while login has not succeeded
                browser.find_element_by_xpath(
                    '//div[@node-type="verifycode_box"]')
                code_input = browser.find_element_by_xpath(
                    '//input[@node-type="verifycode"]')
                LOGGER.info("need input verify code")
                code_input.send_keys(' ')
                img_path = self.save_verify_code_img(browser, weibo_user)
                # NOTE(review): this wait is unbounded — if the screenshot is
                # never written, it spins forever; consider a timeout.
                while not os.path.exists(img_path):
                    LOGGER.info(img_path + "not exist")
                    sleep(1)
                LOGGER.info(img_path)
                captcha_id, code_text = self.yun_da_ma.recognize(img_path)
                code_str = bytes.decode(code_text)
                LOGGER.info('recognize result: %s' % code_str)
                code_input.clear()
                code_input.send_keys(code_str)
                commit_btn = browser.find_element_by_xpath(
                    '//a[@node-type="submitBtn"]')
                commit_btn.click()
                # wait a moment for the captcha submission to be processed
                sleep(3)
                try_time -= 1
            except (StaleElementReferenceException, NoSuchElementException):
                # fixed: merged two identical except arms — the captcha box is
                # gone (or was never there), which means login succeeded
                cookie_got = True
                print('login success')
                break
            except ElementNotInteractableException:
                sleep(2)
                try_time -= 1
        if cookie_got:
            sleep(2)
            LOGGER.info('get https://weibo.cn/1316949123/info')
            # visit a weibo.cn page so the weibo.cn cookies get populated too
            browser.get('https://weibo.cn/1316949123/info')
            sleep(2)
            cookies_dict = {}
            for elem in browser.get_cookies():
                cookies_dict[elem['name']] = elem['value']
                print(elem["name"], elem["value"])
            return cookies_dict
        LOGGER.error("get cookie failed :%s" % weibo_user)
        return None
    finally:
        # fixed: always release the browser, even on an unexpected exception
        browser.close()
def info(self, user_id):
    """Scrape the personal-info panel of a weibo.com profile page.

    Fetches ``/p/100306<user_id>/info`` with cookies from redis, extracts the
    ``Pl_Official_PersonalInfo__58`` FM-view fragment, and maps each row's
    title through ``self.info_map``.

    :param user_id: numeric weibo user id (string or int)
    :return: dict of scraped fields, or ``None`` when nothing could be parsed
    """
    base_home_url = 'https://weibo.com/p/100306%s/info?mod=pedit_more' % user_id
    LOGGER.info('info task: %s' % base_home_url)
    cookies_json = RedisCookies.fetch_cookies()
    cookies = cookies_json['cookies']
    headers = self.get_header()
    # fixed: removed an unused second requests.Session()
    session = requests.Session()
    headers['Host'] = 'weibo.com'
    headers['Referer'] = 'https://weibo.com/p/100306%s/home' % user_id
    headers['Upgrade-Insecure-Requests'] = '1'
    headers.pop('Connection')
    headers.pop('Accept')
    headers['Proxy-Connection'] = 'keep-alive'
    try_time = 0
    info_html = ''
    info_html_str = ''
    while try_time < 10:
        # NOTE(review): verify=False disables TLS certificate checking
        resp_text = session.get(url=base_home_url, headers=headers,
                                cookies=cookies, verify=False).text
        # fixed: removed leftover debug code — an unauthenticated duplicate
        # GET of the same URL whose entire body was printed to stdout
        view_json = self.find_fm_view_json(html=resp_text)
        for r_json in view_json:
            if 'Pl_Official_PersonalInfo__58' == r_json['domid']:
                info_html_str = r_json['html']
                break
        if info_html_str != '':
            info_html = BeautifulSoup(info_html_str, 'html.parser')
            # an iframe in the fragment means we got an interstitial page;
            # retry until the real panel arrives
            iframe = info_html.find_all('iframe')
            if not iframe:
                break
        try_time += 1
    if info_html != '':
        lis = info_html.find_all('li', 'clearfix')
        info_dict = {}
        for li in lis:
            try:
                title = li.find('span', 'pt_title').text
                pt_detail = li.find('span', 'pt_detail')
                all_a = pt_detail.find_all('a')
                if all_a:
                    # multi-valued fields (e.g. tags) are joined with commas
                    detail = ','.join([a.text for a in all_a])
                else:
                    detail = pt_detail.text
                detail = detail.replace('\n', '').replace('\t', '').replace('\r', '')
                # title ends with a trailing colon; strip it before mapping
                value = self.info_map.get(title[:-1], None)
                if value:
                    info_dict[value] = detail
            except Exception:
                # fixed: narrowed from a bare `except:`; skip malformed rows
                LOGGER.error('info task error: %s' % traceback.format_exc())
                continue
        if info_dict:
            LOGGER.info('info task result: %s' % info_dict)
            return info_dict
    return None