async def search_super_fan(self, super_fan_job_info):
    LOGGER.info('try to grab html: ' + super_fan_job_info['url'])
    html_content = await self.grab_html(super_fan_job_info['url'])
    # json.loads() takes no encoding argument in Python 3; grab_html already
    # returns decoded text, so parse it directly
    userjson = json.loads(html_content)['data']
    LOGGER.info('succeed to grab html: ' + super_fan_job_info['url'])
    for group in userjson['cards']:
        for card in group['card_group']:
            user_id = card['user']['id']
            await self.redis_job.push_job(JobType.user.value,
                                          {'user_id': user_id, 'source': 'super'})
async def fetch_job(self, job_type):
    if not self._pool:
        await self.init_pool()
    with await self._pool as conn:  # borrow a connection from the aioredis pool
        job_info = await conn.execute('rpop', job_type)
        if job_info:
            LOGGER.info('fetched job: %s' % job_info)
            return json.loads(job_info)
        return None
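# A minimal producer/consumer round trip for the queue methods above; a sketch
# assuming the owning class is exported as RedisJob (callers elsewhere use
# self.redis_job, so the class name here is an assumption) and JobType is the
# job-type enum seen throughout. lpush + rpop make each job type a FIFO queue.
async def _queue_round_trip_example():
    redis_job = RedisJob()  # assumed class name
    await redis_job.push_job(JobType.user.value,
                             {'user_id': '123456', 'source': 'manual'})
    job = await redis_job.fetch_job(JobType.user.value)
    if job:
        print(job['user_id'], job['source'])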
@classmethod
def save_cookies(cls, user_name, cookies):
    pickled_cookies = json.dumps({
        'user_name': user_name,
        'cookies': cookies,
        'login_time': datetime.datetime.now().timestamp()
    })
    LOGGER.info('save cookie in redis: %s' % str(pickled_cookies))
    r = redis.Redis(connection_pool=cls.redis_pool)
    r.hset('account', user_name, pickled_cookies)
    cls.user_in_queue(user_name)
async def search_topic_user(self, topic_job_info):
    LOGGER.info('try to grab html: ' + topic_job_info['url'])
    html_content = await self.grab_html(topic_job_info['url'])
    userjson = json.loads(html_content)['data']
    LOGGER.info('succeed to grab html: ' + topic_job_info['url'])
    for group in userjson['cards']:
        if 'show_type' in group:
            for card in group['card_group']:
                user_id = card['mblog']['user']['id']
                await self.redis_job.push_job(JobType.user.value,
                                              {'user_id': user_id, 'source': 'comment'})
def __init__(self, username, password):
    self.YDMApi = windll.LoadLibrary(ROOT_PATH + '\\dll\\yundamaAPI-x64.dll')
    self.appId = 4296                                   # software ID
    self.appKey = b'fdacec8d9f1c2deb86346bfcf64e95f2'   # software key
    LOGGER.info('app id: %d\r\napp key: %s' % (self.appId, self.appKey))
    self.username = username.encode()
    self.password = password.encode()
    self.code_type = 1005
    self.timeout = 60
    self.YDMApi.YDM_SetAppInfo(self.appId, self.appKey)
async def search(self):
    while True:
        search_job_info = await self.redis_job.fetch_job(JobType.search.value)
        if search_job_info:
            try:
                await self.search_tweet(search_job_info)
            except TimeoutError:
                pass
            except Exception:
                LOGGER.error(traceback.format_exc())
        else:
            # queue empty: back off before polling again; asyncio.sleep keeps
            # the event loop free for the other crawl coroutines
            await asyncio.sleep(5 * 60)
def search_topic_user_now(self, topic_job_info):
    LOGGER.info('try to grab html: ' + topic_job_info['url'])
    userjson = self.grab_html_now(topic_job_info['url'])
    LOGGER.info('succeed to grab html: ' + topic_job_info['url'])
    for group in userjson['cards']:
        if 'show_type' in group:
            for card in group['card_group']:
                user_id = card['mblog']['user']['id']
                self.redis_job_now.push_job(JobType.user.value, {
                    'user_id': user_id,
                    'source': 'comment'
                })
async def crawl_repost(self):
    while True:
        repost_job_info = await self.redis_job.fetch_job(JobType.repost.value)
        if repost_job_info:
            try:
                await self.grab_tweet_repost(repost_job_info)
            except TimeoutError:
                pass
            except Exception:
                LOGGER.error('error while grabbing reposts')
                LOGGER.error(traceback.format_exc())
        else:
            await asyncio.sleep(5 * 60)
def del_proc():
    status_file = os.path.join(BASE_DIR, 'proc/status')
    if os.path.exists(status_file):
        os.unlink(status_file)
    if not os.path.exists(status_file):
        LOGGER.info('removed status file successfully')
    else:
        LOGGER.error('failed to remove status file')
    # clear prometheus multiprocess leftovers as well
    prometheus_dir = os.path.join(BASE_DIR, '.prometheus_multiproc_dir')
    for file_obj in os.listdir(prometheus_dir):
        os.remove(os.path.join(prometheus_dir, file_obj))
async def crawl_follow(self):
    while True:
        follow_dict = self.redis_job_now.fetch_job(JobType.follower.value)
        if follow_dict:
            try:
                await self.grab_follow(follow_dict)
                # uid may be a string, so format with %s rather than %d
                LOGGER.info('finished follow crawl for %s' % follow_dict['uid'])
            except TimeoutError as e:
                LOGGER.warning(e)
            except Exception:
                LOGGER.error(traceback.format_exc())
        else:
            await asyncio.sleep(5 * 60)
async def push_job(self, job_type, job_info):
    if not self._pool:
        await self.init_pool()
    url = job_info.get('url', '')
    if url:
        if url in self.url_filter:
            LOGGER.warning('%s job filtered: %s' % (job_type, str(job_info)))
            return
        self.url_filter.add(url)  # only dedup on non-empty urls
    with await self._pool as conn:
        await conn.execute('lpush', str(job_type), json.dumps(job_info))
    LOGGER.info('push %s job into redis: %s' % (job_type, str(job_info)))
def recognize(self, filename):
    if not isinstance(filename, bytes):
        filename = filename.encode()
    # use a writable buffer for the decoded text; a 1-byte c_char_p would let
    # the DLL write past the end of the string
    result = create_string_buffer(1024)
    LOGGER.info('>>> logging in ...')
    captcha_id = self.YDMApi.YDM_EasyDecodeByPath(
        self.username, self.password, self.appId, self.appKey,
        filename, self.code_type, self.timeout, result)
    return captcha_id, result.value
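# Hedged usage sketch for the recognizer: the credentials and image path are
# placeholders, and YunDaMa is an assumed name for the class that owns
# __init__/recognize above. By yundama convention a positive captcha_id means
# the image was decoded; the text comes back as bytes.
def _recognize_example():
    ydm = YunDaMa('your_user', 'your_password')  # hypothetical account
    captcha_id, text = ydm.recognize('img\\example-verify_code.png')
    if captcha_id > 0:
        print('captcha solved:', text.decode())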
@classmethod
def fetch_cookies(cls):
    r = redis.Redis(connection_pool=cls.redis_pool)
    while True:
        user = r.spop('users')
        r.sadd('users', user)  # rotate: put the account straight back
        c = r.hget('account', user)
        if c:
            user_cookies = c.decode('utf-8')
            return json.loads(user_cookies)
        LOGGER.warning('no cookies stored for %s' % user)
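# The spop/sadd pair above is a round-robin rotation trick: spop pulls a
# random member of 'users' and sadd puts it straight back, so repeated calls
# spread requests across accounts without ever exhausting the set. A minimal
# standalone sketch, assuming a local redis on the default port:
def _rotation_sketch():
    r = redis.Redis()
    r.sadd('users', 'alice', 'bob')
    for _ in range(4):
        user = r.spop('users')
        r.sadd('users', user)
        print(user)  # alice/bob in arbitrary order; the set never empties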
async def crawl_comment(self):
    while True:
        comment_job_info = await self.redis_job.fetch_job(JobType.comment.value)
        if comment_job_info:
            try:
                # asyncio.run_coroutine_threadsafe(self.grab_tweet_comments(comment_job_info), self.loop)
                await self.grab_tweet_comments(comment_job_info)
            except TimeoutError:
                pass
            except Exception:
                LOGGER.error('error while grabbing comments')
                LOGGER.error(traceback.format_exc())
        else:
            await asyncio.sleep(5 * 60)
def getStockList(force_update=False):
    if force_update:
        twstock.__update_codes()
    stocks = []
    for code in twstock.codes:
        stock = twstock.codes[code]
        if stock.type == '股票':  # keep listed equities only
            stocks.append(stock)
    LOGGER.info(f'Got {len(stocks)} stocks in list')
    return stocks
async def crawl_follow(self):
    while True:
        follow_dict = await self.redis_job.fetch_job(JobType.follower.value)
        if follow_dict:
            try:
                await self.grab_follow(follow_dict)
            except TimeoutError:
                pass
            except Exception:
                LOGGER.error(traceback.format_exc())
        else:
            await asyncio.sleep(5 * 60)
def grab_html_now(self, url):
    cookies = self.redis_cookie_now.fetch_cookies()
    headers = self.get_header()
    headers['Upgrade-Insecure-Requests'] = '1'
    headers['Proxy-Connection'] = 'keep-alive'
    LOGGER.info('using cookies: ' + str(cookies))
    while True:
        # retry until weibo answers with ok == 1; pass the headers built above
        resp_text = requests.get(url=url, headers=headers,
                                 cookies=cookies['cookies'], verify=False).text
        userjson = json.loads(resp_text)
        if userjson['ok'] == 1:
            return userjson['data']
def genStockList(output_file=None, force_update=False):
    if force_update:
        twstock.__update_codes()
    stocks = []
    for code in twstock.codes:
        stock = twstock.codes[code]
        if stock.type == '股票':
            stocks.append(code)
    LOGGER.info(f'Got {len(stocks)} stocks in list')
    return sorted(stocks)
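# Quick usage sketch for the two list helpers above. twstock.codes maps each
# code to a metadata entry with .code/.name/.type fields; filtering on
# type == '股票' ("stock") keeps listed equities and drops ETFs, warrants, etc.
def _stock_list_example():
    codes = genStockList()        # sorted list of code strings
    print(codes[:10])
    stocks = getStockList()       # full metadata entries for the same filter
    print(stocks[0].code, stocks[0].name)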
def get_cookie_from_login_sina_com_cn(account, password):
    """Fetch login cookies for a single account."""
    login_url = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)"
    username = base64.b64encode(account.encode("utf-8")).decode("utf-8")
    headers = {
        'Referer': 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)',
        'Upgrade-Insecure-Requests': '1',
        'Host': 'login.sina.com.cn',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8'
    }
    post_data = {
        "entry": "sso",
        "gateway": "1",
        "from": "null",
        "savestate": "30",
        "useticket": "0",
        "pagerefer": "",
        "vsnf": "1",
        "su": username,  # base64-encoded account name
        "service": "sso",
        "sp": password,
        "sr": "1440*900",
        "encoding": "UTF-8",
        "cdult": "3",
        "domain": "sina.com.cn",
        "prelt": "0",
        "returntype": "TEXT",
    }
    session = requests.Session()
    r = session.post(login_url, headers=headers, data=post_data, verify=False)
    info = json.loads(r.content.decode("gbk"))  # endpoint responds in GBK
    LOGGER.info('get cookies for %s' % account)
    if info["retcode"] == "0":
        LOGGER.info("Get cookie success! (account: %s)" % account)
        return session.cookies.get_dict()
    LOGGER.warning("Get cookie failed! (account: %s)" % account)
    LOGGER.warning(info)
    return None
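# End-to-end sketch: log one account in through the SSO endpoint above and
# persist the cookie dict with save_cookies defined earlier in this section.
# The credentials are placeholders, and RedisCookies is an assumed name for
# the class holding the cookie classmethods.
def _login_and_store_example():
    cookies = get_cookie_from_login_sina_com_cn('someone@example.com', 'password')
    if cookies:
        RedisCookies.save_cookies('someone@example.com', cookies)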
async def grab_tweet_comments(self, comment_job):
    LOGGER.info('start grab comment: %s' % str(comment_job))
    html_content = await self.grab_html(comment_job['url'])
    comment_html = BeautifulSoup(html_content, "lxml")
    comment_divs = comment_html.find_all(id=re.compile(r'C_\d'), class_='c')
    for comment_div in comment_divs:
        comment_info = {}
        comment_id = comment_div.get('id')
        user_a = comment_div.find('a')
        if user_a:
            user_href = user_a.get('href')
            if user_href.startswith('/u/'):
                user_id = user_href[3:]
            else:
                user_id = await self.get_user_id_from_homepage(
                    self.weibo_host + user_href)
            await self.user_id_in_queue(user_id)
            comment_info['userId'] = user_id
        comment_info['content'] = comment_div.find(class_='ctt').get_text()
        others = comment_div.find(class_='ct').get_text()
        if others:
            others = others.split('\u6765\u81ea')  # split on '来自' ("from")
            comment_info['pubTime'] = self.get_time(others[0])
            if len(others) == 2:
                comment_info['source'] = others[1]
        comment_info['id'] = comment_id
        comment_info['tweetId'] = comment_job['tweetId']
        comment_info['type'] = 'comment_info'
        await self.weibo_producer.send(comment_info, comment_job['url'])
    if 'page=' not in comment_job['url']:
        # first page only: parse the tweet itself and queue the remaining pages
        await self.parse_tweet_content(comment_html, comment_job)
        page_div = comment_html.find(id='pagelist')
        if page_div:
            max_page = int(page_div.input.get('value'))
            for page in range(2, max_page + 1):
                await self.redis_job.push_job(
                    JobType.comment.value,
                    {'url': self.tweet_comment_url2 % (comment_job['tweetId'], page),
                     'tweetId': comment_job['tweetId']})
def __init__(self, username, password):
    self.YDMApi = windll.LoadLibrary(ROOT_PATH + '\\dll\\yundamaAPI-x64.dll')
    self.appId = 5064                                   # software ID
    self.appKey = b'57f477ba2a00eeb3e2fcb392474305d4'   # software key
    LOGGER.info('app id: %d\r\napp key: %s' % (self.appId, self.appKey))
    self.username = username.encode()
    self.password = password.encode()
    self.code_type = 1005
    self.timeout = 60
    self.YDMApi.YDM_SetAppInfo(self.appId, self.appKey)
    self.uid = self.YDMApi.YDM_Login(self.username, self.password)
    balance = self.YDMApi.YDM_GetBalance(self.username, self.password)
    LOGGER.info('succeed to log in to YunDaMa, balance: %d', balance)
def shutdown():
    LOGGER.info('Stopping http server')
    server.stop()
    LOGGER.info('Will shut down in %s seconds ...', 60)
    io_loop = tornado.ioloop.IOLoop.instance()
    deadline = time.time() + 60

    del_proc()  # remove the status file and prometheus multiproc leftovers

    def stop_loop():
        now = time.time()
        # keep polling until pending callbacks/timeouts drain or the deadline hits
        if now < deadline and (io_loop._callbacks or io_loop._timeouts):
            io_loop.add_timeout(now + 1, stop_loop)
        else:
            io_loop.stop()
            LOGGER.info('Shutdown')

    stop_loop()
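# Typical wiring for shutdown() above: schedule it from signal handlers via
# add_callback_from_signal, the signal-safe entry point into the IOLoop. A
# sketch, assuming this module's `server` and `shutdown` are in scope.
import signal

def install_shutdown_handlers():
    def handler(sig, frame):
        tornado.ioloop.IOLoop.instance().add_callback_from_signal(shutdown)
    signal.signal(signal.SIGTERM, handler)
    signal.signal(signal.SIGINT, handler)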
async def grab_tweet_repost(self, repost_job_info):
    LOGGER.info('start grab tweet repost: %s' % str(repost_job_info))
    html_content = await self.grab_html(repost_job_info['url'])
    tweet_repost_html = BeautifulSoup(html_content, "lxml")
    repost_divs = tweet_repost_html.find_all(class_='c')
    for div in repost_divs:
        span_cc = div.find('span', class_='cc')
        if span_cc:
            attitude_a = span_cc.find('a')
            if attitude_a:
                href = attitude_a.get('href')
                if len(href.split('/')) > 2:
                    tweet_id = href.split('/')[2]
                    await self.redis_job.push_job(
                        JobType.comment.value,
                        {'url': self.tweet_comment_url % tweet_id,
                         'tweetId': tweet_id,
                         'parentTid': repost_job_info['tweetId']})
                    await self.redis_job.push_job(
                        JobType.repost.value,
                        {'url': self.user_repost_url % tweet_id,
                         'tweetId': tweet_id,
                         'parentTid': repost_job_info['tweetId']})
    if 'page=' not in repost_job_info['url']:
        await self.parse_tweet_content(tweet_repost_html, repost_job_info)
        page_div = tweet_repost_html.find(id='pagelist')
        if page_div:
            max_page = int(page_div.input.get('value'))
            for page in range(2, max_page + 1):
                await self.redis_job.push_job(
                    JobType.repost.value,
                    {'url': self.user_repost_url2 % (repost_job_info['tweetId'], page),
                     'tweetId': repost_job_info['tweetId']})
async def search_topic_user(self, topic_job_info):
    LOGGER.info('try to grab html: ' + topic_job_info['url'])
    html_content = await self.grab_html(topic_job_info['url'])
    userjson = json.loads(html_content)['data']
    LOGGER.info('succeed to grab html: ' + topic_job_info['url'])
    for group in userjson['cards']:
        if 'show_type' in group:
            for card in group['card_group']:
                # build a fresh record per card so sent messages don't share state
                topic_tweet = {
                    'type': 'topic_tweet',
                    'tweet_time': card['mblog']['created_at'],
                    'latest_update': card['mblog']['latest_update'],
                    'user_id': card['mblog']['user']['id'],
                    'user_gender': card['mblog']['user']['gender'],
                    'reposts': card['mblog']['reposts_count'],
                    'comments': card['mblog']['comments_count'],
                }
                await self.weibo_producer.send(topic_tweet, topic_job_info['url'])
def save_verify_code_img(browser, weibo_user):
    screen_shot_path = '.\\img\\%s-screenshot.png' % weibo_user
    code_img_path = '.\\img\\%s-verify_code.png' % weibo_user
    LOGGER.info('get verify code img for %s' % weibo_user)
    browser.save_screenshot(screen_shot_path)
    code_img = browser.find_element_by_xpath('//img[@node-type="verifycode_image"]')
    # crop the captcha element out of the full-page screenshot
    left = code_img.location['x']
    top = code_img.location['y']
    right = left + code_img.size['width']
    bottom = top + code_img.size['height']
    picture = Image.open(screen_shot_path)
    picture = picture.crop((left, top, right, bottom))
    picture.save(code_img_path)
    os.remove(screen_shot_path)
    LOGGER.info('code img saved (%s)' % code_img_path)
    return code_img_path
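# Hedged sketch of the whole captcha flow: crop the code image with
# save_verify_code_img, decode it with the YunDaMa recognizer above, then type
# the answer into the login form. The input xpath and the YunDaMa class name
# are assumptions, not confirmed by this excerpt; `browser` is a selenium
# webdriver.
def solve_verify_code(browser, weibo_user, ydm):
    code_img_path = save_verify_code_img(browser, weibo_user)
    captcha_id, text = ydm.recognize(code_img_path)
    if captcha_id > 0:
        code_input = browser.find_element_by_xpath(
            '//input[@node-type="verifycode"]')  # assumed xpath
        code_input.send_keys(text.decode())
    return captcha_id > 0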
@classmethod
def user_in_queue(cls, user_name):
    r = redis.Redis(connection_pool=cls.redis_pool)
    if not r.sismember('users', user_name):
        LOGGER.info('user in queue: %s' % user_name)
        r.sadd('users', user_name)
    else:
        # already queued: remove and re-add to refresh the entry
        LOGGER.info('user already in queue, refreshing: %s' % user_name)
        r.srem('users', user_name)
        r.sadd('users', user_name)
async def crawl_user(self):
    while True:
        user_job_info = await self.redis_job.fetch_job(JobType.user.value)
        if user_job_info:
            try:
                await self.grab_user_info(user_job_info['user_id'])
                # optionally fan out to tweet/follower jobs:
                # await self.redis_job.push_job(JobType.tweet.value,
                #     {'url': 'https://weibo.cn/' + user_job_info['user_id'],
                #      'uid': user_job_info['user_id']})
                # await self.redis_job.push_job(JobType.follower.value,
                #     {'url': self.follow_url % user_job_info['user_id'],
                #      'uid': user_job_info['user_id']})
            except TimeoutError:
                pass
            except Exception:
                LOGGER.error(traceback.format_exc())
        else:
            await asyncio.sleep(5 * 60)
async def grab_view(self, user_id):
    """Fetch a user's tweet, fan, and follow counts from their homepage.

    :param user_id: user id
    :return: dict with tweetNum / fansNum / followNum (-1 when missing)
    """
    LOGGER.info('grab user view: %s' % str(user_id))
    html_content = await self.grab_html(self.weibo_host + '/' + str(user_id))
    home_page_html = BeautifulSoup(html_content, "lxml")
    v = home_page_html.find('div', class_='tip2')
    content = v.get_text(';') if v else ''
    result = {}
    tweet_r = re.findall(r'微博\[(\d+)\];', content)
    result['tweetNum'] = int(tweet_r[0]) if tweet_r else -1
    fans_r = re.findall(r'粉丝\[(\d+)\];', content)
    result['fansNum'] = int(fans_r[0]) if fans_r else -1
    follow_r = re.findall(r'关注\[(\d+)\];', content)
    result['followNum'] = int(follow_r[0]) if follow_r else -1
    return result
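# The tip2 div on a weibo.cn homepage renders roughly as
# '微博[1234] 关注[56] 粉丝[789] ...'; get_text(';') joins the pieces with
# semicolons, which the three regexes above then match. A minimal check of
# that parsing, with a made-up content string:
def _parse_tip2_example():
    content = '微博[1234];关注[56];粉丝[789];分组[1]'
    assert re.findall(r'微博\[(\d+)\];', content) == ['1234']
    assert re.findall(r'关注\[(\d+)\];', content) == ['56']
    assert re.findall(r'粉丝\[(\d+)\];', content) == ['789']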
async def crawl_weibo(self):
    r = re.compile(r'https://weibo.cn/(\d*)\?page=(\d*)')
    while True:
        tweet_job_info = await self.redis_job.fetch_job(JobType.tweet.value)
        if tweet_job_info:
            m = r.findall(tweet_job_info['url'])
            if m:
                page_no = int(m[0][1])
                if page_no > 200:
                    # cap crawling at 200 pages per user
                    LOGGER.info('job passed %s' % str(tweet_job_info))
                    continue
            try:
                await self.grab_user_tweet(tweet_job_info)
            except TimeoutError:
                pass
            except Exception:
                LOGGER.error(traceback.format_exc())
        else:
            await asyncio.sleep(5 * 60)
async def grab_follow(self, follow_dict):
    LOGGER.info('start grab user follow: %s' % str(follow_dict))
    html_content = await self.grab_html(follow_dict['url'])
    follow_html = BeautifulSoup(html_content, "lxml")
    all_td = follow_html.find_all('td', style=True)
    follow_id = []
    for td in all_td:
        a = td.find('a').get('href')
        usr_id_result = self.user_id_pattern.findall(a)
        if usr_id_result:
            usr_id = usr_id_result[0]
        else:
            usr_id = await self.get_user_id_from_homepage(a)
        usr_id = int(usr_id)  # compare as int so the dedup check below works
        if usr_id not in follow_id:
            follow_id.append(usr_id)
    # index the ids so the consumer receives {0: id0, 1: id1, ...}
    follow_id = dict(zip(range(len(follow_id)), follow_id))
    user_follow_dict = {
        'type': 'follow',
        'uid': follow_dict['uid'],
        'fans_id': follow_id,
    }
    await self.weibo_producer.send(user_follow_dict,
                                   self.follow_url % follow_dict['uid'])
    if 'page=' not in follow_dict['url']:
        page_div = follow_html.find(id='pagelist')
        if page_div:
            max_page = min(int(page_div.input.get('value')), 20)  # cap at 20 pages
            for page in range(2, max_page + 1):
                await self.redis_job.push_job(
                    JobType.follower.value,
                    {'url': (self.follow_url % follow_dict['uid']) + '?page=' + str(page),
                     'uid': follow_dict['uid']})