def run(self):
    """Email-service worker loop: periodically send a status e-mail.

    Sleeps ``smtp_send_interval`` seconds between messages. Any failure
    stops the loop and sets ``thread_status`` to 'error' so the owning
    service can restart this thread.
    """
    if log.isEnabledFor(logging.DEBUG):
        log.debug("邮件服务线程启动")
    try:
        while True:
            time.sleep(self.smtp_send_interval)
            # Build the message from the current spider state.
            msg = MIMEText(self.get_email_content(), 'plain', 'utf-8')
            msg['from'] = self.smtp_from_addr
            msg['to'] = self.smtp_to_addr
            msg['Subject'] = Header(self.get_email_header(), 'utf-8').encode()
            # Send; always close the SMTP connection, even when login or
            # sendmail raises (the original leaked the socket on error).
            smtp_server = smtplib.SMTP(self.smtp_server_host,
                                       self.smtp_server_port)
            try:
                smtp_server.login(self.smtp_from_addr,
                                  self.smtp_server_password)
                smtp_server.sendmail(self.smtp_from_addr,
                                     [self.smtp_to_addr],
                                     msg.as_string())
            finally:
                smtp_server.quit()
            # Remember when the last mail went out.
            self.lastSendTime = datetime.datetime.now()
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error("邮件发送失败")
            log.exception(e)
        self.thread_status = 'error'
def parse_follow_info(self, response_info):
    """Parse a follower/following list response and enqueue new tokens.

    ``response_info`` layout: [kind, json_text, token, listType] where
    listType is 'followingList' or 'followerList'.
    """
    # Raw JSON payload downloaded for this list page.
    payload = response_info[1]
    new_tokens = []
    try:
        parsed = json.loads(payload)
        if 'data' not in parsed:
            return
        # Collect url_tokens the filter has not seen yet.
        for entry in parsed['data']:
            if 'url_token' not in entry:
                continue
            candidate = entry['url_token']
            if self.token_filter.check_token(candidate) is False:
                new_tokens.append(candidate)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('Follower & Following List 数据解析错误')
        if log.isEnabledFor(logging.DEBUG):
            log.exception(e)
        return
    # Queue a user-info URL for every newly discovered token.
    for candidate in new_tokens:
        self.redis_connection.rpush(
            self.user_info_url_queue,
            ['info', self.generate_user_info_url(candidate), candidate])
    # Optionally persist follow relations (only following lists carry a
    # well-defined "from -> to" direction here).
    if self.is_parser_follow_relation is True:
        list_type = response_info[3]
        owner_token = response_info[2]
        if list_type == 'followingList':
            pipe = self.redis_connection.pipeline()
            for followed in new_tokens:
                pipe.rpush(self.follow_relation_persistent_cache,
                           {FOLLOW_FROM: owner_token,
                            FOLLOW_TO: followed})
            pipe.execute()
def check_and_restart(self):
    """Replace every processor thread whose status is 'error'.

    Iterates over a snapshot of ``processor_list``: the original removed
    elements from the list while iterating it, which makes a ``for``
    loop skip the element immediately after each removed one.
    """
    for process_thread in list(self.processor_list):
        if process_thread.thread_status != 'error':
            continue
        thread_id = process_thread.thread_id
        self.processor_list.remove(process_thread)
        del process_thread
        new_thread = ProcessThread(thread_id, self.redis_connection,
                                   self.token_filter, self.response_buffer,
                                   self.is_parser_following_list,
                                   self.is_parser_follower_list,
                                   self.is_parser_follow_relation)
        self.processor_list.append(new_thread)
        new_thread.start()
        if log.isEnabledFor(logging.ERROR):
            # str() guards against a non-string thread id (the original
            # concatenated it directly and would raise TypeError).
            log.error('数据处理器线程[' + str(thread_id) + ']重新启动')
def common_login(self):
    """Log into Zhihu with e-mail + password.

    Returns True on success, False on any failure (including network
    exceptions — the original fell through and returned None there).
    On success the session cookies are stored in ``self.auth_token``.
    """
    session = requests.session()
    # Work on a copy: the original mutated the module-level
    # requestHeader dict, leaking per-login XSRF headers into every
    # later request made with it.
    headers = dict(requestHeader)
    session.headers = headers
    try:
        # Fetch the login page to obtain the hidden _xsrf form token.
        response = session.get(mainPageURL)
        input_tag = BeautifulSoup(response.text, 'html.parser').find(
            'input', attrs={'name': '_xsrf'})
        if input_tag is None:
            return False
        _xsrf = input_tag['value']
        # Submit the login form with the XSRF headers set.
        form_data = {
            '_xsrf': _xsrf,
            'email': self.login_token,
            'password': self.password
        }
        headers.update({
            'X-Requested-With': 'XMLHttpRequest',
            'X-Xsrftoken': _xsrf
        })
        session.headers = headers
        response = session.post(url=loginURL, data=form_data)
        if response.status_code == 200:
            # Verify the login by hitting an authenticated endpoint.
            response = session.get(authTestURL)
            if response.status_code == 200:
                # Keep the auth cookies for the downloader sessions.
                self.auth_token = session.cookies.get_dict()
                if log.isEnabledFor(logging.INFO):
                    log.info('知乎账户登陆成功')
                return True
        # Login failed.
        if log.isEnabledFor(logging.INFO):
            log.info('知乎账户登陆失败')
        return False
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            # log.exception records the traceback, not just the message.
            log.exception(e)
        return False
    finally:
        session.close()
def run(self):
    """Proxy-service main loop: start worker threads and keep them alive.

    Spawns PROXY_VALIDATE_THREAD_NUM validator threads plus one
    pool-scan thread, then polls every 180 s and restarts any thread
    whose status is 'error'.
    """
    # Load configuration.
    self.init()
    # Start the proxy validation threads.
    validate_thread_list = []
    for i in range(PROXY_VALIDATE_THREAD_NUM):
        validate_thread = ProxyValidateThread()
        validate_thread_list.append(validate_thread)
        validate_thread.start()
        if log.isEnabledFor(logging.DEBUG):
            log.debug("代理验证线程启动")
    # Start the proxy-pool scan thread.
    scan_thread = ProxyPoolScanThread()
    scan_thread.start()
    if log.isEnabledFor(logging.DEBUG):
        log.debug("代理池扫描线程启动")
    # Supervise: restart any thread that reported an error.
    while True:
        # Iterate a snapshot — the original removed elements from the
        # list while iterating it, skipping the element after each
        # removal.
        for thread in list(validate_thread_list):
            if thread.status == 'error':
                validate_thread_list.remove(thread)
                thread = ProxyValidateThread()
                validate_thread_list.append(thread)
                thread.start()
                # BUG FIX: was `if log.error(logging.ERROR):`, which
                # logged the constant 40 and never guarded anything.
                if log.isEnabledFor(logging.ERROR):
                    log.error('代理验证线程重新启动')
        # Check the pool-scan thread.
        if scan_thread.status == 'error':
            scan_thread = ProxyPoolScanThread()
            scan_thread.start()
            if log.isEnabledFor(logging.ERROR):
                log.error("代理池扫描线程重新启动")
        time.sleep(180)
def send_message(self, email_content):
    """Send a one-off notification e-mail with a timestamped subject.

    Failures are logged and swallowed: notification mail is best-effort
    and must not take down the caller.
    """
    # Subject: "<header>[M-D H:M:S]" — no zero padding, matching the
    # original hand-built format.
    now = datetime.datetime.now()
    header = (self.smtp_email_header + '[' + str(now.month) + '-'
              + str(now.day) + ' ' + str(now.hour) + ':'
              + str(now.minute) + ':' + str(now.second) + ']')
    msg = MIMEText(email_content, 'plain', 'utf-8')
    msg['from'] = self.smtp_from_addr
    msg['to'] = self.smtp_to_addr
    msg['Subject'] = Header(header, 'utf-8').encode()
    try:
        smtp_server = smtplib.SMTP(self.smtp_server_host,
                                   self.smtp_server_port)
        try:
            smtp_server.login(self.smtp_from_addr,
                              self.smtp_server_password)
            smtp_server.sendmail(self.smtp_from_addr, [self.smtp_to_addr],
                                 msg.as_string())
        finally:
            # Always release the connection (the original leaked it when
            # login/sendmail raised).
            smtp_server.quit()
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error("邮件发送失败")
            log.exception(e)
def parse_user_info(self, response_info):
    """Parse one user profile page, queue its list URLs, persist the info.

    ``response_info`` layout: [kind, html_text, token].
    Skips users already marked in the token filter; otherwise marks the
    token, queues paged following/follower list URLs (when enabled) and
    pushes the extracted entity dict onto the persistent cache.
    """
    data = response_info[1]
    token = response_info[2]
    user_info_entities = None
    try:
        user_info_entities = self._extract_user_entities(data, token)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('User info 数据解析错误')
            log.exception(e)
    if user_info_entities is None:
        return
    # Re-check: another thread may have added this user meanwhile.
    if self.token_filter.check_token(token) is True:
        return
    # Mark the user as processed.
    self.token_filter.mark_token(token)
    # Queue the paged following/follower list URLs.
    if self.is_parser_following_list is True:
        self._enqueue_follow_list_urls(
            token, user_info_entities[USER_FOLLOWING_COUNT],
            self.generate_following_info_url, 'followingList')
    if self.is_parser_follower_list is True:
        self._enqueue_follow_list_urls(
            token, user_info_entities[USER_FOLLOWER_COUNT],
            self.generate_follower_info_url, 'followerList')
    # Persist the extracted profile.
    if log.isEnabledFor(logging.DEBUG):
        # BUG FIX: the guard checked DEBUG but the call logged at INFO.
        log.debug('成功获取一个用户的详细信息')
    self.redis_connection.rpush(self.persistent_cache, user_info_entities)

def _extract_user_entities(self, data, token):
    """Extract the embedded JSON state from a profile page.

    Returns the entity dict for ``token``, or None when any expected
    layer (the 'data' div, entities, users, the token itself) is
    missing. Parsing exceptions propagate to the caller.
    """
    bs_obj = BeautifulSoup(data, 'html.parser')
    data_div = bs_obj.find('div', attrs={'id': 'data'})
    if data_div is None:
        return None
    raw_state = data_div['data-state']
    # The JSON is HTML-escaped and may contain stray tags: unescape it,
    # then strip the markup before parsing.
    raw_state = html.unescape(raw_state)
    raw_state = BeautifulSoup(raw_state, 'html.parser').text
    state = json.loads(raw_state)
    if 'entities' not in state:
        return None
    entities = state['entities']
    if 'users' not in entities:
        return None
    users = entities['users']
    if token not in users:
        return None
    user_info = users[token]
    # Scalar fields are copied verbatim when present (None otherwise).
    entity = {
        USER_AVATAR_URL_TEMPLATE: user_info.get(USER_AVATAR_URL_TEMPLATE),
        USER_URL_TOKEN: token,
        USER_NAME: user_info.get(USER_NAME),
        USER_HEADLINE: user_info.get(USER_HEADLINE),
        USER_DESCRIPTION: user_info.get(USER_DESCRIPTION),
        USER_GENDER: user_info.get(USER_GENDER),
        USER_FOLLOWING_COUNT: user_info.get(USER_FOLLOWING_COUNT),
        USER_FOLLOWER_COUNT: user_info.get(USER_FOLLOWER_COUNT),
        USER_ANSWER_COUNT: user_info.get(USER_ANSWER_COUNT),
        USER_QUESTION_COUNT: user_info.get(USER_QUESTION_COUNT),
        USER_VOTE_UP_COUNT: user_info.get(USER_VOTE_UP_COUNT),
    }
    # Nested fields are flattened to their 'name' values.
    locations = []
    if USER_LOCATIONS in user_info:
        for location in user_info[USER_LOCATIONS]:
            locations.append(location['name'])
    entity[USER_LOCATIONS] = locations
    business = None
    if USER_BUSINESS in user_info:
        business = user_info[USER_BUSINESS]['name']
    entity[USER_BUSINESS] = business
    employments = []
    if USER_EMPLOYMENTS in user_info:
        for employment in user_info[USER_EMPLOYMENTS]:
            elem = {}
            if 'job' in employment:
                elem['job'] = employment['job']['name']
            if 'company' in employment:
                elem['company'] = employment['company']['name']
            employments.append(elem)
    entity[USER_EMPLOYMENTS] = employments
    educations = []
    if USER_EDUCATIONS in user_info:
        for education in user_info[USER_EDUCATIONS]:
            if 'school' in education:
                educations.append(education['school']['name'])
    entity[USER_EDUCATIONS] = educations
    return entity

def _enqueue_follow_list_urls(self, token, total, make_url, list_type):
    """Push paged (offset/limit = 20) list URLs for one user onto the
    follow-info URL queue via a single Redis pipeline.

    ``total`` may be None (count missing from the profile) — no-op then.
    """
    if total is None:
        return
    pipe = self.redis_connection.pipeline()
    offset = 0
    limit = 20
    while offset < total:
        pipe.rpush(self.follow_info_url_queue,
                   ['list', make_url(token, offset, limit), token,
                    list_type])
        offset += limit
    pipe.execute()
def start_spider_core(self):
    """Boot every spider component in dependency order, then supervise.

    Order: Redis → MySQL → response buffer → account login →
    Downloader → Scheduler → DataPersistent → Processor → initial token
    → optional EmailService. Ends in an endless watchdog loop (every
    180 s) that restarts components whose worker threads died.
    """
    if log.isEnabledFor(logging.INFO):
        log.info('Spider 开始启动')
    # --- Redis connection: up to 3 attempts, 5 s apart ---
    # BUG FIX: redis-py ping() raises on failure instead of returning
    # False, so the original retry loop never actually retried — the
    # exception jumped straight to the outer handler.
    redis_ok = False
    for _attempt in range(3):
        try:
            self.redis_connection = redis.StrictRedis(
                host=self.redis_host,
                port=self.redis_port,
                db=self.redis_db,
                password=self.redis_password)
            if self.redis_connection.ping() is True:
                redis_ok = True
                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接成功')
                break
        except Exception as e:
            if log.isEnabledFor(logging.DEBUG):
                log.exception(e)
        if log.isEnabledFor(logging.INFO):
            log.info('Redis 服务器连接失败')
        time.sleep(5)
    if not redis_ok:
        if log.isEnabledFor(logging.ERROR):
            log.error('Redis 启动失败')
        return
    # --- MySQL connection ---
    try:
        self.mysql_connection = pymysql.connect(host=self.mysql_host,
                                                user=self.mysql_username,
                                                passwd=self.mysql_password,
                                                db=self.mysql_database,
                                                charset=self.mysql_charset)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            # BUG FIX: this failure used to be reported as
            # 'Redis 启动失败', hiding the real cause.
            log.error('MySQL 连接失败')
            log.exception(e)
        return
    # --- core components ---
    self.response_buffer = ResponseBuffer()
    # Account manager: log into Zhihu before any download happens.
    self.account_manager = AccountManager(self.login_token, self.password,
                                          self.is_login_by_cookie,
                                          self.z_c0)
    if not self.account_manager.login():
        return
    self.downloader = Downloader(
        self.redis_connection, self.response_buffer, self.account_manager,
        self.is_proxy_service_enable, self.session_pool_size,
        self.download_thread_num, self.network_retry_times,
        self.connect_timeout, self.download_interval)
    self.downloader.start_downloader()
    self.schedule = Scheduler(self.redis_connection, self.url_rate)
    self.schedule.start()
    self.dataPersistent = DataPersistent(
        self.persistent_cache_size,
        self.follow_relation_persistent_cache_size,
        self.mysql_connection, self.redis_connection)
    self.dataPersistent.start_data_persistent()
    self.processor = Processor(self.process_thread_num,
                               self.is_parser_following_list,
                               self.is_parser_follower_list,
                               self.is_parser_follow_relation,
                               self.redis_connection, self.response_buffer)
    self.processor.start_processor()
    # Seed the crawl with the configured initial user token.
    self.processor.load_init_data(self.init_token)
    # --- optional e-mail notifications ---
    if self.is_email_service_enable is True:
        self.email_service = EmailService(
            self.smtp_server_host, self.smtp_server_port,
            self.smtp_server_password, self.smtp_from_addr,
            self.smtp_to_addr, self.smtp_email_header,
            self.smtp_send_interval, self.dataPersistent)
        self.email_service.start_email_service()
        self.email_service.send_message('Spider 启动完毕')
    if log.isEnabledFor(logging.INFO):
        log.info('Spider 启动完毕')
    # --- watchdog loop: restart crashed module threads ---
    while True:
        self.downloader.check_and_restart()
        if self.is_email_service_enable is True:
            self.email_service.check_and_restart()
        self.dataPersistent.check_and_restart()
        # Scheduler currently has no restart hook.
        self.processor.check_and_restart()
        time.sleep(180)
        gc.collect()
def run(self):
    """Persistence worker: flush Redis caches into MySQL every 180 s.

    Each cache (user info, follow relations) is drained into its table
    once it reaches the configured batch size. Any exception ends the
    loop and flags the thread 'error' so the supervisor can restart it.
    """
    debug_info = None  # last raw item read, logged on failure
    try:
        while True:
            # ---- flush user info ----
            cache_size = self.redis_connection.llen(self.persistent_cache)
            if cache_size >= self.persistent_cache_size:
                # `with` releases the lock even when an insert raises;
                # the original acquired/released manually, and a failure
                # inside the loop left the lock held forever.
                with self.lock:
                    cursor = self.db_connection.cursor()
                    try:
                        for _ in range(cache_size):
                            raw = self.redis_connection.lpop(
                                self.persistent_cache)
                            debug_info = raw
                            if raw is None:
                                continue
                            # NOTE(security): eval() of a cached repr —
                            # the data is produced by this spider, but a
                            # JSON round-trip would be safer.
                            user_info = self.convert_user_info(
                                eval(raw.decode('utf-8')))
                            cursor.execute(INSERT_USER_INFO, [
                                user_info[USER_AVATAR_URL_TEMPLATE],
                                user_info[USER_URL_TOKEN],
                                user_info[USER_NAME],
                                user_info[USER_HEADLINE],
                                user_info[USER_LOCATIONS],
                                user_info[USER_BUSINESS],
                                user_info[USER_EMPLOYMENTS],
                                user_info[USER_EDUCATIONS],
                                user_info[USER_DESCRIPTION],
                                user_info[USER_GENDER],
                                user_info[USER_FOLLOWING_COUNT],
                                user_info[USER_FOLLOWER_COUNT],
                                user_info[USER_ANSWER_COUNT],
                                user_info[USER_QUESTION_COUNT],
                                user_info[USER_VOTE_UP_COUNT]
                            ])
                        self.db_connection.commit()
                    finally:
                        # Close the cursor even on failure (the original
                        # leaked it when execute raised).
                        cursor.close()
            # ---- flush follow relations ----
            relation_size = self.redis_connection.llen(
                self.follow_relation_persistent_cache)
            if relation_size >= self.follow_relation_persistent_cache_size:
                with self.lock:
                    cursor = self.db_connection.cursor()
                    try:
                        for _ in range(relation_size):
                            raw = self.redis_connection.lpop(
                                self.follow_relation_persistent_cache)
                            debug_info = raw
                            if raw is None:
                                continue
                            relation = eval(raw.decode('utf-8'))
                            cursor.execute(INSERT_FOLLOW_RELATION, [
                                relation[FOLLOW_FROM],
                                relation[FOLLOW_TO]
                            ])
                        self.db_connection.commit()
                    finally:
                        cursor.close()
            # Wait before the next flush check.
            time.sleep(180)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('用户数据持久化线程异常退出')
            log.exception(e)
            log.debug(debug_info)
        self.thread_status = 'error'
def run(self):
    """Download worker: pull URL jobs, fetch them, push responses.

    Retries up to NETWORK_RETRY_TIMES per URL. Status handling:
    200 pushes the response, 429 re-queues the same URL for the next
    pass, 404/410 drop the job, 403 means account auth failed. After
    repeated failures the session's proxy is swapped on return.
    """
    if log.isEnabledFor(logging.INFO):
        # str() for consistency with the other thread_id log lines
        # below (the original concatenated it directly).
        log.info('数据下载线程' + str(self.thread_id) + '启动')
    # First start: block until enough proxies are available.
    self.session_manager.init_get()
    # URL that was throttled/unfinished last round and must be retried.
    previous_url_info = None
    while True:
        session = self.session_manager.get_session_connection()
        network_retry_times = 0
        while network_retry_times < self.NETWORK_RETRY_TIMES:
            try:
                # Pick up the pending URL, or fetch a new job.
                if previous_url_info is None:
                    url_info = self.get_url_info_from_queue()
                    previous_url_info = url_info
                else:
                    url_info = previous_url_info
                url = url_info[1]
                response = session.get(url, timeout=self.CONNECT_TIMEOUT)
                if log.isEnabledFor(logging.DEBUG):
                    log.debug(response.status_code)
                if response.status_code == 200:
                    # Reuse the job list as the response record,
                    # replacing the URL slot with the downloaded body.
                    response_info = url_info
                    response_info[1] = response.text
                    self.put_response_info_to_queue(response_info)
                    previous_url_info = None
                    if log.isEnabledFor(logging.DEBUG):
                        log.debug('下载成功')
                    break
                elif response.status_code == 403:
                    if log.isEnabledFor(logging.ERROR):
                        log.error('账号认证失败')
                    break
                elif response.status_code == 429:
                    # Throttled: keep the URL and retry it later.
                    if log.isEnabledFor(logging.DEBUG):
                        log.debug('[' + str(self.thread_id) + ']'
                                  + '访问太频繁,稍候重新访问,响应码为:'
                                  + str(response.status_code))
                    previous_url_info = url_info
                    break
                elif response.status_code == 404 or response.status_code == 410:
                    # Gone: drop the job for good.
                    previous_url_info = None
                    break
                else:
                    if log.isEnabledFor(logging.ERROR):
                        log.error(response.status_code)
                    network_retry_times += 1
            except Exception as e:
                network_retry_times += 1
                time.sleep(self.DOWNLOAD_INTERVAL)
                if log.isEnabledFor(logging.DEBUG):
                    log.debug('[' + str(self.thread_id) + ']'
                              + '下载异常,正在重新连接...(第'
                              + str(network_retry_times) + '次重试)')
                    # BUG FIX: was log.error(e) under a DEBUG guard;
                    # log.exception also records the traceback.
                    log.exception(e)
            # Pace requests between attempts.
            time.sleep(self.DOWNLOAD_INTERVAL)
        # Return the session; after too many failures swap its proxy.
        if network_retry_times < self.NETWORK_RETRY_TIMES:
            self.session_manager.return_session_connection(session)
        else:
            self.session_manager.return_and_switch_proxy(session)