def run(self):
    """Send a status e-mail every ``self.smtp_send_interval`` seconds.

    The loop runs until any step raises. The failure is then logged and
    ``self.thread_status`` is set to ``'error'``.
    """
    if log.isEnabledFor(logging.DEBUG):
        log.debug("邮件服务线程启动")
    try:
        while True:
            time.sleep(self.smtp_send_interval)

            # Build the message to send.
            msg = MIMEText(self.get_email_content(), 'plain', 'utf-8')
            msg['from'] = self.smtp_from_addr
            msg['to'] = self.smtp_to_addr
            msg['Subject'] = Header(self.get_email_header(), 'utf-8').encode()

            # The context manager issues QUIT and closes the socket even when
            # login() or sendmail() raises, so a failed send no longer leaks
            # the connection.
            with smtplib.SMTP(self.smtp_server_host,
                              self.smtp_server_port) as smtp_server:
                smtp_server.login(self.smtp_from_addr,
                                  self.smtp_server_password)
                smtp_server.sendmail(self.smtp_from_addr,
                                     [self.smtp_to_addr], msg.as_string())

            # Remember when the last message went out.
            self.lastSendTime = datetime.datetime.now()
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error("邮件发送失败")
            log.exception(e)
        self.thread_status = 'error'
def run(self):
    """Dispatch buffered responses to the matching parser until an error occurs.

    Responses are read from ``self.response_buffer``. Item 0 of a response
    selects the parser: ``'info'`` goes to ``parse_user_info`` and
    ``'list'`` goes to ``parse_follow_info``; other types are ignored.
    Any exception ends the loop, sets ``self.thread_status`` to ``'error'``
    and is logged.
    """
    if log.isEnabledFor(logging.INFO):
        # str() keeps the message identical for a str id and avoids a
        # TypeError if the id is not a string.
        log.info('数据处理线程' + str(self.thread_id) + '启动')
    try:
        while True:
            # Fetch the next response.
            response = self.response_buffer.get_response_from_buffer()
            if response is None or len(response) < 2:
                # Back off before retrying. The original `continue` skipped
                # the sleep below and spun without pause whenever nothing
                # usable was returned.
                time.sleep(0.1)
                continue

            # Dispatch on the response type.
            response_type = response[0]
            if response_type == 'info':
                self.parse_user_info(response)
            elif response_type == 'list':
                self.parse_follow_info(response)

            time.sleep(0.1)
    except Exception as e:
        self.thread_status = 'error'
        if log.isEnabledFor(logging.ERROR):
            log.exception(e)
def parse_follow_info(self, response_info):
    """Parse a follower/following list response and queue the unseen users.

    Args:
        response_info: Indexable response record. Item 1 is the JSON text.
            When follow relations are recorded, item 2 is the token of the
            user the list belongs to and item 3 is the list type
            (``'followingList'`` or ``'followerList'``).

    Every ``url_token`` that ``self.token_filter.check_token`` reports as
    ``False`` gets a user-info URL record pushed onto
    ``self.user_info_url_queue``. For a ``'followingList'`` response, and
    only when ``self.is_parser_follow_relation`` is ``True``, a
    ``{FOLLOW_FROM, FOLLOW_TO}`` record per such token is pushed onto
    ``self.follow_relation_persistent_cache``. Parse errors are logged and
    nothing is queued.
    """
    raw_body = response_info[1]

    # Collect the tokens that have not been seen yet.
    new_tokens = []
    try:
        payload = json.loads(raw_body)
        if 'data' not in payload:
            return
        for follow_info in payload['data']:
            if 'url_token' not in follow_info:
                continue
            token = follow_info['url_token']
            if self.token_filter.check_token(token) is False:
                new_tokens.append(token)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('Follower & Following List 数据解析错误')
        if log.isEnabledFor(logging.DEBUG):
            log.exception(e)
        return

    if not new_tokens:
        return

    # Queue the user-info URLs in one pipelined batch instead of one round
    # trip per token.
    # NOTE(review): the pushed value is a Python list, which relies on the
    # Redis client stringifying it -- confirm for the redis-py version in use.
    pipe = self.redis_connection.pipeline()
    for token in new_tokens:
        url_info = ['info', self.generate_user_info_url(token), token]
        pipe.rpush(self.user_info_url_queue, url_info)
    pipe.execute()

    # Record the follow relations.
    if self.is_parser_follow_relation is not True:
        return
    if len(response_info) < 4:
        # Without the owner token and list type the relation cannot be
        # built. Bail out instead of raising IndexError into the worker loop.
        if log.isEnabledFor(logging.ERROR):
            log.error('Follower & Following List 响应缺少 token 或列表类型')
        return

    owner_token = response_info[2]
    follow_list_type = response_info[3]
    if follow_list_type != 'followingList':
        return

    # NOTE(review): only tokens that passed the duplicate filter are recorded,
    # so relations to already-known users are skipped -- confirm intended.
    pipe = self.redis_connection.pipeline()
    for following_token in new_tokens:
        follow_relation = {
            FOLLOW_FROM: owner_token,
            FOLLOW_TO: following_token
        }
        pipe.rpush(self.follow_relation_persistent_cache, follow_relation)
    pipe.execute()
def run(self):
    """Keep the proxy pool stocked and periodically rescan it.

    Each pass takes exactly one of three actions:

    * ``proxy_pool`` and ``unchecked_proxy_list`` both hold fewer than
      ``PROXY_POOL_SIZE`` entries: call ``fetch_and_parse_proxy``.
    * ``proxy_pool`` holds exactly ``PROXY_POOL_SIZE`` entries: call
      ``scan_proxy_pool``, then sleep ``PROXY_POOL_SCAN_INTERVAL`` seconds.
    * anything else: sleep 60 seconds.

    Any exception ends the loop, is logged, and sets ``self.status`` to
    ``'error'``.
    """
    try:
        while True:
            needs_refill = (proxy_pool.qsize() < PROXY_POOL_SIZE
                            and unchecked_proxy_list.qsize() < PROXY_POOL_SIZE)
            if needs_refill:
                self.fetch_and_parse_proxy()
                continue

            if proxy_pool.qsize() != PROXY_POOL_SIZE:
                # Neither refill nor rescan applies; wait and look again.
                time.sleep(60)
                continue

            # The pool is full: re-validate its entries.
            if log.isEnabledFor(logging.DEBUG):
                log.debug('代理池更新')
            self.scan_proxy_pool()
            time.sleep(PROXY_POOL_SCAN_INTERVAL)
    except Exception as err:
        if log.isEnabledFor(logging.ERROR):
            log.exception(err)
        self.status = 'error'
def run(self):
    """Validate unchecked proxies and move the usable ones into the pool.

    While the module-level ``is_scanning`` flag is truthy the loop idles in
    3-second naps. Otherwise, if ``proxy_pool`` has fewer than
    ``PROXY_POOL_SIZE`` entries and ``unchecked_proxy_list`` is not empty,
    one candidate is taken and checked with
    ``self.dataValidateModule.validate_proxy_ip``. It is added to
    ``proxy_pool`` when the check returns ``True``, and the loop then sleeps
    1 second. When there is nothing to do the loop sleeps 5 seconds.

    Any exception ends the loop, is logged, and sets ``self.status`` to
    ``'error'``.
    """
    try:
        while True:
            # Pause while the proxy pool is being scanned.
            while is_scanning:
                time.sleep(3)

            has_work = (proxy_pool.qsize() < PROXY_POOL_SIZE
                        and unchecked_proxy_list.qsize() > 0)
            if not has_work:
                time.sleep(5)
                continue

            candidate = unchecked_proxy_list.get()
            verdict = self.dataValidateModule.validate_proxy_ip(candidate)
            if verdict is True:
                proxy_pool.put(candidate)
            time.sleep(1)
    except Exception as err:
        if log.isEnabledFor(logging.ERROR):
            log.exception(err)
        self.status = 'error'
def send_message(self, email_content):
    """Send a one-off plain-text e-mail containing ``email_content``.

    The subject is ``self.smtp_email_header`` followed by the current local
    time formatted as ``[month-day hour:minute:second]`` (not zero-padded).

    Args:
        email_content: Text used as the message body.

    Failures while connecting, logging in or sending are logged and
    swallowed; they are not propagated to the caller.
    """
    # Build the message to send.
    now = datetime.datetime.now()
    header = self.smtp_email_header + '[{}-{} {}:{}:{}]'.format(
        now.month, now.day, now.hour, now.minute, now.second)
    msg = MIMEText(email_content, 'plain', 'utf-8')
    msg['from'] = self.smtp_from_addr
    msg['to'] = self.smtp_to_addr
    msg['Subject'] = Header(header, 'utf-8').encode()

    # Send it.
    try:
        # The context manager issues QUIT and closes the socket even when
        # login() or sendmail() raises, so a failed send no longer leaks the
        # connection.
        with smtplib.SMTP(self.smtp_server_host,
                          self.smtp_server_port) as smtp_server:
            smtp_server.login(self.smtp_from_addr, self.smtp_server_password)
            smtp_server.sendmail(self.smtp_from_addr, [self.smtp_to_addr],
                                 msg.as_string())
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error("邮件发送失败")
            log.exception(e)
def parse_user_info(self, response_info):
    """Parse a user profile response, queue its follow lists and cache it.

    Args:
        response_info: Indexable response record. Item 1 is the page HTML
            and item 2 is the user's url token.

    Nothing is stored when the profile cannot be extracted, or when
    ``self.token_filter.check_token`` reports the token as already handled.
    Otherwise the token is marked and the follow-list URLs are queued
    (subject to ``self.is_parser_following_list`` and
    ``self.is_parser_follower_list``). The profile entity is then pushed
    onto ``self.persistent_cache``.
    """
    page_html = response_info[1]
    token = response_info[2]

    user_info_entities = None
    try:
        user_info_entities = self._extract_user_entity(page_html, token)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('User info 数据解析错误')
            log.exception(e)

    if user_info_entities is None:
        return

    # Check again whether the user was added in the meantime; stop if so.
    if self.token_filter.check_token(token) is True:
        return
    self.token_filter.mark_token(token)

    # Queue the following-list URLs.
    if self.is_parser_following_list is True:
        self._enqueue_follow_list_urls(
            token, user_info_entities[USER_FOLLOWING_COUNT],
            self.generate_following_info_url, 'followingList')

    # Queue the follower-list URLs.
    if self.is_parser_follower_list is True:
        self._enqueue_follow_list_urls(
            token, user_info_entities[USER_FOLLOWER_COUNT],
            self.generate_follower_info_url, 'followerList')

    # Store the extracted profile. The message is emitted at DEBUG level to
    # match its guard (it was log.info under a DEBUG guard before).
    if log.isEnabledFor(logging.DEBUG):
        log.debug('成功获取一个用户的详细信息')
    self.redis_connection.rpush(self.persistent_cache, user_info_entities)


def _extract_user_entity(self, page_html, token):
    """Return the profile entity for ``token`` found in ``page_html``, or None.

    None is returned when the page has no ``<div id="data">`` element or
    the embedded state has no entry for ``token``. Malformed content raises.
    """
    state_node = BeautifulSoup(page_html, 'html.parser').find(
        'div', attrs={'id': 'data'})
    if state_node is None:
        return None

    # Undo HTML escaping, strip any markup, then decode the JSON state.
    state_text = html.unescape(state_node['data-state'])
    state_text = BeautifulSoup(state_text, 'html.parser').text
    state = json.loads(state_text)

    if 'entities' not in state:
        return None
    entities = state['entities']
    if 'users' not in entities:
        return None
    users = entities['users']
    if token not in users:
        return None
    return self._build_user_entity(users[token], token)


def _build_user_entity(self, user_info, token):
    """Map the raw profile dict ``user_info`` onto the stored entity dict."""
    business = None
    if USER_BUSINESS in user_info:
        business = user_info[USER_BUSINESS]['name']

    locations = [
        location['name'] for location in user_info.get(USER_LOCATIONS, [])
    ]

    employments = []
    for employment in user_info.get(USER_EMPLOYMENTS, []):
        elem = {}
        if 'job' in employment:
            elem['job'] = employment['job']['name']
        if 'company' in employment:
            elem['company'] = employment['company']['name']
        employments.append(elem)

    educations = [
        education['school']['name']
        for education in user_info.get(USER_EDUCATIONS, [])
        if 'school' in education
    ]

    # Absent scalar fields are stored as None.
    return {
        USER_AVATAR_URL_TEMPLATE: user_info.get(USER_AVATAR_URL_TEMPLATE),
        USER_URL_TOKEN: token,
        USER_NAME: user_info.get(USER_NAME),
        USER_HEADLINE: user_info.get(USER_HEADLINE),
        USER_LOCATIONS: locations,
        USER_BUSINESS: business,
        USER_EMPLOYMENTS: employments,
        USER_EDUCATIONS: educations,
        USER_DESCRIPTION: user_info.get(USER_DESCRIPTION),
        USER_GENDER: user_info.get(USER_GENDER),
        USER_FOLLOWING_COUNT: user_info.get(USER_FOLLOWING_COUNT),
        USER_FOLLOWER_COUNT: user_info.get(USER_FOLLOWER_COUNT),
        USER_ANSWER_COUNT: user_info.get(USER_ANSWER_COUNT),
        USER_QUESTION_COUNT: user_info.get(USER_QUESTION_COUNT),
        USER_VOTE_UP_COUNT: user_info.get(USER_VOTE_UP_COUNT)
    }


def _enqueue_follow_list_urls(self, token, total, url_builder, list_type):
    """Queue one paged list URL per 20 entries of a follow list of size ``total``."""
    if total is None:
        return
    pipe = self.redis_connection.pipeline()
    offset = 0
    limit = 20
    while offset < total:
        url_info = ['list', url_builder(token, offset, limit), token, list_type]
        pipe.rpush(self.follow_info_url_queue, url_info)
        offset += limit
    pipe.execute()
def start_spider_core(self):
    """Connect to Redis and MySQL, start every spider module, then supervise.

    The method returns early, after logging, when Redis or MySQL cannot be
    reached or when the account login fails. Otherwise it does not return:
    it ends in an endless loop that calls ``check_and_restart`` on the
    modules every 180 seconds.
    """
    if log.isEnabledFor(logging.INFO):
        log.info('Spider 开始启动')

    # Create the Redis connection, trying up to three times.
    redis_connected = False
    max_attempts = 3
    try:
        for attempt in range(1, max_attempts + 1):
            try:
                self.redis_connection = redis.StrictRedis(
                    host=self.redis_host,
                    port=self.redis_port,
                    db=self.redis_db,
                    password=self.redis_password)
                redis_connected = self.redis_connection.ping() is True
            except redis.RedisError as e:
                # ping() raises when the server is unreachable rather than
                # returning False. Catching it here is what lets the loop
                # retry at all.
                redis_connected = False
                if log.isEnabledFor(logging.ERROR):
                    log.exception(e)

            if redis_connected:
                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接成功')
                break

            if log.isEnabledFor(logging.INFO):
                log.info('Redis 服务器连接失败')
            if attempt < max_attempts:
                time.sleep(5)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('Redis 启动失败')
            log.exception(e)
        return

    # Give up if no attempt succeeded.
    if not redis_connected:
        if log.isEnabledFor(logging.ERROR):
            log.error('Redis 启动失败')
        return

    # Create the MySQL connection. Failures are reported under their own
    # message (they used to be logged as a Redis failure).
    try:
        self.mysql_connection = pymysql.connect(host=self.mysql_host,
                                                user=self.mysql_username,
                                                passwd=self.mysql_password,
                                                db=self.mysql_database,
                                                charset=self.mysql_charset)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('MySQL 连接失败')
            log.exception(e)
        return

    # Create the response buffer.
    self.response_buffer = ResponseBuffer()

    # Start the account manager and log in.
    self.account_manager = AccountManager(self.login_token, self.password,
                                          self.is_login_by_cookie, self.z_c0)
    is_login = self.account_manager.login()
    if not is_login:
        return

    # Start the Downloader.
    self.downloader = Downloader(
        self.redis_connection, self.response_buffer, self.account_manager,
        self.is_proxy_service_enable, self.session_pool_size,
        self.download_thread_num, self.network_retry_times,
        self.connect_timeout, self.download_interval)
    self.downloader.start_downloader()

    # Start the Scheduler.
    self.schedule = Scheduler(self.redis_connection, self.url_rate)
    self.schedule.start()

    # Start DataPersistent.
    self.dataPersistent = DataPersistent(
        self.persistent_cache_size,
        self.follow_relation_persistent_cache_size, self.mysql_connection,
        self.redis_connection)
    self.dataPersistent.start_data_persistent()

    # Start the Processor and seed it with the initial token.
    self.processor = Processor(self.process_thread_num,
                               self.is_parser_following_list,
                               self.is_parser_follower_list,
                               self.is_parser_follow_relation,
                               self.redis_connection, self.response_buffer)
    self.processor.start_processor()
    self.processor.load_init_data(self.init_token)

    # Start the e-mail service.
    if self.is_email_service_enable is True:
        self.email_service = EmailService(
            self.smtp_server_host, self.smtp_server_port,
            self.smtp_server_password, self.smtp_from_addr,
            self.smtp_to_addr, self.smtp_email_header,
            self.smtp_send_interval, self.dataPersistent)
        self.email_service.start_email_service()
        self.email_service.send_message('Spider 启动完毕')

    if log.isEnabledFor(logging.INFO):
        log.info('Spider 启动完毕')

    # Supervise the modules.
    # NOTE(review): the Scheduler is started above but is not checked here.
    while True:
        self.downloader.check_and_restart()
        if self.is_email_service_enable is True:
            self.email_service.check_and_restart()
        self.dataPersistent.check_and_restart()
        self.processor.check_and_restart()

        # Interval between checks.
        time.sleep(180)
        gc.collect()
def run(self):
    """Flush the Redis caches into the database every 180 seconds.

    On each pass, a cache is drained only when its length has reached its
    threshold (``self.persistent_cache_size`` for user profiles,
    ``self.follow_relation_persistent_cache_size`` for follow relations).
    Each popped entry is inserted with ``INSERT_USER_INFO`` or
    ``INSERT_FOLLOW_RELATION``, and the batch is committed once at the end.

    Any exception ends the loop. It is logged together with the last entry
    popped from Redis, and ``self.thread_status`` is set to ``'error'``.
    """
    debug_info = None
    try:
        while True:
            # Persist user profiles.
            current_user_info_cache_size = self.redis_connection.llen(
                self.persistent_cache)
            if current_user_info_cache_size >= self.persistent_cache_size:
                self.lock.acquire()
                try:
                    cursor = self.db_connection.cursor()
                    try:
                        for _ in range(current_user_info_cache_size):
                            user_info = self.redis_connection.lpop(
                                self.persistent_cache)
                            debug_info = user_info
                            if user_info is None:
                                continue
                            # NOTE(review): eval() executes whatever text is
                            # stored in Redis. If anything other than this
                            # crawler can write to the cache, switch to
                            # ast.literal_eval.
                            user_info = self.convert_user_info(
                                eval(user_info.decode('utf-8')))
                            cursor.execute(INSERT_USER_INFO, [
                                user_info[USER_AVATAR_URL_TEMPLATE],
                                user_info[USER_URL_TOKEN],
                                user_info[USER_NAME],
                                user_info[USER_HEADLINE],
                                user_info[USER_LOCATIONS],
                                user_info[USER_BUSINESS],
                                user_info[USER_EMPLOYMENTS],
                                user_info[USER_EDUCATIONS],
                                user_info[USER_DESCRIPTION],
                                user_info[USER_GENDER],
                                user_info[USER_FOLLOWING_COUNT],
                                user_info[USER_FOLLOWER_COUNT],
                                user_info[USER_ANSWER_COUNT],
                                user_info[USER_QUESTION_COUNT],
                                user_info[USER_VOTE_UP_COUNT]
                            ])
                        self.db_connection.commit()
                    finally:
                        # Close the cursor even when an insert fails.
                        cursor.close()
                finally:
                    # Always release the lock. Previously an exception in
                    # the batch left it held after this thread died.
                    self.lock.release()

            # Persist follow relations.
            current_follow_relation_cache_size = self.redis_connection.llen(
                self.follow_relation_persistent_cache)
            if (current_follow_relation_cache_size
                    >= self.follow_relation_persistent_cache_size):
                self.lock.acquire()
                try:
                    cursor = self.db_connection.cursor()
                    try:
                        for _ in range(current_follow_relation_cache_size):
                            follow_relation = self.redis_connection.lpop(
                                self.follow_relation_persistent_cache)
                            debug_info = follow_relation
                            if follow_relation is None:
                                continue
                            # NOTE(review): same eval() caveat as above.
                            follow_relation = eval(
                                follow_relation.decode('utf-8'))
                            cursor.execute(INSERT_FOLLOW_RELATION, [
                                follow_relation[FOLLOW_FROM],
                                follow_relation[FOLLOW_TO]
                            ])
                        self.db_connection.commit()
                    finally:
                        cursor.close()
                finally:
                    self.lock.release()

            # Interval between checks.
            time.sleep(180)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('用户数据持久化线程异常退出')
            log.exception(e)
            log.debug(debug_info)
        self.thread_status = 'error'