Ejemplo n.º 1
0
    def run(self):
        """Periodically e-mail a status report until something fails.

        Each cycle sleeps for ``self.smtp_send_interval``, builds a UTF-8
        plain-text message from ``get_email_content()`` and
        ``get_email_header()``, sends it through the configured SMTP server
        and stores the send time in ``self.lastSendTime``.

        Any exception ends the loop, is logged, and sets
        ``self.thread_status`` to ``'error'``.
        """
        if log.isEnabledFor(logging.DEBUG):
            log.debug("邮件服务线程启动")
        try:
            while True:
                time.sleep(self.smtp_send_interval)

                # Build the message to send.
                msg = MIMEText(self.get_email_content(), 'plain', 'utf-8')
                msg['from'] = self.smtp_from_addr
                msg['to'] = self.smtp_to_addr
                msg['Subject'] = Header(self.get_email_header(),
                                        'utf-8').encode()

                # Send it.
                smtp_server = smtplib.SMTP(self.smtp_server_host,
                                           self.smtp_server_port)
                try:
                    smtp_server.login(self.smtp_from_addr,
                                      self.smtp_server_password)
                    smtp_server.sendmail(self.smtp_from_addr,
                                         [self.smtp_to_addr],
                                         msg.as_string())
                    smtp_server.quit()
                finally:
                    # Release the socket even when login/sendmail raised;
                    # close() does nothing if quit() already closed it.
                    smtp_server.close()

                # Remember when the last report went out.
                self.lastSendTime = datetime.datetime.now()

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error("邮件发送失败")
                log.exception(e)
            self.thread_status = 'error'
Ejemplo n.º 2
0
    def run(self):
        """Consume responses from the buffer and dispatch them to a parser.

        Responses are read from ``self.response_buffer`` in an endless loop.
        A response that is ``None`` or has fewer than two items is skipped
        immediately. Otherwise its first item selects the parser:
        ``'info'`` goes to ``parse_user_info`` and ``'list'`` goes to
        ``parse_follow_info``; the loop then pauses for 0.1 seconds.

        Any exception ends the loop, sets ``self.thread_status`` to
        ``'error'`` and is logged.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('数据处理线程' + self.thread_id + '启动')

        try:
            while True:
                # Fetch the next response.
                item = self.response_buffer.get_response_from_buffer()

                if item is not None and len(item) >= 2:
                    # The first element names the response type.
                    kind = item[0]

                    # Hand the response to the matching parser.
                    if kind == 'info':
                        self.parse_user_info(item)
                    elif kind == 'list':
                        self.parse_follow_info(item)
                    time.sleep(0.1)

        except Exception as e:
            self.thread_status = 'error'
            if log.isEnabledFor(logging.ERROR):
                log.exception(e)
Ejemplo n.º 3
0
    def parse_follow_info(self, response_info):
        """Parse a follower/following list response and queue new users.

        ``response_info`` is indexed positionally: item 1 is the JSON text,
        item 2 the token of the user the list belongs to, and item 3 the
        list type (items 2 and 3 are only read when follow relations are
        being collected).

        Every ``url_token`` in the JSON ``data`` array for which
        ``self.token_filter.check_token`` returns ``False`` is pushed to
        ``self.user_info_url_queue`` as an ``['info', url, token]`` entry.
        When ``self.is_parser_follow_relation`` is ``True`` and the list
        type is ``'followingList'``, one follow relation per new token is
        also pushed to ``self.follow_relation_persistent_cache``.

        Parsing errors are logged and end the call without queuing anything.
        """
        # Raw JSON text carried by the response.
        payload = response_info[1]

        try:
            parsed = json.loads(payload)

            # Nothing to do without a user list.
            if 'data' not in parsed:
                return

            # Tokens present in the list, then only those not yet known.
            listed_tokens = (entry['url_token'] for entry in parsed['data']
                             if 'url_token' in entry)
            new_tokens = [
                candidate for candidate in listed_tokens
                if self.token_filter.check_token(candidate) is False
            ]

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('Follower & Following List 数据解析错误')
            if log.isEnabledFor(logging.DEBUG):
                log.exception(e)
            return

        # Queue a user-info URL entry for every new token.
        for new_token in new_tokens:
            self.redis_connection.rpush(
                self.user_info_url_queue,
                ['info', self.generate_user_info_url(new_token), new_token])

        # Record follow relations (following only).
        # Response layout: [info, data, token, followingList/followerList]
        if self.is_parser_follow_relation is True:
            list_kind = response_info[3]
            owner_token = response_info[2]
            if list_kind == 'followingList':
                batch = self.redis_connection.pipeline()
                for followed_token in new_tokens:
                    batch.rpush(self.follow_relation_persistent_cache, {
                        FOLLOW_FROM: owner_token,
                        FOLLOW_TO: followed_token
                    })
                batch.execute()
Ejemplo n.º 4
0
 def run(self):
     """Keep the proxy pool filled and periodically re-scan it.

     Loops forever. While both ``proxy_pool`` and ``unchecked_proxy_list``
     hold fewer than ``PROXY_POOL_SIZE`` entries, more proxies are fetched.
     When the pool holds exactly ``PROXY_POOL_SIZE`` entries it is scanned
     and the loop waits ``PROXY_POOL_SCAN_INTERVAL`` seconds. In every
     other case the loop waits 60 seconds.

     Any exception ends the loop, is logged, and sets ``self.status`` to
     ``'error'``.
     """
     try:
         while True:
             pool_has_room = proxy_pool.qsize() < PROXY_POOL_SIZE
             if pool_has_room and unchecked_proxy_list.qsize() < PROXY_POOL_SIZE:
                 # Both queues are short: go and get more candidates.
                 self.fetch_and_parse_proxy()
                 continue

             if proxy_pool.qsize() != PROXY_POOL_SIZE:
                 # Neither filling nor scanning applies right now.
                 time.sleep(60)
                 continue

             # The pool is exactly full: re-check its entries.
             if log.isEnabledFor(logging.DEBUG):
                 log.debug('代理池更新')
             self.scan_proxy_pool()
             time.sleep(PROXY_POOL_SCAN_INTERVAL)
     except Exception as e:
         if log.isEnabledFor(logging.ERROR):
             log.exception(e)
         self.status = 'error'
Ejemplo n.º 5
0
    def run(self):
        """Validate unchecked proxies and move the good ones into the pool.

        Loops forever. While ``is_scanning`` is truthy the loop waits in
        3-second steps. If ``proxy_pool`` holds fewer than
        ``PROXY_POOL_SIZE`` entries and ``unchecked_proxy_list`` is not
        empty, one candidate is taken and checked with
        ``self.dataValidateModule.validate_proxy_ip``; it is added to the
        pool when the check returns ``True``, and the loop waits 1 second.
        Otherwise the loop waits 5 seconds.

        Any exception ends the loop, is logged, and sets ``self.status`` to
        ``'error'``.
        """
        try:
            while True:
                # Hold off while the proxy pool is being scanned.
                while is_scanning:
                    time.sleep(3)

                has_work = (proxy_pool.qsize() < PROXY_POOL_SIZE
                            and unchecked_proxy_list.qsize() > 0)
                if not has_work:
                    time.sleep(5)
                    continue

                candidate = unchecked_proxy_list.get()
                verdict = self.dataValidateModule.validate_proxy_ip(candidate)
                if verdict is True:
                    proxy_pool.put(candidate)
                time.sleep(1)
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.exception(e)
            self.status = 'error'
Ejemplo n.º 6
0
    def send_message(self, email_content):
        """Send a single plain-text e-mail with a time-stamped subject.

        The subject is ``self.smtp_email_header`` followed by
        ``[month-day hour:minute:second]`` of the current local time (the
        numbers are not zero-padded). The message goes from
        ``self.smtp_from_addr`` to ``self.smtp_to_addr`` through the
        configured SMTP server.

        Args:
            email_content: Body text of the message.

        Sending errors are logged and swallowed; nothing is returned.
        """
        # Build the message to send.
        now = datetime.datetime.now()
        header = '{}[{}-{} {}:{}:{}]'.format(self.smtp_email_header,
                                             now.month, now.day, now.hour,
                                             now.minute, now.second)
        msg = MIMEText(email_content, 'plain', 'utf-8')
        msg['from'] = self.smtp_from_addr
        msg['to'] = self.smtp_to_addr
        msg['Subject'] = Header(header, 'utf-8').encode()

        # Send it.
        try:
            smtp_server = smtplib.SMTP(self.smtp_server_host,
                                       self.smtp_server_port)
            try:
                smtp_server.login(self.smtp_from_addr,
                                  self.smtp_server_password)
                smtp_server.sendmail(self.smtp_from_addr, [self.smtp_to_addr],
                                     msg.as_string())
                smtp_server.quit()
            finally:
                # Release the socket even when login/sendmail raised;
                # close() does nothing if quit() already closed it.
                smtp_server.close()
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error("邮件发送失败")
                log.exception(e)
Ejemplo n.º 7
0
    def parse_user_info(self, response_info):
        """Extract one user's profile from a page response and queue follow-ups.

        ``response_info`` is indexed positionally: item 1 is the page HTML
        and item 2 is the user's token.

        The profile JSON is read from the ``data-state`` attribute of the
        ``<div id="data">`` element, looked up under
        ``entities -> users -> token``, and reduced to a flat dict keyed by
        the ``USER_*`` constants. The call ends silently when any of those
        levels is missing, and ends after logging when parsing raises.

        If ``self.token_filter.check_token(token)`` is ``True`` the call
        stops there. Otherwise the token is marked, paged list URLs
        (20 entries per page) are pushed to ``self.follow_info_url_queue``
        for the enabled list kinds, and the profile dict is pushed to
        ``self.persistent_cache``.
        """
        page_html = response_info[1]
        token = response_info[2]

        try:
            state_node = BeautifulSoup(page_html, 'html.parser').find(
                'div', attrs={'id': 'data'})
            if state_node is None:
                return

            # Undo HTML escaping, then strip any embedded HTML tags.
            raw_state = html.unescape(state_node['data-state'])
            raw_state = BeautifulSoup(raw_state, 'html.parser').text

            state = json.loads(raw_state)

            # Walk down to the target user's record.
            if 'entities' not in state:
                return
            entities = state['entities']

            if 'users' not in entities:
                return
            users = entities['users']

            if token not in users:
                return
            user_info = users[token]

            def field(key):
                # Value stored under key, or None when the key is absent.
                return user_info[key] if key in user_info else None

            avatar_url_template = field(USER_AVATAR_URL_TEMPLATE)
            name = field(USER_NAME)
            headline = field(USER_HEADLINE)

            locations = []
            if USER_LOCATIONS in user_info:
                locations = [
                    place['name'] for place in user_info[USER_LOCATIONS]
                ]

            business = None
            if USER_BUSINESS in user_info:
                business = user_info[USER_BUSINESS]['name']

            employments = []
            if USER_EMPLOYMENTS in user_info:
                for employment in user_info[USER_EMPLOYMENTS]:
                    entry = {}
                    if 'job' in employment:
                        entry['job'] = employment['job']['name']
                    if 'company' in employment:
                        entry['company'] = employment['company']['name']
                    employments.append(entry)

            educations = []
            if USER_EDUCATIONS in user_info:
                educations = [
                    education['school']['name']
                    for education in user_info[USER_EDUCATIONS]
                    if 'school' in education
                ]

            # Assemble the flat profile record.
            user_info_entities = {
                USER_AVATAR_URL_TEMPLATE: avatar_url_template,
                USER_URL_TOKEN: token,
                USER_NAME: name,
                USER_HEADLINE: headline,
                USER_LOCATIONS: locations,
                USER_BUSINESS: business,
                USER_EMPLOYMENTS: employments,
                USER_EDUCATIONS: educations,
                USER_DESCRIPTION: field(USER_DESCRIPTION),
                USER_GENDER: field(USER_GENDER),
                USER_FOLLOWING_COUNT: field(USER_FOLLOWING_COUNT),
                USER_FOLLOWER_COUNT: field(USER_FOLLOWER_COUNT),
                USER_ANSWER_COUNT: field(USER_ANSWER_COUNT),
                USER_QUESTION_COUNT: field(USER_QUESTION_COUNT),
                USER_VOTE_UP_COUNT: field(USER_VOTE_UP_COUNT)
            }

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('User info 数据解析错误')
                log.exception(e)
            # Nothing was extracted, so there is nothing to process.
            return

        # Check again whether this user was already added; stop if so.
        if self.token_filter.check_token(token) is True:
            return

        # Mark the user as extracted.
        self.token_filter.mark_token(token)

        def queue_list_pages(total, build_url, list_kind):
            # Push one 'list' URL entry per page of 20 for this list kind.
            batch = self.redis_connection.pipeline()
            if total is None:
                return
            offset = 0
            page_size = 20
            while offset < total:
                batch.rpush(self.follow_info_url_queue, [
                    'list',
                    build_url(offset, page_size),
                    token, list_kind
                ])
                offset += page_size
            batch.execute()

        # Following list URLs.
        if self.is_parser_following_list is True:
            queue_list_pages(
                user_info_entities[USER_FOLLOWING_COUNT],
                lambda offset, limit: self.generate_following_info_url(
                    token, offset, limit),
                'followingList')

        # Follower list URLs.
        if self.is_parser_follower_list is True:
            queue_list_pages(
                user_info_entities[USER_FOLLOWER_COUNT],
                lambda offset, limit: self.generate_follower_info_url(
                    token, offset, limit),
                'followerList')

        # Store the extracted profile.
        # NOTE(review): the guard checks DEBUG but the call logs at INFO.
        if log.isEnabledFor(logging.DEBUG):
            log.info('成功获取一个用户的详细信息')
        self.redis_connection.rpush(self.persistent_cache, user_info_entities)
Ejemplo n.º 8
0
    def start_spider_core(self):
        """Connect to the data stores, start every module, then supervise them.

        Steps, in order:

        1. Connect to Redis, allowing up to three attempts, and to MySQL.
           A failure of either is logged and ends the call.
        2. Create the response buffer and log in through the account
           manager; a failed login ends the call.
        3. Start the downloader, scheduler, data persistence and processor
           modules, seed the processor with ``self.init_token`` and, when
           enabled, start the e-mail service and send a start-up message.
        4. Loop forever, calling each module's ``check_and_restart`` every
           180 seconds.

        This method only returns when start-up fails.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('Spider 开始启动')

        try:
            # Connect to Redis.
            redis_connect_retry_times = 3
            last_redis_error = None
            while redis_connect_retry_times > 0:
                self.redis_connection = redis.StrictRedis(
                    host=self.redis_host,
                    port=self.redis_port,
                    db=self.redis_db,
                    password=self.redis_password)
                try:
                    ping = self.redis_connection.ping()
                except redis.exceptions.RedisError as redis_error:
                    # redis-py reports an unreachable server by raising, not
                    # by returning False; count it as a failed attempt so
                    # the remaining retries are actually used.
                    last_redis_error = redis_error
                    ping = False

                if ping is True:
                    if log.isEnabledFor(logging.INFO):
                        log.info('Redis 服务器连接成功')
                    break

                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接失败')
                redis_connect_retry_times -= 1
                # Only wait when another attempt will follow.
                if redis_connect_retry_times > 0:
                    time.sleep(5)

            # Give up when every attempt failed.
            if redis_connect_retry_times <= 0:
                raise Exception('Redis 服务器连接失败') from last_redis_error

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('Redis 启动失败')
                log.exception(e)
            return

        try:
            # Connect to MySQL.
            self.mysql_connection = pymysql.connect(host=self.mysql_host,
                                                    user=self.mysql_username,
                                                    passwd=self.mysql_password,
                                                    db=self.mysql_database,
                                                    charset=self.mysql_charset)
        except Exception as e:
            # Reported separately so a MySQL problem is not logged as a
            # Redis failure.
            if log.isEnabledFor(logging.ERROR):
                log.error('MySQL 连接失败')
                log.exception(e)
            return

        # Create the response buffer queue.
        self.response_buffer = ResponseBuffer()

        # Start the account manager and log in.
        self.account_manager = AccountManager(self.login_token, self.password,
                                              self.is_login_by_cookie,
                                              self.z_c0)
        is_login = self.account_manager.login()
        if not is_login:
            return

        # Start the Downloader.
        self.downloader = Downloader(
            self.redis_connection, self.response_buffer, self.account_manager,
            self.is_proxy_service_enable, self.session_pool_size,
            self.download_thread_num, self.network_retry_times,
            self.connect_timeout, self.download_interval)
        self.downloader.start_downloader()

        # Start the Scheduler.
        self.schedule = Scheduler(self.redis_connection, self.url_rate)
        self.schedule.start()

        # Start DataPersistent.
        self.dataPersistent = DataPersistent(
            self.persistent_cache_size,
            self.follow_relation_persistent_cache_size, self.mysql_connection,
            self.redis_connection)
        self.dataPersistent.start_data_persistent()

        # Start the Processor and seed it.
        self.processor = Processor(self.process_thread_num,
                                   self.is_parser_following_list,
                                   self.is_parser_follower_list,
                                   self.is_parser_follow_relation,
                                   self.redis_connection, self.response_buffer)
        self.processor.start_processor()
        self.processor.load_init_data(self.init_token)

        # Start the e-mail service.
        if self.is_email_service_enable is True:
            self.email_service = EmailService(
                self.smtp_server_host, self.smtp_server_port,
                self.smtp_server_password, self.smtp_from_addr,
                self.smtp_to_addr, self.smtp_email_header,
                self.smtp_send_interval, self.dataPersistent)
            self.email_service.start_email_service()
            self.email_service.send_message('Spider 启动完毕')

        if log.isEnabledFor(logging.INFO):
            log.info('Spider 启动完毕')

        # Supervise the modules.
        while True:
            # Downloader health check.
            self.downloader.check_and_restart()
            # EmailService health check.
            if self.is_email_service_enable is True:
                self.email_service.check_and_restart()
            # DataPersistent health check.
            self.dataPersistent.check_and_restart()
            # NOTE: the Scheduler is not checked here.
            # Processor health check.
            self.processor.check_and_restart()
            # Interval between checks.
            time.sleep(180)
            gc.collect()
Ejemplo n.º 9
0
    def run(self):
        """Move cached user records and follow relations from Redis to the DB.

        Every 180 seconds the lengths of ``self.persistent_cache`` and
        ``self.follow_relation_persistent_cache`` are checked. When a list
        has reached its configured threshold, that many entries are popped,
        inserted through one cursor, and committed while ``self.lock`` is
        held.

        The lock and the cursor are released even when a batch fails, so a
        replacement thread is not left blocked on ``self.lock``.

        Any exception ends the loop, is logged together with the last entry
        popped, and sets ``self.thread_status`` to ``'error'``. Entries
        already popped from Redis when a batch fails are not pushed back.
        """
        debug_info = None
        try:
            while True:
                # Persist user records.
                current_user_info_cache_size = self.redis_connection.llen(
                    self.persistent_cache)
                if current_user_info_cache_size >= self.persistent_cache_size:
                    self.lock.acquire()
                    try:
                        cursor = self.db_connection.cursor()
                        try:
                            for _ in range(current_user_info_cache_size):
                                user_info = self.redis_connection.lpop(
                                    self.persistent_cache)
                                debug_info = user_info
                                if user_info is None:
                                    continue
                                # SECURITY(review): eval() executes whatever
                                # text is stored in Redis; anyone able to
                                # write to this list can run code here.
                                user_info = self.convert_user_info(
                                    eval(user_info.decode('utf-8')))
                                cursor.execute(INSERT_USER_INFO, [
                                    user_info[USER_AVATAR_URL_TEMPLATE],
                                    user_info[USER_URL_TOKEN],
                                    user_info[USER_NAME],
                                    user_info[USER_HEADLINE],
                                    user_info[USER_LOCATIONS],
                                    user_info[USER_BUSINESS],
                                    user_info[USER_EMPLOYMENTS],
                                    user_info[USER_EDUCATIONS],
                                    user_info[USER_DESCRIPTION],
                                    user_info[USER_GENDER],
                                    user_info[USER_FOLLOWING_COUNT],
                                    user_info[USER_FOLLOWER_COUNT],
                                    user_info[USER_ANSWER_COUNT],
                                    user_info[USER_QUESTION_COUNT],
                                    user_info[USER_VOTE_UP_COUNT]
                                ])
                            self.db_connection.commit()
                        finally:
                            cursor.close()
                    finally:
                        self.lock.release()

                # Persist follow relations.
                current_follow_relation_cache_size = self.redis_connection.llen(
                    self.follow_relation_persistent_cache)
                if current_follow_relation_cache_size >= self.follow_relation_persistent_cache_size:
                    self.lock.acquire()
                    try:
                        cursor = self.db_connection.cursor()
                        try:
                            for _ in range(current_follow_relation_cache_size):
                                follow_relation = self.redis_connection.lpop(
                                    self.follow_relation_persistent_cache)
                                debug_info = follow_relation
                                if follow_relation is None:
                                    continue
                                # SECURITY(review): same eval() concern as
                                # for the user records above.
                                follow_relation = eval(
                                    follow_relation.decode('utf-8'))
                                cursor.execute(INSERT_FOLLOW_RELATION, [
                                    follow_relation[FOLLOW_FROM],
                                    follow_relation[FOLLOW_TO]
                                ])
                            self.db_connection.commit()
                        finally:
                            cursor.close()
                    finally:
                        self.lock.release()

                # Interval between checks.
                time.sleep(180)

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('用户数据持久化线程异常退出')
                log.exception(e)
            log.debug(debug_info)
            self.thread_status = 'error'