Example #1
0
    def run(self):
        """Periodically send a status e-mail until an error occurs.

        Every ``self.smtp_send_interval`` seconds a plain-text message is built
        from ``self.get_email_content()`` and ``self.get_email_header()`` and
        sent through the configured SMTP server.  After each successful send
        ``self.lastSendTime`` is updated.  Any exception ends the loop, is
        logged, and sets ``self.thread_status`` to ``'error'``.
        """
        if log.isEnabledFor(logging.DEBUG):
            log.debug("邮件服务线程启动")
        try:
            while True:
                time.sleep(self.smtp_send_interval)

                # Build the message to send.
                msg = MIMEText(self.get_email_content(), 'plain', 'utf-8')
                msg['from'] = self.smtp_from_addr
                msg['to'] = self.smtp_to_addr
                msg['Subject'] = Header(self.get_email_header(),
                                        'utf-8').encode()

                # Send it.  The context manager issues QUIT and closes the
                # socket even when login() or sendmail() raises, so a failed
                # send no longer leaks the connection.
                with smtplib.SMTP(self.smtp_server_host,
                                  self.smtp_server_port) as smtp_server:
                    smtp_server.login(self.smtp_from_addr,
                                      self.smtp_server_password)
                    smtp_server.sendmail(self.smtp_from_addr,
                                         [self.smtp_to_addr],
                                         msg.as_string())

                # Remember when the last message went out.
                self.lastSendTime = datetime.datetime.now()

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error("邮件发送失败")
                log.exception(e)
            self.thread_status = 'error'
Example #2
0
    def run(self):
        """Consume responses from the buffer and hand them to a parser.

        Loops forever: each response whose first element is ``'info'`` goes to
        ``parse_user_info`` and each ``'list'`` response goes to
        ``parse_follow_info``.  Any exception ends the loop, sets
        ``self.thread_status`` to ``'error'`` and is logged.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('数据处理线程' + self.thread_id + '启动')

        try:
            while True:
                # Fetch the next response from the buffer.
                response = self.response_buffer.get_response_from_buffer()

                # Entries that are missing or have fewer than two elements are
                # dropped and the next one is fetched immediately (no pause).
                if response is not None and len(response) >= 2:
                    # The first element tags the kind of response.
                    kind = response[0]
                    if kind == 'info':
                        self.parse_user_info(response)
                    elif kind == 'list':
                        self.parse_follow_info(response)
                    time.sleep(0.1)

        except Exception as e:
            self.thread_status = 'error'
            if log.isEnabledFor(logging.ERROR):
                log.exception(e)
Example #3
0
 def login(self):
     """Log in with the configured method and return that method's result.

     Uses ``cookie_login()`` when ``self.is_login_by_cookie`` is exactly
     ``True``; otherwise falls back to ``common_login()``.
     """
     use_cookie = self.is_login_by_cookie is True
     if log.isEnabledFor(logging.INFO):
         log.info('使用Cookie登陆方式登陆'
                  if use_cookie else '使用邮箱或手机号码登陆方式登陆')
     return self.cookie_login() if use_cookie else self.common_login()
Example #4
0
    def parse_follow_info(self, response_info):
        """Extract user tokens from a follow-list response and queue them.

        ``response_info[1]`` is parsed as JSON.  Every ``url_token`` found
        under its ``'data'`` key for which ``token_filter.check_token`` returns
        ``False`` is pushed, wrapped in a user-info URL record, onto
        ``self.user_info_url_queue``.  When ``self.is_parser_follow_relation``
        is ``True`` and ``response_info[3]`` is ``'followingList'``, a
        follow relation from ``response_info[2]`` to each new token is pushed
        onto ``self.follow_relation_persistent_cache`` as well.

        Returns ``None`` in every case; parse errors are logged and swallowed.
        """
        payload = response_info[1]

        try:
            # Decode the JSON body.
            parsed = json.loads(payload)

            # Nothing to do when the user list is absent.
            if 'data' not in parsed:
                return

            # Keep only tokens the filter reports as not yet seen.
            candidates = (entry['url_token'] for entry in parsed['data']
                          if 'url_token' in entry)
            new_tokens = [
                candidate for candidate in candidates
                if self.token_filter.check_token(candidate) is False
            ]

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('Follower & Following List 数据解析错误')
            if log.isEnabledFor(logging.DEBUG):
                log.exception(e)
            return

        # Queue a user-info URL record (a list) for every new token.
        for new_token in new_tokens:
            self.redis_connection.rpush(
                self.user_info_url_queue,
                ['info', self.generate_user_info_url(new_token), new_token])

        # Follow relations are optional.
        if self.is_parser_follow_relation is not True:
            return

        # Response layout here: [type, data, token, followingList/followerList]
        follow_list_type = response_info[3]
        owner_token = response_info[2]
        if follow_list_type != 'followingList':
            return

        # Only "following" lists produce relation records.
        pipe = self.redis_connection.pipeline()
        for followed_token in new_tokens:
            pipe.rpush(self.follow_relation_persistent_cache,
                       {FOLLOW_FROM: owner_token, FOLLOW_TO: followed_token})
        pipe.execute()
Example #5
0
    def start_downloader(self):
        """Start every thread held in ``self.download_thread_list``."""
        for worker in self.download_thread_list:
            worker.start()

        if log.isEnabledFor(logging.INFO):
            log.info('Downloader 模块启动成功')
Example #6
0
    def __init__(self, redis_connection, response_buffer, account_manager,
                 is_proxy_service_enable, session_pool_size,
                 download_thread_num, network_retry_times, connect_timeout,
                 download_interval):
        """Store the configuration and build (but do not start) the threads.

        A ``SessionManager`` is created from ``session_pool_size``,
        ``account_manager`` and ``is_proxy_service_enable``; then
        ``download_thread_num`` ``DownloadThread`` objects named
        ``thread0``, ``thread1``, ... are created and kept in
        ``self.download_thread_list``.
        """
        # Number of download threads.
        self.download_thread_num = download_thread_num
        # Redis connection.
        self.redis_connection = redis_connection
        # Response buffer queue.
        self.response_buffer = response_buffer
        # Account authentication manager.
        self.account_manager = account_manager
        # Session manager.
        self.session_manager = SessionManager(session_pool_size,
                                              account_manager,
                                              is_proxy_service_enable)

        # Network connection parameters.
        self.NETWORK_RETRY_TIMES = network_retry_times
        self.CONNECT_TIMEOUT = connect_timeout
        self.DOWNLOAD_INTERVAL = download_interval

        # One DownloadThread per configured slot.
        self.download_thread_list = [
            DownloadThread('thread' + str(index), self.session_manager,
                           redis_connection, response_buffer,
                           network_retry_times, connect_timeout,
                           download_interval)
            for index in range(self.download_thread_num)
        ]

        if log.isEnabledFor(logging.INFO):
            log.info("Downloader 模块初始化完毕")
Example #7
0
    def start_processor(self):
        """Start every thread held in ``self.processor_list``."""
        for worker in self.processor_list:
            worker.start()

        if log.isEnabledFor(logging.INFO):
            log.info('Processor 模块启动成功')
Example #8
0
    def __init__(self, process_thread_num, is_parser_following_list,
                 is_parser_follower_list, is_parser_follow_relation,
                 redis_connection, response_buffer):
        """Store the configuration and build (but do not start) the threads.

        A ``TokenFilter`` is created on top of the Redis connection, then
        ``process_thread_num`` ``ProcessThread`` objects named ``thread0``,
        ``thread1``, ... are created and kept in ``self.processor_list``.
        """
        # Number of processing threads.
        self.process_thread_num = process_thread_num
        # Redis connection and the token filter built on it.
        self.redis_connection = redis_connection
        self.token_filter = TokenFilter(self.redis_connection)
        # Response buffer queue.
        self.response_buffer = response_buffer

        # Feature switches: parse "following" lists, parse "follower" lists,
        # and record follow relations.
        self.is_parser_following_list = is_parser_following_list
        self.is_parser_follower_list = is_parser_follower_list
        self.is_parser_follow_relation = is_parser_follow_relation

        # One ProcessThread per configured slot.
        self.processor_list = [
            ProcessThread('thread' + str(index), redis_connection,
                          self.token_filter, response_buffer,
                          is_parser_following_list, is_parser_follower_list,
                          is_parser_follow_relation)
            for index in range(process_thread_num)
        ]

        if log.isEnabledFor(logging.INFO):
            log.info('Processor 模块初始化完毕')
Example #9
0
    def run(self):
        """Feed the download queue from the two source queues, forever.

        Each round moves up to ``self.url_rate`` entries from the user-info
        queue and up to ``10 - self.url_rate`` entries from the follow-info
        queue to the tail of ``self.url_queue_name``.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('Scheduler 模块启动成功')

        while True:
            # Hold off while the download queue has more than 500 entries.
            while self.redis_connection.llen(self.url_queue_name) > 500:
                time.sleep(180)

            # With both source queues empty there is nothing to move yet.
            pending_follow = self.redis_connection.llen(
                self.follow_info_url_queue)
            pending_user = self.redis_connection.llen(
                self.user_info_url_queue)
            if pending_follow == 0 and pending_user == 0:
                time.sleep(20)
                continue

            # Take the configured share from each source queue, user-info
            # entries first.
            quotas = (
                (self.user_info_url_queue, self.url_rate),
                (self.follow_info_url_queue, 10 - self.url_rate),
            )
            for source_queue, quota in quotas:
                for _ in range(quota):
                    moved = self.redis_connection.lpop(source_queue)
                    if moved is not None:
                        self.redis_connection.rpush(self.url_queue_name,
                                                    moved)
Example #10
0
    def validate_proxy_ip(self, proxy_ip_info):
        """Check whether a proxy works by comparing the IP a request reports.

        A GET request to the module-level ``url`` is sent through the proxy
        and the last dotted-quad found in the response text is compared with
        the proxy's own IP.  (Presumably ``url`` is an IP-echo service --
        confirm against its definition.)

        Args:
            proxy_ip_info: mapping holding the proxy's IP, port and protocol
                under the ``proxyCore.PROXY_*`` keys, or ``None``.

        Returns:
            bool: ``True`` as soon as one attempt reports the proxy's IP;
            ``False`` when ``proxy_ip_info`` is ``None`` or all
            ``NETWORK_RECONNECT_TIMES`` attempts fail.
        """
        from Proxy import proxyCore
        if proxy_ip_info is None:
            return False

        # Build the proxy mapping passed to requests.
        proxy_ip = proxy_ip_info[proxyCore.PROXY_IP]
        proxy_port = proxy_ip_info[proxyCore.PROXY_PORT]
        proxy_protocol = proxy_ip_info[proxyCore.PROXY_PROTOCOL].lower()
        proxy = {proxy_protocol: proxy_ip + ':' + proxy_port}

        # Every pass through the loop consumes one attempt.  Previously a
        # response that contained an IP other than the proxy's left the
        # counter untouched, so the loop never terminated.
        retry_time = 0
        while retry_time < NETWORK_RECONNECT_TIMES:
            retry_time += 1
            try:
                response = requests.get(url,
                                        timeout=CONNECT_TIMEOUT,
                                        headers=header,
                                        proxies=proxy)

                # Compare the last IP found in the response with the proxy IP.
                match_list = re.findall(r'[0-9]+(?:\.[0-9]+){3}',
                                        response.text)
                if match_list and match_list[-1] == proxy_ip:
                    if log.isEnabledFor(logging.DEBUG):
                        log.debug("获取到一个可用的代理IP")
                    return True
            except Exception:
                # A failed request counts as an attempt; retry right away.
                continue

            # No IP, or a different IP, was reported: pause before retrying.
            time.sleep(1)
        return False
Example #11
0
    def __init__(self, session_pool_size, account_manager,
                 is_proxy_service_enable):
        """Set up an empty session pool and, if enabled, the proxy service.

        The pool is a ``queue.Queue`` bounded by ``session_pool_size``.  When
        ``is_proxy_service_enable`` is exactly ``True`` a
        ``proxyCore.ProxyService`` is created and started.
        """
        # Collaborators and switches.
        self.account_manager = account_manager
        self.is_proxy_service_enable = is_proxy_service_enable

        # Pool capacity and counters: sessions created so far and sessions
        # currently available in the pool.
        self.session_pool_size = session_pool_size
        self.created_session_num = 0
        self.available_session_num = 0

        # One lock per counter.
        self.available_session_lock = threading.Lock()
        self.created_session_lock = threading.Lock()

        # The bounded session pool itself.
        self.session_pool = queue.Queue(session_pool_size)

        # Create and start the proxy service when requested.
        if self.is_proxy_service_enable is True:
            self.proxy_service = proxyCore.ProxyService()
            self.proxy_service.start_proxy_service()

        if log.isEnabledFor(logging.INFO):
            log.info("Session Manager 启动成功")
Example #12
0
    def common_login(self):
        """Log in with the account's login token and password.

        Fetches ``mainPageURL`` to read the ``_xsrf`` form value, posts the
        credentials to ``loginURL`` and then requests ``authTestURL`` to
        confirm the login.  On success the session cookies are stored in
        ``self.auth_token``.  The session is always closed before returning.

        Returns:
            bool: ``True`` when both the login POST and the check request
            answer with HTTP 200; ``False`` otherwise, including when the
            ``_xsrf`` input is missing or any exception is raised.
        """
        # Create the session.
        session = requests.session()
        session.headers = requestHeader

        try:
            # Read the _xsrf value from the main page.
            response = session.get(mainPageURL)
            input_tag = BeautifulSoup(response.text, 'html.parser').find(
                'input', attrs={'name': '_xsrf'})
            if input_tag is None:
                return False
            _xsrf = input_tag['value']

            # Submit the login form.
            form_data = {
                '_xsrf': _xsrf,
                'email': self.login_token,
                'password': self.password
            }
            # NOTE(review): this mutates the shared module-level
            # requestHeader, so the XSRF headers stay in it after this call.
            # Left unchanged because other code may rely on it -- confirm.
            requestHeader.update({
                'X-Requested-With': 'XMLHttpRequest',
                'X-Xsrftoken': _xsrf
            })
            session.headers = requestHeader
            response = session.post(url=loginURL, data=form_data)
            if response.status_code == 200:
                # Verify that the login actually succeeded.
                response = session.get(authTestURL)
                if response.status_code == 200:
                    # Keep the authenticated cookies.
                    self.auth_token = session.cookies.get_dict()
                    if log.isEnabledFor(logging.INFO):
                        log.info('知乎账户登陆成功')
                    return True

            # Login failed.
            if log.isEnabledFor(logging.INFO):
                log.info('知乎账户登陆失败')
            return False
        except Exception as e:
            # Report failure explicitly (this path used to fall through and
            # return None) and keep the traceback in the log.
            if log.isEnabledFor(logging.ERROR):
                log.exception(e)
            return False
        finally:
            session.close()
Example #13
0
 def run(self):
     """Keep the proxy pool filled and periodically rescan it.

     While both the pool and the unchecked list are below
     ``PROXY_POOL_SIZE`` new proxies are fetched; when the pool size equals
     ``PROXY_POOL_SIZE`` the pool is scanned and the thread sleeps for
     ``PROXY_POOL_SCAN_INTERVAL``; otherwise it sleeps 60 seconds.  Any
     exception is logged and sets ``self.status`` to ``'error'``.
     """
     try:
         while True:
             needs_refill = (
                 proxy_pool.qsize() < PROXY_POOL_SIZE
                 and unchecked_proxy_list.qsize() < PROXY_POOL_SIZE)
             if needs_refill:
                 self.fetch_and_parse_proxy()
                 continue

             if proxy_pool.qsize() != PROXY_POOL_SIZE:
                 time.sleep(60)
                 continue

             # The pool is exactly full: rescan it, then wait.
             if log.isEnabledFor(logging.DEBUG):
                 log.debug('代理池更新')
             self.scan_proxy_pool()
             time.sleep(PROXY_POOL_SCAN_INTERVAL)
     except Exception as e:
         if log.isEnabledFor(logging.ERROR):
             log.exception(e)
         self.status = 'error'
Example #14
0
 def check_and_restart(self):
     """Replace and start the persistence thread if it reported an error.

     Does nothing unless ``self.persistent_thread.thread_status`` equals
     ``'error'``.
     """
     if self.persistent_thread.thread_status != 'error':
         return

     replacement = PersistentThread(
         self.db_connection, self.redis_connection,
         self.persistent_cache_size,
         self.follow_relation_persistent_cache_size)
     self.persistent_thread = replacement
     replacement.start()
     if log.isEnabledFor(logging.INFO):
         log.info('DataPersistent模块持久化线程中重新启动')
Example #15
0
 def check_and_restart(self):
     """Replace and start the e-mail thread if it reported an error.

     Does nothing unless ``self.email_service_thread.thread_status`` equals
     ``'error'``.
     """
     if self.email_service_thread.thread_status != 'error':
         return

     replacement = EmailServiceThread(
         self.smtp_server_host, self.smtp_server_port,
         self.smtp_server_password, self.smtp_from_addr,
         self.smtp_to_addr, self.smtp_email_header,
         self.smtp_send_interval, self.data_persistent)
     self.email_service_thread = replacement
     replacement.start()
     if log.isEnabledFor(logging.INFO):
         log.info('EmailService线程重新启动')
Example #16
0
    def load_init_data(self, token_list):
        """Queue a user-info URL record for every seed token.

        Each token is pushed onto the ``'userInfoURLQueue'`` Redis list as
        ``['info', URL_PUBLIC + token + URL_PINS, token]``.  A ``None``
        ``token_list`` is ignored silently.
        """
        if token_list is not None:
            for seed_token in token_list:
                # URL record for this seed token.
                self.redis_connection.rpush(
                    'userInfoURLQueue',
                    ['info', URL_PUBLIC + seed_token + URL_PINS, seed_token])

            if log.isEnabledFor(logging.INFO):
                log.info('初始用户Token载入完毕')
Example #17
0
    def run(self):
        """Start the proxy worker threads and restart any that fail.

        After ``self.init()``, ``PROXY_VALIDATE_THREAD_NUM`` validation
        threads and one pool-scan thread are started.  The method then loops
        forever, checking every 180 seconds for threads whose ``status`` is
        ``'error'`` and replacing them with fresh ones.
        """
        # Initialise the configuration.
        self.init()

        # Start the proxy validation threads.
        validate_thread_list = []
        for _ in range(PROXY_VALIDATE_THREAD_NUM):
            validate_thread = ProxyValidateThread()
            validate_thread_list.append(validate_thread)
            validate_thread.start()
        if log.isEnabledFor(logging.DEBUG):
            log.debug("代理验证线程启动")

        # Start the proxy pool scan thread.
        scan_thread = ProxyPoolScanThread()
        scan_thread.start()
        if log.isEnabledFor(logging.DEBUG):
            log.debug("代理池扫描线程启动")

        # Watch for failed threads and restart them.
        while True:
            # Validation threads.  A failed thread is replaced in its own
            # slot: removing and appending while iterating the list skipped
            # the element that followed each replaced thread.
            for index, thread in enumerate(validate_thread_list):
                if thread.status == 'error':
                    replacement = ProxyValidateThread()
                    validate_thread_list[index] = replacement
                    replacement.start()
                    # The guard used to call log.error(logging.ERROR), which
                    # logged "40" and never let the message below through.
                    if log.isEnabledFor(logging.ERROR):
                        log.error('代理验证线程重新启动')

            # Pool scan thread.
            if scan_thread.status == 'error':
                scan_thread = ProxyPoolScanThread()
                scan_thread.start()
                if log.isEnabledFor(logging.ERROR):
                    log.error("代理池扫描线程重新启动")

            time.sleep(180)
Example #18
0
 def check_and_restart(self):
     """Replace and start every download thread that reported an error.

     A thread whose ``thread_status`` equals ``'error'`` is replaced, in
     the same list slot, by a new ``DownloadThread`` with the same
     ``thread_id``.
     """
     # Replace in place: removing and appending while iterating the list
     # skipped the element that followed each replaced thread.
     for index, download_thread in enumerate(self.download_thread_list):
         if download_thread.thread_status != 'error':
             continue

         thread_id = download_thread.thread_id
         replacement = DownloadThread(
             thread_id, self.session_manager, self.redis_connection,
             self.response_buffer, self.NETWORK_RETRY_TIMES,
             self.CONNECT_TIMEOUT, self.DOWNLOAD_INTERVAL)
         self.download_thread_list[index] = replacement
         replacement.start()
         if log.isEnabledFor(logging.INFO):
             log.info('数据下载线程' + thread_id + '重新启动')
Example #19
0
    def check_and_restart(self):
        """Replace and start every processing thread that reported an error.

        A thread whose ``thread_status`` equals ``'error'`` is replaced, in
        the same list slot, by a new ``ProcessThread`` with the same
        ``thread_id``.
        """
        # Replace in place: removing and appending while iterating the list
        # skipped the element that followed each replaced thread.
        for index, process_thread in enumerate(self.processor_list):
            if process_thread.thread_status != 'error':
                continue

            thread_id = process_thread.thread_id
            new_thread = ProcessThread(
                thread_id, self.redis_connection, self.token_filter,
                self.response_buffer, self.is_parser_following_list,
                self.is_parser_follower_list, self.is_parser_follow_relation)
            self.processor_list[index] = new_thread
            new_thread.start()

            if log.isEnabledFor(logging.ERROR):
                log.error('数据处理器线程[' + thread_id + ']重新启动')
Example #20
0
    def cookie_login(self):
        """Log in by injecting the configured ``z_c0`` cookie.

        Fetches ``mainPageURL`` to obtain the basic cookies, adds the
        ``z_c0`` cookie from ``self.z_c0`` and requests ``authTestURL`` to
        confirm the login.  On success the session cookies are stored in
        ``self.auth_token``.

        Returns:
            bool: ``True`` when the check request answers with HTTP 200,
            ``False`` otherwise.  Exceptions raised by the requests are not
            caught here.
        """
        # The context manager closes the session on every exit path; it was
        # previously never closed.
        with requests.session() as session:
            session.headers = requestHeader

            # Obtain the basic cookies.
            session.get(mainPageURL)

            # Add the user-configured authentication cookie.
            cookie = {'z_c0': self.z_c0}
            requests.utils.add_dict_to_cookiejar(session.cookies, cookie)

            # Check whether the login succeeded.
            response = session.get(authTestURL)
            if response.status_code == 200:
                # Keep the authenticated cookies.
                self.auth_token = session.cookies.get_dict()
                if log.isEnabledFor(logging.INFO):
                    log.info('知乎账户登陆成功')
                return True

            if log.isEnabledFor(logging.INFO):
                log.info('知乎账户登陆失败')
            return False
Example #21
0
    def __init__(self, redis_connection, url_rate):
        """Initialise the scheduler thread and the queue names it uses.

        Args:
            redis_connection: Redis client holding the queues.
            url_rate: scheduling ratio stored as ``self.url_rate``.
        """
        threading.Thread.__init__(self)

        # Names of the Redis lists: the download queue, the user-info source
        # queue and the following/follower source queue.
        self.url_queue_name = 'urlQueue'
        self.user_info_url_queue = 'userInfoURLQueue'
        self.follow_info_url_queue = 'followInfoURLQueue'

        # Redis connection and URL scheduling ratio.
        self.redis_connection = redis_connection
        self.url_rate = url_rate

        if log.isEnabledFor(logging.INFO):
            log.info('Scheduler 模块初始化完毕')
Example #22
0
    def __init__(self, persistent_cache_size,
                 follow_relation_persistent_cache_size, db_connection,
                 redis_connection):
        """Store the settings and create (but do not start) the worker thread.

        The ``PersistentThread`` is built from the two connections and the
        two cache sizes and kept in ``self.persistent_thread``.
        """
        # Database and Redis connections.
        self.db_connection = db_connection
        self.redis_connection = redis_connection

        # Cache sizes for user info and for follow relations.
        self.persistent_cache_size = persistent_cache_size
        self.follow_relation_persistent_cache_size = (
            follow_relation_persistent_cache_size)

        # The database persistence thread.
        self.persistent_thread = PersistentThread(
            db_connection, redis_connection, persistent_cache_size,
            follow_relation_persistent_cache_size)

        if log.isEnabledFor(logging.INFO):
            log.info('DataPersistent 模块初始化完毕')
Example #23
0
    def __init__(self, smtp_server_host, smtp_server_port,
                 smtp_server_password, smtp_from_addr, smtp_to_addr,
                 smtp_email_header, smtp_send_interval, data_persistent):
        """Store the SMTP settings and create (but do not start) the thread.

        The ``EmailServiceThread`` is built from the same settings and kept
        in ``self.email_service_thread``.
        """
        # SMTP server and credentials.
        self.smtp_server_host = smtp_server_host
        self.smtp_server_port = smtp_server_port
        self.smtp_server_password = smtp_server_password

        # Message envelope and schedule.
        self.smtp_from_addr = smtp_from_addr
        self.smtp_to_addr = smtp_to_addr
        self.smtp_email_header = smtp_email_header
        self.smtp_send_interval = smtp_send_interval

        # Handed through to the e-mail thread.
        self.data_persistent = data_persistent

        # The periodic e-mail thread.
        self.email_service_thread = EmailServiceThread(
            smtp_server_host, smtp_server_port, smtp_server_password,
            smtp_from_addr, smtp_to_addr, smtp_email_header,
            smtp_send_interval, data_persistent)

        if log.isEnabledFor(logging.INFO):
            log.info('EmailService 模块初始化完毕')
Example #24
0
    def send_message(self, email_content):
        """Send one plain-text e-mail with a timestamped subject.

        The subject is ``self.smtp_email_header`` followed by
        ``[month-day hour:minute:second]`` of the current local time (numbers
        are not zero-padded).  Sending errors are logged and swallowed.

        Args:
            email_content: body text of the message.
        """
        # Build the message to send.
        now = datetime.datetime.now()
        header = '{}[{}-{} {}:{}:{}]'.format(
            self.smtp_email_header, now.month, now.day, now.hour, now.minute,
            now.second)
        msg = MIMEText(email_content, 'plain', 'utf-8')
        msg['from'] = self.smtp_from_addr
        msg['to'] = self.smtp_to_addr
        msg['Subject'] = Header(header, 'utf-8').encode()

        # Send it.  The context manager issues QUIT and closes the socket
        # even when login() or sendmail() raises, so a failed send no longer
        # leaks the connection.
        try:
            with smtplib.SMTP(self.smtp_server_host,
                              self.smtp_server_port) as smtp_server:
                smtp_server.login(self.smtp_from_addr,
                                  self.smtp_server_password)
                smtp_server.sendmail(self.smtp_from_addr,
                                     [self.smtp_to_addr], msg.as_string())
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error("邮件发送失败")
                log.exception(e)
Example #25
0
    def run(self):
        """Validate unchecked proxies and move the working ones to the pool.

        Loops forever.  While the module-level ``is_scanning`` flag is set
        the thread only sleeps.  Otherwise, if the pool is below
        ``PROXY_POOL_SIZE`` and an unchecked proxy is available, one proxy is
        validated and, when valid, put into the pool.  Any exception is
        logged and sets ``self.status`` to ``'error'``.
        """
        try:
            while True:
                # Wait while the proxy pool is being scanned.
                while is_scanning:
                    time.sleep(3)

                has_work = (proxy_pool.qsize() < PROXY_POOL_SIZE
                            and unchecked_proxy_list.qsize() > 0)
                if not has_work:
                    time.sleep(5)
                    continue

                candidate = unchecked_proxy_list.get()
                verdict = self.dataValidateModule.validate_proxy_ip(candidate)
                if verdict is True:
                    proxy_pool.put(candidate)
                time.sleep(1)
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.exception(e)
            self.status = 'error'
Example #26
0
    def parse_user_info(self, response_info):
        """Extract one user's profile from a page and queue follow-up work.

        ``response_info[1]`` is the page HTML and ``response_info[2]`` the
        user token.  The JSON held in the ``data-state`` attribute of the
        ``<div id="data">`` element is decoded and the entry for the token
        under ``entities -> users`` is turned into a flat profile dict.

        If the token filter does not already know the token, the token is
        marked, following/follower list URLs are queued (20 entries per page,
        depending on the ``is_parser_*`` switches) and the profile dict is
        pushed onto ``self.persistent_cache``.

        Returns ``None`` in every case; parse errors are logged and swallowed.
        """
        page_html = response_info[1]
        token = response_info[2]

        user_info_entities = None
        try:
            # Locate the element that carries the embedded JSON state.
            state_node = BeautifulSoup(page_html, 'html.parser').find(
                'div', attrs={'id': 'data'})
            if state_node is None:
                return

            # Undo HTML escaping, strip any HTML tags, then decode the JSON.
            raw_state = html.unescape(state_node['data-state'])
            raw_state = BeautifulSoup(raw_state, 'html.parser').text
            state = json.loads(raw_state)

            # Walk down to the target user's record.
            if 'entities' not in state:
                return
            entities = state['entities']
            if 'users' not in entities:
                return
            users = entities['users']
            if token not in users:
                return
            user_info = users[token]

            def pick(key):
                # Value stored under key, or None when the key is absent.
                return user_info[key] if key in user_info else None

            # Nested fields are reduced to their 'name' values.
            locations = []
            if USER_LOCATIONS in user_info:
                locations = [
                    place['name'] for place in user_info[USER_LOCATIONS]
                ]

            business = None
            if USER_BUSINESS in user_info:
                business = user_info[USER_BUSINESS]['name']

            employments = []
            if USER_EMPLOYMENTS in user_info:
                for employment in user_info[USER_EMPLOYMENTS]:
                    position = {}
                    if 'job' in employment:
                        position['job'] = employment['job']['name']
                    if 'company' in employment:
                        position['company'] = employment['company']['name']
                    employments.append(position)

            educations = []
            if USER_EDUCATIONS in user_info:
                educations = [
                    study['school']['name']
                    for study in user_info[USER_EDUCATIONS]
                    if 'school' in study
                ]

            # Assemble the profile record.
            user_info_entities = {
                USER_AVATAR_URL_TEMPLATE: pick(USER_AVATAR_URL_TEMPLATE),
                USER_URL_TOKEN: token,
                USER_NAME: pick(USER_NAME),
                USER_HEADLINE: pick(USER_HEADLINE),
                USER_LOCATIONS: locations,
                USER_BUSINESS: business,
                USER_EMPLOYMENTS: employments,
                USER_EDUCATIONS: educations,
                USER_DESCRIPTION: pick(USER_DESCRIPTION),
                USER_GENDER: pick(USER_GENDER),
                USER_FOLLOWING_COUNT: pick(USER_FOLLOWING_COUNT),
                USER_FOLLOWER_COUNT: pick(USER_FOLLOWER_COUNT),
                USER_ANSWER_COUNT: pick(USER_ANSWER_COUNT),
                USER_QUESTION_COUNT: pick(USER_QUESTION_COUNT),
                USER_VOTE_UP_COUNT: pick(USER_VOTE_UP_COUNT)
            }

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('User info 数据解析错误')
                log.exception(e)

        # Nothing was extracted.
        if user_info_entities is None:
            return

        # Check again whether this user was already handled; stop if so.
        if self.token_filter.check_token(token) is True:
            return

        # Mark the user as handled.
        self.token_filter.mark_token(token)

        def queue_list_urls(total, make_url, list_type):
            # Queue one list URL per page of 20 entries, in one pipeline.
            pipe = self.redis_connection.pipeline()
            if total is None:
                return
            offset = 0
            limit = 20
            while offset < total:
                pipe.rpush(self.follow_info_url_queue, [
                    'list',
                    make_url(token, offset, limit),
                    token, list_type
                ])
                offset += limit
            pipe.execute()

        # Following-list URLs.
        if self.is_parser_following_list is True:
            queue_list_urls(user_info_entities[USER_FOLLOWING_COUNT],
                            self.generate_following_info_url,
                            'followingList')

        # Follower-list URLs.
        if self.is_parser_follower_list is True:
            queue_list_urls(user_info_entities[USER_FOLLOWER_COUNT],
                            self.generate_follower_info_url,
                            'followerList')

        # Store the extracted profile.
        # NOTE(review): the guard checks DEBUG but the call logs at INFO --
        # confirm which level is intended.
        if log.isEnabledFor(logging.DEBUG):
            log.info('成功获取一个用户的详细信息')
        self.redis_connection.rpush(self.persistent_cache, user_info_entities)
Example #27
0
    def start_spider_core(self):
        """Connect to Redis and MySQL, start every module, then supervise.

        Returns early (after logging) when Redis cannot be reached within
        three attempts, when the MySQL connection fails, or when the account
        login does not succeed.  Otherwise the Downloader, Scheduler,
        DataPersistent, Processor and (optionally) EmailService modules are
        started and the method loops forever, calling each module's
        ``check_and_restart()`` every 180 seconds.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('Spider 开始启动')

        # Connect to Redis, allowing up to three attempts.  ping() raises on
        # an unreachable server rather than returning False, so the exception
        # must be caught inside the loop for the retry to take place.
        redis_connect_retry_times = 3
        redis_connected = False
        while redis_connect_retry_times > 0:
            try:
                self.redis_connection = redis.StrictRedis(
                    host=self.redis_host,
                    port=self.redis_port,
                    db=self.redis_db,
                    password=self.redis_password)
                redis_connected = self.redis_connection.ping() is True
            except Exception as e:
                redis_connected = False
                if log.isEnabledFor(logging.ERROR):
                    log.exception(e)

            if redis_connected:
                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接成功')
                break

            if log.isEnabledFor(logging.INFO):
                log.info('Redis 服务器连接失败')
            redis_connect_retry_times -= 1
            time.sleep(5)

        # Give up when no attempt succeeded.
        if not redis_connected:
            if log.isEnabledFor(logging.ERROR):
                log.error('Redis 启动失败')
            return

        # Connect to MySQL.  Failures get their own message instead of being
        # reported as a Redis failure.
        try:
            self.mysql_connection = pymysql.connect(host=self.mysql_host,
                                                    user=self.mysql_username,
                                                    passwd=self.mysql_password,
                                                    db=self.mysql_database,
                                                    charset=self.mysql_charset)
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('MySQL 连接失败')
                log.exception(e)
            return

        # Create the response buffer queue.
        self.response_buffer = ResponseBuffer()

        # Create the account manager and log in.
        self.account_manager = AccountManager(self.login_token, self.password,
                                              self.is_login_by_cookie,
                                              self.z_c0)
        is_login = self.account_manager.login()
        if not is_login:
            return

        # Start the Downloader.
        self.downloader = Downloader(
            self.redis_connection, self.response_buffer, self.account_manager,
            self.is_proxy_service_enable, self.session_pool_size,
            self.download_thread_num, self.network_retry_times,
            self.connect_timeout, self.download_interval)
        self.downloader.start_downloader()

        # Start the Scheduler.
        self.schedule = Scheduler(self.redis_connection, self.url_rate)
        self.schedule.start()

        # Start DataPersistent.
        self.dataPersistent = DataPersistent(
            self.persistent_cache_size,
            self.follow_relation_persistent_cache_size, self.mysql_connection,
            self.redis_connection)
        self.dataPersistent.start_data_persistent()

        # Start the Processor and load the seed tokens.
        self.processor = Processor(self.process_thread_num,
                                   self.is_parser_following_list,
                                   self.is_parser_follower_list,
                                   self.is_parser_follow_relation,
                                   self.redis_connection, self.response_buffer)
        self.processor.start_processor()
        self.processor.load_init_data(self.init_token)

        # Start the e-mail service when enabled.
        if self.is_email_service_enable is True:
            self.email_service = EmailService(
                self.smtp_server_host, self.smtp_server_port,
                self.smtp_server_password, self.smtp_from_addr,
                self.smtp_to_addr, self.smtp_email_header,
                self.smtp_send_interval, self.dataPersistent)
            self.email_service.start_email_service()
            self.email_service.send_message('Spider 启动完毕')

        if log.isEnabledFor(logging.INFO):
            log.info('Spider 启动完毕')

        # Supervise the modules.  The Scheduler has no check here.
        while True:
            # Downloader.
            self.downloader.check_and_restart()
            # EmailService.
            if self.is_email_service_enable is True:
                self.email_service.check_and_restart()
            # DataPersistent.
            self.dataPersistent.check_and_restart()
            # Processor.
            self.processor.check_and_restart()
            # Interval between checks.
            time.sleep(180)
            gc.collect()
Example #28
0
    def load_config(self):
        """Read the spider core config file and store its options on self.

        Reads the ``spider_core`` section of
        ``Core/Config/SpiderCoreConfig.conf``. Integer options are converted
        with ``int``; flag options become ``True`` only when their integer
        value is exactly ``1``. The comma-separated ``initToken`` option is
        split and stripped, and each non-blank entry is appended to
        ``self.init_token`` (blank entries produced by an empty value or by
        trailing/doubled commas are skipped).

        ``self.init_token`` is appended to, not created, so it must already
        exist when this method is called.

        Raises:
            configparser.Error: if the section or an option is missing.
                ``ConfigParser.read`` ignores files it cannot open, so a
                missing config file shows up here as a missing section.
            ValueError: if a numeric or flag option is not a valid integer.
        """
        section = "spider_core"
        config = configparser.ConfigParser()
        config.read("Core/Config/SpiderCoreConfig.conf", encoding="utf8")

        def text(option):
            # Raw string value of one option in the spider_core section.
            return config.get(section, option)

        def number(option):
            # Option value converted to int.
            return int(text(option))

        def flag(option):
            # A flag is on only when its integer value is exactly 1.
            return number(option) == 1

        # Downloader module settings
        self.is_proxy_service_enable = flag('isProxyServiceEnable')
        self.session_pool_size = number('sessionPoolSize')
        self.download_thread_num = number('downloadThreadNum')
        self.network_retry_times = number('networkRetryTimes')
        self.connect_timeout = number('connectTimeout')
        self.download_interval = number('downloadInterval')

        # Processor module settings
        self.process_thread_num = number('processThreadNum')
        self.is_parser_following_list = flag('isParserFollowingList')
        self.is_parser_follower_list = flag('isParserFollowerList')
        self.is_parser_follow_relation = flag('isParserFollowRelation')

        # Scheduler module settings
        self.url_rate = number('urlRate')

        # DataPersistent module settings
        self.persistent_cache_size = number('persistentCacheSize')
        self.follow_relation_persistent_cache_size = number(
            'followRelationPersistentCacheSize')

        # Email service settings
        self.is_email_service_enable = flag('isEmailServiceEnable')
        self.smtp_server_host = text('smtpServerHost')
        self.smtp_server_port = number('smtpServerPort')
        self.smtp_server_password = text('smtpServerPassword')
        self.smtp_from_addr = text('smtpFromAddr')
        self.smtp_to_addr = text('smtpToAddr')
        self.smtp_email_header = text('smtpEmailHeader')
        self.smtp_send_interval = number('smtpSendInterval')

        # Redis settings
        self.redis_host = text('redisHost')
        self.redis_port = number('redisPort')
        self.redis_db = number('redisDB')
        self.redis_password = text('redisPassword')

        # MySQL settings
        self.mysql_host = text('mysqlHost')
        self.mysql_username = text('mysqlUsername')
        self.mysql_password = text('mysqlPassword')
        self.mysql_database = text('mysqlDatabase')
        self.mysql_charset = text('mysqlCharset')

        # Zhihu account settings
        self.is_login_by_cookie = flag('isLoginByCookie')
        self.z_c0 = text('z_c0')
        self.login_token = text('loginToken')
        self.password = text('password')

        # Initial tokens: skip blank entries so an empty value or a stray
        # comma does not add an empty-string token.
        for raw_token in text('initToken').split(','):
            token = raw_token.strip()
            if token:
                self.init_token.append(token)

        if log.isEnabledFor(logging.INFO):
            log.info('配置文件读取并配置完毕')
Example #29
0
    def run(self):
        """Periodically move cached records from Redis into the database.

        Loops forever. On each pass, first for the user-info list and then
        for the follow-relation list: if the Redis list length has reached
        its configured threshold, pop that many entries, insert each
        non-``None`` entry through a database cursor and commit, all while
        holding ``self.lock``. The loop then sleeps 180 seconds.

        Any exception ends the loop: it is logged, the most recently popped
        entry is written to the debug log, and ``self.thread_status`` is set
        to ``'error'``. The cursor is closed and the lock released in
        ``finally`` blocks, so a failed batch cannot leave the lock held.
        """
        debug_info = None
        try:
            while True:
                # Persist user info
                current_user_info_cache_size = self.redis_connection.llen(
                    self.persistent_cache)
                if current_user_info_cache_size >= self.persistent_cache_size:
                    self.lock.acquire()
                    try:
                        cursor = self.db_connection.cursor()
                        try:
                            for _ in range(current_user_info_cache_size):
                                user_info = self.redis_connection.lpop(
                                    self.persistent_cache)
                                # Remember the raw entry for the error log.
                                debug_info = user_info
                                if user_info is None:
                                    continue
                                # NOTE(security): eval() runs whatever text
                                # is stored in this Redis list. Consider
                                # ast.literal_eval or JSON once the
                                # producer's format is confirmed.
                                user_info = self.convert_user_info(
                                    eval(user_info.decode('utf-8')))
                                cursor.execute(INSERT_USER_INFO, [
                                    user_info[USER_AVATAR_URL_TEMPLATE],
                                    user_info[USER_URL_TOKEN],
                                    user_info[USER_NAME],
                                    user_info[USER_HEADLINE],
                                    user_info[USER_LOCATIONS],
                                    user_info[USER_BUSINESS],
                                    user_info[USER_EMPLOYMENTS],
                                    user_info[USER_EDUCATIONS],
                                    user_info[USER_DESCRIPTION],
                                    user_info[USER_GENDER],
                                    user_info[USER_FOLLOWING_COUNT],
                                    user_info[USER_FOLLOWER_COUNT],
                                    user_info[USER_ANSWER_COUNT],
                                    user_info[USER_QUESTION_COUNT],
                                    user_info[USER_VOTE_UP_COUNT]
                                ])
                            self.db_connection.commit()
                        finally:
                            cursor.close()
                    finally:
                        # Always release, even when the batch failed.
                        self.lock.release()

                # Persist follow relations
                current_follow_relation_cache_size = (
                    self.redis_connection.llen(
                        self.follow_relation_persistent_cache))
                if (current_follow_relation_cache_size
                        >= self.follow_relation_persistent_cache_size):
                    self.lock.acquire()
                    try:
                        cursor = self.db_connection.cursor()
                        try:
                            for _ in range(
                                    current_follow_relation_cache_size):
                                follow_relation = self.redis_connection.lpop(
                                    self.follow_relation_persistent_cache)
                                # Remember the raw entry for the error log.
                                debug_info = follow_relation
                                if follow_relation is None:
                                    continue
                                # NOTE(security): same eval() concern as for
                                # the user-info entries above.
                                follow_relation = eval(
                                    follow_relation.decode('utf-8'))
                                cursor.execute(INSERT_FOLLOW_RELATION, [
                                    follow_relation[FOLLOW_FROM],
                                    follow_relation[FOLLOW_TO]
                                ])
                            self.db_connection.commit()
                        finally:
                            cursor.close()
                    finally:
                        # Always release, even when the batch failed.
                        self.lock.release()

                # Interval between checks
                time.sleep(180)

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('用户数据持久化线程异常退出')
                log.exception(e)
            # Last entry popped before the failure (None if none was popped).
            log.debug(debug_info)
            self.thread_status = 'error'
Example #30
0
    def start_data_persistent(self):
        """Start the persistence thread and log that the module is up."""
        worker = self.persistent_thread
        worker.start()

        # Only emit the startup message when INFO logging is enabled.
        info_enabled = log.isEnabledFor(logging.INFO)
        if info_enabled:
            log.info('DataPersistent 模块启动成功')