def run(self):
        """Email-service worker loop.

        Every ``smtp_send_interval`` seconds, builds a status e-mail from
        ``get_email_content()`` / ``get_email_header()`` and sends it through
        the configured SMTP server.  On any failure the loop stops and the
        thread flags itself ``'error'`` so a supervisor can restart it.
        """
        if log.isEnabledFor(logging.DEBUG):
            log.debug("邮件服务线程启动")
        try:
            while True:
                time.sleep(self.smtp_send_interval)

                # Compose the message (UTF-8 plain text).
                msg = MIMEText(self.get_email_content(), 'plain', 'utf-8')
                msg['from'] = self.smtp_from_addr
                msg['to'] = self.smtp_to_addr
                msg['Subject'] = Header(self.get_email_header(),
                                        'utf-8').encode()

                # Send.  Using the SMTP object as a context manager
                # guarantees the connection is closed even when login() or
                # sendmail() raises — the original only called quit() on
                # the success path and leaked the socket otherwise.
                with smtplib.SMTP(self.smtp_server_host,
                                  self.smtp_server_port) as smtp_server:
                    smtp_server.login(self.smtp_from_addr,
                                      self.smtp_server_password)
                    smtp_server.sendmail(self.smtp_from_addr,
                                         [self.smtp_to_addr],
                                         msg.as_string())

                # Record when the last e-mail went out.
                self.lastSendTime = datetime.datetime.now()

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error("邮件发送失败")
                log.exception(e)
            self.thread_status = 'error'
# Example #2
    def parse_follow_info(self, response_info):
        """Parse one follower/following-list API response.

        ``response_info`` is a list shaped ``[info, data, token,
        followingList/followerList]`` (see the note below); ``data`` is the
        raw JSON body returned by the API.  Unseen user tokens are pushed
        onto the user-info URL queue and, when relation parsing is enabled,
        "who follows whom" pairs are cached for persistence.
        """
        # Pull the raw response body out of the ResponseInfo list.
        data = response_info[1]

        # Extract the JSON payload.
        follow_list_token = []
        try:
            # Decode the body into a JSON object.
            json_data = json.loads(data)

            # Extract the user-list section.
            if 'data' not in json_data:
                return
            data = json_data['data']

            # Collect each listed user's url_token.
            for follow_info in data:
                if 'url_token' in follow_info:
                    token = follow_info['url_token']
                    # Keep only tokens the dedup filter has not seen yet.
                    if self.token_filter.check_token(token) is False:
                        follow_list_token.append(token)

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('Follower & Following List 数据解析错误')
            if log.isEnabledFor(logging.DEBUG):
                log.exception(e)
            return

        # Queue a user-info URL for every new token.
        for token in follow_list_token:
            # URL info layout (list): [kind, url, token].
            url_info = ['info', self.generate_user_info_url(token), token]
            self.redis_connection.rpush(self.user_info_url_queue, url_info)

        # Extract follow relations (i.e. this user's "following" edges).
        # (Response layout: [info, data, token, followingList/followerList])
        if self.is_parser_follow_relation is True:
            # Which list this response came from.
            follow_list_type = response_info[3]
            # Token of the user whose list this is.
            token = response_info[2]
            if follow_list_type == 'followingList':
                pipe = self.redis_connection.pipeline()
                for following_token in follow_list_token:
                    # Relation: `token` follows `following_token`.
                    follow_relation = {
                        FOLLOW_FROM: token,
                        FOLLOW_TO: following_token
                    }
                    pipe.rpush(self.follow_relation_persistent_cache,
                               follow_relation)
                pipe.execute()
# Example #3
    def check_and_restart(self):
        """Restart any processor thread whose status is ``'error'``.

        A dead thread is replaced in ``processor_list`` by a fresh
        ``ProcessThread`` built with the same thread id and the same shared
        collaborators, and the replacement is started immediately.
        """
        # Iterate over a snapshot: the original removed elements from the
        # very list it was iterating, which makes the list iterator skip
        # the element that follows each removal.
        for process_thread in list(self.processor_list):
            if process_thread.thread_status == 'error':
                thread_id = process_thread.thread_id
                self.processor_list.remove(process_thread)
                del process_thread
                new_thread = ProcessThread(thread_id, self.redis_connection, self.token_filter,
                                           self.response_buffer, self.is_parser_following_list,
                                           self.is_parser_follower_list, self.is_parser_follow_relation)
                self.processor_list.append(new_thread)
                new_thread.start()

                if log.isEnabledFor(logging.ERROR):
                    log.error('数据处理器线程[' + thread_id + ']重新启动')
    def common_login(self):
        """Log in to Zhihu with e-mail + password.

        Fetches the ``_xsrf`` CSRF token from the main page, posts the
        login form, then verifies authentication against ``authTestURL``.
        On success the session cookies are kept in ``self.auth_token``.

        Returns:
            bool: True on success, False on any failure.  (The original
            fell off the except branch and implicitly returned ``None``;
            the explicit ``False`` keeps the return type consistent —
            both values are falsy, so callers are unaffected.)
        """
        # Fresh session for this login attempt.
        session = requests.session()
        session.headers = requestHeader

        # Fetch _xsrf.
        try:
            response = session.get(mainPageURL)
            input_tag = BeautifulSoup(response.text, 'html.parser').find(
                'input', attrs={'name': '_xsrf'})
            if input_tag is None:
                return False
            _xsrf = input_tag['value']

            # Post the login form.
            form_data = {
                '_xsrf': _xsrf,
                'email': self.login_token,
                'password': self.password
            }
            # NOTE(review): this mutates the module-level requestHeader
            # dict, so the XSRF headers leak into every later session that
            # reuses it — confirm that is intended.
            requestHeader.update({
                'X-Requested-With': 'XMLHttpRequest',
                'X-Xsrftoken': _xsrf
            })
            session.headers = requestHeader
            response = session.post(url=loginURL, data=form_data)
            if response.status_code == 200:
                # Check whether we are actually authenticated.
                response = session.get(authTestURL)
                if response.status_code == 200:
                    # Keep the auth cookies for later requests.
                    self.auth_token = session.cookies.get_dict()
                    if log.isEnabledFor(logging.INFO):
                        log.info('知乎账户登陆成功')
                    return True

            # Login failed.
            if log.isEnabledFor(logging.INFO):
                log.info('知乎账户登陆失败')
            return False
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error(e)
            # Report failure explicitly instead of returning None.
            return False
        finally:
            session.close()
# Example #5
    def run(self):
        """Proxy-service main loop: start worker threads, then supervise.

        Spawns ``PROXY_VALIDATE_THREAD_NUM`` proxy-validation threads plus
        one pool-scan thread, then polls every 180 s and replaces any
        thread whose ``status`` is ``'error'``.
        """
        # Load configuration.
        self.init()

        # Start the proxy-validation threads.
        validate_thread_list = []
        for i in range(PROXY_VALIDATE_THREAD_NUM):
            validate_thread = ProxyValidateThread()
            validate_thread_list.append(validate_thread)
            validate_thread.start()
        if log.isEnabledFor(logging.DEBUG):
            log.debug("代理验证线程启动")

        # Start the proxy-pool scan thread.
        scan_thread = ProxyPoolScanThread()
        scan_thread.start()
        if log.isEnabledFor(logging.DEBUG):
            log.debug("代理池扫描线程启动")

        # Supervise: restart any thread that died with an error.
        while True:
            # Check the validation threads.  Iterate over a snapshot —
            # the original removed from the list while iterating it,
            # which skips the element after each removal.
            for thread in list(validate_thread_list):
                if thread.status == 'error':
                    validate_thread_list.remove(thread)
                    thread = ProxyValidateThread()
                    validate_thread_list.append(thread)
                    thread.start()
                    # BUG FIX: the original tested `log.error(logging.ERROR)`
                    # (which logs the constant and returns None), so the
                    # restart message was never emitted.
                    if log.isEnabledFor(logging.ERROR):
                        log.error('代理验证线程重新启动')

            # Check the pool-scan thread.
            if scan_thread.status == 'error':
                scan_thread = ProxyPoolScanThread()
                scan_thread.start()
                if log.isEnabledFor(logging.ERROR):
                    log.error("代理池扫描线程重新启动")

            time.sleep(180)
    def send_message(self, email_content):
        """Send *email_content* as a UTF-8 plain-text e-mail.

        The subject is the configured header followed by a
        ``[month-day hour:minute:second]`` timestamp.  Delivery failures
        are logged and swallowed — this method never raises.
        """
        # Build the subject line with the current timestamp.
        now = datetime.datetime.now()
        subject = '{}[{}-{} {}:{}:{}]'.format(self.smtp_email_header,
                                              now.month, now.day,
                                              now.hour, now.minute,
                                              now.second)

        # Assemble the message.
        message = MIMEText(email_content, 'plain', 'utf-8')
        message['from'] = self.smtp_from_addr
        message['to'] = self.smtp_to_addr
        message['Subject'] = Header(subject, 'utf-8').encode()

        # Deliver, logging (but not propagating) any failure.
        try:
            server = smtplib.SMTP(self.smtp_server_host,
                                  self.smtp_server_port)
            server.login(self.smtp_from_addr, self.smtp_server_password)
            server.sendmail(self.smtp_from_addr, [self.smtp_to_addr],
                            message.as_string())
            server.quit()
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error("邮件发送失败")
                log.exception(e)
# Example #7
    def parse_user_info(self, response_info):
        """Parse a user profile page and fan out follow-list URLs.

        ``response_info`` is a list ``[kind, data, token]`` where ``data``
        is the raw HTML of the profile page and ``token`` the user's
        url_token.  The profile fields are extracted from the JSON embedded
        in the page's ``<div id="data" data-state="...">`` attribute.  On
        success the token is marked as seen, following/follower list URLs
        are queued (when enabled), and the profile entity is pushed onto
        the persistence cache.
        """
        # Pull fields out of the ResponseInfo list.
        data = response_info[1]
        token = response_info[2]

        # Extract the profile JSON embedded in the HTML.
        user_info_entities = None
        try:
            bs_obj = BeautifulSoup(data, 'html.parser')
            data_json = bs_obj.find('div', attrs={'id': 'data'})
            if data_json is None:
                return
            else:
                # The JSON lives in the div's data-state attribute.
                data_json = data_json['data-state']

            # String clean-up:
            # undo HTML entity escaping...
            data_json = html.unescape(data_json)
            # ...then strip any remaining HTML tags.
            data_json = BeautifulSoup(data_json, 'html.parser').text

            # Decode into a JSON object.
            data_json = json.loads(data_json)

            # Drill down to the entities section.
            if 'entities' not in data_json:
                return
            entities = data_json['entities']

            # Then to the users table.
            if 'users' not in entities:
                return
            users = entities['users']

            # And finally to the record for the requested token.
            if token not in users:
                return
            user_info = users[token]

            # Pick out the individual profile fields (all optional —
            # missing keys leave the corresponding value at its default).
            avatar_url_template = None
            name = None
            headline = None
            locations = []
            business = None
            employments = []
            educations = []
            description = None
            gender = None
            following_count = None
            follower_count = None
            answer_count = None
            question_count = None
            voteup_count = None
            if USER_AVATAR_URL_TEMPLATE in user_info:
                avatar_url_template = user_info[USER_AVATAR_URL_TEMPLATE]

            if USER_NAME in user_info:
                name = user_info[USER_NAME]

            if USER_HEADLINE in user_info:
                headline = user_info[USER_HEADLINE]

            if USER_LOCATIONS in user_info:
                for location in user_info[USER_LOCATIONS]:
                    locations.append(location['name'])

            if USER_BUSINESS in user_info:
                business = user_info[USER_BUSINESS]['name']

            if USER_EMPLOYMENTS in user_info:
                for employment in user_info[USER_EMPLOYMENTS]:
                    # Each employment may carry a job and/or a company.
                    elem = {}
                    if 'job' in employment:
                        job = employment['job']['name']
                        elem.update({'job': job})
                    if 'company' in employment:
                        company = employment['company']['name']
                        elem.update({'company': company})
                    employments.append(elem)

            if USER_EDUCATIONS in user_info:
                for education in user_info[USER_EDUCATIONS]:
                    if 'school' in education:
                        school = education['school']['name']
                        educations.append(school)

            if USER_DESCRIPTION in user_info:
                description = user_info[USER_DESCRIPTION]

            if USER_GENDER in user_info:
                gender = user_info[USER_GENDER]

            if USER_FOLLOWING_COUNT in user_info:
                following_count = user_info[USER_FOLLOWING_COUNT]

            if USER_FOLLOWER_COUNT in user_info:
                follower_count = user_info[USER_FOLLOWER_COUNT]

            if USER_ANSWER_COUNT in user_info:
                answer_count = user_info[USER_ANSWER_COUNT]

            if USER_QUESTION_COUNT in user_info:
                question_count = user_info[USER_QUESTION_COUNT]

            if USER_VOTE_UP_COUNT in user_info:
                voteup_count = user_info[USER_VOTE_UP_COUNT]

            # Assemble the user-info entity.
            user_info_entities = {
                USER_AVATAR_URL_TEMPLATE: avatar_url_template,
                USER_URL_TOKEN: token,
                USER_NAME: name,
                USER_HEADLINE: headline,
                USER_LOCATIONS: locations,
                USER_BUSINESS: business,
                USER_EMPLOYMENTS: employments,
                USER_EDUCATIONS: educations,
                USER_DESCRIPTION: description,
                USER_GENDER: gender,
                USER_FOLLOWING_COUNT: following_count,
                USER_FOLLOWER_COUNT: follower_count,
                USER_ANSWER_COUNT: answer_count,
                USER_QUESTION_COUNT: question_count,
                USER_VOTE_UP_COUNT: voteup_count
            }

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('User info 数据解析错误')
                log.exception(e)

        # Nothing extracted -> nothing to do.
        if user_info_entities is None:
            return

        # Re-check the dedup filter; stop if this user was already handled.
        if self.token_filter.check_token(token) is True:
            return

        # Mark this user's token as processed.
        self.token_filter.mark_token(token)

        # Queue Following-list URLs (paged by offset/limit).
        if self.is_parser_following_list is True:
            pipe = self.redis_connection.pipeline()
            following_count = user_info_entities[USER_FOLLOWING_COUNT]
            if following_count is not None:
                offset = 0
                limit = 20
                while offset < following_count:
                    url_info = [
                        'list',
                        self.generate_following_info_url(token, offset, limit),
                        token, 'followingList'
                    ]
                    offset += limit
                    pipe.rpush(self.follow_info_url_queue, url_info)
                pipe.execute()

        # Queue Follower-list URLs (paged by offset/limit).
        if self.is_parser_follower_list is True:
            pipe = self.redis_connection.pipeline()
            follower_count = user_info_entities[USER_FOLLOWER_COUNT]
            if follower_count is not None:
                offset = 0
                limit = 20
                while offset < follower_count:
                    url_info = [
                        'list',
                        self.generate_follower_info_url(token, offset, limit),
                        token, 'followerList'
                    ]
                    offset += limit
                    pipe.rpush(self.follow_info_url_queue, url_info)
                pipe.execute()

        # Hand the extracted entity to the persistence cache.
        # NOTE(review): the guard checks the DEBUG level but logs via
        # log.info() — confirm which level is intended.
        if log.isEnabledFor(logging.DEBUG):
            log.info('成功获取一个用户的详细信息')
        self.redis_connection.rpush(self.persistent_cache, user_info_entities)
# Example #8
    def start_spider_core(self):
        """Boot every spider subsystem, then supervise them forever.

        Startup order: Redis connection (3 attempts) -> MySQL connection
        -> response buffer -> account login -> Downloader -> Scheduler ->
        DataPersistent -> Processor -> optional EmailService.  After a
        successful start, loops every 180 s restarting any module thread
        that reported an error.  Returns early (abandoning startup) when
        the databases cannot be reached or login fails.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('Spider 开始启动')

        try:
            # Connect to Redis, retrying a few times before giving up.
            redis_connect_retry_times = 3
            while redis_connect_retry_times > 0:
                self.redis_connection = redis.StrictRedis(
                    host=self.redis_host,
                    port=self.redis_port,
                    db=self.redis_db,
                    password=self.redis_password)
                ping = self.redis_connection.ping()
                if ping is True:
                    if log.isEnabledFor(logging.INFO):
                        log.info('Redis 服务器连接成功')
                    break
                else:
                    if log.isEnabledFor(logging.INFO):
                        log.info('Redis 服务器连接失败')
                    redis_connect_retry_times -= 1
                    time.sleep(5)

            # Abort startup if Redis never became reachable.
            if redis_connect_retry_times <= 0:
                raise Exception()

            # Connect to MySQL.
            self.mysql_connection = pymysql.connect(host=self.mysql_host,
                                                    user=self.mysql_username,
                                                    passwd=self.mysql_password,
                                                    db=self.mysql_database,
                                                    charset=self.mysql_charset)

        except Exception as e:
            # NOTE(review): this message only mentions Redis, but the
            # handler also catches MySQL connection failures.
            if log.isEnabledFor(logging.ERROR):
                log.error('Redis 启动失败')
                log.exception(e)
            return

        # Response buffer shared by the downloader and the processor.
        self.response_buffer = ResponseBuffer()

        # Start the account manager and log in.
        self.account_manager = AccountManager(self.login_token, self.password,
                                              self.is_login_by_cookie,
                                              self.z_c0)
        is_login = self.account_manager.login()
        if not is_login:
            return

        # Start the Downloader.
        self.downloader = Downloader(
            self.redis_connection, self.response_buffer, self.account_manager,
            self.is_proxy_service_enable, self.session_pool_size,
            self.download_thread_num, self.network_retry_times,
            self.connect_timeout, self.download_interval)
        self.downloader.start_downloader()

        # Start the Scheduler.
        self.schedule = Scheduler(self.redis_connection, self.url_rate)
        self.schedule.start()

        # Start DataPersistent.
        self.dataPersistent = DataPersistent(
            self.persistent_cache_size,
            self.follow_relation_persistent_cache_size, self.mysql_connection,
            self.redis_connection)
        self.dataPersistent.start_data_persistent()

        # Start the Processor and seed it with the initial token.
        self.processor = Processor(self.process_thread_num,
                                   self.is_parser_following_list,
                                   self.is_parser_follower_list,
                                   self.is_parser_follow_relation,
                                   self.redis_connection, self.response_buffer)
        self.processor.start_processor()
        self.processor.load_init_data(self.init_token)

        # Start the optional e-mail status service.
        if self.is_email_service_enable is True:
            self.email_service = EmailService(
                self.smtp_server_host, self.smtp_server_port,
                self.smtp_server_password, self.smtp_from_addr,
                self.smtp_to_addr, self.smtp_email_header,
                self.smtp_send_interval, self.dataPersistent)
            self.email_service.start_email_service()
            self.email_service.send_message('Spider 启动完毕')

        if log.isEnabledFor(logging.INFO):
            log.info('Spider 启动完毕')

        # Supervision loop: poll each module for failed threads.
        while True:
            # Downloader health check.
            self.downloader.check_and_restart()
            # EmailService health check.
            if self.is_email_service_enable is True:
                self.email_service.check_and_restart()
            # DataPersistent health check.
            self.dataPersistent.check_and_restart()
            # Scheduler health check (none implemented).
            # Processor health check.
            self.processor.check_and_restart()
            # Polling interval.
            time.sleep(180)
            gc.collect()
    def run(self):
        """Persistence worker loop.

        Every 180 s, flushes the Redis user-info cache and the
        follow-relation cache into MySQL once they reach their configured
        thresholds.  On failure the offending payload is logged at DEBUG
        and the thread flags itself ``'error'`` for the supervisor.
        """
        debug_info = None
        try:
            while True:
                # Flush the user-info cache once it reaches the threshold.
                current_user_info_cache_size = self.redis_connection.llen(
                    self.persistent_cache)
                if current_user_info_cache_size >= self.persistent_cache_size:
                    # `with self.lock` releases the lock even when a DB
                    # call raises; the original bare acquire()/release()
                    # pair left the lock held forever on error, deadlocking
                    # every other user of this lock.  The try/finally
                    # likewise guarantees the cursor is closed.
                    with self.lock:
                        cursor = self.db_connection.cursor()
                        try:
                            for i in range(current_user_info_cache_size):
                                user_info = self.redis_connection.lpop(
                                    self.persistent_cache)
                                debug_info = user_info
                                if user_info is not None:
                                    # SECURITY NOTE: eval() executes whatever
                                    # is in the cache — acceptable only while
                                    # this app is the sole writer;
                                    # ast.literal_eval would be safer.
                                    user_info = self.convert_user_info(
                                        eval(user_info.decode('utf-8')))
                                    cursor.execute(INSERT_USER_INFO, [
                                        user_info[USER_AVATAR_URL_TEMPLATE],
                                        user_info[USER_URL_TOKEN],
                                        user_info[USER_NAME], user_info[USER_HEADLINE],
                                        user_info[USER_LOCATIONS],
                                        user_info[USER_BUSINESS],
                                        user_info[USER_EMPLOYMENTS],
                                        user_info[USER_EDUCATIONS],
                                        user_info[USER_DESCRIPTION],
                                        user_info[USER_GENDER],
                                        user_info[USER_FOLLOWING_COUNT],
                                        user_info[USER_FOLLOWER_COUNT],
                                        user_info[USER_ANSWER_COUNT],
                                        user_info[USER_QUESTION_COUNT],
                                        user_info[USER_VOTE_UP_COUNT]
                                    ])
                            self.db_connection.commit()
                        finally:
                            cursor.close()

                # Flush the follow-relation cache once it reaches the threshold.
                current_follow_relation_cache_size = self.redis_connection.llen(
                    self.follow_relation_persistent_cache)
                if current_follow_relation_cache_size >= self.follow_relation_persistent_cache_size:
                    with self.lock:
                        cursor = self.db_connection.cursor()
                        try:
                            for i in range(current_follow_relation_cache_size):
                                follow_relation = self.redis_connection.lpop(
                                    self.follow_relation_persistent_cache)
                                debug_info = follow_relation
                                if follow_relation is not None:
                                    follow_relation = eval(
                                        follow_relation.decode('utf-8'))
                                    cursor.execute(INSERT_FOLLOW_RELATION, [
                                        follow_relation[FOLLOW_FROM],
                                        follow_relation[FOLLOW_TO]
                                    ])
                            self.db_connection.commit()
                        finally:
                            cursor.close()

                # Flush interval.
                time.sleep(180)

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('用户数据持久化线程异常退出')
                log.exception(e)
            # Dump the payload being processed when the thread died.
            log.debug(debug_info)
            self.thread_status = 'error'
# Example #10
    def run(self):
        """Download worker loop.

        Repeatedly takes a URL-info entry from the queue, fetches it
        through a session borrowed from the session manager, and hands
        successful responses to the response queue.  Failures are retried
        up to ``NETWORK_RETRY_TIMES``; after that the session's proxy is
        switched when the session is returned.
        """
        if log.isEnabledFor(logging.INFO):
            # str() for consistency with the other log lines below, which
            # already wrap thread_id — avoids a TypeError if it is an int.
            log.info('数据下载线程' + str(self.thread_id) + '启动')

        # First start: block until enough proxies are available.
        self.session_manager.init_get()

        # Holds a url info whose download must be retried later.
        previous_url_info = None
        while True:
            # Borrow a session from the pool.
            session = self.session_manager.get_session_connection()

            # Try to download, with bounded retries.
            network_retry_times = 0
            while network_retry_times < self.NETWORK_RETRY_TIMES:
                try:
                    # Take a new URL only if the previous one completed.
                    if previous_url_info is None:
                        url_info = self.get_url_info_from_queue()
                        previous_url_info = url_info
                    else:
                        url_info = previous_url_info
                    url = url_info[1]

                    # Fetch.
                    response = session.get(url, timeout=self.CONNECT_TIMEOUT)

                    if log.isEnabledFor(logging.DEBUG):
                        log.debug(response.status_code)

                    # Dispatch on the status code.
                    if response.status_code == 200:
                        # Reuse the url_info list as the response envelope,
                        # replacing the URL with the downloaded body.
                        response_info = url_info
                        response_info[1] = response.text
                        self.put_response_info_to_queue(response_info)
                        previous_url_info = None
                        if log.isEnabledFor(logging.DEBUG):
                            log.debug('下载成功')
                        break
                    elif response.status_code == 403:
                        # Authentication rejected; previous_url_info keeps
                        # the url so it is retried on a later iteration.
                        if log.isEnabledFor(logging.ERROR):
                            log.error('账号认证失败')
                        break
                    elif response.status_code == 429:
                        # Rate limited: keep the url for a later retry.
                        if log.isEnabledFor(logging.DEBUG):
                            log.debug('[' + str(self.thread_id) + ']' +
                                      '访问太频繁,稍候重新访问,响应码为:' +
                                      str(response.status_code))
                        previous_url_info = url_info
                        break
                    elif response.status_code == 404 or response.status_code == 410:
                        # Gone: drop the url permanently.
                        previous_url_info = None
                        del url_info
                        break
                    else:
                        if log.isEnabledFor(logging.ERROR):
                            log.error(response.status_code)
                        network_retry_times += 1
                except Exception as e:
                    network_retry_times += 1
                    time.sleep(self.DOWNLOAD_INTERVAL)
                    if log.isEnabledFor(logging.DEBUG):
                        log.debug('[' + str(self.thread_id) + ']' +
                                  '下载异常,正在重新连接...(第' +
                                  str(network_retry_times) + '次重试)')
                    if log.isEnabledFor(logging.DEBUG):
                        # BUG FIX: was log.error() under a DEBUG-level
                        # guard; log at the level actually being checked.
                        log.debug(e)

            # Pace the downloads.
            time.sleep(self.DOWNLOAD_INTERVAL)

            # Return the session; switch its proxy after repeated failures.
            if network_retry_times < self.NETWORK_RETRY_TIMES:
                self.session_manager.return_session_connection(session)
            else:
                self.session_manager.return_and_switch_proxy(session)