Esempio n. 1
0
 def delete_item(self):
     """
     Delete the cache entry currently selected in the Listbox.

     Asks the user for a double confirmation before issuing the Redis
     delete; shows an error dialog if Redis is unreachable.
     :return: None
     """
     # Listbox entries are formatted as "key   type   extra" (three-space separator).
     key_info_list = self.cache_info_lb.get(
         self.cache_info_lb.curselection()).split('   ')
     key_id = key_info_list[0]
     # Guard clauses replace the original nested if/else with dead `pass` branches.
     result = tk.messagebox.askquestion('删除',
                                        '确定删除key为' + key_id + '的缓存?',
                                        icon='warning')
     if result != 'yes':
         return
     re_result = tk.messagebox.askquestion('删除',
                                           '你刚才点了删除,真的要删除吗?',
                                           icon='warning')
     if re_result != 'yes':
         return
     redis = RedisUtil(int(self.db_index.get()))
     try:
         redis.del_key(key_id)
     except ConnectionError:
         tk.messagebox.showerror('错误', '没有连接Redis')
Esempio n. 2
0
    def print_key_value(self, event):
        """
        Show the data for the cache entry clicked in the Listbox.

        :param event: Tk event that triggered this handler (unused).
        :return: None
        """
        self.cache_content_text.delete('1.0', tk.END)

        # The selected item is "key   type   extra" joined by three spaces.
        parts = self.cache_info_lb.get(self.cache_info_lb.curselection()).split('   ')
        key_id = parts[0]
        key_type = parts[1]
        third_value = parts[2]  # TTL string, or field name for hash/set entries

        client = RedisUtil(int(self.db_index.get()))

        if key_type in ('hash', 'set'):
            # Container types: list every field of the key in the Listbox.
            try:
                fields = client.get_key_value(key_type, key_id, None)
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
                return
            self.fillin_listbox(key_type, key_id, fields)
        elif key_type in ('string', 'list', 'hash_field', 'set_field'):
            # Scalar types: render the value into the Text widget.
            try:
                value = client.get_key_value(key_type, key_id, third_value)
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
                return
            self.cache_content_text.insert(tk.INSERT, str(value))
Esempio n. 3
0
    def print_key_value(self, event):
        """
        Display the value of the cache entry the user clicked in the Listbox.

        :param event: Tk event that triggered this handler (unused).
        :return: None
        """
        self.cache_content_text.delete('1.0', tk.END)

        # Selected row layout: "key   type   extra", separated by three spaces.
        item = self.cache_info_lb.get(self.cache_info_lb.curselection())
        fields = item.split('   ')
        key_id = fields[0]
        key_type = fields[1]
        extra = fields[2]  # TTL string, or field name for hash/set entries

        client = RedisUtil(int(self.db_index.get()))

        if key_type in ('hash', 'set'):
            # hash/set: expand all fields of the key into the Listbox.
            try:
                field_info = client.get_key_value(key_type, key_id, None)
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
                return
            self.fillin_listbox(key_type, key_id, field_info)
        elif key_type in ('string', 'list', 'hash_field', 'set_field'):
            # Scalar-like entries: show the value in the Text widget.
            try:
                value = client.get_key_value(key_type, key_id, extra)
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
                return
            self.cache_content_text.insert(tk.INSERT, str(value))
Esempio n. 4
0
    def __init__(self):
        """Set up crawler state: HTTP session, storage backends, logging, and log in."""
        self.base_url = 'https://www.zhihu.com'
        self.settings = 'https://www.zhihu.com/settings/profile'
        self.headers = {
            "User-Agent":
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
            "Referer": 'http://www.zhihu.com/',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Host': 'www.zhihu.com',
        }
        # Starting username for the crawl; filled in later.
        self.start_user = None
        # Redis set key holding IDs of users already crawled.
        self.pass_key = 'zhihu:pass'
        # Redis set key holding IDs of users that failed to crawl.
        self.fail_key = 'zhihu:fail'
        # Redis list key used as the queue of users waiting to be crawled.
        self.queue_key = 'user'
        # Pool of Zhihu accounts (login cookies) to rotate through.
        self.pool = AccountPool()
        # One requests session keeps cookies across requests.
        self.session = requests.session()
        # MongoDB stores the crawled user profiles.
        self.mongo = Mongo(database='zhihu')
        # Redis stores crawl-state bookkeeping.
        self.redis = RedisUtil(host='localhost', port=6379, namespace='zhihu')
        # Logger configuration is loaded from file; getLogger must follow fileConfig.
        logging.config.fileConfig("./Log/zhihu.conf")
        self.logger = logging.getLogger('zhihu')

        self.use_account()
Esempio n. 5
0
    def query_cache_info(self):
        """
        Query cached data according to the filters selected in the UI.

        Reads the database index, key type and key name from the widgets,
        then either shows a single key's value or lists every key in the
        selected database.
        :return: None
        """
        if self.db_index.get() in self.all_db:
            redis = RedisUtil(int(self.db_index.get()))
        else:
            tk.messagebox.showerror('错误', '参数不正确:没有选择数据库')
            return

        type_variable_value = self.type_variable.get()
        query_condition_key = self.input_value.get()

        # Clear previous results from both output widgets.
        self.cache_content_text.delete('1.0', tk.END)
        self.cache_info_lb.delete(0, tk.END)

        # 2. Query by explicit key type and key name.
        if type_variable_value in ['string', 'list', 'set', 'hash'] and \
                self.check_none(query_condition_key):
            try:
                cache_info = redis.get_key_value(type_variable_value,
                                                 query_condition_key, None)
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
                return

            # hash/set: expand every field of the key into the Listbox.
            if type_variable_value in ['hash', 'set']:
                self.fillin_listbox(type_variable_value, query_condition_key,
                                    cache_info)
                return

            # string/list: value goes to the Text widget, key summary to the
            # Listbox.  (The redundant second Listbox delete was removed: the
            # Listbox was already cleared above and nothing was added since.)
            try:
                key_info_tuple = redis.get_key_info(query_condition_key)
                result = (query_condition_key + '   ' + type_variable_value +
                          '   超时时间:' + key_info_tuple[1])
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
                return
            self.cache_info_lb.insert(tk.END, result)
            self.cache_content_text.insert(tk.INSERT, str(cache_info))

        # 1. No type selected: list every key (name/type/TTL) in the Listbox.
        elif type_variable_value == '选择数据类型':
            try:
                cache_info = redis.get_all_keys()
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
                return

            for item in cache_info:
                # NOTE(review): tmp_list is cleared on every iteration, so it
                # only ever holds the last item — confirm whether clear()
                # was meant to run once before the loop instead.
                self.tmp_list.clear()
                self.tmp_list.append(item)  # temporarily keep the item
                self.cache_info_lb.insert(tk.END, item)
Esempio n. 6
0
    def query_cache_info(self):
        """
        Run a cache query using the filters chosen in the UI.

        :return: None
        """
        if self.db_index.get() in self.all_db:
            redis = RedisUtil(int(self.db_index.get()))
        else:
            tk.messagebox.showerror('错误', '参数不正确:没有选择数据库')
            return

        selected_type = self.type_variable.get()
        key_name = self.input_value.get()

        # Clear both output widgets before showing fresh results.
        self.cache_content_text.delete('1.0', tk.END)
        self.cache_info_lb.delete(0, tk.END)

        # Case 2: a concrete key type and key name were supplied.
        if selected_type in ('string', 'list', 'set', 'hash') and \
                self.check_none(key_name):
            try:
                data = redis.get_key_value(selected_type, key_name, None)
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
                return

            # hash/set values are expanded field-by-field into the Listbox.
            if selected_type in ('hash', 'set'):
                self.fillin_listbox(selected_type, key_name, data)
                return

            # string/list: value into the Text widget, key summary into the Listbox.
            self.cache_info_lb.delete(0, tk.END)
            try:
                info = redis.get_key_info(key_name)
                result = (key_name + '   ' + selected_type +
                          '   超时时间:' + info[1])
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
                return
            self.cache_info_lb.insert(tk.END, result)
            self.cache_content_text.insert(tk.INSERT, str(data))

        # Case 1: no type chosen — list every key's name/type/TTL.
        elif selected_type == '选择数据类型':
            try:
                data = redis.get_all_keys()
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
                return

            for item in data:
                self.tmp_list.clear()
                self.tmp_list.append(item)  # temporarily keep the item
                self.cache_info_lb.insert(tk.END, item)
Esempio n. 7
0
    def _handler_for_recent_games(self) -> str:
        """Return the games recently searched by this user.

        Returns:
            str: the user's games, joined with newline separators.
        """
        store = RedisUtil(self.message.author)
        return store.get_all_games()
Esempio n. 8
0
    def _insert_game_in_db_if_exists(self):
        """Store a game name in redis when the google query mentions games.

        Triggers on messages such as ``!google apple games`` or
        ``!google game of thrones``; otherwise does nothing.
        """
        if "game" not in self.user_message_lower:
            return
        # Drop the leading command token; the remainder is the game name.
        words = self.user_message_lower.split(" ")
        game_name = " ".join(words[1:])
        RedisUtil(self.message.author).insert_game(game_name)
Esempio n. 9
0
 def __init__(self):
     """Open a MySQL connection and a Redis client for expert-data lookups."""
     # Pick the database host by where this code runs: inside the lab
     # network use the internal address, otherwise the public one.
     if utils.get_host_ip() == '10.1.13.49':
         self.HOST = '10.1.13.29'
     else:
         self.HOST = '202.107.204.50'
     # NOTE(review): credentials are hard-coded here — consider moving them
     # to configuration / environment variables.
     self.conn = MySQLdb.connect(host=self.HOST, user='******', passwd='tdlabDatabase', db='techpooldata',
                                 port=3306, charset='utf8')
     # Join tables and their ID columns, keyed by achievement kind.
     self.tables = {'paper': 'expert_paper_join', 'patent': 'expert_patent_join', 'project': 'expert_project_join'}
     self.columns = {'paper': 'PAPER_ID', 'patent': 'PATENT_ID', 'project': 'PROJECT_ID'}
     self.redis = RedisUtil()
Esempio n. 10
0
 def delete_item(self):
     """
     Delete the cache entry currently selected in the Listbox, after a
     double confirmation dialog.
     :return: None
     """
     # Listbox rows are "key   type   extra", separated by three spaces.
     selection = self.cache_info_lb.get(self.cache_info_lb.curselection())
     key_id = selection.split('   ')[0]
     first_ok = tk.messagebox.askquestion('删除', '确定删除key为' + key_id + '的缓存?', icon='warning')
     if first_ok == 'yes':
         second_ok = tk.messagebox.askquestion('删除', '你刚才点了删除,真的要删除吗?', icon='warning')
         if second_ok == 'yes':
             redis = RedisUtil(int(self.db_index.get()))
             try:
                 redis.del_key(key_id)
             except ConnectionError:
                 tk.messagebox.showerror('错误', '没有连接Redis')
Esempio n. 11
0
    def conn_redis(self):
        """
        Try to connect to Redis with the parameters typed into the dialog.

        On success the settings are persisted to ``conf/redis_conf.cfg``
        and the dialog closes; on failure a warning box is shown.
        :return: None
        """
        env = self.env_value.get()
        host = self.host_value.get()
        port = self.port_value.get()
        password = self.password_value.get()

        connected = RedisUtil(None).testConnection(
            host=host, port=port, password=password)
        if connected:
            self.top_level.destroy()  # close the Toplevel dialog
            # os.path.join is portable, unlike the original hand-built
            # backslash path which only worked on Windows.
            conf_file_path = os.path.join(os.getcwd(), 'conf', 'redis_conf.cfg')
            RedisConf().write_cfg(file_path=conf_file_path,
                                  env=env,
                                  host=host,
                                  port=port,
                                  password=password)
            tk.messagebox.showinfo('连接成功', '连接Redis成功!')
        else:
            tk.messagebox.showwarning('连接失败', '连接Redis失败!')
Esempio n. 12
0
class ZhihuCrawler:
    """BFS crawler that walks Zhihu's following graph and stores profiles.

    Redis keeps the crawl queue and pass/fail bookkeeping, MongoDB stores
    the scraped profiles, and an AccountPool supplies login cookies.
    """

    def __init__(self):
        """Set up HTTP session, storage backends, logging, and log in."""
        self.base_url = 'https://www.zhihu.com'
        self.settings = 'https://www.zhihu.com/settings/profile'
        self.headers = {
            "User-Agent":
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
            "Referer": 'http://www.zhihu.com/',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Host': 'www.zhihu.com',
        }
        # Starting username for the crawl; asked from the user on first run.
        self.start_user = None
        # Redis set of user IDs already crawled successfully.
        self.pass_key = 'zhihu:pass'
        # Redis set of user IDs that failed to crawl.
        self.fail_key = 'zhihu:fail'
        # Redis list used as the BFS queue of users still to crawl.
        self.queue_key = 'user'
        # Pool of Zhihu accounts (login cookies) to rotate through.
        self.pool = AccountPool()
        # One requests session keeps cookies across requests.
        self.session = requests.session()
        # MongoDB stores the crawled user profiles.
        self.mongo = Mongo(database='zhihu')
        # Redis stores crawl-state bookkeeping.
        self.redis = RedisUtil(host='localhost', port=6379, namespace='zhihu')
        # Logger configuration.
        logging.config.fileConfig("./Log/zhihu.conf")
        self.logger = logging.getLogger('zhihu')

        self.use_account()

    def use_account(self):
        """Switch to the next account in the pool; return the login status."""
        cookie = self.pool.get()
        if cookie is None:
            self.logger.error('NO ACCOUNT')
            return False
        self.session.cookies.update(cookie)
        return self.is_login()

    def is_login(self):
        """Return True when the current session is logged in."""
        login_code = self.session.get(self.settings,
                                      headers=self.headers,
                                      allow_redirects=False).status_code
        # A logged-out session is redirected away from the settings page,
        # so a plain 200 means we are logged in.
        return login_code == 200

    def get_user_basic(self, username):
        """Fetch a user's profile page and parse their basic information.

        :param username: Zhihu URL token of the user.
        :return: tuple of (user_info dict, list of followed usernames).
        """
        home_url = self.base_url + '/people/' + username + '/following'
        req = self.session.get(url=home_url, headers=self.headers, verify=True)
        soup = BeautifulSoup(req.text, 'lxml')

        user_info = dict()
        data = soup.find('div', id='data')['data-state']
        # json.loads' `encoding` kwarg was removed in Python 3.9; the payload
        # is already text here, so a plain loads() works on every version.
        data = json.loads(data)
        user = data['entities']['users'][username]
        followings = list(data['entities']['users'])
        followings.remove(username)

        img = soup.find('img', class_='Avatar Avatar--large UserAvatar-inner')
        user_info['avatar'] = img['src'] if img is not None else ''
        user_info['name'] = user['name']
        user_info['headline'] = user['headline']
        user_info['gender'] = 'Male' if user['gender'] else 'Female'
        user_info['description'] = user['description']
        user_info['business'] = user['business'][
            'name'] if 'business' in user.keys() else ''
        user_info['answerCount'] = int(user['answerCount'])
        user_info['favoriteCount'] = int(user['favoriteCount'])
        user_info['thankedCount'] = int(user['thankedCount'])
        user_info['followerCount'] = int(user['followerCount'])
        user_info['followingCount'] = int(user['followingCount'])
        user_info['educations'] = list()
        user_info['employments'] = list()
        user_info['locations'] = list()

        for edu in user['educations']:
            info = dict()
            info['school'] = edu['school']['name'] if 'school' in edu.keys(
            ) else ''
            info['major'] = edu['major']['name'] if 'major' in edu.keys(
            ) else ''
            user_info['educations'].append(info)
        for loc in user['locations']:
            info = dict()
            info['name'] = loc['name']
            user_info['locations'].append(info)
        for em in user['employments']:
            info = dict()
            info['name'] = em['company']['name'] if 'name' in em.keys() else ''
            info['job'] = em['job']['name'] if 'job' in em.keys() else ''
            user_info['employments'].append(info)

        user_info['create_time'] = datetime.datetime.now()
        user_info['following'] = followings

        return user_info, followings

    def following_crawler(self, depth, max_depth=5):
        """Crawl users BFS-style along the following chain.

        Sentinel items ``#0`` .. ``#max_depth-1`` in the queue mark the
        boundary between BFS levels.

        :param depth: current depth; no-op if already beyond max_depth.
        :param max_depth: maximum BFS depth to crawl.
        """
        if depth > max_depth:
            return
        depths = ['#{}'.format(i) for i in range(max_depth)]
        index = 0
        s_cnt = self.redis.ssize(self.pass_key)
        f_cnt = self.redis.ssize(self.fail_key)
        if self.redis.get(self.queue_key) is None:
            # Works on both Python 2 (raw_input) and Python 3 (input).
            try:
                read_line = raw_input
            except NameError:
                read_line = input
            self.start_user = read_line('从谁开始爬? ').strip()
            # Seed the queue with the start user and the depth-0 marker.
            # The queue key must be passed explicitly, matching every other
            # put() call in this class (the original omitted it here).
            self.redis.put(self.queue_key, self.start_user)
            self.redis.put(self.queue_key, '#0')

        while index <= max_depth:
            while not self.redis.empty(self.queue_key):
                username = self.redis.get(self.queue_key)
                try:
                    # Depth marker reached: advance to the next level.
                    index = depths.index(username)
                    break
                except ValueError:
                    pass  # an ordinary username, keep processing

                if self.redis.sismem(self.pass_key,
                                     username) or self.redis.sismem(
                                         self.fail_key, username):
                    continue

                self.logger.info('[{}]'.format(username))
                try:
                    basic, followings = self.get_user_basic(username)
                    self.redis.sadd_items(self.pass_key, username)
                    self.redis.put(self.queue_key, *tuple(followings))
                    self.mongo.save_user(basic)
                    s_cnt += 1
                except Exception as e:
                    # str(e) is portable; e.message is Python-2 only.
                    self.logger.info(str(e))
                    self.logger.info(
                        '--------{}--------failed'.format(username))
                    self.redis.sadd_items(self.fail_key, username)
                    f_cnt += 1

                # Zhihu's anti-crawler measures are aggressive and only two
                # accounts are available, so slow down periodically.
                if (f_cnt + s_cnt + 1) % 5 == 0:
                    self.logger.info(
                        '---------\nsleep at {}\n---------'.format(
                            datetime.datetime.now()))
                    time.sleep(5)
                if (f_cnt + s_cnt + 1) % 50 == 0:
                    self.logger.info(
                        '---------\nsleep at {}\n---------'.format(
                            datetime.datetime.now()))
                    time.sleep(15)
                if (f_cnt + s_cnt + 1) % 25 == 0:
                    if not self.use_account():
                        self.logger.error('Account Error')
                        raise Exception('Account Error')
                    else:
                        self.logger.info('--------\nchange account\n--------')

            if index + 1 < len(depths):
                # Push the next level's marker; guard against indexing past
                # the last marker (the original raised IndexError there).
                self.redis.put(self.queue_key, depths[index + 1])
            self.logger.info(
                '---------\nDepth {} crawled.\t Fail/Success: {}/{} got\n----------'
                .format(index, f_cnt, s_cnt))
            index = index + 1