class RedisManager():
    """Thin wrapper around a RedisBloom client that maintains a URL
    bloom filter (key ``bf_urls``) and records hit/miss statistics."""

    def __init__(self, settings, stats):
        """
        :param settings: mapping providing REDIS_HOST / REDIS_PORT /
            REDIS_PASSWORD via ``.get()``
        :param stats: stats collector exposing ``inc_value(key)``
        """
        self.logger = logging.getLogger(__name__)
        self.settings = settings
        self.stats = stats

        # Plain locals, so snake_case (they are not module constants).
        redis_host = self.settings.get('REDIS_HOST')
        redis_port = self.settings.get('REDIS_PORT')
        redis_password = self.settings.get('REDIS_PASSWORD')

        try:
            self.rb = Client(host=redis_host,
                             port=redis_port,
                             password=redis_password)
            self.logger.info("Successfully connected to redis server")
        except Exception as e:
            # Keep the manager usable (best-effort): leave a predictable
            # None attribute instead of an undefined name.
            self.rb = None
            # Lazy %-formatting: no string work unless the record is emitted.
            self.logger.error("Unable to connect to redis server: %s", e)

    def _bf_add_url_(self, url):
        """Add *url* to the ``bf_urls`` bloom filter and bump stats.

        Errors are logged, never raised (best-effort semantics).
        """
        try:
            bf_add = self.rb.bfAdd('bf_urls', url)
            if bf_add:
                self.stats.inc_value('redis/bloomfilter/added_urls')
                self.logger.info(f"Added '{url}' to bloomfilter.")
            else:
                # bfAdd returns 0 when the item was already present.
                self.logger.error(f"Couldn't add '{url}' to bloomfilter")
        except Exception as e:
            self.logger.error(e)

    def _bf_check_url_pres_(self, url):
        """Return True if *url* is (probably) in the bloom filter.

        Bloom filters can yield false positives but never false negatives.
        """
        if self.rb.bfExists('bf_urls', url):
            self.logger.debug(f"Found '{url}' in bloomfilter")
            self.stats.inc_value('redis/bloomfilter/existing_urls')
            return True
        else:
            self.logger.debug(f"Couldn't find '{url}' in bloomfilter")
            self.stats.inc_value('redis/bloomfilter/not_existing_urls')
            return False


# if __name__ == '__main__':
#     rm = RedisManager()
#     rm._bf_add_url_("test1")
Example #2
0
from redisbloom.client import Client

# Redis runs inside Docker on a VM; use the VM's ip address and exposed port.
rb = Client(host='node01', port=6379)
rb.bfAdd('urls', 'baidu')
rb.bfAdd('urls', 'google')
print(rb.bfExists('urls', 'baidu'))  # out: 1
print(rb.bfExists('urls', 'tencent2'))  # out: 0

# bfMAdd / bfMExists handle several items in a single round trip.
rb.bfMAdd('urls', 'a', 'b')
print(rb.bfMExists('urls', 'google', 'baidu', 'tencent'))  # out: [1, 1, 0]
Example #3
0
def get_item(key, item):
    """Return whether *item* is (probably) present in the bloom filter *key*."""
    client = Client(connection_pool=pool)
    return client.bfExists(key, item)
Example #4
0
class Follow(object):
    """Crawl the follow and fan lists of weibo.cn users, deduplicating
    user ids through a RedisBloom filter (key ``uidfilter``)."""

    def __init__(self, config):
        """Initialize the crawler from *config*, which must contain
        'cookie' and 'user_id_list' (a list of ids, or the path of a
        txt file holding them)."""
        self.rb = Client()  # RedisBloom client used for uid deduplication
        self.filter_redis_key = 'uidfilter'
        self.validate_config(config)
        self.cookie = {'Cookie': config['cookie']}
        user_id_list = config['user_id_list']
        if not isinstance(user_id_list, list):
            # A relative path is resolved against this source file's directory.
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            user_id_list = self.get_user_list(user_id_list)
        self.user_id_list = user_id_list  # ids of the weibo users to crawl
        self.user_id = ''
        self.follow_list = []  # uri/nickname of every followed account found
        self.fans_list = []  # uri/nickname of every fan account found
        self.file_name = 'user_id_list' + str(time()) + '.txt'

    def validate_config(self, config):
        """Exit the program unless user_id_list is a list or the path
        of an existing .txt file."""
        user_id_list = config['user_id_list']
        if (not isinstance(user_id_list,
                           list)) and (not user_id_list.endswith('.txt')):
            sys.exit(u'user_id_list值应为list类型或txt文件路径')
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            if not os.path.isfile(user_id_list):
                sys.exit(u'不存在%s文件' % user_id_list)

    def deal_html(self, url):
        """Fetch *url* with the stored cookie and return an lxml
        selector; returns None if the request or parsing fails."""
        try:
            html = requests.get(url, cookies=self.cookie, verify=False).content
            selector = etree.HTML(html)
            return selector
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_page_num(self):
        """Return the number of pages in the follow list."""
        url = "https://weibo.cn/%s/follow" % self.user_id
        selector = self.deal_html(url)
        # The hidden input named 'mp' carries the page count; it is
        # absent when the list fits on a single page.
        if selector.xpath("//input[@name='mp']") == []:
            page_num = 1
        else:
            page_num = (int)(
                selector.xpath("//input[@name='mp']")[0].attrib['value'])
        return page_num

    def get_one_page(self, page):
        """Collect the user ids on page *page* of the follow list."""
        print(u'%s第%d页%s' % ('-' * 30, page, '-' * 30))
        url = 'https://weibo.cn/%s/follow?page=%d' % (self.user_id, page)
        selector = self.deal_html(url)
        table_list = selector.xpath('//table')
        if (page == 1 and len(table_list) == 0):
            print(u'cookie无效或提供的user_id无效')
        else:
            for t in table_list:
                im = t.xpath('.//a/@href')[-1]
                # Extract the uid from hrefs like '...uid=123&...' or '/123'.
                uri = im.split('uid=')[-1].split('&')[0].split('/')[-1]
                nickname = t.xpath('.//a/text()')[0]
                # Bloom filter replaces the old list-membership check:
                # if {'uri': uri, 'nickname': nickname} not in self.follow_list:
                if self.rb.bfExists(self.filter_redis_key, uri) == 0:
                    self.rb.bfAdd(self.filter_redis_key, uri)
                    self.follow_list.append({'uri': uri, 'nickname': nickname})
                    print(u'%s %s' % (nickname, uri))

    def get_follow_list(self):
        """Crawl every page of the current user's follow list."""
        page_num = self.get_page_num()
        print(u'用户关注页数:' + str(page_num))
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'关注列表爬取进度'):
            self.get_one_page(page)

            # Pause every 1-5 pages to reduce the chance of being blocked.
            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)

        print(u'用户关注列表爬取完毕')

    def get_fans_page_num(self):
        """Return the number of pages in the fan list."""
        url = "https://weibo.cn/%s/fans" % self.user_id
        selector = self.deal_html(url)
        if selector.xpath("//input[@name='mp']") == []:
            page_num = 1
        else:
            page_num = (int)(
                selector.xpath("//input[@name='mp']")[0].attrib['value'])
        return page_num

    def get_fans_one_page(self, page):
        """Collect the user ids on page *page* of the fan list."""
        print(u'%s第%d页%s' % ('-' * 30, page, '-' * 30))
        url = 'https://weibo.cn/%s/fans?page=%d' % (self.user_id, page)
        selector = self.deal_html(url)
        table_list = selector.xpath('//table')
        if (page == 1 and len(table_list) == 0):
            print(u'cookie无效或提供的user_id无效')
        else:
            for t in table_list:
                im = t.xpath('.//a/@href')[-1]
                uri = im.split('uid=')[-1].split('&')[0].split('/')[-1]
                nickname = t.xpath('.//a/text()')[0]
                # Bloom filter replaces the old list-membership check:
                #if {'uri': uri, 'nickname': nickname} not in self.fans_list:
                if self.rb.bfExists(self.filter_redis_key, uri) == 0:
                    self.rb.bfAdd(self.filter_redis_key, uri)
                    self.fans_list.append({'uri': uri, 'nickname': nickname})
                    print(u'%s %s' % (nickname, uri))

    def get_fans_list(self):
        """Crawl every page of the current user's fan list."""
        page_num = self.get_fans_page_num()
        print(u'用户关注页数:' + str(page_num))
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'关注列表爬取进度'):
            self.get_fans_one_page(page)

            # Pause every 1-5 pages to reduce the chance of being blocked.
            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)

        print(u'用户粉丝列表爬取完毕')

    def write_to_txt(self):
        """Append the collected follow and fan entries to the output file,
        encoded with the console's encoding."""
        with open(self.file_name, 'ab') as f:
            for user in self.follow_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    sys.stdout.encoding))
            for user in self.fans_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    sys.stdout.encoding))

    def get_user_list(self, file_name):
        """Read unique numeric weibo ids from *file_name* (one entry per
        line, id as the first space-separated field)."""
        with open(file_name, 'rb') as f:
            try:
                lines = f.read().splitlines()
                # utf-8-sig also strips a leading BOM if present.
                lines = [line.decode('utf-8-sig') for line in lines]
            except UnicodeDecodeError:
                sys.exit(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序' % file_name)
            user_id_list = []
            for line in lines:
                info = line.split(' ')
                if len(info) > 0 and info[0].isdigit():
                    user_id = info[0]
                    if user_id not in user_id_list:
                        user_id_list.append(user_id)
        return user_id_list

    def initialize_info(self, user_id):
        """Reset per-user state before crawling *user_id*."""
        self.follow_list = []
        self.fans_list = []
        self.user_id = user_id

    def check_unique(self, user_id):
        """Check whether user_id has already been saved.

        NOTE(review): unimplemented stub -- always returns None.
        """

    def start(self):
        """Crawl follow and fan lists for every configured user id."""
        for user_id in self.user_id_list:
            self.initialize_info(user_id)
            print(u'开始抓取:' + user_id)
            print('*' * 100)
            try:
                self.get_follow_list()  # crawl the follow list
                self.get_fans_list()  # crawl the fan list
            except Exception as e:
                print('Error: ', e)
                traceback.print_exc()
                sleep(10)  # on error skip this user instead of exiting
            self.write_to_txt()
            print(u'信息抓取完毕')
            print('*' * 100)
Example #5
0
class Redis(BaseDb):
    '''
    Proxy store: each proxy lives in a hash under ``Proxy:<ip>:<port>``
    with fields type, protocol, score, ctime; a RedisBloom filter
    (named via ``filter_name``) deduplicates proxies.
    '''

    # Trailing comma makes this a one-element tuple; the original plain
    # string happened to work (a str is accepted as a single slot name)
    # but is the classic __slots__ pitfall.
    # NOTE(review): __init__ also assigns host/pwd/port/db and
    # connect_to_redis assigns conn -- this only works if BaseDb leaves
    # instances with a __dict__; confirm against BaseDb.
    __slots__ = ('_filter_name',)

    @property
    def filter_name(self):
        """Name of the bloom filter used by the bf_* helpers."""
        return self._filter_name

    @filter_name.setter
    def filter_name(self, value):
        self._filter_name = value

    def __init__(self, host, pwd=None, port=6379, db=0):
        """Store connection parameters; call connect_to_redis() to connect."""
        super().__init__()
        self.host = host
        self.pwd = pwd
        self.port = port
        self.db = db
        self._filter_name = ''

    def connect_to_redis(self):
        """Open the RedisBloom connection; return True on success."""
        try:
            self.conn = Client(host=self.host,
                               port=self.port,
                               db=self.db,
                               password=self.pwd)
        except Exception as e:
            print(e)
            return False

        return True

    def gen_key_name(self, record):
        """Build the hash key for a proxy *record*, or None when the
        record lacks 'ip' or 'port'."""
        if 'ip' in record and 'port' in record:
            return 'Proxy:%s:%s' % (record['ip'], record['port'])
        else:
            return None

    def exists(self, key_name):
        '''
        Plain (non-bloom) existence check, kept for comparison with the
        bloom-filter path; not used in practice.
        :param key_name: redis key to test
        :return: 0 (absent) / 1 (present)
        '''
        return self.conn.exists(key_name)

    def delete(self, key_name):
        """Delete a single key."""
        return self.conn.delete(key_name)

    def delete_all(self):
        """Flush the current database."""
        return self.conn.flushdb()

    def hmset(self, record, validate_time):
        """Store one proxy record as a hash with a TTL of *validate_time*
        seconds.

        :raises InvalidFieldException: when a required field is missing.
        """
        valid_fields = ['ip', 'port', 'proxy_type', 'protocol', 'score']
        for single_valid_field in valid_fields:
            if single_valid_field not in record:
                raise InvalidFieldException(single_valid_field)

        key_name = self.gen_key_name(record)
        field_value = {
            'proxy_type': record['proxy_type'],
            'protocol': record['protocol'],
            'score': record['score'],
        }

        self.conn.hmset(key_name, field_value)
        self.conn.expire(key_name, validate_time)

    def multi_hmet(self, records, validate_time):
        """Store several proxy records (name kept for compatibility --
        'hmet' is a historical typo for 'hmset')."""
        for single_record in records:
            self.hmset(single_record, validate_time)

    def time_interval_in_seconds(self, old_date_time, new_date_time):
        '''
        Return the interval between old_date_time and new_date_time in
        whole seconds. Each argument may be a datetime or a
        '%Y-%m-%d %H:%M:%S' string.
        :raises ValueError: when a string argument has the wrong format type.
        '''

        if not helper.match_expect_type(old_date_time, 'datetime.datetime'):
            if helper.match_expect_type(old_date_time, 'str'):
                old_date_time = datetime.datetime.strptime(
                    old_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('old_date_time的格式不正确')

        if not helper.match_expect_type(new_date_time, 'datetime.datetime'):
            if helper.match_expect_type(new_date_time, 'str'):
                new_date_time = datetime.datetime.strptime(
                    new_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('new_date_time的格式不正确')

        return int((new_date_time - old_date_time).total_seconds())

    def expire(self, key_name, ttl):
        """Set *key_name* to expire after *ttl* seconds."""
        return self.conn.expire(key_name, ttl)

    def bf_create(self, fpp=0.001, capacity=1000, expansion=1):
        '''
        Create the bloom filter named ``self.filter_name``.
        :param fpp: false-positive probability
        :param capacity: number of elements the filter is sized for
        :param expansion: growth factor of each sub-filter once the
            current one fills up (1 = same size)
        :return: 0 (create failed, e.g. filter already exists) / 1 (created)
        '''
        try:
            self.conn.bfCreate(key=self._filter_name,
                               errorRate=fpp,
                               capacity=capacity,
                               expansion=expansion)
        except redis.exceptions.ResponseError:
            # Raised when the filter already exists.
            return 0
        return 1

    def bf_madd(self, records):
        """Add the key name of every record to the bloom filter.

        Bug fix: the original concatenated all key names into ONE string
        and inserted that single string; bfMAdd is variadic (cf.
        bf_mexists), so each item must be passed separately.
        """
        items = [self.gen_key_name(single_record)
                 for single_record in records]
        self.conn.bfMAdd(self._filter_name, *items)

    def bf_add(self, record):
        """Add a single record's key name to the bloom filter."""
        item = self.gen_key_name(record)

        self.conn.bfMAdd(self._filter_name, item)

    def bf_exists(self, item):
        """Return 1 if *item* is (probably) in the bloom filter, else 0."""
        return self.conn.bfExists(self._filter_name, item)

    def bf_mexists(self, items):
        '''
        :param items: list of items; unpacked into bfMExists as varargs
        :return: list of 0/1 membership flags
        '''
        return self.conn.bfMExists(self._filter_name, *items)
Example #6
0
class FullLayeredCache(LayeredCache):
    """
    Multi-Layered key value store with bloom filter and dgraph.

    Layer 1: In Memory LRU Key Value Map
    Layer 2: Redis Key Value Store
    Layer 3: Bloom filter
    Layer 4: DGraph

    The primary difference between this class and the LayeredCache class is that this
    one includes the bloom filter and DGraph.
    """
    def __init__(self, node_name: str, lru_size: int, p=1.0e-6, n=1000000):
        """
        Initialize last two layers of cache

        :param node_name: dgraph predicate to query; also used as the bloom filter key
        :param lru_size: capacity of the layer 1 LRU map
        :param p: bloom filter false-positive probability
        :param n: bloom filter expected capacity
        """
        super(FullLayeredCache, self).__init__(node_name, lru_size)

        # Set to true so we add a timeout to layer 2 redis key value stores
        self.set_timeout = True

        # Create the bloom filter client object
        # NOTE(review): 6378 is not the default redis port (6379) --
        # confirm it matches the deployed RedisBloom instance.
        self.bloom = RedisBloom(port=6378)

        # Create a dgraph client, stub, and transaction
        self.dgraph, self.stub = get_client()
        self.txn = self.dgraph.txn()

        # Initialize the bloom filter (if it doesnt already exist)
        try:
            self.bloom.bfInfo(node_name)
        except exceptions.ResponseError:
            self.bloom.bfCreate(node_name, p, n)

    def __contains__(self, key: str) -> bool:
        """
        Check to see if key is in a layer of the cache. We will start at
        layer 1 and go walk through each layer until we find a result.
        We will update previous layers if we cache miss.

        We'll return True if the key was found at a layer, False if we
        cache miss.

        Note that a layer 3 hit can be a bloom-filter false positive.

        :param key:
        :return:
        """

        # Check layer 1 and 2
        if super(FullLayeredCache, self).__contains__(key):
            return True

        # Check the layer 3 bloom filter
        exists_in_bloom = self.bloom.bfExists(self.node_name,
                                              self._get_key(key))
        if exists_in_bloom == 1:
            # Unfortunately, we can't store the actual value in the bloom filter.
            # For this, we can't update previous layers with the value for this key.
            return True

        # All else has failed, we must now check dgraph. This is super super slow.
        query = """query all($a: string) { all(func: eq(%s, $a)) { uid } }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        thing = json.loads(dgraph_result.json)
        if len(thing["all"]) > 0:
            # Update previous layers
            self[key] = thing["all"][0]["uid"]
            return True

        # Cache miss, return False
        return False

    def __getitem__(self, key: str) -> Union[str, None]:
        """
        Check each layer iteratively for the key specified. If we find the result
        at a given layer, we update previous layers with the result.

        If the result was not found, return None.

        :param key:
        :return:
        """
        # Check layer 1 and 2
        item = super(FullLayeredCache, self).__getitem__(key)
        if item is not None:
            return item

        # Check layer 3 bloom filter
        exists_in_bloom = self.bloom.bfExists(self.node_name,
                                              self._get_key(key))
        if exists_in_bloom == 1:
            # NOTE(review): returns the bool True rather than a str/None,
            # contradicting the annotated return type -- the bloom filter
            # cannot supply a value. Callers must tolerate this; confirm
            # before changing.
            return True

        # All else has failed, we must now check dgraph. This is super super slow.
        query = """query all($a: string) { all(func: eq(%s, $a)) { uid } }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        thing = json.loads(dgraph_result.json)
        if len(thing["all"]) > 0:
            # Update previous layers
            self[key] = thing["all"][0]["uid"]
            return thing["all"][0]["uid"]

        # Cache miss, return None
        return None

    def close(self):
        """
        Close all outstanding connections

        :return:
        """

        # Close the layer 2 redis connection
        super(FullLayeredCache, self).close()

        # Close layer 3 bloom filter connection
        self.bloom.close()

        # Close layer 4 dgraph connections
        self.stub.close()