class DoubleBloom:
    """Two Bloom filters that remember which label a key was inserted with.

    Keys inserted with the value ``'0'`` go to ``zero_bloom``; keys inserted
    with any other value go to ``one_bloom``. An optional ``next_level``
    instance is consulted for keys that both filters report.
    """

    def __init__(self, values, probability):
        """Create both filters with the same settings and no next level.

        ``values`` and ``probability`` are forwarded unchanged to
        ``BloomFilter`` together with a literal ``False`` third argument.
        """
        self.zero_bloom = BloomFilter(values, probability, False)
        self.one_bloom = BloomFilter(values, probability, False)
        self.next_level = None

    def insert(self, key, value):
        """Insert ``key`` into the filter selected by ``value``."""
        target = self.zero_bloom if value == '0' else self.one_bloom
        target.insert(key)

    def get_value(self, key):
        """Return the label recorded for ``key``.

        Returns ``'0'`` or ``'1'`` when exactly one filter reports the key,
        ``None`` when neither does. When both do, the answer is delegated to
        ``next_level`` if one exists, otherwise ``'Both'`` is returned.
        """
        in_zero = self.zero_bloom.contains(key)
        in_one = self.one_bloom.contains(key)
        if in_zero and in_one:
            if self.next_level is None:
                return 'Both'
            return self.next_level.get_value(key)
        if in_zero:
            return '0'
        if in_one:
            return '1'
        return None

    def add_level(self, values, probability):
        """Attach a fresh ``DoubleBloom`` as the next level."""
        self.next_level = DoubleBloom(values, probability)
Beispiel #2
0
def test_bloom_filter():
    """Check BloomFilter constructor validation and basic membership answers."""
    # pytest's context manager for expected exceptions is ``pytest.raises``;
    # ``pytest.assertRaises`` does not exist and fails with AttributeError.

    # negative n
    with pytest.raises(ValueError):
        BloomFilter(n=-1, p=0.1)

    # negative p
    with pytest.raises(ValueError):
        BloomFilter(n=1, p=-0.1)

    # float n
    with pytest.raises(ValueError):
        BloomFilter(n=1.0, p=0.1)

    # aiming to cause collision
    blf = BloomFilter(n=3, p=0.1)
    blf.insert("Gondor")

    assert blf.is_present("Gondor 1") == True
    assert blf.is_present("Isenguard") == False
Beispiel #3
0
class Node:
    """A single node of a Bloom Tree, wrapping one Bloom filter."""

    def __init__(self, k, expected_num, fp_prob):
        """
        Build an unattached node with an empty child list.

        :param k: value later passed to ``read.kmers`` when inserting reads
        :param expected_num: first argument forwarded to ``BloomFilter``
        :param fp_prob: second argument forwarded to ``BloomFilter``
        """
        self.k = k
        self.dataset_id: Optional[str] = None
        self.filter = BloomFilter(expected_num, fp_prob)
        self.parent: Optional[Node] = None
        self.children: List[Node] = []

    def populate_dataset_info(self, dataset: List[Read]) -> None:
        """
        Label this node with the filename of the first read and insert the
        k-mers of every read in ``dataset``.
        """
        first_read = dataset[0]
        self.dataset_id = first_read.filename
        self.insert_kmers_from_dataset(dataset)

    def insert_kmers_from_dataset(self, dataset: List[Read]) -> None:
        """
        Insert every k-mer produced by ``read.kmers(self.k)`` into the filter
        """
        add_kmer = self.filter.insert
        for read in dataset:
            for kmer in read.kmers(self.k):
                add_kmer(kmer)

    def add_node_kmers(self, other: 'Node') -> None:
        """
        OR the other node's underlying filter into this node's filter in place
        :param other: The node whose filter contents are merged in
        """
        self.filter.filter |= other.filter.filter

    def num_children(self) -> int:
        """
        Returns how many direct children this node has
        """
        return len(self.children)

    def score(self, other: 'Node') -> int:
        """
        "Hamming distance" score where lower is better
        :param other: The node to compare against
        """
        ours, theirs = self.filter.filter, other.filter.filter
        return count_xor(ours, theirs)

    def get_size(self):
        """
        Returns ``sys.getsizeof`` of the node's own fields (children list,
        parent reference, dataset id, k) plus the filter's ``get_size()``
        """
        own_fields = (self.children, self.parent, self.dataset_id, self.k)
        return sum(sys.getsizeof(field) for field in own_fields) + \
            self.filter.get_size()
 def test_can_insert(self):
     """Inserting a value into a freshly built filter completes without raising."""
     bloom_filter = BloomFilter(2000, 4)
     bloom_filter.insert(5)
 def test_inserted_is_probably_contained(self):
     """A value that was inserted is reported by the ``in`` operator."""
     bloom_filter = BloomFilter(2000, 4)
     bloom_filter.insert(42)
     is_member = 42 in bloom_filter
     self.assertTrue(is_member)
Beispiel #6
0
class TwitterApiSpider(object):
    # 初始化,连接数据库,建表
    def __init__(self, seed):
        """Connect to MySQL, create the crawler tables and seed their first rows.

        Relies on module-level settings defined elsewhere in the file
        (``host``, ``db``, ``name``, ``length``, ``number``,
        ``save_frequency``).

        :param seed: user id inserted into ``user_id`` when that table has no
            row with ``id = 1``.
        """
        # NOTE(review): the database credentials are hard-coded here and in
        # check_status(); consider loading them from configuration.
        self.db = pymysql.connect(host=host,
                                  port=3306,
                                  user='******',
                                  passwd='fit4-305',
                                  db=db,
                                  charset='utf8mb4')
        self.cursor = self.db.cursor()

        self.seed = seed
        self.api_number = 0  # running counter used to pick the API token
        self.location = 0  # current checkpoint position
        self.api_error_times = 0  # number of consecutive API errors
        self.bloom_filter = BloomFilter(name=name,
                                        length=length,
                                        number=number,
                                        save_frequency=save_frequency)
        # checkpoints_id table
        sql = 'create table if not exists checkpoints_id(' \
              'checkpoints bigint not null)' \
              'ENGINE=InnoDB DEFAULT CHARSET=utf8mb4'
        self.cursor.execute(sql)
        print('建立checkpoints_id表')

        # checkpoints_info table
        sql = 'create table if not exists checkpoints_info(' \
              'checkpoints bigint not null)' \
              'ENGINE=InnoDB DEFAULT CHARSET=utf8mb4'
        self.cursor.execute(sql)
        print('建立checkpoints_info表')

        # user_id table
        sql = 'create table if not exists user_id(' \
              'id bigint not null auto_increment,' \
              'userid bigint not null,' \
              'primary key(id))' \
              'ENGINE=InnoDB DEFAULT CHARSET=utf8mb4'
        self.cursor.execute(sql)
        print('建立user_id表')

        # user_info table
        sql = 'create table if not exists user_info(' \
              'id bigint not null auto_increment,' \
              'userid bigint not null,' \
              'username varchar(64) not null,' \
              'userscreenname varchar(64) not null,' \
              'description mediumtext,' \
              'createat timestamp not null default "1970-01-02 00:00:00",' \
              'url mediumtext,' \
              'profileimageurl mediumtext,' \
              'profilebackgroundimageurl mediumtext,' \
              'location mediumtext,' \
              'timezone mediumtext,' \
              'accesslevel bigint not null,' \
              'statuscount bigint not null,' \
              'followerscount bigint not null,' \
              'friendscount bigint not null,' \
              'favouritescount bigint not null,' \
              'listedcount bigint not null,' \
              'isprotected tinyint not null,' \
              'isgeoenabled tinyint not null,' \
              'isshowallinlinemedia tinyint not null,' \
              'iscontributorsenable tinyint not null,' \
              'isfollowrequestsent tinyint not null,' \
              'isprofilebackgroundtiled tinyint not null,' \
              'isprofileusebackgroundtiled tinyint not null,' \
              'istranslator tinyint not null,' \
              'isverified tinyint not null,' \
              'vtcoffset bigint,' \
              'lang varchar(64) default "en",' \
              'biggerprofileimageurl mediumtext,' \
              'biggerprofileimageurlhttps mediumtext,' \
              'miniprofileimageurl mediumtext,' \
              'miniprofileimageurlhttps mediumtext,' \
              'originalprofileimageurl mediumtext,' \
              'originalprofileimageurlhttps mediumtext,' \
              'profilebackgroundimageurlhttps mediumtext,' \
              'profilebanneripadurl mediumtext,' \
              'profilebanneripadretinaurl mediumtext,' \
              'profilebannermobileurl mediumtext,' \
              'profilebannermobileretinaurl mediumtext,' \
              'profilebannerretinaurl mediumtext,' \
              'profilebannerurl mediumtext,' \
              'profileimageurlhttps mediumtext,' \
              'updatetime timestamp not null default now(),' \
              'sensitivity float not null,' \
              'sensitivity2 float not null,' \
              'primary key(id))ENGINE=InnoDB default CHARSET=utf8mb4'
        self.cursor.execute(sql)
        print('建立user_info表')

        # Add the seed to user_id when the table has no first row yet.
        self.check_status()
        self.cursor.execute('select userid from user_id where id = 1')
        if not self.cursor.fetchall():
            self.check_status()
            # Pass the seed as a query parameter so the driver escapes it
            # instead of splicing it into the SQL text.
            self.cursor.execute('insert into user_id (userid) value (%s)',
                                (self.seed,))
            self.db.commit()

        # Initialise the checkpoint tables with 0 when they are empty.
        self.check_status()
        self.cursor.execute('select * from checkpoints_id')
        if not self.cursor.fetchall():
            self.check_status()
            self.cursor.execute(
                'insert into checkpoints_id (checkpoints) value (0)')
            self.db.commit()

        self.check_status()
        self.cursor.execute('select * from checkpoints_info')
        if not self.cursor.fetchall():
            self.check_status()
            self.cursor.execute(
                'insert into checkpoints_info (checkpoints) value (0)')
            self.db.commit()

    # 随机返回api
    def get_random_api(self):
        """Return a tweepy API client built from the next line of ``tokens.txt``.

        The file is re-read on every call. Each non-blank line is expected to
        hold ``consumer_key,consumer_secret,access_token,access_token_secret``.
        Despite the method name, lines are chosen round-robin through
        ``self.api_number``, which is incremented on every call.
        """
        # Read the credentials; the context manager closes the file, and
        # stripping only the line terminator keeps the last secret intact
        # even when the final line has no trailing newline.
        with open('tokens.txt', 'r') as token_file:
            tokens = [
                line.rstrip('\r\n').split(',')
                for line in token_file
                if line.strip()  # ignore blank lines
            ]

        index = self.api_number % len(tokens)
        print('使用app_key序号:' + str(index))
        self.api_number = self.api_number + 1
        consumer_key = tokens[index][0]
        consumer_secret = tokens[index][1]
        access_token = tokens[index][2]
        access_token_secret = tokens[index][3]
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)

        return tweepy.API(auth,
                          proxy=proxy,
                          wait_on_rate_limit=True,
                          wait_on_rate_limit_notify=True)

    # 根据id,获取其朋友和粉丝,加入到user_id中,此函数用户中文用户id爬取
    def search_id_chinese(self, id):
        """Store the followers and friends of ``id`` that look like Chinese users.

        Every id not yet in the Bloom filter is added to it (whether or not
        it passes ``is_chinese``, so it is never checked twice) and, when
        ``is_chinese`` accepts it, inserted into the ``user_id`` table.

        :param id: user id whose followers and friends are fetched
        """
        self._collect_chinese_ids('followers_ids', id)
        self._collect_chinese_ids('friends_ids', id)

    def _collect_chinese_ids(self, method_name, id):
        """Page through one tweepy id-listing call and store unseen Chinese ids.

        :param method_name: name of the API method to call
            (``'followers_ids'`` or ``'friends_ids'``)
        :param id: user id passed to the API call
        """
        cursor = -1
        while cursor != 0:
            api = self.get_random_api()
            try:
                result = getattr(api, method_name)(user_id=id,
                                                   cursor=cursor,
                                                   count=5000)
            except Exception:
                # A failed page is retried indefinitely with the next token;
                # every 10th counted error triggers a 300s pause.
                self.api_error_times = self.api_error_times + 1
                if self.api_error_times % 10 == 0:
                    self.api_error_times = 0
                    print(
                        'something goes wrong with %s api, we will retry it in 300s'
                        % method_name)
                    time.sleep(300)
                continue

            self.api_error_times = 0  # a successful call resets the counter
            for user_id in result[0]:
                key = str(user_id)
                if self.bloom_filter.is_contain(key):
                    continue
                # Remember the id even if it is not Chinese so that it is
                # not evaluated again later.
                self.bloom_filter.insert(key)
                if self.is_chinese(user_id):
                    self.check_status()
                    self.cursor.execute(
                        'insert into user_id (userid) value (%s)',
                        (user_id,))
            cursor = result[1][1]  # next cursor; the loop stops at 0
            self.db.commit()

    # 此函数用户全网用户id爬取,不会判断是否是中国人
    def search_id(self, id):
        """Store every not-yet-seen follower and friend of ``id``.

        Unlike ``search_id_chinese`` no language check is made: each id that
        the Bloom filter does not report is inserted into the ``user_id``
        table and then added to the filter.

        :param id: user id whose followers and friends are fetched
        """
        self._collect_ids('followers_ids', id)
        self._collect_ids('friends_ids', id)

    def _collect_ids(self, method_name, id):
        """Page through one tweepy id-listing call and store unseen ids.

        :param method_name: name of the API method to call
            (``'followers_ids'`` or ``'friends_ids'``)
        :param id: user id passed to the API call
        """
        cursor = -1
        while cursor != 0:
            api = self.get_random_api()
            try:
                result = getattr(api, method_name)(user_id=id,
                                                   cursor=cursor,
                                                   count=5000)
            except Exception:
                # A failed page is retried indefinitely with the next token;
                # every 10th counted error triggers a 300s pause.
                self.api_error_times = self.api_error_times + 1
                if self.api_error_times % 10 == 0:
                    self.api_error_times = 0
                    print(
                        'something goes wrong with %s api, we will retry it in 300s'
                        % method_name)
                    time.sleep(300)
                continue

            self.api_error_times = 0  # a successful call resets the counter
            for user_id in result[0]:
                key = str(user_id)
                if self.bloom_filter.is_contain(key):
                    continue
                self.check_status()
                self.cursor.execute(
                    'insert into user_id (userid) value (%s)', (user_id,))
                self.bloom_filter.insert(key)
            cursor = result[1][1]  # next cursor; the loop stops at 0
            self.db.commit()

    # 根据user_id获取用户信息,
    def get_user_info(self):
        print('开始获取用户信息')
        sql = 'select checkpoints from checkpoints_info'
        self.check_status()
        self.cursor.execute(sql)
        result = self.cursor.fetchall()
        if result:
            if result == -1:
                print('已经获取了全部user_info')
                return
            else:
                self.location = result[0][0]
        else:
            self.location = self.location
        flag = 1
        while flag:
            api = self.get_random_api()
            sql = 'select userid from user_id limit %s, 100' % self.location  # 每次处理100个用户
            self.check_status()
            self.cursor.execute(sql)
            result = self.cursor.fetchall()
            if result:
                userids = self.tuple_to_list(result)
                try:
                    users_info = api.lookup_users(user_ids=userids)
                    self.api_error_times = 0
                except:
                    self.api_error_times = self.api_error_times + 1
                    if self.api_error_times % 10 == 0:
                        self.api_error_times = 0
                        print(
                            'something goes wrong with lookup_users api, we will retry it in 300s'
                        )
                        time.sleep(300)
                else:
                    for user_info in users_info:
                        info = user_info._json
                        userid = info['id']
                        username = info['name']
                        userscreenname = info['screen_name']
                        description = info['description'].replace('\"', '')
                        createat = info['created_at']
                        createat_timestamp = datetime.datetime.strptime(
                            createat, '%a %b %d %H:%M:%S +0000 %Y')
                        createat = createat_timestamp.strftime(
                            '%Y-%m-%d %H:%M:%S')
                        url = info['url']
                        profileimageurl = info['profile_image_url']
                        profilebackgroundimageurl = info[
                            'profile_background_image_url']
                        location = info['location']
                        timezone = info['time_zone']
                        accesslevel = 0
                        statuscount = info['statuses_count']
                        followerscount = info['followers_count']
                        friendscount = info['friends_count']
                        favouritescount = info['favourites_count']
                        listedcount = info['listed_count']
                        isprotected = info['protected']
                        isgeoenabled = info['geo_enabled']
                        isshowallinlinemedia = False
                        iscontributorsenable = info['contributors_enabled']
                        isfollowrequestsent = info['follow_request_sent']
                        isprofilebackgroundtiled = info[
                            'profile_background_tile']
                        isprofileusebackgroundtiled = False
                        istranslator = info['is_translator']
                        isverified = info['verified']
                        vtcoffset = info['utc_offset']
                        lang = info['lang']
                        biggerprofileimageurl = info['profile_image_url']
                        biggerprofileimageurlhttps = 'NULL'
                        miniprofileimageurl = 'NULL'
                        miniprofileimageurlhttps = 'NULL'
                        originalprofileimageurl = 'NULL'
                        originalprofileimageurlhttps = 'NULL'
                        profilebackgroundimageurlhttps = info[
                            'profile_background_image_url_https']
                        profilebanneripadurl = 'NULL'
                        profilebanneripadretinaurl = 'NULL'
                        profilebannermobileurl = 'NULL'
                        profilebannermobileretinaurl = 'NULL'
                        profilebannerretinaurl = 'NULL'
                        profilebannerurl = 'NULL'
                        profileimageurlhttps = info['profile_image_url_https']
                        updatetime = datetime.date.today()
                        sensitivity = 0
                        sensitivity2 = 0

                        sql = 'insert into user_info (userid, username, userscreenname, description, createat, url, ' \
                              'profileimageurl, profilebackgroundimageurl, location, timezone, accesslevel, statuscount, ' \
                              'followerscount, friendscount, favouritescount, listedcount, isprotected, isgeoenabled, ' \
                              'isshowallinlinemedia, iscontributorsenable, isfollowrequestsent, isprofilebackgroundtiled, ' \
                              'isprofileusebackgroundtiled, ' \
                              'istranslator, isverified, vtcoffset, lang, biggerprofileimageurl, biggerprofileimageurlhttps, ' \
                              'miniprofileimageurl, miniprofileimageurlhttps, originalprofileimageurl, originalprofileimageurlhttps, ' \
                              'profilebackgroundimageurlhttps, profilebanneripadurl, profilebanneripadretinaurl, ' \
                              'profilebannermobileurl, profilebannermobileretinaurl, profilebannerretinaurl, profilebannerurl, ' \
                              'profileimageurlhttps, updatetime, sensitivity, sensitivity2) value (%s, "%s", "%s", "%s", "%s", "%s", "%s", ' \
                              '"%s", "%s", "%s", %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "%s", "%s", %s, %s, %s, %s, ' \
                              '%s, "%s", %s, %s, %s, %s, %s, %s, "%s", "%s", %s, %s)' % (
                                  userid, username, userscreenname, description, createat, url, profileimageurl,
                                  profilebackgroundimageurl, location, timezone, accesslevel,
                                  statuscount, followerscount, friendscount, favouritescount, listedcount, isprotected,
                                  isgeoenabled, isshowallinlinemedia,
                                  iscontributorsenable, isfollowrequestsent, isprofilebackgroundtiled,
                                  isprofileusebackgroundtiled,
                                  istranslator, isverified, vtcoffset,
                                  lang, biggerprofileimageurl, biggerprofileimageurlhttps, miniprofileimageurl,
                                  miniprofileimageurlhttps, originalprofileimageurl,
                                  originalprofileimageurlhttps, profilebackgroundimageurlhttps, profilebanneripadurl,
                                  profilebanneripadretinaurl, profilebannermobileurl,
                                  profilebannermobileretinaurl, profilebannerretinaurl, profilebannerurl, profileimageurlhttps,
                                  updatetime, sensitivity, sensitivity2)
                        sql = sql.replace('\"None\"',
                                          'NULL').replace('None', 'NULL')
                        self.check_status()
                        # 这里用try和except是为了防止sql语句格式错误导致程序崩溃。毕竟我也很难预料到会出现什么奇怪的字符!
                        try:
                            self.cursor.execute(sql)
                        except:
                            print(sql)
                    self.db.commit()
                    self.location = self.location + 100
                    self.save_checkpoints_info(self.location)
            else:
                flag = 0
                self.save_checkpoints_info(-1)
                print('get all users info, all jobs done')

    # 获取所有的user_id
    def get_user_id(self):
        # 从checkpoints中读取出数据
        sql = 'select checkpoints from checkpoints_id'
        self.check_status()
        self.cursor.execute(sql)
        result = self.cursor.fetchall()
        if result:
            if result == -1:
                print('已经获取了全部user_id')
                return
            else:
                self.location = result[0][0]
        else:
            self.location = self.location

        # 开始迭代遍历user_id中的用户
        flag = 1
        while flag:
            sql = 'select userid from user_id limit %s, 1' % self.location
            self.check_status()
            self.cursor.execute(sql)
            result = self.cursor.fetchall()
            if result:
                userid = result[0][0]
                print('将要处理id:' + str(userid))
                if functions == 'whole_net_chinese':
                    self.search_id_chinese(userid)
                if functions == 'whole_net':
                    self.search_id(userid)
                self.location = self.location + 1
                self.save_checkpoints_id(self.location)
            else:
                flag = 0
                self.save_checkpoints_id(-1)
        print('已经获取了全部user_id')

    # 判断是否是中国人
    def is_chinese(self, id):
        """Return True when one of the user's recent tweets looks Chinese.

        Up to 10 tweets are requested. A tweet counts when it contains
        characters in the CJK range used below and no hiragana or katakana;
        the kana check is there because Japanese text shares many characters
        with Chinese. Any failure while fetching or scanning the tweets is
        treated as "not Chinese".

        :param id: user id whose timeline is inspected
        :return: True on the first matching tweet, otherwise False
        """
        # Python 3 str is already Unicode, so no decode() is needed.
        pattern_zh = re.compile(u'[\u4e00-\u9fa5]+')
        pattern_ja_ka = re.compile(u'[\u30a0-\u30ff]+')
        pattern_ja_hi = re.compile(u'[\u3040-\u309f]+')
        api = self.get_random_api()
        try:
            tweets = api.user_timeline(id=id, count=10)
            self.api_error_times = 0
            for tweet in tweets:
                text = tweet.text
                if (pattern_zh.search(text)
                        and not pattern_ja_hi.search(text)
                        and not pattern_ja_ka.search(text)):
                    print(text)
                    print('包含中文,不包含日文')
                    return True
        except Exception:
            # Catch Exception rather than everything so Ctrl-C and
            # SystemExit still stop the crawler. Every 10th counted error
            # triggers a 300s pause.
            self.api_error_times = self.api_error_times + 1
            if self.api_error_times % 10 == 0:
                self.api_error_times = 0
                print(
                    'failed to get the tweets, so we take it as no chinese!, but we will take 300s for break.'
                )
                time.sleep(300)
        return False

    # 保存id的checkpoints
    def save_checkpoints_id(self, num):
        sql = 'update checkpoints_id set checkpoints = %s' % num
        self.check_status()
        self.cursor.execute(sql)
        self.db.commit()

    # 保存info的checkpoints
    def save_checkpoints_info(self, num):
        sql = 'update checkpoints_info set checkpoints = %s' % num
        self.check_status()
        self.cursor.execute(sql)
        self.db.commit()

    # tuple 转 list
    def tuple_to_list(self, tuple):
        """Return a list holding the first element of every item in ``tuple``.

        :param tuple: iterable of indexable rows
        :return: list of ``row[0]`` values, in the original order
        """
        return [row[0] for row in tuple]

    # check status
    def check_status(self):
        """Ping the database and open a new connection if the ping fails.

        On failure ``self.db`` and ``self.cursor`` are both replaced, so
        callers must not keep references to the old cursor.
        """
        try:
            self.db.ping()
        except Exception:
            print("lost connection to database server, we need to reconnect!")
            # Use the same utf8mb4 charset as the initial connection in
            # __init__; with plain 'utf8' the reconnected session would
            # differ from the one the tables were created for.
            self.db = pymysql.connect(host=host,
                                      port=3306,
                                      user='******',
                                      passwd='fit4-305',
                                      db=db,
                                      charset='utf8mb4')
            self.cursor = self.db.cursor()
class WebParser(object):
    def __init__(self, redis_key):
        """Create the Bloom filter used to de-duplicate URLs.

        The filter is built from a Redis client for ``localhost:6379`` and
        the given ``redis_key``.
        """
        redis_client = redis.StrictRedis(host='localhost', port=6379)
        self.bloom_filter = BloomFilter(redis_client, redis_key)

    def list_zhilian(self, response):
        urls = []
        page = json.loads(response.text)

        if not len(page['data']['results']):
            return None

        for info in page['data']['results']:
            url = info.get('positionURL')
            if not self.bloom_filter.exists(url):
                self.bloom_filter.insert(url)
                urls.append(url)
        return urls

    def list_qiancheng(self, response):
        """Return the not-yet-seen job links found in an HTML listing page.

        The response encoding is switched to its apparent encoding before the
        page is parsed.
        """
        response.encoding = response.apparent_encoding
        page = Selector(response, type='html')
        hrefs = page.xpath('//div[@class="el"]/p/span/a/@href').getall()
        return self._filter_list(hrefs)

    def list_liepin(self, response):
        """Return the not-yet-seen job links found in an HTML listing page."""
        page = Selector(response, type='html')
        hrefs = page.xpath('//span[@class="job-name"]/a/@href').getall()
        return self._filter_list(hrefs)

    def list_boss(self):
        """Placeholder with no implementation yet; always returns ``None``."""
        return None

    def list_shixi(self):
        """Placeholder with no implementation yet; always returns ``None``."""
        return None

    def list_lagou(self):
        """Placeholder with no implementation yet; always returns ``None``."""
        return None

    def content_zhilian(self, response, database, url):
        """Parse one job detail page and insert the extracted record.

        When the summary block yields exactly three text items, the first two
        are stored as ``experience`` and ``education``; otherwise the raw
        list is stored under ``summary_info``.

        :param response: fetched page; its encoding is set to the apparent one
        :param database: object whose ``insert`` receives the record dict
        :param url: address of the page, stored with the record
        """
        response.encoding = response.apparent_encoding
        page = Selector(response)

        def first(path):
            return page.xpath(path).get()

        data = {
            'url': url,
            "title": first('//*[@class="summary-plane__title"]/text()'),
            "salary": first('//*[@class="summary-plane__salary"]/text()'),
            "city": first('//*[@class="summary-plane__info"]/li/a/text()'),
            "company": first('//a[@class="company__title"]/text()'),
            "company_url": first('//a[@class="company__title"]/@href'),
        }
        # The description markup is messy, so keep the raw nodes for now.
        description = page.xpath(
            '//*[@class="describtion__detail-content"]').getall()
        summary_info = page.xpath(
            '//*[@class="summary-plane__info"]/li/text()').getall()

        if len(summary_info) == 3:
            data["experience"] = summary_info[0]
            data["education"] = summary_info[1]
        else:
            data["summary_info"] = summary_info
        data["description"] = description

        database.insert(data)

    def content_qiancheng(self, response, database, url):
        """Parse one job detail page and insert the extracted record.

        The ``title`` attribute of the summary paragraph is split on the
        ``"\xa0\xa0|\xa0\xa0"`` separator. With at least three parts they are
        stored as ``city``, ``experience`` and ``education``; otherwise no
        ``city`` is stored and the unsplit/partial value goes under
        ``summary_info`` (``None`` when the attribute is missing).

        :param response: fetched page; its encoding is set to the apparent one
        :param database: object whose ``insert`` receives the record dict
        :param url: address of the page, stored with the record
        """
        response.encoding = response.apparent_encoding
        selector = Selector(response)
        title = selector.xpath('//div[@class="cn"]/h1/text()').get()
        salary = selector.xpath('//div[@class="cn"]/strong/text()').get()
        company = selector.xpath('//div[@class="com_msg"]/a/p/text()').get()
        company_url = selector.xpath('//div[@class="com_msg"]/a/@href').get()

        # The description markup is messy, so keep the raw nodes for now.
        description = selector.xpath(
            '//div[@class="bmsg job_msg inbox"]').getall()
        summary_info = selector.xpath(
            '//p[@class="msg ltype"]/@title').get()
        if summary_info:
            summary_info = summary_info.split('\xa0\xa0|\xa0\xa0')

        # get() returns None when the node is missing; guard before len()
        # so such pages are stored instead of raising TypeError.
        has_details = bool(summary_info) and len(summary_info) >= 3

        data = {'url': url, "title": title, "salary": salary}
        if has_details:
            data["city"] = summary_info[0]
        data["company"] = company
        data["company_url"] = company_url
        if has_details:
            data["experience"] = summary_info[1]
            data["education"] = summary_info[2]
        else:
            data["summary_info"] = summary_info
        data["description"] = description

        database.insert(data)

    def content_liepin(self, response, database, url):
        """Parse one job detail page and insert the extracted record.

        The salary text is stripped and the city is cut at the first ``-``.
        When the qualifications block yields at least two text items, the
        first is stored as ``education`` and the second as ``experience``;
        otherwise the raw list is stored under ``summary_info``.

        :param response: fetched page
        :param database: object whose ``insert`` receives the record dict
        :param url: address of the page, stored with the record
        """
        page = Selector(response)

        def first(path):
            return page.xpath(path).get()

        title = first('//h1[@title]/text()')
        salary = first('//p[@class="job-item-title"]/text()')
        salary = salary.strip() if salary else salary
        city = first('//p[@class="basic-infor"]/span/a/text()')
        city = city.split('-')[0] if city else city

        data = {
            'url': url,
            "title": title,
            "salary": salary,
            "city": city,
            "company": first('//div[@class="company-logo"]/p/a/text()'),
            "company_url": first('//div[@class="company-logo"]/p/a/@href'),
        }
        # The description markup is messy, so keep the raw nodes for now.
        description = page.xpath(
            '//div[@class="job-item main-message job-description"]').getall()
        summary_info = page.xpath(
            '//div[@class="job-qualifications"]/span/text()').getall()

        if len(summary_info) >= 2:
            data["experience"] = summary_info[1]
            data["education"] = summary_info[0]
        else:
            data["summary_info"] = summary_info
        data["description"] = description

        database.insert(data)

    def content_boss(self):
        """Placeholder with no implementation yet; always returns ``None``."""
        return None

    def content_shixi(self):
        """Placeholder with no implementation yet; always returns ``None``."""
        return None

    def content_lagou(self):
        """Placeholder with no implementation yet; always returns ``None``."""
        return None

    def _filter_list(self, list):
        urls = []
        for url in list:
            if not self.bloom_filter.exists(url):
                self.bloom_filter.insert(url)
                urls.append(url)
        print('done one page')
        return urls