Example No. 1
def bloom_url(url):
    """Return 1 if the URL is new (and record it), 0 if it was seen before."""
    path = r'C:\spiders\zhilian_celery\bloom.blm'
    # Load the filter from disk if present, otherwise create a fresh one.
    if os.path.exists(path):
        with open(path, 'rb') as f:
            bf = BloomFilter.fromfile(f)
    else:
        bf = BloomFilter(10000000, 0.001)

    if url in bf:
        print(1)
        return 0
    else:
        bf.add(url)
        with open(path, 'wb') as f:
            bf.tofile(f)
        return 1
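A minimal usage sketch for the function above, assuming `import os` and a `BloomFilter` import from pybloom (the pybloom_live fork exposes the same class); the URL is a placeholder:

# Hypothetical call site.
print(bloom_url('https://example.com/job/1'))  # 1: first sighting, URL recorded on disk
print(bloom_url('https://example.com/job/1'))  # 0: already in the filter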
Example No. 2
def train_bloom_filter():
    # -- training the Bloom filter
    hot_display_names = set()
    with open('./resources/0.xml', 'rb') as f:
        for line in f:
            user = row_to_dict(line)
            hot_display_names.add(user['displayname'])

    bf = BloomFilter(len(hot_display_names), error_rate=0.001)

    for name in hot_display_names:
        bf.add(name)

    with open('./resources/hot_names_bloom_filter', 'wb') as f:
        bf.tofile(f)

    return bf
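A hedged read-back sketch for the filter saved above (the path matches the example; `row_to_dict` and the XML dump come from the surrounding project):

# Hypothetical membership test against the persisted filter.
with open('./resources/hot_names_bloom_filter', 'rb') as f:
    hot_names = BloomFilter.fromfile(f)

print('alice' in hot_names)  # True if 'alice' was stored (small false-positive chance)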
Example No. 3
class BloomCheckPipeline(object):
    def __init__(self):
        self.file_name = 'Z:/朱靖/布隆滤波器过滤文件/学科网/bloomfilter_xuekew.blm'
        self.bf = None
        self.cap_begin = 0
        self.cap_end = 0
        self.cnt = 0

    def open_spider(self, spider):
        if os.path.exists(self.file_name):
            with open(self.file_name, 'rb') as f:
                self.bf = BloomFilter.fromfile(f)
            print('open blm file success')
            self.cap_begin = len(self.bf)
            print('initial size: %d' % self.cap_begin)
        else:
            self.bf = BloomFilter(100000000, 0.001)
            print('blm file not found, creating a new filter')

    def process_item(self, item, spider):
        if item['url'] in self.bf:
            print('drop item %s: already seen' % item['title'])
            raise DropItem('drop item %s: already seen' % item['title'])
        else:
            try:
                self.bf.add(item['url'])
                self.cnt += 1
            except Exception as reason:
                print('BloomFilter error: %s' % reason)
            # Flush the filter to disk after every 10,000 new URLs.
            if self.cnt > 10000:
                self.save_blm()
                self.cnt = 0
            return item

    def save_blm(self):
        print('saving blm file')
        self.cap_end = len(self.bf)
        print('articles stored this run: %d' % (self.cap_end - self.cap_begin))
        with open(self.file_name, 'wb') as f:
            self.bf.tofile(f)

    def close_spider(self, spider):
        print('close_spider: saving blm file')
        self.save_blm()
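For context, a Scrapy pipeline like this would be enabled in the project's settings; the module path below is a hypothetical placeholder:

# settings.py -- hypothetical module path; use the real project layout.
ITEM_PIPELINES = {
    'myproject.pipelines.BloomCheckPipeline': 300,
}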
Example No. 4
class BloomCheckPipeline(object):
    def __init__(self):
        self.file_name = r'Z:/朱靖/布隆滤波器过滤文件/carpicture/BloomFiltercnki.blm'
        self.bf = None
        self.cap_begin = 0
        self.cap_end = 0
        self.cnt = 0

    def open_spider(self, spider):
        if os.path.exists(self.file_name):
            with open(self.file_name, 'rb') as f:
                self.bf = BloomFilter.fromfile(f)
            self.cap_begin = len(self.bf)  # record the size when the blm file is opened
            print('open blm file success')
            print('initial size: %d' % self.cap_begin)
        else:
            self.bf = BloomFilter(100000000, 0.001)
            print('blm file not found, creating a new one')

    def process_item(self, item, spider):
        if item['image_url'] in self.bf:
            print('drop item %s: already seen' % item['title'])
            raise DropItem('drop item %s: already seen' % item['title'])
        else:
            try:
                self.bf.add(item['image_url'])
                self.cnt += 1
            except Exception as reason:
                print('BloomFilter error: %s' % reason)
            # Save the blm file after every 10,000 new URLs.
            if self.cnt > 10000:
                self.save_blm()
                self.cnt = 0
            return item

    def save_blm(self):
        print('saving blm file')
        self.cap_end = len(self.bf)
        print('images stored this run: %d' % (self.cap_end - self.cap_begin))
        with open(self.file_name, 'wb') as f:
            self.bf.tofile(f)

    def close_spider(self, spider):
        print('close_spider: saving blm file')
        self.save_blm()
Example No. 5
class MyBloomUtil:
    def __init__(self, bloom_name):
        self.bloom_path = '%s.blm' % bloom_name
        if os.path.exists(self.bloom_path):
            with open(self.bloom_path, 'rb') as f:
                self.bf = BloomFilter.fromfile(f)
        else:
            self.bf = BloomFilter(20000, 0.001)

    def process_item(self, item):
        if item in self.bf:
            logger.info('[%s] is already in bloom.' % item)
            return None
        else:
            print('add one')
            self.bf.add(item)
            # Rewriting the file on every add is simple but slow for large filters.
            with open(self.bloom_path, 'wb') as f:
                self.bf.tofile(f)
            return item
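A hedged usage sketch; `logger` is assumed to be configured by the surrounding project, and the names below are placeholders:

# Hypothetical usage.
util = MyBloomUtil('seen_items')       # loads or creates seen_items.blm
print(util.process_item('item-123'))   # 'item-123' on first sight
print(util.process_item('item-123'))   # None: already recorded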
Example No. 6
def filter_url(self, url):
    """De-duplicate a URL.

    Many requests may be generated, so duplicates must be avoided.
    :param url: the URL to check
    :return: True if the URL is new (and was recorded), False if already seen
    """
    bloom_path = '{}.blm'.format(self.name)
    # Load the filter from disk if the file exists,
    # otherwise create a new in-memory filter.
    if os.path.exists(bloom_path):
        with open(bloom_path, 'rb') as f:
            bf = BloomFilter.fromfile(f)
    else:
        bf = BloomFilter(1000000, 0.01)
    if url in bf:
        return False
    # Not present: add the URL and persist the filter.
    bf.add(url)
    with open(bloom_path, 'wb') as f:
        bf.tofile(f)
    return True
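This reads like a Scrapy spider method (note self.name). A hedged call-site sketch; it assumes `import scrapy`, and `parse_detail` plus the CSS selector are placeholders:

# Hypothetical call site inside the same spider.
def parse(self, response):
    for href in response.css('a::attr(href)').getall():
        url = response.urljoin(href)
        if self.filter_url(url):  # True only the first time the URL is seen
            yield scrapy.Request(url, callback=self.parse_detail)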
Example No. 7
class BloomCheckFunction(object):
    def __init__(self):
        self.filename = 'bloomFilter.blm'
        if os.path.exists(self.filename):
            with open(self.filename, 'rb') as f:
                self.bf = BloomFilter.fromfile(f)
        else:
            self.bf = BloomFilter(100000000, 0.001)

    def process_item(self, data):
        # Hash the payload so arbitrarily long strings become fixed-size keys.
        data_encode_md5 = hashlib.md5(
            data.encode(encoding='utf-8')).hexdigest()
        if data_encode_md5 in self.bf:
            return False
        else:
            self.bf.add(data_encode_md5)
            return True

    def save_bloom_file(self):
        with open(self.filename, 'wb') as f:
            self.bf.tofile(f)
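A hedged usage sketch, assuming `import hashlib` plus the BloomFilter import; the sample records are placeholders:

# Hypothetical usage.
checker = BloomCheckFunction()
for record in ('alpha', 'beta', 'alpha'):
    if checker.process_item(record):
        print('new record:', record)  # fires for 'alpha' and 'beta' only
checker.save_bloom_file()             # persist once at the end, not per item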
Example No. 8
def build(
    infile,
    outfile,
    error_rate=0.0001,
    delim=None,
    column=1,
    skip_first=False,
    unhex=False,
    comment_prefix=None,
    num_items=None,
):
    print("[BUILDING] Using error-rate: {}".format(error_rate))
    if os.path.isfile(infile):
        print("[BUILDING] Reading in Hashset: {}".format(infile))
        print("[BUILDING] Calculating number of hashes...")
        if not num_items:
            num_items = get_number_of_items(infile, skip_first, comment_prefix)
        print("[BUILDING] There are {} hashes in the Hashset".format(num_items))
        print("[BUILDING] Creating bloomfilter")
        bf = BloomFilter(num_items, error_rate)
        print("[BUILDING] Inserting hashes into bloomfilter")
        for item in get_items(
            infile,
            delim=delim,
            column=column,
            skip_first=skip_first,
            unhex=unhex,
            comment_prefix=comment_prefix,
        ):
            try:
                bf.add(item)
            except Exception as e:
                print("[ERROR] {}".format(e), file=sys.stderr)
        print("[BUILDING] Hashset bloomfilter contains {} items.".format(len(bf)))
        with open(outfile, "wb") as fh:
            bf.tofile(fh)
        print("[BUILDING] Complete")
    else:
        print("[ERROR] No such file or directory: {}".format(infile), file=sys.stderr)

    return
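A hedged invocation sketch; the file names are placeholders, and `get_number_of_items`/`get_items` are assumed to be defined alongside `build` in the original tool:

# Hypothetical invocation: hashes.txt holds one hash per line, first line a header.
build(
    infile='hashes.txt',
    outfile='hashes.bloom',
    error_rate=0.0001,
    skip_first=True,
)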
Example No. 9
def bloom_file_init():
    path = '../spiders/sites.blm'
    # Load the bloom file if it exists; otherwise create a new
    # filter and save it to disk at the end.
    if os.path.exists(path):
        with open(path, 'rb') as f:
            bf = BloomFilter.fromfile(f)
    else:
        bf = BloomFilter(10000000, 0.01)

    with MongoClient(get_project_settings()['MONGODB_URL']) as client:
        sites_coll = client.site.sites
        sites_unverified_coll = client.site.sites_unverified
        for x in sites_coll.find():
            result = bf.add(x['url'])  # True if the URL was (probably) already present
            print(x['url'], ' ', result)
        for x in sites_unverified_coll.find({}):
            result = bf.add(x['url'])
            print(x['url'], ' ', result)

    with open(path, 'wb') as f:
        bf.tofile(f)
Example No. 10
animals = [
    'dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle', 'bird',
    'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear', 'chicken',
    'dolphin', 'donkey', 'crow', 'crocodile', 'testadd'
]

# If the bloom file exists, load it; otherwise create a new
# bf object and save it to disk as keys are added.
if os.path.exists('test.blm'):
    with open('test.blm', 'rb') as f:
        bf = BloomFilter.fromfile(f)
else:
    bf = BloomFilter(20000, 0.001)

# Skip keys that are already in the filter; add and persist the rest.
for i in range(10):
    if i in bf:
        print('pass')
    else:
        print('add %s' % i)
        bf.add(i)
        with open('test.blm', 'wb') as f:
            bf.tofile(f)

# Check membership
for i in range(20):
    if i in bf:
        print("written")
    else:
        print("unwritten")
Example No. 11

#
# TRAIN BLOOM FILTER
#
# -- training the Bloom filter
bf = BloomFilter(capacity=10**5, error_rate=0.01)
with open('./resources/0-users.xml', 'r') as f:
    for line in f:
        user = row_to_dict(line)
        if int(user['reputation']) > 1500:
            bf.add(user['id'])

with open('./resources/hot_user_ids.bf', 'wb') as f:
    bf.tofile(f)


class ReduceSideWithBloomFilterJob(MRJob):
    def mapper_init(self):
        with open(os.path.join(basedir, 'resources/hot_user_ids.bf'),
                  'rb') as f:
            self.filter = BloomFilter.fromfile(f)

    def mapper(self, _, line):
        entity = row_to_dict(line)

        if 'reputation' in entity:
            if int(entity['reputation']) > 1500:
                yield entity['id'], {'type': 'user', 'entity': entity}
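The mapper above emits only the user side of the join. In a reduce-side join it would typically also emit the other record type, using the Bloom filter to drop rows whose owning user cannot be hot. A hedged sketch of that extra branch (the 'owneruserid' field and 'post' type tag are assumptions, not from the source):

        # Hypothetical continuation of mapper(): forward non-user rows only when
        # the filter says the owning user might be hot; the reducer discards the
        # rare false positives.
        elif 'owneruserid' in entity:
            if entity['owneruserid'] in self.filter:
                yield entity['owneruserid'], {'type': 'post', 'entity': entity}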
Example No. 12
                    items = line.split(' ')
                    if len(items) != 4:
                        continue
                    items = items[:-1]

                    for i in range(2):
                        if not all([
                                item.startswith('<') and item.endswith('>')
                                for item in items[i:i + 2]
                        ]):
                            continue
                        key = ':'.join([items[i][1:-1], items[i + 1][1:-1]])
                        bloom.add(key)

        with open(os.path.join(blooms_path, 'spo1.bloom'), 'wb') as f:
            bloom.tofile(f)

    with open(os.path.join(blooms_path, 'spo1.bloom'), 'rb') as f:
        one_hop_bloom = BloomFilter.fromfile(f)

        ds = LC_Qaud_Linked(
            path=os.path.join(args.base_path, args.dataset_path))
        ds.load()
        ds.parse()
        for row in ds.qapairs:
            for item in row.sparql.where_clause:
                if item[0].startswith('<'):
                    key = ':'.join([item[0][1:-1], item[1][1:-1]])
                elif item[2].startswith('<'):
                    key = ':'.join([item[1][1:-1], item[2][1:-1]])
                else:
Example No. 13
class Main():
    def __init__(self):
        self.taskCode = ""

        # Read the config file
        configPath = "config.ini"
        WebConfig = configparser.ConfigParser()
        WebConfig.read(configPath, encoding='utf-8-sig')
        self.redisHost = WebConfig.get("redis", "host")
        self.redisPort = WebConfig.get("redis", "port")
        self.redisPassword = WebConfig.get("redis", "password")
        self.redisDb = WebConfig.get("redis", "database")
        self.redis_platform_address = WebConfig.get("redis","redis_platform_address")

        self.url_key_name = self.redis_platform_address+":url:" + self.taskCode
        self.redis = redis.Redis(host=self.redisHost, port=self.redisPort, decode_responses=True, password=self.redisPassword, db=self.redisDb)

        mongoHost = WebConfig.get("mongodb", "host")
        mongoPort = WebConfig.get("mongodb", "port")
        mongoUser = WebConfig.get("mongodb", "user")
        mongoPassword = WebConfig.get("mongodb", "password")
        mongourl = "mongodb://" + mongoUser + ":" + mongoPassword + "@" + mongoHost + ":" + mongoPort
        conn = pymongo.MongoClient(mongourl)
        mongoDatabase = WebConfig.get("mongodb", "database")  # Mongo database name
        self.myMongo = conn[mongoDatabase]  # database handle


        self.bloom = None

        self.webType = ""
        self.executionType =""

        # Pagination settings
        self.start_url = ""
        self.second_page_value = ""
        self.page_interval = ""
        self.end_page_value = ""
        self.url_type = ""
        self.lineListXpath = ""
        self.json_page_re = ""
        self.page_xpath = ""    #page页如果有需要提取的数据
        # 获取页面元素
        self.titleXpath = ""
        self.contentXpath = ""

        self.proxy = None
        self.proxy_url = None
        self.headers = {
            'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                           'Windows NT 6.1; Win64; x64; Trident/5.0)'),
        }  # header
        self.timeout = 10
        self.timeInterval = 0  # delay between requests
        self.post_data = ""
        self.page_num_str = ""
    # Load the Bloom filter state from the database
    def bloom_readfrom_db(self):
        bloom_dict = self.myMongo["bloom"].find_one({"_id": self.taskCode})

        if bloom_dict:  # a stored Bloom filter exists: load it
            with open("tempFile", "wb") as temp_file:
                temp_file.write(bloom_dict["bloom_data"])
            with open("tempFile", "rb") as bloom_file:
                self.bloom = BloomFilter.fromfile(bloom_file)
        else:
            self.bloom = BloomFilter(capacity=1000000, error_rate=0.00001)

    def get_proxy(self):
        ps = requests.get(self.proxy_url).text
        return ps

    # Persist the Bloom filter state to the database
    def bloom_writeto_db(self):
        bloomDbKeyName = self.redis_platform_address + ":bloom:" + self.taskCode

        # Dump the filter to a temp file, then read the bytes back.
        with open("tempFile", "wb") as temp_file:
            self.bloom.tofile(temp_file)
        with open("tempFile", "rb") as bloom_file:
            bloomData = bloom_file.read()

        bloom_dict = self.myMongo["bloom"].find_one({"_id": self.taskCode})

        if bloom_dict:  # update the existing record
            self.myMongo["bloom"].update_one({"_id": self.taskCode}, {"$set": {"bloom_data": bloomData}})
        else:
            self.myMongo["bloom"].insert_one({"_id": self.taskCode, "bloom_data": bloomData})

        logging.info("Bloom filter saved to database: " + bloomDbKeyName)

    # Build all the listing-page URLs
    def get_PageUrlList(self):
        """Build the pagination URLs."""
        urlList = []
        for i in range(int(self.second_page_value), int(self.end_page_value)):
            page_num = str(i)
            page_url = self.url_type.replace("%d", page_num)
            urlList.append(page_url)
        urlList.append(self.start_url)
        return urlList

    # Download the data at a URL
    def download(self, url):
        try:
            if self.proxy:
                proxy = self.get_proxy().strip()
                proxies = {'https': proxy}  # fetch a proxy
                response = requests.get(url, proxies=proxies, timeout=self.timeout, headers=self.headers, verify=False)
                logging.info(url)
                logging.info("proxy in use")
            else:
                response = requests.get(url, timeout=self.timeout, headers=self.headers,verify=False)

            statusCode = response.status_code
            codeStyle = cchardet.detect(response.content)["encoding"]
            if not codeStyle:
                codeStyle = "utf-8"
            webData = response.content.decode(codeStyle, errors="ignore")
            return (webData, statusCode)
        except Exception as e:
            print(e)
            return (0,0)

    def change_outqueue_num(self):
        keyName = self.redis_platform_address + ":status:" + self.taskCode  # task status key
        status_data = self.redis.get(keyName)  # fetch the status data
        print("-------------------------", self.taskCode)
        taskData = json.loads(status_data)
        taskData["outQueue"] = 1  # update the JSON payload
        keyname_data = json.dumps(taskData)  # serialize back to a string
        self.redis.set(keyName, keyname_data)  # write back to redis

    # Refresh all required attributes
    def update_attr(self):
        keyName = self.redis_platform_address + ":status:" + self.taskCode  # task status key
        status_data = self.redis.get(keyName)  # fetch the status data
        print("-------------------------", self.taskCode)

        taskData = json.loads(status_data)

        self.executionType = int(taskData["executionType"])
        self.taskCode = taskData["taskCode"]
        self.timeInterval = taskData["timeInterval"]
        self.url_key_name = self.redis_platform_address+":url:" + self.taskCode



        # Download settings
        if "proxy" in taskData:
            self.proxy = taskData["proxy"]
        else:
            self.proxy = ""
        if "proxyProductValue" in taskData:
            self.proxy_url = taskData["proxyProductValue"]
        else:
            self.proxy_url = ""

        if "timeout" in taskData:
            self.timeout = taskData["timeout"]
        else:
            self.timeout = 10


        temp_data = json.loads(taskData["templateInfo"])    #模板数据
        print(temp_data)
        try:
            self.webType = temp_data["web_type"]
        except KeyError:
            self.webType = temp_data["webType"]

        # Pagination settings
        self.start_url = temp_data["start_url"]
        self.second_page_value = int(temp_data["second_page_value"])
        if "page_interval" in temp_data:
            self.page_interval = int(temp_data["page_interval"])
        else:
            self.page_interval = 1
        self.end_page_value = int(temp_data["end_page_value"])
        self.url_type = temp_data["url_type"]
        try:
            self.lineListXpath = temp_data["line_list_xpath"]
        except KeyError:
            self.lineListXpath = temp_data["lineListXpath"]

        if "headers" in temp_data:
            self.headers = temp_data["headers"]
        else:
            self.headers = {
                'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                               'Windows NT 6.1; Win64; x64; Trident/5.0)'),
            }  # default header

        if "json_page_re" in temp_data:
            self.json_page_re = temp_data["json_page_re"]
        else:
            self.json_page_re = ""
        if "post" in temp_data:
            self.post_data = temp_data["post"]
        else:
            self.post_data = None
        if "page_num_str" in temp_data:
            self.page_num_str = temp_data["page_num_str"]
        else:
            self.page_num_str = ""

        if "page_xpath" in temp_data:
            self.page_xpath = temp_data["page_xpath"]
        else:
            self.page_xpath = ""

    def deal_html_page_data(self, base_url, line, swtich=False):  # process one row of an HTML listing page
        if self.page_xpath:
            one_data_dict = {}
            for key, keyxpath in self.page_xpath.items():
                if key == "url_xpath" or key == "url":
                    content_url = line.xpath(keyxpath)
                    if content_url:
                        endUrl = urljoin(base_url, content_url[0])
                        one_data_dict["url"] = endUrl
                        continue
                    else:  # no url extracted from this row
                        swtich = True

                keystr = line.xpath(keyxpath)
                keystr = "".join(keystr)

                if keystr == "images" or keystr == "images_xpath":  # 对图片的链接进行处理
                    keystr = urljoin(base_url, keystr)

                one_data_dict[key] = keystr
            end_data = json.dumps(one_data_dict)  # serialize the dict to a JSON string

        else:
            end_data = urljoin(base_url,line)
        return end_data,swtich

    def deal_json_page_data(self,base_url,line,swtich=False):
        if self.page_xpath:
            one_data_dict = {}
            swtich = False
            for key, keyxpath in self.page_xpath.items():
                if key == "url_xpath" or key == "url":
                    content_url = jsonpath.jsonpath(line, keyxpath)
                    if content_url:
                        endUrl = urljoin(base_url, content_url[0])
                        one_data_dict["url"] = endUrl
                        continue
                    else:  # no url extracted from this row
                        swtich = True

                keystr = jsonpath.jsonpath(line, keyxpath)
                keystr = " ".join(keystr)
                one_data_dict[key] = keystr
            end_data = json.dumps(one_data_dict)  # serialize the dict to a JSON string
        else:
            end_data = urljoin(base_url, line)
        return end_data,swtich
    # Fetch all article links (or link dicts) on the page at a URL

    def judge_url_in_bloom(self,judge_data):
        """判断url或字典里的url是否在布隆过滤器,不在的话加入布隆过滤器,并将数据加入redis"""
        if judge_data.startswith("{"):
            judge_data_json = json.loads(judge_data)
            insert_url = judge_data_json["url"]
            if insert_url in self.bloom:
                return True
            else:
                self.bloom.add(insert_url)
                print(judge_data)
                self.redis.lpush(self.url_key_name, judge_data)
                return False
        else:
            if judge_data in self.bloom:
                return True
            else:
                self.bloom.add(judge_data)
                print(judge_data)
                self.redis.lpush(self.url_key_name, judge_data)
                return False

    def get_content_url_list(self, url):
        """获取静态链接页内容"""
        """获取静态链接页内容"""
        endUrlList = []
        response = self.download(url)
        if response[1] == 200:
            ps = response[0]
            mytree = lxml.etree.HTML(ps)
            linelist = mytree.xpath(self.lineListXpath)
            for line in linelist:
                dealed_page_data, swtich = self.deal_html_page_data(url, line)
                if dealed_page_data and not swtich:  # swtich marks rows where no link was extracted
                    endUrlList.append(dealed_page_data)
        return endUrlList


    # JSON: fetch all links and other data from the JSON page at a URL
    def get_json_content_url_list(self, url):
        """获取动态链接页内容"""
        end_data_list = []
        response = self.download(url)
        if response[1] == 200:
            ps = response[0]
            ps = ps.replace("\n", "")
            if self.json_page_re:
                ps = re.compile(self.json_page_re).findall(ps)
                if ps:
                    ps = ps[0]
                else:
                    logging.info(url + "---------这个url用json_page_re处理,结果为空")
                    return
            myjson = json.loads(ps)
            linelist = jsonpath.jsonpath(myjson, self.lineListXpath)
            for line in linelist:
                one_data_dict, swtich = self.deal_json_page_data(url, line)
                if swtich:
                    continue
                end_data_list.append(one_data_dict)
        return end_data_list

    # POST-related helpers
    # Download data for a URL with POST data
    def post_download(self,url,data):
        try:
            if self.proxy == "1":
                proxy = self.get_proxy().strip()
                proxies = {'https': proxy}  # fetch a proxy
                response = requests.post(url, proxies=proxies, timeout=self.timeout, headers=self.headers, data=data)
                logging.info(url)
                logging.info("proxy in use")
            else:
                response = requests.post(url, timeout=self.timeout, headers=self.headers,data=data)

            statusCode = response.status_code
            codeStyle = cchardet.detect(response.content)["encoding"]
            if not codeStyle:
                codeStyle = "utf-8"
            webData = response.content.decode(codeStyle, errors="ignore")
            print(webData)
            return (webData, statusCode)
        except Exception as e:
            print(e)
            return (0, 0)

    def get_post_data_list(self):
        data_list = []
        for i in range(int(self.second_page_value), int(self.end_page_value),int(self.page_interval)):
            current_page_data = self.post_data.copy()
            current_page_data[self.page_num_str] = str(i)
            data_list.append(current_page_data)
        return data_list

    def post_html(self,post_data_list):
        switch = False
        for post_data in post_data_list:
            time.sleep(self.timeInterval)
            response = self.post_download(self.start_url, post_data)
            if response[1] == 200:
                ps = response[0]
                mytree = lxml.etree.HTML(ps)
                linelist = mytree.xpath(self.lineListXpath)
                for line in linelist:
                    one_data_dict, swtich_url = self.deal_html_page_data(self.start_url, line)
                    if swtich_url:
                        continue
                    judge_answer = self.judge_url_in_bloom(one_data_dict)
                    if self.executionType != 1 and judge_answer:  # incremental crawl
                        switch = True

            if switch:  # the Bloom filter saw a duplicate
                break


    def post_json(self,post_data_list):
        for post_data in post_data_list:
            swtich = False  # whether this page contained an already-seen URL

            time.sleep(self.timeInterval)
            response = self.post_download(self.start_url, post_data)
            if response[1] == 200:
                ps = response[0]
                myjson = json.loads(ps)
                linelist = jsonpath.jsonpath(myjson, self.lineListXpath)
                for line in linelist:
                    # per-row handling
                    one_data_dict, swtich_url = self.deal_json_page_data(self.start_url, line)
                    if swtich_url:  # this row has no url; skip it
                        continue

                    judge_answer = self.judge_url_in_bloom(one_data_dict)

                    if self.executionType != 1 and judge_answer:  # incremental crawl
                        swtich = True
            if swtich:
                break

    def get_post_url_list(self):
        """针对wen_type为4,即post的url变化但是post  data不变的情况
        http://www.nhsa.gov.cn/module/web/jpage/dataproxy.jsp?startrecord=%d&endrecord=%p&perpage=15
        """
        end_url_list = []
        for first_num in range(int(self.second_page_value),int(self.end_page_value),int(self.page_interval)):
            second_num = first_num+int(self.page_interval)-1
            if second_num>int(self.end_page_value):
                second_num = int(self.end_page_value)
            post_url = self.start_url.replace("%d",str(first_num)).replace("%p",str(second_num))
            end_url_list.append(post_url)
        return end_url_list

    def post_url_change(self):
        if self.page_xpath:
            switch = False
            url_list = self.get_post_url_list()
            for url in url_list:
                time.sleep(self.timeInterval)
                response = self.post_download(url,self.post_data)
                if response[1] == 200:
                    ps = response[0]
                    mytree = lxml.etree.HTML(ps)
                    linelist = mytree.xpath(self.lineListXpath)
                    for line in linelist:
                        one_data_dict = {}
                        swtich_url = False
                        for key, keyxpath in self.page_xpath.items():
                            if key == "url_xpath" or key == "url":
                                content_url = line.xpath(keyxpath)
                                if content_url:
                                    content_url = content_url[0]
                                    content_url = parse.unquote(content_url)
                                    endUrl = urljoin(self.start_url, content_url)
                                    one_data_dict["url"] = endUrl
                                    continue
                                else:  # no url extracted from this row
                                    swtich_url = True

                            keystr = line.xpath(keyxpath)
                            keystr = "".join(keystr)

                            if keystr == "images" or keystr == "images_xpath":  # 对图片的链接进行处理
                                keystr = urljoin(self.start_url, keystr)

                            one_data_dict[key] = keystr
                        if swtich_url:
                            continue
                        bloom_url = one_data_dict["url"]
                        if self.executionType != 1:  # incremental crawl
                            if bloom_url in self.bloom:
                                logging.info(self.taskCode+"判断url在布隆过滤器成功")
                                switch = True
                            else:
                                self.bloom.add(bloom_url)
                                one_data_dict = json.dumps(one_data_dict)  # serialize the dict to a JSON string
                                print(one_data_dict)
                                self.redis.lpush(self.url_key_name, one_data_dict)
                        else:
                            one_data_dict = json.dumps(one_data_dict)  # serialize the dict to a JSON string
                            print(one_data_dict)
                            self.redis.lpush(self.url_key_name, one_data_dict)

                if switch:  # the Bloom filter saw a duplicate
                    break
        else:
            swtich = False
            url_list = self.get_post_url_list()
            for url in url_list:
                time.sleep(self.timeInterval)
                response = self.post_download(url,self.post_data)
                if response[1] == 200:
                    ps = response[0]
                    mytree = lxml.etree.HTML(ps)
                    linelist = mytree.xpath(self.lineListXpath)
                    for ii in linelist:
                        content_url = parse.unquote(ii)
                        endUrl = urljoin(self.start_url, content_url)
                        if self.executionType != 1:  # incremental crawl
                            if endUrl in self.bloom:
                                logging.info(self.taskCode + "判断url在布隆过滤器成功")
                                swtich=True
                            else:
                                self.bloom.add(endUrl)
                                print(endUrl)
                                self.redis.lpush(self.url_key_name, endUrl)
                        else:
                            print(endUrl)
                            self.redis.lpush(self.url_key_name, endUrl)

                if swtich:
                    break


    def post_start(self):
        """post_data,page_num_str"""
        if self.webType == 2:  # POST + HTML
            post_data_list = self.get_post_data_list()  # build the POST payloads
            self.post_html(post_data_list)
        elif self.webType == 3:  # POST + JSON
            post_data_list = self.get_post_data_list()  # build the POST payloads
            self.post_json(post_data_list)
        else:  # web_type == 4: the URL changes but the POST data stays fixed
            self.post_url_change()

    # GET handling for HTML and JSON pages
    def get_start(self):

        # Full (non-incremental) crawl
        if self.executionType == 1:
            pageList = self.get_PageUrlList()  # pagination URLs

            for url in pageList:
                time.sleep(self.timeInterval)
                if self.webType == 0:
                    urlList = self.get_content_url_list(url)
                else:
                    urlList = self.get_json_content_url_list(url)
                time.sleep(self.timeInterval)

                for content_data in urlList:
                    print(content_data)
                    self.redis.lpush(self.url_key_name, content_data)
        # Incremental crawl
        else:
            switch = False
            if self.webType == 0:
                start_data_urlList = self.get_content_url_list(self.start_url)
            else:
                start_data_urlList = self.get_json_content_url_list(self.start_url)
            time.sleep(self.timeInterval)

            # Listing rows contain only a URL
            if not self.page_xpath:
                for start_data in start_data_urlList:  # check the first page
                    if start_data in self.bloom:
                        logging.info(self.taskCode + " url already in Bloom filter")
                        switch = True  # a previously crawled URL on page one; stop paging after this
                    else:
                        self.bloom.add(start_data)
                        print(start_data)
                        self.redis.lpush(self.url_key_name, start_data)

                if not switch:  # check page two onward
                    for pageIndex in range(int(self.second_page_value), int(self.end_page_value)):
                        swtich2 = False
                        theUrl = self.url_type.replace("%d", str(pageIndex))

                        if self.webType == 0:
                            second_content_urlList = self.get_content_url_list(theUrl)  # article links for this page
                        else:
                            second_content_urlList = self.get_json_content_url_list(theUrl)  # article links for this JSON page

                        for second_content_url in second_content_urlList:
                            if second_content_url in self.bloom:
                                logging.info(self.taskCode + "判断url在布隆过滤器成功")
                                swtich2 = True
                            else:
                                self.bloom.add(second_content_url)
                                self.redis.lpush(self.url_key_name, second_content_url)
                                print(second_content_url)
                        if swtich2:
                            break
            # The article link lives inside a dict, e.g. {"url": "http://www.nea.gov.cn/2015-01/16/c_133924732.htm", "statement_time_xpath": "2015-01-16"}
            else:
                for start_data in start_data_urlList:  # check the first page
                    start_data_json = json.loads(start_data)
                    current_url = start_data_json["url"]
                    if current_url in self.bloom:
                        logging.info(self.taskCode + " url already in Bloom filter")
                        switch = True  # a previously crawled URL on page one; stop paging after this
                    else:
                        self.bloom.add(current_url)
                        self.redis.lpush(self.url_key_name, start_data)
                        print(start_data)

                if not switch:  # check page two onward
                    for pageIndex in range(int(self.second_page_value), int(self.end_page_value)):
                        swtich2 = False
                        theUrl = self.url_type % pageIndex  # build URLs from page two onward

                        if self.webType == 0:
                            second_content_urlList = self.get_content_url_list(theUrl)  # article links for this page
                        else:
                            second_content_urlList = self.get_json_content_url_list(theUrl)  # article links for this JSON page

                        for second_content_data in second_content_urlList:
                            second_content_data_json = json.loads(second_content_data)
                            current_url = second_content_data_json["url"]
                            if current_url in self.bloom:
                                logging.info(self.taskCode + "判断url在布隆过滤器成功")
                                swtich2 = True
                            else:
                                self.bloom.add(current_url)
                                print(current_url)
                                self.redis.lpush(self.url_key_name, second_content_data)
                                print(second_content_data)
                        if swtich2:
                            break
        


    def judge_status(self, task_data):
        """Handle scheduled tasks: check a task's status and deal with the paused and stopped states."""

        task_data_json = json.loads(task_data)
        task_code = task_data_json["taskCode"]
        task_key_name = self.redis_platform_address + ":task"  # task queue key

        status_key_name = self.redis_platform_address + ":status:" + task_code  # status key
        status_data = self.redis.get(status_key_name)
        print("status_key_name", status_key_name)
        print("status_data", status_data)
        status_data = json.loads(status_data)
        status = status_data["status"]

        if status == "1" or status == "2":
            print("status: running", task_data)
            self.redis.lrem(task_key_name, 0, task_data)
            print("task removed", task_data)
            return True
        if status == "3":
            print("status: paused", task_data)
            time.sleep(1)
            return False
        if status == "4":
            print("status: stopped", task_data)
            time.sleep(1)
            self.redis.lrem(task_key_name, 0, task_data)
            print("task removed", task_data)
            return False
            return False

    def start(self):
        while True:
            task_key_name = self.redis_platform_address+":task"
            task_data_list = self.redis.lrange(task_key_name,0,100)
            print(task_data_list)
            time.sleep(5)
            for task_data in task_data_list:
                swtich = self.judge_status(task_data)  # True if the task should run

                if swtich:
                    print(self.taskCode)
                    self.taskCode = json.loads(task_data)["taskCode"]
                    self.change_outqueue_num()  # set outQueue to 1
                    self.update_attr()  # refresh task attributes

                    if self.executionType != 1:  # incremental crawl: load the Bloom filter state
                        self.bloom_readfrom_db()

                    if self.post_data or type(self.post_data) == dict:
                        self.post_start()  # handle POST crawls
                    else:
                        self.get_start()  # handle GET crawls (HTML and JSON)

                    if self.executionType != 1:
                        self.bloom_writeto_db()  # persist the Bloom filter to the database
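Example 13 round-trips the filter through a temp file because tofile/fromfile take file objects. A hedged in-memory variant (recent bitarray releases let pybloom write to a BytesIO; older versions may require a real file, as the temp-file approach assumes):

# Hypothetical in-memory serialization sketch.
import io
from pybloom_live import BloomFilter  # assumed import

bf = BloomFilter(capacity=1000000, error_rate=0.00001)
buf = io.BytesIO()
bf.tofile(buf)                  # serialize straight into memory
bloom_bytes = buf.getvalue()    # bytes for the "bloom_data" Mongo field
restored = BloomFilter.fromfile(io.BytesIO(bloom_bytes))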
Example No. 14
# You can avoid downloading the huge VC runtime by grabbing a .whl file from https://www.lfd.uci.edu/~gohlke/pythonlibs/
'''
animals = ['dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle',
           'bird', 'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear',
           'chicken', 'dolphin', 'donkey', 'crow', 'crocodile','testadd']
'''
# If the bloom file exists, load it; otherwise create a new
# bf object and save it to disk at the end.
if os.path.exists('test.blm'):
    with open('test.blm', 'rb') as f:
        bf = BloomFilter.fromfile(f)
else:
    bf = BloomFilter(20000, 0.001)

for i in range(10):
    if i in bf:
        print('pass')
    else:
        print('add %s' % i)
        bf.add(i)
        # Rewrite the bloom file after each new key (simple, but one write per add).
        with open('test.blm', 'wb') as f:
            bf.tofile(f)
for i in range(20):
    if i in bf:
        print("written")
    else:
        print("unwritten")
Example No. 15
ls = ["1049be49dc584707"]
os.chdir(r'E:\Myproject\Scan\chizhou\chizhou\spiders')

# If the bloom file exists, load it; otherwise create a new
# bf object and save it to the file once at the end.
if os.path.exists('chizhou.blm'):
    with open('chizhou.blm', 'rb') as f:
        bf = BloomFilter.fromfile(f)
else:
    bf = BloomFilter(1000000, 0.0000001)

i = 1
for room_url in ls:
    if room_url in bf:
        print('pass')
    else:
        bf.add(room_url)  # add to the bloom filter
        print('added %s item(s)' % i)
        i += 1

# Write the bloom file once at the end (a single write).
with open('chizhou.blm', 'wb') as f:
    bf.tofile(f)

print("测试git")

# cur.close()
# coon.close()