Example 1
    def __init__(self,
                 endpoint=config.config['general']['dbpedia']['endpoint'],
                 one_hop_bloom_file=config.config['general']['dbpedia']
                 ['one_hop_bloom_file'],
                 two_hop_bloom_file=config.config['general']['dbpedia']
                 ['two_hop_bloom_file']):
        super(DBpedia, self).__init__(endpoint)
        self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
        # Load the pre-built one-hop bloom filter from disk if it exists;
        # otherwise fall back to no one-hop filtering.
        if os.path.exists(one_hop_bloom_file):
            with open(one_hop_bloom_file, 'rb') as bloom_file:
                self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
        else:
            self.one_hop_bloom = None
        self.two_hop_bloom_file = two_hop_bloom_file

        # Keep two two-hop filters keyed by True/False; each file name is derived
        # from the template path (e.g. ...spo2True..., ...spo2False...). Load each
        # one from disk if present, otherwise start an empty scalable filter.
        self.two_hop_bloom = dict()
        for item in [True, False]:
            file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
            if os.path.exists(file_path):
                with open(file_path, 'rb') as bloom_file:
                    self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(
                        bloom_file)
            else:
                self.two_hop_bloom[item] = ScalableBloomFilter(
                    mode=ScalableBloomFilter.LARGE_SET_GROWTH)

        self.two_hop_bloom_counter = 0
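
The constructor above only reads the bloom filters. As a hedged counterpart, the sketch below shows how the two-hop filters could be written back to the same derived paths with pybloom_live's tofile(), the inverse of the fromfile() calls used above; the save_blooms method name is hypothetical and not part of the original class.

    def save_blooms(self):
        # Hypothetical helper (not in the original class): persist the two-hop
        # filters so a later run can reload them with fromfile() as above.
        for item in [True, False]:
            file_path = self.two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
            with open(file_path, 'wb') as bloom_file:
                self.two_hop_bloom[item].tofile(bloom_file)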
Example 2
    async def run(self) -> None:
        try:
            with open('/data/bloom-filter', 'rb') as f:
                log('debug', 'Using saved bloom-filter')
                self.filter = ScalableBloomFilter.fromfile(f)
        except FileNotFoundError:
            log('debug', 'Creating new bloom-filter')
            self.filter = ScalableBloomFilter(initial_capacity=100000)

        self.conn_pool = await retry(
            partial(asyncpg.create_pool,
                    host='db',
                    user='******',
                    database='ipfs_crawler'), 'database', gaierror,
            ConnectionRefusedError, asyncpg.CannotConnectNowError)

        # start consumers
        for _ in range(8):
            self.workers.append(asyncio.ensure_future(self.worker()))
        # start producer
        self.producer: Future = asyncio.ensure_future(self.read_logs())
        log('info', 'Started crawling')

        # If an exception is raised in a background task, the crawler should not
        # silently keep running; gather() re-raises it here.
        await asyncio.gather(self.producer, *self.workers)
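
The run() method above only restores the filter from /data/bloom-filter. A minimal, hypothetical sketch of the matching save step (not shown in the original) could use ScalableBloomFilter.tofile():

    def save_filter(self) -> None:
        # Hypothetical counterpart to the start-up code above: write the filter
        # back to disk so the next run can restore it with fromfile().
        with open('/data/bloom-filter', 'wb') as f:
            self.filter.tofile(f)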
Example 3
import logging
import pickle

logger = logging.getLogger(__name__)  # module-level logger assumed; the original snippet omits its creation
logger.setLevel(logging.INFO)
handler = logging.FileHandler("log.txt")
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
## console handler
chhandler = logging.StreamHandler()
chhandler.setFormatter(formatter)
logger.addHandler(chhandler)
from pybloom_live import ScalableBloomFilter

bloomfilter_file = "bloomfilter.suffix"
titlefilter_file = "titlefilter.suffix"
deque_file = "sites.pkl"
try:
    # Restore the de-duplication filters and the pending-site queue from disk.
    with open(bloomfilter_file, "rb") as f:
        sbf = ScalableBloomFilter.fromfile(f)
    with open(titlefilter_file, "rb") as f:
        sbf_title = ScalableBloomFilter.fromfile(f)
    with open(deque_file, 'rb') as f:
        sites_deque = pickle.load(f)
except Exception:
    # The saved state is missing or unreadable: start with fresh filters.
    # logger.warning('de-duplication files do not exist')
    sbf = ScalableBloomFilter(
        initial_capacity=5000,
        error_rate=0.001,
        mode=ScalableBloomFilter.LARGE_SET_GROWTH,
    )
    sbf_title = ScalableBloomFilter(
        initial_capacity=5000,
        error_rate=0.001,
        mode=ScalableBloomFilter.LARGE_SET_GROWTH,
    )
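
The try/except block above only restores state. A hedged sketch of the matching shutdown step (hypothetical, not part of the original snippet) would dump the filters with tofile() and the site queue with pickle:

def save_state():
    # Hypothetical helper: persist the de-duplication filters and the pending
    # site queue so the try block above can restore them on the next start.
    with open(bloomfilter_file, "wb") as f:
        sbf.tofile(f)
    with open(titlefilter_file, "wb") as f:
        sbf_title.tofile(f)
    with open(deque_file, "wb") as f:
        pickle.dump(sites_deque, f)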