    bf.tofile(f)

# Growth modes: SMALL_SET_GROWTH doubles the capacity for small sets,
# LARGE_SET_GROWTH grows it 4x for large sets
sbf = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001,
                          mode=ScalableBloomFilter.SMALL_SET_GROWTH)
# Some information about the Bloom filter
print(sbf.filters)
# print(sbf_address, sbf_size, sbf_endianness, sbf_unused, sbf_allocated, str_filesize(sbf_size))
# The filter grows automatically once the capacity is exceeded
for i in range(0, 111000):
    try:
        sbf.add("zjl-{}".format(i))
    except Exception as e:
        print(i, e)
for bf in sbf.filters:
    address, size, endianness, unused, allocated = bf.bitarray.buffer_info()
    print(address, size, endianness, unused, allocated, str_filesize(size))
# Write the filter to a file
sbf_file = '{}/demo_sbf.bytes'.format(
    os.path.dirname(os.path.abspath(__file__)))
print('zjl-1' in sbf)
# The 'b' flag opens the file in binary mode so bytes can be written
with open(sbf_file, 'wb') as f:
    sbf.tofile(f)
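The demo above only writes the filter out. A minimal sketch of the reverse direction, assuming the same sbf_file path, using ScalableBloomFilter.fromfile() (the same classmethod the crawler below relies on):

# Minimal sketch: reload the filter serialized above and check membership.
# Assumes the sbf_file path from the demo and pybloom_live's ScalableBloomFilter.
with open(sbf_file, 'rb') as f:
    restored = ScalableBloomFilter.fromfile(f)
print('zjl-1' in restored)        # True: membership survives serialization
print('never-added' in restored)  # False, barring a small false-positive chance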
class Crawler:
    def __init__(self) -> None:
        self.queue: asyncio.Queue = asyncio.Queue(maxsize=10)
        self.workers: List[Future] = []
        self.ipfs = Ipfs()

    async def run(self) -> None:
        try:
            with open('/data/bloom-filter', 'rb') as f:
                log('debug', 'Using saved bloom-filter')
                self.filter = ScalableBloomFilter.fromfile(f)
        except FileNotFoundError:
            log('debug', 'Creating new bloom-filter')
            self.filter = ScalableBloomFilter(initial_capacity=100000)
        self.conn_pool = await retry(
            partial(asyncpg.create_pool, host='db', user='******',
                    database='ipfs_crawler'),
            'database', gaierror, ConnectionRefusedError,
            asyncpg.CannotConnectNowError)
        # start consumers
        for _ in range(8):
            self.workers.append(asyncio.ensure_future(self.worker()))
        # start producer
        self.producer: Future = asyncio.ensure_future(self.read_logs())
        log('info', 'Started crawling')
        # If an exception is thrown in a background task, the crawler should
        # not ignore it and keep running, but raise it.
        await asyncio.gather(self.producer, *self.workers)

    async def stop(self) -> None:
        # cancel producer and consumers
        self.producer.cancel()
        for w in self.workers:
            w.cancel()
        # make sure they have exited
        res = await asyncio.gather(self.producer, *self.workers,
                                   return_exceptions=True)
        for exc in res:
            if not isinstance(exc, asyncio.CancelledError):
                log('error', repr(exc))
        log('debug', 'Saving bloom-filter')
        with open('/data/bloom-filter', 'wb') as f:
            self.filter.tofile(f)
        await asyncio.gather(self.ipfs.close(), self.conn_pool.close())
        log('info', 'Exited')

    async def read_logs(self) -> NoReturn:
        while True:
            async with self.ipfs.log_tail() as log_iter:
                async for event in log_iter:
                    if event.get('Operation') == 'handleAddProvider':
                        await self.queue.put((event['Tags']['key'], ''))
            log('warning', 'Log tail restarted')

    async def worker(self) -> NoReturn:
        while True:
            hash, filename = await self.queue.get()
            if hash in self.filter:
                continue
            self.filter.add(hash)
            try:
                info = await self.parse(hash, filename)
                if info is not None:
                    await self.add_result(info)
            except asyncio.CancelledError:
                # self.parse() will probably raise CancelledError
                # when self.stop() is called. Don't log this.
                raise
            except asyncio.TimeoutError:
                log('warning', f'Timed out: {hash}')
            except IpfsError as exc:
                log('error', repr(exc))
            except Exception as exc:
                log('error', f'Failed to parse {hash}, worker exited: {exc!r}')
                raise

    async def parse(self, hash: str, filename: str) -> Union[HtmlInfo, None]:
        try:
            head = await self.ipfs.cat(hash, length=128)
        # This hash is a directory, so add the files in it to the queue. There
        # is currently no good way to determine whether it's a file or a
        # directory up front.
        except IsDirError:
            links = await self.ipfs.ls(hash)
            for link in links:
                # Using `await self.queue.put()` would block the worker; if all
                # workers are blocked, the crawler falls into a deadlock.
                # Note: the queue won't grow without bound because:
                # 1. if the queue's size >= max size, the producer stops
                #    producing
                # 2. the files in a directory are not unlimited; they will
                #    be used up sooner or later.
                self.queue._queue.append((link['Hash'], link['Name']))
            return None
        mime = magic.from_buffer(head, mime=True)
        if mime != 'text/html':
            return None
        info = await self.parse_html(hash)
        info.hash = hash
        info.filename = filename
        return info

    async def parse_html(self, hash: str) -> HtmlInfo:
        html = (await self.ipfs.cat(hash)).decode(errors='ignore')
        soup = BeautifulSoup(html, 'html.parser')
        # kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()  # rip it out
        # get text
        text = soup.get_text()
        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
        # drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)
        return HtmlInfo(title=soup.title.string if soup.title else '',
                        text=text)

    async def add_result(self, info: HtmlInfo) -> None:
        await self.conn_pool.execute(
            'INSERT INTO html(hash, filename, title, text) '
            'values ($1, $2, $3, $4)',
            # dataclass instances are not iterable, so pass the fields explicitly
            info.hash, info.filename, info.title, info.text)
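The HtmlInfo dataclass used above is defined elsewhere in the project. A plausible minimal sketch, inferred purely from how parse() and add_result() use it (the field defaults are an assumption):

# Hypothetical sketch of HtmlInfo, inferred from its usage above; the real
# definition is not part of this section.
from dataclasses import dataclass

@dataclass
class HtmlInfo:
    title: str
    text: str
    hash: str = ''       # filled in by parse()
    filename: str = ''   # filled in by parse()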
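The project's real entry point is not shown here. A hypothetical sketch of how run() and stop() could be wired together, assuming a Unix event loop (loop.add_signal_handler is Unix-only):

# Hypothetical startup/shutdown wiring (an assumption, not the project's actual
# entry point): run the crawler and call stop() when SIGINT/SIGTERM arrives.
import asyncio
import contextlib
import signal

async def main() -> None:
    crawler = Crawler()
    stop_event = asyncio.Event()
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, stop_event.set)  # Unix-only API
    run_task = asyncio.ensure_future(crawler.run())
    stop_task = asyncio.ensure_future(stop_event.wait())
    # Wake up when a shutdown signal arrives or when run() fails on its own.
    await asyncio.wait({run_task, stop_task}, return_when=asyncio.FIRST_COMPLETED)
    if run_task.done():
        run_task.result()  # re-raise whatever killed the crawler
    await crawler.stop()   # cancels producer/workers and saves the bloom-filter
    with contextlib.suppress(asyncio.CancelledError):
        await run_task     # run() finishes once its tasks are cancelled
    stop_task.cancel()

if __name__ == '__main__':
    asyncio.run(main())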