class SpellChecker:
    def __init__(self):
        self.dictionary = BloomFilter()
        word_list = open('/usr/share/dict/words', 'r')
        for word in word_list:
            self.dictionary.add(word.strip())

    def valid(self, string):
        return self.dictionary.includes(string)
def testDumpAndLoadBase64BloomFilter(self):
    bloom_filter = BloomFilter(self.BLOOM_CAPACITY, self.BLOOM_ERROR_RATE)
    for key in self.all_keys:
        bloom_filter.add(key)
    dump_str = bloom_filter.dump_to_base64_str(gzipped=True)
    bloom_filter2 = BloomFilter.load_from_base64_str(dump_str)
    self.assertEqual(bloom_filter, bloom_filter2)
    self.check_contains(bloom_filter2)

def testDumpGzippedAndLoadBloomFilter(self):
    bloom_filter = BloomFilter(self.BLOOM_CAPACITY, self.BLOOM_ERROR_RATE)
    for key in self.all_keys:
        bloom_filter.add(key)
    dump_bytes = bloom_filter.dump(gzipped=True)
    bloom_filter2 = BloomFilter.load(dump_bytes)
    self.assertEqual(bloom_filter, bloom_filter2)
    self.check_contains(bloom_filter2)
class WordLookup:
    def __init__(self, file_name):
        self.bf = BloomFilter(10000000, 8)
        input_file = open(file_name, "r")
        for file_line in input_file:
            file_line = file_line.rstrip()
            self.bf.add(file_line)
        input_file.close()

    def is_qualified(self, string):
        str_len = len(string)
        if str_len != 6:
            return False
        for i in range(1, str_len - 1):
            first = string[:i]
            second = string[i:]
            if self.bf.lookup(first) and self.bf.lookup(second):
                # print(first + '+' + second + '=>' + string)
                return True
        return False
def main():
    initial_page = "http://yue.ifeng.com"
    url_queue = queue.Queue()
    url_filter = BloomFilter()
    url_filter.add(initial_page)
    url_queue.put(initial_page)
    while True:
        urls = []
        current_url = url_queue.get()  # take the first element from the queue
        try:
            store(current_url)
            urls = extract_urls(current_url)  # extract the links found on the page
        except Exception as e:
            print("Error extract_urls")
            print(e)
        for next_url in urls:
            if next_url not in url_filter:
                url_filter.add(next_url)
                url_queue.put(next_url)
class TxPool(BaseService): """ The :class:`~trinity.tx_pool.pool.TxPool` class is responsible for holding and relaying of transactions, represented as :class:`~eth.rlp.transactions.BaseTransaction` among the connected peers. .. note:: This is a minimal viable implementation that only relays transactions but doesn't actually hold on to them yet. It's still missing many features of a grown up transaction pool. """ def __init__(self, event_bus: EndpointAPI, peer_pool: ETHProxyPeerPool, tx_validation_fn: Callable[[BaseTransactionFields], bool], token: CancelToken = None) -> None: super().__init__(token) self._event_bus = event_bus self._peer_pool = peer_pool if tx_validation_fn is None: raise ValueError('Must pass a tx validation function') self.tx_validation_fn = tx_validation_fn # 1m should give us 9000 blocks before that filter becomes less reliable # It should take up about 1mb of memory self._bloom = BloomFilter(max_elements=1000000) self._bloom_salt = str(uuid.uuid4()) # This is a rather arbitrary value, but when the sync is operating normally we never see # the msg queue grow past a few hundred items, so this should be a reasonable limit for # now. msg_queue_maxsize: int = 2000 async def _run(self) -> None: self.logger.info("Running Tx Pool") async for event in self.wait_iter(self._event_bus.stream(TransactionsEvent)): txs = cast(List[BaseTransactionFields], event.msg) await self._handle_tx(event.remote, txs) async def _handle_tx(self, sender: NodeAPI, txs: List[BaseTransactionFields]) -> None: self.logger.debug('Received %d transactions from %s', len(txs), sender) self._add_txs_to_bloom(sender, txs) for receiving_peer in await self._peer_pool.get_peers(): if receiving_peer.remote is sender: continue filtered_tx = self._filter_tx_for_peer(receiving_peer, txs) if len(filtered_tx) == 0: continue self.logger.debug2( 'Sending %d transactions to %s', len(filtered_tx), receiving_peer, ) receiving_peer.sub_proto.send_transactions(filtered_tx) self._add_txs_to_bloom(receiving_peer.remote, filtered_tx) def _filter_tx_for_peer( self, peer: ETHProxyPeer, txs: List[BaseTransactionFields]) -> List[BaseTransactionFields]: return [ val for val in txs if self._construct_bloom_entry(peer.remote, val) not in self._bloom # TODO: we need to keep track of invalid txs and eventually blacklist nodes if self.tx_validation_fn(val) ] def _construct_bloom_entry(self, remote: NodeAPI, tx: BaseTransactionFields) -> bytes: return f"{repr(remote)}-{tx.hash}-{self._bloom_salt}".encode() def _add_txs_to_bloom(self, remote: NodeAPI, txs: Iterable[BaseTransactionFields]) -> None: for val in txs: self._bloom.add(self._construct_bloom_entry(remote, val)) async def do_cleanup(self) -> None: self.logger.info("Stopping Tx Pool...")
class TxPool(BaseService, PeerPoolSubscriber): """ The :class:`~trinity.tx_pool.pool.TxPool` class is responsible for holding and relaying of transactions, represented as :class:`~evm.rlp.transactions.BaseTransaction` among the connected peers. .. note:: This is a minimal viable implementation that only relays transactions but doesn't actually hold on to them yet. It's still missing many features of a grown up transaction pool. """ logger = logging.getLogger("trinity.tx_pool.TxPool") def __init__(self, peer_pool: PeerPool) -> None: super().__init__() self._peer_pool = peer_pool # 1m should give us 9000 blocks before that filter becomes less reliable # It should take up about 1mb of memory self._bloom = BloomFilter(max_elements=1000000) self._bloom_salt = str(uuid.uuid4()) def register_peer(self, peer: BasePeer) -> None: pass async def _run(self) -> None: self.logger.info("Running Tx Pool") with self.subscribe(self._peer_pool): while True: peer: ETHPeer peer, cmd, msg = await self.wait(self.msg_queue.get(), token=self.cancel_token) if isinstance(cmd, Transactions): await self._handle_tx(peer, msg) async def _handle_tx(self, peer: ETHPeer, txs: List[BaseTransactionFields]) -> None: self.logger.debug('Received transactions from %r: %r', peer, txs) self._add_txs_to_bloom(peer, txs) for receiving_peer in self._peer_pool.peers: receiving_peer = cast(ETHPeer, receiving_peer) if receiving_peer is peer: continue filtered_tx = self._filter_tx_for_peer(receiving_peer, txs) if len(filtered_tx) == 0: continue self.logger.debug('Sending transactions to %r: %r', receiving_peer, filtered_tx) receiving_peer.sub_proto.send_transactions(filtered_tx) self._add_txs_to_bloom(receiving_peer, filtered_tx) def _filter_tx_for_peer( self, peer: BasePeer, txs: List[BaseTransactionFields]) -> List[BaseTransactionFields]: return [ val for val in txs if self._construct_bloom_entry(peer, val) not in self._bloom ] def _construct_bloom_entry(self, peer: BasePeer, tx: BaseTransactionFields) -> bytes: return "{!r}-{}-{}".format(peer.remote, tx.hash, self._bloom_salt).encode() def _add_txs_to_bloom(self, peer: BasePeer, txs: Iterable[BaseTransactionFields]) -> None: for val in txs: self._bloom.add(self._construct_bloom_entry(peer, val)) async def _cleanup(self) -> None: self.logger.info("Stopping Tx Pool...")
from bloom_filter import BloomFilter

tree = BloomFilter(max_elements=2**16, error_rate=0.1)
tree.add("nihao")
tree.add("sleep")
print(tree)

btree = {}
btree["a.fn"] = tree
print(btree)
def main():
    parser = argparse.ArgumentParser(description='Markov modeling and generation of DNS names')
    parser.add_argument('-n', metavar='COUNT', type=int, help='Number of names to generate')
    parser.add_argument('INPUT_FILE', help='input list of observed DNS names')
    args = parser.parse_args()
    count = args.n
    input_file_name = args.INPUT_FILE

    observed = BloomFilter(count * 50, 0.0001)
    suffix_map = {}
    suffix_freq = {}
    suffix_models = {}

    def preprocess(dns_name):
        return dns_name.strip().lower()

    # TODO: Should we add an epsilon to this to try and generate new suffix?
    def generate_val(dict_freq):
        rnd = random.random()
        gen_total = 0.0
        for k, v in dict_freq.items():
            gen_total = gen_total + v
            if rnd < gen_total:
                return k
        assert False

    with open(input_file_name, 'r') as input_file:
        dns_names = list(map(preprocess, input_file.readlines()))

    num_names = len(dns_names)
    for name in dns_names:
        parts = name.split('.')[1:]
        suffix = '.'.join(parts)
        t = tldextract.extract(suffix)
        if t.domain == '':
            num_names -= 1
            continue
        if suffix not in suffix_map:
            suffix_map[suffix] = [name]
            suffix_freq[suffix] = 1
        else:
            suffix_freq[suffix] += 1
            suffix_map[suffix].append(name)
        observed.add(name)

    for suffix in suffix_freq:
        suffix_freq[suffix] /= num_names

    while count > 0:
        suffix = generate_val(suffix_freq)
        if suffix not in suffix_models:
            if len(suffix_map[suffix]) > 1:
                suffix_models[suffix] = MarkovChain()
                names = map(lambda x: x.replace('.' + suffix, ''), suffix_map[suffix])
                suffix_models[suffix].train(list(names))
            else:
                continue
        name = suffix_models[suffix].generate_name() + '.' + suffix
        if name not in observed:
            observed.add(name)
            count -= 1
            print(name)
def main(args): """ Parses command line arguments, and does the work of the program. "args" specifies the program arguments, with args[0] being the executable name. The return value should be used as the program's exit code. """ options = parse_args(args) # This holds the nicely-parsed options object if options.k == 0: sys.stderr.write("Cannot use empty k-mers\n") sys.exit(1) if options.n == 0: sys.stderr.write("Cannot use no k-mers\n") sys.exit(1) sys.stderr.write('Load FASTA...\n') # We access the same strings so many times that if we index here we use many times the genome's size in memory. # There may be non-cached string views involved. # Just load the whole FASTA. index = SeqIO.to_dict(SeqIO.parse(options.fasta, "fasta")) sys.stderr.write('Analyze sequence sizes...\n') # Compute lengths of all sequences, for sampling sequence_names = list(index.keys()) sequence_lengths = [len(index[name]) for name in sequence_names] # Weight sequences by how many full-length kmers fit in them sequence_weights = [max(l - (options.k - 1), 0) for l in sequence_lengths] # And get ready to look up how many start positions we have available in each sequence available_starts_by_name = dict(zip(sequence_names, sequence_weights)) if sum(sequence_weights) == 0: # We can't sample anything actually sys.stderr.write("No long enough sequences in file\n") sys.exit(1) sys.stderr.write('Sample {}-mers...\n'.format(options.k)) # Make the bloom filter for kmers and RCs acceptable = BloomFilter(max_elements=options.n, error_rate=options.bloom_error) with enlighten.Counter(total=options.n, desc='Sample', unit='{}-mers'.format(options.k)) as bar: # This will be all the k-mers we want to count. # We use a counter because if we sample the same k-mer multiple times at the # sampling stage we want to count it multiple times for mappability # assessment. kmers = collections.Counter() kmers_sampled = 0 while kmers_sampled < options.n: # Sample a k-mer sequence_name = random.choices(sequence_names, sequence_weights)[0] kmer_start = random.randint( 0, available_starts_by_name[sequence_name]) kmer = index[sequence_name].seq[kmer_start:kmer_start + options.k] # Convert to upper case kmer = kmer.upper() if not all_ACGT(kmer): # Reject this one for having unacceptable letters continue # All k-mers shall be forward strand # Note that we are looking for this k-mer kmers[kmer] += 1 # And that we sampled one. kmers_sampled += 1 # Record it and its RC in the Bloom filter acceptable.add(kmer) acceptable.add(kmer.reverse_complement()) bar.update() sys.stderr.write('Count {}-mers...\n'.format(options.k)) # Now traverse the whole FASTA and count counts = collections.Counter() with enlighten.Counter(total=sum(sequence_weights), desc='Count', unit='{}-mers'.format(options.k)) as bar: # We will do the counting in processes processes = multiprocessing.pool.Pool(options.thread_count) # We put the AsyncResults in this queue and handle them as they become ready. # A straggler could make it get a bit big. result_queue = collections.deque() def handle_result(result): """ Process the return value from a counting job. Runs in main thread. """ # See if something is done reply_counts, reply_kmers_processed = result # If so, handle it for kmer, count in reply_counts.items(): if kmer in kmers: # Not a bloom filter false positive counts[kmer] += count # Also see if the reverse complement is there. # If we sampled a kmer and its RC then seeing one should count for both. 
# If we sampled a plaindrome we should count it twice every time we see it on the forward strand. rc = kmer.reverse_complement() if rc in kmers: counts[rc] += count bar.update(reply_kmers_processed) for sequence in index.values(): # Pre upper case everything sequence = sequence.upper() # Where is the past-end for the k-mer start positions? start_past_end = max(len(sequence) - (options.k - 1), 0) # Where will the next batch start in this sequence cursor = 0 while cursor < start_past_end: # Work out how big a batch to make this_batch_size = min(options.batch_size, start_past_end - cursor) # Find the piece of sequence to process. # Make sure to provide the tail end where k-mer starts aren't. part = sequence.seq[cursor:cursor + this_batch_size + (options.k - 1)] async_result = processes.apply_async( count_kmers, (options.k, acceptable, part)) result_queue.append(async_result) cursor += this_batch_size while len(result_queue) > 0 and result_queue[0].ready(): # Pop off tasks that are done handle_result(result_queue[0].get()) result_queue.popleft() while len(result_queue) > options.thread_count * 4: # Too many things waiting. Wait and pick some up before continuing. handle_result(result_queue[0].get()) result_queue.popleft() while len(result_queue) > 0: # Collect all the jobs left at the end handle_result(result_queue[0].get()) result_queue.popleft() # Bucket k-mers by multiplicity in the genome. # We know they all appear at least once. kmers_by_multiplicity = collections.defaultdict(list) for kmer, multiplicity in counts.items(): kmers_by_multiplicity[multiplicity].append(kmer) # Count up the total number of kmers with each multiplicity, properly # weighting multiple sampling kmers_with_multiplicity = { m: sum((kmers[k] for k in ks)) for m, ks in kmers_by_multiplicity.items() } # Compute mappability effective_mapped = sum( (count / m for m, count in kmers_with_multiplicity.items())) possible_mapped = sum(kmers.values()) print( "Expect to map {:.2f} {}-mers of out of {} total, for mappability of {:.2f}%" .format(effective_mapped, options.k, options.n, effective_mapped / possible_mapped * 100)) for m in sorted(kmers_with_multiplicity.keys()): print("{} copies: \t{} sampled {}-mers".format( m, kmers_with_multiplicity[m], options.k)) return 0
for fileid in movie_reviews.fileids('neg'):
    neg_reviews.extend(movie_reviews.words(fileid))
for fileid in movie_reviews.fileids('pos'):
    pos_reviews.extend(movie_reviews.words(fileid))

"""### Your task

In this Colab, you will develop a very simplistic spell-checker. By no means should you think of using it for a real-world use case, but it is an interesting exercise to highlight the strengths and weaknesses of Bloom filters!
"""

from bloom_filter import BloomFilter

word_filter = BloomFilter(max_elements=236736)

for word in word_list:
    word_filter.add(word)

word_set = set(word_list)

"""If you executed the cell above, you now have 3 different variables in your scope:

1. ```word_list```, a Python list containing the English dictionary (in case-insensitive order)
2. ```word_filter```, a Bloom filter where we have already added all the words in the English dictionary
3. ```word_set```, a [Python set](https://docs.python.org/3.6/library/stdtypes.html#set-types-set-frozenset) built from the same list of words in the English dictionary

Let's inspect the size of each data structure using the [getsizeof()](https://docs.python.org/3/library/sys.html#sys.getsizeof) method!
"""

from sys import getsizeof

print(f'Size of word_list (in bytes): {getsizeof(word_list)}')
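"""A minimal continuation of that size comparison, as a sketch: it assumes the three variables above are in scope, 'hello' and 'helo' are just example probe words, and getsizeof() only reports each object's shallow size."""

print(f'Size of word_filter (in bytes): {getsizeof(word_filter)}')
print(f'Size of word_set (in bytes): {getsizeof(word_set)}')

# Membership checks: the set is exact; the Bloom filter can return false
# positives but never false negatives.
print('hello' in word_set)     # expected True, if 'hello' is in word_list
print('hello' in word_filter)  # expected True for any word that was added
print('helo' in word_set)      # expected False
print('helo' in word_filter)   # expected False, barring a rare false positive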
class Partition:
    partitions = {}
    created_dirs = set()

    def __init__(self, partition_id, src_dir, dest_dir, dry_run, flatten):
        self.partition_id = partition_id
        self.src_dir = src_dir
        self.dest_dir = dest_dir
        self.dest_bloom = BloomFilter(max_elements=EST_MAX_FILES_PER_YEAR)
        self.dry_run = dry_run
        self.flatten = flatten

    def _dest_path(self, file_name):
        """
        Given a source path, determine a safe final destination path,
        disallowing any overwrites within a single job run.
        """
        if self.flatten:
            base = os.path.basename(file_name)
            (x, y) = os.path.splitext(base)
            # linear probing until we find an unused destination name
            tmp_dest = base
            i = 0
            while tmp_dest in self.dest_bloom:
                i += 1
                tmp_dest = ''.join([x, '-%d' % i, y])
            self.dest_bloom.add(tmp_dest)
            return os.path.join(self.dest_dir, str(self.partition_id), tmp_dest)
        else:
            common_prefix = os.path.commonprefix([self.src_dir, file_name])
            path_suffix = file_name[len(common_prefix) + 1:]
            return os.path.join(self.dest_dir, str(self.partition_id), path_suffix)

    def _ingest(self, file_name):
        dest_file_name = self._dest_path(file_name)
        if not self.dry_run:
            dest_dir = os.path.dirname(dest_file_name)
            if dest_dir not in Partition.created_dirs:
                Partition.created_dirs.add(dest_dir)
                if not os.path.isdir(dest_dir):
                    os.makedirs(dest_dir)
            shutil.copy2(file_name, dest_file_name)
        CMD_LOG.info('Partition %s\tcp %s %s' % (self.partition_id, file_name, dest_file_name))

    @staticmethod
    def _get_partition(file_name, run_stats):
        exif = _read_exif_hachoir(file_name)
        if 'creation_date' in exif:
            p = _parse_exif_year(exif['creation_date'])
            if p:
                run_stats.count_partition_method('exif')
                return p
        path_year = _parse_filename_year(file_name)
        if path_year:
            run_stats.count_partition_method('path')
            return path_year
        run_stats.count_partition_method('unknown')
        return UNKNOWN_PARTITION

    @staticmethod
    def handle_file(file_name, src_dir, dest_dir, dry_run, flatten, run_stats):
        part = Partition._get_partition(file_name, run_stats)
        # if first time, do partition set-up
        if part not in Partition.partitions:
            Partition.partitions[part] = Partition(part, src_dir, dest_dir, dry_run, flatten)
        Partition.partitions[part]._ingest(file_name)
        base, ext = os.path.splitext(file_name)
        run_stats.count_type(ext)
        run_stats.count_partition(part)
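# A minimal usage sketch for _dest_path above (hypothetical paths and year;
# EST_MAX_FILES_PER_YEAR comes from the surrounding module). With flatten=True,
# a repeated base name is probed to photo-1.jpg, photo-2.jpg, ... Because
# dest_bloom is a Bloom filter, a false positive can only skip a candidate
# name, never reuse one, so the no-overwrite guarantee still holds.
p = Partition(2019, '/data/src', '/data/dest', dry_run=True, flatten=True)
print(p._dest_path('/data/src/a/photo.jpg'))  # /data/dest/2019/photo.jpg
print(p._dest_path('/data/src/b/photo.jpg'))  # /data/dest/2019/photo-1.jpg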
class Crawler:
    def __init__(self, start_urls: List[str], crawled_pages_count: int,
                 chunk_size: int, fetch_workers: int, database_workers: int):
        # A set would hit the memory limit on large volumes, so we use a
        # Bloom filter instead, trading some speed for memory.
        self._visited = BloomFilter(max_elements=crawled_pages_count)
        self._logger = get_logger(__name__)
        self._stop_crawling = False
        self._urls = start_urls
        self._data = []
        self._buffer = []
        self._total_crawled_pages = 0
        self._fetch_error_rate = 0.9
        self._crawled_pages_count = crawled_pages_count
        self._chunk_size = chunk_size
        self._fetch_workers = fetch_workers
        self._database_workers = database_workers
        self._max_buffer_len = self._chunk_size * self._fetch_error_rate

    def _get_urls(self) -> Generator:
        urls = self._urls
        self._urls = []
        for chunk in chunks_by_size(urls, self._chunk_size):
            yield chunk

    def _get_data(self, urls: List[str]):
        with ThreadPoolExecutor(self._fetch_workers) as executor:
            self._data = executor.map(fetch, urls)

    def _process_data(self):
        for status, url, clean_text, parsed_urls in self._data:
            if status != 0:
                continue
            self._visited.add(url)
            self._urls.extend(
                [u for u in parsed_urls if u not in self._visited])
            self._buffer.append((url, clean_text))

    def _save_data(self):
        self._total_crawled_pages += len(self._buffer)
        with ThreadPoolExecutor(self._database_workers) as executor:
            executor.map(bulk_insert,
                         chunks_by_count(self._buffer, self._database_workers))
        self._buffer = []

    def run(self):
        while True:
            if not self._urls or self._stop_crawling:
                self._logger.info('Total pages parsed: ' +
                                  str(self._total_crawled_pages))
                break
            for urls in self._get_urls():
                self._get_data(urls)
                self._process_data()
                if len(self._buffer) >= self._max_buffer_len:
                    self._save_data()
                if self._total_crawled_pages >= self._crawled_pages_count:
                    self._stop_crawling = True
                    break
info, repl, last_info = log_parser(single_log, last_info)
date = str(info[0]).split()[0].replace("-", "_")
create_new_table(date, cursor)
query_infos = []
reply_infos = []
while single_log:
    info, repl, last_info = log_parser(single_log, last_info)
    if info:
        query_infos.append(info)
    if repl:
        reply_infos.append(repl)
    single_log = f.readline()
    count += 1
    if count % 100000 == 0:
        query_info_sql(query_infos, cursor)
        reply_info_kfk(reply_infos)
        connection.commit()
        print(count)
        query_infos = []
connection.commit()
if enable_bloom_filter:
    tobe_check = []
    for domain in get_distinct_fld(date, cursor):
        domain = domain[0]
        if domain not in domain_filter:
            print(domain)
            domain_filter.add(domain)
            tobe_check.append(domain)
    with open("拉清单.txt", "a") as f:
        for i in tobe_check:
            f.write(i + "\n")
# import BF library
from bloom_filter import BloomFilter
from collections import defaultdict

d = defaultdict(int)
Inputs = ["1", "2", "3", "2", "2", "2", "2", "3", "3", "3"]

myBF = BloomFilter(max_elements=10, error_rate=0.001)
myBF2 = BloomFilter(max_elements=10, error_rate=0.01)
myBF3 = BloomFilter(max_elements=10, error_rate=0.01)

for x in Inputs:
    print(f'x {x}')
    if x not in myBF:
        # first sighting goes into the first filter
        print(f'adding bf1 {x}')
        myBF.add(x)
    elif x not in myBF2:
        # second sighting goes into the second filter
        print(f'adding bf2 {x}')
        myBF2.add(x)
    elif x not in myBF3:
        # third sighting goes into the third filter
        print(f'adding bf3 {x}')
        myBF3.add(x)
    else:
        # seen at least four times (up to Bloom-filter false positives)
        d[x] += 1

print(d.keys())
class JiangxiSpider(scrapy.Spider): # 重点 启动参数 name = 'jiangxi_spider' allowed_domains = ['jxsggzy.cn'] # 初始化 def __init__(self, *args, **kwargs): # // 要爬取网站的跟 self.base_url = 'http://jxsggzy.cn/web/' # super(QhSpider, self).__init__(*args, **kwargs) self.bloom_filter = BloomFilter(max_elements=1000000, error_rate=0.1, filename='bf.data') self.num = 0 self.scrawl_mode = ScrawlMode.HISTORY self._stop_parse = False # main 启动函数 def start_requests(self): """ 爬虫默认接口,启动方法 :return: """ # 获取爬取时传过来的参数 # command example: # py -3 -m scrapy crawl jiangxi_spider -a start_time="2019:01:01" -a end_time="2019:01:02" # assert self.start_time is not None # assert self.end_time is not None # self.scrawl_mode = ScrawlMode.REAL_TIME if str(self.start_time).lower() == 'now' else ScrawlMode.HISTORY # # if self.scrawl_mode == ScrawlMode.HISTORY: # if (len(self.start_time) != 10 or len(self.end_time) != 10 # or self.start_time[4] != ':' or self.end_time[4] != ':'): # logging.error('Bad date format. Example: 2019:01:01') # return # else:x # # 取当天日期 # _dt = datetime.fromtimestamp(time.time()) # self.start_time = _dt.strftime("%Y:%m:%d") # self.end_time = self.start_time # info_type = { "01": { "name": "房屋及市政工程", "type": [1, 2, 3, 4] }, "02": { "name": "交通工程", "type": [2, 3, 5] }, "03": { "name": "水利工程", "type": [1, 2, 3, 4, 5] }, "05": { "name": "重点工程", "type": [1, 2, 3, 4] }, "06": { "name": "政府采购", "type": [1, 2, 3, 4, 5, 6] }, "07": { "name": "国土资源交易", "type": [1, 2] }, "08": { "name": "产权交易", "type": [3, 1, 2] }, "09": { "name": "林权交易", "type": [1, 2] }, "10": { "name": "医药采购", "type": [1, 2] }, "13": { "name": "其他项目", "type": [1, 2] } } for _info_item in list(info_type.keys()): for _index, _info_item_num in enumerate( info_type[_info_item]["type"]): _change_url = "http://jxsggzy.cn/web/jyxx/0020{}/0020{}00{}".format( _info_item, _info_item, _info_item_num) _page_url = "http://jxsggzy.cn/web/jyxx/0020{}/0020{}00{}/1.html".format( _info_item, _info_item, _info_item_num) _page_meta = { "_info_item": _info_item, "_info_item_num": _info_item_num, "_index": _index, "_change_url": _change_url, } time.sleep(1) yield scrapy.Request(url=_page_url, callback=self.parse_init, meta={'_page_meta': _page_meta}) def parse_init(self, response): """ :param response: :return: """ self._stop_parse = False _total_num = response.xpath( '//span[@id="index"]/text()').extract_first() _total_num = _total_num.split('/')[1] if int(_total_num) > 0: try: for _page_num_item in range(int(_total_num)): _page_init_detail_url = response.meta["_page_meta"][ "_change_url"] _page_init_detail_url = _page_init_detail_url + "/{}.html".format( _page_num_item + 1) response.meta["_page_meta"][ "_page_init_detail_url"] = _page_init_detail_url # if self._stop_parse: # break time.sleep(1) yield scrapy.Request( url=_page_init_detail_url, callback=self.parse_detail, meta={'_page_meta': response.meta["_page_meta"]}) except: logging.exception(' _total_num is faild {}'.format( response.url)) def parse_detail(self, response): # print(1) _info_type_detail = { "01": { "name": "房屋及市政工程", "type": ["招标公告", "答疑澄清", "文件下载", "中标公告"] }, "02": { "name": "交通工程", "type": ["招标公告", "补疑书", "中标公告"] }, "03": { "name": "水利工程", "type": ["资格预审公告/招标公告", "澄清补遗", "文件下载", "中标候选人公示", "中标结果公示"] }, "05": { "name": "重点工程", "type": ["招标公告", "答疑澄清", "文件下载", "结果公示"] }, "06": { "name": "政府采购", "type": ["采购公告", "变更公告", "答疑澄清", "结果公示", "单一来源公告", "合同公示"] }, "07": { "name": "国土资源交易", "type": ["交易公告", "成交公告"] }, "08": { "name": "产权交易", "type": ["信息披露", "交易公告", "成交公告"] }, "09": { "name": "林权交易", 
"type": ["信息披露", "成交公示"] }, "10": { "name": "医药采购", "type": ["采购公告", "结果公示"] }, "13": { "name": "其他项目", "type": ["交易公告", "成交公示"] } } item = JiangXiItem() for selector in response.xpath('.//div[@class="ewb-infolist"]/ul/li'): time.sleep(random.randint(100, 200) / 1000.0) # 100 - 200 ms # 公告所对应url _content_url = selector.xpath('.//a/@href').extract_first() _detail_page_url = response.urljoin(_content_url) item['url'] = _detail_page_url # 唯一标识 _unq_id = CcgpUtil.get_unique_id(_detail_page_url) item['_id'] = _unq_id # 如果是重复数据,不处理 if _unq_id in self.bloom_filter: continue self.bloom_filter.add(_unq_id) # 公告所在地区 item['area'] = "江西" item['bid_type'] = _info_type_detail[_info_item_detail]["name"] print(_detail_page_url) # 公告所在具体地区 # item['area_detail'] = self.__get_area_detail__(selector, _detail_page_url) # 招标人 item['buyer'] = " " # 公告类型 _index_detail = response.meta["_page_meta"]["_index"] _info_item_detail = response.meta["_page_meta"]["_info_item"] item['notice_type'] = _info_type_detail[_info_item_detail]["type"][ _index_detail] # source item['source'] = "jx" # site item['site'] = "jx" # 公告所对应时间 item['notice_time'] = self.__get_notice_time__( selector, _detail_page_url) # if self.start_time or self.end_time: # try: # self.start_time = self.start_time.split(" ")[0].replace(":", ".") # self.end_time = self.end_time.split(" ")[0].replace(":", ".") # # if len(self.start_time) == 10: # self.start_time = self.start_time + " 00:00:00" # # if len(self.end_time) == 10: # self.end_time = self.end_time + " 00:00:00" # # except: # logging.exception( # 'self.start_time {} or self.end_time failed {}'.format(self.start_time, # self.end_time)) # print(self.start_time, item['notice_time']) # if self.start_time > item['notice_time'] or self.end_time < item['notice_time']: # self._stop_parse = True # logging.info('time interval') # return # else: # self._stop_parse = False # 公告的标题 item['title'] = self.__get_title__(selector, _detail_page_url) # 内容 item['content'] = self.__get_content__(selector, _detail_page_url) print(item) yield item @staticmethod def __get_area_detail__(selector, url): _ret = '' _area_detail = ["上绕市", "银川市", "石嘴山市", "吴忠市", "固原市", "中卫市"] try: _content_text = selector.xpath( 'string(./div[@class="ewb-info-a"]/a)').extract()[0] _content_text = ''.join(_content_text.split()) for _item in _area_detail: if _item in _content_text: _ret = _item break except: logging.exception('{} get_area_detail__ failed'.format(url)) return _ret @staticmethod def __get_notice_time__(selector, url): _ret = '' try: _bid_info = selector.xpath( './/span[@class="ewb-list-date"]/text()').extract_first() if _bid_info: _ret = _bid_info.replace('-', '.') + " 00:00:00" except: logging.exception('{} get_notice_time failed'.format(url)) return _ret @staticmethod def __get_title__(selector, url): _ret = '' try: _ret = selector.xpath( './a[@class="ewb-list-name"]/text()').extract_first().replace( '\\n', '').rstrip().lstrip() except: logging.exception('{} get_title failed'.format(url)) return _ret @staticmethod def __get_content__(selector, url): """ 正文内容 如果提取正文内容失败,则判断此次爬取失败,所以这里不能用try except :param selector: :param url: :return: """ _bad = False _ret = '' try: _r = requests.get(url, timeout=15) _r.encoding = 'utf-8' _ret = base64.b64encode(zlib.compress( _r.text.encode('utf-8'))).decode('utf-8') except: _bad = True # 如果有异常,重试一次 if _bad: time.sleep(1) _r = requests.get(url, timeout=15) _r.encoding = 'utf-8' _ret = base64.b64encode(zlib.compress( _r.text.encode('utf-8'))).decode('utf-8') return _ret
def get_urls_bloom_filter(urls):
    bloom_filter = BloomFilter(max_elements=10000, error_rate=0.001)
    for url in urls:
        bloom_filter.add(url)
    return bloom_filter
class SpicSpider(scrapy.Spider): # 重点 启动参数 name = 'spic_spider' allowed_domains = ['cpeinet.com.cn'] # 初始化 def __init__(self, *args, **kwargs): # // 要爬取网站的跟 self.base_url = 'http://www.cpeinet.com.cn/' # super(QhSpider, self).__init__(*args, **kwargs) self.bloom_filter = BloomFilter(max_elements=1000000, error_rate=0.1, filename='bf.data') self.num = 0 self.scrawl_mode = ScrawlMode.HISTORY self._stop_parse = False # main 启动函数 def start_requests(self): """ 爬虫默认接口,启动方法 :return: """ # 获取爬取时传过来的参数 # command example: # py -3 -m scrapy crawl ccgp_search -a start_time="2019:01:01" -a end_time="2019:01:02" # assert self.start_time is not None # assert self.end_time is not None # self.scrawl_mode = ScrawlMode.REAL_TIME if str(self.start_time).lower() == 'now' else ScrawlMode.HISTORY # # if self.scrawl_mode == ScrawlMode.HISTORY: # if (len(self.start_time) != 10 or len(self.end_time) != 10 # or self.start_time[4] != ':' or self.end_time[4] != ':'): # logging.error('Bad date format. Example: 2019:01:01') # return # else: # # 取当天日期 # _dt = datetime.fromtimestamp(time.time()) # self.start_time = _dt.strftime("%Y:%m:%d") # self.end_time = self.start_time # info_type = [1, 2, 3, 7, 33, 4, 5] for _info_item in info_type: _page_url = "http://www.cpeinet.com.cn/cpcec/bul/bul_list.jsp?type={}".format( _info_item) _page_meta = {"_info_item": _info_item, "_change_url": _page_url} time.sleep(1) yield scrapy.Request(url=_page_url, callback=self.parse_init, meta={'_page_meta': _page_meta}) def parse_init(self, response): self._stop_parse = False _total_num = response.xpath( './/div[@class="page"]/font/text()').extract()[0] print(_total_num) if int(_total_num) > 0: try: for _page_num_item in range(int(_total_num)): _page_init_detail_url = response.meta["_page_meta"][ "_change_url"] _page_init_detail_url = _page_init_detail_url + "/{}.html".format( _page_num_item + 1) response.meta["_page_meta"][ "_page_init_detail_url"] = _page_init_detail_url # if self._stop_parse: # break time.sleep(1) yield scrapy.Request( url=_page_init_detail_url, callback=self.parse_detail, meta={'_page_meta': response.meta["_page_meta"]}) except: logging.exception(' _total_num is faild {}'.format( response.url)) def parse_datail(self, response): item = SpicItem() for selector in response.xpath('.//div[@class="article_list_lb"]/li'): time.sleep(random.randint(100, 200) / 1000.0) # 100 - 200 ms # 公告所对应url _content_url = selector.xpath('.//span/a/@href').extract_first() _detail_page_url = response.urljoin(_content_url) item['url'] = _detail_page_url # 唯一标识 _unq_id = CcgpUtil.get_unique_id(_detail_page_url) item['_id'] = _unq_id # 如果是重复数据,不处理 if _unq_id in self.bloom_filter: continue self.bloom_filter.add(_unq_id) # 公告所在地区 item['area'] = "江西" print(_detail_page_url) # 公告所在具体地区 # item['area_detail'] = self.__get_area_detail__(selector, _detail_page_url) # 招标人 item['buyer'] = " " # 公告类型 _index_detail = response.meta["_page_meta"]["_index"] _info_item_detail = response.meta["_page_meta"]["_info_item"] item['notice_type'] = _info_type_detail[_info_item_detail]["type"][ _index_detail] # source item['source'] = "jx" # site item['site'] = "jx" # 公告所对应时间 item['notice_time'] = self.__get_notice_time__( selector, _detail_page_url) # 公告的标题 item['title'] = self.__get_title__(selector, _detail_page_url) # 内容 item['content'] = self.__get_content__(selector, _detail_page_url) print(item) pass @staticmethod def __get_notice_time__(selector, url): _ret = '' try: _bid_info = selector['showdate'] if _bid_info: _ret = _bid_info.replace('-', '.') if len(_bid_info) 
== 10: _bid_info = _bid_info + " 00:00:00" else: _ret = _ret.split(" ")[0] + " 00:00:00" except: logging.exception('{} get_notice_time failed'.format(url)) return _ret @staticmethod def __get_title__(selector, url): _ret = '' try: _ret = selector['title'] except: logging.exception('{} get_title failed'.format(url)) return _ret @staticmethod def __get_content__(selector, url): """ 正文内容 如果提取正文内容失败,则判断此次爬取失败,所以这里不能用try except :param selector: :param url: :return: """ _bad = False _ret = '' try: _r = requests.get(url, timeout=15) _r.encoding = 'utf-8' _ret = base64.b64encode(zlib.compress( _r.text.encode('utf-8'))).decode('utf-8') except: _bad = True # 如果有异常,重试一次 if _bad: time.sleep(1) _r = requests.get(url, timeout=15) _r.encoding = 'utf-8' _ret = base64.b64encode(zlib.compress( _r.text.encode('utf-8'))).decode('utf-8') return _ret
from bloom_filter import BloomFilter
# from pybloom import BloomFilter

fruit = BloomFilter(100000, error_rate=0.001, filename='/tmp/fruit.bloom')
# fruit = BloomFilter(100000, error_rate=0.001)

[fruit.add(x) for x in ['apple', 'pear', 'orange']]

print('aple' in fruit)  # 'aple' was never added, so this prints False barring a false positive
class TopBuzzVideo(): def __init__(self): self.headers = { 'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 8.0.0; MIX 2 MIUI/V10.2.2.0.ODECNXM) NewsArticle/8.4.4' } self.cookies = { 'cookies': 'install_id=6672646082571388678; ttreq=1$a9ed7f4ce8fc84fced473d6e25c22226f381c13d; odin_tt=3e76568447d177856560d524c6ef5400407a437cfdd62767a36fb3b2decdeb01d43b9a7978232dc05c57af3c81bd10c277e78619093795e8392c1302c9aa8a75; sid_guard=c8f84a23bcce86b376964aeb42991709%7C1554173959%7C5184000%7CSat%2C+01-Jun-2019+02%3A59%3A19+GMT; uid_tt=2ad7176029f7302e11b7924e6e6566b7120075732cedcd39bc999fa5cbcf07a1; sid_tt=c8f84a23bcce86b376964aeb42991709; sessionid=c8f84a23bcce86b376964aeb42991709' } self.headers_details = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36' } self.cookies_details = { 'Cookie': 'tt_webid=6683297640216282629; __tea_sdk__user_unique_id=6683297640216282629; __tea_sdk__ssid=40d2e59e-696c-4a93-ace8-e1479b10aeef; csrf-token=61575f8b568b577d9d06c777d103ae53e6c10723; csrf-secret=6qDUsFL6WZ1aG2soaPw7PpmCtnxCv7fw' } self.post_video_url = 'http://127.0.0.1:30008/crawler/video/transfer' self.filter_url = 'http://console.cc.clipclaps.tv/crawler/log' self.have_met = BloomFilter(max_elements=100000, error_rate=0.1) def run(self): number = 0 while number < 5: t = time.time() result = re.findall('.\d*', str(t)) # 正则匹配时间戳小数位 sign = tb.hash_code(result[1][1:]) # 对时间戳进行解密 timestamp = result[0] start_url = 'https://i16-tb.isnssdk.com/api/844/stream?session_impr_id=0&tab=General&count=20&min_behot_time=1.554174097999E9&loc_mode=7&lac=4314&cid=6439033' \ '&sign=' + sign + \ '×tamp=' + timestamp + \ '&logo=topbuzz&gender=0&bv_is_auto_play=0&youtube=0&manifest_version_code=844&app_version=8.4.4&iid=6672646082571388678&gaid=54b268f4-52c2-470c-a815-abd1d00acce9&original_channel=gp&channel=gp&fp=TlTrJzK1FYsqFYs5PlU1LMGSL2Xr&device_type=MIX+2&language=en&app_version_minor=8.4.4.01&resolution=2030*1080&openudid=ab50caa43e995042&update_version_code=8440&sys_language=zh&sys_region=cn&os_api=26&tz_name=Asia%2FShanghai&tz_offset=28800&dpi=440&brand=Xiaomi&ac=WIFI&device_id=6672637176796333574&os=android&os_version=8.0.0&version_code=844&hevc_supported=1&device_brand=Xiaomi&device_platform=android&sim_region=cn®ion=us&aid=1106&ui_language=en' tb.analysis_topBuzz(start_url=start_url) number += 1 time.sleep(random.uniform(60, 70)) # 每隔 1min 进行一次访问 def hash_code(self, pwd): # 通过模块构造出一个hash对象 h = hashlib.sha1() h.update(pwd.encode()) # 获得字符串类型的加密后的密文 return h.hexdigest() def analysis_topBuzz(self, start_url): try: res = requests.post(url=start_url, headers=self.headers, cookies=self.cookies).text time.sleep(random.uniform(1, 3)) data = json.loads(res) item = data['data']['items'] # 分析列表页,获得详情页url for i in range(len(item)): cls = item[i]['article_class'] if cls == 'Video': duration = item[i]['video']['duration'] if duration < 360: share_url = item[i]['share_url'] video_url = item[i]['video']['url_list'][0]['urls'][0] data = self.parsing_details_url(details_url=share_url, video_url=video_url) print('analysis_topBuzz_data:\n', data) self.save_video(data=data) else: pass except: pass def parsing_details_url(self, details_url=None, video_url=None): status = self.filter_data(details_url=details_url) if status: print('Data already exists!') else: time.sleep(random.uniform(0, 3)) result = requests.get(url=details_url, headers=self.headers_details, cookies=self.cookies_details).text html = etree.HTML(result) # 调度任务 jobId = time.time() # 文章标题 title = 
html.xpath('//div[@class="title"]/text()')[0] # 作者 authorName = ' '.join( html.xpath('//div[@class="name active"]/text()')) if authorName == '': authorName = ' '.join( html.xpath('//div[@class="name"]/text()')) # 文章发布时间 releaseTime = ' '.join( html.xpath('//div[@class="publishTime"]/text()')) # 视频 video = self.download_video(videoUrl=video_url) return { 'jobId': jobId, 'sourceUrl': details_url, 'title': title, 'authorName': authorName, 'releaseTime': releaseTime, 'video': video } def download_video(self, videoUrl): videoId = str(uuid.uuid4()).replace('-', '') downloadPath = '/data/crawler' videoPath = '/topbuzz/video/' urllib.request.urlretrieve( videoUrl, r'%s.mp4' % (downloadPath + videoPath + str(videoId))) video = '%s.mp4' % (videoPath + str(videoId)) return video def filter_data(self, details_url): data1 = urllib.parse.urlencode({ 'type': int(4), 'days': int(3), }) data2 = data1.encode('utf-8') re = urllib.request.urlopen(url=self.filter_url, data=data2) status = re.read().decode('utf-8') result = json.loads(status) data = result['data'] for kw in data: self.have_met.add(data[kw]) if details_url in self.have_met: return True else: return False def save_video(self, data): data1 = urllib.parse.urlencode({ 'source': 1, 'sourceUrl': data['sourceUrl'], 'title': data['title'], 'authorName': data['authorName'], 'releaseTime': data['releaseTime'], 'video': data['video'], }) data2 = data1.encode('utf-8') re = urllib.request.urlopen(url=self.post_video_url, data=data2) status = re.read().decode('utf-8') print('status:\n', status)
    return makeMessage(magic, b"filterload", inv)

def mempoolMessage():
    return makeMessage(magic, b"mempool", b"\x00\x00\x00\x00")

def getHeadersMessage(hash):
    version = 70015
    return makeMessage(magic, b"getheaders",
                       struct.pack('<ih32s32s', version, 1, unhexlify(hash), b'\x00'))

k, m = optimal_km(1, 0.001)
bfilter = BloomFilter(m, k)
# note: this string is a base58 address rather than hex, so unhexlify() will raise here
data_to_hash = unhexlify("n4ewvXymapgcMARgjMNPvYy2BnCji95SMz")
bfilter.add(data_to_hash)

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect(("116.203.72.215", 18333))
sock.send(versionMessage())
a = sock.recv(24 * 8)  # receive version
b = sock.recv(1000)
sock.send(verackMessage())
c = sock.recv(1000)  # receive verack
class CompanySpider(scrapy.Spider): name = "companySpider" def __init__(self, name=None, **kwargs): self.bloom = BloomFilter(1000000, 0.001) for url in dbComponent.get_all_company_url(): self.bloom.add(url) print("bloomFilter初始化完毕") super().__init__(name, **kwargs) def start_requests(self): # category file = open("category.txt", "r", encoding="utf-8") for url in file.readlines(): url = url.replace("\n", "") yield scrapy.Request(url=url, callback=self.parse_list) print("start_requests 初始化完毕") # 解析列表页 def parse_list(self, response): next_url = response.css(".PagedList-skipToNext a::attr(href)").extract_first() if (next_url is not None) and (next_url != ''): next_url = "https://vinabiz.org" + next_url yield scrapy.Request(url=next_url, callback=self.parse_list) url_list = response.css("h4 a::attr(href)").extract() for url in url_list: url = "https://vinabiz.org" + url if url not in self.bloom: yield scrapy.Request(url=url, callback=self.parse_company) # 解析详情页 def parse_company(self, response): td_list = response.css("#wid-detail-info td") index = 0 data = {} data['guid'] = str(uuid.uuid4()).replace("-", "") data['url'] = response.url while index < td_list.__len__(): td = td_list[index] if td.css("::attr(class)").extract_first() == "bg_table_td": index += 1 content = self.porcess_content(td_list[index]) title = td.css("::text").extract_first() # BUSINESS if title == 'Tên chính thức': data['official_name'] = content if title == 'Tên giao dịch': data['trading_name'] = content if title == 'Mã doanh nghiệp': data['business_code'] = content if title == 'Ngày cấp': data['date_range'] = content if title == 'Cơ quan thuế quản lý': data['tax_authorities_manage'] = content if title == 'Ngày bắt đầu hoạt động': data['date_of_commencement_of_operation'] = content if title == 'Trạng thái': data['status'] = content # CONTACT if title == 'Địa chỉ trụ sở': data['office_address'] = content if title == 'Điện thoại': data['phone1'] = content if title == 'Fax': data['fax'] = content if title == 'Email': data['email'] = content if title == 'Website': data['website'] = content if title == 'Người đại diện': data['representative'] = content if title == 'Điện thoại': data['phone2'] = content if title == 'Địa chỉ người đại diện': data['representative_address'] = content if title == 'Giám đốc': data['manager'] = content if title == 'Điện thoại giám đốc': data['phone_director'] = content if title == 'Địa chỉ giám đốc': data['address_director'] = content if title == 'Kế toán': data['accountant'] = content if title == 'Điện thoại kế toán': data['phone_accounting'] = content if title == 'Địa chỉ kế toán': data['account_address'] = content # INDUSTRY if title == 'Ngành nghề chính': data['main_job'] = content if title == 'Lĩnh vực kinh tế': data['economic_field'] = content if title == 'Loại hình kinh tế': data['type_of_economic'] = content if title == 'Loại hình tổ chức': data['type_of_organization'] = content if title == 'Cấp chương': data['class_chapters'] = content if title == 'Loại khoản': data['item_type'] = content index += 1 dbComponent.add_company(data) def porcess_content(self, td): try: result = '' text_list = td.css("::text").extract() for text in text_list: result += text result = result.replace("\n", "") result = result.replace("'", "\\\'") except Exception as e: print(e) return result
class WebCrawler: """ The WebCrawler class implements a multithreaded web crawler starting from the base_url. The crawler runs based on the breadth-first search algorithm and starts running from the base_url and parsing the content to get more urls to be crawled. :param base_url: the input of the starting url of the web crawling :param closure_url: the stopping condition is based on this parameter :param pool: the threadpool :param task_queue: the task_queue that include all the urls that need to be crawled :param crawled_pages: a bloom filter used to eliminated visited pages from the BFS algorithm :param total: counter for the total number of pages being crawled :param lock: mutex lock to prevent race condition when read/write crawled_pages :param run_time: timer for the time spent on running the web crawler """ def __init__(self, base_url: str, cfg: configparser) -> None: self.base_url = base_url self.config = cfg self.closure_url = '{scheme}://{netloc}'.format( scheme=urlsplit(self.base_url).scheme, netloc=urlsplit(self.base_url).netloc) self.pool = ThreadPoolExecutor( max_workers=int(self.config['MAX_WORKER'])) self.task_queue = Queue(maxsize=3 * int(self.config['MAX_WORKER'])) self.task_queue.put(self.base_url) self.crawled_pages = BloomFilter( max_elements=int(self.config['MAX_ELEMENTS']), error_rate=float(self.config['ERROR_RATE'])) self.crawled_pages.add(self.base_url) self.total = 1 self.lock = Lock() self.run_time = time.time() def _add_to_task_queue(self, child_url) -> None: """ Add url to the task queue concurrently If the child_url pass the bloom filter, it will be added to the task queue :param child_url: the url to be added to the task queue :return: None """ ret = self.lock.acquire(timeout=int(self.config['TIMEOUT'])) if ret: if child_url not in self.crawled_pages: self.crawled_pages.add(child_url) self.total += 1 self.lock.release() try: self.task_queue.put(child_url, block=True, timeout=int(self.config['TIMEOUT'])) except Full: logger.error( "Task queue full when putting {child_url}".format( child_url=child_url)) else: logger.info("\t{child_url}".format(child_url=child_url)) else: self.lock.release() else: logger.error("Lock timed out.") def _parse_html(self, html: str, parent_url: str) -> None: """ Parse the html content of the page from the parent_url. Get all the urls (must start with the closure_url as the stopping condition) from the html page and add them to the task queue. :param html: the html content of the parent_url :param parent_url: the url of the html page to be parsed :return None """ soup = BeautifulSoup(html, 'html.parser') links = soup.find_all('a', href=True) logger.info("{parent_url}".format(parent_url=parent_url)) for link in set(links): child_url = link['href'] if child_url.startswith('/') or child_url.startswith( self.closure_url): child_url = urljoin(self.closure_url, child_url) if child_url.endswith('/'): child_url = child_url[:-1] self._add_to_task_queue(child_url) def _callback(self, res: concurrent.futures) -> None: """ Callback when the html page is downloaded. 
:param res: the request used to fetch the html page in _get_page :return None """ result, url = res.result() if not result: return if result.status_code == 200 and 'html' in result.headers[ 'content-type']: self._parse_html(result.text, url) elif result.status_code in (301, 302): redirect_url = result.headers['Locations'] if redirect_url.endswith('/'): redirect_url = redirect_url[:-1] logger.debug("{url} is redirected to {redirect_url}".format( url=url, redirect_url=redirect_url)) self._add_to_task_queue(redirect_url) def _get_page(self, url: str) -> (requests, str): """ Get the page from the url :param url: the input of requests :return (requests, url) """ try: res = requests.get(url, timeout=int(self.config['TIMEOUT'])) return res, url except requests.RequestException as e: logger.warning('{e} for {url}'.format(e=e, url=url)) return None, url def run(self) -> None: """ Run the webcrawler in a parallel fashion. The workers managed by the thread pool get tasks from the shared task queue. Once the worker is fired, it will get the url page by calling _get_page. When the page is got, the _callback will be called to parse the page to get more urls to be added to the task queue. """ while True: try: target_url = self.task_queue.get( timeout=int(self.config['TIMEOUT'])) job = self.pool.submit(self._get_page, target_url) job.add_done_callback(self._callback) except Empty: return except Exception as e: logger.warning(e) continue def report(self) -> None: """ Report the wall time and total pages on the web crawler """ self.pool.shutdown(wait=True) self.run_time = time.time() - self.run_time logger.info( "{time:.2f} seconds is spent to crawl on {total} pages".format( time=self.run_time, total=self.total))
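# A minimal usage sketch for the WebCrawler above (hypothetical config values;
# the class reads the uppercase keys shown here via self.config):
cfg = {'MAX_WORKER': '8', 'MAX_ELEMENTS': '100000', 'ERROR_RATE': '0.001', 'TIMEOUT': '10'}
crawler = WebCrawler('https://example.com', cfg)
crawler.run()     # returns once the task queue stays empty past the timeout
crawler.report()  # logs wall time and the number of pages crawled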
seqs = seqs.read().split('\n')

# k = 10
# x = -1.2
# print(sys.getsizeof(x))
# print(seqs[0])
# kmers = []
# for seq in seqs:
#     kmers.append(get_kmers(seq, k))
# for km in kmers:
#     print(Counter(km))
# print(get_kmers(seqs[0], k))
# print(sys.getsizeof(kmers))

bloom = BloomFilter(max_elements=16000, error_rate=0.05)
bloom2 = BloomFilter(max_elements=6719656, error_rate=0.05)
bloom.add(seqs[10])
print(sys.getsizeof(bloom))

# Test whether the bloom-filter has seen a key:
with open('testfilter.bloom', 'wb') as testfilter:  # Step 3
    pickle.dump(bloom, testfilter)
with open('testfilter.bloom', 'rb') as testfilter:  # Step 3
    bloom2 = pickle.load(testfilter)

# After config_dictionary is read from file
print(bloom2)
    for element in item['const_enum_arrays']
    if len(element['array']) > config.const_enum_array_length_threshold
    and len(item['const_enum_arrays']) > 0
]

"""build collection 'fn'"""
if len(repo_features['func_names']) > 0:
    db_feature = mongoDB.mongodb_synic(
        host=config.db_host,
        port=config.db_port,
        db_feature=config.db_feature,
        collection_name=config.db_fn_collection)
    document = {}
    bf_fn = BloomFilter(max_elements=config.bloom_filter_volume,
                        error_rate=config.bloom_filter_error_rate)
    [bf_fn.add(element) for element in repo_features['func_names']]
    bf_fn_pickle = cPickle.dumps(bf_fn)
    document[file.split('.json')[0]] = bf_fn_pickle
    db_feature.do_add(document_item=document)

"""build collection '1g'"""
if len(repo_features['strings']) > 0:
    db_feature = mongoDB.mongodb_synic(
        host=config.db_host,
        port=config.db_port,
        db_feature=config.db_feature,
        collection_name=config.db_1g_collection)
    document = {}
    bf_1g = BloomFilter(max_elements=config.bloom_filter_volume,
                        error_rate=config.bloom_filter_error_rate)
    [bf_1g.add(element) for element in repo_features['strings']]
    bf_1g_pickle = cPickle.dumps(bf_1g)
from Auth_app.models import User, FL_freq
from Auth_app.extract import extract_cf
from django.views.decorators.csrf import csrf_exempt
import json
import os
import time
from datetime import timedelta
from django.core.signing import TimestampSigner
from bloom_filter import BloomFilter

# The Bloom filter stores the multiple characteristic frequencies
signer = TimestampSigner()
bloom = BloomFilter(max_elements=100000)
freq = range(15300, 15500)
for f in freq:
    bloom.add(f)


# Create your views here.
def home(request):
    if request.GET:
        # import pdb; pdb.set_trace()
        # user_list = User.objects.all()
        # User.objects.all().delete()
        # import pdb; pdb.set_trace()
        # res = User.objects.get_or_create(mac=request.GET['mac'])
        # User.objects.filter(mac=request.GET['mac']).update(gw_id=request.GET['gw_id'],
        #     gw_address=request.GET['gw_address'], gw_port=request.GET['gw_port'], url=request.GET['url'])
        # res[0].gw_id = request.GET['gw_id']
def merge(self, merge_table): new_file = open("tmp_file", 'w+') new_index = open("tmp_index", 'w+') new_bloom = BloomFilter() self.max = merge_table.max if merge_table.max > self.max else self.max self.min = merge_table.min if merge_table.min < self.min else self.min cursor = csv.writer(new_file) index_cursor = csv.writer(new_index) allset = set() input_file = open(merge_table.data_name, 'r') input_label = input_file.readline() input_label_arr = input_label.rstrip().split(',') allset |= set(input_label_arr) input_file.close() merged_file = open(self.data_name, 'r') merged_label = merged_file.readline() merged_label_arr = merged_label.rstrip().split(',') allset |= set(merged_label_arr) merged_file.close() count = 0 header = [] for val in allset: self.dataid[val] = count header.append(val) count = count + 1 cursor.writerow(header) with open(merge_table.index_name, 'r') as file: input_lines = [line.strip() for line in file] with open(self.index_name, 'r') as file: merged_lines = [line.strip() for line in file] i = 0 j = 0 self.size = 0 while (i < len(input_lines) and j < len(merged_lines)): #index 0 is key, index 1 is offset input_line = input_lines[i].split(',') merged_line = merged_lines[j].split(',') #index 0 is key, index 1 is offset input_file = open(merge_table.data_name, 'r') input_label = input_file.readline() input_label_arr = input_label.rstrip().split(',') input_lst = [""] * len(allset) input_file.seek(int(input_line[1])) input_value = input_file.readline() # Avoid merging deleted files if ("," not in input_value): i = i + 1 input_file.close() continue input_arr = input_value.rstrip().split(',') #index 0 is key, index 1 is offset merged_file = open(self.data_name, 'r') merged_label = merged_file.readline() merged_label_arr = merged_label.rstrip().split(',') merged_lst = [""] * len(allset) merged_file.seek(int(merged_line[1])) merged_value = merged_file.readline() # Avoid merging deleted files if ("," not in merged_value): j = j + 1 merged_file.close() continue merged_arr = merged_value.rstrip().split(',') #print("shit:%s" %(merged_line[0])) if (input_line[0] <= merged_line[0]): for k in range(len(input_label_arr)): input_lst[self.dataid[input_label_arr[k]]] = input_arr[k] pos = new_file.tell() cursor.writerow(input_lst) file_pos = [] file_pos.append(input_line[0]) file_pos.append(pos) index_cursor.writerow(file_pos) new_bloom.add(input_line[0]) self.size = self.size + 1 if (input_line[0] == merged_line[0]): #print("sdfgsdfg:%s, i:%d" %(input_line[0], i)) j = j + 1 i = i + 1 elif (input_line[0] > merged_line[0]): for k in range(len(merged_label_arr)): merged_lst[self.dataid[merged_label_arr[k]]] = merged_arr[k] pos = new_file.tell() cursor.writerow(merged_lst) file_pos = [] file_pos.append(merged_line[0]) file_pos.append(pos) index_cursor.writerow(file_pos) new_bloom.add(merged_line[0]) self.size = self.size + 1 j = j + 1 input_file.close() merged_file.close() while (i < len(input_lines)): input_line = input_lines[i].split(',') #index 0 is key, index 1 is offset input_file = open(merge_table.data_name, 'r') input_label = input_file.readline() input_label_arr = input_label.rstrip().split(',') input_lst = [""] * len(allset) input_file.seek(int(input_line[1])) input_value = input_file.readline() # Avoid merging deleted files if ("," not in input_value): i = i + 1 input_file.close() continue new_bloom.add(input_line[0]) input_arr = input_value.rstrip().split(',') for k in range(len(input_label_arr)): input_lst[self.dataid[input_label_arr[k]]] = input_arr[k] pos = new_file.tell() 
cursor.writerow(input_lst) file_pos = [] file_pos.append(input_line[0]) file_pos.append(pos) index_cursor.writerow(file_pos) i = i + 1 self.size = self.size + 1 input_file.close() while (j < len(merged_lines)): merged_line = merged_lines[j].split(',') #index 0 is key, index 1 is offset merged_file = open(self.data_name, 'r') merged_label = merged_file.readline() merged_label_arr = merged_label.rstrip().split(',') merged_lst = [""] * len(allset) merged_file.seek(int(merged_line[1])) merged_value = merged_file.readline() # Avoid merging deleted files if ("," not in merged_value): j = j + 1 merged_file.close() continue new_bloom.add(merged_line[0]) merged_arr = merged_value.rstrip().split(',') for k in range(len(merged_label_arr)): merged_lst[self.dataid[merged_label_arr[k]]] = merged_arr[k] pos = new_file.tell() cursor.writerow(merged_lst) file_pos = [] file_pos.append(merged_line[0]) file_pos.append(pos) index_cursor.writerow(file_pos) j = j + 1 self.size = self.size + 1 merged_file.close() new_file.close() new_index.close() os.remove(self.data_name) os.remove(self.index_name) os.remove(merge_table.data_name) os.remove(merge_table.index_name) self.file = new_file self.index = new_index self.bloom = new_bloom os.rename("tmp_file", self.data_name) os.rename("tmp_index", self.index_name)
class Main():
    def __init__(self):
        self.have_met = BloomFilter(max_elements=100000, error_rate=0.1)  # URLs already crawled
        self.t = time.time()
        self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
        self.post_DB = True

    def mainTopBuzz(self):
        n = NewsFeeds()
        s = SaveSqlDb()
        tb = TopBuzz()
        # request time
        t = time.time()
        # regex: split the timestamp at the decimal point
        result = re.findall(r'.\d*', str(t))
        sign = tb.hash_code(result[1][1:])
        timestamp = result[0]
        url_tb = 'https://i16-tb.isnssdk.com/api/844/stream?session_impr_id=0&tab=General&count=20&min_behot_time=1.554174097999E9&loc_mode=7&lac=4314&cid=6439033' \
                 '&sign=' + sign + \
                 '&timestamp=' + timestamp + \
                 '&logo=topbuzz&gender=0&bv_is_auto_play=0&youtube=0&manifest_version_code=844&app_version=8.4.4&iid=6672646082571388678&gaid=54b268f4-52c2-470c-a815-abd1d00acce9&original_channel=gp&channel=gp&fp=TlTrJzK1FYsqFYs5PlU1LMGSL2Xr&device_type=MIX+2&language=en&app_version_minor=8.4.4.01&resolution=2030*1080&openudid=ab50caa43e995042&update_version_code=8440&sys_language=zh&sys_region=cn&os_api=26&tz_name=Asia%2FShanghai&tz_offset=28800&dpi=440&brand=Xiaomi&ac=WIFI&device_id=6672637176796333574&os=android&os_version=8.0.0&version_code=844&hevc_supported=1&device_brand=Xiaomi&device_platform=android&sim_region=cn&region=us&aid=1106&ui_language=en'
        news_list = tb.sendRequest(url=url_tb)
        path = '/data/crawler'
        pic_path = '/topbuzz/picture/'
        number = 1
        for url in news_list:
            if url not in self.have_met:
                self.have_met.add(url)
                data = n.parsingUrl(url=url, downloadPath=path, picPath=pic_path)
                if data is not None:
                    print('TB_detail_url\t', url)
                    print('TB_number\t', number)
                    number += 1
                    if data['releaseTime'] is None or data['releaseTime'] == '':
                        data['releaseTime'] = str(self.point_time)
                    if self.post_DB:
                        s.saveDB(data=data, source=1)
                    else:
                        s.saveMySql(data=data)

    def mainNewsBreak(self):
        n = NewsFeeds()
        s = SaveSqlDb()
        nb = NewsBreak()
        url_nb = 'http://api.particlenews.com/Website/channel/news-list-for-best-channel?cstart=0&infinite=true&refresh=1&epoch=5&distribution=newsbreak&platform=1&cv=4.7.3&cend=10&appid=newsbreak&weather=true&fields=docid&fields=date&fields=image&fields=image_urls&fields=like&fields=source&fields=title&fields=url&fields=comment_count&fields=fb_share_total&fields=coach_mark_text&fields=up&fields=down&fields=summary&fields=favicon_id&fields=dominant_image&fields=contextMeta&fields=video_urls&fields=viewType&push_refresh=0&modularize=true&ts=2019-04-07+18%3A14%3A01+%2B0800&version=020025&net=wifi'
        docId = nb.parsingPost(url=url_nb)
        get_url = 'http://api.particlenews.com/Website/contents/content?related_docs=false&cv=4.7.3' \
                  '&docid=' + docId + \
                  '&appid=newsbreak&bottom_channels=false&distribution=newsbreak&platform=1&version=020025&net=wifi'
        news_list = nb.parsingGet(url=get_url)
        path = '/data/crawler'
        pic_path = '/newsbreak/picture/'
        number = 1
        for url in news_list:
            if url not in self.have_met:
                self.have_met.add(url)
                data = n.parsingUrl(url=url, downloadPath=path, picPath=pic_path)
                if data is not None:
                    print('NB_detail_url\t', url)
                    print('NB_number\t', number)
                    number += 1
                    if data['releaseTime'] is None or data['releaseTime'] == '':
                        data['releaseTime'] = str(self.point_time)
                    if self.post_DB:
                        s.saveDB(data=data, source=2)
                    else:
                        s.saveMySql(data=data)

    def mainBuzzFeed(self):
        n = NewsFeeds()
        s = SaveSqlDb()
        bf = BuzzFeed()
        top_urls = bf.parsingTopUrl()
        news_urls = bf.parsingNewsUrl()
        urls_list = top_urls + news_urls
        path = '/data/crawler'
        pic_path = '/buzzfeed/picture/'
        number = 1
        for url in urls_list:
            if url not in self.have_met:
                self.have_met.add(url)
                data = n.parsingUrl(url=url, downloadPath=path, picPath=pic_path)
                if data is not None:
                    print('BF_detail_url\t', url)
                    print('BF_number\t', number)
                    number += 1
                    if data['releaseTime'] is None or data['releaseTime'] == '':
                        data['releaseTime'] = str(self.point_time)
                    if self.post_DB:
                        s.saveDB(data=data, source=3)
                    else:
                        s.saveMySql(data=data)

    def mainGoogleNews(self):
        n = NewsFeeds()
        s = SaveSqlDb()
        gn = GoogleNews()
        news_list = gn.googleNews()
        path = '/data/crawler'
        pic_path = '/googleNews/picture/'
        number = 1
        for new in news_list:
            url = new.link.text
            if url not in self.have_met:
                self.have_met.add(url)  # mark as seen so the URL is not fetched twice
                data = n.parsingUrl(url=url, downloadPath=path, picPath=pic_path)
                if data is not None:
                    print('GN_detail_url\t', url)
                    print('GN_number\t', number)
                    number += 1
                    if data['releaseTime'] is None or data['releaseTime'] == '':
                        data['releaseTime'] = str(self.point_time)
                    if self.post_DB:
                        s.saveDB(data=data, source=4)
                    else:
                        s.saveMySql(data=data)

    def mainSmartNews(self):
        n = NewsFeeds()
        s = SaveSqlDb()
        sm = SmartNews()
        news_list = sm.smartNews()
        path = '/data/crawler'
        pic_path = '/smartNews/picture/'
        number = 1
        for new in news_list:
            if new not in self.have_met:
                self.have_met.add(new)
                data = n.parsingUrl(url=new, downloadPath=path, picPath=pic_path)
                if data is not None:
                    print('SM_detail_url\t', new)
                    print('SM_number\t', number)
                    number += 1
                    if data['releaseTime'] is None or data['releaseTime'] == '':
                        data['releaseTime'] = str(self.point_time)
                    if self.post_DB:
                        s.saveDB(data=data, source=5)
                    else:
                        s.saveMySql(data=data)
print("已处理日志", count) query_infos = [] print("正在进行日志处理") sql = "INSERT INTO long_domain_queries"\ "(query_time, query_domain, query_fld, query_client_ip) VALUES(%s, %s, %s, %s)" cursor.executemany(sql, long_query_infos) sql = "INSERT INTO suspect_client_ip(ip, date, value) VALUES(%s, %s, 1) "\ "ON DUPLICATE KEY UPDATE value=value+1" cursor.executemany(sql, black_query_infos) connection.commit() tobe_check = [] for record in get_distinct_fld(date, cursor): domain = record[0] if domain not in visited_domain_filter: print(domain) sql = ''' INSERT INTO domain_first_seen SELECT query_fld, MIN(query_time) FROM queries_2020_01_07 WHERE query_fld=%s ''' cursor.execute(sql, domain) visited_domain_filter.add(domain) tobe_check.append(domain) with open("拉清单.txt", "a") as f: for i in tobe_check: f.write(i + "\n") connection.commit()
class DeyangSpider(scrapy.Spider):
    # Sichuan Deyang public resource trading
    name = 'deyang_spider'
    allowed_domains = ['ggzyxx.deyang.gov.cn']

    # initialisation
    def __init__(self, *args, **kwargs):
        # root URL of the site to crawl
        self.base_url = 'http://ggzyxx.deyang.gov.cn/'
        super(DeyangSpider, self).__init__(*args, **kwargs)
        # persistent filter: duplicates are remembered across runs via bf.data
        self.bloom_filter = BloomFilter(max_elements=1000000, error_rate=0.1,
                                        filename='bf.data')
        self.num = 0
        self.scrawl_mode = ScrawlMode.HISTORY
        self._stop_parse = False

    # main entry point
    def start_requests(self):
        """
        Default Scrapy entry point; starts the crawl.
        :return:
        """
        _info_type = {
            "tradeinfo_jygcjs_": "工程建设",
            "tradeinfo_jycg_": "政府采购",
            "tradeinfo_gygt_": "国土矿业权"
            # , "tradeinfo_jygzcq_": "国资产权"
        }
        for _info_item in _info_type.keys():
            _change_url = "http://ggzyxx.deyang.gov.cn/pub/{}/".format(_info_item)
            _page_url = "http://ggzyxx.deyang.gov.cn/pub/{}.html".format(_info_item)
            _page_meta = {"_info_item": _info_item, "_change_url": _change_url}
            time.sleep(1)
            yield scrapy.Request(url=_page_url, callback=self.parse_init,
                                 meta={'_page_meta': _page_meta})

    def parse_init(self, response):
        """
        :param response:
        :return:
        """
        self._stop_parse = False
        _total_num = response.xpath(
            './/div[@class="pagenations"]/a/text()').extract()[4]
        print(_total_num)
        print("----------------------------")
        # if int(_total_num) > 0:
        #     try:
        #         for _page_num_item in range(int(_total_num)):
        #             _page_init_detail_url = response.meta["_page_meta"]["_change_url"]
        #             _page_init_detail_url = _page_init_detail_url + "/{}.html".format(_page_num_item + 1)
        #             response.meta["_page_meta"]["_page_init_detail_url"] = _page_init_detail_url
        #
        #             time.sleep(1)
        #             yield scrapy.Request(url=_page_init_detail_url, callback=self.parse_detail,
        #                                  meta={'_page_meta': response.meta["_page_meta"]})
        #     except:
        #         logging.exception(' _total_num failed {}'.format(response.url))

    def parse_detail(self, response):
        _info_type_detail = {
            "tradeinfo_jygcjs_": "工程建设",
            "tradeinfo_jycg_": "政府采购",
            "tradeinfo_gygt_": "国土矿业权"
            # , "tradeinfo_jygzcq_": "国资产权"
        }
        item = DeyangItem()
        for selector in response.xpath('.//div[@class="search-result"]/ul/li'):
            time.sleep(random.randint(100, 200) / 1000.0)  # 100 - 200 ms

            # URL of the notice
            _content_url = selector.xpath('./a/@href').extract_first()
            _detail_page_url = response.urljoin(_content_url)
            item['url'] = _detail_page_url

            # unique identifier
            _unq_id = CcgpUtil.get_unique_id(_detail_page_url)
            item['_id'] = _unq_id

            # skip records that were already crawled
            if _unq_id in self.bloom_filter:
                continue
            self.bloom_filter.add(_unq_id)

            # region of the notice
            item['area'] = "德阳市"
            print(_detail_page_url)

            # specific district of the notice
            # item['area_detail'] = self.__get_area_detail__(selector, _detail_page_url)

            # tendering party (buyer)
            item['buyer'] = " "

            # notice type
            _info_item_detail = response.meta["_page_meta"]["_info_item"]
            item['notice_type'] = _info_type_detail[_info_item_detail]

            # source
            item['source'] = "deYang"
            # site
            item['site'] = "deYang"

            # notice time
            item['notice_time'] = self.__get_notice_time__(selector, _detail_page_url)
            # notice title
            item['title'] = self.__get_title__(selector, _detail_page_url)
            # content
            item['content'] = self.__get_content__(selector, _detail_page_url)
            print(item)

    @staticmethod
    def __get_area_detail__(selector, url):
        _ret = ''
        _area_detail = ["上绕市", "银川市", "石嘴山市", "吴忠市", "固原市", "中卫市"]
        try:
            _content_text = selector.xpath(
                'string(./div[@class="ewb-info-a"]/a)').extract()[0]
            _content_text = ''.join(_content_text.split())
            for _item in _area_detail:
                if _item in _content_text:
                    _ret = _item
                    break
        except:
            logging.exception('{} get_area_detail__ failed'.format(url))
        return _ret

    @staticmethod
    def __get_notice_time__(selector, url):
        _ret = ''
        try:
            _bid_info = selector.xpath('./span[@class="time"]/text()').extract_first()
            if _bid_info:
                _ret = _bid_info.replace('-', '.') + " 00:00:00"
        except:
            logging.exception('{} get_notice_time failed'.format(url))
        return _ret

    @staticmethod
    def __get_title__(selector, url):
        _ret = ''
        try:
            _ret = selector.xpath(
                './a[@class="weekdays"]/text()').extract_first().replace(
                    '\\n', '').rstrip().lstrip()
        except:
            logging.exception('{} get_title failed'.format(url))
        return _ret

    @staticmethod
    def __get_content__(selector, url):
        """
        Body content of the notice.
        If body extraction fails, the crawl of this item is considered failed,
        so the retry below is deliberately not wrapped in try/except.
        :param selector:
        :param url:
        :return:
        """
        _bad = False
        _ret = ''
        try:
            _r = requests.get(url, timeout=15)
            _r.encoding = 'utf-8'
            _ret = base64.b64encode(zlib.compress(
                _r.text.encode('utf-8'))).decode('utf-8')
        except:
            _bad = True

        # retry once on failure
        if _bad:
            time.sleep(1)
            _r = requests.get(url, timeout=15)
            _r.encoding = 'utf-8'
            _ret = base64.b64encode(zlib.compress(
                _r.text.encode('utf-8'))).decode('utf-8')
        return _ret
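`CcgpUtil.get_unique_id` is not shown in this excerpt. A hypothetical stand-in that derives a stable key for the persistent (`filename='bf.data'`) Bloom filter from the notice URL could look like this; the real helper may use a different scheme:

# Hypothetical replacement for CcgpUtil.get_unique_id: a deterministic ID
# derived from the notice URL, usable both as the Bloom-filter key and as _id.
import hashlib

def get_unique_id(url: str) -> str:
    return hashlib.sha1(url.encode('utf-8')).hexdigest()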
class MyFilterSwitch(app_manager.RyuApp):
    OFP_VERSIONS = [ofproto_v1_3.OFP_VERSION]

    def _monitor(self):
        self.web = FlaskAppWrapper('wrap')
        self.web.add_endpoint(endpoint='/add', endpoint_name='add', handler=self.action)
        self.web.run()

    def __init__(self, *args, **kwargs):
        super(MyFilterSwitch, self).__init__(*args, **kwargs)
        self.swList = {}   # known switches
        self.hostDB = {}   # pairing between host ID and port number
        self.bloom = BloomFilter(max_elements=10000, error_rate=0.1)
        self.randomFilter(12345678, 1000)
        self.monitor_thread = hub.spawn(self._monitor)

    def randomFilter(self, seed, maxcount):
        random.seed(seed)

        def randomIP(upperbound):
            return str(random.randint(1, upperbound))

        allIP = []
        for i in range(maxcount):
            newIP = "10." + "0" + "." + randomIP(10) + "." + randomIP(250)
            allIP.append(newIP)
            self.bloom.add(newIP)
        print("IP addresses blocked: " + str(len(allIP)))

    def action(self):
        def dropNewIP(dst_ip):
            dp = self.swList[1]
            match = dp.ofproto_parser.OFPMatch(in_port=1, eth_type=0x800,
                                               ip_proto=0x11, ipv4_dst=dst_ip)
            actions = []
            self.add_flow(dp, match, actions, 2)
            print("update flow rule, match: IP to " + dst_ip + " output:drop")

        if request.method != 'POST':
            ipadr = str(request.args.get('ip'))
            if ipadr in self.bloom:
                print("IP address: " + ipadr + " already exists!")
            else:
                print("Adding IP address: " + ipadr + " into the black list")
                self.bloom.add(ipadr)
                dropNewIP(ipadr)
        return "Hello world"

    def add_flow(self, datapath, match, actions, priority=1):
        ofproto = datapath.ofproto
        # basic instruction: apply every action in the actions list
        inst = [
            datapath.ofproto_parser.OFPInstructionActions(
                ofproto.OFPIT_APPLY_ACTIONS, actions)
        ]
        mod = datapath.ofproto_parser.OFPFlowMod(
            datapath=datapath,              # switch id
            cookie=0,
            cookie_mask=0,
            table_id=0,                     # flow table in which the rule is installed
            command=ofproto.OFPFC_ADD,
            idle_timeout=0,
            hard_timeout=0,                 # timeout = 0 -> the rule never expires
            priority=priority,              # determines the matching order
            buffer_id=ofproto.OFP_NO_BUFFER,
            out_port=ofproto.OFPP_ANY,
            out_group=ofproto.OFPG_ANY,
            flags=0,
            match=match,                    # match clause
            instructions=inst)              # action clause
        datapath.send_msg(mod)

    @set_ev_cls(ofp_event.EventOFPSwitchFeatures, CONFIG_DISPATCHER)
    def switch_features_handler(self, ev):
        msg = ev.msg
        dp = msg.datapath
        ofproto = dp.ofproto
        self.swList[dp.id] = dp
        # send every IP packet to the controller
        match = dp.ofproto_parser.OFPMatch(eth_type=0x800)
        actions = [
            dp.ofproto_parser.OFPActionOutput(ofproto.OFPP_CONTROLLER,
                                              ofproto.OFPCML_NO_BUFFER)
        ]
        self.add_flow(dp, match, actions)

    @set_ev_cls(ofp_event.EventOFPPacketIn, MAIN_DISPATCHER)
    def _packet_in_handler(self, ev):
        msg = ev.msg
        in_port = msg.match['in_port']
        dp = msg.datapath
        ofproto = dp.ofproto
        dpid = dp.id
        pkt = packet.Packet(msg.data)
        pkt_ipv4 = pkt.get_protocols(ipv4.ipv4)[0]
        if pkt_ipv4:
            dst_ip = pkt_ipv4.dst
            print("new request: ", dst_ip)
            if dst_ip in self.bloom:
                # install a flow rule that drops the packet
                match = dp.ofproto_parser.OFPMatch(in_port=1, eth_type=0x800,
                                                   ip_proto=0x11, ipv4_dst=dst_ip)
                actions = []
                self.add_flow(dp, match, actions, 2)
                print("install flow rule, match: IP to " + dst_ip + " output:drop")
            else:
                # forward the packet to host 2
                actions = [dp.ofproto_parser.OFPActionOutput(2, 0)]
                data = msg.data
                out = dp.ofproto_parser.OFPPacketOut(datapath=dp,
                                                     buffer_id=msg.buffer_id,
                                                     in_port=in_port,
                                                     actions=actions,
                                                     data=data)
                dp.send_msg(out)
                # install a flow rule so later packets reach host 2 without the controller
                match = dp.ofproto_parser.OFPMatch(in_port=1, eth_type=0x800,
                                                   ip_proto=0x11, ipv4_dst=dst_ip)
                actions = [dp.ofproto_parser.OFPActionOutput(2, 0)]
                self.add_flow(dp, match, actions, 2)
                print("install flow rule, match: IP to " + dst_ip + " output:2")
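One consequence of `error_rate=0.1` in this switch: once the filter is loaded toward its configured capacity, roughly one in ten destination IPs that were never blacklisted will still match the blacklist check, and those flows get dropped. A back-of-the-envelope illustration (the traffic figure is made up):

# Illustration only, not part of the controller: expected collateral drops
# caused by Bloom-filter false positives at the configured error rate.
error_rate = 0.1
legit_destinations = 5000            # hypothetical number of never-blacklisted destinations
expected_false_drops = error_rate * legit_destinations
print("expected wrongly dropped destinations (worst case):", int(expected_false_drops))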
# Collect the positions of false negatives: keys whose true label is 1 but
# which the model predicted as 0. These must be caught by a backup filter.
list_FN_positions = list()
cnt = 0
for label in y_label:
    if label == 1:
        if y_pred[cnt] == 0:
            list_FN_positions.append(cnt)
    cnt = cnt + 1

bloom = None
from bloom_filter import BloomFilter
if len(list_FN_positions) > 0:
    # backup Bloom filter sized to hold only the false negatives
    bloom = BloomFilter(max_elements=len(list_FN_positions), error_rate=fpr_b)
    for idx in list_FN_positions:
        bloom.add(str(X[idx, 0]))
    # memory usage
    print("Number of bits:", bloom.num_bits_m)

end = datetime.datetime.now()  # BUILD TIME ENDS - ideally measured with a microsecond-resolution timer
print('Model + Trad Bloom Build Time: ', str(end - start))

# serialize model
from joblib import dump, load
filename = 'model_rbf' + str(np.random.randint(1000))
# print(filename)
dump(rbf_svc, filename + '.compressed', compress=True)
fName = filename + '.compressed'
print('Model stored in ', fName)
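The snippet above builds the backup filter of a learned Bloom filter: it stores only the classifier's false negatives. A minimal sketch of the corresponding query path, assuming the same `rbf_svc` model and backup `bloom`; the `featurize` helper and the `str(key)` encoding are assumptions that must match how `X[idx, 0]` was featurized and stored during the build:

# Sketch: learned-Bloom-filter lookup. A key is reported present if the model
# says so, otherwise the backup filter (which holds every false negative)
# is consulted, so true members are never reported absent.
def learned_contains(key, featurize, rbf_svc, bloom):
    if rbf_svc.predict([featurize(key)])[0] == 1:
        return True                       # model positive: "probably present"
    # model negative: fall back to the backup filter built from false negatives
    return bloom is not None and str(key) in bloom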
print("Number of Items: " + str(num_of_items)) p = 0.005 print("Probability of false positive error " + str(p)) bit_size = bit_array_size(num_of_items, p) print("Bit Size: "+str(bit_size)) hash_size = size_of_hash(num_of_items, bit_size) print("Hash Size: "+str(hash_size)) bf = BloomFilter(num_of_items, hash_size) word_list = open("word_list.txt").read().splitlines() for word in word_list: bf.add(word) word_list.close() print(bf.lookup("99")) print(bf.lookup("donkey")) print(bf.lookup("oitqv")) print(bf.lookup("fart")) print(bf.lookup("Max")) print(bf.lookup("Dichha")) print(bf.lookup("Khuwalung")) print("++++Random Word SpellChecker++++") alpha=""