def second_new_warn_entity():
    minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                           ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
    row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
    b7 = ScalableBloomFilter(100000, 0.001)
    b30 = ScalableBloomFilter(100000, 0.001)
    b90 = ScalableBloomFilter(100000, 0.001)
    for i, k in minDates.items():
        dateTime = datetime.strptime(k, '%Y-%m-%d')
        dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
        if 0 <= dValue < 7:
            b7.add(i)
        if 0 <= dValue < 30:
            b30.add(i)
        if 0 <= dValue < 90:
            b90.add(i)
    result90 = secondDetectFromBigTable(90, TABLE_REPORT_ILLEGAL, RISK_LEVEL,
                                        ILLEGAL_SCORE, 'all', 0, 0, 'all',
                                        'all', TABLE_LOGS, 'all')
    count7 = 0
    count30 = 0
    count90 = 0
    resultIds = []
    for each in result90:
        if each['entity_id'] not in resultIds:
            resultIds.append(each['entity_id'])
    for id in resultIds:
        if id in b7:
            count7 += 1
        if id in b30:
            count30 += 1
        if id in b90:
            count90 += 1
    result = {'count7': count7, 'count30': count30, 'count90': count90}
    return json.dumps(result, ensure_ascii=False)
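# A minimal, self-contained sketch of the 7/30/90-day "new entity" counting
# pattern used above, with hypothetical in-memory data in place of
# getMinDate1/secondDetectFromBigTable (count_new_entities and its arguments
# are assumptions, not part of the original module).
from datetime import datetime

from pybloom import ScalableBloomFilter  # pybloom_live works the same way


def count_new_entities(first_seen, seen_ids, today='2020-01-31'):
    today = datetime.strptime(today, '%Y-%m-%d')
    windows = {7: ScalableBloomFilter(100000, 0.001),
               30: ScalableBloomFilter(100000, 0.001),
               90: ScalableBloomFilter(100000, 0.001)}
    for entity_id, first_date in first_seen.items():
        age_days = (today - datetime.strptime(first_date, '%Y-%m-%d')).days
        for days, bloom in windows.items():
            if 0 <= age_days < days:
                bloom.add(entity_id)
    # An entity counts as "new in the window" if its first-seen date falls
    # inside that window and it also appears in the detection results.
    return {days: sum(1 for i in set(seen_ids) if i in bloom)
            for days, bloom in windows.items()}

# count_new_entities({'a': '2020-01-30', 'b': '2019-12-01'}, ['a', 'b'])
# -> {7: 1, 30: 1, 90: 2}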
def against_detect_data_from_bigtable():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')
    problem = request.args.get('problem', '')
    newEntity = int(request.args.get('newEntity', ''))
    fund_mode = request.args.get('fund_mode', '')
    result = againstDetectDataFromBigTable(date, TABLE_REPORT_ILLEGAL,
                                           RISK_LEVEL, ILLEGAL_SCORE,
                                           operation_mode, illegal_type,
                                           entity_type, warn_distribute,
                                           problem, TABLE_LOGS, fund_mode)
    # Merge rows that belong to the same entity
    doubleId = []
    for dict in result:
        if not dict['entity_id'] in b:
            b.add(dict['entity_id'])
        else:
            doubleId.append(dict['entity_id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        for dict in result:
            if dict['entity_id'] == id:
                num += 1
                illegalTypeList.append(dict['illegal_type'])
                dict.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(dict)
    # Filter for newly added entities
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                               ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
            if 0 <= dValue < date:
                bb.add(i)
        for dict in result:
            if dict['entity_id'] in bb:
                newResult.append(dict)
        # The front end passes 'id', so add it to avoid errors
        for dict in result:
            dict.update({'id': dict['entity_id']})
        return json.dumps(newResult, ensure_ascii=False)
    try:
        result.sort(key=lambda x: x['datetime'], reverse=True)
    except Exception:
        pass
    # The front end passes 'id', so add it to avoid errors
    for dict in result:
        dict.update({'id': dict['entity_id']})
    return json.dumps(result, ensure_ascii=False)
def __init__(self, domain, threads, depth, times, headers, father): self.domain = domain if self.domain[self.domain.__len__() - 1] == '/': self.domain = self.domain[0:self.domain.__len__() - 1] self.threads = threads self.times = times self.cookies = {} self.headers = {} self.count = 0 self.controlthread = 0 self.depth = depth self.father = father self.realdomain = '' self.payload = Payload() self.encode = Encode() if headers != '': self.setheader(headers) if 'https' in self.domain: self.domain1 = self.domain.replace('https://', '') self.domain2 = 'http://' + self.domain1 self.domain3 = 'http%3A%2F%2F' + self.domain1 self.domain4 = 'https%3A%2F%2F' + self.domain1 elif 'http' in self.domain: self.domain1 = self.domain.replace('http://', '') self.domain2 = 'https://' + self.domain1 self.domain3 = 'http%3A%2F%2F' + self.domain1 self.domain4 = 'https%3A%2F%2F' + self.domain1 else: self.domain1 = 'http://' + self.domain self.domain2 = 'https://' + self.domain self.domain3 = 'http%3A%2F%2F' + self.domain self.domain4 = 'https%3A%2F%2F' + self.domain self.queue = Queue() self.urlqueue = Queue() self.lock = threading.RLock() self.lock2 = threading.RLock() self.lock3 = threading.RLock() self.lock4 = threading.RLock() self.lock5 = threading.RLock() self.bloomfilter = ScalableBloomFilter( initial_capacity=10000, error_rate=0.001, mode=ScalableBloomFilter.LARGE_SET_GROWTH) self.bloomfilter2 = ScalableBloomFilter( initial_capacity=10000, error_rate=0.001, mode=ScalableBloomFilter.LARGE_SET_GROWTH) self.blacklist = [ '<', '{', '\'', '"', '.css', '.jpg', '.mp4', '.png', '.gif', '.avi', '.jpeg', '.ico', '.mp3', '.pdf', 'docx', 'doc', 'bmp', '.rmvb', '.zip', '.rar', '.exe', '.ppt', '.pptx', 'xls' ] self.rule = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
def second_detect_data():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')
    problem = request.args.get('problem', '')
    newEntity = int(request.args.get('newEntity', ''))
    result = secondDetectData(date, TABLE_ENTITY_LIST, TABLE_MONITOR,
                              TABLE_GONGSHANG, RISK_LEVEL, ILLEGAL_SCORE,
                              operation_mode, illegal_type, entity_type,
                              warn_distribute, problem, TABLE_INDEX_QUANTILE,
                              TABLE_GUARANTEE_PROMISE, TABLE_LOGS)
    # Merge rows that belong to the same entity
    doubleId = []
    for dict in result:
        if not dict['id'] in b:
            b.add(dict['id'])
        else:
            doubleId.append(dict['id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        for dict in result:
            if dict['id'] == id:
                num += 1
                illegalTypeList.append(dict['illegal_type'])
                dict.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(dict)
    # Filter for newly added entities
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                               ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
            if dValue < date:
                bb.add(i)
        for dict in result:
            if dict['id'] in bb:
                newResult.append(dict)
        return json.dumps(newResult, ensure_ascii=False)
    try:
        result.sort(key=lambda x: x['datetime'], reverse=True)
    except Exception:
        pass
    return json.dumps(result, ensure_ascii=False)
def total_detect_data_test():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')  # multi-select
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')  # multi-select
    problem = request.args.get('problem', '')  # multi-select
    newEntity = int(request.args.get('newEntity', ''))
    checked = int(request.args.get('checked', ''))
    fund_mode = request.args.get('fund_mode', '')
    result = totalDetectDataFromBigTable(date, TABLE_REPORT_ILLEGAL,
                                         operation_mode, illegal_type,
                                         entity_type, warn_distribute,
                                         problem, checked, fund_mode)
    # Merge rows for the same entity that differ only in illegal_type
    doubleId = []
    for dict in result:
        if not dict['entity_id'] in b:
            b.add(dict['entity_id'])
        else:
            doubleId.append(dict['entity_id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        for dict in result:
            if dict['entity_id'] == id:
                num += 1
                illegalTypeList.append(dict['illegal_type'])
                dict.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(dict)
    # Filter for newly added entities
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                               ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
            if 0 <= dValue < date:
                bb.add(i)
        for dict in result:
            if dict['entity_id'] in bb:
                newResult.append(dict)
        return json.dumps(newResult, ensure_ascii=False)
    return json.dumps(result, ensure_ascii=False)
def generate_task(
        self,
        generate_func_name,
        g_kw={},
        sleep=180,
        times=20,
):
    '''
    params: generate_func_name -> name of the task-generating function
    params: g_kw -> keyword arguments passed to generate_func
    params: sleep, times -> run generate_func once every `sleep` seconds,
                            `times` times in total
    Task generator: generate_func can be run repeatedly; if repeated runs
    are not needed, simply set times to 1.
    '''
    if self.is_filter:
        self.sbf = ScalableBloomFilter()
    else:
        self.sbf = None
    table = Table(logger=self.logger)
    generate_func = getattr(table, generate_func_name)
    e_kw = dict(
        generate_func=generate_func,
        g_kw=g_kw,
    )
    self.loop_task(execute_func=self.core_generate_task,
                   e_kw=e_kw,
                   flag=1,
                   sleep=sleep,
                   times=times)
    table.close()
class RedisJob(object):
    redis_pool = redis.ConnectionPool(host='localhost', port=6379, db=1)
    url_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    @classmethod
    def push_job(cls, job_type, job_info):
        if 'url' in job_info:
            if job_info['url'] not in cls.url_filter:
                cls.url_filter.add(job_info['url'])
                r = redis.Redis(connection_pool=cls.redis_pool)
                r.lpush(str(job_type), json.dumps(job_info))
                LOGGER.info("push %s job into redis: %s" % (job_type, str(job_info)))
            else:
                LOGGER.warn("%s job filtered. %s" % (job_type, str(job_info)))
        else:
            r = redis.Redis(connection_pool=cls.redis_pool)
            r.lpush(str(job_type), json.dumps(job_info))
            LOGGER.info("push %s job into redis: %s" % (job_type, str(job_info)))

    @classmethod
    def fetch_job(cls, job_type):
        r = redis.Redis(connection_pool=cls.redis_pool)
        job_info = r.lpop(job_type)
        if job_info:
            LOGGER.info('fetched job: %s' % job_info)
            return json.loads(job_info)
        else:
            return None
def __init__(self, tasks=2, loop=None):
    self.tasks = tasks
    self.loop = loop or asyncio.get_event_loop()
    self.redis_cookie = RedisCookie()
    self.redis_job = RedisJob()
    self.bloom_filter = ScalableBloomFilter(
        mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    self.weibo_limit = True
    self.time_current_pattern = re.compile(r'(\d*)分钟前')
    self.time_today_pattern = re.compile(r'今天\s*(\d*):(\d*)')
    self.time_year_pattern = re.compile(r'(\d*)月(\d*)日\s*(\d*):(\d*)')
    self.user_id_pattern = re.compile(r'https://weibo.cn/u/(\d*)')
    self.weibo_host = 'https://weibo.cn'
    self.follow_url = self.weibo_host + '/%s/follow'
    self.fan_url = self.weibo_host + '/%s/fans'
    self.user_info_url = self.weibo_host + '/%s/info'
    self.user_tweet_url = self.weibo_host + '/%s'
    self.user_tweet_url2 = self.weibo_host + '/%s?page=%d'
    self.user_repost_url = self.weibo_host + '/repost/%s'
    self.user_repost_url2 = self.weibo_host + '/repost/%s?page=%d'
    self.tweet_comment_url = self.weibo_host + '/comment/%s'
    self.tweet_comment_url2 = self.weibo_host + '/comment/%s?page=%d'
    self.weibo_producer = WeiboProcuder(['localhost:9092'], 'sinaweibo')
    self.search_url = 'https://weibo.cn/search/?pos=search'
    self.get_search_url = 'https://weibo.cn/search/mblog/?keyword=%s&filter=hasori'
def dedup_lines_bloom(text, just_words=True, zero_digits=True,
                      capacity=100000, error=0.00001):
    sbf = ScalableBloomFilter(initial_capacity=capacity, error_rate=error,
                              mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    for line in text:
        if not isinstance(line, str):
            raise TypeError(
                'Expected "text" to contain strings, found: {}'.format(
                    type(line)))
        key = line.strip()
        if not key:
            # Blank lines are passed through untouched and never deduplicated.
            yield line
            continue
        key = normalize('NFKD', key)
        if just_words:
            key = ' '.join(re.findall(r'\w+', key))
        if zero_digits:
            key = re.sub(r'\d', '0', key)
        if key in sbf:
            line = ''
        else:
            sbf.add(key)
        yield line
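# Hypothetical use of dedup_lines_bloom: lines that normalize to the same key
# (here, after zeroing digits) come back as empty strings on later occurrences.
lines = [
    'Order #123 shipped',
    'Order #456 shipped',   # same key once digits are zeroed -> deduplicated
    'Something else',
]
print(list(dedup_lines_bloom(lines)))
# ['Order #123 shipped', '', 'Something else']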
def add_to_filter(self, update=False):
    # https://github.com/bigsnarfdude/Malware-Probabilistic-Data-Structres/blob/master/Mandiant_MD5_BloomFilter.py
    def stream_lines(filename):
        file = open(filename)
        while True:
            line = file.readline()
            if not line:
                file.close()
                break
            yield line.strip()

    def load_file(filename):
        lines = stream_lines(filename)
        templist = []
        for line in lines:
            templist.append(line)
        return templist

    itemlist = load_file(self.datafile)
    self.itemcount = len(itemlist)
    if not update:
        # reinitialize filter before adding a new set of items
        self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for item in itemlist:
        _ = self.filter.add(item)
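# Hypothetical lookup against the filter built by add_to_filter; the class
# name HashChecker and the file names are assumptions (only __init__ and
# add_to_filter are shown in this corpus).
checker = HashChecker(datafile='md5_hashes.txt', filterfile='md5.bloom')
checker.add_to_filter()
if 'd41d8cd98f00b204e9800998ecf8427e' in checker.filter:
    print('hash seen in the indicator list (possible false positive)')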
def __init__(self,
             endpoint=config.config['general']['dbpedia']['endpoint'],
             one_hop_bloom_file=config.config['general']['dbpedia']['one_hop_bloom_file'],
             two_hop_bloom_file=config.config['general']['dbpedia']['two_hop_bloom_file']):
    super(DBpedia, self).__init__(endpoint)
    self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
    if os.path.exists(one_hop_bloom_file):
        # Bloom-filter files are binary, so open them in 'rb' mode
        with open(one_hop_bloom_file, 'rb') as bloom_file:
            self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
    else:
        self.one_hop_bloom = None
    self.two_hop_bloom_file = two_hop_bloom_file
    self.two_hop_bloom = dict()
    for item in [True, False]:
        file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
        if os.path.exists(file_path):
            with open(file_path, 'rb') as bloom_file:
                self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(bloom_file)
        else:
            self.two_hop_bloom[item] = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    self.two_hop_bloom_counter = 0
def boot1(self):
    try:
        self.multiFile.seek(0)
        a = ScalableBloomFilter.fromfile(self.multiFile)
        return a
    except Exception:
        # Fall back to a fresh filter. Note that mode must be passed as a
        # keyword; passed positionally it would be taken as initial_capacity.
        return ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
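# A minimal round trip for the load-or-create pattern above, assuming an
# ordinary seekable binary file: tofile() writes the serialized form that
# ScalableBloomFilter.fromfile() restores in boot1().
with open('filters.bloom', 'w+b') as fh:
    sbf = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    sbf.add('example-key')
    sbf.tofile(fh)
    fh.seek(0)
    restored = ScalableBloomFilter.fromfile(fh)
    assert 'example-key' in restored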
def __init__(self, datafile, filterfile):
    # https://github.com/jaybaird/python-bloomfilter/blob/master/pybloom/pybloom.py
    self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    self.datafile = datafile
    self.filterfile = filterfile
    self.datafilesize = None
    self.filterfilesize = None
    self.change = None
class WishLoginSpider(scrapy.Spider): name = "wishlogin" allowed_domains = ["wish.com"] start_urls = ('http://www.wish.com/', ) merchants = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH) xsrfpattern = re.compile(r'.*_xsrf=(.*?);') def __init__(self, username, password, ajaxcount=100): self.username = username self.password = password self.ajaxcount = ajaxcount from scrapy import optional_features optional_features.remove('boto') def start_requests(self): yield scrapy.Request('https://www.wish.com/', callback=self.login) def login(self, response): match = self.xsrfpattern.match(str(response.headers)) if match: xsrf = match.group(1) body = urlencode({ 'email': self.username, 'password': self.password, '_buckets': '', '_experiments': '', }) print body request = scrapy.Request( 'https://www.wish.com/api/email-login', method='POST', headers={ 'Accept': 'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'X-Requested-With': 'XMLHttpRequest', 'X-XSRFToken': xsrf, }, body=body, meta={'xsrf': xsrf}, callback=self.request_tab) print request.headers yield request def request_tab(self, response): print response.body
def open_spider(self, spider):
    self.fileName = spider.name + self.fileName
    if os.path.exists(self.fileName):
        with open(self.fileName, 'rb') as f:
            self.sbf = ScalableBloomFilter.fromfile(f)
    else:
        self.sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.LARGE_SET_GROWTH)
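# The pipeline above only loads the filter; a plausible counterpart that saves
# it when the spider finishes (this method body is an assumption) might be:
def close_spider(self, spider):
    with open(self.fileName, 'wb') as f:
        self.sbf.tofile(f)   # persist seen items for the next crawl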
def __init__(self, withDistinct=None):
    super(DistinctElementCount, self).__init__()
    self.count = 0
    self.bloom = None
    self.set = None
    if withDistinct:
        self.bloom = ScalableBloomFilter(error_rate=0.00001)
        self.distinct = 0
        self.set = set([])
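# Hypothetical sketch of how such a counter could use the filter: pybloom's
# add() returns True when the element was (probably) seen before, so distinct
# elements are counted only on first sight. The method name is an assumption.
def observe(self, value):
    self.count += 1
    if self.bloom is not None and not self.bloom.add(value):
        self.distinct += 1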
class RedisJob(object): _pool = None url_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) def __init__(self, **kwargs): self._host = kwargs[ 'host'] if 'host' in kwargs else 'redis://localhost:6378' self._db = kwargs['db'] if 'db' in kwargs else 1 self._minsize = kwargs['minsize'] if 'minsize' in kwargs else 5 self._maxsize = kwargs['maxsize'] if 'maxsize' in kwargs else 10 async def init_pool(self): LOGGER.info( "init redis pool (host: %s, db: %d, minsize: %d, maxsize: %d)" % (self._host, self._db, self._minsize, self._maxsize)) self._pool = await aioredis.create_pool(self._host, db=self._db, minsize=self._minsize, maxsize=self._maxsize) async def push_job(self, job_type, job_info): if not self._pool: await self.init_pool() url = job_info.get('url', '') if url and url in self.url_filter: LOGGER.warn("%s job filtered. %s" % (job_type, str(job_info))) return else: self.url_filter.add(url) with await self._pool as conn: await conn.execute('lpush', str(job_type), json.dumps(job_info)) LOGGER.info("push %s job into redis: %s" % (job_type, str(job_info))) async def fetch_job(self, job_type): if not self._pool: await self.init_pool() with await self._pool as conn: job_info = await conn.execute('rpop', job_type) if job_info: LOGGER.info('fetched job: %s' % job_info) return json.loads(job_info) else: return None async def clean(self): if not self._pool: await self.init_pool() with await self._pool as conn: keys = await conn.execute('keys', '*') for key in keys: LOGGER.info("del %s" % key) await conn.execute('del', key)
def __init__(self, capacity=None, error_rate=0.001,
             mode=ScalableBloomFilter.LARGE_SET_GROWTH):
    self.capacity = capacity
    if capacity is None:
        self.bf = ScalableBloomFilter(mode=mode)
    else:
        self.bf = BloomFilter(capacity=capacity, error_rate=error_rate)
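# The wrapper's class name is not shown here; assuming it is something like
# DedupFilter, the two construction paths behave roughly as follows.
growable = DedupFilter()               # ScalableBloomFilter: grows on demand
bounded = DedupFilter(capacity=1000)   # fixed BloomFilter: add() raises
                                       # IndexError once 1000 items are stored
growable.bf.add('x')
bounded.bf.add('y')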
def __init__(self, BFsNo, deltas, modelsNo=3):
    '''create bloom filters'''
    BFs = []
    self.deltas = deltas
    self.modelsNo = modelsNo
    self.BFsNo = BFsNo
    deltas_no = len(deltas)
    for i in range(deltas_no * BFsNo):
        bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        BFs.append(bf)
    self.BFs = np.array(BFs)
    self.BFs = self.BFs.reshape(deltas_no, BFsNo)
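# A self-contained illustration of the grid layout built above: a
# (len(deltas), BFsNo) object array where each cell is an independent filter.
# Variable names here are assumptions; only the shape mirrors the original.
import numpy as np
from pybloom import ScalableBloomFilter  # or pybloom_live

deltas, BFsNo = [1, 5, 10], 4
BFs = np.array([ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
                for _ in range(len(deltas) * BFsNo)]).reshape(len(deltas), BFsNo)
BFs[2, 3].add('item-x')          # the filter for deltas[2] == 10, slot 3
assert 'item-x' in BFs[2, 3]
assert 'item-x' not in BFs[0, 0]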
def get_city_rank(table, table4, field, province_name, risk_level): cur = defaultDatabase() city_list = [] list = [] province_list = [] sql = "select max(date) from %s" % table cur.execute(sql) end_time = cur.fetchall()[0][0] start_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=7) start_time = start_time.strftime("%Y-%m-%d") start1_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=30) start_time1 = start1_time.strftime("%Y-%m-%d") sql1 = 'select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province,city' % ( table, table4, table4, start_time, end_time, risk_level) cur.execute(sql1) res1 = cur.fetchall() result1 = [{k: row[i] for i, k in enumerate(field)} for row in res1] sql2 = 'select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province,city' % ( table, table4, table4, start_time1, end_time, risk_level) cur.execute(sql2) res2 = cur.fetchall() result2 = [{k: row[i] for i, k in enumerate(field)} for row in res2] result = result1 + result2 b = ScalableBloomFilter(1000000, 0.001) for p in result: if not p['city'] in b: [b.add(p['city'])] city_list.append({'province': p['province'], 'city': p['city']}) for d in city_list: if not d['province'] in province_list: province_list.append(d['province']) if province_name: for d in city_list: if d['province'] == province_name and d['city']: pro_dict = {"province": d['province'], "city": d['city']} for dict in result1: if dict['city'] == d['city']: pro_dict.update({'count7': dict['count']}) for dict in result2: if dict['city'] == d['city']: pro_dict.update({'count30': dict['count']}) list.append(pro_dict) if not province_name: for p in province_list: if p: pro_dict = {"province": p} count = 0 for dict in result1: if dict['province'] == p: count += dict['count'] pro_dict.update({"count": count}) list.append(pro_dict) return list
def __init__(self, model, dummy=False): super(ScrapeWorker, self).__init__() self.source_q = Queue() self.parse_q = Queue() self.seen = ScalableBloomFilter() self.forwarded = ScalableBloomFilter() self.new_sources = [] self.workers = [] self.to_forward = [] self.parser = None self.done_parsing = False self.no_more_sources = False self.dbs = dict() self.schedule = model.schedule self.model = model self.source_kill = None self.dummy = dummy db_threads = defaultdict(list) # Check if the functions in each template are used properly # and store which types of databases are needed. for phase in self.model.phases: for template in phase.templates: self.check_functions(template, phase) if template.db_type: db_threads[template.db_type].append(template) # Start all the threads necessary for storing the data and give each # template a reference to the thread it needs to store data in. for thread, templates in db_threads.items(): if not dummy: store_thread = databases._threads[thread]() else: store_thread = databases._threads['dummy']() for template in templates: self.dbs[template.name] = store_thread store_thread.start()
def ParseQueue(): # Load Checked Urls File if os.path.isfile(path_checked_url_file): with open(path_checked_url_file, 'rb') as rf: checked_url_pool = ScalableBloomFilter.fromfile(rf) print("bf: Read pybloom from %s.\n" % path_checked_url_file) else: checked_url_pool = ScalableBloomFilter( initial_capacity=1000, error_rate=0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH) print("bf: Create pybloom") # Get each Item from Queue i = 1 # URL_QUEUE.put_nowait(None) # sign the end of Queue # for item in iter(URL_QUEUE.get_nowait, None): # cur_url = item[2] URL_DEQUE.appendleft(None) for item in iter(URL_DEQUE.pop, None): cur_url = item[2] if (cur_url in checked_url_pool) == False: # cur_url never checked try: time.sleep(0.3) page_html_raw = requests.get(cur_url, timeout=3) except requests.RequestException as e: print(e) # URL_DEQUE.appendleft(cur_url) with open(path_requestErr_log, 'a') as f_requestErr: f_requestErr.write( time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + "Timeout " + cur_url + '\n') else: page_html = page_html_raw.content.decode('utf-8', 'ignore') buffer = parser4me.parser_4_1(item, page_html) with open(path_output_folder + os.path.sep + item[1] + item[0][0:128] + ".txt", 'w', encoding='utf-8') as resf: resf.write(buffer) print("%s OK! to file %s" % (i, item[0])) checked_url_pool.add(cur_url) i += 1 else: print("Skip %s" % i) i += 1 with open(path_checked_url_file, 'wb') as wf: checked_url_pool.tofile(wf)
def __init__(self): super(StreamingTriangles, self).__init__() # Set up connection to Redis server self.redis_server = 'localhost' self.redis_db = redis.StrictRedis(host=self.redis_server, port=6379, db=0) # Initialize reservoir sizes self.edge_res_size = 40000 self.wedge_res_size = 40000 # Set Scalable Bloom Filter for ignoring repeated edges self.bloom_filter = ScalableBloomFilter( mode=ScalableBloomFilter.SMALL_SET_GROWTH) # Init counters and arrays for Streaming-Triangles algorithm self.edge_count = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0} self.total_wedges = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0} self.edge_res = { RED: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)], BLUE: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)], YELLOW: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)], GREEN: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)] } self.wedge_res = { RED: [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)], BLUE: [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)], YELLOW: [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)], GREEN: [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)] } self.is_closed = { RED: [False for _ in xrange(self.wedge_res_size)], BLUE: [False for _ in xrange(self.wedge_res_size)], YELLOW: [False for _ in xrange(self.wedge_res_size)], GREEN: [False for _ in xrange(self.wedge_res_size)] } # Track percent of uncategorized transactions self.num_missed = 0 self.num_colored = 0
def second_new_warn_entity():
    minDates = getMinDate(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE)
    row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
    b7 = ScalableBloomFilter(100000, 0.001)
    b30 = ScalableBloomFilter(100000, 0.001)
    b90 = ScalableBloomFilter(100000, 0.001)
    for i, k in minDates.items():
        dateTime = datetime.strptime(k, '%Y-%m-%d')
        dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
        if 0 <= dValue < 7:
            b7.add(i)
        if 0 <= dValue < 30:
            b30.add(i)
        if 0 <= dValue < 90:
            b90.add(i)
    result90 = secondDetectData(90, TABLE_ENTITY_LIST, TABLE_MONITOR,
                                TABLE_GONGSHANG, RISK_LEVEL, ILLEGAL_SCORE,
                                'all', 0, 0, 'all', 'all',
                                TABLE_INDEX_QUANTILE, TABLE_GUARANTEE_PROMISE)
    count7 = 0
    count30 = 0
    count90 = 0
    resultIds = []
    for each in result90:
        if each['id'] not in resultIds:
            resultIds.append(each['id'])
    for id in resultIds:
        if id in b7:
            count7 += 1
        if id in b30:
            count30 += 1
        if id in b90:
            count90 += 1
    result = {'count7': count7, 'count30': count30, 'count90': count90}
    return json.dumps(result, ensure_ascii=False)
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Converts the iterable into a ScalableBloomFilter

    :param iterable: items to add to the filter
    :param init_cap: initial capacity of the filter
    :param err_rate: target false-positive rate
    :rtype: pybloom.ScalableBloomFilter
    """
    bloom = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        bloom.add(element)
    return bloom
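# Example use of to_bloomfilter: build a filter from a list of IDs and use it
# for fast membership checks (with the usual small false-positive chance).
seen = to_bloomfilter(['user:1', 'user:2', 'user:3'])
'user:2' in seen      # True
'user:42' in seen     # False (almost always)
len(seen)             # 3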
def __init__(self, bloomfile, spider_name):
    self.bloomfile = bloomfile
    self.spider_name = spider_name

    # items crawled before
    logger.info("loading crawled items before...")
    if os.path.isfile(self.bloomfile):
        f = open(self.bloomfile, 'rb')  # bloom files are binary
        self.item_crawled = ScalableBloomFilter.fromfile(f)
        f.close()
    else:
        self.item_crawled = ScalableBloomFilter(
            100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    cnt = self.item_crawled.count
    logger.info("pipeline read %d crawled items" % cnt)
def __init__(self, city):
    """Douban page scraper: fetches the now-playing list and the movie
    detail pages.

    :param city: the city whose movie data is scraped.
    """
    self._url = 'https://movie.douban.com/cinema/nowplaying/{}/'.format(
        city.lower())
    # Request headers for the movie list page
    self._list_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
    }
    # Request headers for the movie detail page
    self._info_headers = self._list_headers.copy()
    self._info_headers.update({'Referer': self._url})
    # Use a Bloom filter to deduplicate movie URLs
    self._bf = ScalableBloomFilter()
    cfg = ConfigParser()
    cfg.read('config.ini')
    db_host = cfg.get('database', 'host')
    db_port = cfg.getint('database', 'port')
    db_dbname = cfg.get('database', 'database')
    db_collection = cfg.get('database', 'collection')
    self._db = MongoClient(db_host, db_port)[db_dbname][db_collection]
    for movie in self._db.find({}):
        self.logger.debug('get {} in database'.format(movie['url']))
        self._bf.add(movie['url'])
def add_sbf(self, query=None):
    '''
    params: query -> MySQL query whose result ids are used to filter out
                     tasks that have already been processed
    '''
    if query is None:
        return None
    sbf = ScalableBloomFilter()
    table = Table(logger=self.logger)
    result_dict = table.execute(query=query)
    data = result_dict.get('data')
    for each in data:
        id = each.get('id')
        sbf.add(int(id))
    table.close()
    return sbf
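# Hypothetical usage of add_sbf (worker is an instance of the class above,
# and the query string is an assumption): load already-processed ids once,
# then skip them while generating new tasks.
done = worker.add_sbf(query="SELECT id FROM task_result")
for task_id in range(1, 1000):
    if done is not None and task_id in done:
        continue             # already handled (subject to false positives)
    handle(task_id)          # hypothetical task handler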
def get_province_rank(table, table4, field, risk_level): cur = defaultDatabase() list = [] province_list = [] sql = "select max(date) from %s" % table cur.execute(sql) end_time = cur.fetchall()[0][0] start0_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=7) start1_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=30) start_time0 = start0_time.strftime("%Y-%m-%d") start_time1 = start1_time.strftime("%Y-%m-%d") sql1 = 'select gs.province,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province' % ( table, table4, table4, start_time0, end_time, risk_level) cur.execute(sql1) res1 = cur.fetchall() result1 = [{k: row[i] for i, k in enumerate(field)} for row in res1] sql2 = 'select gs.province,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province' % ( table, table4, table4, start_time1, end_time, risk_level) cur.execute(sql2) res2 = cur.fetchall() result2 = [{k: row[i] for i, k in enumerate(field)} for row in res2] result = result1 + result2 b = ScalableBloomFilter(1000000, 0.001) for p in result: if not p['province'] in b: [b.add(p['province'])] province_list.append(p['province']) for d in province_list: if d: pro_dict = {"province": d} for dict in result1: if dict['province'] == d: pro_dict.update({'count7': dict['count']}) for dict in result2: if dict['province'] == d: pro_dict.update({'count30': dict['count']}) list.append(pro_dict) for li in list: try: if li['count7']: pass except: li['count7'] = 0 return list
def getHotSpot(entity_list): type = 'type1' results = [] number = 0 for dict in entity_list: indexB = ScalableBloomFilter(1000,0.001) for index_name in ['bbs','forum','webo']: query_body = { "sort":{"publish_time":{"order":"desc"}}, "query": { "bool": { "must": [ { "match": { "content": dict['name'] } }, { "match": { "em1": 1 } } ] } } } res = es.search(index=index_name, doc_type=type, body=query_body, request_timeout=100) hits = res['hits']['hits'] if(len(hits)): for item in hits: if dict['name'] in item['_source']['content']: if not index_name in indexB: if number < 10: id = dict['id'] entity_name = dict['name'] entity_type = dict['entity_type'] content = item['_source']['content'] results.append({'id':id,'name':entity_name,'content':content,'entity_type':entity_type}) [indexB.add(index_name)] number += 1 if not number < 10: break return results