Code example #1
def second_new_warn_entity():
    minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                           ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
    row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
    b7 = ScalableBloomFilter(100000, 0.001)
    b30 = ScalableBloomFilter(100000, 0.001)
    b90 = ScalableBloomFilter(100000, 0.001)
    for i, k in minDates.items():
        dateTime = datetime.strptime(k, '%Y-%m-%d')
        dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
        if 0 <= dValue < 7:
            b7.add(i)
        if 0 <= dValue < 30:
            b30.add(i)
        if 0 <= dValue < 90:
            b90.add(i)
    result90 = secondDetectFromBigTable(90, TABLE_REPORT_ILLEGAL, RISK_LEVEL,
                                        ILLEGAL_SCORE, 'all', 0, 0, 'all',
                                        'all', TABLE_LOGS, 'all')
    count7 = 0
    count30 = 0
    count90 = 0
    resultIds = []
    for each in result90:
        if not each['entity_id'] in resultIds:
            resultIds.append(each['entity_id'])
    for id in resultIds:
        if id in b7:
            count7 += 1
        if id in b30:
            count30 += 1
        if id in b90:
            count90 += 1
    result = {'count7': count7, 'count30': count30, 'count90': count90}
    return json.dumps(result, ensure_ascii=False)
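Most of the examples on this page lean on the same two operations: add() to record an element and the in operator for a probabilistic membership test (rare false positives, no false negatives). Below is a minimal standalone sketch of that pattern, assuming the pybloom-style ScalableBloomFilter API used in these excerpts; the import line and the sample ids are assumptions, not part of any example here.

from pybloom_live import ScalableBloomFilter  # or: from pybloom import ScalableBloomFilter

sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)
for entity_id in ['a', 'b', 'a']:
    if entity_id in sbf:
        print('seen before:', entity_id)  # may very rarely be a false positive
    else:
        sbf.add(entity_id)                # add() returns True if the key was already (probably) present
print(sbf.count)                          # number of keys added, here 2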
Code example #2
def against_detect_data_from_bigtable():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')
    problem = request.args.get('problem', '')
    newEntity = int(request.args.get('newEntity', ''))
    fund_mode = request.args.get('fund_mode', '')
    result = againstDetectDataFromBigTable(date, TABLE_REPORT_ILLEGAL,
                                           RISK_LEVEL, ILLEGAL_SCORE,
                                           operation_mode, illegal_type,
                                           entity_type, warn_distribute,
                                           problem, TABLE_LOGS, fund_mode)
    # Merge duplicate rows (same entity)
    doubleId = []
    for dict in result:
        if dict['entity_id'] not in b:
            b.add(dict['entity_id'])
        else:
            doubleId.append(dict['entity_id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        for dict in result[:]:
            if dict['entity_id'] == id:
                num += 1
                illegalTypeList.append(dict['illegal_type'])
                dict.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(dict)
    # Filter for newly added entities
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                               ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
            if 0 <= dValue < date:
                bb.add(i)
        for dict in result:
            if dict['entity_id'] in bb:
                newResult.append(dict)
        # The frontend sends 'id', so add an 'id' field to avoid errors
        for dict in result:
            dict.update({'id': dict['entity_id']})
        return json.dumps(newResult, ensure_ascii=False)
    try:
        result.sort(key=lambda x: x['datetime'], reverse=True)
    except Exception:
        pass
    # The frontend sends 'id', so add an 'id' field to avoid errors
    for dict in result:
        dict.update({'id': dict['entity_id']})
    return json.dumps(result, ensure_ascii=False)
Code example #3
File: crawler.py Project: xssmap/xssmap
 def __init__(self, domain, threads, depth, times, headers, father):
     self.domain = domain
     if self.domain.endswith('/'):
         self.domain = self.domain[:-1]
     self.threads = threads
     self.times = times
     self.cookies = {}
     self.headers = {}
     self.count = 0
     self.controlthread = 0
     self.depth = depth
     self.father = father
     self.realdomain = ''
     self.payload = Payload()
     self.encode = Encode()
     if headers != '':
         self.setheader(headers)
     if 'https' in self.domain:
         self.domain1 = self.domain.replace('https://', '')
         self.domain2 = 'http://' + self.domain1
         self.domain3 = 'http%3A%2F%2F' + self.domain1
         self.domain4 = 'https%3A%2F%2F' + self.domain1
     elif 'http' in self.domain:
         self.domain1 = self.domain.replace('http://', '')
         self.domain2 = 'https://' + self.domain1
         self.domain3 = 'http%3A%2F%2F' + self.domain1
         self.domain4 = 'https%3A%2F%2F' + self.domain1
     else:
         self.domain1 = 'http://' + self.domain
         self.domain2 = 'https://' + self.domain
         self.domain3 = 'http%3A%2F%2F' + self.domain
         self.domain4 = 'https%3A%2F%2F' + self.domain
     self.queue = Queue()
     self.urlqueue = Queue()
     self.lock = threading.RLock()
     self.lock2 = threading.RLock()
     self.lock3 = threading.RLock()
     self.lock4 = threading.RLock()
     self.lock5 = threading.RLock()
     self.bloomfilter = ScalableBloomFilter(
         initial_capacity=10000,
         error_rate=0.001,
         mode=ScalableBloomFilter.LARGE_SET_GROWTH)
     self.bloomfilter2 = ScalableBloomFilter(
         initial_capacity=10000,
         error_rate=0.001,
         mode=ScalableBloomFilter.LARGE_SET_GROWTH)
     self.blacklist = [
         '<', '{', '\'', '"', '.css', '.jpg', '.mp4', '.png', '.gif',
         '.avi', '.jpeg', '.ico', '.mp3', '.pdf', 'docx', 'doc', 'bmp',
         '.rmvb', '.zip', '.rar', '.exe', '.ppt', '.pptx', 'xls'
     ]
     self.rule = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
Code example #4
def second_detect_data():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')
    problem = request.args.get('problem', '')
    newEntity = int(request.args.get('newEntity', ''))
    result = secondDetectData(date, TABLE_ENTITY_LIST, TABLE_MONITOR,
                              TABLE_GONGSHANG, RISK_LEVEL, ILLEGAL_SCORE,
                              operation_mode, illegal_type, entity_type,
                              warn_distribute, problem, TABLE_INDEX_QUANTILE,
                              TABLE_GUARANTEE_PROMISE, TABLE_LOGS)
    doubleId = []
    for dict in result:
        if dict['id'] not in b:
            b.add(dict['id'])
        else:
            doubleId.append(dict['id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        for dict in result[:]:
            if dict['id'] == id:
                num += 1
                illegalTypeList.append(dict['illegal_type'])
                dict.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(dict)
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                               ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
            if dValue < date:
                bb.add(i)
        for dict in result:
            if dict['id'] in bb:
                newResult.append(dict)
        return json.dumps(newResult, ensure_ascii=False)
    try:
        result.sort(key=lambda x: x['datetime'], reverse=True)
    except Exception:
        pass
    return json.dumps(result, ensure_ascii=False)
Code example #5
def total_detect_data_test():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')  # multi-select
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')  # multi-select
    problem = request.args.get('problem', '')  # multi-select
    newEntity = int(request.args.get('newEntity', ''))
    checked = int(request.args.get('checked', ''))
    fund_mode = request.args.get('fund_mode', '')
    result = totalDetectDataFromBigTable(date, TABLE_REPORT_ILLEGAL,
                                         operation_mode, illegal_type,
                                         entity_type, warn_distribute, problem,
                                         checked, fund_mode)
    # Merge rows for the same entity that differ only in illegal_type
    doubleId = []
    for dict in result:
        if dict['entity_id'] not in b:
            b.add(dict['entity_id'])
        else:
            doubleId.append(dict['entity_id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        for dict in result[:]:
            if dict['entity_id'] == id:
                num += 1
                illegalTypeList.append(dict['illegal_type'])
                dict.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(dict)
    # Filter for newly added entities
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                               ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
            if 0 <= dValue < date:
                bb.add(i)
        for dict in result:
            if dict['entity_id'] in bb:
                newResult.append(dict)
        return json.dumps(newResult, ensure_ascii=False)
    return json.dumps(result, ensure_ascii=False)
Code example #6
File: task_tool.py Project: chinaylssly/aastory
    def generate_task(
        self,
        generate_func_name,
        g_kw={},
        sleep=180,
        times=20,
    ):
        '''
        params: generate_func_name -> name of the task-generating function
        params: g_kw -> keyword arguments for generate_func
        params: sleep, times -> run generate_func once every `sleep` seconds; `times` is the number of runs

        Task generator; generate_func can be executed repeatedly. If repeated runs are not needed, simply set times to 1.
        '''
        if self.is_filter:
            self.sbf = ScalableBloomFilter()
        else:
            self.sbf = None

        table = Table(logger=self.logger)
        generate_func = getattr(table, generate_func_name)
        e_kw = dict(
            generate_func=generate_func,
            g_kw=g_kw,
        )

        self.loop_task(execute_func=self.core_generate_task,
                       e_kw=e_kw,
                       flag=1,
                       sleep=sleep,
                       times=times)
        table.close()
Code example #7
File: redis_cookies.py Project: zhujiajunup/graduate
class RedisJob(object):
    redis_pool = redis.ConnectionPool(host='localhost', port=6379, db=1)
    url_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    @classmethod
    def push_job(cls, job_type, job_info):

        if 'url' in job_info:
            if job_info['url'] not in cls.url_filter:
                cls.url_filter.add(job_info['url'])
                r = redis.Redis(connection_pool=cls.redis_pool)
                r.lpush(str(job_type), json.dumps(job_info))
                LOGGER.info("push %s job into redis: %s" %
                            (job_type, str(job_info)))
            else:
                LOGGER.warning("%s job filtered. %s" % (job_type, str(job_info)))
        else:
            r = redis.Redis(connection_pool=cls.redis_pool)
            r.lpush(str(job_type), json.dumps(job_info))
            LOGGER.info("push %s job into redis: %s" %
                        (job_type, str(job_info)))

    @classmethod
    def fetch_job(cls, job_type):
        r = redis.Redis(connection_pool=cls.redis_pool)
        job_info = r.lpop(job_type)
        if job_info:
            LOGGER.info('fetched job: %s' % job_info)
            return json.loads(job_info)
        else:
            return None
Code example #8
    def __init__(self, tasks=2, loop=None):
        self.tasks = tasks
        self.loop = loop or asyncio.get_event_loop()
        self.redis_cookie = RedisCookie()
        self.redis_job = RedisJob()
        self.bloom_filter = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.weibo_limit = True
        self.time_current_pattern = re.compile(r'(\d*)分钟前')
        self.time_today_pattern = re.compile(r'今天\s*(\d*):(\d*)')
        self.time_year_pattern = re.compile(r'(\d*)月(\d*)日\s*(\d*):(\d*)')
        self.user_id_pattern = re.compile(r'https://weibo.cn/u/(\d*)')
        self.weibo_host = 'https://weibo.cn'
        self.follow_url = self.weibo_host + '/%s/follow'

        self.fan_url = self.weibo_host + '/%s/fans'
        self.user_info_url = self.weibo_host + '/%s/info'
        self.user_tweet_url = self.weibo_host + '/%s'
        self.user_tweet_url2 = self.weibo_host + '/%s?page=%d'
        self.user_repost_url = self.weibo_host + '/repost/%s'
        self.user_repost_url2 = self.weibo_host + '/repost/%s?page=%d'
        self.tweet_comment_url = self.weibo_host + '/comment/%s'
        self.tweet_comment_url2 = self.weibo_host + '/comment/%s?page=%d'
        self.weibo_producer = WeiboProcuder(['localhost:9092'], 'sinaweibo')
        self.search_url = 'https://weibo.cn/search/?pos=search'
        self.get_search_url = 'https://weibo.cn/search/mblog/?keyword=%s&filter=hasori'
Code example #9
File: ddup.py Project: shkarupa-alex/nlpclean
def dedup_lines_bloom(text,
                      just_words=True,
                      zero_digits=True,
                      capacity=100000,
                      error=0.00001):
    sbf = ScalableBloomFilter(initial_capacity=capacity,
                              error_rate=error,
                              mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    for line in text:
        if not isinstance(line, str):
            raise TypeError(
                'Expected "text" to contain strings, found: {}'.format(
                    type(line)))

        key = line.strip()
        if not key:
            yield line
            continue

        key = normalize('NFKD', key)

        if just_words:
            key = ' '.join(re.findall(r'\w+', key))
        if zero_digits:
            key = re.sub(r'\d', '0', key)

        if key in sbf:
            line = ''
        else:
            sbf.add(key)

        yield line
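A short usage sketch for dedup_lines_bloom above; the sample lines are made up. With just_words=True and zero_digits=True the first two lines normalize to the same key, so the duplicate comes back as an empty string and the output keeps the original line count.

lines = ['Hello world 1', 'Hello  world 2', 'something else']
print(list(dedup_lines_bloom(lines)))
# expected: ['Hello world 1', '', 'something else']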
Code example #10
  def add_to_filter(self, update=False):
    # https://github.com/bigsnarfdude/Malware-Probabilistic-Data-Structres/blob/master/Mandiant_MD5_BloomFilter.py
    def stream_lines(filename):
      file = open(filename)
      while True:
        line = file.readline()
        if not line:
          file.close()
          break
        yield line.strip()

    def load_file(filename):
      lines = stream_lines(filename)
      templist = []
      for line in lines:
        templist.append(line)

      return templist

    itemlist = load_file(self.datafile)
    self.itemcount = len(itemlist)

    if not update:
      # reinitialize filter before adding a new set of items
      self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    for item in itemlist:
      _ = self.filter.add(item)
Code example #11
File: dbpedia.py Project: we1l1n/SQG
    def __init__(self,
                 endpoint=config.config['general']['dbpedia']['endpoint'],
                 one_hop_bloom_file=config.config['general']['dbpedia']
                 ['one_hop_bloom_file'],
                 two_hop_bloom_file=config.config['general']['dbpedia']
                 ['two_hop_bloom_file']):
        super(DBpedia, self).__init__(endpoint)
        self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
        if os.path.exists(one_hop_bloom_file):
            with open(one_hop_bloom_file, 'rb') as bloom_file:
                self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
        else:
            self.one_hop_bloom = None
        self.two_hop_bloom_file = two_hop_bloom_file

        self.two_hop_bloom = dict()
        for item in [True, False]:
            file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
            if os.path.exists(file_path):
                with open(file_path, 'rb') as bloom_file:
                    self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(
                        bloom_file)
            else:
                self.two_hop_bloom[item] = ScalableBloomFilter(
                    mode=ScalableBloomFilter.LARGE_SET_GROWTH)

        self.two_hop_bloom_counter = 0
Code example #12
File: bloomset.py Project: goelyash/Spider
 def boot1(self):
     try:
         self.multiFile.seek(0)
         a = ScalableBloomFilter.fromfile(self.multiFile)
         return a
     except Exception:
         return ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
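Several of these examples (this one and examples 15, 22 and 26) persist the filter across runs with tofile()/fromfile(). Here is a minimal round-trip sketch, assuming the same pybloom-style API; the file name is hypothetical, and the file must be opened in binary mode (boot1 above reuses an already-open handle and just rewinds it with seek(0)).

sbf = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
sbf.add('https://example.com/')

with open('seen_urls.bloom', 'wb') as f:   # hypothetical file name
    sbf.tofile(f)

with open('seen_urls.bloom', 'rb') as f:
    restored = ScalableBloomFilter.fromfile(f)

print('https://example.com/' in restored)  # True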
Code example #13
 def __init__(self, datafile, filterfile):
   # https://github.com/jaybaird/python-bloomfilter/blob/master/pybloom/pybloom.py
   self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
   self.datafile = datafile
   self.filterfile = filterfile
   self.datafilesize = None
   self.filterfilesize = None
   self.change = None
Code example #14
File: wishlogin.py Project: chntylz/wish
class WishLoginSpider(scrapy.Spider):
    name = "wishlogin"
    allowed_domains = ["wish.com"]
    start_urls = ('http://www.wish.com/', )

    merchants = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    xsrfpattern = re.compile(r'.*_xsrf=(.*?);')

    def __init__(self, username, password, ajaxcount=100):
        self.username = username
        self.password = password
        self.ajaxcount = ajaxcount

        from scrapy import optional_features
        optional_features.remove('boto')

    def start_requests(self):
        yield scrapy.Request('https://www.wish.com/', callback=self.login)

    def login(self, response):
        match = self.xsrfpattern.match(str(response.headers))

        if match:
            xsrf = match.group(1)

            body = urlencode({
                'email': self.username,
                'password': self.password,
                '_buckets': '',
                '_experiments': '',
            })

            print(body)

            request = scrapy.Request(
                'https://www.wish.com/api/email-login',
                method='POST',
                headers={
                    'Accept': 'application/json, text/javascript, */*; q=0.01',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6',
                    'Content-Type':
                    'application/x-www-form-urlencoded; charset=UTF-8',
                    'X-Requested-With': 'XMLHttpRequest',
                    'X-XSRFToken': xsrf,
                },
                body=body,
                meta={'xsrf': xsrf},
                callback=self.request_tab)

            print(request.headers)

            yield request

    def request_tab(self, response):
        print(response.body)
Code example #15
 def open_spider(self, spider):
     self.fileName = spider.name + self.fileName
     if os.path.exists(self.fileName):
         with open(self.fileName, 'rb') as f:
             self.sbf = ScalableBloomFilter.fromfile(f)
     else:
         self.sbf = ScalableBloomFilter(
             mode=ScalableBloomFilter.LARGE_SET_GROWTH)
Code example #16
 def __init__(self, withDistinct=None):
     super(DistinctElementCount, self).__init__()
     self.count = 0
     self.bloom = None
     self.set = None
     if withDistinct:
         self.bloom = ScalableBloomFilter(error_rate=0.00001)
         self.distinct = 0
         self.set = set([])
Code example #17
class RedisJob(object):
    _pool = None

    url_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def __init__(self, **kwargs):
        self._host = kwargs.get('host', 'redis://localhost:6378')
        self._db = kwargs.get('db', 1)
        self._minsize = kwargs.get('minsize', 5)
        self._maxsize = kwargs.get('maxsize', 10)

    async def init_pool(self):
        LOGGER.info(
            "init redis pool (host: %s, db: %d, minsize: %d, maxsize: %d)" %
            (self._host, self._db, self._minsize, self._maxsize))
        self._pool = await aioredis.create_pool(self._host,
                                                db=self._db,
                                                minsize=self._minsize,
                                                maxsize=self._maxsize)

    async def push_job(self, job_type, job_info):
        if not self._pool:
            await self.init_pool()
        url = job_info.get('url', '')
        if url and url in self.url_filter:
            LOGGER.warning("%s job filtered. %s" % (job_type, str(job_info)))
            return
        else:
            self.url_filter.add(url)
        with await self._pool as conn:
            await conn.execute('lpush', str(job_type), json.dumps(job_info))
            LOGGER.info("push %s job into redis: %s" %
                        (job_type, str(job_info)))

    async def fetch_job(self, job_type):
        if not self._pool:
            await self.init_pool()
        with await self._pool as conn:
            job_info = await conn.execute('rpop', job_type)
            if job_info:
                LOGGER.info('fetched job: %s' % job_info)
                return json.loads(job_info)
            else:
                return None

    async def clean(self):
        if not self._pool:
            await self.init_pool()
        with await self._pool as conn:
            keys = await conn.execute('keys', '*')
            for key in keys:
                LOGGER.info("del %s" % key)
                await conn.execute('del', key)
Code example #18
File: bfilter.py Project: hamedhsn/crawler
    def __init__(self,
                 capacity=None,
                 error_rate=0.001,
                 mode=ScalableBloomFilter.LARGE_SET_GROWTH):

        self.capacity = capacity

        if capacity is None:
            self.bf = ScalableBloomFilter(mode=mode)
        else:
            self.bf = BloomFilter(capacity=capacity, error_rate=error_rate)
Code example #19
 def __init__(self, BFsNo, deltas, modelsNo=3):
     '''create bloom filters'''
     BFs = []
     self.deltas = deltas
     self.modelsNo = modelsNo
     self.BFsNo = BFsNo
     deltas_no = len(deltas)
     for i in range(deltas_no * BFsNo):
         bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
         BFs.append(bf)
     self.BFs = np.array(BFs)
     self.BFs = self.BFs.reshape(deltas_no, BFsNo)
Code example #20
def get_city_rank(table, table4, field, province_name, risk_level):
    cur = defaultDatabase()
    city_list = []
    list = []
    province_list = []
    sql = "select max(date) from %s" % table
    cur.execute(sql)
    end_time = cur.fetchall()[0][0]
    start_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=7)
    start_time = start_time.strftime("%Y-%m-%d")
    start1_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=30)
    start_time1 = start1_time.strftime("%Y-%m-%d")
    sql1 = 'select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province,city' % (
        table, table4, table4, start_time, end_time, risk_level)
    cur.execute(sql1)
    res1 = cur.fetchall()
    result1 = [{k: row[i] for i, k in enumerate(field)} for row in res1]
    sql2 = 'select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province,city' % (
        table, table4, table4, start_time1, end_time, risk_level)
    cur.execute(sql2)
    res2 = cur.fetchall()
    result2 = [{k: row[i] for i, k in enumerate(field)} for row in res2]
    result = result1 + result2
    b = ScalableBloomFilter(1000000, 0.001)
    for p in result:
        if p['city'] not in b:
            b.add(p['city'])
            city_list.append({'province': p['province'], 'city': p['city']})
    for d in city_list:
        if not d['province'] in province_list:
            province_list.append(d['province'])
    if province_name:
        for d in city_list:
            if d['province'] == province_name and d['city']:
                pro_dict = {"province": d['province'], "city": d['city']}
                for dict in result1:
                    if dict['city'] == d['city']:
                        pro_dict.update({'count7': dict['count']})
                for dict in result2:
                    if dict['city'] == d['city']:
                        pro_dict.update({'count30': dict['count']})
                list.append(pro_dict)
    if not province_name:
        for p in province_list:
            if p:
                pro_dict = {"province": p}
                count = 0
                for dict in result1:
                    if dict['province'] == p:
                        count += dict['count']
                pro_dict.update({"count": count})
                list.append(pro_dict)
    return list
Code example #21
    def __init__(self, model, dummy=False):
        super(ScrapeWorker, self).__init__()

        self.source_q = Queue()
        self.parse_q = Queue()
        self.seen = ScalableBloomFilter()
        self.forwarded = ScalableBloomFilter()
        self.new_sources = []
        self.workers = []
        self.to_forward = []
        self.parser = None
        self.done_parsing = False
        self.no_more_sources = False
        self.dbs = dict()
        self.schedule = model.schedule
        self.model = model
        self.source_kill = None
        self.dummy = dummy

        db_threads = defaultdict(list)

        # Check if the functions in each template are used properly
        # and store which types of databases are needed.
        for phase in self.model.phases:
            for template in phase.templates:
                self.check_functions(template, phase)
                if template.db_type:
                    db_threads[template.db_type].append(template)

        # Start all the threads necessary for storing the data and give each
        # template a reference to the thread it needs to store data in.
        for thread, templates in db_threads.items():
            if not dummy:
                store_thread = databases._threads[thread]()
            else:
                store_thread = databases._threads['dummy']()

            for template in templates:
                self.dbs[template.name] = store_thread
            store_thread.start()
Code example #22
File: main.py Project: ttttttboy/py1
def ParseQueue():
    # Load Checked Urls File
    if os.path.isfile(path_checked_url_file):
        with open(path_checked_url_file, 'rb') as rf:
            checked_url_pool = ScalableBloomFilter.fromfile(rf)
            print("bf: Read pybloom from %s.\n" % path_checked_url_file)
    else:
        checked_url_pool = ScalableBloomFilter(
            initial_capacity=1000,
            error_rate=0.001,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        print("bf: Create pybloom")

    # Get each Item from Queue
    i = 1
    # URL_QUEUE.put_nowait(None)  # sign the end of Queue
    # for item in iter(URL_QUEUE.get_nowait, None):
    #     cur_url = item[2]
    URL_DEQUE.appendleft(None)
    for item in iter(URL_DEQUE.pop, None):
        cur_url = item[2]

        if cur_url not in checked_url_pool:  # cur_url never checked
            try:
                time.sleep(0.3)
                page_html_raw = requests.get(cur_url, timeout=3)
            except requests.RequestException as e:
                print(e)
                # URL_DEQUE.appendleft(cur_url)
                with open(path_requestErr_log, 'a') as f_requestErr:
                    f_requestErr.write(
                        time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time())) +
                        "Timeout " + cur_url + '\n')
            else:
                page_html = page_html_raw.content.decode('utf-8', 'ignore')
                buffer = parser4me.parser_4_1(item, page_html)
                with open(path_output_folder + os.path.sep + item[1] +
                          item[0][0:128] + ".txt",
                          'w',
                          encoding='utf-8') as resf:
                    resf.write(buffer)
                    print("%s OK! to file %s" % (i, item[0]))
                checked_url_pool.add(cur_url)
                i += 1
        else:
            print("Skip %s" % i)
            i += 1

        with open(path_checked_url_file, 'wb') as wf:
            checked_url_pool.tofile(wf)
Code example #23
    def __init__(self):
        super(StreamingTriangles, self).__init__()

        # Set up connection to Redis server
        self.redis_server = 'localhost'
        self.redis_db = redis.StrictRedis(host=self.redis_server,
                                          port=6379,
                                          db=0)

        # Initialize reservoir sizes
        self.edge_res_size = 40000
        self.wedge_res_size = 40000

        # Set Scalable Bloom Filter for ignoring repeated edges
        self.bloom_filter = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        # Init counters and arrays for Streaming-Triangles algorithm
        self.edge_count = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0}

        self.total_wedges = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0}

        self.edge_res = {
            RED: [[0, 0] for _ in range(self.edge_res_size)],
            BLUE: [[0, 0] for _ in range(self.edge_res_size)],
            YELLOW: [[0, 0] for _ in range(self.edge_res_size)],
            GREEN: [[0, 0] for _ in range(self.edge_res_size)]
        }

        self.wedge_res = {
            RED: [[0, 0, 0] for _ in range(self.wedge_res_size)],
            BLUE: [[0, 0, 0] for _ in range(self.wedge_res_size)],
            YELLOW: [[0, 0, 0] for _ in range(self.wedge_res_size)],
            GREEN: [[0, 0, 0] for _ in range(self.wedge_res_size)]
        }

        self.is_closed = {
            RED: [False for _ in range(self.wedge_res_size)],
            BLUE: [False for _ in range(self.wedge_res_size)],
            YELLOW: [False for _ in range(self.wedge_res_size)],
            GREEN: [False for _ in range(self.wedge_res_size)]
        }

        # Track percent of uncategorized transactions
        self.num_missed = 0
        self.num_colored = 0
Code example #24
File: views.py Project: y-xerxes/itfin
def second_new_warn_entity():
    minDates = getMinDate(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE)
    row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
    b7 = ScalableBloomFilter(100000, 0.001)
    b30 = ScalableBloomFilter(100000, 0.001)
    b90 = ScalableBloomFilter(100000, 0.001)
    for i, k in minDates.items():
        dateTime = datetime.strptime(k, '%Y-%m-%d')
        dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
        if 0 <= dValue < 7:
            b7.add(i)
        if 0 <= dValue < 30:
            b30.add(i)
        if 0 <= dValue < 90:
            b90.add(i)
    result90 = secondDetectData(90, TABLE_ENTITY_LIST, TABLE_MONITOR,
                                TABLE_GONGSHANG, RISK_LEVEL, ILLEGAL_SCORE,
                                'all', 0, 0, 'all', 'all',
                                TABLE_INDEX_QUANTILE, TABLE_GUARANTEE_PROMISE)
    count7 = 0
    count30 = 0
    count90 = 0
    resultIds = []
    for each in result90:
        if not each['id'] in resultIds:
            resultIds.append(each['id'])
    for id in resultIds:
        if id in b7:
            count7 += 1
        if id in b30:
            count30 += 1
        if id in b90:
            count90 += 1
    result = {'count7': count7, 'count30': count30, 'count90': count90}
    return json.dumps(result, ensure_ascii=False)
Code example #25
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Converts the iterable into a ScalableBloomFilter
    
    :rtype : pybloom.ScalableBloomFilter
    :param iterable:
    :param init_cap:
    :param err_rate:
    """

    bloom = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        bloom.add(element)

    return bloom
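A brief usage sketch for to_bloomfilter above; the sample values are made up. Membership tests on the returned filter can produce rare false positives but never false negatives.

seen = to_bloomfilter(['user:1', 'user:2'])
print('user:1' in seen)   # True
print('user:3' in seen)   # False (barring a false positive)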
Code example #26
File: pipelines.py Project: tousyou/SocialSpider
    def __init__(self, bloomfile, spider_name):
        self.bloomfile = bloomfile
        self.spider_name = spider_name

        # item crawled before
        logger.info("loading crawled items before...")

        if os.path.isfile(self.bloomfile):
            with open(self.bloomfile, 'rb') as f:
                self.item_crawled = ScalableBloomFilter.fromfile(f)
        else:
            self.item_crawled = ScalableBloomFilter(
                100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        cnt = self.item_crawled.count
        logger.info("pipline read %d crawled items" % cnt)
Code example #27
    def __init__(self, city):
        """豆瓣页面抓取,抓取正在上映列表和电影介绍页。

        :param city: 抓取影片数据的城市。
        """
        self._url = 'https://movie.douban.com/cinema/nowplaying/{}/'.format(
            city.lower())
        # Request headers for the movie list page
        self._list_headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Host':
            'movie.douban.com',
            'Referer':
            'https://movie.douban.com/',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
        }
        # Request headers for the movie detail page
        self._info_headers = self._list_headers.copy()
        self._info_headers.update({'Referer': self._url})
        # Deduplicate with a Bloom filter
        self._bf = ScalableBloomFilter()

        cfg = ConfigParser()
        cfg.read('config.ini')
        db_host = cfg.get('database', 'host')
        db_port = cfg.getint('database', 'port')
        db_dbname = cfg.get('database', 'database')
        db_collection = cfg.get('database', 'collection')

        self._db = MongoClient(db_host, db_port)[db_dbname][db_collection]
        for movie in self._db.find({}):
            self.logger.debug('get {} in database'.format(movie['url']))
            self._bf.add(movie['url'])
Code example #28
File: task_tool.py Project: chinaylssly/aastory
    def add_sbf(self, query=None):
        '''
        params: query -> MySQL query statement
        Filters task-processing results
        '''

        if query is None:
            return None

        sbf = ScalableBloomFilter()
        table = Table(logger=self.logger)
        result_dict = table.execute(query=query)
        data = result_dict.get('data')
        for each in data:
            id = each.get('id')
            sbf.add(int(id))
        table.close()
        return sbf
Code example #29
def get_province_rank(table, table4, field, risk_level):
    cur = defaultDatabase()
    list = []
    province_list = []
    sql = "select max(date) from %s" % table
    cur.execute(sql)
    end_time = cur.fetchall()[0][0]
    start0_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=7)
    start1_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=30)
    start_time0 = start0_time.strftime("%Y-%m-%d")
    start_time1 = start1_time.strftime("%Y-%m-%d")
    sql1 = 'select gs.province,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province' % (
        table, table4, table4, start_time0, end_time, risk_level)
    cur.execute(sql1)
    res1 = cur.fetchall()
    result1 = [{k: row[i] for i, k in enumerate(field)} for row in res1]
    sql2 = 'select gs.province,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province' % (
        table, table4, table4, start_time1, end_time, risk_level)
    cur.execute(sql2)
    res2 = cur.fetchall()
    result2 = [{k: row[i] for i, k in enumerate(field)} for row in res2]
    result = result1 + result2
    b = ScalableBloomFilter(1000000, 0.001)
    for p in result:
        if p['province'] not in b:
            b.add(p['province'])
            province_list.append(p['province'])
    for d in province_list:
        if d:
            pro_dict = {"province": d}
            for dict in result1:
                if dict['province'] == d:
                    pro_dict.update({'count7': dict['count']})
            for dict in result2:
                if dict['province'] == d:
                    pro_dict.update({'count30': dict['count']})
            list.append(pro_dict)
    for li in list:
        if 'count7' not in li:
            li['count7'] = 0
    return list
Code example #30
File: es.py Project: Ymm0008/itfin
def getHotSpot(entity_list):
	type = 'type1'
	results = []
	number = 0
	for dict in entity_list:
		indexB = ScalableBloomFilter(1000,0.001)
		for index_name in ['bbs','forum','webo']:
			query_body = {
					"sort":{"publish_time":{"order":"desc"}},
					"query": {
						"bool": {
							"must": [
								{
								"match": {
									"content": dict['name']
									}
								},
									{
								"match": {
									"em1": 1
									}
								}
							]
						}
					}
				}
			res = es.search(index=index_name, doc_type=type, body=query_body, request_timeout=100)
			hits = res['hits']['hits']
			if hits:
				for item in hits:
					if dict['name'] in item['_source']['content']:
						if index_name not in indexB:
							if number < 10:
								id = dict['id']
								entity_name = dict['name']
								entity_type = dict['entity_type']
								content = item['_source']['content']
								results.append({'id':id,'name':entity_name,'content':content,'entity_type':entity_type})
								indexB.add(index_name)
								number += 1
		if not number < 10:
			break
	return results