Code example #1
def second_new_warn_entity():
    minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                           ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
    row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
    b7 = ScalableBloomFilter(100000, 0.001)
    b30 = ScalableBloomFilter(100000, 0.001)
    b90 = ScalableBloomFilter(100000, 0.001)
    for i, k in minDates.items():
        dateTime = datetime.strptime(k, '%Y-%m-%d')
        dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
        if 0 <= dValue < 7:
            b7.add(i)
        if 0 <= dValue < 30:
            b30.add(i)
        if 0 <= dValue < 90:
            b90.add(i)
    result90 = secondDetectFromBigTable(90, TABLE_REPORT_ILLEGAL, RISK_LEVEL,
                                        ILLEGAL_SCORE, 'all', 0, 0, 'all',
                                        'all', TABLE_LOGS, 'all')
    count7 = 0
    count30 = 0
    count90 = 0
    resultIds = []
    for each in result90:
        if not each['entity_id'] in resultIds:
            resultIds.append(each['entity_id'])
    for id in resultIds:
        if id in b7:
            count7 += 1
        if id in b30:
            count30 += 1
        if id in b90:
            count90 += 1
    result = {'count7': count7, 'count30': count30, 'count90': count90}
    return json.dumps(result, ensure_ascii=False)
Code example #2
class URLFilter(object):

    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                               'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        for key_word in self.forbidden_keys:
            if key_word in url:
                log.debug('## FORBIDDEN: {}'.format(url))
                return False
        return True

    @staticmethod
    def is_english(url):
        try:
            url.decode('ascii')
        except UnicodeDecodeError:
            log.debug('## NON-ENGLISH PAGE DETECTED: {}'.format(url))
            return False
        else:
            return True

    def pass_check(self, url):
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: {}'.format(url))
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
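A minimal usage sketch for the URLFilter above, assuming Python 2 (is_english relies on str.decode) and that log, RLock and ScalableBloomFilter are imported as in the original module:

f = URLFilter()
print f.pass_check('http://example.com/a')        # True: unseen, no forbidden keyword, ASCII only
print f.pass_check('http://example.com/a')        # False: already recorded in the Bloom filter
print f.pass_check('http://example.com/video/1')  # False: contains the forbidden keyword 'video'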
Code example #3
class FileBloom(object):
    def __init__(self):
        self.file_path = "bloom/bloom_weibo.txt"
        self.bloom_filter = ScalableBloomFilter(initial_capacity=10000,
                                                error_rate=0.001)

    def read_bloom(self):
        if os.path.exists(self.file_path):
            f = open(self.file_path, "r")
            ids = f.readlines()
            for id in ids:
                id_s = id.strip()
                self.bloom_filter.add(id_s)
            f.close()
        else:
            f = open(self.file_path, "w")
            f.close()

    def to_file(self):
        pass

    def update_bloom_file(self, m_id):
        f = open(self.file_path, "a")
        f.write(str(m_id) + "\n")
        f.close()

    def update_bloom(self, m_id):
        self.bloom_filter.add(m_id)

    def has_id(self, m_id):
        return m_id in self.bloom_filter
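A brief usage sketch for FileBloom, assuming the bloom/ directory already exists and pybloom's ScalableBloomFilter is imported as in the original module:

fb = FileBloom()
fb.read_bloom()                    # load ids persisted by earlier runs (creates the file if missing)
if not fb.has_id('12345'):
    fb.update_bloom('12345')       # remember the id in memory
    fb.update_bloom_file('12345')  # and append it to the backing file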
Code example #4
	def vacuum_all(self, limit=None):
		logger.debug('Begin vacuum_all(limit=%s)', limit)
		self.plugins = self.load_plugins()

		self.session.begin(subtransactions=True)
		
		ts = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))
		
		#s = set()
		s = ScalableBloomFilter()
		query = self.session.query(SupplierCatalogModel.id)
		for (supplier_catalog_id, ) in query.yield_per(100):
			s.add(supplier_catalog_id)
		
		
		for plug in self.plugins.itervalues():
			supplier_catalog_filter_id = plug.supplier_catalog_filter_id()
			model_name = plug.version_model()  + 'Model'
			VersionModel = getattr(model, model_name)
			query = self.session.query(VersionModel)
			if limit:
				query = query.order_by(VersionModel.vacuumed.nullsfirst())
				query = query.limit(limit)

			ts['sub_done'] = 0
			ts['sub_total'] = query.count()
			for supplier_catalog_item_version in query.yield_per(10):
				if supplier_catalog_item_version.supplier_catalog_id not in s:
					logger.debug("Deleting %s %s", model_name, supplier_catalog_item_version.id)
					self.session.delete(supplier_catalog_item_version)
				ts['sub_done'] += 1
			ts['done'] += 1
		self.session.commit()
		ts.finish()
		logger.debug('End vacuum_all()')
Code example #5
File: utils.py Project: robertf224/bio_final
class kmer_store:
    def __init__(self):
        self.bloom_filter = ScalableBloomFilter(
            initial_capacity=1000000,
            mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        self.kmers = {}

    def update(self, item):
        if item in self.bloom_filter:
            if item in self.kmers:
                self.kmers[item] += 1
            else:
                self.kmers[item] = 2
        else:
            self.bloom_filter.add(item)

    def __iter__(self):
        for key in self.kmers:
            yield key

    def __getitem__(self, key):
        return self.kmers[key]

    def __repr__(self):
        return str(self.kmers)

    def __str__(self):
        return str(self.kmers)
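A small usage sketch, assuming pybloom is installed. Note that kmer_store only materializes a count once a k-mer is seen a second time, so singletons never enter the dictionary:

store = kmer_store()
for kmer in ['ACGT', 'ACGT', 'TTGA', 'ACGT']:
    store.update(kmer)
print(store)   # {'ACGT': 3}; 'TTGA' was seen only once and is tracked by the Bloom filter alone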
Code example #6
File: ddup.py Project: shkarupa-alex/nlpclean
def dedup_lines_bloom(text,
                      just_words=True,
                      zero_digits=True,
                      capacity=100000,
                      error=0.00001):
    sbf = ScalableBloomFilter(initial_capacity=capacity,
                              error_rate=error,
                              mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    for line in text:
        if not isinstance(line, str):
            raise TypeError(
                'Expected "text" to contain strings, found: {}'.format(
                    type(line)))

        key = line.strip()
        if not key:
            yield line
            continue

        key = normalize('NFKD', key)

        if just_words:
            key = ' '.join(re.findall(r'\w+', key))
        if zero_digits:
            key = re.sub(r'\d', '0', key)

        if key in sbf:
            line = ''
        else:
            sbf.add(key)

        yield line
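A short usage sketch, assuming re and unicodedata's normalize are imported as in the original module. Near-duplicate lines are replaced with empty strings rather than dropped, so line positions are preserved:

lines = ['Price: 120', 'Price - 500', 'Something else']
print(list(dedup_lines_bloom(lines)))
# ['Price: 120', '', 'Something else'] -- both price lines normalize to 'Price 000', so the second is blanked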
Code example #7
def against_detect_data_from_bigtable():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')
    problem = request.args.get('problem', '')
    newEntity = int(request.args.get('newEntity', ''))
    fund_mode = request.args.get('fund_mode', '')
    result = againstDetectDataFromBigTable(date, TABLE_REPORT_ILLEGAL,
                                           RISK_LEVEL, ILLEGAL_SCORE,
                                           operation_mode, illegal_type,
                                           entity_type, warn_distribute,
                                           problem, TABLE_LOGS, fund_mode)
    # merge rows that share the same entity_id
    doubleId = []
    for dict in result:
        if dict['entity_id'] not in b:
            b.add(dict['entity_id'])
        else:
            doubleId.append(dict['entity_id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        for dict in result:
            if dict['entity_id'] == id:
                num += 1
                illegalTypeList.append(dict['illegal_type'])
                dict.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(dict)
    # keep only newly added entities
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                               ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
            if 0 <= dValue < date:
                bb.add(i)
        for dict in result:
            if dict['entity_id'] in bb:
                newResult.append(dict)
        # the frontend expects an 'id' field, so add it to avoid errors
        for dict in result:
            dict.update({'id': dict['entity_id']})
        return json.dumps(newResult, ensure_ascii=False)
    try:
        result.sort(key=lambda x: x['datetime'], reverse=True)
    except:
        pass
    # the frontend expects an 'id' field, so add it to avoid errors
    for dict in result:
        dict.update({'id': dict['entity_id']})
    return json.dumps(result, ensure_ascii=False)
Code example #8
class BloomMembership(GenericMembership):
    def __init__(self, max_size: int, error_rate: float):
        self.bloom_filter = ScalableBloomFilter(max_size, error_rate)

    def add(self, key: str):
        self.bloom_filter.add(key)

    def __contains__(self, key: str) -> bool:
        return key in self.bloom_filter
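A minimal usage sketch, assuming the GenericMembership base class is importable as in the original project:

m = BloomMembership(max_size=100000, error_rate=0.001)
m.add('user:42')
print('user:42' in m)   # True
print('user:43' in m)   # almost certainly False; Bloom filters allow rare false positives but no false negatives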
Code example #9
File: pipelines.py Project: yangxue088/wish
class WishPipeline(object):
    def __init__(self):
        self.urls = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        if item is None or item['url'] is None or item['url'] in self.urls:
            raise DropItem("Duplicate item found.")
        else:
            self.urls.add(item['url'])
            return item
Code example #10
File: pipelines.py Project: chntylz/wish
class WishPipeline(object):
    def __init__(self):
        self.urls = ScalableBloomFilter(
            mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        if item is None or item['url'] is None or item['url'] in self.urls:
            raise DropItem("Duplicate item found.")
        else:
            self.urls.add(item['url'])
            return item
Code example #11
class URLBloomFilter(RFPDupeFilter):
    def __init__(self, path=None, debug=False):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url).encode("utf8"))
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
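A dupe filter like this is typically enabled through the Scrapy project settings; the module path below is only a placeholder for wherever the class actually lives:

# settings.py (hypothetical module path)
DUPEFILTER_CLASS = 'myproject.dupefilters.URLBloomFilter'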
Code example #12
File: main.py Project: ttttttboy/py1
def ParseQueue():
    # Load Checked Urls File
    if os.path.isfile(path_checked_url_file):
        with open(path_checked_url_file, 'rb') as rf:
            checked_url_pool = ScalableBloomFilter.fromfile(rf)
            print("bf: Read pybloom from %s.\n" % path_checked_url_file)
    else:
        checked_url_pool = ScalableBloomFilter(
            initial_capacity=1000,
            error_rate=0.001,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        print("bf: Create pybloom")

    # Get each Item from Queue
    i = 1
    # URL_QUEUE.put_nowait(None)  # sign the end of Queue
    # for item in iter(URL_QUEUE.get_nowait, None):
    #     cur_url = item[2]
    URL_DEQUE.appendleft(None)
    for item in iter(URL_DEQUE.pop, None):
        cur_url = item[2]

        if cur_url not in checked_url_pool:  # cur_url never checked
            try:
                time.sleep(0.3)
                page_html_raw = requests.get(cur_url, timeout=3)
            except requests.RequestException as e:
                print(e)
                # URL_DEQUE.appendleft(cur_url)
                with open(path_requestErr_log, 'a') as f_requestErr:
                    f_requestErr.write(
                        time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time())) +
                        "Timeout " + cur_url + '\n')
            else:
                page_html = page_html_raw.content.decode('utf-8', 'ignore')
                buffer = parser4me.parser_4_1(item, page_html)
                with open(path_output_folder + os.path.sep + item[1] +
                          item[0][0:128] + ".txt",
                          'w',
                          encoding='utf-8') as resf:
                    resf.write(buffer)
                    print("%s OK! to file %s" % (i, item[0]))
                checked_url_pool.add(cur_url)
                i += 1
        else:
            print("Skip %s" % i)
            i += 1

        with open(path_checked_url_file, 'wb') as wf:
            checked_url_pool.tofile(wf)
Code example #13
def second_detect_data():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')
    problem = request.args.get('problem', '')
    newEntity = int(request.args.get('newEntity', ''))
    result = secondDetectData(date, TABLE_ENTITY_LIST, TABLE_MONITOR,
                              TABLE_GONGSHANG, RISK_LEVEL, ILLEGAL_SCORE,
                              operation_mode, illegal_type, entity_type,
                              warn_distribute, problem, TABLE_INDEX_QUANTILE,
                              TABLE_GUARANTEE_PROMISE, TABLE_LOGS)
    doubleId = []
    for dict in result:
        if dict['id'] not in b:
            b.add(dict['id'])
        else:
            doubleId.append(dict['id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        for dict in result:
            if dict['id'] == id:
                num += 1
                illegalTypeList.append(dict['illegal_type'])
                dict.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(dict)
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                               ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
            if dValue < date:
                bb.add(i)
        for dict in result:
            if dict['id'] in bb:
                newResult.append(dict)
        return json.dumps(newResult, ensure_ascii=False)
    try:
        result.sort(key=lambda x: x['datetime'], reverse=True)
    except:
        pass
    return json.dumps(result, ensure_ascii=False)
Code example #14
File: scheduler.py Project: vieyahn2017/crawlers
class RequestFilter(object):
    """ RequestFilter """
    def __init__(self):
        self.sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, request):
        """request seen
        """
        finger = request_fingerprint(request)
        if finger in self.sbf:
            return True
        self.sbf.add(finger)
        return False
Code example #15
class URLBloomFilter(RFPDupeFilter):
    """Filter requests by URL hash using a Bloom filter"""
    def __init__(self, path=None):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url))
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
Code example #16
def total_detect_data_test():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')  # multi-select
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')  # multi-select
    problem = request.args.get('problem', '')  # multi-select
    newEntity = int(request.args.get('newEntity', ''))
    checked = int(request.args.get('checked', ''))
    fund_mode = request.args.get('fund_mode', '')
    result = totalDetectDataFromBigTable(date, TABLE_REPORT_ILLEGAL,
                                         operation_mode, illegal_type,
                                         entity_type, warn_distribute, problem,
                                         checked, fund_mode)
    # merge entries for the same entity that differ only in illegal_type
    doubleId = []
    for dict in result:
        if dict['entity_id'] not in b:
            b.add(dict['entity_id'])
        else:
            doubleId.append(dict['entity_id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        for dict in result:
            if dict['entity_id'] == id:
                num += 1
                illegalTypeList.append(dict['illegal_type'])
                dict.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(dict)
    # keep only newly added entities
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                               ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
            if 0 <= dValue < date:
                bb.add(i)
        for dict in result:
            if dict['entity_id'] in bb:
                newResult.append(dict)
        return json.dumps(newResult, ensure_ascii=False)
    return json.dumps(result, ensure_ascii=False)
Code example #17
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Converts the iterable into a ScalableBloomFilter
    
    :rtype : pybloom.ScalableBloomFilter
    :param iterable:
    :param init_cap:
    :param err_rate:
    """

    bloom = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        bloom.add(element)

    return bloom
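A quick usage sketch, assuming pybloom is installed:

bloom = to_bloomfilter(['a', 'b', 'a'])
print('a' in bloom)   # True
print('z' in bloom)   # False with high probability; membership tests are probabilistic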
Code example #18
File: sketch.py Project: Faiz7412/itpy
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Converts the iterable into a ScalableBloomFilter
    
    :rtype : pybloom.ScalableBloomFilter
    :param iterable:
    :param init_cap:
    :param err_rate:
    """

    bloom = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        bloom.add(element)

    return bloom
Code example #19
class DuplicateItemFilterPipeline(Pipeline):  # Bloom filter serialization
    fileName = "DuplicateItemFilter.dat"

    def open_spider(self, spider):
        self.fileName = spider.name + self.fileName
        if os.path.exists(self.fileName):
            with open(self.fileName, 'rb') as f:
                self.sbf = ScalableBloomFilter.fromfile(f)
        else:
            self.sbf = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def close_spider(self, spider):
        with open(self.fileName, 'wb') as f:
            self.sbf.tofile(f)

    def process_item(self, item, spider):  # Bloom filter dedup
        fp = hashlib.sha1()
        for key in item.keys():
            if key not in ['curlDate', 'reference'] \
                    and item[key] is not None:  # skip crawl time and source url fields
                fp.update(item[key])
        fpValue = fp.hexdigest()
        if not self.sbf.add(fpValue):
            return item
        else:
            raise DropItem("duplicate item:\n %s" % item)
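Pipelines like this one are registered through the Scrapy project settings; the module path below is only a placeholder:

# settings.py (hypothetical module path)
ITEM_PIPELINES = {'myproject.pipelines.DuplicateItemFilterPipeline': 300}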
Code example #20
File: scheduler.py Project: kaito-kidd/mini-scrapy
class RequestFilter(object):

    """ RequestFilter """

    def __init__(self):
        self.sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, request):
        """request seen
        """
        finger = request_fingerprint(request)
        if finger in self.sbf:
            return True
        self.sbf.add(finger)
        return False
Code example #21
File: pipelines.py Project: tousyou/SocialSpider
class BloomPipeline(object):
    def __init__(self, bloomfile, spider_name):
        self.bloomfile = bloomfile
        self.spider_name = spider_name

        # items crawled in previous runs
        logger.info("loading previously crawled items...")

        if os.path.isfile(self.bloomfile):
            f = open(self.bloomfile, 'rb')
            self.item_crawled = ScalableBloomFilter.fromfile(f)
            f.close()
        else:
            self.item_crawled = ScalableBloomFilter(
                100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        cnt = self.item_crawled.count
        logger.info("pipeline read %d crawled items" % cnt)

    def __del__(self):
        f = open(self.bloomfile, 'wb')
        self.item_crawled.tofile(f)
        f.close()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            #mongo_uri=crawler.settings.get('MONGODB_ADDRESS'),
            bloomfile=crawler.settings.get('BLOOM_FILE'),
            #bloomfile = "/root/dev/SocialSpider/data/weibotv/bloomfile",
            spider_name=crawler.spidercls.name)

    def process_item(self, item, spider):
        #if not item['md5']:
        #    md5 = hashlib.md5("%s%s%s"%(item['title'].encode('utf-8'),item['url'].encode('utf-8'))).hexdigest()
        #    item['md5'] = md5

        valid = True
        item_id = ''
        if self.spider_name == 'weibotv':
            item_id = item['mid']
        elif self.spider_name == 'toutiao':
            item_id = item['Url']
            #item_id = hashlib.md5("%s"%(item['Url'].encode('utf-8'))).hexdigest()
        elif self.spider_name == 'anyvspider':
            item_id = item['pid']
        else:
            pass

        if self.item_crawled.add(item_id):
            valid = False
        else:
            valid = True

        if valid:
            logger.info("item: %s wrote to bloomfile %s" %
                        (item_id.encode('utf-8'), self.bloomfile))
            return item
        else:
            logger.info("item dropped %s " % item_id.encode('utf-8'))
Code example #22
class UrlFilter(RFPDupeFilter):
    def __init__(self, path=None, debug=False):
        self.urls_sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path, debug)

    def request_seen(self, request):

        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url).encode('utf-8'))
        url_sha1 = fp.hexdigest()
        if url_sha1 not in self.urls_sbf and not mysqldb.queryItem(
                request.url):
            self.urls_sbf.add(url_sha1)
        else:
            return True
Code example #23
class URLBloomFilter(RFPDupeFilter):
    # filter requests by URL hash using a Bloom filter
    def __init__(self, path=None):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        # create a sha1 hash object
        fp = hashlib.sha1()
        # canonicalize the URL before hashing (the same page can be requested with differently formatted URLs)
        fp.update(canonicalize_url(request.url))
        # sha1 digest of the url
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
Code example #24
File: task_tool.py Project: chinaylssly/aastory
    def add_sbf(self, query=None):
        '''
        :param query: MySQL query statement
        Builds a Bloom filter of already-processed task ids, used to filter task results.
        '''

        if query is None:
            return None

        sbf = ScalableBloomFilter()
        table = Table(logger=self.logger)
        result_dict = table.execute(query=query)
        data = result_dict.get('data')
        for each in data:
            id = each.get('id')
            sbf.add(int(id))
        table.close()
        return sbf
Code example #25
def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Count the number of distinct elements from an iterable. This implementation uses a bloomfilter to approximate
    the number of distinct values found in this iterable.
    
    :param iterable:
    :param init_cap:
    :param err_rate:
    """

    counter = 0

    set_of_distinct_values = ScalableBloomFilter(init_cap, err_rate)

    for element in iterable:
        if element not in set_of_distinct_values:
            set_of_distinct_values.add(element)
            counter += 1

    return counter
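A quick usage sketch, assuming pybloom is installed. Because Bloom filter false positives can hide a genuinely new element, the result may slightly underestimate the true number of distinct values:

values = ['a', 'b', 'a', 'c', 'b']
print(count_distinct_approx(values))   # 3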
Code example #26
File: sketch.py Project: Faiz7412/itpy
def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Count the number of distinct elements from an iterable. This implementation uses a bloomfilter to approximate
    the number of distinct values found in this iterable.
    
    :param iterable:
    :param init_cap:
    :param err_rate:
    """

    counter = 0

    set_of_distinct_values = ScalableBloomFilter(init_cap, err_rate)

    for element in iterable:
        if element not in set_of_distinct_values:
            set_of_distinct_values.add(element)
            counter += 1

    return counter
Code example #27
def get_city_rank(table, table4, field, province_name, risk_level):
    cur = defaultDatabase()
    city_list = []
    list = []
    province_list = []
    sql = "select max(date) from %s" % table
    cur.execute(sql)
    end_time = cur.fetchall()[0][0]
    start_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=7)
    start_time = start_time.strftime("%Y-%m-%d")
    start1_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=30)
    start_time1 = start1_time.strftime("%Y-%m-%d")
    sql1 = 'select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province,city' % (
        table, table4, table4, start_time, end_time, risk_level)
    cur.execute(sql1)
    res1 = cur.fetchall()
    result1 = [{k: row[i] for i, k in enumerate(field)} for row in res1]
    sql2 = 'select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province,city' % (
        table, table4, table4, start_time1, end_time, risk_level)
    cur.execute(sql2)
    res2 = cur.fetchall()
    result2 = [{k: row[i] for i, k in enumerate(field)} for row in res2]
    result = result1 + result2
    b = ScalableBloomFilter(1000000, 0.001)
    for p in result:
        if p['city'] not in b:
            b.add(p['city'])
            city_list.append({'province': p['province'], 'city': p['city']})
    for d in city_list:
        if d['province'] not in province_list:
            province_list.append(d['province'])
    if province_name:
        for d in city_list:
            if d['province'] == province_name and d['city']:
                pro_dict = {"province": d['province'], "city": d['city']}
                for dict in result1:
                    if dict['city'] == d['city']:
                        pro_dict.update({'count7': dict['count']})
                for dict in result2:
                    if dict['city'] == d['city']:
                        pro_dict.update({'count30': dict['count']})
                list.append(pro_dict)
    if not province_name:
        for p in province_list:
            if p:
                pro_dict = {"province": p}
                count = 0
                for dict in result1:
                    if dict['province'] == p:
                        count += dict['count']
                pro_dict.update({"count": count})
                list.append(pro_dict)
    return list
Code example #28
File: util.py Project: cmusatyalab/deltaic
class BloomSet(object):
    def __init__(self, initial_capacity=1000, error_rate=0.0001):
        self._set = ScalableBloomFilter(initial_capacity=initial_capacity,
                error_rate=error_rate,
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        # False positives in the Bloom filter will cause us to fail to
        # garbage-collect an object.  Salt the Bloom filter to ensure
        # that we get a different set of false positives on every run.
        self._bloom_salt = os.urandom(2)

    def add(self, name):
        self._set.add(self._bloom_key(name))

    def __contains__(self, name):
        # May return false positives.
        return self._bloom_key(name) in self._set

    def _bloom_key(self, name):
        if isinstance(name, unicode):
            name = name.encode('utf-8')
        return self._bloom_salt + name
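A small usage sketch, assuming Python 2 (the class special-cases unicode) and that os and pybloom are imported as in the original module:

seen = BloomSet()
seen.add(u'object/1')
print u'object/1' in seen   # True
print u'object/2' in seen   # False in almost every run; the per-run salt varies which false positives occur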
Code example #29
File: postproc.py Project: etman/xPyCrawler
def main(args):
    seenUrlSet = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for ln in sys.stdin:
        if not ln:
            continue
        fetchedUrl = json.loads(ln)

        # continue if we've seen this url already.
        if fetchedUrl["url"] in seenUrlSet or fetchedUrl["effective_url"] in seenUrlSet:
            continue

        # add unseen url to the url set
        seenUrlSet.add(fetchedUrl["url"])
        seenUrlSet.add(fetchedUrl["effective_url"])

        # extract links and filter out some urls by url filter.
        outlinks = url_filter(extract_links(fetchedUrl))

        # analyze

        print "[postproc]%s" % fetchedUrl["url"]
Code example #30
class LibsPoiPipeline(object):
    filter_prefix = 'POI_'
    def __init__(self):
        self.files = {}
        self.file_path = './data/libs_poi.%d.csv' % int(time.time())
        self.filter = ScalableBloomFilter(initial_capacity=1024, error_rate=0.001,
                                          mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open(self.file_path, 'a+b')
        self.files[spider] = file
        kwargs = {
            'fields_to_export': ['lid','name','tag','ltype','typecode','biz_type','address','lng','lat','tel','postcode',
                                 'website','email','pcode','pname','citycode', 'cityname', 'adcode','adname','importance',
                                 'shopid','shopinfo','poiweight','gridcode','distance','navi_poiid','entr_lng','entr_lat','business_area',
                                 'exit_location','match','recommend','timestamp','alias','indoor_map','cpid','floor','truefloor',
                                 'groupbuy_num','discount_num','rating','cost','event','children']}
        self.exporter = CsvItemExporter(file, include_headers_line=False, **kwargs)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
        print("spider closed!")

    def process_item(self, item, spider):
        if isinstance(item, LibsPoiItem):
            if self.filter_prefix + item.get('lid') in self.filter:
                return
            self.exporter.export_item(item)
            self.filter.add(self.filter_prefix + item.get('lid'))
        return item
Code example #31
	def vacuum_all(self, limit=None):
		logger.debug('Begin vacuum_all(limit=%s)', limit)
		self.plugins = self.load_plugins()
		ts = self.term_stat('SupplierSpecialItemVersion Vacuum', len(self.plugins))
		tx = transaction.get()
		
		try:
			#s = set()
			s = ScalableBloomFilter()
			query = DBSession.query(SupplierSpecialModel.id)
			for (supplier_special_id, ) in query.yield_per(100):
				s.add(supplier_special_id)
			
			for plug in self.plugins.itervalues():
				supplier_special_filter_id = plug.supplier_special_filter_id()
				model_name = plug.version_model()  + 'Model'
				VersionModel = getattr(model, model_name)
				query = DBSession.query(VersionModel)
				if limit:
					query = query.order_by(VersionModel.vacuumed.nullsfirst())
					query = query.limit(limit)

				ts['sub_done'] = 0
				ts['sub_total'] = query.count()
				for supplier_special_item_version in query.yield_per(10):
					if supplier_special_item_version.supplier_special_id not in s:
						logger.debug("Deleting %s %s", model_name, supplier_special_item_version.id)
						DBSession.delete(supplier_special_item_version)
					ts['sub_done'] += 1
					if ts['sub_done'] % 1000 == 0:
						DBSession.flush()
				DBSession.flush()
				ts['done'] += 1
		except Exception:
			logger.exception('Caught Exception: ')
			tx.abort()
		finally:
			ts.finish()
		transaction.commit()
		logger.debug('End vacuum_all()')
Code example #32
File: bloomRedisFilter.py Project: yohee2015/Crwal
class URLBloomFilter(BaseDupeFilter):
    def __init__(self,host,port):
        self.urls_sbf=ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.host=host
        self.port=port
        self.client = redis.Redis(self.host, self.port)


    @classmethod
    def from_settings(cls, settings):
        return cls(host=settings.get('FILTER_HOST'),
                   port=settings.get('FILTER_PORT'))


    def request_seen(self, request):
        fp=hashlib.sha1()
        fp.update(canonicalize_url(request.url))
        url_sha1=fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
Code example #33
    def test_bloom_string(self):
        f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for i in xrange(0, 10000):
            rnd = ''.join(random.choice(string.letters) for i in xrange(40))
            _ = f.add(rnd)

        self.assertEqual(rnd in f, True)

        for i in string.letters:
            self.assertEqual(i in f, False)

        self.assertEqual(rnd in f, True)
Code example #34
File: utils.py Project: robertf224/bio_final
class kmer_store:
	def __init__(self):
		self.bloom_filter = ScalableBloomFilter(initial_capacity=1000000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)
		self.kmers = {}

	def update(self, item):
		if item in self.bloom_filter:
			if item in self.kmers:
				self.kmers[item] += 1
			else:
				self.kmers[item] = 2
		else:
			self.bloom_filter.add(item)

	def __iter__(self):
		for key in self.kmers:
			yield key
	def __getitem__(self, key):
		return self.kmers[key]
	def __repr__(self):
		return str(self.kmers)
	def __str__(self):
		return str(self.kmers)
Code example #35
def main():
    bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    random = SystemRandom()

    print('Sample hashes:')
    for i in range(0, 10000):
        random_hash = hex(random.getrandbits(256))
        bloom.add(random_hash)

        if i % 1000 == 0:
            print(random_hash)

    print(f'~{len(bloom)} hashes added to bloom filter.')

    print()
    try:
        while True:
            user_hash = input('Enter hash to check: ')
            if not user_hash:
                break

            print(user_hash in bloom)
    except (EOFError, KeyboardInterrupt):
        pass
Code example #36
File: views.py Project: y-xerxes/itfin
def second_new_warn_entity():
    minDates = getMinDate(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE)
    row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
    b7 = ScalableBloomFilter(100000, 0.001)
    b30 = ScalableBloomFilter(100000, 0.001)
    b90 = ScalableBloomFilter(100000, 0.001)
    for i, k in minDates.items():
        dateTime = datetime.strptime(k, '%Y-%m-%d')
        dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
        if 0 <= dValue < 7:
            b7.add(i)
        if 0 <= dValue < 30:
            b30.add(i)
        if 0 <= dValue < 90:
            b90.add(i)
    result90 = secondDetectData(90, TABLE_ENTITY_LIST, TABLE_MONITOR,
                                TABLE_GONGSHANG, RISK_LEVEL, ILLEGAL_SCORE,
                                'all', 0, 0, 'all', 'all',
                                TABLE_INDEX_QUANTILE, TABLE_GUARANTEE_PROMISE)
    count7 = 0
    count30 = 0
    count90 = 0
    resultIds = []
    for each in result90:
        if not each['id'] in resultIds:
            resultIds.append(each['id'])
    for id in resultIds:
        if id in b7:
            count7 += 1
        if id in b30:
            print(id)
            count30 += 1
        if id in b90:
            count90 += 1
    result = {'count7': count7, 'count30': count30, 'count90': count90}
    return json.dumps(result, ensure_ascii=False)
Code example #37
File: pipelines.py Project: ycs8912/aliexpress
class DuplicatePipeline(object):
    def __init__(self):
        self.filter = ScalableBloomFilter(
            mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        if isinstance(item, UrlItem):
            uid = '{}{}{}'.format(spider.prefix, spider.name, item['url'])
        else:
            uid = '{}{}{}'.format(spider.prefix, spider.name, item['_id'])

        if self.filter.add(uid):
            raise DropItem('duplicate item found')
        else:
            return item
Code example #38
File: exclusions.py Project: pipermerriam/mozy
class StockTileExclusions(object):
    """
    Object that keeps track of which stock tiles have already been used.
    """
    def __init__(self, source_image):
        self.source_image = source_image
        self.bloom_filter = ScalableBloomFilter(
            initial_capacity=source_image.tiles.count(),
            error_rate=0.0001,  # 1 in 10,000
        )
        existing_matches = source_image.tiles.values_list('pk', 'stock_tile_match')
        for tile_id, existing_match_id in existing_matches:
            self.bloom_filter.add((tile_id, existing_match_id))

    def __contains__(self, key):
        if key in self.bloom_filter:
            return True
        elif self.source_image.tiles.filter(stock_tile_match_id=key[1]).exists():
            self.add(key)
            return True
        return False

    def add(self, key):
        self.bloom_filter.add(key)
Code example #39
    def test_bloom_int(self):
        f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for i in xrange(0, 10000):
             _ = f.add(i)

        for i in xrange(0, 10000):
            self.assertEqual(i in f, True)

        for i in xrange(0, 10000 / 2 ):
            r = random.randint(0,10000-1)
            self.assertEqual(r in f, True)

        for i in xrange(0, 10000 / 2 ):
            r = random.randint(10000,10000 * 2)
            self.assertEqual(r in f, False)
Code example #40
def get_province_rank(table, table4, field, risk_level):
    cur = defaultDatabase()
    list = []
    province_list = []
    sql = "select max(date) from %s" % table
    cur.execute(sql)
    end_time = cur.fetchall()[0][0]
    start0_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=7)
    start1_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=30)
    start_time0 = start0_time.strftime("%Y-%m-%d")
    start_time1 = start1_time.strftime("%Y-%m-%d")
    sql1 = 'select gs.province,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province' % (
        table, table4, table4, start_time0, end_time, risk_level)
    cur.execute(sql1)
    res1 = cur.fetchall()
    result1 = [{k: row[i] for i, k in enumerate(field)} for row in res1]
    sql2 = 'select gs.province,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province' % (
        table, table4, table4, start_time1, end_time, risk_level)
    cur.execute(sql2)
    res2 = cur.fetchall()
    result2 = [{k: row[i] for i, k in enumerate(field)} for row in res2]
    result = result1 + result2
    b = ScalableBloomFilter(1000000, 0.001)
    for p in result:
        if p['province'] not in b:
            b.add(p['province'])
            province_list.append(p['province'])
    for d in province_list:
        if d:
            pro_dict = {"province": d}
            for dict in result1:
                if dict['province'] == d:
                    pro_dict.update({'count7': dict['count']})
            for dict in result2:
                if dict['province'] == d:
                    pro_dict.update({'count30': dict['count']})
            list.append(pro_dict)
    for li in list:
        li.setdefault('count7', 0)
    return list
Code example #41
File: es.py Project: Ymm0008/itfin
def getHotSpot(entity_list):
	type = 'type1'
	results = []
	number = 0
	for dict in entity_list:
		indexB = ScalableBloomFilter(1000,0.001)
		for index_name in ['bbs','forum','webo']:
			query_body = {
					"sort":{"publish_time":{"order":"desc"}},
					"query": {
						"bool": {
							"must": [
								{
								"match": {
									"content": dict['name']
									}
								},
									{
								"match": {
									"em1": 1
									}
								}
							]
						}
					}
				}
			res = es.search(index=index_name, doc_type=type, body=query_body, request_timeout=100)
			hits = res['hits']['hits']
			if(len(hits)):
				for item in hits:
					if dict['name'] in item['_source']['content']:
						if index_name not in indexB:
							if number < 10:
								id = dict['id']
								entity_name = dict['name']
								entity_type = dict['entity_type']
								content = item['_source']['content']
								results.append({'id':id,'name':entity_name,'content':content,'entity_type':entity_type})
								indexB.add(index_name)
								number += 1
		if not number < 10:
			break
	return results
Code example #42
File: order.py Project: jingtingzhiwu/aliexpress
class OrderSpider(RedisSpider):
    name = "order"
    allowed_domains = ["aliexpress.com"]
    start_urls = (
        'http://www.aliexpress.com/',
    )

    prefix = ''

    ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def __init__(self):
        self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        self.orders = dict()
        self.redis_queue = None

    def get_queue(self):
        for value in set(self.server.smembers(self.redis_key)):
            yield value

    def start_requests(self):
        OrderSpider.prefix = self.settings['prefix']
        self.redis_key = '{}:order'.format(OrderSpider.prefix)

        self.redis_queue = self.get_queue()

        db = MongoClient().aliexpress
        for order in db['{}order'.format(OrderSpider.prefix)].find():
            OrderSpider.ids.add(order['_id'])

        yield self.next_request()

    def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and OrderSpider.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['productId'][0])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')

    def make_requests_from_url(self, url):
        self.log('request order page: {}'.format(url), logging.INFO)
        parsed = urlparse.urlparse(url)
        product_id = urlparse.parse_qs(parsed.query)['productId'][0]
        return self.request(product_id, url)

    def request(self, product_id, base_url, page=1):
        order_url = '{}&page={}'.format(base_url, page)

        self.log('request order page: {}'.format(order_url), logging.INFO)
        return scrapy.Request(url=order_url, meta={'product_id': product_id, 'base_url': base_url, 'page': page},
                              callback=self.parse)

    def parse(self, response):
        self.log('parse order page: {}'.format(response.url), logging.INFO)

        orders = json.loads(response.body.replace('\\', ''))
        records = [record for record in orders['records'] if not self.filter.add(record['id'])]

        if len(records) > 0:
            for record in records:
                date = datetime.strptime(record['date'], '%d %b %Y %H:%M')
                quantity = record['quantity']
                buyer_level = record['buyerAccountPointLeval']
                self.order(response.meta['product_id']).append_order(**{'date': date, 'quantity': quantity, 'buyer_level': buyer_level})

            return self.request(response.meta['product_id'], response.meta['base_url'], int(response.meta['page']) + 1)
        else:
            self.order(response.meta['product_id']).finish_order = True
            return self.pop_order(response.meta['product_id'])

    def order(self, id):
        if id not in self.orders:
            self.orders[id] = Order(id)
        return self.orders[id]

    def pop_order(self, id):
        if self.order(id).is_finish():
            order = self.orders.pop(id)

            self.log('crawl order: {}'.format(order), logging.INFO)

            item = OrderItem()
            item['prefix'] = OrderSpider.prefix
            item['_id'] = order.id
            item['orders'] = order.orders
            return item
Code example #43
File: product.py Project: jingtingzhiwu/aliexpress
class ProductSpider(RedisSpider):
    name = "product"
    allowed_domains = ["aliexpress.com"]
    start_urls = (
        'http://www.aliexpress.com/',
    )

    prefix = ''

    def __init__(self):
        self.products = dict()
        self.ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        self.redis_queue = None

    def get_queue(self):
        for value in set(self.server.smembers(self.redis_key)):
            yield value

    def start_requests(self):
        ProductSpider.prefix = self.settings['prefix']
        self.redis_key = '{}:product'.format(ProductSpider.prefix)

        self.redis_queue = self.get_queue()

        db = MongoClient().aliexpress
        for product in db['{}product'.format(ProductSpider.prefix)].find():
            self.ids.add(product['url'][product['url'].rfind('/') + 1:product['url'].rfind('.')])

        yield self.next_request()

    def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and self.ids.add(url[url.rfind('/') + 1:url.rfind('.')])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')

    def parse(self, response):
        self.log('product url: {}'.format(response.url), logging.INFO)

        try:
            store_url = response.css('.shop-name').xpath('a/@href').extract()[0]
            self.log('crawl store url: {}'.format(store_url), logging.INFO)

            store_item = UrlItem()
            store_item['prefix'] = ProductSpider.prefix
            store_item['type'] = 'store'
            store_item['url'] = store_url
            yield store_item

            feedback_base_url = response.xpath('//div[@id="feedback"]/iframe/@thesrc').extract()[0]
            parsed = urlparse.urlparse(feedback_base_url)
            product_id = urlparse.parse_qs(parsed.query)['productId'][0]

            try:
                percent_num = response.css('.percent-num').xpath('text()').extract()[0]
                rantings_text = response.css('.rantings-num').xpath('text()').extract()[0]
                rantings_num = rantings_text[1:rantings_text.index(' ')]
                order_text = response.css('.order-num').xpath('text()').extract()[0]
                order_num = order_text[:order_text.index(' ')]
            except:
                percent_num = 0
                rantings_num = 0
                order_num = 0

            product_item = ProductItem()
            product_item['prefix'] = ProductSpider.prefix
            product_item['_id'] = product_id
            product_item['store'] = store_url
            product_item['url'] = response.url
            product_item['percent_num'] = percent_num
            product_item['rantings_num'] = rantings_num
            product_item['order_num'] = order_num
            yield product_item

            feedback_item = UrlItem()
            feedback_item['prefix'] = ProductSpider.prefix
            feedback_item['type'] = 'feedback'
            feedback_item['url'] = feedback_base_url
            yield feedback_item

            order_item = UrlItem()
            order_item['prefix'] = ProductSpider.prefix
            order_item['type'] = 'order'
            order_item[
                'url'] = 'http://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm?productId={}&type=default'.format(
                product_id)
            yield order_item
        except:
            try:
                product_url = response.meta['redirect_urls'][0]
            except:
                product_url = response.url
                self.log('strange product url: {}'.format(product_url), logging.ERROR)
            finally:
                self.log('meet anti-spider, back product: {}'.format(product_url), logging.INFO)

                url_item = UrlItem()
                url_item['prefix'] = ProductSpider.prefix
                url_item['type'] = 'product'
                url_item['url'] = product_url
                yield url_item
Code example #44
File: test.py Project: JimmyZhang/jimmy_study
__author__ = 'ztj'

from pybloom import BloomFilter

f = BloomFilter(capacity=10000, error_rate=0.0001)
arr = [f.add(x) for x in range(10)]
print arr
print all([(x in f) for x in range(10)])
print 10 in f
print 5 in f


f = BloomFilter(capacity=1000, error_rate=0.001)
for i in xrange(0, f.capacity):
	_ = f.add(i)

print (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18

from pybloom import ScalableBloomFilter
sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
count = 10000
for i in xrange(0, count):
	_ = sbf.add(i)

print (1.0 - (len(sbf) / float(count))) <= sbf.error_rate
Code example #45
File: cli.py Project: longrw/fastqp
def run(args):
    """ read FASTQ or SAM and tabulate basic metrics """
    time_start = time.time()
    if args.input.name != '<stdin>':
        bsize = os.path.getsize(args.input.name)

    est_counter = int()
    sample_lengths = list()
    sample_binsizes = list()
    act_nlines = int()
    name, ext = os.path.splitext(args.input.name)
    if (args.leftlimit > 0) and (args.rightlimit > 0):
        if args.rightlimit < args.leftlimit:
            sys.exit("Left limit must be less than right limit.\n")
    if args.type:
        ext = '.' + args.type
    if ext not in ['.fq','.fastq', '.sam', '.bam', '.gz'] and args.input.name != '<stdin>':
        sys.exit("Input file must end in either .sam, .bam, .fastq, or .fastq.gz\n")

    if args.name:
        sample_name = args.name
    else:
        sample_name = args.input.name

    # estimate the number of lines in args.input if we can
    if ext in ['.fastq','.fq']:
        with FastqReader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length "
                "we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry,
                                                                    len=mean_len,
                                                                    est=est_nlines))
    elif ext  == '.sam':
        with Reader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length "
                "we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry,
                                                                    len=mean_len,
                                                                    est=est_nlines))
    elif ext == '.bam':
        est_nlines = sum(bam_read_count(args.input.name))
        if not args.quiet:
            sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif ext == '.gz':
        if args.binsize:
            n = args.binsize
            est_nlines = None
            if not args.quiet:
                sys.stderr.write("Reading from gzipped file, bin size (-s) set to {binsize:n}.\n".format(binsize=n))
        else:
            sys.stderr.write("Gzipped file detected. Reading file to determine bin size (-s).\n")
            p1 = Popen(shlex.split('gzip -dc %s' % args.input.name), stdout=PIPE)
            p2 = Popen(shlex.split('wc -l'), stdin=p1.stdout, stdout=PIPE)
            est_nlines, _ = p2.communicate()
            est_nlines = int(est_nlines) // 4
            if not args.quiet:
                sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif name == '<stdin>':
        if args.binsize:
            n = args.binsize
        else:
            n = 1
        if not args.quiet:
            sys.stderr.write("Reading from <stdin>, bin size (-s) set to {binsize:n}.\n".format(binsize=n))
        est_nlines = None
    if est_nlines is not None:
        # set up factor for sampling bin size
        if args.binsize:
            n = args.binsize
        else:
            nf = math.floor(est_nlines / args.nreads)
            if nf >= 1:
                n = int(nf)
            else:
                n = 1
        if not args.quiet:
            sys.stderr.write("Bin size (-s) set to {binsize:n}.\n".format(binsize=n))

    if ext in ['.sam', '.bam']:
        infile = Reader(args.input)
    else:
        infile = FastqReader(args.input, ext=ext)

    read_len = defaultdict(int)
    cycle_nuc = defaultdict(lambda: defaultdict(int))
    cycle_qual = defaultdict(lambda: defaultdict(int))
    cycle_gc = defaultdict(int)
    cycle_kmers = defaultdict(lambda: defaultdict(int))
    cycle_mismatch = {'C': defaultdict(lambda: defaultdict(int)),
                      'G': defaultdict(lambda: defaultdict(int)),
                      'A': defaultdict(lambda: defaultdict(int)),
                      'T': defaultdict(lambda: defaultdict(int))}

    if args.count_duplicates:
        try:
            from pybloom import ScalableBloomFilter
            bloom_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        except ImportError:
            sys.exit("--count-duplicates option requires 'pybloom' package.\n")

    duplicates = 0
    percent_complete = 10
    reads = infile.subsample(n)

    for read in reads:
        if isinstance(read, Sam):
            if args.aligned_only and not read.mapped:
                continue
            elif args.unaligned_only and read.mapped:
                continue
            if read.reverse:
                seq = read.seq[::-1]
                qual = read.qual[::-1]
            else:
                seq = read.seq
                qual = read.qual
        else:
            seq = read.seq
            qual = read.qual

        # Set up limits
        if (args.leftlimit == 1) and (args.rightlimit < 0):
            pass
        elif (args.leftlimit >= 1) and (args.rightlimit > 0):
            try:
                seq = seq[args.leftlimit - 1:args.rightlimit]
                qual = qual[args.leftlimit - 1:args.rightlimit]
            except IndexError:
                act_nlines += n
                continue
        elif (args.leftlimit > 1) and (args.rightlimit < 0):
            try:
                seq = seq[args.leftlimit - 1:]
                qual = qual[args.leftlimit - 1:]
            except IndexError:
                act_nlines += n
                continue
        if len(seq) == 0:
            act_nlines += n
            continue
        cycle_gc[gc(seq)] += 1

        if args.count_duplicates:
            if seq in bloom_filter:
                duplicates += 1
            else:
                bloom_filter.add(seq)

        for i, (s, q) in enumerate(zip(seq, qual)):
            cycle_nuc[args.leftlimit + i][s] += 1
            cycle_qual[args.leftlimit + i][q] += 1
        read_len[len(qual)] += 1

        for i, kmer in enumerate(window(seq, n=args.kmer)):
            cycle_kmers[args.leftlimit+i][kmer] += 1

        if isinstance(read, Sam) and read.mapped:
            try:
                ref = read.parse_md()
                for i, (s, r) in enumerate(zip(seq, ref)):
                    if s != r:
                        try:
                            cycle_mismatch[r][args.leftlimit+i][s] += 1
                        except KeyError:
                            pass
            except KeyError:
                pass


        if est_nlines is not None:
            if (act_nlines / est_nlines) * 100 >= percent_complete:
                sys.stderr.write("Approximately {0:n}% complete at "
                                 "read {1:,} in {2}\n".format(percent_complete,
                                                              act_nlines,
                                                              time.strftime('%H:%M:%S',
                                                                            time.gmtime(time.time()-time_start))))
                percent_complete += 10
        act_nlines += n

    positions = [k for k in sorted(cycle_qual.keys())]
    depths = [read_len[k] for k in sorted(read_len.keys())]

    basecalls = [cycle_nuc[k].keys() for k in sorted(cycle_nuc.keys())]
    bases = set(list(itertools.chain.from_iterable(basecalls)))
    #nbasecalls = [ '\t'.join([str(cycle_nuc[p].get(k, 0)) for k in bases]) for p in sorted(cycle_nuc.keys())]
    padder = padbases(bases)
    for nuc_counts in cycle_nuc.values():  # explicit loop: map() is lazy in Python 3 and would never run
        padder(nuc_counts)

    quantile_values = [0.05,0.25,0.5,0.75,0.95]
    quantiles = []
    ## replace ASCII quality with integer
    for _, v in sorted(cycle_qual.items()):
        for q in tuple(v.keys()): ## py3 keys are iterator, so build a tuple to avoid recursion
            v[ord(str(q)) - 33] = v.pop(q)
        line = [percentile(v, p) for p in quantile_values]
        quantiles.append(line)

    # build kmer set of known adapter sequences
    adapter_kmers = set()
    for adapter in all_adapter_sequences:
        for kmer in window(adapter, n=args.kmer):
            adapter_kmers.add(kmer)

    # test for nonuniform kmer profiles and calculate obs/exp
    observed_expected = dict()
    all_kmers = [cycle_kmers[k].keys() for k in sorted(cycle_kmers.keys())]
    kmers = set(list(itertools.chain.from_iterable(all_kmers)))
    bad_kmers = []
    sequenced_bases = sum((l * n for l, n in read_len.items()))
    priors = tuple(map(float, args.base_probs.split(',')))
    for kmer in kmers:
        kmer_counts = [(i, cycle_kmers[i][kmer]) for i in sorted(cycle_kmers.keys())]
        expected_fraction = reduce(mul, (p ** kmer.count(b) for b, p in zip(('A', 'T', 'C', 'G', 'N'), priors)), 1)
        expected = expected_fraction * sequenced_bases
        observed_expected[kmer] = sum((n for _, n in kmer_counts)) / expected
        slope, _, _, p_value, _ = stats.linregress(*zip(*kmer_counts))
        if abs(slope) > 2 and p_value < 0.05:
            bad_kmers.append((kmer, slope, p_value))
    bad_kmers = sorted(bad_kmers, key=lambda x: x[2])[:10]
    pos_gc = [sum([cycle_nuc[i]['C'], cycle_nuc[i]['G']]) / sum([cycle_nuc[i]['C'],
                                                              cycle_nuc[i]['G'],
                                                              cycle_nuc[i]['A'],
                                                              cycle_nuc[i]['T']]) * 100 for i in positions]

    # see http://vita.had.co.nz/papers/tidy-data.pdf
    sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='reads', pos='None', value=act_nlines))

    for cycle, count in read_len.items():
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='read_len', pos=cycle,
                                                               value=count))

    for i, position in enumerate(positions):
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q05', pos=position,
                                                               value=quantiles[i][0]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q25', pos=position,
                                                               value=quantiles[i][1]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q50', pos=position,
                                                               value=quantiles[i][2]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q75', pos=position,
                                                               value=quantiles[i][3]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q95', pos=position,
                                                               value=quantiles[i][4]))
    for base in bases:
        for position in positions:
            sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                                   column=base, pos=position,
                                                                   value=cycle_nuc[position][base]))
    for position in positions:
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='cycle_gc', pos=position,
                                                               value=cycle_gc[position]))
    for i in range(101):
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='read_gc', pos=i,
                                                               value=cycle_gc[i]))

    for kmer, obs_exp in sorted(observed_expected.items(), key=lambda x: x[1]):
        sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name,
                                                               column=kmer, pos='None',
                                                               value=obs_exp))

    if args.count_duplicates:
        sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='duplicate', pos='None', value=duplicates/act_nlines))


    from zipfile import ZipFile
    with ZipFile(args.output + '.zip', mode='w') as zip_archive:
        fig_kw = {'figsize':(8, 6)}
        qualplot(positions, quantiles, zip_archive, fig_kw)
        median_qual = qualdist(cycle_qual.values(), zip_archive, fig_kw)
        qualmap(cycle_qual, zip_archive, fig_kw)
        depthplot(read_len, zip_archive, fig_kw)
        gcplot(positions, pos_gc, zip_archive, fig_kw)
        gcdist(cycle_gc, zip_archive, fig_kw)
        nucplot(positions, bases, cycle_nuc, zip_archive, fig_kw)
        kmerplot(positions, cycle_kmers, zip_archive, [fields[0] for fields in bad_kmers], fig_kw)
        adaptermerplot(positions, cycle_kmers, adapter_kmers, zip_archive, fig_kw)
        if isinstance(infile, Reader):
            mismatchplot(positions, cycle_mismatch, zip_archive, fig_kw)
    time_finish = time.time()
    elapsed = time_finish - time_start
    if not args.quiet:
        sys.stderr.write("There were {counts:,} reads in the file. Analysis finished in {sec}.\n".format(counts=act_nlines,
                                                                                                                       sec=time.strftime('%H:%M:%S',
                                                                                                                                         time.gmtime(elapsed))
        ))
        if len(bad_kmers) > 0:
            for kmer in bad_kmers:
                sys.stderr.write("KmerWarning: kmer %s has a non-uniform profile (slope = %s, p = %s).\n" % (kmer))
        if median_qual < args.median_qual:
            sys.stderr.write("QualityWarning: median base quality score is %s.\n" % median_qual)
コード例 #46
0
ファイル: fetcher.py プロジェクト: kamushin/kamuSpider
class Fetcher(metaclass=Singleton):
    def __init__(self, ioloop=None, start_url=None, max_depth=5):
        super().__init__()

        self.ioloop = ioloop or tornado.ioloop.IOLoop.instance()
        self.start_url = start_url or []
        self.fetch_queue = Queue()
        self.fetched = []
        self.fetched_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.fetch_finished = []

        for u in self.start_url:
            self.fetch_queue.put(u)

        self.fetching = 0
        self.max_depth = max_depth

    def add_url(self, url):
        if not isValidScheme(url):
            logger.warning("not vaild_scheme")
            return

        logger.debug("get url: %s" % url)

        self.fetch_queue.put(url)

    @tornado.gen.coroutine
    def fetch(self, url):
        """
        抓取器
        """
        http_cilent = AsyncHTTPClient()
        request = HTTPRequest(url=url.encode("utf-8"), connect_timeout=options.timeout, request_timeout=options.timeout)
        response = yield http_cilent.fetch(request)
        logger.debug("fetched url: %s" % url)

        return response

    def parse(self, response):
        """
        解析URL, 保存结果, 传递新的URL
        """

        # self.save_tofile(response)

        url_gen = HtmlAnalyzer.extract_links(response.body, response.effective_url, [])

        return url_gen

    def save_tofile(self, response):
        """
        暂时使用blocking的f.write代替db
        这里的io比较快,影响不大
        """
        path = response.effective_url.split("/")[-1]
        if not path:
            path = response.effective_url.split("/")[-2]
        try:
            with open(os.path.join("tmp", path), "a") as f:
                f.write(response.effective_url + "\n")
                f.write(str(response.body) + "\n")
        except OSError:
            logger.error("failed to write file for path %s" % path)

    @tornado.gen.coroutine
    def do_work(self, url):
        if not isValidScheme(url):
            logger.warning("not vaild_scheme")
            return None

        try:
            response = yield self.fetch(url)

        except tornado.httpclient.HTTPError as e:
            # import traceback
            # traceback.print_exc()

            # TODO
            # Some bug here. Too many file open.

            # with open('httperror.txt', "a") as f:
            # f.write("Url: %s HTTPError: %s \n"% (url,e.code))

            logger.error("Url: %s HTTPError: %s " % (url, e.code))

        except:
            import traceback

            traceback.print_exc()
            logger.error("Unknow error with url: %s" % url)

        else:
            url_gen = self.parse(response)
            self.fetch_finished.append(url)

            sender = Sender()
            for u in url_gen:
                sender.add_url(u)
            logging.info("fetched %s" % url)

        self.fetching -= 1

    def run(self):
        """
        Get url from fetch_queue to fetch
        """

        logging.error("fetching: %s " % self.fetching)
        while not self.fetch_queue.empty() and self.fetching <= options.max_fetch_clients:

            url = self.fetch_queue.get()
            if url in self.fetched_filter:
                logging.info("url in fetched_filter")
                continue
            else:
                self.fetched_filter.add(url)
                self.fetched.append(url)
                self.fetching += 1

            self.ioloop.add_callback(self.do_work, url)

        self.ioloop.add_timeout(datetime.timedelta(seconds=1), self.run)
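
A small aside on the run() loop above: it relies on the check-then-add pattern (test membership, then add) to skip URLs that were already queued. The same thing can be written more compactly with the return value of ScalableBloomFilter.add(), which reports whether the key was already present. A minimal sketch (the class and method names here are invented for illustration):

from pybloom import ScalableBloomFilter

class SeenFilter(object):
    """Wraps the check-then-add pattern used in Fetcher.run()."""

    def __init__(self):
        self._filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def first_time(self, url):
        # add() returns True when the key was (probably) already present,
        # so a False return means this is the first sighting.
        return not self._filter.add(url)

seen = SeenFilter()
print(seen.first_time("http://example.com/"))   # True  (new URL)
print(seen.first_time("http://example.com/"))   # False (already seen)
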
コード例 #47
0
ファイル: spider.py プロジェクト: kudocc/SpiderBaiduTieba
#load context from file
conn = sqlite3.connect('record.db')
curs = conn.cursor()
curs.execute('''create table if not exists downloaded_image_url (id INTEGER PRIMARY KEY autoincrement, tiebar_url text, image_url text, md5 text)''')
curs.execute('''create table if not exists parsed_url (id INTEGER PRIMARY KEY autoincrement, url text, title text, parsed_time date)''')
curs.execute('''create table if not exists wait_parse_url (id INTEGER PRIMARY KEY autoincrement, url text)''')
print 'finish create table'

#load downloaded image urls
curs.execute('select image_url from downloaded_image_url')
while True:
    url = curs.fetchone()
    if url is not None:
        print 'downloaded image url:', url[0]
        downloaded_image_urls.add(url[0])
    else:
        break
#load parsed urls
curs.execute('select url from parsed_url')
while True:
    url = curs.fetchone()
    if url is not None:
        print 'parsed url:', url[0]
        parsed_urls.add(url[0])
    else:
        break
#load wait parse queue urls
curs.execute('select url from wait_parse_url')
while True:
    url = curs.fetchone()
コード例 #48
0
class BloomAutoYara:
  def __init__(self,filterfile):
    self.filterfile = filterfile
    # if filterfile is present, load the bloom filter from that file, else create a new one
    if os.path.exists(filterfile):
      self.bf = ScalableBloomFilter.fromfile(open(filterfile,"rb"))
      print "available signatures = %d"%len(self.bf)
    else:
      self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

  def save_filter(self):
    print "saving filter to file %s "%self.filterfile
    self.bf.tofile(open(self.filterfile,"wb"))

  def add_string(self,str):
    self.bf.add(str)

  def search_string(self,str):
    if str in self.bf:
      return True
    else:
      return False

  def extractlines(self,filename,min_len=4):
    chars = r"A-Za-z0-9/\-:.,_$%'()[\]<> "
    shortest_run = min_len  # honour the caller-supplied minimum string length
    regexp = '[%s]{%d,}' % (chars, shortest_run)
    pattern = re.compile(regexp)
    fp = open(filename,"rb")
    data = fp.read()
    lines = pattern.findall(data)
    s = set(lines)
    fp.close()
    return list(s)
   
  def build_filter(self,dirname,extensions=[]):
    print extensions
    total = 0
    for (dir, _, files) in os.walk(dirname):
      for f in files:
        ext = f.split(".")[-1]
        
        if len(extensions) != 0 and ext not in extensions:
          continue
          
        print "processing file %s"%f
        total += 1
        path = os.path.join(dir, f)
        lines = self.extractlines(path)
        for line in lines:
          self.add_string(line)
  
    print "creating bloom filter done. Total files = %d (Total entries = %d). Overwriting to bloom filter output file %s"%(total,len(self.bf),self.filterfile)
    self.save_filter()
    
  def find_file_topn(self,filename,topn=10):
    tmp = []
    lines = self.extractlines(filename)
    print "total unique strings in file %s = %d"%(filename,len(lines))
    for line in lines:
      if self.search_string(line) == False:
        tmp.append(line)
    tmp.sort(key=len)
    print "total strings which can be used for signature = %d"%len(tmp)
    tmp = tmp[-topn:]
    tmp.reverse()
    return tmp
    
  def find_dir_topn(self,dirname,topn=10):
    tmplist = []
    for (dir, _, files) in os.walk(dirname):
      for f in files:
        path = os.path.join(dir, f)
        lines = self.extractlines(path)
        for line in lines:
          if self.search_string(line) == False:
            tmplist.append(line) 
    
    counts = Counter(list(tmplist))
    return counts.most_common(topn)

  def escapechars(self,str):
    for c in "\/.^$*+-?()[]{}|":
      str = str.replace(c,"\\"+c)
    return str
    
  def list_to_rule(self,list,rulename,threshold=0.5):
    tmp = "rule " + rulename + "{\n"
    tmp += " strings:\n"
    
    for i in xrange(0,len(list)):
      esc = self.escapechars(list[i])
      tmp = tmp + "$str%d = "%i + r"/[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]" + esc + r"[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]/"
      tmp += "\n"
    
    tmp += "condition:\n"
    tmp += str(int(len(list)*threshold))
    tmp += " of ("
    for i in xrange(0,len(list)):
      tmp += "$str"+ str(i)
      if i != (len(list) - 1):
        tmp += ","
    
    tmp += ")\n}"
    
    print "rule = %s.yara is written to disk "%rulename
    fp = open(rulename+".yara","w")
    fp.write(tmp)
    fp.close()
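
A brief, hypothetical usage sketch for the BloomAutoYara class above (the paths, extensions and rule name are invented; it assumes the class is importable as defined): build a whitelist filter from strings found in known-clean binaries, pick the strings of a suspicious sample that are not in that whitelist, and turn them into a YARA rule that fires when half of them match.

bloom = BloomAutoYara("clean_strings.bloom")
bloom.build_filter("/samples/goodware", extensions=["exe", "dll"])
candidates = bloom.find_file_topn("/samples/suspicious.exe", topn=10)
bloom.list_to_rule(candidates, "suspicious_sample", threshold=0.5)
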
コード例 #49
0
ファイル: master.py プロジェクト: kintomiko/novel_crawler
import redis
import logging, sys
import config
from pybloom import ScalableBloomFilter
sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # stdlib logging; no external Logger helper is imported in this snippet
logger = logging.getLogger('master')

redis_conn = redis.StrictRedis(config.REDIS_HOST, config.REDIS_PORT)

while(True):
  url = redis_conn.blpop(config.RawQueue)[1]

  # keep only unseen URLs whose host part (everything before the first '/' after "http://") mentions '2epub'
  if (url not in sbf) and ('2epub' in url[0:url.find('/', 7)]):
    sbf.add(url)
    redis_conn.rpush(config.PendingCrawlingQueue, url)
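
The slice url[0:url.find('/', 7)] above extracts the host portion by skipping the seven characters of "http://". A sketch of the same host check written with urlparse, which also copes with "https://" prefixes (standalone, not part of the original script):

from urlparse import urlparse  # Python 2, like the snippet above; use urllib.parse on Python 3

def is_2epub_host(url):
    return '2epub' in urlparse(url).netloc

print(is_2epub_host("http://www.2epub.com/book/1"))   # True
print(is_2epub_host("http://example.com/2epub"))      # False (only the host counts)
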
コード例 #50
0
class StoreFeedbackSpider(RedisSpider):
    name = "store"
    allowed_domains = ["aliexpress.com"]
    start_urls = (
        'http://www.aliexpress.com/',
    )

    prefix = ''

    def __init__(self):
        self.feedbacks = dict()
        self.redis_queue = None
        self.ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def get_queue(self):
        for value in set(self.server.smembers(self.redis_key)):
            yield value

    def start_requests(self):
        StoreFeedbackSpider.prefix = self.settings['prefix']
        self.redis_key = '{}:storefeedback'.format(StoreFeedbackSpider.prefix)

        self.redis_queue = self.get_queue()

        db = MongoClient().aliexpress
        for store_feedback in db['{}storefeedback'.format(StoreFeedbackSpider.prefix)].find():
            self.ids.add(store_feedback['id'])

        yield self.next_request()

    def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            # ScalableBloomFilter.add() returns True when the key is (probably) already
            # present, so this loop keeps skipping URLs whose storeId has been seen before.
            if not (url and self.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['storeId'][0])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')

    def make_requests_from_url(self, url):
        self.log('request store feedback url: {}'.format(url), logging.INFO)
        parsed = urlparse.urlparse(url)
        store_id = urlparse.parse_qs(parsed.query)['storeId'][0]
        return self.request(store_id, url)

    def request(self, store_id, base_url, page=1):
        feedback_url = '{}&page={}'.format(base_url, page)
        self.log('request store feedback page: {}'.format(feedback_url), logging.INFO)

        return scrapy.Request(url=feedback_url, meta={'store_id': store_id, 'base_url': base_url, 'page': page},
                              callback=self.parse)

    def parse(self, response):
        self.log('parse store feedback page: {}'.format(response.url), logging.INFO)

        trs = response.xpath('//tbody/tr')
        if len(trs) > 0:
            for tr in trs:
                product = tr.css('.product-name').xpath('a/@href').extract()[0].replace('//', '/')
                time = datetime.strptime(tr.css('.feedback-date').xpath('text()').extract()[0], '%d %b %Y %H:%M')
                star_width = tr.css('.star').xpath('span/@style').extract()[0]
                star = int(star_width[star_width.index(':') + 1:-2]) / 20

                self.store(response.meta['store_id']).append_feedback(time=time, product=product, star=star)

            return self.request(response.meta['store_id'], response.meta['base_url'], int(response.meta['page']) + 1)
        else:
            self.store(response.meta['store_id']).finish_feedback = True
            return self.pop_feedback(response.meta['store_id'])

    def store(self, id):
        if id not in self.feedbacks:
            self.feedbacks[id] = StoreFeedback(id)
        return self.feedbacks[id]

    def pop_feedback(self, id):
        if self.store(id).is_finish():
            feedback = self.feedbacks.pop(id)

            self.log('crawl store feedback: {}'.format(feedback), logging.INFO)

            item = StoreFeedbackItem()
            item['prefix'] = StoreFeedbackSpider.prefix
            item['_id'] = feedback.id
            item['feedbacks'] = feedback.feedbacks
            return item
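
One caveat for the deduplication in next_request() above: a bloom filter answers "probably seen" rather than "definitely seen", so a small fraction of genuinely new store ids can be skipped. The error_rate argument bounds that chance; a tiny sketch with illustrative numbers:

from pybloom import ScalableBloomFilter

# With error_rate=0.0001, roughly 1 in 10,000 unseen ids would be wrongly reported
# as already crawled; pick the rate to match how much missed data is acceptable.
ids = ScalableBloomFilter(initial_capacity=100000, error_rate=0.0001,
                          mode=ScalableBloomFilter.LARGE_SET_GROWTH)
ids.add('12345')
print('12345' in ids)   # True
print('99999' in ids)   # almost certainly False
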
コード例 #51
0
ファイル: get_pp.py プロジェクト: AntNLP/opie
        for key, value in res_dict.items():
            print("NULL\t{0}\t{1}".format(key, value), file=f)
        i += 1
        filename = (field_content +
                    "pickles/parse_sentences/parse_sentences_" +
                    str(i) + ".pickle")


    sbf = ScalableBloomFilter(initial_capacity=150000000,
                              mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    out = open(field_content + "lm/data.new", "w", encoding="utf8")
    with open(field_content + "lm/data.txt", "r", encoding="utf8") as f:
        for line in f:
            w1, w2, w3 = line.strip().split('\t')
            if w2 not in sbf:
                sbf.add(w2)
                print("{0}\t{1}\t{2}".format(w1, w2, w3), file=out)
    out.close()

    #  print("insert db")
    #  path = '"/home/zhi/Project/sentiment_relation_extraction_new_data/data/domains/{0}/lm/data.new"'.format(content)
    #  sql = "load data local infile "+path+ " into table lm_db fields escaped by ''"
    #  print(sql)
    #  execute(connection, sql)
    #  print("insert end")

    #  print("create index")
    #  sql = "alter lm_db add index on (content)"
    #  print(sql)
    #  execute(connection, sql)
    #  connection.close()
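
The loop above keeps only the first (w1, w2, w3) line seen for each middle word w2. The same idea written as a reusable generator, for illustration (the function name and the toy rows are invented):

from pybloom import ScalableBloomFilter

def unique_by_key(rows, key, capacity=10000):
    """Yield only the first row seen for each key(row)."""
    seen = ScalableBloomFilter(initial_capacity=capacity,
                               mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    for row in rows:
        k = key(row)
        if k not in seen:
            seen.add(k)
            yield row

rows = [("a", "dog", "x"), ("b", "dog", "y"), ("c", "cat", "z")]
for row in unique_by_key(rows, key=lambda r: r[1]):
    print(row)   # ("a", "dog", "x") then ("c", "cat", "z")
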
コード例 #52
0
class SupplierCatalogItemTask(BaseSupplierCatalogTask):

	field_names = {
		'advanced':'advanced',
		#'availability_indefinite':'availability_indefinite',
		'available':'available',
		'category_identifier':'category_identifier',
		'cost':'quantity_cost',
		#'effective':'effective',
		'manufacturer_identifier':'manufacturer_identifier', 
		'name':'name',
		'phased_out':'phased_out',
		'product_identifier':'product_identifier',
		'retail':'quantity_retail', 
		'scale_identifier':'scale_identifier',
		'special_cost':'quantity_special_cost',
		'stock':'in_stock',
		#'to_be_announced':'to_be_announced'
	}

	defaults = {
		'advanced': False,
		#'availability_indefinite': False,
		'available': None,
		'category_identifier': None,
		'cost': Decimal(0),
		'name': None,
		'phased_out': False,
		'retail': Decimal(0),
		'scale_identifier': None,
		'special_cost': Decimal(0),
		'stock': False,
		#'to_be_announced': False,
	}
	
	latest_supplier_catalog_id_cache = dict()

	category_conversion_filter = None
	manufacturer_conversion_filter = None
	price_control_filter = None
	scale_conversion_filter = None

	def __init__(self):
		BaseSupplierCatalogTask.__init__(self)
		self.plugins = self.load_plugins()

	def load(self):
		"""Load"""
		logger.debug("Begin load()")
		self.load_all()
		logger.debug("End load()")

	def load_all(self, supplier_id=None):
		logger.debug("Begin load_all()")
		self.ts = ttystatus.TerminalStatus(period=1)
		self.ts.add(ttystatus.Literal('SupplierCatalogItem Load  Elapsed: '))
		self.ts.add(ttystatus.ElapsedTime())
		self.ts.add(ttystatus.Literal('  Supplier: '))
		self.ts.add(ttystatus.PercentDone('supplier_done', 'supplier_total', decimals=2))
		self.ts.add(ttystatus.Literal('  Manufacturer: '))
		self.ts.add(ttystatus.PercentDone('manufacturer_done', 'manufacturer_total', decimals=2))
		self.ts.add(ttystatus.Literal('  Product: '))
		self.ts.add(ttystatus.PercentDone('product_done', 'product_total', decimals=2))
		self.ts.add(ttystatus.Literal('  '))
		self.ts.add(ttystatus.String('manufacturer'))
		self.ts.add(ttystatus.Literal('-'))
		self.ts.add(ttystatus.String('product'))
		
		self.ts['supplier_total'] = 1
		self.ts['supplier_done'] = 0
		self.ts['manufacturer_total'] = 1
		self.ts['manufacturer_done'] = 0
		self.ts['product_total'] = 1
		self.ts['product_done'] = 0

		
		self.ts['supplier_total'] = len(self.plugins)
		self.ts['supplier_done'] = 0
		
		filter_supplier_id = supplier_id
		try:
			#self.session.begin(subtransactions=True)
			for plug in self.plugins.itervalues():
				supplier_id = plug.supplier_id()
				if (
					filter_supplier_id is not None and 
					supplier_id != filter_supplier_id
				):
					continue
				#latest_supplier_catalog = self.load_latest_supplier_catalog(supplier_id)
				#if supplier_catalog is not None:
					#self.supplier_catalog_id = supplier_catalog.id
				self.load_supplier(plug, supplier_id)
				#else:
					#logger.error("No Latest SupplierCatalog Found for Supplier.id %s", supplier_id)
				self.session.flush()
				self.session.expunge_all()
				
				self.ts['supplier_done'] += 1
			#self.session.commit()
		except Exception:
			logger.exception("Caught Exception: ")
			if self.session.transaction is not None:
				self.session.rollback()
		finally:
			self.ts.finish()
		logger.debug("End load_all()")


	def load_supplier(self, plug, supplier_id):
		self.session.begin(subtransactions=True)
		logger.debug("load_supplier %s", supplier_id)
		query = self.session.query(SupplierCatalogItemFieldModel.manufacturer_identifier)
		query = query.filter(SupplierCatalogItemFieldModel.supplier_id == supplier_id)
		query = query.filter(SupplierCatalogItemFieldModel.manufacturer_identifier != None)
		query = query.group_by(SupplierCatalogItemFieldModel.manufacturer_identifier)
		self.ts['manufacturer_total'] = query.count()
		self.ts['manufacturer_done'] = 0
		
		
		for (manufacturer_identifier, ) in query.yield_per(1000):
			self.ts['manufacturer'] = manufacturer_identifier
			self.load_manufacturer(plug, supplier_id, manufacturer_identifier)
			self.session.flush()
			self.session.expunge_all()
			self.ts['manufacturer_done'] += 1
		self.session.commit()

	def load_manufacturer(self, plug, supplier_id, manufacturer_identifier):
		logger.debug("Manufacturer %s", manufacturer_identifier)
		query = self.session.query(SupplierCatalogItemFieldModel.product_identifier)
		query = query.filter(SupplierCatalogItemFieldModel.supplier_id == supplier_id)
		query = query.filter(SupplierCatalogItemFieldModel.manufacturer_identifier == manufacturer_identifier)
		query = query.filter(SupplierCatalogItemFieldModel.product_identifier != None)

		query = query.group_by(SupplierCatalogItemFieldModel.product_identifier)
		self.ts['product_total'] = query.count()
		self.ts['product_done'] = 0
		
		for (product_identifier, ) in query.yield_per(1000):
			self.ts['product'] = product_identifier
			self.load_one(supplier_id, manufacturer_identifier, product_identifier)
			self.ts['product_done'] += 1

			
	def load_one(self, supplier_id, manufacturer_identifier, product_identifier):
		
		#for (key, value) in self.defaults.iteritems():
			#if key not in data or data[key] is None:
				#data[key] = value

		query = self.session.query(SupplierCatalogItemModel)
		query = query.filter(SupplierCatalogItemModel.supplier_id == supplier_id)
		query = query.filter(SupplierCatalogItemModel.manufacturer_identifier == manufacturer_identifier)
		query = query.filter(SupplierCatalogItemModel.product_identifier == product_identifier)
		
		try:
			supplier_catalog_item = query.one()
		except NoResultFound:
			supplier_catalog_item = SupplierCatalogItemModel()
			supplier_catalog_item.supplier_id = supplier_id
			supplier_catalog_item.manufacturer_identifier = manufacturer_identifier
			supplier_catalog_item.product_identifier = product_identifier
			self.session.add(supplier_catalog_item)
		
		#for (field_name, item_name) in self.field_names.iteritems():
			#setattr(supplier_catalog_item, item_name, data[field_name])
		#supplier_catalog_item.effective = data['effective']
		self.session.flush()


	def update(self):
		"""Update"""
		logger.debug("Begin update()")
		self.update_all(limit=10000, time_limit=timedelta(hours=1))
		logger.debug("End update()")


	def update_all(self, modified_since=None, limit=None, time_limit=None):
		"""Update All"""
		logger.debug("Begin update_all()")
		result = None
		ts = self.term_stat('SupplierCatalogItem Update')
		start_time = datetime.now()
		try:
			#s = ScalableBloomFilter()
			#query = self.session.query(
			#	SupplierCatalogItemFieldModel.supplier_id,
			#	SupplierCatalogItemFieldModel.manufacturer_identifier,
			#	SupplierCatalogItemFieldModel.product_identifier,
			#)
			#for row in query.yield_per(1000):
			#	s.add(row)
			
			query = self.session.query(SupplierCatalogItemModel)

			if modified_since:
				query = query.filter(SupplierCatalogItemModel.modified >= modified_since)
			if limit:
				query = query.order_by(SupplierCatalogItemModel.updated.nullsfirst())
				query = query.limit(limit)

			ts['total'] = query.count()
			self.session.begin(subtransactions=True)
			for supplier_catalog_item in query.yield_per(10000):
				row = (
					supplier_catalog_item.supplier_id,
					supplier_catalog_item.manufacturer_identifier,
					supplier_catalog_item.product_identifier,
				)
				#if row not in s:
				#	logger.info(
				#		"Not found in SupplierCatalogItemFields %s %s-%s", 
				#		supplier_catalog_item.supplier_id,
				#		supplier_catalog_item.manufacturer_identifier,
				#		supplier_catalog_item.product_identifier
				#	)
					## TODO Maybe only not do load from SCIV?
				#	continue

				self.update_one(supplier_catalog_item)
				ts['done'] += 1
				if time_limit is not None:
					if datetime.now() > start_time + time_limit:
						logger.info("Reached Time Limit at %i of %i", ts['done'], ts['total'])
						break
			self.session.commit()
			result = True
		except Exception as e:
			logger.exception("Caught Exception: ")
			if self.session.transaction is not None:
				self.session.rollback()
		finally:
			ts.finish()
		logger.debug("End update_all()")
		return result
			
	def update_one(self, supplier_catalog_item):
		"""
		Update One
		
		Using ManufacturerConversion,
			convert manufacturer_identifier to manufacturer_id
		Using ProductConversion, 
			convert product_identifier to product_id and quantity
			quantity_cost from quantity, cost
			quantity_retail from quantity, retail
		Using CategoryConversion, 
			convert category_identifier to category_id
		Using ScaleConversion, 
			convert scale_identifier to scale_id
		Using PriceControl,
			get price_control_id
			using sale, quantity generate quantity_sale
		"""
		self.session.begin(subtransactions=True)
		
		self.update_supplier_catalog_item_version(supplier_catalog_item)
		
		self.update_manufacturer(supplier_catalog_item)
		self.update_product(supplier_catalog_item)
		self.update_category(supplier_catalog_item)
		self.update_scale(supplier_catalog_item)
		self.update_price_control(supplier_catalog_item)
		supplier_catalog_item.updated = datetime.now()
		self.session.commit()

	def load_latest_supplier_catalog_id(self, supplier_id):
		if supplier_id in self.latest_supplier_catalog_id_cache:
			return self.latest_supplier_catalog_id_cache[supplier_id]
		query = self.session.query(SupplierCatalogModel)
		query = query.filter(SupplierCatalogModel.supplier_id == supplier_id)
		supplier_catalog = query.order_by(desc(SupplierCatalogModel.issue_date)).first()
		logger.debug("Latest Supplier %s, %s", supplier_id, supplier_catalog)
		if supplier_catalog is None:
			# no catalog yet for this supplier; cache and return None so callers can handle it
			self.latest_supplier_catalog_id_cache[supplier_id] = None
			return None
		self.latest_supplier_catalog_id_cache[supplier_id] = supplier_catalog.id
		return supplier_catalog.id

	def update_supplier_catalog_item_version(self, supplier_catalog_item):
		if supplier_catalog_item.supplier_id not in self.plugins:
			## Not an ETL tracked Supplier.
			return
		
		plug = self.plugins[supplier_catalog_item.supplier_id]
		model_name = plug.version_model()  + 'Model'
		VersionModel = getattr(model, model_name)

		## TODO: Don't overwrite manual entries
		
		self.latest_supplier_catalog_id = self.load_latest_supplier_catalog_id(supplier_catalog_item.supplier_id)
		if self.latest_supplier_catalog_id is None:
			logger.error("No Latest SupplierCatalog Found for Supplier.id %s", supplier_catalog_item.supplier_id)
			## TODO: What should we be doing here? setting some sort of defaults?
			supplier_catalog_item.legacy_flag = 20
			return

		query = self.session.query(SupplierCatalogItemFieldModel.id)
		query = query.filter(SupplierCatalogItemFieldModel.supplier_id == supplier_catalog_item.supplier_id)
		query = query.filter(SupplierCatalogItemFieldModel.manufacturer_identifier == supplier_catalog_item.manufacturer_identifier)
		query = query.filter(SupplierCatalogItemFieldModel.product_identifier == supplier_catalog_item.product_identifier)
	
		data = None
	
		if query.count() > 0:
	
			s = set()
			for (supplier_catalog_item_field_id, ) in query.yield_per(1000):
				s.add(supplier_catalog_item_field_id)

			del query

			if plug.opaque() is True:
				if plug.ghost() is True:
					data = self.coalesce_opaque_ghost(VersionModel, s, plug)
				else:
					data = self.coalesce_opaque_noghost(VersionModel, s)
			else:
				if plug.ghost() is True:
					data = self.coalesce_translucent_ghost(VersionModel, s, plug)
				else:
					data = self.coalesce_translucent_noghost(VersionModel, s)
			#print "DATA IN", data
		
		if data is None:
			logger.warning(
				"Got None from coalesce %s %s-%s", 
				supplier_catalog_item.supplier_id,
				supplier_catalog_item.manufacturer_identifier,
				supplier_catalog_item.product_identifier,
			)
			## TODO What should we do here?
			supplier_catalog_item.legacy_flag = 30
			return
		
		for (key, value) in self.defaults.iteritems():
			if key not in data or data[key] is None:
				data[key] = value

		#print "DATA OUT", data

		f = {
			'advanced':'advanced',
			#'availability_indefinite':'availability_indefinite',
			'available':'available',
			'category_identifier':'category_identifier',
			'cost':'quantity_cost',
			#'effective':'effective',
			##'manufacturer_identifier':'manufacturer_identifier', 
			'name':'name',
			'phased_out':'phased_out',
			##'product_identifier':'product_identifier',
			'retail':'quantity_retail', 
			'scale_identifier':'scale_identifier',
			'special_cost':'quantity_special_cost',
			'stock':'in_stock',
			#'to_be_announced':'to_be_announced'
		}

		for (field_name, item_name) in f.iteritems():
			setattr(supplier_catalog_item, item_name, data[field_name])
		
		supplier_catalog_item.legacy_flag = 40


	def coalesce_opaque_noghost(self, VersionModel, s, get_effective=False):
		query = self.session.query(VersionModel)
		query = query.filter(VersionModel.supplier_catalog_item_field_id.in_(s))
		query = query.order_by(desc(VersionModel.effective))
		try:
			supplier_catalog_item_version = query.first()
		except NoResultFound:
			logger.debug('No %s Found', VersionModel.__name__)
			return None
		if supplier_catalog_item_version is None:
			logger.debug('No %s Found', VersionModel.__name__)
			return None
		data = dict()
		for field_name in self.field_names.iterkeys():
			data[field_name] = getattr(supplier_catalog_item_version.supplier_catalog_item_field, field_name)
		data['supplier_catalog_id'] = supplier_catalog_item_version.supplier_catalog_id
		
		supplier_catalog_item_field_id = supplier_catalog_item_version.supplier_catalog_item_field_id
		effective = supplier_catalog_item_version.effective
		
		if get_effective:
			for supplier_catalog_item_version in query.yield_per(5):
				if supplier_catalog_item_version.supplier_catalog_item_field_id == supplier_catalog_item_field_id:
					effective = supplier_catalog_item_version.effective
				else:
					break
			data['effective'] = effective
		return data


	def coalesce_opaque_ghost(self, VersionModel, s, plug, get_effective=False):
		data = self.coalesce_opaque_noghost(VersionModel, s, get_effective)
		if data is None: 
			return None
		
		if data['supplier_catalog_id'] != self.latest_supplier_catalog_id:
			if plug.ghost_stock():
				data['stock'] = False
			if plug.ghost_phased_out():
				data['phased_out'] = False
			if plug.ghost_advanced():
				data['advanced'] = False
		return data


	def coalesce_translucent_noghost(self, VersionModel, s):
		query = self.session.query(VersionModel)
		query = query.filter(VersionModel.supplier_catalog_item_field_id.in_(s))
		query = query.order_by(desc(VersionModel.effective))

		count = query.count()

		if count == 0:
			logger.error('No %s Found. Run SupplierCatalogItemVersionTask.vacuum() !', VersionModel.__name__)
			return None

		data = dict()
		first = True
		done = 0
		for supplier_catalog_item_version in query.all():
			done += 1
			if first:
				data['supplier_catalog_id'] = supplier_catalog_item_version.supplier_catalog_id
				data['effective'] = supplier_catalog_item_version.effective
			complete = True
			for field_name in self.field_names.iterkeys():
				field = getattr(supplier_catalog_item_version.supplier_catalog_item_field, field_name)
				if not field_name in data or data[field_name] is None:
					if field is None:
						complete = False
					else:
						data[field_name] = field
			if complete:
				break
		
		#logger.info("Complete SupplierCatalogItem was found in %i of %i Versions", done, count)
				
		return data

	def coalesce_translucent_ghost(self, VersionModel, s, plug):
		data = self.coalesce_translucent_noghost(VersionModel, s)
		if data is None: 
			return None
		
		if data['supplier_catalog_id'] != self.latest_supplier_catalog_id:
			if plug.ghost_stock():
				data['stock'] = False
			if plug.ghost_phased_out():
				data['phased_out'] = False
			if plug.ghost_advanced():
				data['advanced'] = False
		return data


	def update_manufacturer(self, supplier_catalog_item):
		#self.session.begin(subtransactions=True)
		"""Update Manufacturer"""
		#print (
		#	"Update Manufacturer", 
		#	"sid", supplier_catalog_item.supplier_id, 
		#	"mident", supplier_catalog_item.manufacturer_identifier,
		#	"mid", supplier_catalog_item.manufacturer_id
		#)

		manufacturer_conversion = self.get_manufacturer_conversion(
			supplier_catalog_item.supplier_id, 
			supplier_catalog_item.manufacturer_identifier
		)
		if manufacturer_conversion is not None:
			supplier_catalog_item.manufacturer_id = manufacturer_conversion.manufacturer_id
		else:
			supplier_catalog_item.manufacturer_id = None
		#self.session.commit()


	def update_product(self, supplier_catalog_item):
		#self.session.begin(subtransactions=True)
		"""Product Conversion"""
		if (
			supplier_catalog_item.supplier_id is not None and
			supplier_catalog_item.manufacturer_id is not None and
			supplier_catalog_item.product_identifier is not None
		):
			product_conversion = self.get_product_conversion(
				supplier_catalog_item.supplier_id, 
				supplier_catalog_item.manufacturer_id,
				supplier_catalog_item.product_identifier
			)
			if product_conversion is not None:
				supplier_catalog_item.product_id = product_conversion.product_id
				supplier_catalog_item.quantity = product_conversion.get_quantity()
			else:
				supplier_catalog_item.product_id = None
				supplier_catalog_item.quantity = Decimal(1)
		else:
			supplier_catalog_item.product_id = None
			supplier_catalog_item.quantity = Decimal(1)

		if supplier_catalog_item.quantity_cost > 0:
			supplier_catalog_item.cost = decimal_round(supplier_catalog_item.quantity_cost / supplier_catalog_item.quantity, cfg.cost_decimals)
		else:
			supplier_catalog_item.cost = Decimal(0)
			
		if supplier_catalog_item.quantity_special_cost > 0:
			supplier_catalog_item.special_cost = decimal_round(supplier_catalog_item.quantity_special_cost / supplier_catalog_item.quantity, cfg.cost_decimals)
		else:
			supplier_catalog_item.special_cost = Decimal(0)
			
		if supplier_catalog_item.quantity_retail > 0:
			supplier_catalog_item.retail = decimal_round(supplier_catalog_item.quantity_retail / supplier_catalog_item.quantity, cfg.cost_decimals)
		else:
			supplier_catalog_item.retail = Decimal(0)
		#self.session.commit()

	def update_category(self, supplier_catalog_item):
		"""Category Conversion"""
		#self.session.begin(subtransactions=True)
		if (
			supplier_catalog_item.supplier_id is not None and
			supplier_catalog_item.manufacturer_id is not None and
			supplier_catalog_item.category_identifier is not None
		):
			category_conversion = self.get_category_conversion(
				supplier_catalog_item.supplier_id, 
				supplier_catalog_item.manufacturer_id, 
				supplier_catalog_item.category_identifier
			)
			if category_conversion is not None:
				supplier_catalog_item.category_id = category_conversion.category_id
			else:
				supplier_catalog_item.category_id = None
		else:
			supplier_catalog_item.category_id = None
		#self.session.commit()


	def update_scale(self, supplier_catalog_item):
		"""Scale Conversion"""
		#self.session.begin(subtransactions=True)
		if (
			supplier_catalog_item.supplier_id is not None and
			supplier_catalog_item.scale_identifier is not None
		):
			scale_conversion = self.get_scale_conversion(
				supplier_catalog_item.supplier_id, 
				supplier_catalog_item.scale_identifier
			)
			if scale_conversion is not None:
				supplier_catalog_item.scale_id = scale_conversion.scale_id
			else:
				supplier_catalog_item.scale_id = None
		else:
			supplier_catalog_item.scale_id = None
		#self.session.commit()


	def update_price_control(self, supplier_catalog_item):
		"""Price Control"""
		#self.session.begin(subtransactions=True)
		#*** TODO handle price_control.allow_advanced
		
		if (
			supplier_catalog_item.supplier_id is not None and
			supplier_catalog_item.manufacturer_id is not None and
			supplier_catalog_item.retail > 0
		):
			price_control = self.get_price_control(
				supplier_catalog_item.supplier_id, 
				supplier_catalog_item.manufacturer_id, 
				supplier_catalog_item.retail, 
				supplier_catalog_item.advanced, 
				supplier_catalog_item.special
			)
			if price_control is not None:
				supplier_catalog_item.price_control_id = price_control.id
				supplier_catalog_item.rank = price_control.rank
				if supplier_catalog_item.special:
					if supplier_catalog_item.cost > 0:
						ratio = supplier_catalog_item.special_cost / supplier_catalog_item.cost
					else:
						ratio = 1
					special_retail = supplier_catalog_item.retail * ratio
					supplier_catalog_item.sale = price_control.sale(
						supplier_catalog_item.special_cost,
						special_retail
					)
				else:
					supplier_catalog_item.sale = price_control.sale(
						supplier_catalog_item.cost,
						supplier_catalog_item.retail
					)
			else:
				supplier_catalog_item.sale = 0
				supplier_catalog_item.price_control_id = None
				supplier_catalog_item.rank = 0
		else:
			supplier_catalog_item.sale = 0
			supplier_catalog_item.price_control_id = None
			supplier_catalog_item.rank = 0
		#self.session.commit()


	def get_category_conversion(self, supplier_id, manufacturer_id, category_identifier):
		"""Category Conversion"""
		if self.category_conversion_filter is None:
			self.category_conversion_filter = ScalableBloomFilter()
			query = self.session.query(
				CategoryConversionModel.supplier_id,
				CategoryConversionModel.manufacturer_id,
				CategoryConversionModel.needle
			)
			for row in query.yield_per(100):
				self.category_conversion_filter.add(row)
		
		row = (supplier_id, manufacturer_id, category_identifier)
		if row in self.category_conversion_filter:
			query = self.session.query(CategoryConversionModel)
			query = query.filter(CategoryConversionModel.supplier_id == supplier_id)
			query = query.filter(CategoryConversionModel.manufacturer_id == manufacturer_id)
			query = query.filter(CategoryConversionModel.needle == category_identifier)
			try:
				category_conversion = query.one()
				return category_conversion
			except NoResultFound:
				pass

		category_conversion = CategoryConversionModel()
		category_conversion.manufacturer_id = manufacturer_id
		category_conversion.supplier_id = supplier_id
		category_conversion.needle = category_identifier
		self.session.add(category_conversion)
		self.category_conversion_filter.add(row)
		return category_conversion
		
		
	def get_manufacturer_conversion(self, supplier_id, manufacturer_identifier):
		"""Manufacturer Conversion"""
		if self.manufacturer_conversion_filter is None:
			self.manufacturer_conversion_filter = ScalableBloomFilter()
			query = self.session.query(
				ManufacturerConversionModel.supplier_id,
				ManufacturerConversionModel.manufacturer_identifier
			)
			for row in query.yield_per(100):
				self.manufacturer_conversion_filter.add(row)
		
		row = (supplier_id, manufacturer_identifier)
		if row in self.manufacturer_conversion_filter:
			query = self.session.query(ManufacturerConversionModel)
			query = query.filter(ManufacturerConversionModel.supplier_id == supplier_id)
			query = query.filter(ManufacturerConversionModel.manufacturer_identifier == manufacturer_identifier)
			try:
				manufacturer_conversion = query.one()
				return manufacturer_conversion
			except NoResultFound:
				pass
			
		query = self.session.query(ManufacturerModel)
		query = query.filter(ManufacturerModel.identifier == manufacturer_identifier)
		try:
			manufacturer = query.one()
		except NoResultFound:
			logger.warning("No ManufacturerConversion found for supplier_id '%s' manufacturer_identifier '%s'", supplier_id, manufacturer_identifier)
			return None
		
		manufacturer_conversion = ManufacturerConversionModel()
		manufacturer_conversion.manufacturer_id = manufacturer.id
		manufacturer_conversion.supplier_id = supplier_id
		manufacturer_conversion.manufacturer_identifier = manufacturer_identifier
		#self.session.add(manufacturer_conversion)
		return manufacturer_conversion


	def get_price_control(self, supplier_id, manufacturer_id, retail, preorder, special):
		"""Price Control"""
		if self.price_control_filter is None:
			self.price_control_filter = ScalableBloomFilter()
			query = self.session.query(
				PriceControlModel.supplier_id,
				PriceControlModel.manufacturer_id
			)
			for row in query.yield_per(100):
				self.price_control_filter.add(row)
		
		row = (supplier_id, manufacturer_id)
		if row in self.price_control_filter:
			query = self.session.query(PriceControlModel)
			query = query.filter(PriceControlModel.supplier_id == supplier_id)
			query = query.filter(PriceControlModel.manufacturer_id == manufacturer_id)
			if preorder:
				query = query.filter(PriceControlModel.preorder == True)
				
			if special:
				query = query.filter(PriceControlModel.special == True)
			
			if (not preorder) and (not special):
				query = query.filter(PriceControlModel.normal == True)
			
			query = query.filter(PriceControlModel.retail_low <= retail)
			query = query.filter(PriceControlModel.retail_high >= retail)
			query = query.filter(PriceControlModel.enable == True)
			try:
				price_control = query.one()
				return price_control
			except NoResultFound:
				#logger.warning(
				#	"No PriceControl found for supplier_id '%s' manufacturer_id '%s' retail '%s', preorder '%s', special '%s'", 
				#	supplier_id, 
				#	manufacturer_id, 
				#	retail, 
				#	preorder, 
				#	special
				#)
				return None
			except MultipleResultsFound:
				logger.warning(
					"Duplicate PriceControls found for supplier_id '%s' manufacturer_id '%s' retail '%s', preorder '%s', special '%s'", 
					supplier_id, 
					manufacturer_id, 
					retail, 
					preorder, 
					special
				)
		return None


	def get_product_conversion(self, supplier_id, manufacturer_id, product_identifier):
		"""Product Conversion"""
		query = self.session.query(ProductConversionModel)
		query = query.filter(ProductConversionModel.supplier_id == supplier_id)
		query = query.filter(ProductConversionModel.manufacturer_id == manufacturer_id)
		query = query.filter(ProductConversionModel.product_identifier == product_identifier)
		
		try:
			product_conversion = query.one()
			return product_conversion
		except NoResultFound:
			pass
			
		query = self.session.query(ProductModel)
		query = query.filter(ProductModel.manufacturer_id == manufacturer_id)
		query = query.filter(ProductModel.identifier == product_identifier)

		try:
			product = query.one()
		except NoResultFound:
			#logger.warning(
			#	"No ProductConversion found for supplier_id '%s' manufacturer_id '%s' product_identifier '%s'", 
			#	supplier_id, 
			#	manufacturer_id, 
			#	product_identifier, 
			#)
			return None
			
		product_conversion = ProductConversionModel()
		product_conversion.product_id = product.id
		product_conversion.manufacturer_id = manufacturer_id
		product_conversion.supplier_id = supplier_id
		product_conversion.source_quantity = 1
		product_conversion.target_quantity = 1
		return product_conversion


	def get_scale_conversion(self, supplier_id, scale_identifier):
		"""Scale Conversion"""
		
		if scale_identifier is None:
			return None
		if supplier_id is None:
			return None


		if self.scale_conversion_filter is None:
			self.scale_conversion_filter = ScalableBloomFilter()
			query = self.session.query(
				ScaleConversionModel.supplier_id,
				ScaleConversionModel.scale_identifier
			)
			for row in query.yield_per(100):
				self.scale_conversion_filter.add(row)
		
		row = (supplier_id, scale_identifier)
		if row in self.scale_conversion_filter:
			
			query = self.session.query(ScaleConversionModel)
			query = query.filter(ScaleConversionModel.supplier_id == supplier_id)
			query = query.filter(ScaleConversionModel.scale_identifier == scale_identifier)
			
			try:
				scale_conversion = query.one()
				return scale_conversion
			except NoResultFound:
				pass

		query = self.session.query(ScaleModel)
		query = query.filter(ScaleModel.name == scale_identifier)

		try:
			scale = query.one()
		except NoResultFound:
			scale = None

		if scale is not None:
			scale_conversion = ScaleConversionModel()
			scale_conversion.scale_id = scale.id
			return scale_conversion
		else:
			scale_conversion = ScaleConversionModel()
			scale_conversion.scale_id = None
			scale_conversion.supplier_id = supplier_id
			scale_conversion.scale_identifier = scale_identifier
			self.session.add(scale_conversion)
			self.scale_conversion_filter.add(row)
			self.session.flush()
			return scale_conversion
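
The get_*_conversion helpers above share one pattern: warm a bloom filter with the keys that exist in a table, then only run the real SQLAlchemy query when the filter says the key is probably there; a negative answer from the filter is exact, so most misses never touch the database. A schematic, standalone sketch of that pattern (fetch_keys and query_one are stand-ins for the queries, and the toy table is invented):

from pybloom import ScalableBloomFilter

class PrefilteredLookup(object):
    """Bloom-filter prefilter in front of an expensive exact lookup."""

    def __init__(self, fetch_keys, query_one):
        self._query_one = query_one
        self._filter = ScalableBloomFilter()
        for key in fetch_keys():      # warm the filter once, like the yield_per(100) loops above
            self._filter.add(key)

    def get(self, key):
        if key not in self._filter:   # definite miss: skip the expensive query
            return None
        return self._query_one(key)   # probable hit: may still return None on a false positive

table = {('s1', 'm1'): 'conversion-1'}
lookup = PrefilteredLookup(fetch_keys=lambda: table.keys(),
                           query_one=lambda key: table.get(key))
print(lookup.get(('s1', 'm1')))   # 'conversion-1'
print(lookup.get(('s1', 'm2')))   # None, usually answered by the filter alone
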
コード例 #53
0
ファイル: generate.py プロジェクト: rokiyer/Wanderer
def addNewUrl():

	conn = database.getConn()
	cursor = conn.cursor()

	# check if empty
	cursor.execute('SELECT outlinks FROM webpage WHERE status = 2')
	num_outlinks = cursor.rowcount
	rows_outlinks = cursor.fetchall()
	cursor.execute("SELECT error FROM webpage WHERE status = 11")
	num_redirect = cursor.rowcount
	rows_redirect = cursor.fetchall()
	
	num_all = num_redirect + num_outlinks
	if num_all == 0:
		cursor.close()
		conn.close()
		return {'exist':0 , 'insert':0 , 'all':0}

	#bloom start ..input the urls into bloom
	import bitarray
	from pybloom import ScalableBloomFilter
	
	sql = "SELECT url FROM webpage WHERE 1"
	cursor.execute(sql)
	num_exist = cursor.rowcount
	rows = cursor.fetchall()

	sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
	for row in rows:
		sbf.add(row[0])
	#bloom end  sbf

	insert_arr = []
	num_insert = 0

	for row in rows_outlinks:
		outlinks_arr = row[0].split(',')
		proper_links = filterOutLinks(outlinks_arr)
		for link in proper_links:
			if link not in sbf:
				num_insert += 1
				sbf.add(link)
				insert_arr.append((link,0))

	# for the redirect urls (already fetched above as rows_redirect)
	for row in rows_redirect:
		link = row[0]
		link = filterLink(link) 
		if link == '':
			continue

		if link not in sbf:
			num_insert += 1
			sbf.add(link)
			insert_arr.append((link,0))
	
	sql = "INSERT INTO webpage (url,status)VALUE(%s,%s)"
	cursor.executemany(sql,insert_arr)

	cursor.execute("UPDATE webpage SET status = 3 WHERE status = 2 OR status = 11")

	cursor.close()
	conn.close()

	return {'exist':num_exist , 'insert':num_insert , 'all':num_all}
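
addNewUrl() above seeds the filter from every URL already in the webpage table and then keeps only links the filter has not seen, which also removes duplicates inside the new batch itself. The core of that filtering step, isolated as a small function for illustration (the name and the sample URLs are invented):

from pybloom import ScalableBloomFilter

def new_links(existing_urls, candidate_urls):
    """Return candidates that are not already known, deduplicating the batch as well."""
    sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for url in existing_urls:
        sbf.add(url)
    fresh = []
    for url in candidate_urls:
        if url not in sbf:
            sbf.add(url)
            fresh.append(url)
    return fresh

print(new_links(["http://a/"], ["http://a/", "http://b/", "http://b/"]))   # ['http://b/']
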
コード例 #54
0
class StreamingTriangles(threading.Thread):
    daemon = True

    # Constructor sets up Redis connection and algorithm vars
    def __init__(self):
        super(StreamingTriangles, self).__init__()

        # Set up connection to Redis server
        self.redis_server = 'localhost'
        self.redis_db = redis.StrictRedis(host=self.redis_server,
                                          port=6379,
                                          db=0)

        # Initialize reservoir sizes
        self.edge_res_size = 40000
        self.wedge_res_size = 40000

        # Set Scalable Bloom Filter for ignoring repeated edges
        self.bloom_filter = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        # Init counters and arrays for Streaming-Triangles algorithm
        self.edge_count = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0}

        self.total_wedges = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0}

        self.edge_res = {
            RED: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)],
            BLUE: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)],
            YELLOW: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)],
            GREEN: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)]
        }

        self.wedge_res = {
            RED: [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)],
            BLUE:
            [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)],
            YELLOW:
            [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)],
            GREEN:
            [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)]
        }

        self.is_closed = {
            RED: [False for _ in xrange(self.wedge_res_size)],
            BLUE: [False for _ in xrange(self.wedge_res_size)],
            YELLOW: [False for _ in xrange(self.wedge_res_size)],
            GREEN: [False for _ in xrange(self.wedge_res_size)]
        }

        # Track percent of uncategorized transactions
        self.num_missed = 0
        self.num_colored = 0

    # Thread sets up consumer and consumes kafka messages
    def run(self):
        consumer = KafkaConsumer(bootstrap_servers='52.35.109.64:9092')
        consumer.subscribe(['venmo-transactions'])

        for message in consumer:
            msg = str(message.value)
            new_edge = self.__extract_edge__(msg)
            colors = self.__analyze_message__(msg)
            self.redis_db.set(
                'percent_caught',
                self.num_colored / float(self.num_colored + self.num_missed))
            for color in colors:
                colored_edge = tuple((color, new_edge))
                if colored_edge not in self.bloom_filter and -1 not in new_edge:
                    self.__streaming_triangles__(self.redis_db, new_edge,
                                                 color)
                    self.bloom_filter.add(colored_edge)

    # Assign colors to message based on emoji/text content
    def __analyze_message__(self, json_obj):
        json_data = json.loads(json_obj)
        message = json_data['message']  # message data

        moji = PyMoji()
        message = moji.encode(message)
        if isinstance(message, str):
            message = unicode(message, "utf-8")
        message = message.encode('utf-8').lower()
        print(message)

        # Define categorization rules
        foods = [
            "pizza", "hamburger", "food", "burrito", "chinese", "indian",
            "fries", "ramen", "taco", "dinner", "lunch", "spaghetti",
            "poultry_leg", "breakfast", "sushi"
        ]
        drinks = [
            "wine", "cocktail", "drink", " bar",
            "beer", "[:tada]", "club", "vegas"
        ]
        transportation = [
            "taxi", "[:car]", "[:oncoming_automobile]", "uber", "lyft", "ride",
            "drive", "driving"
        ]
        bills = [
            "bulb", "[:moneybag]", "water", "[:house_with_garden]", "[:house]",
            " bill", "rent", "internet", "utilities", "pg&e", "dues", "cable"
        ]

        colors = set()

        # Check for food-related content
        if any(food in message for food in foods):
            colors.add(RED)

        # Check for drink-related content
        if any(drink in message for drink in drinks):
            colors.add(BLUE)

        # Check for transportation-related content
        if any(transport in message for transport in transportation):
            colors.add(YELLOW)

        # Check for bill-related content
        if any(bill in message for bill in bills):
            colors.add(GREEN)

        if (len(colors) == 0):
            self.num_missed += 1
        else:
            self.num_colored += 1

        return colors

    # Streaming triangles algorithm as described in Jha et al. 2013
    def __streaming_triangles__(self, redis_db, new_edge, color):
        k = self.__update__(new_edge, color)
        transitivity = 3 * k
        redis_db.set(str(color + '_transitivity'),
                     transitivity)  # store calculated transitivity in Redis

    # Update function as described in Jha et al. 2013
    def __update__(self, new_edge, color):

        self.edge_count[color] += 1  # increment edge counter
        updated_edge_res = False

        # Check if new edge closes any of the wedges to form a triangle
        for i in range(len(self.wedge_res[color])):
            if self.__is_closed_by__(self.wedge_res[color][i], new_edge):
                self.is_closed[color][i] = True

        # Use reservoir sampling method to maintain random sample of edges, including new edges from stream
        for i in range(len(self.edge_res[color])):
            x = random.uniform(0, 1)
            if x < (1 / float(self.edge_count[color])):
                self.edge_res[color][i] = new_edge
                updated_edge_res = True

        if updated_edge_res:
            new_wedges = [
            ]  # stores all new wedges created by the new edge in the edge reservoir

            # Generate list of new wedges created by the newest edge added to the edge reservoir
            for i in range(len(self.edge_res[color])):
                if self.__creates_wedge__(self.edge_res[color][i], new_edge):
                    new_wedges.append(
                        self.__get_wedge__(self.edge_res[color][i], new_edge))
            self.total_wedges[color] += len(
                new_wedges)  # update running total of wedges formed so far

            # Use reservoir sampling method to maintain random sample of wedges, including newly formed wedges from stream
            for i in range(len(self.wedge_res[color])):
                x = random.uniform(0, 1)
                if self.total_wedges[color] > 0 and x < (
                        len(new_wedges) / float(self.total_wedges[color])):
                    w = random.choice(new_wedges)
                    self.wedge_res[color][i] = w
                    self.is_closed[color][i] = False

        # Return ratio of closed wedges (triangles) in wedge reservoir
        return np.sum(self.is_closed[color]) / float(len(
            self.is_closed[color]))

    # Extract relevant data from json body
    def __extract_edge__(self, json_obj):
        json_data = json.loads(json_obj)
        try:
            from_id = int(json_data['actor']['id'])  # Sender data
            to_id = int(
                json_data['transactions'][0]['target']['id'])  # Receiver data
        except:
            from_id = -1  # Values of -1 are filtered out later
            to_id = -1
        # Sort so edges are treated as undirected (i.e. (132, 452) == (452, 132))
        edge = sorted((from_id, to_id))
        return edge

    # Extract wedge from adjacent edges
    def __get_wedge__(self, edge1, edge2):
        if edge1[0] == edge2[0]:
            return tuple((edge2[1], edge1[0], edge1[1]))
        if edge1[0] == edge2[1]:
            return tuple((edge2[0], edge1[0], edge1[1]))
        if edge1[1] == edge2[0]:
            return tuple((edge2[1], edge1[1], edge1[0]))
        if edge1[1] == edge2[1]:
            return tuple((edge2[0], edge1[1], edge1[0]))
        return None

    # Check if input edge closes input wedge
    def __is_closed_by__(self, wedge, edge):
        if (wedge[0] == edge[0]
                and wedge[2] == edge[1]) or (wedge[0] == edge[1]
                                             and wedge[2] == edge[0]):
            return True
        return False

    # Check if input edges create a wedge
    def __creates_wedge__(self, edge1, edge2):
        if edge1[0] == edge2[0] and edge1[1] != edge2[1]:
            return True
        if edge1[0] == edge2[1] and edge1[1] != edge2[0]:
            return True
        if edge1[1] == edge2[1] and edge1[0] != edge2[0]:
            return True
        if edge1[1] == edge2[0] and edge1[0] != edge2[1]:
            return True
        return False
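
The class above mixes Kafka consumption, Redis writes and per-color state, which can obscure the estimator itself. The following is a minimal, single-color sketch of the same STREAMING-TRIANGLES update loop from Jha et al. 2013, run over a plain in-memory list of undirected edges; all names, reservoir sizes and the sample stream are illustrative, not taken from the code above. For a long stream of distinct edges the return value approaches the graph transitivity, matching the 3 * (closed-wedge fraction) estimate used in __streaming_triangles__.

import random

def estimate_transitivity(edges, edge_res_size=100, wedge_res_size=100):
    # Reservoirs and bookkeeping for a single "color" (category) of edges.
    # Placeholder entries use node 0, assumed not to be a real node id.
    edge_res = [(0, 0)] * edge_res_size        # sampled edges
    wedge_res = [(0, 0, 0)] * wedge_res_size   # sampled wedges (a, center, b)
    is_closed = [False] * wedge_res_size       # closure flag per sampled wedge
    total_wedges = 0
    edge_count = 0

    def creates_wedge(e1, e2):
        # Two edges form a wedge when they share exactly one endpoint.
        return len(set(e1) & set(e2)) == 1

    def get_wedge(e1, e2):
        (center,) = set(e1) & set(e2)
        a = e1[0] if e1[1] == center else e1[1]
        b = e2[0] if e2[1] == center else e2[1]
        return (a, center, b)

    def closed_by(wedge, edge):
        return set(edge) == {wedge[0], wedge[2]}

    for edge in edges:
        edge_count += 1
        updated = False
        # Mark sampled wedges that this edge closes into triangles.
        for i, wedge in enumerate(wedge_res):
            if closed_by(wedge, edge):
                is_closed[i] = True
        # Reservoir-sample edges: each slot is replaced with probability 1/t.
        for i in range(edge_res_size):
            if random.random() < 1.0 / edge_count:
                edge_res[i] = edge
                updated = True
        if updated:
            # Collect the wedges the new edge forms with the sampled edges and
            # reservoir-sample wedges in proportion to the new/total ratio.
            new_wedges = [get_wedge(e, edge) for e in edge_res
                          if creates_wedge(e, edge)]
            total_wedges += len(new_wedges)
            for i in range(wedge_res_size):
                if new_wedges and random.random() < len(new_wedges) / float(total_wedges):
                    wedge_res[i] = random.choice(new_wedges)
                    is_closed[i] = False

    # Only the wedge formed by a triangle's first two edges can be observed as
    # closed, so the closed fraction estimates T/W and transitivity is 3x it.
    return 3.0 * sum(is_closed) / wedge_res_size

# Example usage (estimates are noisy on such a short stream):
stream = [(1, 2), (2, 3), (1, 3), (3, 4), (4, 5), (2, 4)]
print(estimate_transitivity(stream))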
Code example #55
0
File: store.py  Project: jingtingzhiwu/aliexpress
class StoreSpider(RedisSpider):
    name = "store"
    allowed_domains = ["aliexpress.com"]
    start_urls = (
        'http://www.aliexpress.com/',
    )

    prefix = ''

    def __init__(self):
        self.redis_queue = None
        self.ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def get_queue(self):
        for value in set(self.server.smembers(self.redis_key)):
            yield value

    def start_requests(self):
        StoreSpider.prefix = self.settings['prefix']
        self.redis_key = '{}:store'.format(StoreSpider.prefix)

        self.redis_queue = self.get_queue()

        db = MongoClient().aliexpress
        for store in db['{}store'.format(StoreSpider.prefix)].find():
            self.ids.add(store['url'][store['url'].rfind('/') + 1:])

        yield self.next_request()

    def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and self.ids.add(url[url.rfind('/') + 1:])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')

    def parse(self, response):
        try:
            self.log('request store: {}'.format(response.url), logging.INFO)

            owner_member_id = response.css('.s-alitalk').xpath('a/@data-id1').extract()[0]
            evaluation_detail_url = 'http://feedback.aliexpress.com/display/evaluationDetail.htm?ownerMemberId={}'.format(owner_member_id)

            store_feedback_item = UrlItem()
            store_feedback_item['prefix'] = StoreSpider.prefix
            store_feedback_item['type'] = 'storefeedback'
            store_feedback_item['url'] = (
                'http://feedback.aliexpress.com/display/evaluationList.htm'
                '?ownerMemberId={}&refreshPage=received'.format(owner_member_id))

            yield scrapy.Request(url=evaluation_detail_url, callback=self.parse_evaluation_detail,
                                 meta={'store_feedback_item': store_feedback_item})
        except:
            try:
                store_url = response.meta['redirect_urls'][0]
            except:
                store_url = response.url
                self.log('strange store url: {}'.format(store_url), logging.ERROR)
            finally:
                self.log('meet anti-spider, back store: {}'.format(store_url), logging.INFO)

                url_item = UrlItem()
                url_item['prefix'] = StoreSpider.prefix
                url_item['type'] = 'store'
                url_item['url'] = store_url
                yield url_item

    def parse_evaluation_detail(self, response):
        self.log('parse evaluation detail: {}'.format(response.url), logging.INFO)

        summary_tb_tds = response.xpath('//div[@id="feedback-summary"]/div/table/tbody/tr/td')
        store_name = summary_tb_tds[0].xpath('a/text()').extract()[0]
        store_url = summary_tb_tds[0].xpath('a/@href').extract()[0]
        store_positive_feedback = summary_tb_tds[1].xpath('span/text()').extract()[0]
        store_positive_score = int(summary_tb_tds[2].xpath('span/text()').extract()[0].replace(',', ''))
        store_since_time = datetime.strptime(summary_tb_tds[3].xpath('text()').extract()[0].strip(), '%d %b %Y')

        history_tds = response.xpath('//div[@id="feedback-history"]/div/table/tbody/tr/td/a/text()').extract()
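        # The history table flattens row by row into one value per cell with
        # five columns (1 month, 3 months, 6 months, 12 months, overall), so
        # each stride [::5] .. [4::5] below picks out a single column; '-'
        # cells are mapped to 0.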
        one_month_feedback = [int(td.strip().replace(',', '').replace('-', '0')) for td in history_tds[::5]]
        three_month_feedback = [int(td.strip().replace(',', '').replace('-', '0')) for td in history_tds[1::5]]
        six_month_feedback = [int(td.strip().replace(',', '').replace('-', '0')) for td in history_tds[2::5]]
        twelve_month_feedback = [int(td.strip().replace(',', '').replace('-', '0')) for td in history_tds[3::5]]
        overall_feedback = [int(td.strip().replace(',', '').replace('-', '0')) for td in history_tds[4::5]]

        store_id = store_url.split('/')[-1]

        # store_feedback_item = response.meta['store_feedback_item']
        # store_feedback_item['url'] += '&storeId={}'.format(store_id)
        # yield store_feedback_item

        item = StoreItem()
        item['prefix'] = StoreSpider.prefix
        item['_id'] = store_id
        item['url'] = store_url
        item['name'] = store_name
        item['positive_feedback'] = store_positive_feedback
        item['positive_score'] = store_positive_score
        item['since_time'] = store_since_time
        item['one_month_feedback'] = one_month_feedback
        item['three_month_feedback'] = three_month_feedback
        item['six_month_feedback'] = six_month_feedback
        item['twelve_month_feedback'] = twelve_month_feedback
        item['overall_feedback'] = overall_feedback

        all_product_url = 'http://www.aliexpress.com/store/all-wholesale-products/{}.html'.format(store_id)

        self.log('request product store: {}'.format(response.url), logging.INFO)
        return scrapy.Request(all_product_url, meta={'item': item}, callback=self.parse_product_num)

    def parse_product_num(self, response):
        self.log('parse product num: {}'.format(response.url), logging.INFO)

        item = response.meta['item']

        product_num = int(response.xpath('//div[@id="result-info"]/strong/text()').extract()[0].replace(',', ''))
        item['product'] = product_num

        return item
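
A detail next_request() above relies on: pybloom's ScalableBloomFilter.add() returns True when the key was (probably) already in the filter, so "not self.ids.add(...)" is true only for store ids seen for the first time. A tiny standalone sketch of that dedup pattern, assuming the pybloom package (pybloom_live behaves the same way) and made-up store URLs:

from pybloom import ScalableBloomFilter   # or: from pybloom_live import ScalableBloomFilter

seen = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
urls = [
    'http://www.aliexpress.com/store/123456',
    'http://www.aliexpress.com/store/123456',   # duplicate store id
    'http://www.aliexpress.com/store/654321',
]

for url in urls:
    store_id = url[url.rfind('/') + 1:]
    if not seen.add(store_id):    # add() is False the first time an id is seen
        print('crawl {}'.format(url))
    else:
        print('skip  {}'.format(url))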
Code example #56
0
class ProductSpider(RedisSpider):
    name = "product"
    allowed_domains = ["aliexpress.com"]
    start_urls = ('http://www.aliexpress.com/', )

    prefix = ''

    def __init__(self):
        self.products = dict()
        self.ids = ScalableBloomFilter(
            mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        self.redis_queue = None

    def get_queue(self):
        for value in set(self.server.smembers(self.redis_key)):
            yield value

    def start_requests(self):
        ProductSpider.prefix = self.settings['prefix']
        self.redis_key = '{}:product'.format(ProductSpider.prefix)

        self.redis_queue = self.get_queue()

        db = MongoClient().aliexpress
        for product in db['{}product'.format(ProductSpider.prefix)].find():
            self.ids.add(product['url'][product['url'].rfind('/') +
                                        1:product['url'].rfind('.')])

        yield self.next_request()

    def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url
                    and self.ids.add(url[url.rfind('/') + 1:url.rfind('.')])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')

    def parse(self, response):
        self.log('product url: {}'.format(response.url), logging.INFO)

        try:
            store_url = response.css('.shop-name').xpath(
                'a/@href').extract()[0]
            self.log('crawl store url: {}'.format(store_url), logging.INFO)

            store_item = UrlItem()
            store_item['prefix'] = ProductSpider.prefix
            store_item['type'] = 'store'
            store_item['url'] = store_url
            yield store_item

            feedback_base_url = response.xpath(
                '//div[@id="feedback"]/iframe/@thesrc').extract()[0]
            parsed = urlparse.urlparse(feedback_base_url)
            product_id = urlparse.parse_qs(parsed.query)['productId'][0]

            try:
                percent_num = response.css('.percent-num').xpath(
                    'text()').extract()[0]
                rantings_text = response.css('.rantings-num').xpath(
                    'text()').extract()[0]
                rantings_num = rantings_text[1:rantings_text.index(' ')]
                order_text = response.css('.order-num').xpath(
                    'text()').extract()[0]
                order_num = order_text[:order_text.index(' ')]
            except:
                percent_num = 0
                rantings_num = 0
                order_num = 0

            product_item = ProductItem()
            product_item['prefix'] = ProductSpider.prefix
            product_item['_id'] = product_id
            product_item['store'] = store_url
            product_item['url'] = response.url
            product_item['percent_num'] = percent_num
            product_item['rantings_num'] = rantings_num
            product_item['order_num'] = order_num
            yield product_item

            feedback_item = UrlItem()
            feedback_item['prefix'] = ProductSpider.prefix
            feedback_item['type'] = 'feedback'
            feedback_item['url'] = feedback_base_url
            yield feedback_item

            order_item = UrlItem()
            order_item['prefix'] = ProductSpider.prefix
            order_item['type'] = 'order'
            order_item['url'] = (
                'http://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm'
                '?productId={}&type=default'.format(product_id))
            yield order_item
        except:
            try:
                product_url = response.meta['redirect_urls'][0]
            except:
                product_url = response.url
                self.log('strange product url: {}'.format(product_url),
                         logging.ERROR)
            finally:
                self.log(
                    'meet anti-spider, back product: {}'.format(product_url),
                    logging.INFO)

                url_item = UrlItem()
                url_item['prefix'] = ProductSpider.prefix
                url_item['type'] = 'product'
                url_item['url'] = product_url
                yield url_item
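
parse() above recovers the product id by parsing the query string of the feedback iframe URL. A standalone illustration of that step, written to run on Python 2 or 3; the sample URL is made up:

try:
    from urllib.parse import urlparse, parse_qs   # Python 3
except ImportError:
    from urlparse import urlparse, parse_qs       # Python 2

feedback_base_url = ('http://feedback.aliexpress.com/display/productEvaluation.htm'
                     '?productId=32800000000&ownerMemberId=111111')
query = parse_qs(urlparse(feedback_base_url).query)
product_id = query['productId'][0]
print(product_id)   # -> 32800000000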
Code example #57
0
File: frontier.py  Project: ymero/PyCrawler
class BFSFrontier(Frontier):
    def __init__(self, spider):
        super(BFSFrontier, self).__init__(spider)
        self._spider = spider
        self.args = {'rules': [],
                     'order': 'bfs'}
        self.redis = RediSugar.getConnection()
        self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.todo = spider.name + '-todo'
        self.visited = spider.name + '-visited'
        self._feedfilter()

    def setargs(self, args):
        if not isinstance(args, dict):
            raise FrontierException('Args must be a dict')
        for key, value in args.iteritems():
            self.args[key] = value
        if self.args['rules']:
            for each in self.args['rules']:
                try:
                    re.compile(each)
                except re.error:
                    raise FrontierException('Wrong regular expression: \'{0}\''.format(each))

    def __len__(self):
        return self.redis.llen(self.todo)

    def __contains__(self, item):
        temp = self.redis.lrange(self.todo, 0, self.__len__())
        return item in temp

    def visitednum(self):
        return self.redis.llen(self.visited)

    def add(self, item):
        if isinstance(item, list):
            for each in iter(item):
                self._addone(each)
        elif isinstance(item, str):
            self._addone(item)
        else:
            raise FrontierException('Unsupported type: {0}'.format(type(item)))

    def _addone(self, item):
        if not self.isVisited(item) and self.validate(item):
            self.redis.rpush(self.todo, item)

    def next(self, num=1):
        if num == 1:
            return self._nextone()
        elif num == 0 or num >= self.__len__():
            return self._nextall()
        elif num > 1:
            result = []
            while len(result) < num:
                item = self._nextone()
                if item is None:
                    break  # todo queue exhausted before num items were found
                result.append(item)
            return result
        else:
            raise FrontierException('Num should be a non-negative integer')

    def _nextone(self):
        item = self.redis.lpop(self.todo)
        while item:
            if item in self.filter:
                item = self.redis.lpop(self.todo)
            else:
                self.filter.add(item)
                self.redis.rpush(self.visited, item)
                break
        return item

    def _nextall(self):
        temp = self.redis.lrange(self.todo, 0, self.__len__())
        result = [x for x in temp if x not in self.filter]
        self.redis.ltrim(self.todo, len(temp), self.__len__())
        for each in iter(result):
            self.filter.add(each)
            self.redis.rpush(self.visited, each)
        return result

    def hasnext(self):
        return self.__len__() != 0

    def isVisited(self, item):
        return item in self.filter

    def validate(self, item):
        if self.args['rules']:
            for each in self.args['rules']:
                if not re.match(each, item):
                    return False
        return True

    def clean(self, *args):
        if 'visited' in args:
            self.redis.delete(self.visited)
        if 'todo' in args:
            self.redis.delete(self.todo)

    def _feedfilter(self):
        length = self.redis.llen(self.visited)
        if length != 0:
            map(self.filter.add, self.redis.lrange(self.visited, 0, length))

    def save(self):
        try:
            self.redis.bgsave()
        except ResponseError:
            pass
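
BFSFrontier above keeps its frontier in a Redis list (spider.name + '-todo'), mirrors consumed URLs into a second list (spider.name + '-visited'), and uses the bloom filter to suppress revisits on both queueing and popping. A compressed sketch of that bookkeeping with plain redis-py; the key names, localhost connection and example URLs are assumptions, and a running Redis instance is required:

import redis
from pybloom import ScalableBloomFilter   # or: from pybloom_live import ScalableBloomFilter

r = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)
seen = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
TODO, VISITED = 'demo-todo', 'demo-visited'   # assumed key names
r.delete(TODO, VISITED)                       # start from a clean slate

def push(url):
    # Like _addone(): queue only URLs the filter has not marked as visited.
    if url not in seen:
        r.rpush(TODO, url)

def pop():
    # Like _nextone(): skip anything already in the filter, record the rest.
    url = r.lpop(TODO)
    while url is not None and url in seen:
        url = r.lpop(TODO)
    if url is not None:
        seen.add(url)
        r.rpush(VISITED, url)
    return url

push('http://example.com/a')
push('http://example.com/b')
print(pop())                    # http://example.com/a
push('http://example.com/a')    # suppressed: already visited
print(pop())                    # http://example.com/b
print(pop())                    # None -> frontier exhausted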