Example #1
class A25ppSpider(scrapy.Spider):
    name = '25pp'
    allowed_domains = ['25pp.com']
    start_urls = [
        'https://www.25pp.com/android/', 'https://www.25pp.com/android/game/'
    ]
    base_url = 'https://www.25pp.com'

    def __init__(self, checkpoint=None, *a, **kw):
        super(A25ppSpider, self).__init__(*a, **kw)
        self.bf = BloomFilter(capacity=10000000)
        self.apkbf = BloomFilter(capacity=100000000)
        self.checkpoint = checkpoint
        if checkpoint is not None:
            with open(checkpoint, 'r') as fd:
                for line in fd:
                    self.apkbf.add(line.strip())

    def start_requests(self):
        for url in self.start_urls:
            self.bf.add(url)
            yield Request(
                url,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                dont_filter=True)

    def parse(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        categorypattern = re.compile(r'fenlei/[0-9]+')
        for aitem in soup.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if categorypattern.search(href) is None:
                continue
            if href in self.bf:
                continue
            self.bf.add(href)
            yield Request(
                url=href,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                callback=self.parse_category)

    def parse_category(self, response):
        print(response.url)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        category = soup.select('.active')[2].get_text()
        print(category)
        applist = soup.select('.app-list')[0]
        pagelist = soup.select('.page-wrap')[0]
        for aitem in applist.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href in self.bf:
                continue
            self.bf.add(href)
            yield Request(
                url=href,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                meta={'category': category},
                callback=self.parse_detail)
        for aitem in pagelist.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href in self.bf:
                continue
            yield Request(
                url=href,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                callback=self.parse_category)

    def parse_detail(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        appinfo = soup.select('.app-info')[0]
        commonname = appinfo.select('.app-title')[0].get_text()
        pls = soup.select('.permission-list')
        permissionlist = list()
        if len(pls) != 0:
            for perm in pls[0].select('.clearfix')[0].find_all('li'):
                permissionlist.append(perm.get_text())
        category = response.meta['category']
        detail_info = soup.select('.app-detail-info')[0].select('strong')
        size = detail_info[1].get_text()
        updatetime = detail_info[0].get_text()
        version = detail_info[2].get_text()
        urllink = soup.select('.btn-install')[0]['appdownurl']
        platform = self.name
        detailpattern = re.compile(r'detail_[0-9]+')
        idpattern = re.compile(r'[0-9]+')
        detailstring = detailpattern.search(response.url).group()
        apkid = idpattern.search(detailstring).group()
        packagename = commonname
        if apkid in self.apkbf:
            return
        self.apkbf.add(apkid)  # remember this id so it is not processed again (as the other spiders do)
        print("apkid%s" % apkid)
        item = ItemLoader(item=ApkspiderItem(), response=response)
        item.add_value('commonname', commonname)
        item.add_value('apkid_specifiedbyplaform', apkid)
        item.add_value('apkplaform', platform)
        item.add_value('category', category)
        item.add_value('packagename', packagename)
        item.add_value('updatetime', updatetime)
        item.add_value('size', size)
        item.add_value('version', version)
        item.add_value('permission', permissionlist)
        item.add_value('urllink', urllink)
        item.add_value('file_urls', urllink)
        item.add_value('checkpoint', self.checkpoint)
        yield item.load_item()
Example #2
def rawincount(filename):
    # count newlines by reading the raw stream in 1 MiB chunks
    # (the function name is inferred from the rawincount() call further down)
    with open(filename, 'rb') as f:
        bufgen = takewhile(lambda x: x,
                           (f.raw.read(1024 * 1024) for _ in repeat(None)))
        return sum(buf.count(b'\n') for buf in bufgen)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate bloom filter files')
    parser.add_argument("--base_path", default='../')
    parser.add_argument("--path",
                        help="dataset path",
                        default="data/LC-QUAD/linked_answer6.json",
                        dest="dataset_path")
    parser.add_argument('--create', action='store_true')
    args = parser.parse_args()

    bloom = BloomFilter(capacity=200000000, error_rate=0.000001)

    dbpedia_path = os.path.join(args.base_path, 'data', 'dbpedia')
    blooms_path = os.path.join(args.base_path, 'data', 'blooms')
    if args.create:
        for ttl_file in os.listdir(dbpedia_path):
            if '.ttl' not in ttl_file:
                continue
            print(ttl_file)
            file_path = os.path.join(dbpedia_path, ttl_file)
            with open(file_path, 'r') as f:
                for line in tqdm(f, total=rawincount(file_path)):
                    items = line.split(' ')
                    if len(items) != 4:
                        continue
                    items = items[:-1]
Example #3
basedir = os.path.dirname(__file__)


def row_to_dict(row):
    row = row.strip()
    record = dict(xmltodict.parse(row)['row'])

    return {k.replace('@', '').lower(): v for k, v in record.items()}


#
# TRAIN BLOOM FILTER
#
# -- training the Bloom filter
bf = BloomFilter(capacity=10**5, error_rate=0.01)
with open('./resources/0.xml', 'r') as f:
    for line in f:
        user = row_to_dict(line)
        bf.add(user['displayname'])

with open('./resources/hot_displayname.bf', 'wb') as f:
    bf.tofile(f)


#
# MAP REDUCE JOB USING THE FILTER
#
class NotHotFilterJob(MRJob):
    def mapper_init(self):
Example #4
    #     'connection': 'keep-alive',
    #     'cache-control': 'no-cache',
    #     'upgrade-insecure-requests': '1',
    #     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36 QQBrowser/4.2.4976.400',
    #     'Accept': 'text/html, */*; q=0.01',
    #     'Accept-Language': 'zh-CN,zh;q=0.8',
    # }
    # req = request.Request(url, headers=request_headers)
    # response = request.urlopen(req)
    # return response.read()


city_home_pages = []
city_ids = []
dirname = 'mafengwo_notes/'
download_bf = BloomFilter(1024 * 1024 * 16, 0.01)


def download_city_notes(id):
    for i in range(1, 999):
        url = 'http://www.mafengwo.cn/yj/%s/1-0-%d.html' % (id, i)
        if url in download_bf:
            continue

        print('open url' + url)
        htmlcontent = do_request(url).decode('utf-8')

        # all travel notes on this page; a regex is used because the matches can serve directly as URLs
        city_notes = re.findall(r'href="/i/\d{7}.html', htmlcontent)

        if len(city_notes) == 0:
Example #5
 def __init__(self, path=None):
     self.file = None
     self.fingerprints = BloomFilter(2000000, 0.00001)
Example #6
 def __init__(self,save_queue,filename):
     super(Sava_key, self).__init__()
     self.save_queue = save_queue
     self.filename = filename
     self.boom = BloomFilter(capacity=1e7,error_rate=0.001)
Example #7
    def __init__(self,
                 disk_filepath,
                 block_size,
                 n_blocks,
                 n_input_data,
                 growth_factor=2,
                 pointer_density=0.1):
        super(FractionalBloomCola, self).__init__(disk_filepath, block_size,
                                                  n_blocks, n_input_data)

        self.g = int(growth_factor)
        self.p = float(pointer_density)
        self.bloom_filter = BloomFilter(capacity=self.n_input_data,
                                        error_rate=ERROR_RATE)

        # compute the number of levels needed to store all input data
        self.n_levels = 1
        n_elements = 1
        while n_elements < self.n_input_data:
            level_size = 2 * (self.g - 1) * self.g**(self.n_levels - 1)
            level_n_lookahead = int(
                math.floor(2 * self.p * (self.g - 1) *
                           self.g**(self.n_levels - 1)))
            n_elements += (level_size - level_n_lookahead)
            self.n_levels += 1
        self.n_levels += 1

        # compute the number of lookahead pointers
        self.level_sizes = [1] + [(2 * (self.g - 1) * self.g**(i - 1))
                                  for i in range(1, self.n_levels)]
        self.level_n_lookaheads = [0] + [
            int(math.floor(2 * self.p * (self.g - 1) * self.g**(i - 1)))
            for i in range(1, self.n_levels)
        ]

        self.level_n_items = np.zeros(self.n_levels, dtype=int)
        self.disk_size = np.sum(self.level_sizes)

        self.level_start_idxs = np.zeros(self.n_levels, dtype=int)
        for i in range(1, self.n_levels):
            # perform prefix sum to get start idxs for the level
            self.level_start_idxs[i] = self.level_start_idxs[i - 1] + self.level_sizes[i - 1]

        # create storage file.
        if os.path.exists(disk_filepath):
            os.remove(disk_filepath)
        else:
            dirname = os.path.dirname(disk_filepath)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
        disk = h5py.File(self.disk_filepath, 'w')
        disk.create_dataset('dataset', shape=(self.disk_size, ), dtype=int)
        disk.create_dataset('is_lookaheads',
                            shape=(self.disk_size, ),
                            dtype=bool)
        disk.create_dataset('references',
                            shape=(self.disk_size, ),
                            dtype=int)
        disk.close()

        self.disk = h5py.File(self.disk_filepath, 'r+')
        self.data = self.disk['dataset']
        self.is_lookaheads = self.disk['is_lookaheads']
        self.references = self.disk['references']
        self.n_items = 0
        self.final_insert_level = 0
Example #8
from pybloom_live import ScalableBloomFilter

# 可自动扩容的布隆过滤器
bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)

url1 = 'http://www.baidu.com'
url2 = 'http://qq.com'

bloom.add(url1)
print(url1 in bloom)
print(url2 in bloom)

# BloomFilter 是定长的
from pybloom_live import BloomFilter

url1 = 'http://www.baidu.com'
url2 = 'http://qq.com'

bf = BloomFilter(capacity=1000)
bf.add(url1)
print(url1 in bf)
print(url2 in bf)
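
The fixed-capacity filter above can also be persisted and reloaded, as later examples in this listing do with tofile()/fromfile(). A minimal sketch, assuming the same pybloom_live API (the 'urls.bf' filename is illustrative):

from pybloom_live import BloomFilter

bf = BloomFilter(capacity=1000, error_rate=0.001)
bf.add('http://www.baidu.com')

# write the filter to disk ...
with open('urls.bf', 'wb') as fout:
    bf.tofile(fout)

# ... and load it back later
with open('urls.bf', 'rb') as fin:
    bf2 = BloomFilter.fromfile(fin)

print('http://www.baidu.com' in bf2)  # True
print('http://qq.com' in bf2)         # False (with high probability)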
Example #9
from os.path import abspath, dirname
import sqlite3

from flask import Flask, request
from pybloom_live import BloomFilter

here = dirname(abspath(__file__))

db_file = '{}/events.db'.format(here)
app = Flask(__name__)
id_cache = BloomFilter(1000000)
cursor = None


def db_connect():
    schema = '''
    CREATE TABLE IF NOT EXISTS events (
        id CHAR(32) PRIMARY KEY
    );
    '''
    conn = sqlite3.connect(db_file)
    conn.executescript(schema)
    return conn.cursor()


def event_in_db(event_id):
    cursor.execute('SELECT * FROM events WHERE id = ?', (event_id, ))
    return cursor.fetchone() is not None


def insert_new_event(event_id):
Example #10
class FractionalBloomCola(WriteOptimizedDS):
    def __init__(self,
                 disk_filepath,
                 block_size,
                 n_blocks,
                 n_input_data,
                 growth_factor=2,
                 pointer_density=0.1):
        super(FractionalBloomCola, self).__init__(disk_filepath, block_size,
                                                  n_blocks, n_input_data)

        self.g = int(growth_factor)
        self.p = float(pointer_density)
        self.bloom_filter = BloomFilter(capacity=self.n_input_data,
                                        error_rate=ERROR_RATE)

        # compute the number of levels needed to store all input data
        self.n_levels = 1
        n_elements = 1
        while n_elements < self.n_input_data:
            level_size = 2 * (self.g - 1) * self.g**(self.n_levels - 1)
            level_n_lookahead = int(
                math.floor(2 * self.p * (self.g - 1) *
                           self.g**(self.n_levels - 1)))
            n_elements += (level_size - level_n_lookahead)
            self.n_levels += 1
        self.n_levels += 1

        # compute the number of lookahead pointers
        self.level_sizes = [1] + [(2 * (self.g - 1) * self.g**(i - 1))
                                  for i in range(1, self.n_levels)]
        self.level_n_lookaheads = [0] + [
            int(math.floor(2 * self.p * (self.g - 1) * self.g**(i - 1)))
            for i in range(1, self.n_levels)
        ]

        self.level_n_items = np.zeros(self.n_levels, dtype=int)
        self.disk_size = np.sum(self.level_sizes)

        self.level_start_idxs = np.zeros(self.n_levels, dtype=int)
        for i in range(1, self.n_levels):
            # perform prefix sum to get start idxs for the level
            self.level_start_idxs[i] = self.level_start_idxs[i - 1] + self.level_sizes[i - 1]

        # create storage file.
        if os.path.exists(disk_filepath):
            os.remove(disk_filepath)
        else:
            dirname = os.path.dirname(disk_filepath)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
        disk = h5py.File(self.disk_filepath, 'w')
        disk.create_dataset('dataset', shape=(self.disk_size, ), dtype=int)
        disk.create_dataset('is_lookaheads',
                            shape=(self.disk_size, ),
                            dtype=bool)
        disk.create_dataset('references',
                            shape=(self.disk_size, ),
                            dtype=int)
        disk.close()

        self.disk = h5py.File(self.disk_filepath, 'r+')
        self.data = self.disk['dataset']
        self.is_lookaheads = self.disk['is_lookaheads']
        self.references = self.disk['references']
        self.n_items = 0
        self.final_insert_level = 0

    def insert(self, item):
        insert_data = [item]
        self.n_items += 1
        n_inserts = 1
        next_level_data = None
        self.bloom_filter.add(item, skip_check=True)

        # perform the downward merge
        last_insert_level = 0
        for i in range(self.n_levels):
            level_start_idx = self.level_start_idxs[i]
            level_n_items = self.level_n_items[i]
            level_size = self.level_sizes[i]
            level_end_idx = level_start_idx + level_n_items

            level_data = self.data[level_start_idx:level_end_idx]
            level_is_lookaheads = self.is_lookaheads[
                level_start_idx:level_end_idx]
            level_references = self.references[level_start_idx:level_end_idx]

            merge_size = n_inserts + level_n_items
            merged_data = np.zeros(shape=merge_size, dtype=int)
            merged_is_lookaheads = np.zeros(shape=merge_size, dtype=bool)
            merged_references = np.zeros(shape=merge_size, dtype=int)

            # perform the merge here, we merge to the front of the merge array.
            merged_i, insert_i, level_i = 0, 0, 0
            leftmost_lookahead_idx = INVALID_IDX
            while level_i < level_n_items and insert_i < n_inserts:
                if level_data[level_i] <= insert_data[
                        insert_i]:  # insert level items
                    merged_data[merged_i] = level_data[level_i]
                    merged_is_lookaheads[merged_i] = level_is_lookaheads[
                        level_i]
                    # if is lookahead pointer, then
                    if merged_is_lookaheads[merged_i]:
                        merged_references[merged_i] = level_references[level_i]
                        leftmost_lookahead_idx = merged_i
                    else:  # not lookahead, so point to the nearest lookahead.
                        merged_references[merged_i] = leftmost_lookahead_idx
                    level_i += 1
                else:
                    merged_data[merged_i] = insert_data[insert_i]

                    merged_is_lookaheads[merged_i] = False
                    merged_references[merged_i] = leftmost_lookahead_idx
                    insert_i += 1
                merged_i += 1

            if insert_i < n_inserts:
                assert level_i == level_n_items
                merged_data[merged_i:] = insert_data[insert_i:]
                merged_is_lookaheads[merged_i:] = np.zeros_like(
                    insert_data[insert_i:], dtype=bool)
                merged_references[merged_i:] = np.ones_like(
                    insert_data[insert_i:], dtype=int) * leftmost_lookahead_idx
            elif level_i < level_n_items:
                assert insert_i == n_inserts
                merged_data[merged_i:] = level_data[level_i:]
                merged_is_lookaheads[merged_i:] = level_is_lookaheads[level_i:]
                for j, is_lookahead in enumerate(
                        level_is_lookaheads[level_i:]):
                    if is_lookahead:
                        merged_references[merged_i +
                                          j] = level_references[level_i + j]
                        leftmost_lookahead_idx = level_i + j
                    else:
                        merged_references[merged_i +
                                          j] = leftmost_lookahead_idx

            if level_n_items + n_inserts > level_size:  # it will be full, grab all non-pointers
                self.level_n_items[i] = 0
                data_idxs = np.argwhere(
                    np.bitwise_not(merged_is_lookaheads)).reshape(-1)
                insert_data = merged_data[data_idxs]
                n_inserts = len(insert_data)
            else:
                self.level_n_items[i] = merge_size
                level_end_idx = level_start_idx + merge_size

                # perform writes here.
                self.data[level_start_idx:level_end_idx] = merged_data
                self.is_lookaheads[
                    level_start_idx:level_end_idx] = merged_is_lookaheads
                self.references[
                    level_start_idx:level_end_idx] = merged_references

                # update for searches
                self.final_insert_level = max(self.final_insert_level, i)

                # update for the upward insertion of lookahead pointers
                last_insert_level = i
                next_level_data = merged_data
                break

        # perform the upward insertion of lookahead pointers, note that all upper levels were merged
        # and should not have any items, so we can simply override them.
        for i in reversed(range(last_insert_level)):
            level_n_lookahead = self.level_n_lookaheads[i]
            if level_n_lookahead == 0:
                break  # no more lookaheads

            next_level_size = self.level_sizes[i + 1]
            next_level_n_items = self.level_n_items[i + 1]
            assert len(next_level_data) == next_level_n_items

            lookahead_stride = next_level_size // level_n_lookahead
            lookahead_references = [
                ref for ref in range(lookahead_stride -
                                     1, next_level_n_items, lookahead_stride)
            ]
            n_lookahead = len(lookahead_references)
            if n_lookahead == 0:
                break  # no more lookahead pointers to insert.
            lookahead_data = next_level_data[lookahead_references]

            # update n_items
            self.level_n_items[i] = n_lookahead
            level_start_idx = self.level_start_idxs[i]
            level_end_idx = level_start_idx + n_lookahead

            # write to disk
            self.data[level_start_idx:level_end_idx] = lookahead_data
            self.is_lookaheads[level_start_idx:level_end_idx] = np.ones(
                shape=n_lookahead, dtype=bool)
            self.references[
                level_start_idx:level_end_idx] = lookahead_references

            # update for next iteration
            next_level_data = lookahead_data

    def query(self, item):
        idx = self._search(item)
        return idx > INVALID_IDX

    def _search(self, item):
        if item not in self.bloom_filter:
            return INVALID_IDX

        n_search_levels = self.final_insert_level + 1
        search_start = INVALID_IDX
        search_end = INVALID_IDX

        for i in range(n_search_levels):
            if search_start == INVALID_IDX:
                search_start = 0

            level_n_item = self.level_n_items[i]
            if search_end == INVALID_IDX:
                search_end = level_n_item

            assert search_start <= search_end
            if search_end - search_start == 0:
                search_start = INVALID_IDX
                search_end = INVALID_IDX
                continue

            level_start_idx = self.level_start_idxs[i]
            start_idx = level_start_idx + search_start
            end_idx = level_start_idx + search_end
            search_arr = self.data[start_idx:end_idx]

            l, r = self.binary_search(search_arr, item)
            is_found = (l == r) and (l != INVALID_IDX)
            if is_found:
                loc = start_idx + l
                is_lookahead = self.is_lookaheads[loc]
                if is_lookahead:
                    reference = self.references[loc]
                    search_start = reference
                    search_end = reference + 1
                else:
                    return loc
            else:
                if l == INVALID_IDX:
                    search_start = INVALID_IDX
                else:
                    loc = start_idx + l
                    is_lookahead = self.is_lookaheads[loc]
                    reference = self.references[loc]
                    if is_lookahead:
                        search_start = reference
                    else:
                        if reference == INVALID_IDX:
                            search_start = INVALID_IDX
                        else:
                            loc = level_start_idx + reference
                            search_start = self.references[loc]

                if r == INVALID_IDX:
                    search_end = INVALID_IDX
                else:
                    loc = start_idx + r
                    is_lookahead = self.is_lookaheads[loc]
                    reference = self.references[loc]
                    if is_lookahead:
                        search_end = reference
                    else:
                        search_end = INVALID_IDX
                        is_lookaheads = self.is_lookaheads[level_start_idx +
                                                           r +
                                                           1:level_start_idx +
                                                           level_n_item]
                        for j, is_lookahead in enumerate(is_lookaheads):
                            if is_lookahead:
                                reference = self.references[level_start_idx +
                                                            r + 1 + j]
                                search_end = reference
        return -1

    @staticmethod
    def binary_search(search_arr, item):
        # boundary conditions
        search_arr = np.array(search_arr, dtype=int)
        last_idx = len(search_arr) - 1
        if item == search_arr[0]:  # if item is found at the starting idx
            return 0, 0

        if item == search_arr[-1]:  # if item is found at the last idx
            return last_idx, last_idx

        if item > search_arr[-1]:  # if item is bigger than all items
            return last_idx, INVALID_IDX

        if item < search_arr[0]:  # if item is smaller than all items
            return INVALID_IDX, 0

        l = 0
        h = last_idx
        while (l + 1) < h:  # terminate when l + 1 = h
            mid = (l + h) // 2
            if item == search_arr[mid]:
                return mid, mid
            elif item < search_arr[mid]:
                h = mid
            else:  # item > search_arr[mid]
                l = mid
        return l, h
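
For reference, the bracketing convention of binary_search above: an exact hit returns (i, i), a miss returns the pair of neighbouring positions, and values outside the array return INVALID_IDX on the open side. Since it is a staticmethod it can be exercised directly, assuming the surrounding module (INVALID_IDX, numpy) is available:

arr = [1, 3, 5, 7]
print(FractionalBloomCola.binary_search(arr, 5))  # (2, 2): exact match
print(FractionalBloomCola.binary_search(arr, 4))  # (1, 2): bracketed between 3 and 5
print(FractionalBloomCola.binary_search(arr, 0))  # (INVALID_IDX, 0): below the smallest item
print(FractionalBloomCola.binary_search(arr, 9))  # (3, INVALID_IDX): above the largest item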
Example #11
from spider.io.pojo import User, Title, TitleDetail, Img
from spider.io.DB import session
from spider.io.DB import engine
from spider.io.DB import Base

Base.metadata.create_all(engine)

# redis
r = redis.Redis(host=settings['REDIS_HOST'], port=settings['REDIS_PORT'])


def processUserURL(url):
    return url.split('&ie=')[0]


user_bloomFilter = BloomFilter(capacity=2 << 25, error_rate=0.01)
img_bloomFilter = BloomFilter(capacity=2 << 15, error_rate=0.01)


class TiebaPipeline(object):
    def process_item(self, item, spider):
        user_url = processUserURL(item['user_url'])
        t = Title(url=item['url'], title=item['title'], user_url=user_url)
        if 'home/main?un' in user_url and user_url not in user_bloomFilter:
            user_bloomFilter.add(user_url)
            r.lpush('tieba:user_urls', user_url)
        else:
            print('unsupported user_url: %s' % user_url)
        r.lpush('tieba:title_urls', item['url'])
        session.add(t)  # @UndefinedVariable
        session.commit()  # @UndefinedVariable
Example #12
from pybloom_live import BloomFilter

# initialize a BloomFilter with capacity 1000 and error rate 0.001
f = BloomFilter(capacity=1000, error_rate=0.001)
# add the numbers 0-4 to the filter in a loop and print the returned results
res = [f.add(x) for x in range(5)]
print(res)
# add the number 3 again on its own and print the returned result
print(f.add(3))
# check whether 10 and 5 are in the filter and print the results
print(10 in f)
print(5 in f)
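
Since add() returns False when the element was definitely not present yet (and True when it may already be there), its return value alone can drive deduplication. A small sketch assuming the same pybloom_live API:

from pybloom_live import BloomFilter

seen = BloomFilter(capacity=1000, error_rate=0.001)
items = ['a', 'b', 'a', 'c', 'b']

unique = []
for x in items:
    # add() returns False the first time an element is inserted
    if not seen.add(x):
        unique.append(x)

print(unique)  # ['a', 'b', 'c']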
Example #13
class ProproHelper:
    def __init__(self,
                 filepath="../src_tgt.txt",
                 vocabfile="../miniparapair/fastText/vocab.txt",
                 wordVecfile="../miniparapair/fastText/wordVec.txt",
                 sentencesfile="../miniparapair/fastText/sentenceSet.txt",
                 maxlen=25,
                 capacity=250000000):

        self.filepath = filepath
        self.vocabfile = vocabfile
        self.wordVecfile = wordVecfile
        self.sentencesfile = sentencesfile
        self.maxlen = maxlen
        self.bf = BloomFilter(capacity=capacity)

    def extractVocabsAndSentences(self):
        '''
        The vocabulary must be 100% accurate, so a set is used for word deduplication.
        Sentences are deduplicated with a Bloom filter, trading away a little accuracy.
        :param vocabfile: file that stores the vocabulary
        :param sentencesfile: file that stores all the sentences
        :return:
        '''
        vocabSet = set()
        sentencesTokenSet = []  # stores all tokenized sentences, with no duplicates
        num = 0

        try:
            with open(self.filepath, mode="r", encoding="utf-8") as fr:
                for line in fr:
                    try:
                        num += 1
                        if num % 100000 == 0:
                            print("extracting words, deduplicating ...", num / 233864191)
                        line = line.strip()
                        if line != "":
                            sen1, sen2 = line.split("---xhm---")
                            if len(sen1) > self.maxlen or len(
                                    sen2) > self.maxlen:
                                # skip sentence pairs that are too long
                                continue
                            words_1 = list(jieba.cut(sen1))
                            words_2 = list(jieba.cut(sen2))

                            # add the words to the vocabulary set
                            for word in words_1:
                                if word not in vocabSet:
                                    vocabSet.add(word)
                            for word in words_2:
                                if word not in vocabSet:
                                    vocabSet.add(word)

                            # add the sentences to the sentence list
                            if sen1 not in self.bf:
                                sentencesTokenSet.append(" ".join(words_1))
                                self.bf.add(sen1)

                            if sen2 not in self.bf:
                                sentencesTokenSet.append(" ".join(words_2))
                                self.bf.add(sen2)
                    except Exception:
                        print("failed to process this line:", line)
        except Exception:
            print("internal error")

        with open(self.vocabfile, mode="w", encoding="utf-8") as fw:
            fw.write("\n".join(vocabSet))

        with open(self.sentencesfile, mode="w", encoding="utf-8") as fw:
            fw.write("\n".join(sentencesTokenSet))

    def computeAndSaveWord2vec(self):
        fr = open(self.vocabfile, mode="r", encoding="utf-8")
        fw = open(self.wordVecfile, mode="w", encoding="utf-8")
        for line in fr:
            line = line.strip()
            if line != "":
                vec = fastTextHelper.getWordVec(line)
                vec = [str(num) for num in vec]
                fw.write(line + " " + " ".join(vec) + "\n")

        fr.close()
        fw.close()
Example #14
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File    : 1.python布隆过滤器.py


#   install: pip install pybloom-live
from pybloom_live import BloomFilter

# capacity is the filter capacity; error_rate is the tolerable false-positive rate
f = BloomFilter(capacity=1000, error_rate=0.001)

# add() returns False if the element was definitely absent, True if it may already be present
state = f.add('你好')

# ScalableBloomFilter: grows automatically
from pybloom_live import ScalableBloomFilter

# SMALL_SET_GROWTH is the growth mode
sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
sbf.add('你好')  # same usage as BloomFilter; add() requires an element to insert
Example #15
import os
from pybloom_live import BloomFilter

# coon = pymysql.connect(host='127.0.0.1', user='******', passwd='qwer', db='haining')
# cur = coon.cursor()
# cur.execute("SELECT room_id from haining_room")
# room_urls = cur.fetchall()

ls = ["1049be49dc584707"]
os.chdir(r'E:\Myproject\Scan\chizhou\chizhou\spiders')

is_exist = os.path.exists('chizhou.blm')
# check whether the bloom file already exists
# if it does, load the filter from it
if is_exist:
    bf = BloomFilter.fromfile(open('chizhou.blm', 'rb'))
    # otherwise create a new bf object and save it to file at the end
else:
    bf = BloomFilter(1000000, 0.0000001)

i = 1
for room_url in ls:
    if room_url in bf:
        print('pass')
        pass
    else:
        # add it to the Bloom filter
        bf.add(room_url)
        print('added %s item(s)' % i)
        i += 1
# create and write the bloom file (a single write at the end)
Example #16
def trial(fd):
    params = search_params()
    for blk_size in BLK_SIZE:
        for fpr_r in FPR_RECEIVER:
            for fraction in FRACTION:

                # True_positives is the number of txns in the blk the receiver has
                true_positives = int(blk_size * fraction)
                true_false_positives = blk_size - true_positives
                mempool_size = true_false_positives + true_positives
                assert mempool_size == blk_size

                print(
                    'Running %d trials for parameter combination: extra txns in mempool %d blk size %d CB bound %f fraction %f'
                    % (NUM_TRIAL, true_false_positives, blk_size, bound,
                       fraction))

                # Size of Compact block (inv + getdata)
                getdata = true_false_positives * TXN_SHORT_BYTES_CB
                inv = blk_size * TXN_SHORT_BYTES_CB
                compact = inv + getdata

                for i in range(NUM_TRIAL):
                    blk, receiver_mempool = create_mempools(
                        mempool_size, fraction, blk_size, true_false_positives)

                    # Sender creates BF of blk
                    a, fpr_sender, iblt_rows_first = params.CB_solve_a(
                        mempool_size, blk_size, blk_size, 0, bound)
                    bloom_sender = BloomFilter(blk_size, fpr_sender)
                    tmp = blk_size + 0.5
                    exponent = (-bloom_sender.num_slices *
                                tmp) / (bloom_sender.num_bits - 1)
                    real_fpr_sender = (1 -
                                       exp(exponent))**bloom_sender.num_slices
                    #exponent = (-bloom_sender.num_slices*blk_size) / bloom_sender.num_bits
                    #tmp = (1-exp(exponent)) ** bloom_sender.num_slices
                    #real_fpr_sender = max(tmp, fpr_sender)
                    #assert real_fpr_sender >= fpr_sender

                    # Sender creates IBLT of blk
                    iblt_sender_first = PYBLT(a, TXN_SHORT_BYTES)

                    # Add to BF and IBLT
                    for txn in blk:
                        bloom_sender.add(txn)
                        iblt_sender_first.insert(txn, 0x0)

                    # Receiver computes how many items pass through BF of sender and creates IBLT
                    iblt_receiver_first = PYBLT(a, TXN_SHORT_BYTES)
                    Z = []
                    for txn in receiver_mempool:
                        if txn in bloom_sender:
                            Z.append(txn)
                            iblt_receiver_first.insert(txn,
                                                       0x0)  #(id and content)
                    z = len(Z)
                    observed_false_positives = z - true_positives

                    # Eppstein subtraction
                    T = iblt_receiver_first.subtract(iblt_sender_first)
                    boolean, result = T.list_entries()
                    #assert boolean == False

                    # Check whether decoding successful
                    if boolean == True:
                        flag, in_blk = decode_blk(result, Z, blk)

                        # Each component of graphene blk size
                        first_IBLT = (iblt_rows_first * TAU)
                        first_BF = (bloom_sender.num_bits / 8.0)
                        extra = (len(in_blk) * TXN_SHORT_BYTES)
                        # Compute size of Graphene block
                        graphene = first_IBLT + first_BF + extra

                        fd.write(
                            str(true_false_positives) + '\t' + str(blk_size) +
                            '\t' + str(bound) + '\t' + str(fraction) + '\t' +
                            str(mempool_size) + '\t' + str(fpr_sender) + '\t' +
                            str(real_fpr_sender) + '\t' + str(0) + '\t' +
                            str(a) + '\t' + str(0) + '\t' + str(0) + '\t' +
                            str(z) + '\t' + str(0) + '\t' +
                            str(observed_false_positives) + '\t' +
                            str(boolean and flag) + '\t' + str(False) + '\t' +
                            str(graphene) + '\t' + str(first_IBLT) + '\t' +
                            str(first_BF) + '\t' + str(0) + '\t' + str(0) +
                            '\t' + str(extra) + '\t' + str(iblt_rows_first) +
                            '\t' + str(0) + '\t' + str(compact) + '\t' +
                            str(0) + '\t' + str(0) + '\n')
                    else:
                        fpr_receiver = fpr_r
                        bloom_receiver = BloomFilter(z, fpr_receiver)
                        for txn in Z:
                            bloom_receiver.add(txn)

                        # Sender determines IBLT size
                        from_sender = []
                        for txn in blk:
                            if txn not in bloom_receiver:
                                from_sender.append(txn)
                                T.insert(txn, 0x0)
                        h = len(
                            from_sender)  # sender sends these over to receiver
                        #z is the count of txns that pass through bloom filter S
                        # keyword names for the last two arguments are assumed here;
                        # positional arguments cannot follow keyword arguments in Python
                        x_star = params.search_x_star(z=blk_size - h,
                                                      mempool_size=blk_size,
                                                      fpr=fpr_receiver,
                                                      bound=bound,
                                                      blk_size=blk_size)
                        temp = (blk_size - x_star) * fpr_receiver
                        y_star = params.CB_bound(temp, fpr_receiver, bound)
                        y_star = ceil(y_star)

                        b, fpr_sender_second, iblt_rows_second = params.solve_a(
                            m=blk_size, n=x_star, x=x_star, y=y_star)

                        bloom_sender_second = BloomFilter(
                            blk_size - h, fpr_sender_second)
                        iblt_sender_second = PYBLT(b + y_star, TXN_SHORT_BYTES)
                        for txn in blk:
                            iblt_sender_second.insert(txn, 0x0)
                            if txn not in from_sender:
                                bloom_sender_second.add(txn)

                        # Receiver determines IBLT size
                        count = 0
                        for txn in Z:
                            if txn in bloom_sender_second:
                                from_sender.append(txn)
                                T.insert(txn, 0x0)
                                count = count + 1

                        iblt_receiver_second = PYBLT(b + y_star,
                                                     TXN_SHORT_BYTES)
                        # Size of IBLT
                        # if b+(blk_size-h-x_star)-1 >= len(params.params): # difference too much
                        #     tmp = b+(blk_size-h-x_star) * 1.362549
                        #     rows = ceil(tmp)
                        #     iblt_rows_second = rows *  12
                        # else:
                        #     rows = params.params[b+(blk_size-h-x_star)-1][3]
                        #     iblt_rows_second = rows * 12
                        for txn in from_sender:
                            iblt_receiver_second.insert(txn, 0x0)

                        # Eppstein subtraction
                        T_second = iblt_receiver_second.subtract(
                            iblt_sender_second)
                        boolean, result = T_second.list_entries()
                        #print(boolean)
                        #print('Z', z)

                        # Check whether blk was reconstructed properly
                        flag, in_blk = decode_blk(result, from_sender, blk)

                        final = False
                        if boolean == False or flag == False:
                            final, in_blk, not_in_blk = try_ping_pong(
                                T, T_second, set(), set())
                            #print('Ping pong result', final)
                            if final == True:
                                possibly_in_blk = set(from_sender)
                                possibly_in_blk.difference_update(not_in_blk)
                                reconstructed_blk = list(
                                    in_blk.union(possibly_in_blk))
                                assert set(reconstructed_blk) == set(blk)

                        # Each component of graphene blk size
                        first_IBLT = (iblt_rows_first * TAU)
                        first_BF = (bloom_sender.num_bits / 8.0)
                        second_IBLT = (iblt_rows_second * TAU)
                        second_BF = (bloom_receiver.num_bits / 8.0)
                        third_BF = (bloom_sender_second.num_bits / 8.0)
                        extra = (len(in_blk) * TXN_SHORT_BYTES)
                        # Compute size of Graphene block
                        graphene = first_IBLT + first_BF + second_IBLT + second_BF + third_BF + extra

                        fd.write(
                            str(true_false_positives) + '\t' + str(blk_size) +
                            '\t' + str(bound) + '\t' + str(fraction) + '\t' +
                            str(mempool_size) + '\t' + str(fpr_sender) + '\t' +
                            str(real_fpr_sender) + '\t' + str(fpr_receiver) +
                            '\t' + str(a) + '\t' + str(b) + '\t' +
                            str(x_star) + '\t' + str(z) + '\t' + str(count) +
                            '\t' + str(observed_false_positives) + '\t' +
                            str(boolean and flag) + '\t' + str(final) + '\t' +
                            str(graphene) + '\t' + str(first_IBLT) + '\t' +
                            str(first_BF) + '\t' + str(second_IBLT) + '\t' +
                            str(second_BF) + '\t' + str(extra) + '\t' +
                            str(iblt_rows_first) + '\t' +
                            str(iblt_rows_second) + '\t' + str(compact) +
                            '\t' + str(third_BF) + '\t' +
                            str(fpr_sender_second) + '\n')

                    fd.flush()
Example #17
import os
from pybloom_live import BloomFilter

# you can avoid downloading the huge VC runtime: prebuilt .whl files are available at https://www.lfd.uci.edu/~gohlke/pythonlibs/
'''
animals = ['dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle',
           'bird', 'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear',
           'chicken', 'dolphin', 'donkey', 'crow', 'crocodile','testadd']
'''
is_exist = os.path.exists('test.blm')
# check whether the bloom file already exists
# if it does, load the filter from it
if is_exist:
    bf = BloomFilter.fromfile(open('test.blm', 'rb'))
# otherwise create a new bf object and save it to file at the end
else:
    bf = BloomFilter(20000, 0.001)

for i in range(10):
    if i in bf:
        print('pass')
        pass
    else:
        print('add %s' % i)
        bf.add(i)
        n = open('test.blm', 'wb')
        bf.tofile(n)
        n.close()
for i in range(20):
    if i in bf:
        print("written")
Example #18
class BasicBloomCola(WriteOptimizedDS):
    """ this augments the basic cola data structure with bloom filters at each subarray level and a larger bloom
    filter that checks for existence across all levels"""
    def __init__(self,
                 disk_filepath,
                 block_size,
                 n_blocks,
                 n_input_data,
                 growth_factor=2,
                 pointer_density=0.1):
        super(BasicBloomCola, self).__init__(disk_filepath, block_size,
                                             n_blocks, n_input_data)

        self.g = int(growth_factor)
        self.bloom_filter = BloomFilter(capacity=self.n_input_data,
                                        error_rate=ERROR_RATE)

        # compute the number of levels needed to store all input data
        self.n_levels = math.ceil(math.log(self.n_input_data, self.g)) + 1
        self.level_sizes = np.array([self.g**i for i in range(self.n_levels)],
                                    dtype=int)
        self.level_n_items = np.zeros(self.n_levels, dtype=int)
        self.disk_size = np.sum(self.level_sizes) + self.block_size

        self.level_start_idxs = np.zeros(self.n_levels, dtype=int)
        for i in range(1, self.n_levels):
            # perform prefix sum to get start idxs for the level
            self.level_start_idxs[i] = self.level_start_idxs[i - 1] + self.level_sizes[i - 1]

        # create storage file.
        if os.path.exists(disk_filepath):
            os.remove(disk_filepath)
        else:
            dirname = os.path.dirname(disk_filepath)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
        disk = h5py.File(self.disk_filepath, 'w')
        disk.create_dataset('dataset', shape=(self.disk_size, ), dtype=int)
        disk.close()

        self.disk = h5py.File(self.disk_filepath, 'r+')
        self.data = self.disk['dataset']

        self.n_items = 0
        self.final_insert_level = 0

    def insert(self, item):
        insert_data = [item]
        self.n_items += 1
        n_inserts = 1
        self.bloom_filter.add(item)

        # perform the downward merge
        for i in range(self.n_levels):
            level_start_idx = self.level_start_idxs[i]
            level_n_items = self.level_n_items[i]
            level_size = self.level_sizes[i]
            level_end_idx = level_start_idx + level_n_items

            level_data = self.data[level_start_idx:level_end_idx]
            merge_size = n_inserts + level_n_items
            merged_data = np.zeros(shape=merge_size, dtype=int)

            # perform the merge here.
            merged_i, insert_i, level_i = 0, 0, 0
            while level_i < level_n_items and insert_i < n_inserts:
                if level_data[level_i] <= insert_data[
                        insert_i]:  # insert level items
                    merged_data[merged_i] = level_data[level_i]
                    level_i += 1
                else:
                    merged_data[merged_i] = insert_data[insert_i]
                    insert_i += 1
                merged_i += 1

            if insert_i < n_inserts:
                assert level_i == level_n_items
                merged_data[merged_i:] = insert_data[insert_i:]
            elif level_i < level_n_items:
                merged_data[merged_i:] = level_data[level_i:]

            if merge_size > level_size:  # it will be full
                self.level_n_items[i] = 0
                insert_data = copy.deepcopy(merged_data)
                n_inserts = len(insert_data)
            else:
                self.level_n_items[i] = merge_size
                level_end_idx = level_start_idx + merge_size
                self.data[level_start_idx:level_end_idx] = merged_data

                # update for queries
                self.final_insert_level = max(self.final_insert_level, i)
                break

    def query(self, item):
        idx = self._search(item)
        return idx > INVAlID_SEARCH_IDX

    def _search(self, item):
        if item not in self.bloom_filter:  # check bloom filter first.
            return INVAlID_SEARCH_IDX
        n_search_levels = self.final_insert_level + 1

        for i in range(n_search_levels):
            level_n_item = self.level_n_items[i]
            if level_n_item == 0:
                continue  # no items to search

            level_start_idx = self.level_start_idxs[i]
            level_end_idx = level_start_idx + level_n_item
            search_data = self.data[level_start_idx:level_end_idx]
            idx = bs.search(search_data, item)
            if idx < len(search_data) and search_data[idx] == item:
                return level_start_idx + idx
        return INVAlID_SEARCH_IDX
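
A hypothetical usage sketch for BasicBloomCola, assuming the WriteOptimizedDS base class simply records the constructor arguments and that ERROR_RATE, INVAlID_SEARCH_IDX and the bs binary-search helper are defined in the surrounding module (the file path below is illustrative):

ds = BasicBloomCola(disk_filepath='tmp/cola.h5',
                    block_size=64,
                    n_blocks=16,
                    n_input_data=1024)

for value in [42, 7, 19, 7]:
    ds.insert(value)

print(ds.query(7))    # True
print(ds.query(100))  # False: usually rejected by the Bloom filter without touching disk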
Example #19
class A360Spider(scrapy.Spider):
    name = '360'
    allowed_domains = ['zhushou.360.cn']
    start_urls = [
        'http://zhushou.360.cn/list/index/cid/1/',
        'http://zhushou.360.cn/list/index/cid/2/'
    ]
    base_url = 'http://zhushou.360.cn'
    custom_settings = {
        "CONCURRENT_REQUESTS": 3
    }

    def __init__(self, checkpoint=None, *a, **kw):
        super(A360Spider, self).__init__(*a, **kw)
        self.bf = BloomFilter(capacity=10000000)
        self.apkbf = BloomFilter(capacity=100000000)
        self.checkpoint = checkpoint
        if checkpoint is not None:
            with open(checkpoint, 'r') as fd:
                for line in fd:
                    self.apkbf.add(line.strip())

    def start_requests(self):
        for url in self.start_urls:
            self.bf.add(url)
            yield Request(
                url=url,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                dont_filter=True)

    def parse(self, response):
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        categorypattern = re.compile(r'cid/[0-9]+/$')
        for aitem in soup.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if categorypattern.search(href) is None:
                continue
            if href in self.bf:
                continue
            self.bf.add(href)
            yield Request(
                url=href,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                meta={
                    "category_url": href,
                    'category': aitem.get_text()
                },
                callback=self.parse_category)

    def parse_category(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        appinfo = soup.select('div .icon_box')[0]
        pagepattern = re.compile(r'pageCount\s*=\s*[0-9]+')
        numpattern = re.compile(r'[0-9]+')
        for aitem in appinfo.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href.find('detail') == -1:
                continue
            if href in self.bf:
                continue
            self.bf.add(href)
            yield Request(
                url=href,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                meta={'category': response.meta['category']},
                callback=self.parse_detail)
        pageinfo = soup.select('script')[7]
        pagenum = numpattern.search(pagepattern.search(
            pageinfo.text).group()).group()
        print(response.url)
        print(pagenum)
        for np in range(2, int(pagenum)):
            yield Request(
                url=response.meta['category_url'] + '?page=%d' % np,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                meta={
                    "category_url": response.meta['category_url'],
                    'category': response.meta['category']
                },
                callback=self.parse_category)

    def parse_detail(self, response):
        urlpattern = re.compile(r'url=.*')
        apkidpattern = re.compile(r'soft_id/[0-9]+')
        numpattern = re.compile(r'[0-9]+')
        packagenamepattern = re.compile(r'/[^/]*\.apk')
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        print(response.url)
        commonname = soup.select('#app-name')[0].get_text()
        size = soup.select('.s-3')[1].get_text()
        urllink = urlpattern.search(
            soup.select('.js-downLog.dbtn')[0]['href']).group()[4:]
        packagename = packagenamepattern.search(urllink).group()[1:-4]
        apkid = numpattern.search(apkidpattern.search(
            response.url).group()).group()
        metainfo = soup.select('.base-info')[0]
        metainfo = metainfo.select('td')
        developer = metainfo[0].get_text()
        developer = developer[developer.find(u':') + 1:]
        version = metainfo[2].get_text()
        version = version[version.find(u':') + 1:]
        updatetime = metainfo[1].get_text()
        updatetime = updatetime[updatetime.find(u':') + 1:]
        permissionlist = list()
        permission = soup.select('#authority-panel')[0].select(
            'p')[0].get_text().split('\n')
        category = response.meta['category']
        for perm in permission:
            if perm.strip().startswith(u'-'):
                permissionlist.append(perm.strip())
        if apkid in self.apkbf:
            return
        self.apkbf.add(apkid)
        item = ItemLoader(item=ApkspiderItem(), response=response)
        item.add_value('commonname', commonname)
        item.add_value('apkplaform', self.name)
        item.add_value('apkid_specifiedbyplaform', apkid)
        item.add_value('category', category)
        item.add_value('developer', developer)
        item.add_value('packagename', packagename)
        item.add_value('updatetime', updatetime)
        item.add_value('size', size)
        item.add_value('version', version)
        item.add_value('permission', permissionlist)
        item.add_value('urllink', urllink)
        item.add_value('file_urls', urllink)
        item.add_value('checkpoint', self.checkpoint)
        yield item.load_item()
Example #20
 def __init__(self, key_queue, save_queue):
     super(Spider, self).__init__()
     self.key_queue = key_queue
     self.save_queue = save_queue
     self.boom = BloomFilter(capacity=1e7, error_rate=0.001)  # filter to skip keywords that were already crawled
Example #21
class MeizuSpider(scrapy.Spider):
    name = 'meizu'
    allowed_domains = ['app.meizu.com', 'app.flyme.cn']
    start_urls = [
        'http://app.meizu.com/', 'http://app.flyme.cn/games/public/index'
    ]
    custom_settings = {
        "CONCURRENT_REQUESTS": 3
    }
    download_url = 'http://app.flyme.cn/%s/public/download.json?app_id=%d'

    def __init__(self, checkpoint=None, *a, **kw):
        super(MeizuSpider, self).__init__(*a, **kw)
        self.bf = BloomFilter(capacity=10000000)
        self.checkpoint = checkpoint
        self.apkbf = BloomFilter(capacity=100000000)
        if checkpoint is not None:
            with open(checkpoint, 'r') as fd:
                for line in fd:
                    self.apkbf.add(line.strip())

    def start_requests(self):
        for url in self.start_urls:
            metainfo = {
                'type': 'apps'
            }
            if 'games' in url:
                metainfo = {
                    'type': 'games'
                }
            yield Request(
                url,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                meta=metainfo,
                dont_filter=True)

    def parse(self, response):
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        category_url = 'http://app.flyme.cn/%s/public/category/%d/all/feed/index/0/18'
        categorylist = soup.select("#categoryList")
        if len(categorylist) != 1:
            return
        categorylist = categorylist[0]
        dataparam = categorylist.select("li")
        for dp in dataparam:
            if dp.has_attr('data-param'):
                yield Request(
                    url=category_url %
                    (response.meta['type'], int(dp['data-param'])),
                    headers={
                        "User-Agent":
                        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                    },
                    meta=response.meta,
                    callback=self.parse_category)

    def parse_category(self, response):
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        applist = soup.select('#app')
        base_url = 'http://app.flyme.cn'
        if len(applist) == 0:
            return
        applist = applist[0].find_all('a')
        for app in applist:
            if not app.has_attr('href'):
                continue
            if base_url + app['href'] in self.bf:
                continue
            self.bf.add(base_url + app['href'])
            yield Request(
                url=base_url + app['href'],
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                meta=response.meta,
                callback=self.parse_detail)

    def parse_detail(self, response):
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        metedata = soup.select('div.left.inside_left')[0]
        platform = self.name
        category = metedata.select('.current')[0]['title']
        packagenamepattern = re.compile(r'package_name=.*')
        packagename = packagenamepattern.search(response.url).group()[13:]
        urllink = response.url
        app_titles = metedata.find_all("span", class_="app_title")
        app_content = metedata.find_all('div', class_='app_content')
        size = app_content[5].get_text().strip()
        version = app_content[3].get_text().strip()
        updatetime = app_content[6].get_text().strip()
        developer = app_content[2].get_text().strip()
        commonname = soup.find_all('div', class_='app_detail')[0]
        commonname = commonname.find_all('div', class_='detail_top')[0]
        commonname = commonname.find_all('h3')[0].get_text()
        apkid = soup.select('.price_bg.downloading')[0]['data-appid']
        yield Request(
            url=self.download_url % (response.meta['type'], int(apkid)),
            headers={
                "User-Agent":
                "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
            },
            meta={
                'commonname': commonname,
                'platform': platform,
                'category': category,
                'developer': developer,
                'packagename': packagename,
                'updatetime': updatetime,
                'size': size,
                'version': version,
                'urllink': urllink
            },
            callback=self.parse_download)

    def parse_download(self, response):
        json_response = json.loads(response.body_as_unicode())
        if not json_response['code'] == 200:
            return
        urllink = json_response['value']['downloadUrl']
        apkid = response.meta['packagename']
        if apkid in self.apkbf:
            return
        self.apkbf.add(apkid)
        item = ItemLoader(item=ApkspiderItem(), response=response)
        item.add_value('apkid_specifiedbyplaform', apkid)
        item.add_value('commonname', response.meta['commonname'])
        item.add_value('apkplaform', response.meta['platform'])
        item.add_value('category', response.meta['category'])
        item.add_value('developer', response.meta['developer'])
        item.add_value('packagename', response.meta['packagename'])
        item.add_value('updatetime', response.meta['updatetime'])
        item.add_value('size', response.meta['size'])
        item.add_value('version', response.meta['version'])
        item.add_value('urllink', urllink)
        item.add_value('file_urls', urllink)
        item.add_value('checkpoint', self.checkpoint)
        yield item.load_item()
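
The checkpoint argument in MeizuSpider above replays a plain-text file of already-downloaded app ids into self.apkbf, so a restarted crawl skips work it has already done. A minimal standalone sketch of that resume pattern (assuming pybloom_live's BloomFilter and a hypothetical seen_ids.txt file; this is not the project's own persistence code):

from pybloom_live import BloomFilter

def load_checkpoint(path, capacity=1000000):
    # Replay previously seen ids from a checkpoint file into a Bloom filter.
    bf = BloomFilter(capacity=capacity)
    try:
        with open(path, 'r') as fd:
            for line in fd:
                line = line.strip()
                if line:
                    bf.add(line)
    except IOError:
        pass  # first run: start with an empty filter
    return bf

seen = load_checkpoint('seen_ids.txt')    # hypothetical checkpoint file
if 'com.example.app' not in seen:
    seen.add('com.example.app')           # crawl it, then record it as seen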
Exemple #22
0
def make_filter():
    return BloomFilter(
        capacity=settings["MAX_POSTS"],
        error_rate=0.001
    )
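
A factory like make_filter is convenient when the capacity comes from configuration. One way it might be used to de-duplicate posts (a self-contained sketch; the settings dict, the pybloom_live import, and the post ids are assumptions, and make_filter is restated here only so the snippet runs on its own):

from pybloom_live import BloomFilter

settings = {"MAX_POSTS": 100000}            # assumed configuration

def make_filter():
    # mirrors the example above
    return BloomFilter(capacity=settings["MAX_POSTS"], error_rate=0.001)

seen_posts = make_filter()
for post_id in ["p1", "p2", "p1"]:          # "p1" arrives twice
    if post_id in seen_posts:
        continue                            # duplicate, skip it
    seen_posts.add(post_id)
    print("processing", post_id)            # prints p1 and p2 once each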
Exemple #23
0
class QqSpider(scrapy.Spider):
    name = 'qq'
    allowed_domains = ['sj.qq.com'];
    start_urls = ['https://sj.qq.com/myapp/index.htm'];
    custom_settings = {"CONCURRENT_REQUESTS": 3};
    base_cate_url = "https://sj.qq.com/myapp/cate/appList.htm?orgame=%d&categoryId=%d&pageSize=20&pageContext=%d";

    def __init__(self, checkpoint=None, *a, **kw):
        super(QqSpider,self).__init__(*a,**kw);
        self.step = 20;
        self.begin_step = 0;
        self.categorybf = BloomFilter(capacity = 100000000);
        self.checkpoint = checkpoint;
        self.apkbf = BloomFilter(capacity=100000000);
        if not checkpoint == None:
            fd = open(checkpoint,'r');
            while(True):
                line = fd.readline();
                if not line:
                    break;
                line = line.strip();
                self.apkbf.add(line);
            fd.close();


    def start_requests(self):
        for url in self.start_urls:
            yield Request(url,
                headers = {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"},
                dont_filter = True
            );


    def parse(self, response):
        soup = bs4.BeautifulSoup(response.text, "html.parser");
        pattern = re.compile(ur'categoryId=-?[0-9]+');
        idpattern = re.compile(ur'-?[0-9]+');
        orgamepattern = re.compile(ur'orgame=[0-9]+');
        orgameidpattern = re.compile(ur'[0-9]+');
        for aitem in soup.find_all('a'):
            href = aitem['href'];
            if not href.find('categoryId') == -1 and href not in self.categorybf:
                self.categorybf.add(href);
                categoryid = pattern.search(href).group();
                categoryid = idpattern.search(categoryid).group();
                orgname = orgameidpattern.search(orgamepattern.search(href).group()).group();
                url = self.base_cate_url%(int(orgname),int(categoryid),self.begin_step);
                #print(url);
                yield Request(
                    url,
                    headers={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"},
                    meta={'orgname':orgname},
                    callback=self.parse_json
                );
    

    def parse_json(self, response):
        categorypattern = re.compile(ur'categoryId=-?[0-9]+');
        pagecontext = re.compile(ur'pageContext=-?[0-9]+');
        idpattern = re.compile(ur'-?[0-9]+');
        catestring = categorypattern.search(response.url).group();
        pagestring = pagecontext.search(response.url).group();
        cateid = idpattern.search(catestring).group();
        pageid = idpattern.search(pagestring).group();
        json_response = json.loads(response.body_as_unicode());
        count = 0;
        if json_response.has_key('count'):
            count = int(json_response['count']);
        else:
            return;
        print(response.url);
        print(count);
        if count <= 0:
            return;
        objs = "";
        if json_response.has_key('obj'):
            objs = json_response['obj'];
        else:
            return;
        apkplaform = 'qq';
        for obj in objs:
            if obj['apkUrl'] in self.categorybf:
                continue;
            if obj['appId'] in self.apkbf:
                continue;
            self.apkbf.add(obj['appId']);
            self.categorybf.add(obj['apkUrl']);
            print(obj);
            item = ItemLoader(item=ApkspiderItem(), response=response);
            item.add_value("commonname",obj['appName']);
            item.add_value('apkplaform',apkplaform);
            item.add_value('apkid_specifiedbyplaform',str(obj['appId']));
            item.add_value('category',obj['categoryName']);
            item.add_value('developer',obj['authorName']);
            item.add_value('packagename',obj['pkgName']);
            item.add_value('updatetime',obj['apkPublishTime']);
            item.add_value('version',obj['versionName']);
            item.add_value('urllink',obj['apkUrl']);
            item.add_value('file_urls',obj['apkUrl']);
            item.add_value('checkpoint',self.checkpoint);
            yield item.load_item();

        url = self.base_cate_url%(int(response.meta['orgname']),int(cateid),int(pageid)+self.step);
        yield Request(
            url,
            headers={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"},
            meta={'orgname':response.meta['orgname']},
            callback=self.parse_json
        );
Exemple #24
0
 def mapper_init(self):
     with open(os.path.join(basedir, 'resources/hot_displayname.bf'),
               'rb') as f:
         self.filter = BloomFilter.fromfile(f)
Exemple #25
0

def str_filesize(size):
    d = [(1024 - 1, 'K'), (1024**2 - 1, 'M'), (1024**3 - 1, 'G'),
         (1024**4 - 1, 'T')]
    s = [x[0] for x in d]
    index = bisect.bisect_left(s, size) - 1
    if index == -1:
        return str(size)
    else:
        b, u = d[index]
    return str(size / (b + 1)) + u


# fixed capacity
bf = BloomFilter(capacity=1000)

# adding more than `capacity` distinct items raises an error
for i in range(0, 101):
    try:
        bf.add("zjl-{}".format(i))
    except Exception as e:
        print(i, e)

#address, size, endianness, unused, allocated

# some details about the Bloom filter's underlying bit array
address, size, endianness, unused, allocated = bf.bitarray.buffer_info()

print(address, size, endianness, unused, allocated, str_filesize(size))
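
Note that with capacity=1000 the loop above never actually overflows the filter, since only 101 distinct keys are added. A smaller sketch that does trigger the error (assuming the pybloom family of filters, which raise IndexError once the element count exceeds capacity):

from pybloom_live import BloomFilter   # assumed implementation

small = BloomFilter(capacity=10, error_rate=0.001)
for i in range(15):
    try:
        small.add("item-{}".format(i))
    except IndexError as e:
        # the filter refuses further adds once the element count exceeds capacity
        print(i, e)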
Exemple #26
0
def _open_bloom(infile):
    nb = open(infile, "rb")
    return BloomFilter.fromfile(nb)
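
The counterpart of fromfile is tofile: a filter can be serialized to disk and later reloaded with a helper like _open_bloom above. A round-trip sketch (pybloom_live is assumed; the file name is illustrative):

from pybloom_live import BloomFilter

bf = BloomFilter(capacity=1000, error_rate=0.001)
bf.add("example.com/page/1")

with open("urls.bf", "wb") as out:          # illustrative file name
    bf.tofile(out)                          # serialize the filter to disk

with open("urls.bf", "rb") as nb:
    restored = BloomFilter.fromfile(nb)     # the call _open_bloom wraps
print("example.com/page/1" in restored)     # True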
Exemple #27
0
def trial(fd):
    params = search_params()
    for blk_size in BLK_SIZE:
        true_false_positives = blk_size
        for bound in B:
            for fraction in FRACTION:

                # True_positives is the number of txns in the blk the receiver has
                true_positives = int(blk_size * fraction)
                mempool_size = true_false_positives + true_positives

                print(
                    'Running %d trials for parameter combination: extra txns in mempool %d blk size %d fraction %f'
                    % (NUM_TRIAL, true_false_positives, blk_size, fraction))

                # Size of Compact block (inv + getdata)
                getdata = (1 - fraction) * blk_size * TXN_SHORT_BYTES_CB
                inv = blk_size * TXN_SHORT_BYTES_CB
                compact = inv + getdata

                for i in range(NUM_TRIAL):
                    blk, receiver_mempool = create_mempools(
                        mempool_size, fraction, blk_size, true_false_positives)

                    # Sender creates BF of blk
                    a, fpr_sender, iblt_rows_first = params.solve_a(
                        mempool_size, blk_size, blk_size, 0)
                    bloom_sender = BloomFilter(blk_size, fpr_sender)
                    tmp = blk_size + 0.5
                    exponent = (-bloom_sender.num_slices *
                                tmp) / (bloom_sender.num_bits - 1)
                    real_fpr_sender = (1 -
                                       exp(exponent))**bloom_sender.num_slices
                    #exponent = (-bloom_sender.num_slices*blk_size) / bloom_sender.num_bits
                    #tmp = (1-exp(exponent)) ** bloom_sender.num_slices
                    #real_fpr_sender = max(tmp, fpr_sender)
                    #assert real_fpr_sender >= fpr_sender

                    # Sender creates IBLT of blk
                    iblt_sender_first = PYBLT(a, TXN_SHORT_BYTES)

                    # Add to BF and IBLT
                    for txn in blk:
                        bloom_sender.add(txn)
                        iblt_sender_first.insert(txn, 0x0)

                    # Receiver computes how many items pass through BF of sender and creates IBLT
                    iblt_receiver_first = PYBLT(a, TXN_SHORT_BYTES)
                    Z = []
                    for txn in receiver_mempool:
                        if txn in bloom_sender:
                            Z.append(txn)
                            iblt_receiver_first.insert(txn,
                                                       0x0)  #(id and content)
                    z = len(Z)
                    observed_false_positives = z - true_positives

                    # Eppstein subtraction
                    T = iblt_receiver_first.subtract(iblt_sender_first)
                    boolean, result = T.list_entries()
                    #assert boolean == False

                    # Check whether decoding successful
                    if boolean == True:
                        flag, in_blk = decode_blk(result, Z, blk)

                        # Each component of graphene blk size
                        first_IBLT = (iblt_rows_first * TAU)
                        first_BF = (bloom_sender.num_bits / 8.0)
                        extra = (len(in_blk) * TXN_SHORT_BYTES)
                        # Compute size of Graphene block
                        graphene = first_IBLT + first_BF + extra

                        fd.write(
                            str(true_false_positives) + '\t' + str(blk_size) +
                            '\t' + str(bound) + '\t' + str(fraction) + '\t' +
                            str(mempool_size) + '\t' + str(fpr_sender) + '\t' +
                            str(real_fpr_sender) + '\t' + str(0) + '\t' +
                            str(a) + '\t' + str(0) + '\t' + str(0) + '\t' +
                            str(0) + '\t' + str(z) + '\t' + str(0) + '\t' +
                            str(observed_false_positives) + '\t' +
                            str(boolean and flag) + '\t' + str(False) + '\t' +
                            str(graphene) + '\t' + str(first_IBLT) + '\t' +
                            str(first_BF) + '\t' + str(0) + '\t' + str(0) +
                            '\t' + str(extra) + '\t' + str(iblt_rows_first) +
                            '\t' + str(0) + '\t' + str(compact) + '\n')
                    else:
                        # Receiver creates BF of txns that passed through sender's BF
                        # print('z', z)
                        # print('bound', bound)
                        x_star = params.search_x_star(z, mempool_size,
                                                      real_fpr_sender, bound,
                                                      blk_size)
                        temp = (mempool_size - x_star) * real_fpr_sender
                        y_star = params.CB_bound(temp, real_fpr_sender, bound)
                        #print('y_star', y_star)
                        y_star = ceil(y_star)
                        b, fpr_receiver, iblt_rows_second = params.solve_a(
                            blk_size, z, x_star, y_star)

                        bloom_receiver = BloomFilter(z, fpr_receiver)
                        for txn in Z:
                            bloom_receiver.add(txn)

                        # Receiver determines IBLT size
                        iblt_sender_second = PYBLT(b + y_star, TXN_SHORT_BYTES)
                        # Sender creates IBLT of blk again and sends txns that do not pass through BF of receiver
                        count = 0
                        for txn in blk:
                            iblt_sender_second.insert(txn, 0x0)
                            if txn not in bloom_receiver:
                                T.insert(
                                    txn, 0x0
                                )  # add txns just received to subtracted IBLT
                                Z = Z + [txn]  # sends the txn to the receiver
                                count = count + 1

                        iblt_receiver_second = PYBLT(b + y_star,
                                                     TXN_SHORT_BYTES)
                        for txn in Z:
                            iblt_receiver_second.insert(txn, 0x0)

                        # Eppstein subtraction
                        T_second = iblt_receiver_second.subtract(
                            iblt_sender_second)
                        boolean, result = T_second.list_entries()
                        #print(boolean)
                        #print('Z', z)

                        # Check whether blk was reconstructed properly
                        flag, in_blk = decode_blk(result, Z, blk)

                        final = False
                        if boolean == False or flag == False:
                            final, in_blk, not_in_blk = try_ping_pong(
                                T, T_second, set(), set())
                            #print('Ping pong result', final)
                            if final == True:
                                possibly_in_blk = set(Z)
                                possibly_in_blk.difference_update(not_in_blk)
                                reconstructed_blk = list(
                                    in_blk.union(possibly_in_blk))
                                assert set(reconstructed_blk) == set(blk)

                        # Each component of graphene blk size
                        first_IBLT = (iblt_rows_first * TAU)
                        first_BF = (bloom_sender.num_bits / 8.0)
                        second_IBLT = (iblt_rows_second * TAU)
                        second_BF = (bloom_receiver.num_bits / 8.0)
                        extra = (len(in_blk) * TXN_SHORT_BYTES)
                        # Compute size of Graphene block
                        graphene = first_IBLT + first_BF + second_IBLT + second_BF + extra

                        fd.write(
                            str(true_false_positives) + '\t' + str(blk_size) +
                            '\t' + str(bound) + '\t' + str(fraction) + '\t' +
                            str(mempool_size) + '\t' + str(fpr_sender) + '\t' +
                            str(real_fpr_sender) + '\t' + str(fpr_receiver) +
                            '\t' + str(a) + '\t' + str(b) + '\t' +
                            str(x_star) + '\t' + str(y_star) + '\t' + str(z) +
                            '\t' + str(count) + '\t' +
                            str(observed_false_positives) + '\t' +
                            str(boolean and flag) + '\t' + str(final) + '\t' +
                            str(graphene) + '\t' + str(first_IBLT) + '\t' +
                            str(first_BF) + '\t' + str(second_IBLT) + '\t' +
                            str(second_BF) + '\t' + str(extra) + '\t' +
                            str(iblt_rows_first) + '\t' +
                            str(iblt_rows_second) + '\t' + str(compact) + '\n')

                    fd.flush()

Exemple #28
0
if __name__ == '__main__':
    # build one Bloom filter for each new-cluster key file
    logging.info(
        '===================================begin new compare task==================================='
    )
    logging.info('base new cluster key files begin build filter')
    buildFilterThreads = []
    filterKeys = []
    filterLens = []

    for filename in os.listdir(newClusterPath):
        if not filename.endswith('csv'):
            continue
        filterKey = BloomFilter(capacity=1000 * 10000, error_rate=0.00001)
        filterLen = BloomFilter(capacity=1000 * 10000, error_rate=0.00001)
        filterKeys.append(filterKey)
        filterLens.append(filterLen)
        logging.info('base %s build BloomFilter cost memory %dM' %
                     (filename, len(filterKey.bitarray * 2) / 8 / 1024 / 1024))
        t = threading.Thread(target=buildFilter,
                             args=(filterKeys[-1], filterLens[-1], filename,
                                   newClusterPath))
        buildFilterThreads.append(t)
        t.start()
        # break

    for a in buildFilterThreads:
        a.join()
    logging.info('base new cluster key files end build filter')
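
The buildFilter worker launched in each thread is not part of this snippet. A hypothetical version consistent with how it is called here, (filterKey, filterLen, filename, path), might simply stream the CSV and populate both filters; the column layout (key in the first column) and the length filter's contents are assumptions:

import csv
import os

def buildFilter(filterKey, filterLen, filename, path):
    # Hypothetical worker: feed every key from one CSV into the key filter,
    # and the key's length into the length filter.
    with open(os.path.join(path, filename), 'r') as f:
        for row in csv.reader(f):
            if not row:
                continue
            key = row[0]
            filterKey.add(key)
            filterLen.add(str(len(key)))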
Exemple #29
0
class Main():
    def __init__(self):
        self.taskCode = ""

        # read the configuration file
        configPath = "config.ini"
        WebConfig = configparser.ConfigParser()
        WebConfig.read(configPath, encoding='utf-8-sig')
        self.redisHost = WebConfig.get("redis", "host")
        self.redisPort = WebConfig.get("redis", "port")
        self.redisPassword = WebConfig.get("redis", "password")
        self.redisDb = WebConfig.get("redis", "database")
        self.redis_platform_address = WebConfig.get("redis","redis_platform_address")

        self.url_key_name = self.redis_platform_address+":url:" + self.taskCode
        self.redis = redis.Redis(host=self.redisHost, port=self.redisPort, decode_responses=True, password=self.redisPassword, db=self.redisDb)

        mongoHost = WebConfig.get("mongodb", "host")
        mongoPort = WebConfig.get("mongodb", "port")
        mongoUser = WebConfig.get("mongodb", "user")
        mongoPassword = WebConfig.get("mongodb", "password")
        mongourl = "mongodb://" + mongoUser + ":" + mongoPassword + "@" + mongoHost + ":" + mongoPort
        conn = pymongo.MongoClient(mongourl)
        mongoDatabase = WebConfig.get("mongodb", "database")  # MongoDB database name
        self.myMongo = conn[mongoDatabase]  # database handle


        self.bloom = None

        self.webType = ""
        self.executionType =""

        # pagination settings
        self.start_url = ""
        self.second_page_value = ""
        self.page_interval = ""
        self.end_page_value = ""
        self.url_type = ""
        self.lineListXpath = ""
        self.json_page_re = ""
        self.page_xpath = ""    # extra data to extract from the listing page itself, if any
        # XPaths for extracting page elements
        self.titleXpath = ""
        self.contentXpath = ""

        self.proxy = None
        self.proxy_url = None
        self.headers = {
            'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                           'Windows NT 6.1; Win64; x64; Trident/5.0)'),
        }  # header
        self.timeout = 10
        self.timeInterval = 0  # request interval
        self.post_data = ""
        self.page_num_str = ""
    # load the Bloom filter data from the database
    def bloom_readfrom_db(self):
        tempFile = open("tempFile", "wb")

        bloom_dict = self.myMongo["bloom"].find_one({"_id": self.taskCode})

        if bloom_dict:  # a stored Bloom filter exists, so load it
            bloomData = bloom_dict["bloom_data"]
            tempFile.write(bloomData)
            tempFile.close()
            bloomFile = open("tempFile", "rb")
            self.bloom = BloomFilter.fromfile(bloomFile)
        else:
            self.bloom = BloomFilter(capacity=1000000, error_rate=0.00001)

    def get_proxy(self):
        ps = requests.get(self.proxy_url).text
        return ps

    # persist the Bloom filter data to the database
    def bloom_writeto_db(self):
        bloomDbKeyName = self.redis_platform_address + ":bloom:" + self.taskCode

        tempFile_del = open("tempFile", "wb")

        self.bloom.tofile(tempFile_del)         # dump the Bloom filter to a file
        tempFile_del.close()

        bloomFile = open("tempFile", "rb")      # reopen the file holding the dumped data
        bloomData = bloomFile.read()

        insert_data = {"_id": self.taskCode, "bloom_data": bloomData}

        bloom_dict = self.myMongo["bloom"].find_one({"_id": self.taskCode})

        if bloom_dict:  # update the existing Bloom filter
            self.myMongo["bloom"].update_one({"_id": self.taskCode},{"$set": {"bloom_data":bloomData}})
        else:
            self.myMongo["bloom"].insert_one(insert_data)

        bloomFile.close()
        logging.info("Bloom filter saved to the database: " + bloomDbKeyName)

    # build every listing-page url
    def get_PageUrlList(self):
        """Build the paginated listing-page urls."""
        urlList = []
        for i in range(int(self.second_page_value), int(self.end_page_value)):
            page_num = str(i)
            page_url = self.url_type.replace("%d", page_num)
            urlList.append(page_url)
        urlList.append(self.start_url)
        return urlList

    # download the data at the given url
    def download(self, url):
        try:
            if self.proxy:
                proxy = self.get_proxy().strip()
                proxies={'https':proxy}  # use the fetched proxy
                response = requests.get(url, proxies=proxies, timeout=self.timeout, headers=self.headers,verify=False)
                logging.info(url)
                logging.info("proxy in use")
            else:
                response = requests.get(url, timeout=self.timeout, headers=self.headers,verify=False)

            statusCode = response.status_code
            codeStyle = cchardet.detect(response.content)["encoding"]
            if not codeStyle:
                codeStyle = "utf-8"
            webData = response.content.decode(codeStyle, errors="ignore")
            return (webData, statusCode)
        except Exception as e:
            print(e)
            return (0,0)

    def change_outqueue_num(self):
        keyName = self.redis_platform_address + ":status:" + self.taskCode  # key of the task's status entry
        status_data = self.redis.get(keyName)  # fetch all status data
        print("-------------------------", self.taskCode)
        taskData = json.loads(status_data)
        taskData["outQueue"] = 1        # update the json data
        keyname_data = json.dumps(taskData)  # serialize back to a string
        self.redis.set(keyName, keyname_data)  # update redis

    # refresh all required attributes
    def update_attr(self):
        keyName = self.redis_platform_address+":status:" + self.taskCode  # key of the task's status entry
        status_data = self.redis.get(keyName)  # fetch all status data
        print("-------------------------", self.taskCode)

        taskData = json.loads(status_data)

        self.executionType = int(taskData["executionType"])
        self.taskCode = taskData["taskCode"]
        self.timeInterval = taskData["timeInterval"]
        self.url_key_name = self.redis_platform_address+":url:" + self.taskCode



        # download settings
        if "proxy" in taskData:
            self.proxy = taskData["proxy"]
        else:
            self.proxy = ""
        if "proxyProductValue" in taskData:
            self.proxy_url = taskData["proxyProductValue"]
        else:
            self.proxy_url = ""

        if "timeout" in taskData:
            self.timeout = taskData["timeout"]
        else:
            self.timeout = 10


        temp_data = json.loads(taskData["templateInfo"])    # template data
        print(temp_data)
        try:
            self.webType = temp_data["web_type"]
        except KeyError:
            self.webType = temp_data["webType"]

        # pagination settings
        self.start_url = temp_data["start_url"]
        self.second_page_value = int(temp_data["second_page_value"])
        if "page_interval" in temp_data:
            self.page_interval = int(temp_data["page_interval"])
        else:
            self.page_interval = 1
        self.end_page_value = int(temp_data["end_page_value"])
        self.url_type = temp_data["url_type"]
        try:
            self.lineListXpath = temp_data["line_list_xpath"]
        except KeyError:
            self.lineListXpath = temp_data["lineListXpath"]

        if "headers" in temp_data:
            self.headers = temp_data["headers"]
        else:
            self.headers = {
            'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; ''Windows NT 6.1; Win64; x64; Trident/5.0)'),
        }  # header

        if "json_page_re" in temp_data:
            self.json_page_re = temp_data["json_page_re"]
        else:
            self.json_page_re = ""
        if "post" in temp_data:
            self.post_data = temp_data["post"]
        else:
            self.post_data = None
        if "page_num_str" in temp_data:
            self.page_num_str = temp_data["page_num_str"]
        else:
            self.page_num_str = ""

        if "page_xpath" in temp_data:
            self.page_xpath = temp_data["page_xpath"]
        else:
            self.page_xpath = ""

    def deal_html_page_data(self,base_url,line,swtich=False):   # process one row of a listing page
        if self.page_xpath:
            one_data_dict = {}
            for key, keyxpath in self.page_xpath.items():
                if key == "url_xpath" or key == "url":
                    content_url = line.xpath(keyxpath)
                    if content_url:
                        endUrl = urljoin(base_url, content_url[0])
                        one_data_dict["url"] = endUrl
                        continue
                    else:  # no url was extracted
                        swtich = True

                keystr = line.xpath(keyxpath)
                keystr = "".join(keystr)

                if keystr == "images" or keystr == "images_xpath":  # handle image links
                    keystr = urljoin(base_url, keystr)

                one_data_dict[key] = keystr
            end_data = json.dumps(one_data_dict)  # serialize the dict to a string

        else:
            end_data = urljoin(base_url,line)
        return end_data,swtich

    def deal_json_page_data(self,base_url,line,swtich=False):
        if self.page_xpath:
            one_data_dict = {}
            swtich = False
            for key, keyxpath in self.page_xpath.items():
                if key == "url_xpath" or key == "url":
                    content_url = jsonpath.jsonpath(line, keyxpath)
                    if content_url:
                        endUrl = urljoin(base_url, content_url[0])
                        one_data_dict["url"] = endUrl
                        continue
                    else:  # no url was extracted
                        swtich = True

                keystr = jsonpath.jsonpath(line, keyxpath)
                keystr = " ".join(keystr)
                one_data_dict[key] = keystr
            end_data = json.dumps(one_data_dict)  # serialize the dict to a string
        else:
            end_data = urljoin(base_url, line)
        return end_data,swtich
    # fetch every article link (or link dict) on the page at the given url

    def judge_url_in_bloom(self,judge_data):
        """Check whether the url (or the url inside the dict) is already in the Bloom filter; if not, add it and push the data to redis."""
        if judge_data.startswith("{"):
            judge_data_json = json.loads(judge_data)
            insert_url = judge_data_json["url"]
            if insert_url in self.bloom:
                return True
            else:
                self.bloom.add(insert_url)
                print(judge_data)
                self.redis.lpush(self.url_key_name, judge_data)
                return False
        else:
            if judge_data in self.bloom:
                return True
            else:
                self.bloom.add(judge_data)
                print(judge_data)
                self.redis.lpush(self.url_key_name, judge_data)
                return False

    def get_content_url_list(self, url):
        """Fetch the content of a static (HTML) listing page."""
        endUrlList = []
        response = self.download(url)
        if response[1] == 200:
            ps = response[0]
            mytree = lxml.etree.HTML(ps)
            linelist = mytree.xpath(self.lineListXpath)
            for line in linelist:
                dealed_page_data, swtich = self.deal_html_page_data(url, line)
                if dealed_page_data and not swtich:  # swtich marks a listing row from which no link could be extracted
                    endUrlList.append(dealed_page_data)
        return endUrlList


    # json: fetch every link and any extra data from the json page at the given url
    def get_json_content_url_list(self, url):
        """Fetch the content of a dynamic (JSON) listing page."""
        end_data_list = []
        response = self.download(url)
        if response[1] == 200:
            ps = response[0]
            ps = ps.replace("\n", "")
            if self.json_page_re:
                ps = re.compile(self.json_page_re).findall(ps)
                if ps:
                    ps = ps[0]
                else:
                    logging.info(url + "---------json_page_re matched nothing for this url")
                    return
            myjson = json.loads(ps)
            linelist = jsonpath.jsonpath(myjson, self.lineListXpath)
            for line in linelist:
                one_data_dict, swtich = self.deal_json_page_data(url, line)
                if swtich:
                    continue
                end_data_list.append(one_data_dict)
        return end_data_list

    #  helpers for POST requests
    # download the data at the given url with the given post data
    def post_download(self,url,data):
        try:
            if self.proxy == "1":
                proxy = self.get_proxy().strip()
                proxies = {'https': proxy}  # use the fetched proxy
                response = requests.post(url, proxies=proxies, timeout=self.timeout, headers=self.headers,data=data)
                logging.info(url)
                logging.info("proxy in use")
            else:
                response = requests.post(url, timeout=self.timeout, headers=self.headers,data=data)

            statusCode = response.status_code
            codeStyle = cchardet.detect(response.content)["encoding"]
            if not codeStyle:
                codeStyle = "utf-8"
            webData = response.content.decode(codeStyle, errors="ignore")
            print(webData)
            return (webData, statusCode)
        except Exception as e:
            print(e)
            return (0, 0)

    def get_post_data_list(self):
        data_list = []
        for i in range(int(self.second_page_value), int(self.end_page_value),int(self.page_interval)):
            current_page_data = self.post_data.copy()
            current_page_data[self.page_num_str] = str(i)
            data_list.append(current_page_data)
        return data_list

    def post_html(self,post_data_list):
        switch = False
        for post_data in post_data_list:
            time.sleep(self.timeInterval)
            response = self.post_download(self.start_url, post_data)
            if response[1] == 200:
                ps = response[0]
                mytree = lxml.etree.HTML(ps)
                linelist = mytree.xpath(self.lineListXpath)
                for line in linelist:
                    one_data_dict, swtich_url = self.deal_html_page_data(self.start_url, line)
                    if swtich_url:
                        continue
                    judge_answer = self.judge_url_in_bloom(one_data_dict)
                    if self.executionType != 1 and judge_answer:  # incremental crawl
                        switch = True

            if switch:  # the Bloom filter reported a duplicate
                break


    def post_json(self,post_data_list):
        for post_data in post_data_list:
            swtich = False  # whether the Bloom filter flagged a duplicate on this page

            time.sleep(self.timeInterval)
            response = self.post_download(self.start_url, post_data)
            if response[1] == 200:
                ps = response[0]
                myjson = json.loads(ps)
                linelist = jsonpath.jsonpath(myjson, self.lineListXpath)
                for line in linelist:
                    # handle one row
                    one_data_dict, swtich_url = self.deal_json_page_data(self.start_url, line)
                    if swtich_url:  # this row has no url, skip it
                        continue

                    judge_answer = self.judge_url_in_bloom(one_data_dict)

                    if self.executionType != 1 and judge_answer:  # incremental crawl
                        swtich = True
            if swtich:
                break

    def get_post_url_list(self):
        """For web_type 4: the POST url changes between pages while the post data stays the same, e.g.
        http://www.nhsa.gov.cn/module/web/jpage/dataproxy.jsp?startrecord=%d&endrecord=%p&perpage=15
        """
        end_url_list = []
        for first_num in range(int(self.second_page_value),int(self.end_page_value),int(self.page_interval)):
            second_num = first_num+int(self.page_interval)-1
            if second_num>int(self.end_page_value):
                second_num = int(self.end_page_value)
            post_url = self.start_url.replace("%d",str(first_num)).replace("%p",str(second_num))
            end_url_list.append(post_url)
        return end_url_list

    def post_url_change(self):
        if self.page_xpath:
            switch = False
            url_list = self.get_post_url_list()
            for url in url_list:
                time.sleep(self.timeInterval)
                response = self.post_download(url,self.post_data)
                if response[1] == 200:
                    ps = response[0]
                    mytree = lxml.etree.HTML(ps)
                    linelist = mytree.xpath(self.lineListXpath)
                    for line in linelist:
                        one_data_dict = {}
                        swtich_url = False
                        for key, keyxpath in self.page_xpath.items():
                            if key == "url_xpath" or key == "url":
                                content_url = line.xpath(keyxpath)
                                if content_url:
                                    content_url = content_url[0]
                                    content_url = parse.unquote(content_url)
                                    endUrl = urljoin(self.start_url, content_url)
                                    one_data_dict["url"] = endUrl
                                    continue
                                else:  # no url was extracted
                                    swtich_url=True

                            keystr = line.xpath(keyxpath)
                            keystr = "".join(keystr)

                            if keystr == "images" or keystr == "images_xpath":  # handle image links
                                keystr = urljoin(self.start_url, keystr)

                            one_data_dict[key] = keystr
                        if swtich_url:
                            continue
                        bloom_url = one_data_dict["url"]
                        if self.executionType != 1:  # incremental crawl
                            if bloom_url in self.bloom:
                                logging.info(self.taskCode+" url already present in the Bloom filter")
                                switch = True
                            else:
                                self.bloom.add(bloom_url)
                                one_data_dict = json.dumps(one_data_dict)  # serialize the dict to a string
                                print(one_data_dict)
                                self.redis.lpush(self.url_key_name, one_data_dict)
                        else:
                            one_data_dict = json.dumps(one_data_dict)  # serialize the dict to a string
                            print(one_data_dict)
                            self.redis.lpush(self.url_key_name, one_data_dict)

                if switch:  # the Bloom filter reported a duplicate
                    break
        else:
            swtich = False
            url_list = self.get_post_url_list()
            for url in url_list:
                time.sleep(self.timeInterval)
                response = self.post_download(url,self.post_data)
                if response[1] == 200:
                    ps = response[0]
                    mytree = lxml.etree.HTML(ps)
                    linelist = mytree.xpath(self.lineListXpath)
                    for ii in linelist:
                        content_url = parse.unquote(ii)
                        endUrl = urljoin(self.start_url, content_url)
                        if self.executionType != 1:  # incremental crawl
                            if endUrl in self.bloom:
                                logging.info(self.taskCode + " url already present in the Bloom filter")
                                swtich=True
                            else:
                                self.bloom.add(endUrl)
                                print(endUrl)
                                self.redis.lpush(self.url_key_name, endUrl)
                        else:
                            print(endUrl)
                            self.redis.lpush(self.url_key_name, endUrl)

                if swtich:
                    break


    def post_start(self):
        """post_data,page_num_str"""
        if self.webType == 2:  # POST, html pages
            post_data_list = self.get_post_data_list()  # build the post request payloads
            self.post_html(post_data_list)
        elif self.webType == 3: # POST, json pages
            post_data_list = self.get_post_data_list()  # build the post request payloads
            self.post_json(post_data_list)
        else:   # web_type == 4: the url changes but the post data stays the same
            self.post_url_change()

    # GET handling for html and json pages
    def get_start(self):

        # full (non-incremental) crawl
        if self.executionType == 1:
            pageList = self.get_PageUrlList()  # paginated listing urls

            for url in pageList:
                time.sleep(self.timeInterval)
                if self.webType == 0:
                    urlList = self.get_content_url_list(url)
                else:
                    urlList = self.get_json_content_url_list(url)
                time.sleep(self.timeInterval)

                for content_data in urlList:
                    print(content_data)
                    self.redis.lpush(self.url_key_name, content_data)
        # incremental crawl
        else:
            switch = False
            if self.webType == 0:
                start_data_urlList = self.get_content_url_list(self.start_url)
            else:
                start_data_urlList = self.get_json_content_url_list(self.start_url)
            time.sleep(self.timeInterval)

            # listing pages that contain only urls
            if not self.page_xpath:
                for start_data in start_data_urlList:  # check the first page
                    if start_data in self.bloom:
                        logging.info(self.taskCode + " url already present in the Bloom filter")
                        switch = True  # a previously crawled url appeared on the first page, so stop crawling later pages
                    else:
                        self.bloom.add(start_data)
                        print(start_data)
                        self.redis.lpush(self.url_key_name, start_data)

                if not switch:  # check the second page and beyond
                    for pageIndex in range(int(self.second_page_value), int(self.end_page_value)):
                        swtich2 = False
                        theUrl = self.url_type.replace("%d", str(pageIndex))

                        if self.webType == 0:
                            second_content_urlList = self.get_content_url_list(theUrl)  # article links on this page
                        else:
                            second_content_urlList = self.get_json_content_url_list(theUrl)  # article links on this page (json format)

                        for second_content_url in second_content_urlList:
                            if second_content_url in self.bloom:
                                logging.info(self.taskCode + " url already present in the Bloom filter")
                                swtich2 = True
                            else:
                                self.bloom.add(second_content_url)
                                self.redis.lpush(self.url_key_name, second_content_url)
                                print(second_content_url)
                        if swtich2:
                            break
            # the article link is wrapped in a dict, e.g. {"url": "http://www.nea.gov.cn/2015-01/16/c_133924732.htm","statement_time_xpath":  "2015-01-16"}
            else:
                for start_data in start_data_urlList:  # check the first page
                    start_data_json = json.loads(start_data)
                    current_url = start_data_json["url"]
                    if current_url in self.bloom:
                        logging.info(self.taskCode + " url already present in the Bloom filter")
                        switch = True  # a previously crawled url appeared on the first page, so stop crawling later pages
                    else:
                        self.bloom.add(current_url)
                        self.redis.lpush(self.url_key_name, start_data)
                        print(start_data)

                if not switch:  # check the second page and beyond
                    for pageIndex in range(int(self.second_page_value), int(self.end_page_value)):
                        swtich2 = False
                        theUrl = self.url_type % pageIndex  # build urls starting from the second page

                        if self.webType == 0:
                            second_content_urlList = self.get_content_url_list(theUrl)  # article links on this page
                        else:
                            second_content_urlList = self.get_json_content_url_list(theUrl)  # article links on this page (json format)

                        for second_content_data in second_content_urlList:
                            second_content_data_json = json.loads(second_content_data)
                            current_url = second_content_data_json["url"]
                            if current_url in self.bloom:
                                logging.info(self.taskCode + " url already present in the Bloom filter")
                                swtich2 = True
                            else:
                                self.bloom.add(current_url)
                                print(current_url)
                                self.redis.lpush(self.url_key_name, second_content_data)
                                print(second_content_data)
                        if swtich2:
                            break
        


    def judge_status(self,task_data):
        """Handle periodically executed tasks: check the task's status and handle the paused and stopped cases."""

        task_data_json = json.loads(task_data)
        task_code = task_data_json["taskCode"]
        task_key_name = self.redis_platform_address + ":task"   # key of the task queue

        status_key_name = self.redis_platform_address + ":status:" + task_code  # key of the status entry
        status_data = self.redis.get(status_key_name)
        print("status_key_name",status_key_name)
        print("status_data",status_data)
        status_data = json.loads(status_data)
        status = status_data["status"]

        if status=="1" or status=="2":
            print("task status: running", task_data)
            self.redis.lrem(task_key_name, 0, task_data)
            print("task removed", task_data)
            return True
        if status=="3":
            print("task status: paused",task_data)
            time.sleep(1)
            return False
        if status=="4":
            print("task status: stopped",task_data)
            time.sleep(1)
            self.redis.lrem(task_key_name,0,task_data)
            print("task removed",task_data)
            return False

    def start(self):
        while True:
            task_key_name = self.redis_platform_address+":task"
            task_data_list = self.redis.lrange(task_key_name,0,100)
            print(task_data_list)
            time.sleep(5)
            for task_data in task_data_list:
                swtich = self.judge_status(task_data)   # updates self.taskCode

                if swtich:
                    print(self.taskCode)
                    self.taskCode = json.loads(task_data)["taskCode"]
                    self.change_outqueue_num()      # set the outQueue value to 1
                    self.update_attr()  # refresh attributes

                    if self.executionType != 1:    # incremental crawl: reload the Bloom filter (executionType)
                        self.bloom_readfrom_db()

                    if self.post_data or type(self.post_data) == dict:
                        self.post_start()        # handle POST crawls
                    else:
                        self.get_start()     # handle GET crawls (html and json)

                    if self.executionType != 1:
                        self.bloom_writeto_db()  # save the Bloom filter to the database
Exemple #30
0
		kmers2.add(seq2[i:i+ksize])

	true_jaccard = len(kmers1.intersection(kmers2)) / float(len(kmers1.union(kmers2)))
	true_jaccards[it] = true_jaccard

	# Calculate sourmash estimate of Jaccard index
	E1 = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
	E2 = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
	E1.add_sequence(seq1)
	E2.add_sequence(seq2)
	estimate_jaccard = E1.jaccard(E2)
	estimate_jaccards[it] = estimate_jaccard

	# Containment version.
	# Bloom filter
	f = BloomFilter(capacity=i_size+n1, error_rate=p)
	len_kmers_1 = 0
	for val in kmers1:
		if val not in f:
			len_kmers_1 += 1
			f.add(val)
	#len_kmers_1 *= (1 - p)  # adjust for the false positive rate, shouldn't need to do this as I'm just adding elements
	int_est = 0
	for val in E2._kmers:
		#if val in f:  # in python2, no distinguishing between byte and utf-8 string
		if val != '':
			if val.decode("utf-8") in f:
				int_est += 1
	int_est -= p*h  # adjust for the false positive rate
	containment_est = int_est / float(h)
Exemple #31
0
class AppchinaSpider(scrapy.Spider):
    name = 'appchina'
    allowed_domains = ['appchina.com']
    start_urls = ['http://www.appchina.com/']

    base_url = 'http://www.appchina.com'

    def __init__(self, checkpoint=None, *a, **kw):
        super(AppchinaSpider, self).__init__(*a, **kw)
        self.bf = BloomFilter(capacity=10000000)
        self.apkbf = BloomFilter(capacity=10000000)
        self.checkpoint = checkpoint
        if not checkpoint == None:
            fd = open(checkpoint, 'r')
            while (True):
                line = fd.readline()
                if not line:
                    break
                line = line.strip()
                self.apkbf.add(line)
            fd.close()

    def start_requests(self):
        for url in self.start_urls:
            self.bf.add(url)
            yield Request(
                url=url,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                dont_filter=True)

    def parse(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        for aitem in soup.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href in self.bf:
                continue
            self.bf.add(href)
            if href.find('category') == -1:
                continue
            yield Request(
                url=href,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                callback=self.parse_category)

    def parse_category(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        pagesoup = soup.select('.discuss_fangye')[0]
        appsoup = soup.select('.app-list')[0]
        for aitem in pagesoup.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href in self.bf:
                continue
            self.bf.add(href)
            yield Request(
                url=href,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                callback=self.parse_category)
        for aitem in appsoup.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href in self.bf:
                continue
            self.bf.add(href)
            yield Request(
                url=href,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                callback=self.parse_detail)

    def parse_detail(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        print(response.url)
        urllinkpattern = re.compile(ur'\'.*\'')
        urllink = soup.select('.download_app')[0]
        if not urllink.has_attr(
                'onclick') or urllink['onclick'] == 'return false;':
            return
        urllink = urllink['onclick']
        urllink = urllinkpattern.search(urllink).group()[1:-1]
        commonname = soup.select('.app-name')[0].get_text()
        detaillist = soup.select('.art-content')
        size = detaillist[2].get_text()
        size = size[size.find(u':') + 1:]
        version = detaillist[3].get_text()
        version = version[version.find(u':') + 1:]
        category = detaillist[6].get_text()
        category = category[category.find(u':') + 1:]
        packagename = response.url[response.url.rfind('/') + 1:]
        permissionlist = list()
        permissions = soup.select('.permissions-list')[0].find_all('li')
        for perm in permissions:
            permissionlist.append(perm.get_text())
        if packagename in self.apkbf:
            return
        self.apkbf.add(packagename)
        item = ItemLoader(item=ApkspiderItem(), response=response)
        item.add_value('apkid_specifiedbyplaform', packagename)
        item.add_value('commonname', commonname)
        item.add_value('apkplaform', self.name)
        item.add_value('category', category)
        item.add_value('packagename', packagename)
        item.add_value('size', size)
        item.add_value('version', version)
        item.add_value('permission', permissionlist)
        item.add_value('urllink', urllink)
        item.add_value('file_urls', urllink)
        item.add_value('checkpoint', self.checkpoint)
        yield item.load_item()