Code Example #1
    def setUpClass(cls) -> None:
        """Disable logging."""
        logging.getLogger().setLevel(logging.FATAL)
        shutil.rmtree(cls.test_dir, ignore_errors=True)
        os.makedirs(cls.test_dir, exist_ok=True)

        cls.records = [
            Record([0, 0, 0, 0, 0]),  # not in Bloom
            Record([1, 2, 3, 4, 5]),  # in Bloom
            Record([2, 2, 3, 4, 5]),  # in Bloom
            Record([3, 2, 3, 4, 5]),  # in Bloom
            Record([4, 2, 3, 4, 5]),  # not in Bloom
            Record([5, 2, 3, 4, 5]),  # not in Bloom
        ]

        for r in cls.records:
            r.set_hash_key(cls.hash_key)
        b = BloomFilter(100, 0.0001, cls.test_dir + "test.bloom")
        b.update([1, 2, 3, 4, 5, 6, 7, 8, 9, 'a', 'b', 'c'])
        b.add(b64encode(cls.records[1].get_long_hash()).decode())
        b.add(b64encode(cls.records[2].get_long_hash()).decode())
        b.add(b64encode(cls.records[3].get_long_hash()).decode())
        cls.b_encoded = b.to_base64().decode()
        cls.b = b
        cls.psi_ind = [
            cls.records[1].get_psi_index(), cls.records[2].get_psi_index(),
            cls.records[3].get_psi_index()
        ]
Code Example #2
class List:
    """
    Class to read lists from disk, create regex and bloom filters, and finally write them back to disk on crawl completion
    """
    def __init__(self, path):
        print("Loading " + path + "...")
        self.path = path
        with open(path) as f:
            array = f.readlines()
        array = [x.strip() for x in array]
        array = list(set(array))
        self.array = array
        if path == "exclude.txt":
            self.regex = re.compile('(?:%s)' % '|'.join(self.array))
        self.bloom = BloomFilter(10000000, 0.01)
        self.bloom.update(self.array)

    def append(self, element):
        self.bloom.add(element)
        self.array.append(element)

    def concat(self, elements):
        self.array += elements

    def write(self):
        with open(self.path, 'w') as f:
            for item in self.array:
                f.write("%s\n" % item)
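
A minimal usage sketch for the List class above, assuming pybloomfilter's BloomFilter is imported as the class requires and that "visited.txt" (a hypothetical file name, unlike the special-cased exclude.txt) is a newline-delimited list on disk:

exclude = List("exclude.txt")   # builds exclude.regex as well as exclude.bloom
visited = List("visited.txt")   # bloom filter only; no regex for this file

url = "http://example.com/page"
if url not in visited.bloom and not exclude.regex.search(url):
    visited.append(url)         # added to both the bloom filter and the list

visited.write()                 # persist the list when the crawl completes
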
Code Example #3
File: convert.py Project: theopengroup/EAD
def main():
   #Check for command line arguments
   if len(sys.argv) != 2:
      print 'Usage: %s [trace file]' % os.path.basename(sys.argv[0])
      sys.exit(1)

   #Read arguments from command line
   inFile = sys.argv[1]


   bf1 = BloomFilter(100000000, 0.001, 'bf1')   
   bf2 = BloomFilter(100000000, 0.001, 'bf2')
     
   outputFileName="converted-"+sys.argv[1]
   f = open(outputFileName, "a")



   for line in open(inFile,'r'):
      if (line[0:2]=="W," or line[0:2]=="R,"):
         hash1=int(hashlib.sha1(line[2:]).hexdigest(), 16) % (10 ** 10)
         hash2=int(hashlib.md5(line[2:]).hexdigest(), 16) % (10 ** 10)
         if (bf1.add(hash1) and bf2.add(hash2)):
            f.write('%s,%d\n' % (line[0],hash1*10000))
         else:
            f.write('%s,%d\n' % (line[0],hash2*10000))
      elif(line==''):
         break
      else:
         pass
   f.close()
Code Example #4
File: linkfilter.py Project: wangjie1991/crawler
class LinkFilter():
    
    def __init__(self, domain):
        self.file_index = '%s_%s' % (domain, 'index.bf')
        self.file_html = '%s_%s' % (domain, 'html.bf')

        if os.path.exists(self.file_index):
            self.bf_index = BloomFilter.open(self.file_index)
        else:
            self.bf_index = BloomFilter(100000000, 0.001, self.file_index)

        if os.path.exists(self.file_html):
            self.bf_html = BloomFilter.open(self.file_html)
        else:
            self.bf_html = BloomFilter(100000000, 0.001, self.file_html)
    
    def index_filter(self, links):
        new_links = []
        for link in links:
            if not self.bf_index.add(link.url):
                new_links.append(link)
        return new_links

    def html_filter(self, links):
        new_links = []
        for link in links:
            #log.msg('This is a link : %s' % link, level=log.WARNING)
            if not self.bf_html.add(link.url):
                new_links.append(link)
        return new_links
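
The index_filter and html_filter methods above, like several later examples, lean on pybloomfilter's convention that add() returns True when the element was (probably) already in the filter and False when it was newly inserted. A minimal sketch of that convention, with an arbitrary capacity, error rate, and file name:

bf = BloomFilter(1000, 0.001, '/tmp/example.bloom')
print(bf.add('http://example.com/a'))   # False: newly inserted
print(bf.add('http://example.com/a'))   # True: already present (or a false positive)
print('http://example.com/a' in bf)     # True
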
Code Example #5
File: classify_ctgs.py Project: rsharris/DiscoverY
def getbloomFilter(bf, bf_capacity, fem_kmers, kmer_size):
    if bf:
        print("Opening Bloom Filter of k-mers from female")
        female_kmers_bf = BloomFilter.open("data/female.bloom")
        print("Done")
    else:
        print("Need to make Bloom Filter of k-mers from female")
        bf_filename = "data/female.bloom"
        female_kmers_bf = BloomFilter(bf_capacity, .001, bf_filename)

        if fem_kmers: # if female kmers file exist
            female_kmers_file = "data/female_kmers"
            with open(female_kmers_file, 'r') as fm_kmers:
                #assumes kmers are uppercase
                first_line = fm_kmers.readline()
                kmers.test_valid_kmer_format(first_line, kmer_size)
                fm_kmers.seek(0)
                for line in fm_kmers:
                    female_kmers_bf.add(line[:kmer_size])
        else:
            print("Reading female reference one record at a time and k-merizing each record...")
            female_reference_file = "data/female.fasta"
            n_kmers = "N"*kmer_size
            for record in SeqIO.parse(female_reference_file,"fasta"):
                to_kmerize_fwd = str(record.seq).upper()
                length = len(to_kmerize_fwd)
                for i in range(0, length-kmer_size+1):
                    female_kmer = to_kmerize_fwd[i:i+kmer_size]
                    if female_kmer != n_kmers:
                        female_kmers_bf.add(to_kmerize_fwd[i:i+kmer_size])

        print("Done creating bloom filter")
    return female_kmers_bf
Code Example #6
def get_spectrum(input_file, size=31):
    bams = pysam.AlignmentFile(input_file, 'rb')

    bloom_filter = BloomFilter(capacity=999999999, error_rate=0.1)
    # print(bloom_filter.bitarray)
    # print(bloom_filter.num_bits)

    # Count how many times each kmer occurs, i.e. its multiplicity
    hash_dict = {}
    cnt = 0
    for r in bams:
        cnt += 1
        print(cnt)
        if cnt == 200000:
            break
        read = r.query_sequence
        kmers = get_kmers(read, size)
        # print(kmers)
        for kmer in kmers:

            is_in = kmer in bloom_filter
            if is_in is True:
                # Rule out false positives
                if kmer in hash_dict:
                    hash_dict[kmer] += 1
                else:
                    hash_dict[kmer] = 1
            else:
                bloom_filter.add(kmer)
                hash_dict[kmer] = 1

    # Remove kmers that occur only once
    unique_kmer = []
    for key in hash_dict.keys():
        if hash_dict[key] == 1:
            unique_kmer.append(key)

    for i in range(len(unique_kmer)):
        hash_dict.pop(unique_kmer[i])
    # print(hash_dict)

    # Count the number of kmers that share the same multiplicity
    stat_dict = {}

    for key in hash_dict.keys():

        multiplicity = hash_dict[key]
        if multiplicity not in stat_dict.keys():
            stat_dict[multiplicity] = 1
        else:
            stat_dict[multiplicity] += multiplicity

    frequency = []
    density = []
    for key in stat_dict.keys():
        frequency.append(key)
        density.append(stat_dict[key])

    return stat_dict, frequency, density
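
get_spectrum relies on a get_kmers helper that is not shown in this listing. A plausible sketch, purely for illustration rather than the project's actual implementation, slides a window of length size over the read:

def get_kmers(read, size):
    # Enumerate every substring of length `size`; short or missing reads yield [].
    if read is None or len(read) < size:
        return []
    return [read[i:i + size] for i in range(len(read) - size + 1)]
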
Code Example #7
class Parser:
    def __init__(self, rule, item=None):
        self.rule = rule
        self.item = item
        self.parsing_urls = []
        self.pre_parse_urls = []
        self.filter_urls = BloomFilter(10000000, 0.01)
        self.done_urls = []

    def add(self, urls):
        url = '{}'.format(urls)
        if url.encode('utf-8') not in self.filter_urls:
            self.filter_urls.add(url.encode('utf-8'))
            self.pre_parse_urls.append(url)

    def parse_urls(self, html):
        urls = re.findall(self.rule, html)
        for url in urls:
            self.add(url)

    async def parse_item(self, html):
        item = self.item(html)
        await item.save()
        self.item._item_count += 1
        return item

    async def execute_url(self, spider, session, semaphore, url):
        html = await fetch(url, session, semaphore)

        if html is None:
            spider.error_urls.append(url)
            self.pre_parse_urls.append(url)
            return
        if url in spider.error_urls:
            spider.error_urls.remove(url)
        spider.urls_count += 1
        self.parsing_urls.remove(url)
        self.done_urls.append(url)
        if self.item is not None:
            await self.parse_item(html)
            logger.info('Parsed({}/{}): {}'.format(len(self.done_urls),
                                                   len(self.filter_urls), url))
        else:
            spider.parse(html)
            logger.info('Followed({}/{}): {}'.format(len(self.done_urls),
                                                     len(self.filter_urls),
                                                     url))

    async def task(self, spider, semaphore):
        with aiohttp.ClientSession() as session:
            while spider.is_running():
                if len(self.pre_parse_urls) == 0:
                    await asyncio.sleep(0.5)
                    continue
                url = self.pre_parse_urls.pop()
                self.parsing_urls.append(url)
                asyncio.ensure_future(
                    self.execute_url(spider, session, semaphore, url))
Code Example #8
class URIBloomFilter(BaseDupeFilter):
    def __init__(self, settings, debug = False):
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)
    
    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(settings, debug)
    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if self.check(fp):
            return True
        else:
            self.insert(fp)

    ###-------todo-------##
    def request_fingerprint(self, request):
        return request_fingerprint(request)
    
    def check(self, request):

        ret = request in self.bloom_filter_
        return ret
    
    def insert(self, request):
        self.bloom_filter_.add(request)
        #print len(self.bloom_filter_)
        #print self.bloom_filter_.hash_seeds
        #print self.bloom_filter_.num_bits
        #print self.bloom_filter_.num_hashes
    
    def reset(self):
        self.bloom_filter_.clear_all()
    
    def save(self):
        pass
    def load(self):
        self.bloom_filter_.sync()
        self.bloom_filter_.open("bloom.dump") 
        pass
    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
Code Example #9
def loadUsers(filename):
    usernames = BloomFilter(capacity=30000000, error_rate=0.00001)
    try:
        with open(filename, 'r') as f:
            for line in f:
                username = str(json.loads(line.strip())["username"])
                usernames.add(username)
    except IOError:
        print "file doesn't exist"
    return usernames
Code Example #10
def slave_node(kmer_pool, slave_num, stat_pool):
    local_filter = BloomFilter(capacity=9999999999, error_rate=0.1)

    local_dict = {}
    try:
        print('Slave: ', slave_num, ' start')
        time_to_stop = 0
        while True:
            self_kmres = kmer_pool[slave_num]
            while len(self_kmres) != 0:
                time_to_stop = 0
                kmer = self_kmres[0]
                # print('Slave: ', slave_num, " del: ", kmer)
                del self_kmres[0]

                # Add it to the Bloom filter
                is_in = kmer in local_filter
                if is_in is True:
                    # Rule out false positives
                    if kmer in local_dict:
                        local_dict[kmer] += 1
                    else:
                        local_dict[kmer] = 1
                else:
                    local_filter.add(kmer)
                    local_dict[kmer] = 1

                kmer_pool[slave_num] = self_kmres

            time.sleep(0.001)
            time_to_stop += 1

            # Idle timeout reached
            if time_to_stop == 50:
                # Remove kmers that occur only once
                unique_kmer = []
                local_stat = {}

                # Drop unique kmers and count the kmers that share the same multiplicity
                for key in local_dict.keys():
                    multiplicity = local_dict[key]
                    if multiplicity == 1:
                        unique_kmer.append(key)
                    elif multiplicity not in local_stat.keys():
                        local_stat[multiplicity] = 1
                    else:
                        local_stat[multiplicity] += multiplicity

                for i in range(len(unique_kmer)):
                    local_dict.pop(unique_kmer[i])
                stat_pool = merge_dict(stat_pool, local_stat)
                break
        print('Slave: ', slave_num, ' end------------------')
    except Exception as ex:
        print('Slave: ', slave_num, ex)
Code Example #11
def hashtable_with_bf(file_name, kmer_size, bf_capacity, bf_error, top_count,
                      verbose):
    """
     Hash table with bloom filter.
    :param file_name: File to be processed.
    :param kmer_size: Length of the kmer.
    :param bf_capacity: Capacity of the bloom filter.
    :param bf_error: Probability of false positive in bloom filter.
    :param top_count: Number of kmers to be printed
    :param verbose: Option to print elapsed time and memory usage.
    :return:
    """

    start = time.time()
    # initialise a min heap
    h = Heap()
    h.populate(top_count)
    # initialise bloom filter
    bf = BF(bf_capacity, bf_error, "hashtable_with_bf")
    kmer_freq = dict()
    with open(file_name, "r") as file_from_read:
        count = 0
        for line in file_from_read:
            # take the second line to parse kmers.
            if count % 4 == 1:
                line_length = len(line)
                for i in range(line_length - kmer_size + 1):
                    kmer = line[i:kmer_size + i]
                    if kmer in bf:
                        if kmer not in kmer_freq:
                            kmer_freq[kmer] = 1
                        kmer_freq[kmer] += 1
                    else:
                        bf.add(kmer)
            count += 1
    end = time.time()
    if verbose:
        print("Hash table done in {0} seconds".format(end - start))
    start_heap = time.time()
    for kmer, freq in kmer_freq.iteritems():
        if freq > h.min():
            # h.pop()
            # h.push((freq, kmer))
            h.push_pop((freq, kmer))
    for item in h.nlargest(top_count):
        freq, kmer = item
        print(kmer, freq)
    end = time.time()
    # clean bf
    os.remove("hashtable_with_bf")
    if verbose:
        print("Heap done in {0} seconds".format(end - start_heap))
        print("Process done in {0} seconds".format(end - start))
        print("Hash table size: {0} MB".format(
            int(sys.getsizeof(kmer_freq) / 10**6)))
Code Example #12
def bloom_full(basename: str):
    """Measure all values at once."""
    if basename is None:
        raise ValueError("No basename given.")
    file_path = get_file_path("bloom_full", basename)
    partial_insert_file = file_path.replace(".csv", "_partial_insert.csv")
    rs = ROUNDS_START
    if not RESUME or not os.path.exists(file_path):
        # Write header if new file only
        row_fmt = f"ROUND;CAPACITY;ERROR_RATE;INSERTED ELEMENTS;" \
                  f"QUERIED ELEMENTS;SIZE;INSERT TIME;QUERY TIME;" \
                  f"# False Positives"
        write_header("Bloom Full", file_path, row_fmt)
        # row_fmt = f"ROUND;CAPACITY;ERROR_RATE;INSERTED ELEMENTS;" \
        #           f"SIZE;INSERT_TIME[s](for elements added in step);"
        # write_header("Bloom Partial Insert", partial_insert_file, row_fmt)
    else:
        # Read values to resume
        rs = get_round(file_path)
    for r in lb(range(rs, ROUNDS_END), "Rounds"):
        for capacity in lb(CAPACITY, "Capacities", leave=False):
            for error_rate in lb(ERROR_RATE, "Error Rates", leave=False):
                if FILL:
                    i = [capacity]
                else:
                    i = lb(INSERT, "Inserts", leave=False)
                for insert in i:
                    with NamedTemporaryFile() as tmp:
                        b = BloomFilter(capacity, error_rate, tmp.name)
                        real_set = [random.random() for _ in range(insert)]
                        start = time.monotonic()
                        for s in real_set:
                            # Add random value
                            b.add(s)
                        insert_time = time.monotonic() - start
                        size = len(b.to_base64())
                        if QUERY_ALL:
                            query_range = int(math.ceil(100 / error_rate))
                        else:
                            query_range = QUERY
                        for query in lb(query_range, "Queries", leave=False):
                            # +1 because only values <1 stored
                            query_set = [
                                random.random() + 1 for _ in range(query)]
                            start = time.monotonic()
                            false_positives = 0
                            for q in query_set:
                                if q in b:
                                    false_positives += 1
                            query_time = time.monotonic() - start
                            with open(file_path, "a") as fd:
                                fd.write(
                                    f"{r};{capacity};{error_rate};"
                                    f"{insert};{query};{size};{insert_time};"
                                    f"{query_time};{false_positives}\n")
Code Example #13
 def test_bloom(self):
     s = server.StorageServer(test_dir)
     self.assertFalse(os.path.exists(self.bloom_path))
     with patch.object(s, "_initialize_bloom_filter") as m:
         b = s.bloom
         m.assert_called_once()
         b = BloomFilter(2, .1, self.bloom_path)
         b.add(5)
         c = s.bloom
         m.assert_called_once()  # No second call
         self.assertIn(5, c)
Code Example #14
File: kmer_bloom.py Project: the-nemz/DNA-Indexing
def load_bloom(kmers):
    """
    Inserts all of the k-mers into the bloom filter
    """
    global bloom, filename

    filename = '%d_kmer_%d_rate.bloom' % (kmer_length, int(100 * error_rate))
    print(len(kmers) // 2)
    bloom = BloomFilter(len(kmers) // 2, error_rate, filename)

    for kmer in kmers:
        bloom.add(kmer)
Code Example #15
class BloomFilterQueue(BaseQueue):
    def __init__(self, bloomfilter_path, capacity, wrong_rate, maxsize, *argv,
                 **kw):
        super(BloomFilterQueue, self).__init__(maxsize)
        self.crawled = BloomFilter(capacity, wrong_rate, bloomfilter_path)

    def put_request(self, request, block=True, timeout=None):
        url = request["url"] if isinstance(request, ZRequest) else request
        if url in self.crawled:
            return False
        self.crawled.add(url)
        self._queue.put(request, block=block, timeout=timeout)
Code Example #16
def get_spectrum(input_file, size=31):
    bams = pysam.AlignmentFile(input_file, 'rb')

    bloom_filter = BloomFilter(capacity=9999999999, error_rate=0.1)
    # print(bloom_filter.bitarray)
    # print(bloom_filter.num_bits)

    # Count how many times each kmer occurs, i.e. its multiplicity
    hash_dict = {}
    cnt = 0
    for r in bams:
        cnt += 1
        print(cnt)
        # if cnt % 100000 == 0:
        #     print(cnt)
        if cnt == 100000:
            break
        read = r.query_sequence
        kmers = get_kmers(read, size)
        # Add the kmers to the Bloom filter
        for kmer in kmers:
            # a= int(hash(kmer) % 3)
            # get_hash(kmer, 3)
            is_in = kmer in bloom_filter
            if is_in is True:
                # Rule out false positives
                if kmer in hash_dict:
                    hash_dict[kmer] += 1
                else:
                    hash_dict[kmer] = 1
            else:
                bloom_filter.add(kmer)
                hash_dict[kmer] = 1

    unique_kmer = []
    stat_dict = {}
    # Count kmers that share the same multiplicity and remove those that occur only once
    for key in hash_dict.keys():
        multiplicity = hash_dict[key]
        if multiplicity == 1:
            unique_kmer.append(key)
        elif multiplicity not in stat_dict.keys():
            stat_dict[multiplicity] = 1
        else:
            stat_dict[multiplicity] += multiplicity

    for i in range(len(unique_kmer)):
        hash_dict.pop(unique_kmer[i])

    get_usage()


    return stat_dict
Code Example #17
File: bloomfilter.py Project: enzocxt/bloomfilter
def create_bf():
	bf = BloomFilter(count, error_rate, 'filter_base.bloom')
	keyDigest_list = []
	FILE = open(keyDigestFile, 'r')
	
	for i in range(count):
		keyDigest = FILE.read(keyDigestLen)
		keyDigest_list.append(keyDigest)
		
	FILE.close()
	
	for publicKeyID in keyDigest_list:
		bf.add(publicKeyID)
Code Example #18
def threaded_crawl(tid, n, proxies, lock, output_dir="."):
    global count
    global failures
    fails = 0
    logger = logging.getLogger(__name__)
    fptr = open("top-1m.csv", "r")
    fail_thresh = 10  # Use a different proxy after 10 failed requests in a row
    proxy = dict()
    linum = fails = 0
    start = tid * n  # First seed site to crawl
    end = tid * n + n  # Last seed site to crawl
    seed = BloomFilter(n * 1000000, 0.1, '/tmp/{}.bloom'.format(tid).encode())
    frontier = deque()
    logger.info('[tid {}] Loading seed URLs {} - {}'.format(tid, start, end))
    for line in fptr:
        if linum >= start and linum < end:
            url = "http://" + line.split(',')[1].strip()
            seed.add(url.encode())
            frontier.append(url)
        linum += 1
    fptr.close()
    while True:
        url = frontier.popleft()
        urls = []
        try:
            urls = parse_url(url, proxy, output_dir)
        except Exception as e:
            logger.error(
                "[tid {}] Fatal error occured while crawling: {}.".format(
                    tid, url))
        if len(urls) == 0:
            with lock:
                failures += 1
            fails += 1
            if fails > fail_thresh:
                proxy['http'] = proxies[randint(0, len(proxies) - 1)]
                logger.error("[tid {}] Failure: Activating proxy:{}".format(
                    tid, proxy['http']))
                fails = 0
        for u in urls:
            link = u.encode()
            if link not in seed:
                seed.add(link)
                frontier.append(link)
        with lock:
            count += 1
            if (count % 1000 == 0):
                logger.info('Page count: {}'.format(count))
        if len(frontier) % 1000 == 0:
            logger.info("[tid {}] Frontier count: {}".format(
                tid, len(frontier)))
Code Example #19
    def create(infile, outfile, capacity: int, error_rate: float = 0.05):
        import tqdm
        import urllib
        from pybloomfilter import BloomFilter

        bf = BloomFilter(capacity, error_rate, outfile)
        with open(infile) as f:
            for _, word in enumerate(tqdm.tqdm(f, total=capacity)):
                if "%" in word:
                    word = urllib.parse.unquote(word).lower()
                word = word.rstrip()
                bf.add(word)

        bf.close()
Code Example #20
File: uniq.py Project: 235/data-utils
def process(files):
    #Iterate over the lines of all files listed in sys.argv[1:], defaulting to sys.stdin if the list is empty.
    #If a filename is '-', it is also replaced by sys.stdin.
    if os.path.isfile(bloomfile):
        UNIQUES = BloomFilter.open(bloomfile)
    else:
        UNIQUES = BloomFilter(MAXUNIQUES, ACCUACY, bloomfile)

    for record in fileinput.input(files):
        record = str(record).strip()
        if not record in UNIQUES:
            UNIQUES.add(record)
            print record
    UNIQUES.sync()
    UNIQUES.close()
Code Example #22
File: pipelines.py Project: zbxzc35/SearchEngine
class DuplicatesPipeline(object):

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        print '************%d pages visited!*****************' %len(self.bf)
        if self.bf.add(item['url']):#True if item in the BF
            raise DropItem("Duplicate item found: %s" % item)
        else:
            #print '%d pages visited!'% len(self.url_seen)
            self.save_to_file(item['url'],item['title'])
            self.si.AddIndex(item)
            return item

    def save_to_file(self,url,utitle):
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """docstring for __del__"""
        self.f_write.close()
        self.si.IndexDone()
Code Example #23
File: URLFilter.py Project: muye5/muye5code
class URLBloomFilter:
    dbconn = None
    cur = None
    urlbf = None
    sql = None

    def initdb(self, host = 'localhost', user = '******', passwd = 'muye', db = 'muye', port = 3306, charset = 'utf8'):
        self.dbconn = MySQLConnection.MySQLConn()
        self.dbconn.connect(m_host = host, m_user = user, m_passwd = passwd, m_db = db)
        self.cur = self.dbconn.cursor()

    def initfilter(self, filename = './url.filter'):
        if os.path.isfile(filename):
            self.urlbf = BloomFilter.open(filename)
        else:
            self.urlbf = BloomFilter(10000000, 0.001, filename)

    def initsql(self, m_sql):
        self.sql = m_sql

    def add(self, url):
        if not self.urlbf.add(url):
            self.cur.execute(self.sql, url)
            return True
        else:
            return False

    def close(self):
        self.dbconn.close()
Code Example #24
class URLBloomFilter:
    dbconn = None
    cur = None
    urlbf = None
    sql = None

    def initdb(self,
               host='localhost',
               user='******',
               passwd='muye',
               db='muye',
               port=3306,
               charset='utf8'):
        self.dbconn = MySQLConnection.MySQLConn()
        self.dbconn.connect(m_host=host, m_user=user, m_passwd=passwd, m_db=db)
        self.cur = self.dbconn.cursor()

    def initfilter(self, filename='./url.filter'):
        if os.path.isfile(filename):
            self.urlbf = BloomFilter.open(filename)
        else:
            self.urlbf = BloomFilter(10000000, 0.001, filename)

    def initsql(self, m_sql):
        self.sql = m_sql

    def add(self, url):
        if not self.urlbf.add(url):
            self.cur.execute(self.sql, url)
            return True
        else:
            return False

    def close(self):
        self.dbconn.close()
Code Example #25
class Filter(object):
    def __init__(self, capacity=1000000, errrate=0.01, fname="filter.bloom"):
        try:
            self.bf = BloomFilter.open(fname)
        except:
            self.bf = BloomFilter(capacity, errrate, fname)
        self.syncmax = 100
        self.synccnt = 0

    def isExists(self, value):
        if value:
            return value in self.bf
        return True

    def add(self, value):
        if value:
            try:
                ret = self.bf.add(value)
                self.synccnt += 1
                if self.synccnt >= self.syncmax:
                    self.bf.sync()
                    self.synccnt = 0
                return ret
            except Exception as e:
                mylog.info("bf add fail! %s %s" % (e, value))

        return True

    def sync(self):
        self.bf.sync()
Code Example #26
File: pipelines.py Project: pianer/SearchLaw
class MongoDBPipeline(object):

    def __init__(self):
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.si = SearchIndex()
        self.si.SearchInit()
        
    def process_item(self, item, spider):
        if self.bf.add(item['link']):#True if item in the BF
            raise DropItem("Duplicate item found: %s" % item)
        else:
            for data in item:
                if not data:
                    raise DropItem("Missing data!")
            self.collection.update({'link': item['link']}, dict(item), upsert=True)
            log.msg("Question added to MongoDB database!",level=log.DEBUG, spider=spider)
            self.si.AddIndex(item)
            return item
        
    def __del__(self):
        self.si.IndexDone()
Code Example #27
def seating_simulated (payloads, n, k, filter_filename=None):
    packets, ngrams, repeats = 0, 0, 0
    add_ok, add_no, add_re = 0, 0, 0
    add_repeat_but_bf_add_true = 0
    bf = BloomFilter(n, 1/np.power(2,k), filter_filename)
    duniq = set()
    for payload in payloads:
        data = unhexlify(payload)
        dlen = len(data)
        packets += 1
        for i in range(min(100, dlen-k+1)):
            d = data[i:i+k]
            ngrams += 1
            if d not in duniq:
                duniq.add(d)
                repeated = False
            else:
                repeats += 1
                repeated = True
            result = bf.add(d)
            if result == False:
                add_ok += 1
                if repeated:
                    add_repeat_but_bf_add_true += 1
            elif result == True:
                if repeated:
                    add_re += 1
                else:
                    add_no += 1
    print("simulated: add_ok=(%d) add_no=(%d) add_re=(%d) repeats=(%d)" % (add_ok, add_no, add_re, repeats))
    print("add_repeat_but_bf_add_true=(%d)" % add_repeat_but_bf_add_true)
    print("bf: num_bits=%d, capacity=%d, error_rate=%f, added=%d" % (bf.num_bits, bf.capacity, bf.error_rate, len(bf)))
    return add_ok
Code Example #28
class bloomFilter():
    # Create an empty bloom filter
    def create_new_bf(self, capacity, error_rate, filename):
        self.bf = BloomFilter(capacity, error_rate, filename)

    # Open an existing bloom filter
    def open_bf(self, filename):
        self.bf = BloomFilter.open(filename)

    def add_item(self, item):
        self.bf.add(item)

    def check_membership(self, item):
        return item in self.bf

    def clear_all(self):
        self.bf.clear_all()
Code Example #29
class TwoColumnBloomFilter(Filter):
    """
    Bloom filter that takes in inputs as 2-tuples of coordinates
    """
    def __init__(self, capacity, error_rate):
        super().__init__()
        self.bloom_filter = BloomFilter(capacity, error_rate)

    def build_filter(self, matrix):
        for row in matrix:
            self.bloom_filter.add(tuple(row))

    def __contains__(self, item):
        return tuple(item) in self.bloom_filter

    def size(self):
        return self.bloom_filter.num_bits // 8
Code Example #30
File: KillSpam.py Project: skihero/commandos
class SpamCheck(object):
    def __init__(self):
        # Set up the logging
        self.ilog = logging.getLogger('prog')
        self.ilog.setLevel(logging.INFO)
        self.console = logging.StreamHandler(sys.stderr)
        self.console.setLevel(logging.INFO)
        self.console.setFormatter(logging.Formatter('%(message)s'))
        self.ilog.addHandler(self.console)

        # Try loading the filter
        try:
            self.__loadFilter__()
            self.ilog.debug("loading filter..")
        # Create the filter if not present
        except:
            self.ilog.debug("Exception in loading ....")
            self.__create__()
            self.ilog.debug("Creating the file ... ")

    def __loadFilter__(self):
        self.bf = BloomFilter.open('filter.bloom')

    def __create__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        # Let us initialize the first time; it's hacky but ok
        self.spam("000")
        # Generate the filter from a file
        with open("bad_numbers.txt") as f:
            for nums in f:
                self.bf.add(nums.rstrip())
                self.ilog.debug(".")

    def spam(self, bad_entity):
        with open("bad_numbers.txt", "a+") as f:
            f.write(bad_entity)
            f.write("\n")
            self.ilog.info("Added bad entry to file")
        self.bf.add(bad_entity)

    def isSpam(self, entity):
        return entity in self.bf
Code Example #31
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""
    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(3000000, 0.00001, 'bloomTemp')

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
Code Example #33
 def _count(self):
     """
     Implements the Bloom Filter k-mer counting algorithm
     """
     # initialize Bloom Filter
     bf = BloomFilter(self._reader.total_kmer, self._error_rate, 'kmer_bf')
     if self._verbose:
         # initialize progress bar
         current = 0
         update_threshold = int(self._reader.total_kmer / 100)
         format_custom_text = FormatCustomText('Hash Size: %(value).1f MB',
                                               dict(value=0))
         print('Hashing...')
         bar = ProgressBar(
             max_value=self._reader.total_kmer,
             widgets=[
                 Percentage(), ' ',
                 SimpleProgress(format='(%s)' %
                                SimpleProgress.DEFAULT_FORMAT, ), ' ',
                 Bar(), ' ',
                 Timer(), ' ',
                 AdaptiveETA(), ' ', format_custom_text
             ])
         bar.start()
     for kmer in self._reader.kmer():
         if kmer not in bf:  # not in Bloom Filter
             bf.add(kmer)
         else:  # in Bloom Filter
             try:
                 self._kmer_counter[kmer] += 1  # Increment
             except KeyError:
                 self._kmer_counter[kmer] = 2  # Add to Hash Table
         if self._verbose:
             # update progress bar
             current += 1
             if update_threshold == 0 or current % update_threshold == 0:
                 size = sys.getsizeof(self._kmer_counter) / (1024**2)
                 bar.update(current,
                            format_custom_text.update_mapping(value=size))
     os.remove('kmer_bf')  # remove Bloom Filter from disk
     if self._verbose:
         bar.finish()
         print('Hashing Done!')
Code Example #34
 def _read_files_and_count(self):
     if self._verbose:
         print('Reading from files...')
     for j in range(self._np):
         if self._verbose:
             # initialize progress bar
             print('Partition #{}'.format(j + 1))
             bar = ProgressBar(max_value=UnknownLength)
             bar.start()
             count = 0
         bf = BloomFilter(self._capacity, self._error_rate, 'kmer_bf')
         kmer_counter = dict()
         with open(str(j), 'r') as f:  # open file for the current partition
             for kmer in f:
                 if kmer not in bf:  # not in Bloom Filter
                     bf.add(kmer)
                 else:  # in Bloom Filter
                     try:
                         kmer_counter[kmer] += 1  # in Hash Table
                     except KeyError:  # not in Hash Table
                         kmer_counter[kmer] = 2  # Add to Hash Table
                 if self._verbose:
                     # update progress bar
                     count += 1
                     bar.update(count)
         if self._verbose:
             bar.finish()
             print('Populating the heap...')
         for kmer, count in kmer_counter.items():
             if count > self._heap[0][0]:  # item is bigger than minimum
                 # replace minimum item with the recent one
                 # kmer.rstrip() is used to eliminate the new line
                 heapq.heappushpop(self._heap, (count, kmer.rstrip()))
         if self._verbose:
             print('Heap is populated')
             print(
                 ('Partition #{} has been completed with {:.1f} MB hash ' +
                  'table').format(j + 1,
                                  sys.getsizeof(kmer_counter) / (1024**2)))
         os.remove(str(j))  # remove the partition file
         os.remove('kmer_bf')
Code Example #35
def dedup(fname):
    bf = BloomFilter(1E8, 0.01)
    
    with open(fname, 'r') as fin:
        with open('deduped.tsv', 'w') as fout:
            for line in fin:
                splitLine = line.split('\t')
                description = splitLine[5]
                if bf.add(md5.new(description).digest()):
                    continue
                else:
                    fout.write(line)
Code Example #36
    def _create_or_return_bloom(self, elements=None, filename='hashdd.bloom'):
        """Creates and/or returns a bloom filter. If the filter
        does not exist, it will be created using the items in elements. 
        If it does exist, it will be returned. 

        Keyword Arguments:
        elements -- A list of strings to add to the bloom filter
        filename -- The filename where the bloom filter should be stored
        """
        if os.path.isfile(filename):
            bf = BloomFilter.open(filename)
        else:
            if not elements:
                raise Exception('Attempting to build a bloom filter, but have no items to add')
            print('[+] Creating Bloom filter with {} elements'.format(len(elements)))

            limit = len(elements)
            bf = BloomFilter(limit, 0.0001, '{}'.format(filename))
            for element in elements:
                bf.add(element)

        return bf
Code Example #37
class LinkFilter():
    def __init__(self):
        if os.path.exists('bloomfilter'):
            self.bloomfilter = BloomFilter.open('bloomfilter')
        else:
            self.bloomfilter = BloomFilter(1000000, 0.01, 'bloomfilter')

    def process(self, links):
        new_links = []
        for link in links:
            if not self.bloomfilter.add(link.url):
                new_links.append(link)
        return new_links
Code Example #38
File: pipelines.py Project: tingyunsay/hong_spider
class FilterPipeline(object):
    def __init__(self):
        self.bloomname = "filter"
        self.f = open("/home/hong/文档/sina_working/2to3_test/log.txt", 'a')
        self.now = time.time()
        self.es = Elasticsearch("10.13.1.126:9200")
        self.one_month_ago = datetime.datetime(
            time.localtime(self.now).tm_year,
            time.localtime(self.now).tm_mon - 1,
            time.localtime(self.now).tm_mday)

    def open_spider(self, spider):
        self.bloomname = "filter"
        isexists = os.path.exists(self.bloomname + ".bloom")
        if isexists:
            print("Opening an existing filter file", file=self.f)
            self.bf = BloomFilter.open(self.bloomname + ".bloom")
        else:
            print("Creating a new filter file", file=self.f)
            self.bf = BloomFilter(100000000, 0.001, self.bloomname + ".bloom")

    def process_item(self, item, spider):
        token = item['lost_mid']
        time_temp = re.search(r'(\d+).?(\d+).?(\d+)', str(item['lost_time']))
        time_stamp = datetime.datetime(int(time_temp.group(1)),
                                       int(time_temp.group(2)),
                                       int(time_temp.group(3)))
        if time.mktime(time_stamp.timetuple()) < time.mktime(
                self.one_month_ago.timetuple()):
            #print("At Time %s , the item[%s] : the datetime is overtimed._____________"%(time.ctime(),token),file=self.f)
            raise DropItem(
                "****************************The datetime is overtimed!!!!!")

        item['lost_title'] = item['lost_describe']
        items = get_thing_array(item['lost_describe'])
        if not items:
            raise DropItem(
                "****************************the items has no match!!!!!")
        else:
            item['lost_describe'] = items
        flag = self.bf.add(token)
        if flag == False:
            return item
        # If it is not a duplicate, the item is passed on to the downstream pipeline classes
        else:
            self.f.write(
                "At Time %s , the item[%s] is overread url , Not Allowed._____________"
                % (time.ctime(), token))
            self.f.close()
            raise DropItem(
                "****************************is the overread url!!!!!")
Code Example #39
File: filter.py Project: MatthewShao/Centaur
class DuplicatedFlowFilter(object):

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')

    def add(self, flow):
        """
        :param flow: the flow dict received from Proxy.
        :return: if the flow already in the filter.
        """
        f = (flow[METHOD], flow[URL])
        return self.bf.add(f)

    def __contains__(self, flow):
        f = (flow[METHOD], flow[URL])
        return self.bf.__contains__(f)
Code Example #40
class LinkFilter():

    def __init__(self, name):
        self.name = name + ".bf"
        self.bf = BloomFilter(100000000, 0.01, self.name)
        '''
        if os.path.exists(self.name):
            self.bf = BloomFilter.open(self.name)
        else:
            self.bf = BloomFilter(100000000, 0.01, self.name)
        '''

    def link_filter(self, links):
        new_links = []
        for link in links:
            if not self.bf.add(link.url):
                new_links.append(link)
        return new_links
Code Example #41
File: bloom.py Project: LouisVN/LR-Data
def createBloomFilter(contentFile, filterFilename):
    bf = BloomFilter(10000000, 0.9999999, filterFilename)
    total = 0
    count = 0
    failed = 0
    with open(contentFile, "r") as f:
        for domain in f:
            total += 1
            d = domain.rstrip()

            if bf.add(d):
                count += 1
                print(d)
            else:
                failed += 1

    print "Total ", total
    print "Added ", count
    print "Conflicted", failed
Code Example #42
File: pipelines.py Project: l1905/qau_search
class DuplicatesPipeline(object):
    def __init__(self):
        self.conn = Connection()
        self.db = self.conn.blog
        self.siteTables = self.db.siteTables 
        self.bf = BloomFilter(100000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites','w')

    def process_item(self, item, spider):
        print '***********%d pages visited! *********'%len(self.bf)
        if self.bf.add(item['web_urls']):
            raise DropItem("Duplicate item found: %s"%item)
        else:
            self.save_to_visited(item['web_urls'], item['title'], item['content'])
            return item

    def save_to_visited(self, url, utitle,content):
        self.siteTables.insert({"url":url,"title":utitle.encode('utf-8'), "content": content.encode('utf-8')})
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')
    def __del__(self):
        self.f_write.close()
Code Example #43
class DuplicateFilter(RFPDupeFilter):
    """
    A dupe filter for url
    """
    def __init__(self, path=FILTER_PATH, debug=False):
        if os.path.exists(FILTER_PATH):
            self.url_filter = BloomFilter.open(FILTER_PATH)
        else:
            print "created a new bloom filter. "
            self.url_filter = BloomFilter(100000, 0.00001, FILTER_PATH)
        super(DuplicateFilter, self).__init__(path, debug)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def request_seen(self, request):
        if request.url.startswith("http://www.dianping.com/shop/"):
            fp = self.request_fingerprint(request)
            if self.url_filter.add(fp):
                print ">" * 5 + "filtered " + request.url + "<" * 5
                return True

    def close(self, reason):
        self.url_filter = None
Code Example #44
class MultinomiamNaiveBayes(object):
	def __init__(self, base, alpha, initial_capacity, error_rate, cache_size):
		self.initial_capacity = initial_capacity
		self.error_rate = error_rate
		self.alpha = alpha
		self.base = base
		
		#Tracks count | class for p(x|c)
		self.class_conditional_counts = BloomFreqMap(base)
		
		#Tracks count all tokens | class for p(x|c)
		self.tokens_per_class = {}
		
		#Tracks count(class) for p(c)
		self.class_freqs = {}
		
		#Counts vocab size for smoothing
		self.token_type_bf = BloomFilter(capacity=initial_capacity, error_rate=error_rate)
		
		self.vocab_sizes = {}
		
		#Tracks the tokens in each class so that we can penalize unseen tokens
		#self.class_to_toks_bf = {}
		
		self.N = 0 #instance count
	
	def makeTokenFreqmap(self, tokens):
		f = {}
		get = f.get
		for token in tokens:
			f[token] = get(token, 0) + 1
		return f
		
	def fit(self, tokens, class_label):
		#if class_label not in self.class_to_toks_bf:
		#	self.class_to_toks_bf[class_label] = BloomFilter(capacity=self.initial_capacity, error_rate=self.error_rate)
		
		if class_label not in self.vocab_sizes:
			self.vocab_sizes[class_label] = BloomFilter(capacity=self.initial_capacity, error_rate=self.error_rate)
			
		self.tokens_per_class[class_label] = self.tokens_per_class.get(class_label, 0) + len(tokens)
		tok_freqs = self.makeTokenFreqmap(tokens)
		
		for token, token_freq in tok_freqs.iteritems():
			#self.class_to_toks_bf[class_label].add(token)
			self.token_type_bf.add(token)
			#conditional_counts_bf[token+'_'+class_label] += token_freq
			self.class_conditional_counts[token+'_'+class_label] += token_freq
			self.vocab_sizes[class_label].add(token)
			
		self.class_freqs[class_label] = self.class_freqs.get(class_label, 0) + 1
		self.N += 1
	
	def predict(self, tokens, tie_breaker='highest_freq', use_class_prior=True):
		
		N = self.N
		max_class, max_score = None, -inf
		tok_freqs = self.makeTokenFreqmap(tokens)
		num_instances = sum((item[1] for item in self.class_freqs.iteritems()))
		for c, cf in self.class_freqs.iteritems():
			this_score = log(cf) - log(N) if use_class_prior else 0.0
			f_t_c = self.tokens_per_class[c]
			num_unseen = 0
			V = len(self.vocab_sizes[c])
			theta_denominator = log(f_t_c + V)
			for token, freq in tok_freqs.iteritems():
				count_in_c = self.class_conditional_counts[token+'_'+c]
				if count_in_c == 0:
					num_unseen += freq
					continue
				this_score += freq*(log(count_in_c + self.alpha) - theta_denominator)
			
			#Penalize unseen tokens
			this_score += num_unseen*(log(self.alpha) - log(theta_denominator))
			
			max_score, max_class = max((max_score, max_class), (this_score, c))
		
		return max_class, max_score
Code Example #45
File: crawling.py Project: ramsayleung/betacat
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """

    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = BloomFilter(10000000, 0.01)
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    async def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = await response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = await response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    LOGGER.info("response.url:%s,type:%s",
                                response.url, type(response.url))
                    LOGGER.info("parse_links url:%s,type:%s",
                                url, type(url))
                    normalized = urllib.parse.urljoin(str(response.url), url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links) - len(self.seen_urls))

        return stat, links

    async def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = await self.session.get(
                    url, allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r',
                            tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = await self.parse_links(response)
                self.record_statistic(stat)
                for link in utils.difference(links, self.seen_urls):

                    # for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                # self.seen_urls.update(links)
                self.seen_urls.update(links)
        finally:
            await response.release()

    async def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = await self.q.get()
                assert url in self.seen_urls
                LOGGER.info("url:%s", url)
                LOGGER.info("max_redirect:%s", max_redirect)
                await self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    async def crawl(self):
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        await self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
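
The crawler above de-duplicates parsed links against self.seen_urls through a utils.difference helper that is not shown in this excerpt. A minimal sketch of what that helper and a pybloomfilter-backed seen-URL store could look like; the file name, capacity and error rate are assumptions, not the project's actual values:

# Sketch only: a BloomFilter-backed "seen" store plus a difference() helper
# matching the calls made above (names and parameters are illustrative).
from pybloomfilter import BloomFilter

seen_urls = BloomFilter(10000000, 0.001, 'seen_urls.bloom')

def difference(links, seen):
    # Keep only the links that are not (probably) in the filter yet.
    return [link for link in links if link not in seen]

new_links = difference(['http://a.example/', 'http://b.example/'], seen_urls)
seen_urls.update(new_links)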
コード例 #46
0
ファイル: Giggle.py プロジェクト: surajkapoor/Giggle
if __name__ == "__main__":

	processes = 4	
	urls_pool = Pool(processes = processes)

	for host in HOSTS:
		urls_to_crawl.append(host)
	
	try:
		while len(urls_to_crawl) > 0:
			time.sleep(1)
			for url in list(urls_to_crawl): #1__get items to crawl (iterate over a copy, the list is mutated below)
				urls_to_crawl.remove(url) #2__remove from list before stripping the hash
				url = _remove_hash_in_url(url) #3__remove hash in each url if present
				print len(urls_to_crawl)	
				if in_visited(url) == False: #4__check to see if in visited
					visited.add(url) #5__add url to visited list
					VISITED_LIST.append(url) 
					new_urls = urls_pool.apply_async(get_new_urls, [url]) #6__get new links and add them to tocrawl
					try:
						new_urls_get = new_urls.get()
					except (requests.exceptions.ConnectionError) as c:
						new_urls_get = False		
					if new_urls_get:
						for new_url in new_urls_get:	
							urls_to_crawl.append(new_url)
		write_crawled_sites(VISITED_LIST)					
	except KeyboardInterrupt:
		print "quit..."
		exit(0)								
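
The loop above depends on a module-level visited store and an in_visited helper defined elsewhere in Giggle.py. A hedged sketch of one way they could be backed by pybloomfilter; the capacity, error rate and file name are assumptions for illustration only:

# Sketch only: a possible BloomFilter-backed visited / in_visited pair.
from pybloomfilter import BloomFilter

visited = BloomFilter(1000000, 0.01, 'visited.bloom')

def in_visited(url):
    # Probabilistic membership test; false positives occur at roughly the configured rate.
    return url in visited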
コード例 #47
0
ファイル: bloomfilter.py プロジェクト: yylover/programe
                path = os.path.join(base_dir, name)
                if not os.path.exists(path):
                    os.mkdir(path)

                ##mm homepage
                url = "http:" + url
                mm_html = urllib2.urlopen(url).read().decode("GBK")
                mm_html = bs4.BeautifulSoup(mm_html).find_all("div", {"id":"J_AixiuShow"})

                #find all pics
                pics = re.findall(rc, str(mm_html))
                print pics
                count = 0
                for pic_url in pics:
                    count += 1
                    print "url", pic_url
                    pic_name = os.path.join(path, str(count) + '.jpg')
                    urllib.urlretrieve(pic_url, pic_name)

        print list_url
        # print tag_mm_name


if __name__ == "__main__":
    print "hello"
    print checkIfInBloom("test")
    bf.add("test")
    print checkIfInBloom("test")

    get_mms(2, 3)
    print base_dir
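
bf and checkIfInBloom are defined in the part of bloomfilter.py that is not shown here. Purely for illustration, a guess at how such helpers could be written with pybloomfilter; the capacity, error rate and file name are assumptions:

# Sketch only: plausible definitions for the bf / checkIfInBloom used above.
from pybloomfilter import BloomFilter

bf = BloomFilter(1000000, 0.01, 'mm.bloom')

def checkIfInBloom(key):
    # "key in bf" is a probabilistic membership test (false positives possible).
    return key in bf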
コード例 #48
0
ファイル: create_bf.py プロジェクト: enzocxt/bloomfilter
count = 1000000
error_rate = 0.01
keyLen = 2048
# key digest length is 32 bytes

bf = BloomFilter(count, error_rate, 'filter.bloom')
keyDigest_list = []

FILE = open('/home/enzo/CCNx/python_projects/bloomfilter/PublicKeyDigest', 'r')

for i in range(count):
	keyDigest = FILE.read(32)
	keyDigest_list.append(keyDigest)
	
FILE.close()
	
for publicKeyID in keyDigest_list:
	bf.add(publicKeyID)

#print "length of list is %d" % len(keyDigest_list)

#randindex = random.randint(0, count-1)
#print "randindex is %s" % randindex
#publicKeyID = keyDigest_list[randindex]
#if publicKeyID in bf:
#	print "True"
#else:
#	print "False"


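Once filter.bloom has been written it can be reopened later without re-adding the key digests. A short Python 3 sketch of checking membership against the persisted file (the digest value below is a placeholder, not real data):

# Sketch only: reopen the mmap-backed filter created above and test membership.
from pybloomfilter import BloomFilter

bf = BloomFilter.open('filter.bloom')

some_digest = b'\x00' * 32   # placeholder 32-byte digest for illustration
print(some_digest in bf)     # True only if a (probably) matching digest was added
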
コード例 #49
0
ファイル: parser.py プロジェクト: outlandishlizard/glutenberg
import sys
import glob
import cPickle
from pybloomfilter import BloomFilter

folderpath = sys.argv[1]

for book_filepath in glob.glob(folderpath+'/*.txt'):
    book = open(book_filepath).read()
    sentences  = book.split('.') 
    bf = BloomFilter(100000,0.01,'filter.bloom')
    for sentence in sentences:
        words = sentence.split()
        for word in words:
            bf.add(word.strip('"'))
    print 'the' in bf
    print 'wut' in bf
    print 'laughter' in bf
    BloomFilter.from_base64(book_filepath+'.bf',BloomFilter.to_base64(bf))


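The last line above serializes each per-book filter to base64 and immediately rehydrates it into a .bf file next to the book. A step-by-step Python 3 sketch of that round trip with illustrative file names:

# Sketch only: serialize a filter to base64 and restore it into a new mmap file.
from pybloomfilter import BloomFilter

bf = BloomFilter(100000, 0.01, 'words.bloom')
bf.update(['the', 'laughter'])

encoded = bf.to_base64()   # bytes; small enough to store alongside the text
restored = BloomFilter.from_base64('words_copy.bloom', encoded)
print('the' in restored, 'wut' in restored)   # True False (barring false positives)
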
コード例 #50
0
ファイル: bloomfilter.py プロジェクト: 3rdDegree/w3af
    [0] https://github.com/andresriancho/w3af/issues/485
    [1] https://github.com/axiak/pybloomfiltermmap/issues/50
    """
    print(OSX_MSG)
else:
    try:
        # This might fail since it is a C library that only works in Linux
        from pybloomfilter import BloomFilter as CMmapFilter

        # There were reports of the C mmap filter not working properly in OSX,
        # just in case, I'm testing here...
        temp_file = GenericBloomFilter.get_temp_file()
        try:
            bf = CMmapFilter(1000, 0.01, temp_file)
            bf.add(1)
            assert 1 in bf
            assert 2 not in bf
        except:
            WrappedBloomFilter = FileSeekFilter
        else:
            WrappedBloomFilter = CMmapFilter
    except:
        WrappedBloomFilter = FileSeekFilter


class BloomFilter(GenericBloomFilter):
    def __init__(self, capacity, error_rate):
        """
        :param capacity: How many items you want to store, eg. 10000
        :param error_rate: The acceptable false positive rate, eg. 0.001
コード例 #51
0
            # For each bloom filter
            opened_bloom = []
            for bloo in bloop_path_set:
                # Opening blooms
                opened_bloom.append(BloomFilter.open(bloo))
            # For each hash of the paste
            for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):
                nb_hash_current += 1

                # Adding the hash in Redis & limiting the set
                if r_serv1.scard(line_hash) <= set_limit:
                    r_serv1.sadd(line_hash, index)
                    r_serv1.sadd("HASHS", line_hash)
                # Adding the hash in the bloom of the month
                bloom.add(line_hash)
                # Go throught the Database of the bloom filter (of the month)
                for bloo in opened_bloom:
                    if line_hash in bloo:
                        db = bloo.name[-6:]
                        # Go throught the Database of the bloom filter (month)
                        r_serv_bloom = dico_redis[db]

                        # set of index paste: set([1,2,4,65])
                        hash_current = r_serv_bloom.smembers(line_hash)
                        # removing itself from the list
                        hash_current = hash_current - set([index])

                        # if the hash is present at least in 1 files
                        # (already processed)
                        if len(hash_current) != 0:
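
The excerpt cuts off mid-block, and it assumes that the per-month filters in bloop_path_set and the current month's bloom already exist on disk. A hedged sketch of the open-or-create step that typically precedes such a loop; the path layout is an assumption:

# Sketch only: open this month's filter if present, otherwise create it.
import os
from pybloomfilter import BloomFilter

bloom_path = 'blooms/201601.bloom'   # illustrative per-month file name
if os.path.exists(bloom_path):
    bloom = BloomFilter.open(bloom_path)
else:
    bloom = BloomFilter(10000000, 0.001, bloom_path)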
コード例 #52
0
class ObjectTracker(object):
    invCleanPeriod = 300
    invInitialCapacity = 50000
    invErrorRate = 0.03
    trackingExpires = 3600
    initialTimeOffset = 60

    def __init__(self):
        self.objectsNewToMe = RandomTrackingDict()
        self.objectsNewToThem = {}
        self.objectsNewToThemLock = RLock()
        self.initInvBloom()
        self.initAddrBloom()
        self.lastCleaned = time.time()

    def initInvBloom(self):
        if haveBloom:
            # lock?
            self.invBloom = BloomFilter(capacity=ObjectTracker.invInitialCapacity,
                                        error_rate=ObjectTracker.invErrorRate)

    def initAddrBloom(self):
        if haveBloom:
            # lock?
            self.addrBloom = BloomFilter(capacity=ObjectTracker.invInitialCapacity,
                                         error_rate=ObjectTracker.invErrorRate)

    def clean(self):
        if self.lastCleaned < time.time() - ObjectTracker.invCleanPeriod:
            if haveBloom:
                # FIXME
                if PendingDownloadQueue().size() == 0:
                    self.initInvBloom()
                self.initAddrBloom()
            else:
                # release memory
                deadline = time.time() - ObjectTracker.trackingExpires
                with self.objectsNewToThemLock:
                    self.objectsNewToThem = {k: v for k, v in self.objectsNewToThem.iteritems() if v >= deadline}
            self.lastCleaned = time.time()

    def hasObj(self, hashid):
        if haveBloom:
            return hashid in self.invBloom
        else:
            return hashid in self.objectsNewToMe

    def handleReceivedInventory(self, hashId):
        if haveBloom:
            self.invBloom.add(hashId)
        try:
            with self.objectsNewToThemLock:
                del self.objectsNewToThem[hashId]
        except KeyError:
            pass
        if hashId not in missingObjects:
            missingObjects[hashId] = time.time()
        self.objectsNewToMe[hashId] = True

    def handleReceivedObject(self, streamNumber, hashid):
        for i in network.connectionpool.BMConnectionPool().inboundConnections.values() + network.connectionpool.BMConnectionPool().outboundConnections.values():
            if not i.fullyEstablished:
                continue
            try:
                del i.objectsNewToMe[hashid]
            except KeyError:
                if streamNumber in i.streams and \
                    (not Dandelion().hasHash(hashid) or \
                    Dandelion().objectChildStem(hashid) == i):
                    with i.objectsNewToThemLock:
                        i.objectsNewToThem[hashid] = time.time()
                    # update stream number, which we didn't have when we just received the dinv
                    # also resets expiration of the stem mode
                    Dandelion().setHashStream(hashid, streamNumber)

            if i == self:
                try:
                    with i.objectsNewToThemLock:
                        del i.objectsNewToThem[hashid]
                except KeyError:
                    pass

    def hasAddr(self, addr):
        if haveBloom:
            return addr in self.addrBloom

    def addAddr(self, hashid):
        if haveBloom:
            self.addrBloom.add(hashid)
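
haveBloom is set elsewhere in the module, and the tracker silently falls back to plain dicts when bloom filters are unavailable. A sketch of one common way to set such a flag; the actual import logic in the project may differ:

# Sketch only: enable bloom filters only when the optional dependency imports.
try:
    from pybloomfilter import BloomFilter
    haveBloom = True
except ImportError:
    haveBloom = False   # clean()/hasObj() then use the dict/RandomTrackingDict paths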
コード例 #53
0
class Worker:

	def __init__(self, seeds, done_que, run_que):

		self.showpercounts = 10
		self.timeout = 5
		self.starttime = time.time()
		self.oldtime = 0

		self.quit = 0
		self.https_enable = 0


		self.run_que = run_que
		self.done_que = done_que
		self.tasks = []
		self.done = 1

		self.errdone = set()
		self.err = Error()

		self.loadstate()

		self.blacklist = set (( '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google',
	'weibo.com','t.cn','wikipedia','facebook','twitter','dropbox' ))
		self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv'))

		self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl

		self.poolsize = 60
		self.poolmaxfree = 20
		self.freecount = 0
		self.down_pool = Pool(size=self.poolsize)

		self.totalnettime = 0
		self.cbcputime = 0
		self.totaldownsize = 0
		
		self.curspeed = 0

		self.debugnosave = 1
		self.tt = 1

		self.done_sites_fname='done_sites.bin'
		try:
			self.bfdone = BloomFilter.open(self.done_sites_fname)
		except:
			self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M 

		if self.run_que.qsize() == 0:
			for seed in seeds:
				self.run_que.put( seed.split("http://")[1] )

		if self.https_enable == 0:
			self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',re.I)
		else:
			self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',re.I)


	def cb_httpget(self, data = None):

		if not data:
			return
		seed, err, headers, content = data
		st = time.time()

		if err:
			self.handle_error(err,seed)
			return

		if self.https_enable == 0:
			seed = seed[7:]

		self.bfdone.add(seed)
		self.done += 1

		data={'seed':seed,'headers':headers,'content':content}

		dat = cPickle.dumps(data)
		self.done_que.put(dat)

		et = time.time()
		self.cbcputime += (et-st)
		#self.tt=(et-st)

		if self.done % self.showpercounts == 0:
			self.out(seed)
			pass

	def out(self, seed):

		spendtime = time.time() - self.starttime
		spendtime = 1 if spendtime == 0 else spendtime
		nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else ""
		now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 )
		print "%s D:%-4d R:%-7d [Speed: T%.2f/s C%3d/s A%.2f] CB:%0.4f Active:%d %s %s" % (now, (self.done), self.run_que.qsize(), \
			(self.done)/(spendtime+self.oldtime), self.curspeed, self.tt, self.totalnettime / self.done ,self.poolsize-self.freecount, str(self.err), seed )
	
	
	def work(self):

		while self.quit == 0:

			st = time.time()
			curdone = self.done

			self.freecount = self.down_pool.free_count()
			

			if self.freecount > self.poolmaxfree:
				self.tasks = []
				minlen = min(self.freecount+1,self.run_que.qsize())
				#if minlen <=0:break
				
				for i in range( minlen):
					stt = time.time()
					url = self.run_que.get()
					ett = time.time()
					if url in self.bfdone:# 5%-10%
							continue

					url = "http://"+url
					self.tasks.append(url)

				for url in self.tasks:
					self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget)

			
			time.sleep(0.1)
			et = time.time()	
			self.curspeed = (self.done - curdone) / (et-st)
			#self.tt = (et-st)

	
		self.down_pool.join()
		print "All OVER"

	def handle_error(self,e,url):

		if e.find('DNSError') > 0 :
			self.err.dns += 1
			self.err.rdns.append(url)
		elif e.find('reset') > 0 :#Connection reset
			self.err.reset += 1
			self.err.rreset.append(url)
		elif e.find('Max retries') > 0 or e.find('Connection aborted') > 0: # str.find returns -1 when missing, so compare explicitly
			self.err.conntimeout += 1
			self.err.rconntimeout.append(url)
		elif e.find('refused') > 0: #Connection refused
			self.err.refuse += 1
			self.err.rrefuse.append(url)

		else:
			self.err.others +=1
			self.err.rothers.append(url)
			print "Error", url, e

	# requests is better through test
	def httpget_requests(self, url):

		st = time.time()
		con = ""
		e = ""
		res_headers = ""
		headers = {
					'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6',
					'Accept-Encoding':'gzip,deflate',
					'Connection':'close',
					'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
				}


		res = None
		try:
			# todo: query the ip of the website before get through dns
			req = requests
			req.max_redirects = 1
			res = req.get(url, timeout = (3,2), headers = headers )
			if self.https_enable == 0 and res.url.lower().startswith('http:'):
				if 'content-type' not in res.headers.keys() or 'html' not in res.headers['content-type']:
					return None
				con = res.content
				
			res.close()

		except KeyboardInterrupt:
				raise
		except Exception as e:
			e = str(e)
			if res:
				res.close()

			return url,e,None,None

		et = time.time()
		self.totalnettime += (et-st)
		self.tt = (et-st)
		return url, e, res.headers, con

	def savestate(self):

		self.quit = 1
		now = time.time()
		self.oldtime += (now - self.starttime)

		#should hold on the singal for procdata done


		with open('state.txt','wb') as f:
			f.write(str(self.oldtime) + '\n')
			# tasks run_queue done
			f.write(str(len(self.tasks)) + '\n')
			for t in self.tasks:
				f.write(t + '\n')
			l = self.run_que.qsize()
			f.write(str(l)+ '\n')
			while l > 0:
				f.write( self.run_que.pop() + '\n')
				l-=1
			f.write(str((self.done)) + '\n')
 
		with open('err_records.pack','wb') as f:
			cPickle.dump(self.err,f,2)

		print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Save state successfully."
		exit(0)

	def loadstate(self):

		try:
			with open('state.txt') as f:
				self.oldtime = float(f.readline())
				tasks = int(f.readline())
				for i in xrange(tasks):
					self.run_que.add(f.readline().rstrip('\n'))

				runnings = int(f.readline())
				for i in xrange(runnings):
					self.run_que.add(f.readline().rstrip('\n'))

				self.done = int(f.readline())

			with open('err_records.pack','rb') as f:
				self.err = cPickle.load(f)

			print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Load state successfully."
		except Exception as e:
				print e
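
The worker above sizes its persistent done-set at 2**23 entries with a 10**-5 false-positive rate (the '#8M' comment). A quick sketch of the standard sizing formula m = -n*ln(p)/(ln 2)^2, useful for estimating how large such a filter ends up; the real file size also depends on the library's header and rounding:

# Sketch only: estimate bit-array size and hash count for n items at rate p.
import math

n = 2 ** 23        # capacity used above (about 8.4 million entries)
p = 10 ** -5       # target false-positive rate
m = -n * math.log(p) / (math.log(2) ** 2)   # required bits
k = (m / n) * math.log(2)                   # optimal number of hash functions
print("~%.1f MB, %d hashes" % (m / 8 / 2 ** 20, round(k)))   # roughly 24 MB, 17 hashes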
コード例 #54
0
ファイル: myspider.py プロジェクト: mylinlan/spider
class MySpider(object):
    def __init__(self, start_url, basic_url):
        self.basic_url = basic_url
        self.start_url = start_url
        self.mysql = mysql.Mysql()
        self.re = re
        self.time = time
        self.datetime = datetime
        self.requests = requests

        # Deduplicate with a Bloom filter; reload filter.bloom from disk on each run
        if os.path.isfile('filter.bloom'):
            self.bf = BloomFilter.open('filter.bloom')
        else:
            self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')

    def __get_time(self):
        return self.datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def __cal_time(self, date1, date2):
        date1 = self.time.strptime(date1, "%Y-%m-%d %H:%M:%S")
        date2 = self.time.strptime(date2, "%Y-%m-%d %H:%M:%S")
        date1 = self.datetime.datetime(date1[0], date1[1], date1[2], date1[3], date1[4], date1[5])
        date2 = self.datetime.datetime(date2[0], date2[1], date2[2], date2[3], date2[4], date2[5])
        return str(date2 - date1)

    def __log(self, log_str, *args, **kw):
        current_time = self.__get_time()
        print current_time + ' : ' + log_str,
        for each in args:
            print each,
        print '\n',
        for each in kw.keys():
            print current_time + ' : ' + each + ' is ' + kw[each]

    def __process_text(self, my_str):
        my_str = self.re.sub("http.*?html", "", my_str).encode('utf-8')
        if isinstance(my_str, unicode):
            my_str = my_str.encode('utf-8')
        return my_str.replace(" ", "").replace("  ", "").replace('\n', '')

    def __open_url(self, url):
        req = self.requests.get(url)
        content = req.text
        soup = BeautifulSoup(content)
        return soup

    def __process_sub_content(self, result, insert_id):
        for each in result:
            # print '\n' + 'floor:'
            # time and author
            # print each.attrs.get('js_restime')
            # print each.a.text
            # content
            # bbs_content = each.select("[class~=bbs-content]")
            # text = bbs_content[0].text.strip()
            # strip links and whitespace
            # text = re.sub("http.*?html", "", text).encode('utf-8')
            # text = text.replace(" ", "").replace("  ", "")
            # print self.__process_text(text)
            # print process_text(text)
            replies = each.select('ul li')
            for reply in replies:
                self.__log('process the reply ... start')
                reply_time = reply.get('_replytime')
                reply_author = reply.get('_username')
                reply_content = reply.select("[class~=ir-content]")
                reply_text = reply_content[0].text
                reply_dict = {
                    "title_id": insert_id,
                    "author": reply_author,
                    "time": reply_time,
                    "text": reply_text
                        }
                self.__log('content is', reply_text)
                self.__log('insert to database ...start')
                self.mysql.insert_data('reply', reply_dict)
                self.__log('insert to database ...done')
                self.__log('process the reply ... done')
        # process each floor of the thread

    def process_content_page(self, url, author, reply_time, insert_id):
        self.__process_reply_page(url, author, reply_time, insert_id)

    def __process_reply_page(self, url, author, reply_time, insert_id):
        self.__log('process reply page... start')
        soup = self.__open_url(url)
        # tags for each floor
        result = soup.select("[class~=atl-item]")

        if len(result):
            self.__log('the html was read successfully')
        else:
            self.__log('html read fail. maybe the page is lose. function return')
            self.__log('process reply page ... done')
            return
        # total number of reply pages
        page_id = soup.select("form a")
        if page_id:
            total_page_num = int(page_id[-2].text)
        else:
            total_page_num = 1

        self.__log('have read', total_page_num, 'pages')

        # content of the first floor (the original post)
        main_content = result[0].select("[class~=bbs-content]")
        main_content = main_content[0].text.strip()
        main_text = self.__process_text(main_content)
        reply_dict = {
                    "title_id": insert_id,
                    "author": author,
                    "time": reply_time,
                    "text": main_text
                    }
        self.mysql.insert_data('reply', reply_dict)
        result = result[1:]
        self.__log('process every floor')
        self.__process_sub_content(result, '1')
        if total_page_num > 1:
            for num in range(2, total_page_num + 1):
                self.__log('process the', str(num), 'reply page ... start')
                next_url = url[:-7]+str(num)+url[-6:]
                print next_url
                new_soup = self.__open_url(next_url)
                result = new_soup.select("[class~=atl-item]")
                self.__process_sub_content(result, insert_id)
                self.__log('process the', str(num), 'reply page ... done')
        self.__log('process reply page ... done')

    def __process_titles_page(self, page_url):
        self.__log('reading titles page .... start')
        req = self.requests.get(page_url)
        content = req.text
        soup = BeautifulSoup(content)

        # fetch all titles
        titles = soup.select('tbody tr')
        # drop the header row that does not match
        titles = titles[1:]
        # process each title
        self.__log('reading titles page .... done')
        self.__log('processing all titles in', self.start_url, ' ... start')
        counter = 1
        for each in titles:
            # get the tag info of this title
            # note: in BeautifulSoup, the whitespace between adjacent td tags also shows up as nodes,
            # so the content indexes below have to account for that
            self.__log('process the', counter, 'title', ' ... start')
            counter += 1
            title_content = each.contents
            title_href = title_content[1].a.get('href')         # title link
            title_text = title_content[1].text.strip()          # title text
            title_author = title_content[3].a.text              # author
            title_click_num = title_content[5].text             # click count
            title_reply_number = title_content[7].text          # reply count
            title_time = title_content[9].get('title')          # post time
            sub_href = self.basic_url + title_href               # full sub link
            # build the title dict and insert the title
            title_dict = {
                "reply_num": title_reply_number,
                "click_num": title_click_num,
                "author": title_author,
                "time": title_time,
                "link": sub_href,
                "text": title_text
                    }
            # for each in title_dict:
            #    print each
            #    print type(title_dict[each])
            # use the link address and reply count to check for duplicates
            # flag = sub_href + title_click_num
            flag = sub_href
            if not (self.bf.add(flag)):
                self.__log('', flag, 'not in bloom filter')
                self.__log('insert to database ... start')

                insert_id = self.mysql.insert_data("titles", title_dict)
                self.__log('insert to database ... done')
                self.__process_reply_page(sub_href, title_author.encode('utf-8'), title_time, str(insert_id))
            self.__log('process the', counter, 'title', ' ... done')

        # link to the next page
        next_page_tag = soup.find('a', text='下一页')
        if next_page_tag:
            next_page = next_page_tag.get('href')
            next_page = self.basic_url + next_page
        else:
            next_page = None
        return next_page

    # clear the bloom filter
    def clean_bloom_filter(self):
        self.__log('clean all in bloom filter ... start')
        self.bf.clear_all()
        self.__log('clean all in bloom filter ... done')

    def bloom_filter_len(self):
        return len(self.bf)

    def main(self):
        self.__log('spider ... start')
        self.__log('process start url ... running')
        next_page = self.__process_titles_page(self.start_url)
        self.__log('process start url ... done')
        start_time = self.__get_time()
        print start_time
        depth = 1
        while next_page:
            # if depth == 2:
            #    break
            self.__log('now it is the', str(depth), 'page')
            next_page = self.__process_titles_page(next_page)
            depth += 1
        end_time = self.__get_time()
        print end_time
        duration = self.__cal_time(start_time, end_time)
        self.__log('duration are', duration)
        self.__log('spider ... done')

    def clean_table(self, table):
        self.mysql.clean_table(table)

    def test(self):
        test_url = 'http://bbs.tianya.cn/post-333-778768-1.shtml'
        print self.bf.add(test_url)
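
Both the title loop (self.bf.add(flag)) and the test method above rely on add() doing the membership check and the insert in one call: it returns a truthy value when the element was already (probably) present. A small Python 3 sketch of that idiom in isolation; the file name is illustrative:

# Sketch only: add() reports whether the element was already (probably) present.
from pybloomfilter import BloomFilter

bf = BloomFilter(10000000, 0.01, 'filter_demo.bloom')

url = 'http://bbs.tianya.cn/post-333-778768-1.shtml'
print(bf.add(url))   # False: first sighting, the URL gets inserted
print(bf.add(url))   # True: already present (subject to the false-positive rate)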
コード例 #55
0
ファイル: pipelines.py プロジェクト: wybini/search-engine
class DuplicatesPipeline(object):
    

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0
        self.db = MySQLdb.connect("localhost","root","","storecount")
        self.cursor = self.db.cursor()
        self.cursor.execute("DROP TABLE IF EXISTS POPULAR")
        sql1 = """CREATE TABLE POPULAR(URL text(512),COUNT_MARK INT);"""
    
        try:
            self.cursor.execute(sql1)
            self.db.commit()
#             print "cao create"
        except:
            traceback.print_exc()
            self.db.rollback()
#         self.dbpool = adbapi.ConnectionPool('MySQLdb',
#                                             host = '127.0.0.1',
#                                             db = 'storecount',
#                                             user = '******',
#                                             passwd = '',
#                                             cursorclass = MySQLdb.cursors.DictCursor,
#                                             charset = 'utf8',
#                                             use_unicode = True)
        self.mark = 0
        
        
#     def _conditional_insert(self,tx,item):
#         sql = 'insert into popular values (%s, %d)'
#         tx.execute(sql, (item['url'],self.mark))

    def process_item(self, item, spider):
#         print '************%d pages visited!*****************' %len(self.bf)
        if self.bf.add(item['url']):#True if item in the BF
            sql2 = "UPDATE POPULAR SET COUNT_MARK = COUNT_MARK + 1 WHERE URL = '%s'" %item['url']
            try:
                print "update"
                self.cursor.execute(sql2)
                self.db.commit()
            except:
                traceback.print_exc()
                self.db.rollback()
#             self.dbpool.runOperation("UPDATE popular SET mark+1")
            
            raise DropItem("Duplicate item found: %s" % item)
            
            
        else:
            #print '%d pages visited!'% len(self.url_seen)
            self.count_num += 1
            self.save_to_file(item['url'],item['title'])
            self.si.AddIndex(item)
            sql3 = """INSERT INTO POPULAR(URL,COUNT_MARK) VALUES ("%s",0);""" % item['url']
            try:
                self.cursor.execute(sql3)
                self.db.commit()
                
            except:
                traceback.print_exc()
                self.db.rollback()
#             self._conditional_insert(self,self.dbpool, item['url'], 0)
            
#             print self.count_num
            if self.count_num >=100000 and self.count_num % 10000 :
                print self.count_num
            return item
        

    def save_to_file(self,url,utitle):
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """docstring for __del__"""
        self.f_write.close()
        self.si.IndexDone()
コード例 #56
0
import tempfile
import time
from pybloomfilter import BloomFilter

NS = 10**9
for _p in xrange(1,3):
    p = 10 ** _p
    for e in xrange(9):
        with tempfile.NamedTemporaryFile() as f:
            X = int(1000 * 10 ** (e / 2.0))
            print X, p, 
            name = f.name
            bloomfilter = BloomFilter(X + 1, 1.0/p, name)
            t = time.time()

            for x in xrange(X):
                bloomfilter.add(x)
            print (time.time() - t) / X * NS,
            t = time.time()
            for x in xrange(X):
                x in bloomfilter
            print (time.time() - t) / X * NS,
            t = time.time()
            for x in xrange(X, 2*X):
                x in bloomfilter
            print (time.time() - t ) / X * NS
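
The raw numbers printed above are easy to lose track of. The same measurement, reorganized into a labeled helper, is sketched below in Python 3 (same methodology: time add(), then hits, then guaranteed misses; capacity and error rate are example values):

# Sketch only: time add() and membership tests, printing labeled ns/op columns.
import tempfile
import time
from pybloomfilter import BloomFilter

NS = 10 ** 9

def bench(capacity, error_rate):
    with tempfile.NamedTemporaryFile() as f:
        bf = BloomFilter(capacity + 1, error_rate, f.name)
        t = time.time()
        for x in range(capacity):
            bf.add(x)
        add_ns = (time.time() - t) / capacity * NS
        t = time.time()
        for x in range(capacity):
            x in bf                      # all hits
        hit_ns = (time.time() - t) / capacity * NS
        t = time.time()
        for x in range(capacity, 2 * capacity):
            x in bf                      # all (expected) misses
        miss_ns = (time.time() - t) / capacity * NS
    print("n=%d p=%g add=%.0fns hit=%.0fns miss=%.0fns"
          % (capacity, error_rate, add_ns, hit_ns, miss_ns))

bench(100000, 0.01)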