Example #1
 def rotateFilters(self):
     # Rotate only once the newest filter has absorbed its share of inserts.
     assert self.R * self.counters[self.first] >= self.capacity_per_filter
     last = (self.first + 2) % 3  # index of the oldest of the three filters
     # clear the last filter and make it an empty first filter
     self.cfs[last] = pydablooms.Dablooms(capacity=self.capacity_per_filter,
                                          error_rate=FPP_RATE,
                                          filepath="%d_%s" %
                                          (last, self.filepath))
     self.counters[last] = 0
     self.first = last
     self.rotations += 1
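rotateFilters needs a caller that counts insertions into the newest filter; a hedged sketch of such an add path (this add method is an assumption inferred from the counters the class keeps, not code from the original):

 # Hypothetical add path: insert into the newest filter, rotate when it fills.
 def add(self, key, id=0):
     self.cfs[self.first].add(key, id)
     self.counters[self.first] += 1
     self.total_count += 1
     if self.R * self.counters[self.first] >= self.capacity_per_filter:
         self.rotateFilters()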
Example #2

 def __init__(self, gt, lt):
     self.gt = int(gt)
     self.lt = int(lt)
     if 'pydablooms' in sys.modules:
         self.bloom = pydablooms.Dablooms(capacity=DABLOOMS_CAPACITY,
                                          error_rate=DABLOOMS_ERROR_RATE,
                                          filepath=DABLOOMS_FILEPATH)
     else:
         self.bloom = None
         host = settings.get('MONGOD_HOST', MONGOD_HOST)
         port = settings.get('MONGOD_PORT', MONGOD_PORT)
         self.db = _default_mongo(host, port, usedb='master_timeline')
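The constructor above only wires up the two backends; a membership test that prefers the Bloom filter and falls back to an exact MongoDB lookup might look like this sketch (the seen method and the query shape are assumptions, not part of the original):

 # Hypothetical helper: has this status mid been seen before?
 def seen(self, mid):
     if self.bloom is not None:
         return self.bloom.check(mid)  # fast, but may return false positives
     # Fallback: exact (slower) lookup in MongoDB.
     return self.db.master_timeline_weibo.find_one({'mid': mid}) is not None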
Example #3
 def __init__(self, R=1, capacity=0, filepath="ghostlist.bf"):
     self.filepath = filepath
     self.capacity_per_filter = int(ceil(0.5 * capacity))
     self.cfs = []       # the three rotating filters
     self.counters = []  # per-filter insertion counts
     for i in range(3):
         cf = pydablooms.Dablooms(capacity=self.capacity_per_filter,
                                  error_rate=FPP_RATE,
                                  filepath="%d_%s" % (i, self.filepath))
         self.cfs.append(cf)
         self.counters.append(0)
     #print self.cfs
     self.total_count = 0
     self.first = 0
     self.capacity = capacity
     #print "capacity_per_filter=%d, but capacity=%d" % (self.capacity_per_filter, self.capacity)
     self.ghosthits = 0
     self.ghostmisses = 0
     self.rotations = 0
     self.R = R
     print "R=%d" % R
     self.Rignored = 0
     self.Rpassed = 0
     self.ghostplus = [0.0 for c in range(capacity)]
     self.pdf = [0.0 for j in range(int(ceil(1.5 * capacity)))]
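A lookup against this three-filter ghost list has to consult every filter, since a key may live in any of them; a minimal sketch (this contains method is an assumption, it is not shown in the original):

 # Hypothetical membership test across all three rotating filters.
 def contains(self, key):
     return any(cf.check(key) for cf in self.cfs)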
Example #4
#! /usr/bin/env python
import pydablooms

CAPACITY = 5000
ERROR_RATE = float(1) / CAPACITY
bloom = pydablooms.Dablooms(capacity=CAPACITY,
                            error_rate=ERROR_RATE,
                            filepath='bloom.bin')

# Extract names from lines of the form: class 'SomeName' ...
f = open('classes.txt', 'r')
g = open('classnames.txt', 'w')
for line in f:
    fields = line.split("'")
    if fields[0] == 'class ':
        g.write(fields[1] + '\n')

f.close()
g.close()
h = open('classnames.txt', 'rb')
i = 0
for line in h:
    bloom.add(line.rstrip(), i)  # each key carries an integer id, here its line index
    i += 1
h.close()

bloom.flush()  # persist the filter to bloom.bin
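To query the persisted filter later, the file can be reopened and checked; a minimal sketch (assuming pydablooms.load_dabloom, the library's loader for an existing filter file, and an illustrative class name):

reloaded = pydablooms.load_dabloom(capacity=CAPACITY,
                                   error_rate=ERROR_RATE,
                                   filepath='bloom.bin')
print(reloaded.check('SomeClassName'))  # truthy if (probably) present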
Example #5
#!/usr/bin/python
import pydablooms
import sys
sys.path.append('..')
import known_libs

from androguard.core.bytecode import *
from androguard.core.bytecodes.apk import *
from androguard.core.analysis.analysis import *

bloom = pydablooms.Dablooms(capacity=10000000,
                            error_rate=.05,
                            filepath='libs.bbf')


def isLibraryClass(classname, libs=None):
    # Return True if classname falls inside a known library package.
    package_method = False
    if libs is None:
        for package in known_libs.known_libs:
            package_name = "L" + package + "/"
            package_name = package_name.replace(".", "/")
            if package_name in classname:
                package_method = True
                break
    else:
        for package in libs:
            if package in classname:
                package_method = True
                break
    return package_method
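A quick usage sketch (the package string is illustrative; with no libs argument the function matches against known_libs.known_libs instead):

# Illustrative call: the class sits under the given library prefix.
print(isLibraryClass("Lcom/google/ads/AdView;", libs=["Lcom/google/ads/"]))  # True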
Example #6
def build_filter(bloom_filename,
                 linear_refs,
                 circular_refs,
                 kmer,
                 mismatches,
                 inserts,
                 deletions,
                 error_rate=0.01,
                 rc=True):
    #Using 5e-06 is close to a set for my example, both in run time
    #(a fraction more) and the number of reads kept (9528 vs 8058
    #with sets).
    simple = set()
    del_hashes = set()
    count = 0
    t0 = time.time()
    if linear_refs:
        for fasta in linear_refs:
            sys.stderr.write("Hashing linear references in %s\n" % fasta)
            handle = open(fasta)
            for upper_seq, raw_read in fasta_iterator(handle):
                #assert set(upper_seq).issubset("ACGT"), "%s contains %s" \
                #    % (raw_read.split("\n",1)[0], set(upper_seq).difference("ACGT"))
                #Note we do the disambiguate call on the fragments rather than
                #the whole reference to avoid too many levels of recursion.
                for i in range(0, len(upper_seq) - kmer):
                    for fragment in disambiguate(upper_seq[i:i + kmer]):
                        assert set(fragment).issubset("ACGT"), fragment
                        simple.add(fragment)
                        #bloom.add(fragment, kmer)
                        count += 1  #TODO - Can do this in one go from len(upper_seq)
                if deletions:
                    for i in range(0, len(upper_seq) - kmer + 1):
                        for fragment in make_deletions(upper_seq[i:i + kmer +
                                                                 1]):
                            del_hashes.add(fragment)
            handle.close()

    if circular_refs:
        for fasta in circular_refs:
            sys.stderr.write("Hashing circular references in %s\n" % fasta)
            handle = open(fasta)
            for upper_seq, raw_read in fasta_iterator(handle):
                #assert set(upper_seq).issubset("ACGT"), "%s contains %s" \
                #    % (raw_read.split("\n",1)[0], set(upper_seq).difference("ACGT"))
                #Want to consider wrapping round the origin, add k-mer length:
                upper_seq += upper_seq[:kmer]
                for i in range(0, len(upper_seq) - kmer):
                    for fragment in disambiguate(upper_seq[i:i + kmer]):
                        assert set(fragment).issubset("ACGT"), fragment
                        simple.add(fragment)
                        #bloom.add(fragment, kmer)
                        count += 1  #TODO - Can do this in one go from len(upper_seq)
                if deletions:
                    for i in range(0, len(upper_seq) - kmer + 1):
                        for fragment in make_deletions(upper_seq[i:i + kmer +
                                                                 1]):
                            del_hashes.add(fragment)
            handle.close()
    if rc:
        #Would popping be slow? Should mean less memory at once
        temp = simple.copy()
        for fragment in temp:
            simple.add(reverse_complement(fragment))
        del temp
    if mismatches or inserts or deletions:
        sys.stderr.write("Have %i unique k-mers before consider fuzzy matches\n" \
                         % (len(simple)))
        if deletions:
            #Do this first to avoid 3 large sets in memory!
            new = del_hashes
            del del_hashes
            new.update(simple)
            sys.stderr.write("Adding deletions brings this to %i unique k-mers\n" \
                             % len(new))
        else:
            new = simple.copy()
        if mismatches:
            for fragment in simple:
                for var in make_variants(fragment, mismatches):
                    new.add(var)
            sys.stderr.write("Adding %i mis-matches per k-mer, have %i unique k-mers\n" \
                             % (mismatches, len(new)))
        if inserts:
            for fragment in simple:
                for var in make_inserts(fragment):
                    new.add(var)
            sys.stderr.write("Adding inserts brings this to %i unique k-mers\n" \
                             % len(new))
        simple = new
    capacity = len(simple)
    bloom = pydablooms.Dablooms(capacity=capacity,
                                error_rate=error_rate,
                                filepath=bloom_filename)
    for i, fragment in enumerate(simple):
        bloom.add(fragment, i)  # pydablooms' add takes a key plus an integer id
    bloom.flush()
    sys.stderr.write(
        "Set and bloom filter of %i-mers created (%i k-mers considered, %i unique)\n"
        % (kmer, count, len(simple)))
    sys.stderr.write(
        "Using Bloom filter with capacity %i and error rate %r\n" %
        (capacity, error_rate))
    sys.stderr.write("Building filters took %0.1fs\n" % (time.time() - t0))
    return simple, bloom
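Downstream, a read is typically kept when any of its k-mers hits the filter; a hedged sketch of that screen (this helper is an assumption about how the returned pair is consumed, not code from the original):

def read_matches(read_seq, kmer, bloom):
    # A read is a candidate if any of its k-mers is (probably) in the filter.
    seq = read_seq.upper()
    return any(bloom.check(seq[i:i + kmer])
               for i in range(len(seq) - kmer + 1))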
Example #7

# -*- coding: utf-8 -*-

# Add the weibo statuses already in master_timeline to the dablooms set

import pydablooms
import time
from utils4scrapy.tk_maintain import _default_mongo

MONGOD_HOST = 'localhost'
MONGOD_PORT = 27017
DABLOOMS_CAPACITY = 2000000000
DABLOOMS_ERROR_RATE = .001
DABLOOMS_FILEPATH = '/opt/scrapy_weibo/scrapy_weibo/bloom.bin'
#DABLOOMS_FILEPATH = '/tmp/bloom.bin'

bloom = pydablooms.Dablooms(capacity=DABLOOMS_CAPACITY,
                            error_rate=DABLOOMS_ERROR_RATE,
                            filepath=DABLOOMS_FILEPATH)
db = _default_mongo(MONGOD_HOST, MONGOD_PORT, usedb='master_timeline')

for status in db.master_timeline_weibo.find():
    bloom.add(status['mid'], int(time.time() * 1000))  # id: current time in ms
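Once seeded, a scraper can skip statuses whose mid is already in the filter; a minimal sketch (the helper below is illustrative, not part of the original script):

# Illustrative dedup check before fetching or storing a status.
def is_new(mid):
    return not bloom.check(mid)  # a false positive only skips a status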
Example #8
import sys

import pydablooms

capacity = 100000
error_rate = 0.05

print("pydablooms version: %s" % pydablooms.__version__)

if len(sys.argv) != 3:
    sys.stderr.write("Usage: %s <bloom_file> <words_file>\n" % sys.argv[0])
    sys.exit(1)

bloom_fname = sys.argv[1]
words_fname = sys.argv[2]

bloom = pydablooms.Dablooms(capacity=capacity,
                            error_rate=error_rate,
                            filepath=bloom_fname)

words_file = open(words_fname, 'rb')
i = 0
for line in words_file:
    bloom.add(line.rstrip(), i)
    i += 1

words_file.seek(0)
i = 0
for line in words_file:
    if i % 5 == 0:
        bloom.delete(line.rstrip(), i)  # counting filter, so deletes are supported
    i += 1
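A verification pass can then count how many kept words the filter still reports, and how many deleted words leak back as false positives; a hedged sketch modeled on the dablooms test suite:

# Sketch of a verification pass over the same word list.
words_file.seek(0)
found = missed = false_pos = 0
i = 0
for line in words_file:
    word = line.rstrip()
    if i % 5 == 0:  # deleted above; any hit now is a false positive
        if bloom.check(word):
            false_pos += 1
    elif bloom.check(word):
        found += 1
    else:
        missed += 1
    i += 1
words_file.close()
print("found: %d, missed: %d, false positives: %d" % (found, missed, false_pos))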