Example no. 1
def __init__(self, **kwargs):
    # forward kwargs to the base spider
    super(CarSpider, self).__init__(**kwargs)
    # problem report
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    # Mongo
    settings.set('CrawlCar_Num', carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'usedcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    # mysql
    mysqldb = MySQLdb.connect("192.168.1.94", "root", "Datauser@2017", "usedcar", port=3306)
    mysqldbc = mysqldb.cursor()
    # read the url column and deduplicate it through the Bloom filter
    mysqldbc.execute("select newcarurl from che58")
    items = mysqldbc.fetchall()
    self.urllist = []
    df = pybloom.BloomFilter(carnum, 0.01)
    for i in items:
        j = i[0]
        md5i = hashlib.md5(j).hexdigest()  # hex digest, so equal urls map to equal keys
        rf = df.add(md5i)
        if not rf:  # add() returns False when the key was new
            self.urllist.append(j)
Example no. 2
	def process_loops(self, node, context):
		if "path" not in context:
			context["path"] = []

		if "loop?" not in context:
			context["loop?"] = False

		if "detection" not in context:
			context["detection"] = 0

		if "bf" not in context:
			context["bf"] = pb.BloomFilter(self.capacity, self.error_rate)

		if "loopstart" not in context:
			try:
				context["loopstart"] = context["path"].index(node)
				context["loopsize"] = len(context["path"]) - context["loopstart"]
			except ValueError:
				pass

		if node in context["bf"]:
			context["detection"] += 1
			if context["detection"] >= self.detections:
				context["loop?"] = True
				return False

		context["path"].append(node)
		context["bf"].add(node)

		return True
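
The method above pairs an exact path list with a Bloom filter: the filter answers "probably seen before" cheaply, and the path list recovers the exact loop start and length. A minimal self-contained sketch of the same idea (the function name and node values are illustrative, not from the original):

import pybloom as pb

def find_loop(nodes, capacity=1000, error_rate=0.001, detections=2):
    # the Bloom filter gives a cheap probabilistic revisit check;
    # the path list recovers the exact loop position afterwards
    path = []
    bf = pb.BloomFilter(capacity, error_rate)
    hits = 0
    for node in nodes:
        if node in bf:
            hits += 1
            if hits >= detections:
                start = path.index(node)
                return start, len(path) - start
        path.append(node)
        bf.add(node)
    return None

print(find_loop(["s1", "s2", "s3", "s2", "s3", "s2"]))  # (2, 2)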
Example no. 3
	def report(self, oneline=False):
		nl = "," if oneline else "\n"

		# build a filter only to read back its derived parameters
		bf = pb.BloomFilter(self.capacity, self.error_rate)
		# Python 2 prints; the trailing commas suppress the newline
		print self.__class__.__name__, nl,
		print self.pcsv("Null:"), "--", nl,
		print self.pcsv("Cap:"), self.capacity, nl,
		print self.pcsv("Rate:"), self.error_rate, nl,
		print self.pcsv("Hashes:"), bf.num_slices, nl,
		print self.pcsv("Mem:"), bf.num_bits + math.log(self.detections, 2), self.pcsv("bits"), nl,
		super(self.__class__, self).report(oneline)
Example no. 4
    def get_bf(self, w_size, offset):
        bitshred = pybloom.BloomFilter(capacity=10000, error_rate=0.001)
        # walk backwards from the offset, hashing each w_size-opcode window
        for i in range(-1 + offset, (len(self.opcodes) - w_size) * -1, -1):
            window = ""
            for asm in self.opcodes[i:i - w_size:-1]:
                window += asm + "\n"
            #print("dbg: window:\n%s" % window)
            #self.dbg_windows.append(window)
            bitshred.add(window)

        return bitshred
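
Bitshred-style window filters like the one above are usually compared with a bitwise Jaccard similarity over the raw bit arrays. A hedged sketch, assuming the classic pybloom implementation, whose BloomFilter exposes its underlying bitarray attribute:

import pybloom

def bf_jaccard(a, b):
    # Jaccard similarity on the set bits of two same-sized filters
    inter = (a.bitarray & b.bitarray).count()
    union = (a.bitarray | b.bitarray).count()
    return float(inter) / union if union else 0.0

a = pybloom.BloomFilter(capacity=10000, error_rate=0.001)
b = pybloom.BloomFilter(capacity=10000, error_rate=0.001)
for op in ("push ebp", "mov ebp, esp", "sub esp, 8"):
    a.add(op)
    b.add(op)
b.add("ret")
print(bf_jaccard(a, b))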
Example no. 5
def Bloom_Init(mysqltable,
               collection,
               mysqldbc,
               bfrate=0.001,
               keycol="statusplus"):
    # size the filter with ~10% headroom over the Mongo collection
    num = int(collection.count() * 1.1)
    df = pybloom.BloomFilter(capacity=num, error_rate=bfrate)
    # read the key column (table/column names are trusted internal values)
    mysqldbc.execute("select " + keycol + " from " + mysqltable)
    items = mysqldbc.fetchall()
    for i in items:
        item = hashlib.md5(i[0]).hexdigest()
        df.add(item)
    return df
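
A usage sketch for Bloom_Init; the connection objects, credentials, and table/collection names below are placeholders, not from the original:

import hashlib
import MySQLdb
import pymongo

mysqldb = MySQLdb.connect("localhost", "user", "password", "usedcar", port=3306)
mysqldbc = mysqldb.cursor()
collection = pymongo.MongoClient()["usedcar"]["che58"]

df = Bloom_Init("che58", collection, mysqldbc)

# membership tests must mirror how the filter was populated:
# md5 hex digest of the key column value (Python 2 strings, as in the original)
if hashlib.md5("some-status-value").hexdigest() in df:
    print("already seen")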
Example no. 6
def __init__(self, n=10000):
    self.count = 0
    self.max_size = 100
    self.cache = queue.Queue(0)  # maxsize 0 means unbounded
    self.pageset = pybloom.BloomFilter(n)
    self.lock = threading.Lock()
    self.url_header = {
        'User-Agent': 'Mozilla/5.0 (compatible; '
        'Googlebot/2.1; +http://www.google.com/bot.html)',
        'Proxy-Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language':
        'zh-SG,zh;q=0.9,zh-CN;q=0.8,en;q=0.7,zh-TW;q=0.6'
    }
    self.start_time = time.time()
    self.logger = utils.get_logger("cadre.spider")
    self.open_spider()
Example no. 7
def main():
    setup = json.load(open(sys.argv[1], 'r'))
    feeds = json.load(open(sys.argv[2], 'r'))
    bloom = pb.BloomFilter(1000000)
    bloomloc = sys.argv[3]
    try:
        # tofile/fromfile use a packed binary format, so open in binary mode
        bloom = pb.BloomFilter.fromfile(open(bloomloc, 'rb'))
    except Exception:
        # no saved filter (or an unreadable one): start fresh
        print "starting over"
    z = ""
    for x, y in feeds.iteritems():
        temp_title = "<h1>" + x + "</h1><br>\n "
        temp_feeds = feeds_to_html(parse_feeds(y, bloom))
        if len(temp_feeds) > 0:
            z += temp_title + temp_feeds
        print x, len(temp_feeds)
    if len(z) > 0:
        send_email("RSS digest",
                   "<body>\n" + z + "</body>",
                   setup)
    bloom.tofile(open(bloomloc, 'wb'))
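
The load/save round trip above relies on pybloom's fromfile/tofile, which read and write a packed binary representation, so binary file modes matter. A minimal sketch of the pattern (the path is a placeholder):

import pybloom as pb

path = "seen.bloom"
try:
    with open(path, "rb") as fh:
        bloom = pb.BloomFilter.fromfile(fh)
except IOError:
    bloom = pb.BloomFilter(1000000)  # no saved state yet: start fresh

bloom.add("http://example.com/item/1")

with open(path, "wb") as fh:
    bloom.tofile(fh)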
Example no. 8
## Import external configuration if set

if args.config:
    dirname = os.path.dirname(args.config)
    basename = os.path.basename(args.config)
    modname = os.path.splitext(basename)[0]

    sys.path.insert(0, dirname)
    globals().update(importlib.import_module(modname).__dict__)

#
## Generate bf_error_rates if not set

if enbloomfilter:
    if not len(bf_error_rates):
        bf_num_bits_pairs = [(pb.BloomFilter(bf_capacity, p).num_bits, p)
                             for p in [(z + 1) / 1000000.
                                       for z in xrange(999999)]]
        bf_error_rates = [
            w for (i, (q, w)) in enumerate(bf_num_bits_pairs)
            if q < bf_num_bits_pairs[i - 1][0]
        ]

#
## Override number of runs if set

if args.runs:
    packets = args.runs

#
## Create topology if necessary

if topoloops or topopaths:
    topo = Topology.load(topofile, parser=topoparser, create_hosts=True,
                         allcycles=True, directed=False)
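
The bf_error_rates comprehension sweeps candidate error rates from 1e-6 upward and keeps only those whose filter shrinks below the previous candidate's num_bits, i.e. the rates that yield distinct memory sizes. The same idea at a smaller scale (a sketch; note it keeps the original's i - 1 wraparound for the first element):

import pybloom as pb

capacity = 1000
pairs = [(pb.BloomFilter(capacity, p).num_bits, p)
         for p in [(z + 1) / 1000. for z in range(999)]]
# keep a rate only when its bit count drops below the previous candidate's
rates = [w for (i, (q, w)) in enumerate(pairs) if q < pairs[i - 1][0]]
print(len(rates), rates[:5])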
Example no. 10
def __init__(self):
    # pybloom's first argument is the capacity
    self.scrawled_urls = pybloom.BloomFilter(capacity=10000000)
Example no. 11
# prng = np.random.RandomState(1)
# pt = prng.rand(1, 1000000)
#
def check(items, blf):
    # count items not previously in the filter; stock pybloom's add()
    # returns a plain bool: True if the key was (probably) already present
    count = 0
    print('xx:', type(items), len(items))
    i = 0
    for x in items:
        rf = blf.add(x)
        if i < 2:
            print('t:', rf)  # debug: show the first two add() results
            i += 1
        if not rf:
            count += 1
    print('w:', count)
    return count

f = pb.BloomFilter(capacity=1500000, error_rate=0.01)
pl1 = []
# f1 = pb.BloomFilter(capacity=1050, error_rate=0.001)
oldtime = time.time()
for x in plx:  # plx: input data, defined elsewhere in the original script
    bi0 = f.add(x)
newtime = time.time()
print('0:', (newtime - oldtime))
oldtime = time.time()
bi1 = f.add(1)
newtime = time.time()
# print('0:', oldtime)
# print('1:', newtime)
print('1:', (newtime - oldtime))
# for x in plx:
Example no. 12
import pybloom
import sys

if __name__ == "__main__":
    capacity = int(sys.argv[1])
    bf = pybloom.BloomFilter(capacity=capacity, error_rate=0.001)
    for line in sys.stdin:
        key = line.strip()
        # add() returns True if the key was already present, so this
        # echoes only the first occurrence of each line
        if not bf.add(key):
            print key
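
Example no. 12 echoes only the first occurrence of each stdin line, so it works as a streaming dedupe. The same pattern in-process, for comparison (the input list is illustrative):

import pybloom

bf = pybloom.BloomFilter(capacity=1000, error_rate=0.001)
lines = ["a", "b", "a", "c", "b"]
# add() returns True when the key was (probably) already present
unique = [ln for ln in lines if not bf.add(ln)]
print(unique)  # ['a', 'b', 'c'], modulo the filter's false-positive rate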
Example no. 13
import requests
import urllib.parse
from bs4 import BeautifulSoup
import threading
import queue
import os
import time
import pybloom

count = sum([
    len(x) for _, _, x in os.walk(os.path.dirname("../dsimage/file/"))
])  # index of file
MAX_SIZE = 10000  # stop when the count reaches MAX_SIZE
INIT_URL = "http://www.mm4000.com/"
lock_count = threading.Lock()
pageset = pybloom.BloomFilter(100000)
imgset = pybloom.BloomFilter(100000)
cache = queue.Queue(0)
stack = queue.LifoQueue(0)
stack.put(INIT_URL)
urlheader = {
    'User-Agent':
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Proxy-Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'DNT': '1',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-SG,zh;q=0.9,zh-CN;q=0.8,en;q=0.7,zh-TW;q=0.6'
}
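
The globals above only set up shared state; the worker logic is not part of this excerpt. A hedged sketch of how a thread might consume the stack and use the page filter (the control flow is an assumption, not the original code):

def worker():
    while True:
        url = stack.get()
        if pageset.add(url):  # True: this url was (probably) seen already
            stack.task_done()
            continue
        try:
            resp = requests.get(url, headers=urlheader, timeout=10)
            soup = BeautifulSoup(resp.text, "html.parser")
            for a in soup.find_all("a", href=True):
                stack.put(urllib.parse.urljoin(url, a["href"]))
        except requests.RequestException:
            pass  # error handling elided in this sketch
        stack.task_done()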
Example no. 14
def test_performance():

    import random
    import time

    n = 100000
    p = 0.001

    # create a set of n unique random strings
    strings = set()
    string_size = 50    # make this number higher
                        # if performance test is taking too long
    while len(strings) < n:
        string = ""
        for j in range(string_size):
            string += chr(random.randint(0, 255))
        strings.add(string)

    # create another set, disjoint from the first
    otherstrings = set()
    while len(otherstrings) < n:
        string = ""
        for j in range(string_size):
            string += chr(random.randint(0, 255))

        if string not in strings:
            otherstrings.add(string)

    print "[*] Strings created."

    ### 1) pybloom
    import pybloom
    bf1 = pybloom.BloomFilter(capacity=n, error_rate=p)

    ### 2) pybloomfilter
    import pybloomfilter
    bf2 = pybloomfilter.BloomFilter(n, p)

    ### 3) bloompy
    import bloompy
    bf3 = bloompy.BloomFilter(capacity=n, error_rate=p)

    # add all strings once to each filter
    bfs = [("pybloom", bf1), ("pybloomfilter", bf2), ("bloompy", bf3)]
    for s in strings:
        for _, b in bfs:
            b.add(s)

    print "[*] Bloom filters to compare performance:\n %s\n\n" % bfs

    # then probe with the disjoint strings: every hit is a false positive
    print "[*] Now testing with %d unique strings and desired error rate of %f" % (n, p)
    print "[*] Performance results: "
    for name, bf in bfs:
        collisions = 0
        starttime = time.time()
        for string in otherstrings:
            if string in bf:
                collisions += 1
        elapsed = time.time() - starttime
        error_rate = float(collisions) / float(n)
        print "%s: %f seconds with error rate = %f" % (name, elapsed, error_rate)