def test():
    """Basic smoke test: build a sketch in memory, export it, re-import it,
    compare the two, and confirm that a bad initialization raises."""
    print('build in memory check')
    cms = CountMinSketch(width=100000, depth=7)
    # insert 100 keys with increasing counts
    for i in range(100):
        cms.add(str(i), 100 * (i + 1))
    print(cms.check(str(0), 'min'))
    print(cms.check(str(0), 'mean'))
    print(cms.check(str(0), 'mean-min'))
    cms.export('./dist/py_test.cms')

    print('import from disk check')
    cmsf = CountMinSketch(filepath='./dist/py_test.cms')
    if cms.width != cmsf.width:
        print('width does not match!')
    if cms.depth != cmsf.depth:
        print('depth does not match!')
    print(cmsf.check(str(0), 'min'))
    print(cmsf.check(str(0), 'mean'))
    print(cmsf.check(str(0), 'mean-min'))

    try:
        print('\n\nTest invalid initialization')
        # neither width/depth nor a filepath: must raise
        cms_ex = CountMinSketch()
    except SyntaxError as ex:
        print(ex)
def show_statistics(): while True: H1 = copy.deepcopy(H) distinctFlows1 = copy.deepcopy(distinctFlows) N1 = copy.deepcopy(N) depth = 10 width = 40000 hash_functions = [hash_function(i) for i in range(depth)] sketch1 = CountMinSketch(depth, width, hash_functions, M=N1) for fp_key in H1: ef = H1[fp_key][0] rf = H1[fp_key][1] df = H1[fp_key][2] sketch1.add(fp_key, rf + df + ef) time.sleep(1) sketch = CountMinSketch(depth, width, hash_functions, M=N) for fp_key in H: ef = H[fp_key][0] rf = H[fp_key][1] df = H[fp_key][2] sketch.add(fp_key, rf + df + ef) top_flows = get_top_flows(sketch, sketch1) system('clear') print " flow rate" for flow in top_flows: print "#" + str(flow[0]) + " :: " + str(flow[1]) + "b/s"
def simulate_sfcmon(df, num_chunks):
    """Split *df* into chunks and run SFCMon heavy-hitter detection on each.

    Returns a dict with two sets of source IPs: those heavy in packet
    count ("packets") and those heavy in byte volume ("size").
    """
    print("#################### PROCESSING CHUNKS #########################")
    num_packets = df.shape[0]
    print("# Number of packets = {}".format(num_packets))
    # NOTE(review): true division -- chunk size may be a float; assumes
    # split() accepts that. TODO confirm against the helper's signature.
    chunks = split(df, num_packets / num_chunks)
    results = {"packets": set(), "size": set()}
    for chunk in chunks:
        print("# Chunk Data: {}; {}".format(chunk.shape, chunk.index))
        # one sketch counts packets per source IP, the other byte volume
        sketch_packets = CountMinSketch(5436, 5)  # width=5436, depth=5
        sketch_size = CountMinSketch(5436, 5)
        count_packets = 0
        count_size = 0
        for src_ip, pkt_size in zip(chunk["SrcIP"], chunk["Size"]):
            sketch_packets.add(src_ip)
            sketch_size.add(src_ip, value=pkt_size)
            count_packets += 1
            count_size += pkt_size
            # only flag heavy hitters after a warm-up of num_rows_to_start rows
            if count_packets > num_rows_to_start:
                if sketch_packets[src_ip] > count_packets * bound:
                    results["packets"].add(src_ip)
                if sketch_size[src_ip] > count_size * bound:
                    results["size"].add(src_ip)
    return results
def runner(input_file_name, config):
    """Build a CountMinSketch over the integers listed in *input_file_name*.

    Sketch dimensions follow Cormode & Muthukrishnan: width m = ceil(e/eps)
    and depth d = ceil(ln(1/delta)), taken from config["eps"] and
    config["delta"].

    Returns a dict with the per-item estimated counts ("acc"), the elapsed
    insertion time ("time"), and a "space" placeholder (always 0).
    """
    # read the list of integers, one per line
    with open(input_file_name) as f:
        lst = [int(line.strip()) for line in f]

    m = math.ceil(math.e / config["eps"])
    # bug fix: `ln` is not a defined/stdlib name -- use math.log, the
    # natural logarithm, which is what ln(1/delta) means here
    d = math.ceil(math.log(1 / config["delta"]))
    sketch = CountMinSketch(m, d)

    # measure the running time of the insertions only
    startTime = time.time()
    for num in lst:
        sketch.add(num)
    endTime = time.time()

    # query the estimated frequency of every item (queries are untimed)
    result = {}
    for num in lst:
        result[num] = sketch[num]

    return {
        "acc": result,
        "time": endTime - startTime,
        "space": 0,
    }
def runner(input_file_name, config):
    """Measure CountMinSketch estimation error on a synthetic Zipf stream.

    The input file is ignored; a stream of config["n"] items with skew
    config["skew"] is generated instead, using the current time as seed.
    """
    # read the list of integers
    # with open(input_file_name) as f:
    #     lst = list(map(lambda l: int(l.strip()), f.readlines()))
    sketch = CountMinSketch(config["m"], config["d"])
    seed = int(time.time() * 1000) % 2**32
    lst = AxProf.zipfGenerator(config["n"], config["skew"], seed)

    # measure the running time of the insertions
    startTime = time.time()
    for num in lst:
        sketch.add(num)
    endTime = time.time()

    # absolute overestimate of the sketch versus exact counts, per item
    actual_map = actual_count(lst)
    error_map = {num: abs(actual_map[num] - sketch[num]) for num in set(lst)}

    return {
        "input": lst,
        "acc": error_map,
        # cost is reported as m*d rather than wall time (endTime - startTime)
        "time": config["m"] * config["d"],
        "space": 0,
    }
def update_statistics(): global N, H depth = 10 width = 40000 hash_functions = [hash_function(i) for i in range(depth)] sketch = CountMinSketch(depth, width, hash_functions, M=N) for fp_key in H: ef = H[fp_key][0] rf = H[fp_key][1] df = H[fp_key][2] sketch.add(fp_key, rf + df + ef) system('clear') flows_to_display = [] for flow in distinctFlows: flows_to_display.append((flow, sketch.query(flow))) for flow in H.keys(): flows_to_display.append((flow, sketch.query(flow))) top_flows = sorted(flows_to_display, key=lambda x: x[1], reverse=True)[0:20] for flow in top_flows: print flow print "Total flows:" + str(len(distinctFlows) + len(H.keys()))
def test_counts_overestimate(self):
    """A count-min sketch may only over-estimate true frequencies, never
    under-estimate them."""
    text = open(__file__).read()
    exact = Counter(text)
    sketch = CountMinSketch(10, 5)
    for ch in text:
        sketch.add(ch)
    # every estimated count must be >= the exact count
    for ch in set(text):
        self.assertGreaterEqual(sketch[ch], exact[ch])
def test_simple_usage(self):
    """Adding one key N times makes query() report N (and len() too)."""
    N = 1000
    sketch = CountMinSketch(10, 5)
    for _ in xrange(N):
        sketch.add("a")
    # the added key counts N; an unseen key counts 0
    self.assertEqual(sketch.query("a"), N)
    self.assertEqual(sketch.query("b"), 0)
    # len() is the total number of additions
    self.assertEqual(len(sketch), N)
def aggregate_unit(self, data_block):
    """Absorb one full time unit of data and refresh the dyadic sketch list.

    data_block: iterable of the items that arrived in a single time unit.
    After the call, cm_sketch_list[i] covers the most recent 2^i time
    units and aggregate_score holds sum_i cm_sketch_list[i] / 2^i.
    """
    # update time once per unit -- this tracks the current time unit
    self.t += 1
    # convert the data_block into a CM sketch
    accumulator = CMSketch(self.m, self.d)
    # reset the present: data_block IS the present. While this data was
    # coming in, a separate structure held exact sub-unit frequencies
    # that could be queried directly.
    self.present = CMSketch(self.m, self.d)
    for data in data_block:
        # each appearance counts with frequency 1
        accumulator.add(data, 1)
        # update present as we update the accumulator
        self.present.add(data, 1)
    # scores are stale while we rebuild the aggregate (M_bar) below
    self.ready = False

    # l = max over all i such that (t mod 2^i) == 0
    # efficient -- takes log t time to find at worst
    def find_l(t):
        l = 0
        if t == 0:
            return l
        while t % 2 == 0:
            l += 1
            # NOTE(review): true division -- t becomes a float under
            # Python 3; the loop still terminates but // would be cleaner
            t = t/2
        return l

    # merge up to index find_l + 1, capped at the number of sketches n
    for i in range(min(find_l(self.t) + 1, self.n)):
        # add the appropriate value: A + (1/2)^i * (M_bar - M^j)
        # first form M_bar - M^j
        difference = sketch_sum(accumulator, sketch_scalar_product(self.cm_sketch_list[i], -1))
        # A = A + (1/2)^i * difference
        self.aggregate_score = sketch_sum(self.aggregate_score, sketch_scalar_product(difference, pow(0.5, i)))
        # temporary storage of the running dyadic merge
        T = deepcopy(accumulator)
        # aggregate into accumulator for next round
        accumulator = sketch_sum(accumulator, self.cm_sketch_list[i])
        # set the value
        self.cm_sketch_list[i] = T
    # now we're ready to use CM-sketch values
    self.ready = True
    # reset the present now that we're done with one time block
    self.present = CMSketch(self.m, self.d)
def plot_error(lst, m, d, color=None, label=None):
    """Histogram the absolute count-min estimation errors over *lst*
    for a sketch of width *m* and depth *d*."""
    sketch = CountMinSketch(m, d)
    actual_map = actual_count(lst)
    for item in lst:
        sketch.add(item)
    # the sketch can only over-estimate, so each error is >= 0
    errors = [abs(actual_map[item] - sketch[item]) for item in set(lst)]
    print(len(errors))
    plot.hist(errors, bins=len(errors) // 100, color=color, label=label)
    plot.xlim(0, int(len(lst) * 0.1))
def simulate_rtp4mon(df, bound, pkts_to_start):
    """Detect heavy-hitter source IPs (and their full 5-tuples) in *df*.

    A source IP is a heavy hitter once its sketched packet count exceeds
    bound * total packets seen, checked only after pkts_to_start packets.
    """
    print("# " + str(datetime.datetime.now()) + " - Begin simulation RTP4Mon...")
    results = {"packets": set(), "5t_packets": set()}
    sketch_packets = CountMinSketch(5436, 5)  # width=5436, depth=5
    count_packets = 0
    five_tuples = zip(df["SrcIP"], df["DstIP"], df["SrcPort"],
                      df["DstPort"], df["Proto"])
    for five_flow_id in five_tuples:
        flow_id = five_flow_id[0]  # source IP only
        sketch_packets.add(flow_id)
        count_packets += 1
        # only flag heavy hitters after the warm-up period
        if count_packets > pkts_to_start:
            if sketch_packets[flow_id] > count_packets * bound:
                results["packets"].add(flow_id)
                results["5t_packets"].add(five_flow_id)
    print("# " + str(datetime.datetime.now()) + " - End simulation RTP4Mon...")
    return results
def worker(index, path): global counter """ :param index: the index of the dump this worker should work on. :return: """ print "Process %d start processing" % index with open("%s/wiki_0%s" % (path, index), "r") as f: batch = Counter() batch_limit = 10000 sketch = CountMinSketch(DEPTH, WIDTH, HASH_FUNCTIONS) current = datetime.now().date() for line in f: # Extrat timestamp from header if line[:4] == "<doc": m = TIMESTEMP_RE.search(line) if m: current = datetime.strptime(m.group(1), "%Y-%m-%dT%H:%M:%SZ").date() continue elif line[:5] == "</doc>": continue else: for pair in map(lambda word: (current, word.lower()), WORD_RE.findall(line)): batch[pair] += 1 if len(batch) > batch_limit: for key, count in batch.iteritems(): sketch.add(key, count) batch.clear() counter.value += 1 if counter.value % 10000 == 0: print "Processed %s lines" % counter.value for key, count in batch.iteritems(): sketch.add(key, count) batch.clear() print "Process %d finished" % index return sketch.get_matrix()
def build_countminsketch(ksup, w=1000, h=10):
    """Build and return a CountMinSketch over every kmer from *ksup*.

    :param ksup: KmerSupplier providing iterkmers()
    :param w: sketch width (table size)
    :param h: sketch depth (number of hash functions)
    """
    sketch = CountMinSketch(w, h)
    for kmer in ksup.iterkmers():
        sketch.add(kmer)
    return sketch
def test_syntax_sugar(self):
    """sketch[key] is equivalent to sketch.query(key), both before and
    after the key has been added."""
    sketch = CountMinSketch(10, 5)
    # unseen key
    self.assertEqual(sketch.query("a"), sketch["a"])
    # seen key
    sketch.add("a")
    self.assertEqual(sketch.query("a"), sketch["a"])
class History(object):
    """Multi-resolution frequency history built from count-min sketches.

    Maintains n CM-sketches M0..Mn at dyadic time resolutions, a
    "present" sketch for the current (partial) time unit, and an
    aggregate_score sketch A = sum_j M^j / 2^j that answers decayed
    frequency queries in O(1) sketch lookups.
    """

    def __init__(self, n, m, d):
        # time counter (update for each new unit)
        self.t = 0
        # n is number of CM sketches
        self.n = n
        # m is size of array for each hash function
        self.m = m
        # d is number of hash functions
        self.d = d
        # present is a count-min sketch containing sub-unit time counts
        # of indexes.
        self.present = CMSketch(m, d)
        # ready is a t/f value: whether to use aggregate_score alone, or
        # fall back to the present while the aggregate weighted score is
        # being recomputed
        self.ready = False
        # CM-sketch tracking the aggregate weighted score
        #   A = sum{j = 1 to log T} (M^j / 2^j)
        # (the present is added separately); kept current at every time
        # interval, initialized to zero
        self.aggregate_score = CMSketch(m, d)
        # n count-min sketches retaining resolutions 1, 2, 4, ..., 2^n;
        # sketch i rolls over when its time span of 2^i units is filled
        self.cm_sketch_list = []
        for i in range(n):
            self.cm_sketch_list.append(CMSketch(m, d))

    def update_present_only(self, datum):
        """Record *datum* as a sub-unit (partial time unit) observation."""
        # don't update the full time -- this is a sub-unit update
        self.ready = False
        self.present.add(datum, 1)

    # data_block is a block of data, presented as an iterable object;
    # it consists of all data that arrived in a single time unit.
    # Implements algorithm 2 from the paper.
    # This structure maintains n CM-sketches, M0, M1, ..., Mn:
    #   M0 always holds [t-1, t] where t is current time
    #   M1 always holds [t - tmod2 - 2, t - tmod2]
    #   ...
    #   Mn always holds [t - tmod(2^n) - 2^n, t - tmod(2^n)]
    # for t = 8, for example:
    #   M0: [7, 8]; M1: [6, 8]; M2: [4, 8]; M3: [0, 8]; rest: 0
    def aggregate_unit(self, data_block):
        """Absorb one full time unit of data and refresh the sketch list."""
        # update time once per unit -- this tracks the current time unit
        self.t += 1
        # convert the data_block into a CM sketch
        accumulator = CMSketch(self.m, self.d)
        # reset the present: data_block IS the present. While this data
        # was coming in, a separate structure held the exact sub-unit
        # frequencies for direct queries.
        self.present = CMSketch(self.m, self.d)
        for data in data_block:
            # each appearance counts with frequency 1
            accumulator.add(data, 1)
            # update present as we update the accumulator
            self.present.add(data, 1)
        # scores are stale while we rebuild the aggregate (M_bar) below
        self.ready = False

        # l = max over all i such that (t mod 2^i) == 0
        # efficient -- takes log t time to find at worst
        def find_l(t):
            l = 0
            if t == 0:
                return l
            while t % 2 == 0:
                l += 1
                # NOTE(review): true division -- t becomes a float under
                # Python 3; terminates anyway, but // would be cleaner
                t = t/2
            return l

        # merge up to index find_l + 1, capped at the number of sketches n
        for i in range(min(find_l(self.t) + 1, self.n)):
            # add the appropriate value: A + (1/2)^i * (M_bar - M^j)
            # first form M_bar - M^j
            difference = sketch_sum(accumulator, sketch_scalar_product(self.cm_sketch_list[i], -1))
            # A = A + (1/2)^i * difference
            self.aggregate_score = sketch_sum(self.aggregate_score, sketch_scalar_product(difference, pow(0.5, i)))
            # temporary storage of the running dyadic merge
            T = deepcopy(accumulator)
            # aggregate into accumulator for next round
            accumulator = sketch_sum(accumulator, self.cm_sketch_list[i])
            # set the value
            self.cm_sketch_list[i] = T
        # now we're ready to use CM-sketch values
        self.ready = True
        # reset the present now that we're done with one time block
        self.present = CMSketch(self.m, self.d)

    # the summed values live in their own count-min sketch (call it A),
    # updated in sync, so queries need not spend log T time summing;
    # this value provides the key for our heap
    def query_slow(self, x):
        """O(n) decayed score: present plus sum_i M^i(x) / 2^i."""
        return self.present.query(x) + sum(pow(0.5, i) * self.cm_sketch_list[i].query(x) for i in range(self.n))

    # the 'scores' were stored termwise in a CMSketch (matrix addition
    # and scalar multiplication), so taking its minimum is exactly
    # equivalent to the sum over the per-sketch minimums
    def query(self, x):
        """O(1) decayed score via the precomputed aggregate sketch."""
        if self.ready:
            return self.aggregate_score.query(x)
        else:
            # only if we're not ready: include the in-progress present
            return self.present.query(x) + self.aggregate_score.query(x)
def test_add_greater_than_one(self):
    """add(key, value) bumps the estimate by value, not by one."""
    increment = 123
    sketch = CountMinSketch(10, 5)
    sketch.add("a", increment)
    self.assertEqual(sketch.query("a"), increment)
stock_trade_filename = (source_data_dir + stock_etf + "_trade_" +
                        time_interval + str(no_of_record) + '.csv')

crossref_not_present_count = 0
crossref_present_count = 0

# CrossRef CMS Process Starts
#print('CrossRef CMS Process Starts Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
crossref_stock_trade_frq_cms = CountMinSketch(100000, 10)

stock_trade_file = open(stock_trade_filename, "r")
stock_trade_lines = csv.reader(stock_trade_file, delimiter=',',
                               quoting=csv.QUOTE_ALL, skipinitialspace=True)
next(stock_trade_file)  # skip the CSV header row

# CrossRef CMS Create: add every traded symbol to the sketch
#print('CrossRef CMS Create Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
for stock_trade_line in stock_trade_lines:
    stock_symbol = stock_trade_line[0].strip()
    add1 = crossref_stock_trade_frq_cms.add(stock_symbol)
stock_trade_file.close()

# CrossRef CMS Membership Check: count symbols present / absent in the sketch
#print('CrossRef CMS Membership Check Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
check_stock_symbol_file = open(check_stock_symbol_filename, "r")
for stock_symbol_line in check_stock_symbol_file:
    stock_symbol = stock_symbol_line.strip()
    stock_frq = crossref_stock_trade_frq_cms.check(stock_symbol)
    if stock_frq == 0:
        crossref_not_present_count += 1
    else:
        crossref_present_count += 1
check_stock_symbol_file.close()

# CrossRef CMS Process Completed
#print('CrossRef CMS Process Completed Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
stock_vol_apds = CountMinSketch(width, depth)
stock_trade_record_count = 0
vol_sketch_time = 0

# add elements to the sketch, timing only the sketch.add() calls
for stock_trade_line in stock_trade_lines:
    #print('stock_trade_record_count:',stock_trade_record_count,' no_of_record:',no_of_record)
    if stock_trade_record_count >= no_of_record:
        break
    stock_trade_record_count += 1

    stock_symbol = stock_trade_line[0].strip()
    trade_date = stock_trade_line[1].strip()
    stock_vol = int(stock_trade_line[7].strip())
    #stock_input_file.write(stock_symbol + "," + str(trade_date) + "," + str(stock_vol) + "\n")

    vol_sketch_starttime = time.process_time()
    apds_cmsadded = stock_vol_apds.add(stock_symbol, stock_vol)
    vol_sketch_endtime = time.process_time()
    vol_sketch_time += vol_sketch_endtime - vol_sketch_starttime
    #print(vol_sketch_time,vol_sketch_endtime,vol_sketch_starttime)

    # maintain exact per-symbol frequency / volume tallies as ground truth
    if stock_symbol in stock_vol_dist:
        stock_symbol_freq = stock_freq_dist[stock_symbol] + 1
        stock_symbol_vol = stock_vol_dist[stock_symbol] + stock_vol
    else:
        stock_symbol_freq = 1
        stock_symbol_vol = stock_vol
    stock_freq_dist[stock_symbol] = stock_symbol_freq
    stock_vol_dist[stock_symbol] = stock_symbol_vol
def CountMin_Sketch(stream, k, h):
    """Return a CountMinSketch of width *k* and depth *h* populated with
    every element of *stream*."""
    sketch = CountMinSketch(k, h)
    for element in stream:
        sketch.add(element)
    return sketch
tp += 1 else: tn += 1 if (g_it.target[i] == 1): coef = (part_minus + 1.0) / (part_plus + part_minus + 1.0) part_plus += 1 else: coef = (part_plus + 1.0) / (part_plus + part_minus + 1.0) part_minus += 1 tao = min(C, max(0.0, ((1.0 - g_it.target[i] * dot) * coef) / module)) if (tao > 0.0): for row, col in zip(rows, cols): ((row, col), ex[row, col]) #print col, ex[row,col] WCMS.add(col, g_it.target[i] * tao * ex[row, col]) #print "Correct prediction example",i, "pred", score, "target",g_it.target[i] if i % 50 == 0 and i != 0: #output performance statistics every 50 examples if (tn + fp) > 0: pos_part = float(fp) / (tn + fp) else: pos_part = 0 if (tp + fn) > 0: neg_part = float(fn) / (tp + fn) else: neg_part = 0 BER = 0.5 * (pos_part + neg_part) print "1-BER Window esempio ", i, (1.0 - BER)
#-
# Frequency using CSK
#-
from countminsketch import CountMinSketch

# Initialize: table size=1000, hash functions=10
ds = CountMinSketch(1000, 10)

# Add: 1 twice, 2 once
for item in (1, 2, 1):
    ds.add(item)

# Test: estimated counts match (3 was never added)
assert ds[1] == 2
assert ds[2] == 1
assert ds[3] == 0
len(pickle.dumps(Heavyhitters))) #%%CountMinSketch #pip install countminsketch if necessary from countminsketch import CountMinSketch sketch20_1 = CountMinSketch(20000, 1) # table size=20000, hash functions=1 sketch40_1 = CountMinSketch(40000, 1) sketch40_2 = CountMinSketch(40000, 2) sketch20_2 = CountMinSketch(20000, 2) sketch10_3 = CountMinSketch(10000, 3) sketch50_4 = CountMinSketch(50000, 4) for index, row in ratings.iterrows(): sketch20_1.add(row['movieId']) sketch40_1.add(row['movieId']) sketch40_2.add(row['movieId']) sketch20_2.add(row['movieId']) sketch10_3.add(row['movieId']) sketch50_4.add(row['movieId']) print( "Using a table of size 10000 and 3 hash functions estimated frequency of \n movie 296 is: ", sketch10_3[296]) print() print("\n Memory size (table of size 10000 and 3 hash functions) in bytes= ", len(pickle.dumps(sketch10_3))) print() print(