Ejemplo n.º 1
0
def show_statistics():
    """Continuously display the current top flows.

    Each pass deep-copies the module-level ``H``, ``distinctFlows`` and ``N``,
    loads the copied ``H`` into one Count-Min Sketch, sleeps for one second,
    loads the live ``H`` (with the live ``N``) into a second sketch and hands
    both sketches to ``get_top_flows``. The terminal is then cleared and one
    line is printed per returned flow. The loop never terminates.
    """
    def _load(target, table):
        # Each entry contributes the sum of its first three fields.
        for key in table:
            ef = table[key][0]
            rf = table[key][1]
            df = table[key][2]
            target.add(key, rf + df + ef)

    while True:
        # State as it was at the start of the interval.
        earlier_table = copy.deepcopy(H)
        distinctFlows1 = copy.deepcopy(distinctFlows)  # copied, not read below
        earlier_total = copy.deepcopy(N)

        depth, width = 10, 40000
        hash_functions = [hash_function(row) for row in range(depth)]

        earlier_sketch = CountMinSketch(depth, width, hash_functions,
                                        M=earlier_total)
        _load(earlier_sketch, earlier_table)

        time.sleep(1)

        # State after the interval, read straight from the live globals.
        current_sketch = CountMinSketch(depth, width, hash_functions, M=N)
        _load(current_sketch, H)

        top_flows = get_top_flows(current_sketch, earlier_sketch)

        system('clear')
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(" flow                      rate")
        for flow in top_flows:
            print("".join(["#", str(flow[0]), " :: ", str(flow[1]), "b/s"]))
Ejemplo n.º 2
0
def test():
    """Exercise an in-memory sketch, an export/import round trip, and a bad init.

    Prints the 'min', 'mean' and 'mean-min' checks for key "0" before and
    after the sketch is exported to and re-read from ./dist/py_test.cms,
    reports width/depth mismatches, and finally prints the SyntaxError
    raised by constructing a sketch without arguments.
    """
    query_modes = ('min', 'mean', 'mean-min')
    probe = str(0)

    print('build in memory check')
    cms = CountMinSketch(width=100000, depth=7)
    # add elements: key "i" is inserted with weight 100 * (i + 1)
    for i in range(100):
        cms.add(str(i), 100 * (i + 1))

    for mode in query_modes:
        print(cms.check(probe, mode))
    cms.export('./dist/py_test.cms')

    print('import from disk check')
    cmsf = CountMinSketch(filepath='./dist/py_test.cms')
    for attribute, message in (('width', 'width does not match!'),
                               ('depth', 'depth does not match!')):
        if getattr(cms, attribute) != getattr(cmsf, attribute):
            print(message)

    for mode in query_modes:
        print(cmsf.check(probe, mode))

    try:
        print('\n\nTest invalid initialization')
        CountMinSketch()
    except SyntaxError as ex:
        print(ex)
Ejemplo n.º 3
0
def simulate_sfcmon(df, num_chunks):
    """Collect heavy-hitter flows chunk by chunk using Count-Min Sketches.

    ``df`` is cut into chunks with ``split(df, num_packets / num_chunks)``.
    Every chunk gets two fresh sketches keyed by the ``SrcIP`` column: one
    counts rows, the other accumulates the ``Size`` column. Once more than
    ``num_rows_to_start`` rows of the chunk have been processed, a flow is
    recorded when its estimate exceeds ``bound`` times the chunk's running
    total.

    Returns:
        dict with keys "packets" and "size", each mapping to the set of
        flow ids that crossed the corresponding threshold.
    """
    print("#################### PROCESSING CHUNKS #########################")
    num_packets = df.shape[0]
    print("# Number of packets = {}".format(num_packets))
    chunks = split(df, num_packets / num_chunks)
    results = {"packets": set(), "size": set()}
    for c in chunks:
        print("# Chunk Data: {}; {}".format(c.shape, c.index))
        # Both sketches are created with the arguments (5436, 5).
        # NOTE(review): presumably table size 5436 with 5 hash functions -
        # confirm against the CountMinSketch implementation in use.
        sketch_packets = CountMinSketch(5436, 5)
        sketch_size = CountMinSketch(5436, 5)
        count_size = 0
        flow_rows = zip(c["SrcIP"], c["Size"])
        for count_packets, (flow_id, size) in enumerate(flow_rows, start=1):
            sketch_packets.add(flow_id)
            sketch_size.add(flow_id, value=size)
            count_size += size
            if count_packets > num_rows_to_start:
                if sketch_packets[flow_id] > count_packets * bound:
                    results["packets"].add(flow_id)
                if sketch_size[flow_id] > count_size * bound:
                    results["size"].add(flow_id)
    return results
Ejemplo n.º 4
0
def test_merge():
    """Merging adds the other sketch's counts and leaves the other untouched."""
    word = 'hello'
    countmin1, countmin2 = CountMinSketch(0.1, 0.1), CountMinSketch(0.1, 0.1)
    for sketch in (countmin1, countmin2):
        sketch.increment(word)

    countmin1.merge(countmin2)

    # The merged-in sketch keeps its own single count.
    assert countmin2.estimate(word) == 1
    assert countmin1.estimate(word) == 2

    expected_top = {('hello', 2.0)}
    assert set(countmin1.top_10_dict.items()) == expected_top
Ejemplo n.º 5
0
def runner(input_file_name, config):
    """Feed a generated Zipf stream to a sketch and report per-item errors.

    ``input_file_name`` is accepted but not read; the input comes from
    ``AxProf.zipfGenerator(config["n"], config["skew"], seed)`` where the
    seed is derived from the current time.

    Returns:
        dict with the generated "input", "acc" (absolute error per distinct
        item), "time" (reported as ``config["m"] * config["d"]``, not the
        measured wall-clock time) and "space" (always 0).
    """
    sketch = CountMinSketch(config["m"], config["d"])

    seed = int(time.time() * 1000) % 2**32
    lst = AxProf.zipfGenerator(config["n"], config["skew"], seed)

    # The insertion loop is timed, but the measurement is not reported.
    started_at = time.time()
    for item in lst:
        sketch.add(item)
    finished_at = time.time()

    actual_map = actual_count(lst)
    error_map = {
        item: abs(actual_map[item] - sketch[item])
        for item in set(lst)
    }

    report = {}
    report["input"] = lst
    report["acc"] = error_map
    report["time"] = config["m"] * config["d"]
    report["space"] = 0
    return report
Ejemplo n.º 6
0
def runner(input_file_name, config):
    """Sketch the integers listed in a file and return the estimates.

    The file is read as one integer per line. The sketch dimensions are
    derived from the configuration: ``m = ceil(e / eps)`` and
    ``d = ceil(ln(1 / delta))``.

    Returns:
        dict with "acc" (sketch estimate per input value), "time" (seconds
        spent in the insertion loop) and "space" (always 0).
    """
    with open(input_file_name) as source:
        lst = [int(raw.strip()) for raw in source]

    table_size = math.ceil(math.e / config["eps"])
    num_hashes = math.ceil(ln(1 / config["delta"]))

    sketch = CountMinSketch(table_size, num_hashes)

    # Only the insertion loop is timed.
    started_at = time.time()
    for value in lst:
        sketch.add(value)
    elapsed = time.time() - started_at

    estimates = {value: sketch[value] for value in lst}

    return {"acc": estimates, "time": elapsed, "space": 0}
Ejemplo n.º 7
0
def update_statistics():
    """Rebuild a sketch from ``H`` and print the 20 largest flow estimates.

    A new Count-Min Sketch is filled from the module-level ``H``, the
    terminal is cleared, every flow in ``distinctFlows`` and every key of
    ``H`` is queried, and the 20 highest (flow, estimate) pairs are printed,
    followed by the combined number of flows.
    """
    global N, H

    depth, width = 10, 40000
    hash_functions = [hash_function(row) for row in range(depth)]
    sketch = CountMinSketch(depth, width, hash_functions, M=N)

    # Each entry contributes the sum of its first three fields.
    for key in H:
        ef = H[key][0]
        rf = H[key][1]
        df = H[key][2]
        sketch.add(key, rf + df + ef)

    system('clear')
    flows_to_display = [(flow, sketch.query(flow)) for flow in distinctFlows]
    flows_to_display.extend((flow, sketch.query(flow)) for flow in H.keys())

    ranked = sorted(flows_to_display, key=lambda pair: pair[1], reverse=True)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    for flow in ranked[:20]:
        print(flow)
    print("Total flows:" + str(len(distinctFlows) + len(H.keys())))
Ejemplo n.º 8
0
 def _make_sketch(kmer_counts_dict: defaultdict) -> CountMinSketch:
     """Copy every (kmer, count) entry of the dictionary into a new sketch.

     The sketch is the compact stand-in for the counts, so the caller can
     delete ``kmer_counts_dict`` afterwards.
     """
     num_rows = 8
     sketch = CountMinSketch(num_rows)
     for entry in kmer_counts_dict.items():
         sketch.update(entry[0], entry[1])
     return sketch
 def test_counts_overestimate(self):
     """A sketch estimate is never below the exact count.

     The characters of this source file are the sample data: each one is
     added to a ``CountMinSketch(10, 5)`` and the estimate for every distinct
     character is compared against an exact ``Counter``.
     """
     # Read through a context manager so the file handle is closed right
     # away instead of being left to the garbage collector.
     with open(__file__) as source_file:
         text = source_file.read()
     counter = Counter(text)
     sketch = CountMinSketch(10, 5)
     for x in text:
         sketch.add(x)
     for x in set(text):
         self.assertGreaterEqual(sketch[x], counter[x])
 def test_simple_usage(self):
     """Adding one key N times yields N for that key, 0 for an unseen key,
     and a sketch length of N."""
     N = 1000
     sketch = CountMinSketch(10, 5)
     # range() exists on Python 2 and 3 alike; xrange() is Python 2 only
     # and raises NameError on Python 3.
     for _ in range(N):
         sketch.add("a")
     self.assertEqual(sketch.query("a"), N)
     self.assertEqual(sketch.query("b"), 0)
     self.assertEqual(len(sketch), N)
Ejemplo n.º 11
0
def test_increment_and_estimate():
    """Estimates follow the number of increments; unseen words estimate 0."""
    countmin = CountMinSketch(0.1, 0.1)
    # 'hello' is incremented once, 'world' twice, 'other' never.
    for word in ('hello', 'world', 'world'):
        countmin.increment(word)

    assert countmin.estimate('other') == 0
    assert countmin.estimate('hello') == 1
    assert countmin.estimate('world') == 2

    expected_top = {('hello', 1.0), ('world', 2.0)}
    assert set(countmin.top_10_dict.items()) == expected_top
Ejemplo n.º 12
0
def plot_error(lst, m, d, color=None, label=None):
    """Draw a histogram of the absolute counting errors of a Count-Min Sketch.

    Every element of ``lst`` is added to a fresh ``CountMinSketch(m, d)``.
    For each distinct element the absolute difference between the exact
    count (from ``actual_count``) and the sketch estimate is collected; the
    number of collected errors is printed and the errors are drawn with
    ``plot.hist``.

    Args:
        lst: sequence of hashable items to count.
        m: first positional argument passed to ``CountMinSketch``.
        d: second positional argument passed to ``CountMinSketch``.
        color: forwarded to ``plot.hist``.
        label: forwarded to ``plot.hist``.
    """
    sketch = CountMinSketch(m, d)
    actual_map = actual_count(lst)

    for item in lst:
        sketch.add(item)

    errors = [abs(actual_map[item] - sketch[item]) for item in set(lst)]

    print(len(errors))

    # One bin per 100 distinct items, but never fewer than one: with under
    # 100 distinct items the integer division gives 0, which is not a valid
    # bin count for the histogram.
    bins = max(1, len(errors) // 100)
    plot.hist(errors, bins=bins, color=color, label=label)
    plot.xlim(0, int(len(lst) * 0.1))
Ejemplo n.º 13
0
def simulate_rtp4mon(df, bound, pkts_to_start):
    """Collect heavy-hitter source IPs over the whole trace with one sketch.

    Rows are read as (SrcIP, DstIP, SrcPort, DstPort, Proto) tuples. Every
    row's source IP is added to the sketch; after more than
    ``pkts_to_start`` rows, a source IP is recorded (together with the
    row's full 5-tuple) when its estimate exceeds ``bound`` times the number
    of rows seen so far. Start and end timestamps are printed.

    Returns:
        dict with keys "packets" (set of source IPs) and "5t_packets"
        (set of 5-tuples).
    """
    print("# " + str(datetime.datetime.now()) +
          " - Begin simulation RTP4Mon...")
    results = {"packets": set(), "5t_packets": set()}
    # The sketch is created with the arguments (5436, 5).
    # NOTE(review): presumably table size 5436 with 5 hash functions -
    # confirm against the CountMinSketch implementation in use.
    sketch_packets = CountMinSketch(5436, 5)
    five_tuples = zip(df["SrcIP"], df["DstIP"], df["SrcPort"], df["DstPort"],
                      df["Proto"])
    for count_packets, five_flow_id in enumerate(five_tuples, start=1):
        flow_id = five_flow_id[0]
        sketch_packets.add(flow_id)
        if (count_packets > pkts_to_start
                and sketch_packets[flow_id] > count_packets * bound):
            results["packets"].add(flow_id)
            results["5t_packets"].add(five_flow_id)
    print("# " + str(datetime.datetime.now()) + " - End simulation RTP4Mon...")
    return results
Ejemplo n.º 14
0
def worker(index, path):
    """Count dated words of one wiki dump file into a Count-Min Sketch.

    Lines starting with ``<doc`` are headers: when ``TIMESTEMP_RE`` matches,
    the captured timestamp becomes the date attached to the words that
    follow. Lines starting with ``</doc>`` are skipped. Every other line is
    split with ``WORD_RE`` and each lower-cased ``(date, word)`` pair is
    tallied in a local ``Counter``, which is flushed into the sketch whenever
    it holds more than ``batch_limit`` distinct pairs and once more at the
    end of the file. The shared ``counter`` is advanced for every content
    line.

    :param index: the index of the dump this worker should work on.
    :param path: directory that contains the ``wiki_0<index>`` file.
    :return: the result of ``sketch.get_matrix()`` for the filled sketch.
    """
    global counter

    def _flush(pending, target):
        # Move every buffered (pair, count) into the sketch, then reset.
        for key, count in pending.items():
            target.add(key, count)
        pending.clear()

    print("Process %d start processing" % index)
    with open("%s/wiki_0%s" % (path, index), "r") as f:
        batch = Counter()
        batch_limit = 10000
        sketch = CountMinSketch(DEPTH, WIDTH, HASH_FUNCTIONS)
        # Until the first header is parsed, words are dated with today.
        current = datetime.now().date()
        for line in f:
            if line.startswith("<doc"):
                # Extract timestamp from header
                m = TIMESTEMP_RE.search(line)
                if m:
                    current = datetime.strptime(m.group(1),
                                                "%Y-%m-%dT%H:%M:%SZ").date()
                continue
            # Match the whole six-character closing tag; a five-character
            # slice can never equal it, which would let closing tags fall
            # through and be tokenised as ordinary text.
            if line.startswith("</doc>"):
                continue

            for word in WORD_RE.findall(line):
                batch[(current, word.lower())] += 1
            if len(batch) > batch_limit:
                _flush(batch, sketch)

            counter.value += 1
            if counter.value % 10000 == 0:
                print("Processed %s lines" % counter.value)

        # Whatever is still buffered after the last line.
        _flush(batch, sketch)

    print("Process %d finished" % index)
    return sketch.get_matrix()
Ejemplo n.º 15
0
    def _make_sketch(self, kmer_counts_dict: defaultdict) -> CountMinSketch:
        """Load every (kmer, count) pair of ``kmer_counts_dict`` into a new sketch.

        When ``self.print_runtime`` is truthy, start/finish lines and a
        progress line every 50000 entries are printed, with times relative to
        ``self.start_time``. When ``self.print_syssizeof`` is truthy, the
        ``sys.getsizeof`` value of the sketch is printed.

        Returns the populated ``CountMinSketch``.
        """
        def elapsed():
            return time.time() - self.start_time

        if self.print_runtime:
            print("\n>--- STARTING TO MAKE COUNTMIN SKETCH AT T = {:.2f} ---".
                  format(elapsed()))

        # The dictionary is re-encoded as a compressed data structure.
        num_rows = 10
        sketch = CountMinSketch(num_rows)
        for position, (kmer, count) in enumerate(kmer_counts_dict.items()):
            if self.print_runtime and position % 50000 == 0:
                print(">Processed {0} kmers by time T={1:.2f}".format(
                    position, elapsed()))
            sketch.update(kmer, count)

        if self.print_runtime:
            print(">FINISHED MAKING COUNTMIN SKETCH AT T = {:.2f}".format(
                elapsed()))
        if self.print_syssizeof:
            print(">SIZE OF COUNTMIN SKETCH: {:,}".format(
                sys.getsizeof(sketch)))
        return sketch
Ejemplo n.º 16
0
    #print ESN

    #netDataSet=[]
    #netTargetSet=[]
    #netKeyList=[]
    BERtotal = []
    bintargets = [1, -1]
    #print features
    #print list_for_deep.keys()
    # NOTE(review): presumably confusion-matrix counters (true/false
    # positives and negatives) - they are only initialised in the visible code.
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    part_plus = 0
    part_minus = 0
    # Sketch that is indexed by column id further down.
    WCMS = CountMinSketch(m, d)
    for i in xrange(features.shape[0]):

        ex = features[i][0]
        W = csr_matrix(ex)

        # Coordinates of the non-zero entries of this example.
        rows, cols = ex.nonzero()
        dot = 0.0
        module = 0.0
        for row, col in zip(rows, cols):
            # Expression statement with no effect: the tuple is built and discarded.
            ((row, col), ex[row, col])
            # Running sum of the squared non-zero entries.
            module += ex[row, col]**2
            #print col, ex[row,col]
            # Running sum of (sketch value for the column) * (entry value).
            dot += WCMS[col] * ex[row, col]
            #print dot
            #TODO add bias
Ejemplo n.º 17
0
# ---------------------------------------------------------------
# Frequency estimation with a Count-Min Sketch
# ---------------------------------------------------------------
from countminsketch import CountMinSketch

# Build the sketch: table size=1000, hash functions=10
ds = CountMinSketch(1000, 10)

# Insert the sample stream: 1 is added twice, 2 once.
for observed in (1, 2, 1):
    ds.add(observed)

# Check the estimates, including a key that was never inserted.
for key, expected in ((1, 2), (2, 1), (3, 0)):
    assert ds[key] == expected
 def test_syntax_sugar(self):
     """Indexing the sketch gives the same value as ``query``, before and
     after an insertion."""
     key = "a"
     sketch = CountMinSketch(10, 5)

     via_query, via_index = sketch.query(key), sketch[key]
     self.assertEqual(via_query, via_index)

     sketch.add(key)
     via_query, via_index = sketch.query(key), sketch[key]
     self.assertEqual(via_query, via_index)
 def test_bad_init(self):
     """A zero in either constructor argument raises ValueError."""
     for bad_args in ((0, 5), (100, 0)):
         with self.assertRaises(ValueError):
             CountMinSketch(*bad_args)
Ejemplo n.º 20
0
            # Remaining header columns of the per-symbol report; the second
            # write ends the header line.
            stock_symbol_file.write(',Actual Trade Freq, APDS Trade Freq, Freq Accuracy')
            stock_symbol_file.write(',Actual Trade Volume, APDS Trade Volume, Vol Accuracy\n')

            #stock_input_filename = proj_dir + "stock_vol_files/" + stock_etf + time_interval + "_R" + str(no_of_record) + "_w" + str(width) + "_d" + str(depth) + "_input.csv"
            #stock_input_file = open(stock_input_filename, "w")
            #stock_input_file.write('Stock Symbol, Trade Date, Volume\n')

            # File name that encodes the record count and the sketch width/depth.
            apds_filename = proj_dir + "apds_files/" + stock_etf + time_interval + "_R" + str(no_of_record) + "_w" + str(width) + "_d" + str(depth) + "_freq.apds"

            stock_freq_dist = {}
            total_freq_accuracy = 0

            stock_vol_dist = {}
            total_vol_accuracy = 0

            # Sketch that receives each symbol weighted by its traded volume.
            stock_vol_apds = CountMinSketch(width, depth)
            stock_trade_record_count = 0
            vol_sketch_time = 0

            # add elements to sketch
            for stock_trade_line in stock_trade_lines:
                #print('stock_trade_record_count:',stock_trade_record_count,' no_of_record:',no_of_record)
                # Stop once no_of_record records have been consumed.
                if stock_trade_record_count >= no_of_record: break
                stock_trade_record_count = stock_trade_record_count + 1
                # Column 0 is read as the symbol, column 1 as the trade date,
                # column 7 as the integer volume.
                stock_symbol = stock_trade_line[0].strip()
                trade_date = stock_trade_line[1].strip()
                stock_vol = int(stock_trade_line[7].strip())
                #stock_input_file.write(stock_symbol + "," + str(trade_date) + "," + str(stock_vol) + "\n")

                # CPU-time stamp taken immediately before the sketch update.
                vol_sketch_starttime = time.process_time()
                apds_cmsadded = stock_vol_apds.add(stock_symbol,stock_vol)
Ejemplo n.º 21
0
# Write the header of the process-info report; the pieces are concatenated
# into one line that the last write terminates.
processinfo_file = open(processinfo_filename, "w")
processinfo_file.write('Sketch Width, No of hash functions')
processinfo_file.write(',No of Trade Records, No of Stock Symbols')
processinfo_file.write(',CrossRef Not Present Count, CrossRef Present Count')
processinfo_file.write(',Not Present Count, Present Count,Error Count,Error %')
processinfo_file.write(',CMS Time, Sketch Time, Sketch Save Time, Sketch Query Time, CMS Query Time')
processinfo_file.write(',CMS Start Time\n')

# One pass per configured record count; each pass reads its own trade file.
for no_of_record in no_of_records:
    stock_trade_filename = source_data_dir + stock_etf + "_trade_" + time_interval + str(no_of_record) + '.csv'
    crossref_not_present_count = 0
    crossref_present_count = 0

    #print('CrossRef CMS Process Starts Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
    #CrossRef CMS Process Starts
    crossref_stock_trade_frq_cms = CountMinSketch(100000, 10)
    stock_trade_file = open(stock_trade_filename,"r")
    stock_trade_lines = csv.reader(stock_trade_file, delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True)
    # Advances the underlying file by one line before the reader is iterated.
    # NOTE(review): presumably this skips a header row - confirm against the data files.
    next(stock_trade_file)

    #print('CrossRef CMS Create Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
    #CrossRef CMS Create
    # Add the symbol in column 0 of every remaining row to the sketch.
    for stock_trade_line in stock_trade_lines:
        stock_symbol = stock_trade_line[0].strip()
        add1 = crossref_stock_trade_frq_cms.add(stock_symbol)
    stock_trade_file.close()

    #print('CrossRef CMS Membership Check Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
    #CrossRef CMS Membership Check
    check_stock_symbol_file = open(check_stock_symbol_filename, "r")
 def test_zero_at_start(self):
     """A fresh sketch reports 0 for any kind of hashable key."""
     probes = (0, 1, -1, tuple, (), "", "yeah", object())
     sketch = CountMinSketch(10, 5)
     for probe in probes:
         estimate = sketch.query(probe)
         self.assertEqual(estimate, 0)
 def test_add_greater_than_one(self):
     """A single add with a weight is reflected in full by ``query``."""
     key = "a"
     sketch = CountMinSketch(10, 5)
     sketch.add(key, 123)
     estimate = sketch.query(key)
     self.assertEqual(estimate, 123)
Ejemplo n.º 24
0
def CountMin_Sketch(stream, k, h):
    """Return a ``CountMinSketch(k, h)`` to which every element of ``stream``
    has been added once, in iteration order."""
    filled = CountMinSketch(k, h)
    for element in stream:
        filled.add(element)
    return filled
 def test_bad_sketch(self):
     """A zero in either size argument raises ValueError, even with seeds."""
     for bad_sizes in ((0, 10), (10, 0)):
         with self.assertRaises(ValueError):
             CountMinSketch(bad_sizes[0], bad_sizes[1], seed=seeds)
Ejemplo n.º 26
0
# Port numbers are kept as strings; the ones used below are spliced into
# endpoint URLs.
packetport = "5556"
bufferport = "5557"
ctrlPlanePort = "5560"
# The control-plane address is the first command-line argument.
ctrlplane_ip = sys.argv[1]
context = zmq.Context()

# REQ socket connected to the buffer port on localhost.
buffersock = context.socket(zmq.REQ)
buffersock.connect("tcp://localhost:%s" % bufferport)

# PUSH socket connected to the control plane. `%` binds tighter than `+`,
# so only the ":%s" piece is formatted before the concatenation.
ctrlPlaneSock = context.socket(zmq.PUSH)
ctrlPlaneSock.connect("tcp://" + ctrlplane_ip + ":%s" % ctrlPlanePort)

# Module-level sketch with one hash function per depth index.
depth = 10
width = 40000
hash_functions = [hash_function(i) for i in range(depth)]
sketch = CountMinSketch(depth, width, hash_functions)

# NOTE(review): time.clock() was removed in Python 3.8 - confirm the
# interpreter this script targets.
lastUpd = time.clock()
distinctFlows = []
distinctFlowsDelta = []


def has_ports(str):
    """Return True when "->" appears as a standalone token of ``str``.

    The text is split on single spaces, so "a -> b" matches while "a->b"
    does not.
    """
    tokens = str.split(" ")
    return "->" in tokens


def get_port_type(str):
Ejemplo n.º 27
0
import sys
import csv
from countminsketch import CountMinSketch


#  Set up the stream and some variables.


item_set = set()
currentCustomer=''
# Two CountMinSketch instances: one for the item count and one for the total revenue.
# Both are created with the arguments (10, 20); the intent is to use 20 hash functions to
# reduce the effect of hash collisions. Bigger data sets may need larger arguments.
# NOTE(review): a count-min sketch can still over-count when keys collide, so it does not
# guarantee that every product gets a unique identity - confirm these sizes are adequate.
# Product ids are hashed and cannot be recovered from the sketch, so the distinct products are
# kept in item_set (a set, so it holds no duplicates).
# When retrieving counts and revenue, look up each string stored in item_set; the same
# string is expected to produce the same hash.
itemCount = CountMinSketch(10,20)
itemRevenue = CountMinSketch(10,20)
def salesRead(filename):
    """Lazily yield each data row of the CSV file at ``filename``.

    Rows come from ``csv.DictReader``, so each one maps the column names of
    the first line to that row's values. The file stays open while the
    generator is being consumed.
    """
    with open(filename, 'r') as sales_file:
        for record in csv.DictReader(sales_file):
            yield record


# Now get the stream of data and process it


input_file = sys.argv[1]
out_file = sys.argv[2]
individual_cart = set()
for hod in salesRead(input_file):  
Ejemplo n.º 28
0
    # Overlay the sketch series in red on the current figure, show it, then
    # clear the figure so the next plot starts empty.
    plt.plot(count_min_sketch, color='red', label="Count-Min Sketch")
    plt.legend()
    plt.show()
    plt.gcf().clear()

    # Second figure: both series again, this time on a logarithmic y axis.
    plt.title('Plot with log scaling')
    plt.plot(actual_count, linewidth=5, label="Actual Count")
    plt.plot(count_min_sketch, color='red', label="Count-Min Sketch")
    plt.yscale('log')
    plt.legend()
    plt.show()


# Load one JSON document per line. The context manager closes the file as
# soon as reading is done instead of leaving the handle open.
tweets = []
with open('data/tweets.json.1', 'r', encoding='latin-1') as tweets_file:
    for line in tweets_file:
        tweets.append(json.loads(line))
data = json_normalize(tweets)

# Sketch configurations to compare; each pair is passed positionally to
# CountMinSketch.
choices = [(100000, 10), (10000, 10), (1000, 10), (1000, 100), (1000, 5)]

# Run the same sketching and reporting pipeline once per configuration.
for i in choices:
    sketch = CountMinSketch(i[0], i[1])
    print('\nData for CountMinSketch(', i[0], ',', i[1], ')\n')
    tag_count_df = data_sketching(data, sketch)
    metrics_and_plotting(tag_count_df)

#sketch = CountMinSketch(10000, 10)
#
#tag_count_df = data_sketching(data,sketch)
#metrics_and_plotting(tag_count_df)
Ejemplo n.º 29
0
def node_countminsketch():
    """Create and return a new ``CountMinSketch(6000, 10)``."""
    return CountMinSketch(6000, 10)
Ejemplo n.º 30
0
# Close the timing window; `start` is set earlier, outside this excerpt.
end = time.time()

print("Time needed with default single core configuration: ", (end - start))
#export to CSV
Heavyhitters2 = pd.DataFrame.from_records(Heavyhitters)
Heavyhitters2.to_csv("movie-counter.csv", index=False, header=False)

# The reported size is the length of the pickled bytes, i.e. the serialized
# size of the list.
print("\n Memory size of HeavyHitters list in bytes= ",
      len(pickle.dumps(Heavyhitters)))

#%%CountMinSketch

#pip install countminsketch if necessary
from countminsketch import CountMinSketch

# Six sketches; each name encodes the table size (in thousands) and the
# number of hash functions passed to the constructor.
sketch20_1 = CountMinSketch(20000, 1)  # table size=20000, hash functions=1
sketch40_1 = CountMinSketch(40000, 1)
sketch40_2 = CountMinSketch(40000, 2)
sketch20_2 = CountMinSketch(20000, 2)
sketch10_3 = CountMinSketch(10000, 3)
sketch50_4 = CountMinSketch(50000, 4)

# Every rating row adds its movieId once to each of the six sketches.
for index, row in ratings.iterrows():
    sketch20_1.add(row['movieId'])
    sketch40_1.add(row['movieId'])
    sketch40_2.add(row['movieId'])
    sketch20_2.add(row['movieId'])
    sketch10_3.add(row['movieId'])
    sketch50_4.add(row['movieId'])

print(