예제 #1
0
def test_sink():
    """Exercise the sink-style agents and assert on what they collect.

    Covers, in order: ``sink`` (with and without state), ``stream_to_list``
    (plain, with a function, with a function and state), ``sink_element``,
    ``stream_to_file``, ``stream_to_queue``, ``stream_to_buffer``,
    ``sink_window`` and ``sink_list``, on both ``Stream`` and ``StreamArray``
    inputs.  Every section follows the same pattern: create the agent(s),
    extend the input stream, call ``scheduler.step()``, then assert on the
    collected output.

    Side effects: creates and removes ``test1.txt`` .. ``test5.txt`` in the
    current working directory and prints a success message at the end.

    NOTE(review): ``sys``, ``os``, ``Stream``, ``StreamArray``, ``Buffer``,
    ``recent_values`` and the sink helpers are not imported in this function;
    presumably they are imported at module level -- confirm.
    """
    import numpy as np
    scheduler = Stream.scheduler

    ## ----------------------------------------------
    ## # Examples from AssembleSoftware website: KEEP!!
    ## ----------------------------------------------
    ## def print_index(v, state, delimiter):
    ##     print str(state) + delimiter + str(v)
    ##     return state+1 # next state
    ## s = Stream()
    ## sink(print_index, s, 0, delimiter=':')
    ## s.extend(list(range(100,105)))

    ## s = Stream()
    ## def print_index(v, state, delimiter):
    ##     print str(state) + delimiter + str(v)
    ##     return state+1 # next state
    ## sink(print_index, s, 0, delimiter=':')
    ## s.extend(list(range(100,105)))
    ## ----------------------------------------------
    ## # Finished examples from AssembleSoftware website
    ## ----------------------------------------------

    #-----------------------------------------------
    # Set up parameters for call to sink
    print_list = []
    print_list_for_array = []

    # Records "<state>:<value>" for each element; the state is used as a
    # running index and the return value is the next state.
    def print_index(v, state, print_list):
        print_list.append(str(state) + ':' + str(v))
        return state + 1  # next state

    s = Stream('s')
    s_array = StreamArray('s_array', dtype=int)

    #-----------------------------------------------
    # Call sink with initial state of 0
    sink(func=print_index, in_stream=s, state=0, print_list=print_list)
    sink(func=print_index,
         in_stream=s_array,
         state=0,
         print_list=print_list_for_array)

    s.extend(list(range(100, 103)))
    s_array.extend(np.arange(100, 103))
    scheduler.step()
    assert print_list == ['0:100', '1:101', '2:102']
    assert print_list_for_array == print_list
    # Only s (not s_array) is extended again; the index is expected to
    # continue from 3 rather than restart at 0.
    s.extend(list(range(200, 203)))
    scheduler.step()
    assert print_list == ['0:100', '1:101', '2:102', '3:200', '4:201', '5:202']

    #-----------------------------------------------
    input_stream = Stream('input stream')
    input_stream_array = StreamArray('input stream array', dtype=int)
    output_list = []
    output_list_array = []

    # Call stream_to_list with no function
    stream_to_list(input_stream, output_list)
    stream_to_list(input_stream_array, output_list_array)
    # Feed the same values as a list and as a numpy array; both target lists
    # are expected to equal the plain Python list.
    a_test_list = list(range(100, 105))
    a_test_array = np.arange(100, 105)
    input_stream.extend(a_test_list)
    input_stream_array.extend(a_test_array)
    scheduler.step()
    assert output_list == a_test_list
    assert output_list_array == a_test_list

    #-----------------------------------------------
    # test stream to list with a function
    def h(v, multiplier, addend):
        return v * multiplier + addend

    ss = Stream('ss')
    ss_array = StreamArray('ss_array', dtype=int)
    l = []
    l_array = []
    stream_to_list(ss, l, h, multiplier=2, addend=100)
    stream_to_list(in_stream=ss_array,
                   target_list=l_array,
                   element_function=h,
                   multiplier=2,
                   addend=100)
    test_list = [3, 23, 14]
    ss.extend(test_list)
    ss_array.extend(np.array(test_list))
    scheduler.step()
    assert l == [v * 2 + 100 for v in test_list]
    assert l_array == l

    #-----------------------------------------------
    # test stream to list with a function and state
    # h returns a pair: (value to record, next state).  The state here is
    # the running sum of the elements seen so far.
    def h(v, state, multiplier, addend):
        return v * multiplier + addend + state, v + state

    ss = Stream('ss')
    ss_array = StreamArray('ss_array', dtype=int)
    l = []
    l_array = []
    stream_to_list(ss, l, h, 0, multiplier=2, addend=100)
    stream_to_list(in_stream=ss_array,
                   target_list=l_array,
                   element_function=h,
                   state=0,
                   multiplier=2,
                   addend=100)
    test_list = [3, 23, 14]
    ss.extend(test_list)
    ss_array.extend(np.array(test_list))
    scheduler.step()
    # 3*2+100+0 = 106 (state -> 3); 23*2+100+3 = 149 (state -> 26);
    # 14*2+100+26 = 154.
    assert l == [106, 149, 154]
    assert l_array == l

    # Same stateful function on fresh streams with input 0..4.
    ss = Stream('ss')
    ss_array = StreamArray('ss_array', dtype=int)
    l = []
    l_array = []
    stream_to_list(ss, l, h, 0, multiplier=2, addend=100)
    stream_to_list(in_stream=ss_array,
                   target_list=l_array,
                   element_function=h,
                   state=0,
                   multiplier=2,
                   addend=100)
    test_list = list(range(5))
    ss.extend(test_list)
    ss_array.extend(np.array(test_list))
    scheduler.step()
    # States before each element: 0, 0, 1, 3, 6 -> 100, 102, 105, 109, 114.
    assert l == [100, 102, 105, 109, 114]
    assert l_array == l

    # Test sink
    # func operates on a single element of the single input stream and does
    # not return any value.
    def p(v, lst):
        lst.append(v)

    in_stream_sink = Stream('in_stream_sink')
    a_list = []
    b_list = []
    # The same stream feeds both a sink_element agent and a sink call; both
    # lists are expected to receive every element.
    sink_agent = sink_element(func=p,
                              in_stream=in_stream_sink,
                              name='sink_agent',
                              lst=a_list)
    sink(func=p, in_stream=in_stream_sink, lst=b_list)
    test_list = [1, 13, 29]
    in_stream_sink.extend(test_list)
    scheduler.step()
    assert a_list == test_list
    assert b_list == test_list

    # ------------------------------------
    # Test sink with state
    # func operates on a single element of the single input stream and state.
    # func returns the next state (here: state + 1); the recorded output is
    # produced as a side effect on lst.

    def p_s(element, state, lst, stream_name):
        lst.append([stream_name, element])
        return state + 1

    in_stream_sink_with_state = Stream('s')
    c_list = []
    sink_with_state_agent = sink_element(func=p_s,
                                         in_stream=in_stream_sink_with_state,
                                         state=0,
                                         name='sink_with_state_agent',
                                         lst=c_list,
                                         stream_name='s')

    #------------------------------------------------------------------------------
    # Test sink as a function with state
    d_list = []
    sink(p_s, in_stream_sink_with_state, state=0, lst=d_list, stream_name='s')
    in_stream_sink_with_state.extend(list(range(2)))
    scheduler.step()
    assert c_list == [['s', 0], ['s', 1]]
    assert d_list == c_list

    # ------------------------------------
    # Test sink with side effect
    # func operates on a single element of the single input stream (no state)
    # and returns None; its only effect is appending f(element) to
    # side_effect_list.

    def sink_with_side_effect_func(element, side_effect_list, f):
        side_effect_list.append(f(element))
        return None

    side_effect_list_0 = []
    side_effect_list_1 = []
    side_effect_list_2 = []

    def ff(element):
        return element * 2

    def fff(element):
        return element + 10

    # stm is the single input stream shared by every agent created below
    # (sink_element, stream_to_list, stream_to_file and stream_to_queue).
    stm = Stream('stm')

    sink_with_side_effect_agent_0 = sink_element(
        func=sink_with_side_effect_func,
        in_stream=stm,
        name='sink_with_side_effect_agent_0',
        side_effect_list=side_effect_list_0,
        f=ff)

    sink_with_side_effect_agent_1 = sink_element(
        func=sink_with_side_effect_func,
        in_stream=stm,
        name='sink_with_side_effect_agent_1',
        side_effect_list=side_effect_list_1,
        f=fff)

    # Stateful element functions return (output, next state).
    # f_stateful: output and state are both the running sum.
    def f_stateful(element, state):
        return element + state, element + state

    # f_stateful_2: output is element * (sum of earlier elements); the state
    # is the running sum.
    def f_stateful_2(element, state):
        return element * state, element + state

    target_stream_to_list_simple = []
    stream_to_list(stm, target_stream_to_list_simple)
    stream_to_list(in_stream=stm,
                   target_list=side_effect_list_2,
                   element_function=lambda v: 2 * v)
    target_stream_to_list_stateful = []
    stream_to_list(in_stream=stm,
                   target_list=target_stream_to_list_stateful,
                   element_function=f_stateful,
                   state=0)
    target_stream_to_list_stateful_2 = []
    stream_to_list(in_stream=stm,
                   target_list=target_stream_to_list_stateful_2,
                   element_function=f_stateful_2,
                   state=0)

    # Files are written relative to the current working directory and are
    # removed again further down.
    stream_to_file(stm, 'test1.txt')
    stream_to_file(stm, 'test2.txt', lambda v: 2 * v)
    stream_to_file(stm, 'test3.txt', f_stateful, state=0)

    # The queue module is named Queue on Python 2 and queue on Python 3.
    is_py2 = sys.version[0] == '2'
    if is_py2:
        import Queue as queue
    else:
        import queue as queue
    queue_1 = queue.Queue()
    queue_2 = queue.Queue()
    queue_3 = queue.Queue()
    # NOTE(review): queue_1..queue_3 are filled but their contents are never
    # asserted on in this test.
    stream_to_queue(stm, queue_1)
    stream_to_queue(stm, queue_2, lambda v: 2 * v)
    stream_to_queue(stm, queue_3, f_stateful, 0)

    stm.extend(list(range(5)))
    scheduler.step()
    # Running sums of 0..4.
    assert target_stream_to_list_stateful == [0, 1, 3, 6, 10]
    # element * state with states 0, 0, 1, 3, 6.
    assert target_stream_to_list_stateful_2 == [0, 0, 2, 9, 24]
    assert side_effect_list_0 == [0, 2, 4, 6, 8]
    assert side_effect_list_1 == [10, 11, 12, 13, 14]
    assert side_effect_list_0 == side_effect_list_2
    assert target_stream_to_list_simple == list(range(5))

    # Each file is read back as one integer per line.
    with open('test1.txt') as the_file:
        file_contents_integers = [int(v) for v in (the_file.readlines())]
    assert file_contents_integers == recent_values(stm)

    with open('test2.txt') as the_file:
        file_contents_integers = [int(v) for v in (the_file.readlines())]
    assert file_contents_integers == [2 * v for v in recent_values(stm)]

    with open('test3.txt') as the_file:
        file_contents_integers = [int(v) for v in (the_file.readlines())]
    assert file_contents_integers == [0, 1, 3, 6, 10]
    os.remove('test1.txt')
    os.remove('test2.txt')
    os.remove('test3.txt')

    # stream_to_file with a stateless function and keyword arguments.
    def h(v, multiplier, addend):
        return v * multiplier + addend

    ss = Stream()
    stream_to_file(ss, 'test4.txt', h, multiplier=2, addend=100)
    test_list = [3, 23, 14]
    ss.extend(test_list)
    scheduler.step()
    with open('test4.txt') as the_file:
        file_contents_integers = [int(v) for v in (the_file.readlines())]
    assert file_contents_integers == [v * 2 + 100 for v in test_list]
    os.remove('test4.txt')

    # stream_to_file with a stateful function (initial state 0) and keyword
    # arguments.
    def h(v, state, multiplier, addend):
        return v * multiplier + addend + state, v + state

    ss = Stream()
    stream_to_file(ss, 'test5.txt', h, 0, multiplier=2, addend=100)
    test_list = [3, 23, 14]
    ss.extend(test_list)
    scheduler.step()
    with open('test5.txt') as the_file:
        file_contents_integers = [int(v) for v in (the_file.readlines())]
    # NOTE(review): no stream has been extended since the previous step, so
    # this second step looks redundant -- confirm before removing.
    scheduler.step()
    assert file_contents_integers == [106, 149, 154]
    os.remove('test5.txt')

    # ------------------------------------
    # Testing stream_to_queue
    def h(v, state, multiplier, addend):
        return v * multiplier + addend + state, v + state

    ss = Stream()
    queue_4 = queue.Queue()
    stream_to_queue(ss, queue_4, h, 0, multiplier=2, addend=100)
    test_list = [3, 23, 14]
    ss.extend(test_list)
    scheduler.step()
    # Drain the queue into a list so it can be compared in one assertion.
    queue_contents = []
    while not queue_4.empty():
        queue_contents.append(queue_4.get())
    assert queue_contents == [106, 149, 154]

    # Test with state and keyword arguments
    def h(v, state, multiplier, addend):
        return v * multiplier + addend + state, v + state

    # queue_4 was drained by the loop above and is reused here with a new
    # input stream.
    ss = Stream()
    stream_to_queue(ss, queue_4, h, 0, multiplier=2, addend=100)
    test_list = [3, 23, 14]
    ss.extend(test_list)
    queue_contents = []
    scheduler.step()
    while not queue_4.empty():
        queue_contents.append(queue_4.get())
    assert queue_contents == [106, 149, 154]

    # Another test with state and keyword arguments
    ss = Stream()
    queue_5 = queue.Queue()
    stream_to_queue(ss, queue_5, h, 0, multiplier=2, addend=100)
    test_list = list(range(5))
    ss.extend(test_list)
    scheduler.step()
    queue_contents = []
    while not queue_5.empty():
        queue_contents.append(queue_5.get())
    assert queue_contents == [100, 102, 105, 109, 114]

    # Test stream_to_buffer
    s = Stream()
    buf = Buffer(max_size=10)
    stream_to_buffer(s, buf)
    test_list = list(range(5))
    s.extend(test_list)
    scheduler.step()
    assert buf.get_all() == test_list
    next_test = list(range(5, 10, 1))
    s.extend(next_test)
    scheduler.step()
    # The assertions below expect that the earlier get_all() removed the
    # first five items, and that read_all() leaves the buffer contents in
    # place for the following get_all().
    assert buf.read_all() == next_test
    assert buf.get_all() == next_test

    # Test sink_window: f receives each window of the stream as a list.
    s = Stream('s')
    print_list = []

    def f(lst):
        print_list.extend(lst)

    sink_window(func=f, in_stream=s, window_size=4, step_size=2)
    s.extend(list(range(10)))
    scheduler.step()
    # Expected windows: [0..3], [2..5], [4..7], [6..9].
    assert print_list == [0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9]

    # Test sink_list on a Stream.
    s = Stream('s')
    print_list = []

    def f(lst):
        print_list.extend(lst)

    sink_list(func=f, in_stream=s)
    s.extend(list(range(10)))
    Stream.scheduler.step()
    assert print_list == list(range(10))

    # Test sink_list on a StreamArray.
    # NOTE(review): numpy was already imported as np at the top of this
    # function; this second import is redundant.
    import numpy as np
    t = StreamArray('t', dtype='int')
    print_list = []

    def f(lst):
        print_list.extend(lst)

    sink_list(func=f, in_stream=t)
    t.extend(np.arange(10))
    Stream.scheduler.step()
    assert print_list == list(range(10))
    print('TEST OF SINK IS SUCCESSFUL')
예제 #2
0
class Shaper():
    """Replay an I/O trace through a write buffer and emit the reshaped trace.

    Reads are passed straight through to the output; writes are added to
    ``self.myBuffer`` and only emitted when the buffer refuses a new entry
    (``add`` returns a falsy value) or when the buffer is drained at the end
    of the run.

    The code runs unchanged on Python 2 and Python 3.
    """

    def __init__(self, config):
        """Create the write buffer from ``config`` and reset the counters."""
        self.myBuffer = Buffer(config)
        # Replaced by an open file object for the duration of run().
        self.out_csv = ''
        self.in_io_count = 0
        self.out_io_count = 0
        # Template output row; columns 4, 5 and 6 (D, BLOCK, SIZE) are
        # overwritten for every record, the other columns are constant.
        self.csv_row = ['-', '-', 0, 0, 0, 0, 0, 'hadoop', 'bin/spark']

    @staticmethod
    def _open_csv_for_write(path):
        """Open ``path`` the way the ``csv`` module expects on this Python."""
        import sys
        if sys.version_info[0] < 3:
            # Python 2: csv writes byte strings, so the file must be binary.
            return open(path, 'wb')
        # Python 3: text mode; newline='' lets csv control line endings.
        return open(path, 'w', newline='')

    def read_csv(self, in_file):
        """Load a trace file and return ``[direction, addr, size]`` records.

        ``addr`` is ``BLOCK // 8`` and ``size`` is ``SIZE // 4096``.
        Also sets ``self.start_time`` to the current time.
        NOTE(review): presumably BLOCK is in 512-byte sectors and SIZE in
        bytes, making both values 4 KiB units -- confirm against the trace
        format.
        """
        colnames = ['STIME', 'TIME', 'UID', 'PID', 'D', 'BLOCK', 'SIZE',
                    'COMM', 'PATHNAME']
        data = pandas.read_csv(in_file, names=colnames)
        # Build real lists: indexing the result of map() fails on Python 3.
        arrs = [
            [direction, block // 8, size // 4096]
            for direction, block, size in zip(data.D.tolist(),
                                              data.BLOCK.tolist(),
                                              data.SIZE.tolist())
        ]
        self.start_time = time.time()
        return arrs

    def run(self, in_file, out_file):
        """Reshape the trace in ``in_file`` and write the result to ``out_file``.

        Sets ``self.in_io_count`` to the number of input records and
        ``self.out_io_count`` to the number of records found when the
        finished output file is read back.
        """
        arrs = self.read_csv(in_file)
        self.in_io_count = len(arrs)
        self.out_csv = self._open_csv_for_write(out_file)
        try:
            for each in arrs:
                if each[0] == 'W':
                    #TODO: big file, dont put into cache, need to invalidate the data in cache if cache hit
                    # Buffer entries are [start, end] ranges.  While the
                    # buffer refuses the new range, evict cold entries and
                    # emit them as writes.
                    while not self.myBuffer.add([each[1], each[1] + each[2]]):
                        ios = self.myBuffer.get_cold()
                        self.gen_write_io(ios)
                else:
                    #TODO: check cache hit, if hit, return data immediately instead of generate read io
                    self.gen_read_io([[each[1], each[2]]])

            #clear all data in Buffer
            self.gen_write_io(self.myBuffer.get_all())
        finally:
            # Close (and thereby flush) before the file is read back below;
            # otherwise buffered rows would be missing from the count.
            self.out_csv.close()

        arrs = self.read_csv(out_file)
        self.out_io_count = len(arrs)

    def gen_write_io(self, ios):
        """Emit one write record per ``[start, end]`` range in ``ios``.

        Assumes the buffer hands back ranges in the same ``[start, end]``
        shape they were added in, so the size is ``end - start``.
        """
        for each in ios:
            self.write_csv(['W', each[0], each[1] - each[0]])

    def gen_read_io(self, ios):
        """Emit one read record per ``[addr, size]`` pair in ``ios``."""
        for each in ios:
            self.write_csv(['R', each[0], each[1]])

    def write_csv(self, item):
        """Write ``item`` (``[direction, addr, size]``) as one output row.

        ``addr`` and ``size`` are scaled back by 8 and 4096 respectively,
        reversing the division done in ``read_csv``.
        """
        self.csv_row[4] = item[0]
        self.csv_row[5] = item[1] * 8
        self.csv_row[6] = item[2] * 4096
        csv.writer(self.out_csv).writerow(self.csv_row)

    def print_io_count(self):
        """Print the input and output record counts."""
        # Single-argument print(...) produces identical output on Python 2
        # and Python 3.
        print("==========================================================")
        print("I/O count before optimization: %s" % (self.in_io_count,))
        print("I/O count after optimization: %s" % (self.out_io_count,))
        print("==========================================================")
예제 #3
0
class Shaper():
    """Replay an I/O trace through a write buffer and collect I/O statistics.

    Reads are passed straight through to the output.  Writes smaller than
    ``write_seq_threshold // 4`` are added to ``self.myBuffer`` and emitted
    when the buffer refuses a new entry or is drained at the end of the run;
    larger writes bypass the buffer and are emitted immediately.

    The code runs unchanged on Python 2 and Python 3.
    """

    def __init__(self, config):
        """Create the buffer from ``config``, reset counters, print ``config``.

        ``config`` must support ``config['write_seq_threshold']``.
        """
        self.myBuffer = Buffer(config)
        # Replaced by an open file object for the duration of run().
        self.out_csv = ''
        self.in_io_count = 0
        self.out_io_count = 0
        # Template output row; columns 4, 5 and 6 (D, BLOCK, SIZE) are
        # overwritten for every record, the other columns are constant.
        self.csv_row = ['-', '-', 0, 0, 0, 0, 0, 'hadoop', 'bin/spark']
        # Records of size <= 1 are counted as "random"; for larger records
        # the size (scaled by 4096) is accumulated as "sequential".
        self.in_ran_io_count = 0
        self.in_seq_size_count = 0
        self.out_ran_io_count = 0
        self.out_seq_size = 0
        self.write_seq_threshold = config['write_seq_threshold']
        self.print_input_config(config)

    @staticmethod
    def _open_csv_for_write(path):
        """Open ``path`` the way the ``csv`` module expects on this Python."""
        import sys
        if sys.version_info[0] < 3:
            # Python 2: csv writes byte strings, so the file must be binary.
            return open(path, 'wb')
        # Python 3: text mode; newline='' lets csv control line endings.
        return open(path, 'w', newline='')

    def read_csv(self, in_file):
        """Load a trace file and return ``[direction, addr, size]`` records.

        ``addr`` is ``BLOCK // 8`` and ``size`` is ``SIZE // 4096``.
        Also sets ``self.start_time`` to the current time.
        NOTE(review): presumably BLOCK is in 512-byte sectors and SIZE in
        bytes, making both values 4 KiB units -- confirm against the trace
        format.
        """
        colnames = [
            'STIME', 'TIME', 'UID', 'PID', 'D', 'BLOCK', 'SIZE', 'COMM',
            'PATHNAME'
        ]
        data = pandas.read_csv(in_file, names=colnames)
        # Build real lists: indexing the result of map() fails on Python 3.
        arrs = [
            [direction, block // 8, size // 4096]
            for direction, block, size in zip(data.D.tolist(),
                                              data.BLOCK.tolist(),
                                              data.SIZE.tolist())
        ]
        self.start_time = time.time()
        return arrs

    def run(self, in_file, out_file):
        """Reshape the trace in ``in_file`` and write the result to ``out_file``.

        Input statistics are gathered up front via ``get_in_stats``; output
        statistics are updated by ``write_csv`` as rows are emitted.
        """
        arrs = self.read_csv(in_file)
        self.get_in_stats(arrs)
        self.out_csv = self._open_csv_for_write(out_file)
        try:
            for each in arrs:
                if each[0] == 'W':
                    # NOTE(review): the threshold is divided by 4 before
                    # being compared with the size; its unit is not visible
                    # here -- confirm.
                    if each[2] < self.write_seq_threshold // 4:
                        #TODO: big file, dont put into cache, need to invalidate the data in cache if cache hit
                        # Buffer entries are [start, end] ranges.  While the
                        # buffer refuses the new range, evict cold entries
                        # and emit them as writes.
                        while not self.myBuffer.add(
                                [each[1], each[1] + each[2]]):
                            ios = self.myBuffer.get_cold()
                            self.gen_write_io(ios)
                    else:
                        # Large write: bypass the buffer entirely.
                        self.write_csv(['W', each[1], each[2]])
                else:
                    #TODO: check cache hit, if hit, return data immediately instead of generate read io
                    self.gen_read_io([[each[1], each[2]]])

            #clear all data in Buffer
            self.gen_write_io(self.myBuffer.get_all())
        finally:
            # Close the output even if processing fails part-way through.
            self.out_csv.close()

    def gen_write_io(self, ios):
        """Emit one write record per ``[start, end]`` range in ``ios``."""
        for each in ios:
            self.write_csv(['W', each[0], each[1] - each[0]])

    def gen_read_io(self, ios):
        """Emit one read record per ``[addr, size]`` pair in ``ios``."""
        for each in ios:
            self.write_csv(['R', each[0], each[1]])

    def write_csv(self, item):
        """Write ``item`` (``[direction, addr, size]``) and update output stats.

        ``addr`` and ``size`` are scaled back by 8 and 4096 respectively,
        reversing the division done in ``read_csv``.
        """
        size = item[2]
        self.csv_row[4] = item[0]
        self.csv_row[5] = item[1] * 8
        self.csv_row[6] = size * 4096
        csv.writer(self.out_csv).writerow(self.csv_row)
        self.out_io_count += 1
        if size <= 1:
            self.out_ran_io_count += 1
        else:
            self.out_seq_size += size * 4096

    def get_in_stats(self, arrs):
        """Set the input record count and accumulate random/sequential stats.

        ``in_io_count`` is overwritten; the other two counters are added to.
        """
        self.in_io_count = len(arrs)
        for each in arrs:
            if each[2] <= 1:
                self.in_ran_io_count += 1
            else:
                self.in_seq_size_count += each[2] * 4096

    def print_input_config(self, config):
        """Print a banner followed by ``config``."""
        # Single-argument print(...) produces identical output on Python 2
        # and Python 3.
        print("==========================================================")
        print("Shaper configuration:")
        print(config)
        print("")

    def print_io_count(self):
        """Print the input and output statistics gathered so far."""
        # The "Optimizatin" spelling is kept as-is: it is runtime output.
        print("Before Optimizatin:")
        print("I/O count: %s" % (self.in_io_count,))
        print("Seq size: %s" % (self.in_seq_size_count,))
        print("random count: %s" % (self.in_ran_io_count,))
        print("")
        print("After Optimizatin:")
        print("I/O count: %s" % (self.out_io_count,))
        print("Seq size: %s" % (self.out_seq_size,))
        print("random count: %s" % (self.out_ran_io_count,))
        print("==========================================================")