Ejemplo n.º 1
0
def test_union():
    """Union of an untouched observer and a tallied one renders like the tallied one."""
    def stats_tokens(observer):
        # Drop the first two whitespace-separated tokens (PID / ID header).
        return str(observer).split()[2:]

    untouched = util.ThruputObserver()
    tallied = util.ThruputObserver()
    tallied.update_tallies(n=10)

    merged = util.ThruputObserver.union((untouched, tallied))
    assert stats_tokens(merged) == stats_tokens(tallied)
Ejemplo n.º 2
0
def test_works_in_counter():
    """Observers stored as Counter values are combined when the Counters are added."""
    from collections import Counter

    def run_blocks(observer, n_blocks):
        # Each timed block records one item and two bytes.
        for _ in range(n_blocks):
            observer.start_block()
            observer.stop_block(n=1, num_bytes=2)
        return observer

    first = run_blocks(util.ThruputObserver(name='t1', n_total=3), 1)
    first_counter = Counter()
    first_counter['thruput'] = first

    second = run_blocks(util.ThruputObserver(name='t2', n_total=3), 2)
    second_counter = Counter()
    second_counter['thruput'] = second

    combined = first_counter + second_counter
    merged = combined['thruput']

    # The observer from the left-hand Counter supplies the name.
    assert merged.name == 't1'
    # One block from the first observer plus two from the second.
    assert merged.n == 3
    assert merged.num_bytes == 6
    assert merged.n_total == 3
    assert len(merged.ts) == 3
Ejemplo n.º 3
0
 def __init__(self):
     """Create the overall throughput observer, start its first block, and make a lock."""
     # TODO: count() can be slow
     total = df.count()
     self.overall_thruput = util.ThruputObserver(
         name=logging_name,
         log_on_del=True,
         n_total=total,
     )
     self.overall_thruput.start_block()
     self.lock = threading.Lock()
Ejemplo n.º 4
0
def test_some_blocks_thru():
    """Ten one-item blocks should show up as 10 items and 10 chunks in the rendering."""
    n_blocks = 10
    observer = util.ThruputObserver(name='test_thruput_observer', n_total=n_blocks)
    for _ in range(n_blocks):
        observer.update_tallies(n=1, new_block=True)
        observer.maybe_log_progress()
    observer.stop_block()

    # Render afresh for each check, as both counts must appear in the summary.
    for pattern in ('N thru.*10', 'N chunks.*10'):
        assert re.search(pattern, str(observer))
Ejemplo n.º 5
0
def test_some_thru():
    """Observe ten short random sleeps, then check the rendering and total time."""
    import random
    import time

    n_rounds = 10
    max_wait = 0.01

    observer = util.ThruputObserver()
    for _ in range(n_rounds):
        with observer.observe(n=1, num_bytes=1):
            time.sleep(random.random() * max_wait)

    assert str(observer)
    # NOTE(review): wall-clock upper bound; could be flaky if sleeps overshoot
    # on a loaded machine -- confirm how total_time is measured.
    assert observer.total_time <= n_rounds * max_wait
    for pattern in ('N thru.*10', 'N chunks.*10'):
        assert re.search(pattern, str(observer))
Ejemplo n.º 6
0
        def __call__(self, pid):
            """Yield the elements of the partition whose `shard_col` equals `pid`.

            Rows are filtered from `df`, repartitioned, mapped through
            `spark_row_to_tf_element`, and streamed via a local iterator.
            Per-partition throughput is tracked while iterating; once the
            partition is exhausted its tallies are folded into
            `self.overall_thruput` under `self.lock`.
            """
            import numpy as np

            # numpy boxed scalars are unwrapped to plain Python values first.
            if isinstance(pid, np.generic):
                pid = pid.item()

            shard_df = df.filter(df[shard_col] == pid)
            shard_rdd = shard_df.rdd.repartition(100)
            elements = shard_rdd.map(spark_row_to_tf_element).toLocalIterator()

            util.log.info("Reading partition %s " % pid)
            observer = util.ThruputObserver(
                name='Partition %s' % pid,
                log_on_del=True,
            )
            observer.start_block()
            for element in elements:
                yield element
                # Tally only after the consumer has resumed the generator.
                observer.update_tallies(
                    n=1,
                    num_bytes=util.get_size_of_deep(element),
                )
            observer.stop_block()
            util.log.info(
                "Done reading partition %s, stats:\n %s" % (pid, observer))

            with self.lock:
                # Partitions are read in parallel, so the overall observer
                # keeps its own timing: close its current block with this
                # partition's totals, report, and open the next block.
                self.overall_thruput.stop_block(
                    n=observer.n,
                    num_bytes=observer.num_bytes,
                )
                self.overall_thruput.maybe_log_progress(every_n=1)
                self.overall_thruput.start_block()
0
def test_some_thru2():
    """Tallies updated via the object yielded by observe() are counted on the observer."""
    n_rounds = 10
    observer = util.ThruputObserver()
    for _ in range(n_rounds):
        with observer.observe() as block:
            block.update_tallies(n=1)
    assert observer.n == n_rounds
Ejemplo n.º 8
0
def test_empty():
    """An observer with no tallies still renders to a non-empty string."""
    observer = util.ThruputObserver()
    rendered = str(observer)
    assert rendered
Ejemplo n.º 9
0
def test_union():
    """Union of an untouched observer and a tallied one reports the tallied stats.

    Only the stats portion of the rendering is compared. The first two
    whitespace-separated tokens are skipped because they are assumed to be a
    per-instance PID / ID header, which would differ between the union and
    the original observer -- TODO confirm against ThruputObserver.__str__.
    """
    def stats_tokens(observer):
        # Comparing whole strings is brittle when the rendering embeds
        # per-instance identifiers; compare everything after the header.
        return str(observer).split()[2:]

    t1 = util.ThruputObserver()
    t2 = util.ThruputObserver()
    t2.update_tallies(n=10)
    u = util.ThruputObserver.union((t1, t2))
    assert stats_tokens(u) == stats_tokens(t2)