def test_union():
    """Union of an empty and a tallied observer reports the tallied stats."""
    empty_obs = util.ThruputObserver()
    tallied_obs = util.ThruputObserver()
    tallied_obs.update_tallies(n=10)

    merged = util.ThruputObserver.union((empty_obs, tallied_obs))

    def stats_tail(obs):
        # Skip PID / ID header -- the first two tokens of str() differ
        # between distinct observer instances
        return str(obs).split()[2:]

    assert stats_tail(merged) == stats_tail(tallied_obs)
def test_works_in_counter():
    """ThruputObservers accumulate correctly under collections.Counter addition."""
    from collections import Counter

    first = util.ThruputObserver(name='t1', n_total=3)
    first.start_block()
    first.stop_block(n=1, num_bytes=2)
    counter_a = Counter()
    counter_a['thruput'] = first

    second = util.ThruputObserver(name='t2', n_total=3)
    for _ in range(2):
        second.start_block()
        second.stop_block(n=1, num_bytes=2)
    counter_b = Counter()
    counter_b['thruput'] = second

    merged = (counter_a + counter_b)['thruput']
    assert merged.name == 't1'  # The first observer added to the counter wins
    assert merged.n == 3
    assert merged.num_bytes == 6
    assert merged.n_total == 3
    assert len(merged.ts) == 3
def __init__(self):
    """Set up a shared throughput observer plus a lock guarding it.

    NOTE: relies on ``logging_name`` and ``df`` from the enclosing scope.
    """
    self.overall_thruput = util.ThruputObserver(
        name=logging_name,
        log_on_del=True,
        n_total=df.count())  # TODO: count() can be slow
    self.overall_thruput.start_block()
    # Serializes updates to overall_thruput across concurrent readers
    self.lock = threading.Lock()
def test_some_blocks_thru():
    """Tallies recorded with new_block=True appear in the rendered report."""
    obs = util.ThruputObserver(name='test_thruput_observer', n_total=10)
    for _ in range(10):
        obs.update_tallies(n=1, new_block=True)
        obs.maybe_log_progress()
    obs.stop_block()

    report = str(obs)
    assert re.search('N thru.*10', report)
    assert re.search('N chunks.*10', report)
def test_some_thru():
    """The observe() context manager accumulates counts and wall time."""
    import random
    import time

    MAX_WAIT = 0.01
    obs = util.ThruputObserver()
    for _ in range(10):
        with obs.observe(n=1, num_bytes=1):
            time.sleep(random.random() * MAX_WAIT)

    report = str(obs)
    assert report
    # NOTE(review): upper bound assumes per-iteration overhead is negligible
    # relative to MAX_WAIT; could be flaky on a loaded machine
    assert obs.total_time <= 10. * MAX_WAIT
    assert re.search('N thru.*10', report)
    assert re.search('N chunks.*10', report)
def __call__(self, pid):
    """Yield rows for shard `pid`, tracking per-partition and overall thruput.

    Generator: yields each element produced by `spark_row_to_tf_element`
    for the rows of `df` whose `shard_col` equals `pid`.  Relies on
    `df`, `shard_col`, and `spark_row_to_tf_element` from the enclosing
    scope -- presumably a Spark DataFrame reader closure (TODO confirm).
    """
    # Convert pesky numpy boxed numeric types if needed
    import numpy as np
    if isinstance(pid, np.generic):
        pid = pid.item()
    part_df = df.filter(df[shard_col] == pid)
    # NOTE(review): repartition(100) looks like a parallelism knob for the
    # local iterator below -- confirm why 100 and not a config value
    part_rdd = part_df.rdd.repartition(100)
    rows = part_rdd.map(spark_row_to_tf_element).toLocalIterator()
    util.log.info("Reading partition %s " % pid)
    # Per-partition stats; log_on_del emits them when `t` is garbage-collected
    t = util.ThruputObserver(name='Partition %s' % pid, log_on_del=True)
    t.start_block()
    for row in rows:
        # Yield first, then tally: the deep-size measurement is charged to
        # this partition's block time along with the consumer's processing
        yield row
        t.update_tallies(n=1, num_bytes=util.get_size_of_deep(row))
    t.stop_block()
    util.log.info("Done reading partition %s, stats:\n %s" % (pid, t))
    with self.lock:
        # Since partitions are read in parallel, we need to maintain
        # independent timing stats for the main thread
        self.overall_thruput.stop_block(n=t.n, num_bytes=t.num_bytes)
        self.overall_thruput.maybe_log_progress(every_n=1)
        self.overall_thruput.start_block()
def test_some_thru2():
    """observe() yields the observer so tallies can be added inside the block."""
    obs = util.ThruputObserver()
    for _ in range(10):
        with obs.observe() as inner:
            inner.update_tallies(n=1)
    assert obs.n == 10
def test_empty():
    """A freshly-constructed observer still renders a non-empty report."""
    assert str(util.ThruputObserver())
def test_union():
    """Union of an empty and a tallied observer reports the tallied stats.

    NOTE(review): this SOURCE contains two functions named ``test_union``
    (see the earlier definition); if both live in one module the later def
    shadows the earlier and pytest collects only one -- confirm they belong
    to different files or rename one of them.
    """
    t1 = util.ThruputObserver()
    t2 = util.ThruputObserver()
    t2.update_tallies(n=10)
    u = util.ThruputObserver.union((t1, t2))
    # Fix: comparing full str() is fragile -- the first two tokens are a
    # PID / ID header that legitimately differs between distinct observer
    # instances.  Compare only the stats body, as the sibling test does.
    tail = lambda t: str(t).split()[2:]
    assert tail(u) == tail(t2)