def test_basics(self):
    v = ('a' * 10, 'b' * 90)
    pickler = coders.PickleCoder()
    self.assertEqual(v, pickler.decode(pickler.encode(v)))
    pickler = coders.Base64PickleCoder()
    self.assertEqual(v, pickler.decode(pickler.encode(v)))
    self.assertEqual(
        coders.Base64PickleCoder().encode(v),
        base64.b64encode(coders.PickleCoder().encode(v)))

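# For context: PickleCoder round-trips arbitrary picklable Python objects,
# which is why the stateful DoFns below use it for their state specs.
# A minimal standalone sketch (illustrative payload, not from the source):
from apache_beam import coders

coder = coders.PickleCoder()
payload = {'key': [1, 2, 3], 'nested': ('a', 'b')}
assert coder.decode(coder.encode(payload)) == payload
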
def test_should_sample(self):
    # Order of magnitude more buckets than highest constant in code under test.
    buckets = [0] * 300
    # The seed is arbitrary and exists just to ensure this test is robust.
    # If you don't like this seed, try your own; the test should still pass.
    random.seed(1720)
    # Do enough runs that the expected hits even in the last buckets
    # is big enough to expect some statistical smoothing.
    total_runs = 10 * len(buckets)

    # Fill the buckets.
    for _ in range(total_runs):
        opcounts = OperationCounters(
            CounterFactory(), 'some-name', coders.PickleCoder(), 0)
        for i in range(len(buckets)):
            if opcounts.should_sample():
                buckets[i] += 1

    # Look at the buckets to see if they are likely.
    for i in range(10):
        self.assertEqual(total_runs, buckets[i])
    for i in range(10, len(buckets)):
        self.assertTrue(
            buckets[i] > 7 * total_runs / i,
            'i=%d, buckets[i]=%d, expected=%d, ratio=%f' % (
                i, buckets[i], 10 * total_runs / i,
                buckets[i] / (10.0 * total_runs / i)))
        self.assertTrue(
            buckets[i] < 14 * total_runs / i,
            'i=%d, buckets[i]=%d, expected=%d, ratio=%f' % (
                i, buckets[i], 10 * total_runs / i,
                buckets[i] / (10.0 * total_runs / i)))

def test_update_int(self):
    opcounts = OperationCounters(
        CounterFactory(), 'some-name', coders.PickleCoder(), 0)
    self.verify_counters(opcounts, 0)
    opcounts.update_from(GlobalWindows.windowed_value(1))
    opcounts.update_collect()
    self.verify_counters(opcounts, 1)

class SyncFn(beam.DoFn):
    STATE = userstate.BagStateSpec('state', coders.PickleCoder())

    def __init__(self, size):
        assert size > 0, 'Must provide a positive size'
        self.size = size

    def process(self, element, state=beam.DoFn.StateParam(STATE)):
        key, value = element

        # The bag holds a single dict mapping key -> buffered values.
        cache = list(state.read())
        if cache:
            cache = cache[0]
        else:
            cache = {}

        values = cache.get(key, [])
        values.append(value)
        if len(values) == self.size:
            if key in cache:
                del cache[key]
            yield tuple(values)
        else:
            cache[key] = values

        # Rewrite the state so only the latest cache is kept.
        state.clear()
        if cache:
            state.add(cache)

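# A possible wiring sketch for SyncFn (assumed, not from the source): stateful
# DoFns require a keyed PCollection, so the input elements are (key, value)
# pairs, and a tuple is emitted once `size` values have arrived for a key.
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('a', 1), ('a', 2), ('b', 10), ('a', 3)])
        | beam.ParDo(SyncFn(size=3))  # emits (1, 2, 3) once 'a' has 3 values
        | beam.Map(print))
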
def test_update_str(self):
    coder = coders.PickleCoder()
    opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
    self.verify_counters(opcounts, 0, float('nan'))
    value = GlobalWindows.windowed_value('abcde')
    opcounts.update_from(value)
    estimated_size = coder.estimate_size(value)
    self.verify_counters(opcounts, 1, estimated_size)

class IndexAssigningStatefulDoFn(beam.DoFn):
    INDEX_STATE = CombiningValueStateSpec(
        name="index", coder=coders.PickleCoder(), combine_fn=sum)

    def process(self, element, index=beam.DoFn.StateParam(INDEX_STATE)):
        _, value = element
        current_index = index.read()
        index.add(1)
        yield (current_index, value)

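# A possible wiring sketch (assumed, not from the source): the combining value
# state of an unseen key reads as sum([]) == 0, so the first element per key
# gets index 0 and the counter grows by one per element.
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('user', 'a'), ('user', 'b'), ('user', 'c')])
        | beam.ParDo(IndexAssigningStatefulDoFn())
        | beam.Map(print))  # e.g. (0, 'a'), (1, 'b'), (2, 'c')
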
def test_update_old_object(self):
    coder = coders.PickleCoder()
    opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
    self.verify_counters(opcounts, 0, float('nan'))
    obj = OldClassThatDoesNotImplementLen()
    value = GlobalWindows.windowed_value(obj)
    opcounts.update_from(value)
    estimated_size = coder.estimate_size(value)
    self.verify_counters(opcounts, 1, estimated_size)

class _StatefulJobOutputsFn(beam.DoFn):
    STATE = userstate.BagStateSpec('state', coders.PickleCoder())

    def process(self, element, level, state=beam.DoFn.StateParam(STATE)):
        assert level in JobAggregateLevel.STATEFUL
        # example payload structure...
        # {
        #     'source': Any,
        #     'graphid': 0,
        #     'jobtasks': {0: 3, 1: 3},
        #     'jobid': 0,
        #     'taskid': 2,
        #     'output': [
        #         '/tmp/job-0_output-0.task-2.ext',
        #         '/tmp/job-0_output-1.task-2.ext',
        #     ],
        # }
        _, payload = element

        # There are two values we track; both differ depending on the
        # aggregation type/level desired.
        #
        # - key  : aggregation per unique value
        # - size : total number of times we expect to see `key`
        key = payload[level]
        if level == JobAggregateLevel.JOB:
            # str(key) is to deal with json making all dict keys strings
            size = payload['jobtasks'][str(key)]
        elif level == JobAggregateLevel.GRAPH:
            size = sum(payload['jobtasks'].values())
        else:
            raise NotImplementedError

        cache = dict(state.read())
        seen, data = cache.get(key, (0, []))
        seen += 1
        data.extend(payload['output'])
        cache[key] = (seen, data)

        state.clear()
        # Iterate over a copy so popping a fired key does not mutate the dict
        # while it is being iterated.
        for k, v in list(cache.items()):
            # The key fires once we have seen it `size` times.
            if size == v[0]:
                # cprint('fire-{}: {}'.format(level, k), 'red', attrs=['bold'])
                yield cache.pop(k)[1]
            else:
                state.add((k, v))

def test_update_multiple(self):
    coder = coders.PickleCoder()
    total_size = 0
    opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
    self.verify_counters(opcounts, 0, float('nan'))

    value = GlobalWindows.windowed_value('abcde')
    opcounts.update_from(value)
    total_size += coder.estimate_size(value)

    value = GlobalWindows.windowed_value('defghij')
    opcounts.update_from(value)
    total_size += coder.estimate_size(value)
    self.verify_counters(opcounts, 2, float(total_size) / 2)

    value = GlobalWindows.windowed_value('klmnop')
    opcounts.update_from(value)
    total_size += coder.estimate_size(value)
    self.verify_counters(opcounts, 3, float(total_size) / 3)

def run_Create(self, transform_node):
    transform = transform_node.transform
    step = self._add_step(
        TransformNames.CREATE_PCOLLECTION, transform_node.full_label,
        transform_node)
    # TODO(silviuc): Eventually use a coder based on typecoders.
    # Note that we base64-encode values here so that the service will accept
    # the values.
    element_coder = coders.PickleCoder()
    step.add_property(
        PropertyNames.ELEMENT,
        [base64.b64encode(element_coder.encode(v)) for v in transform.value])
    # The service expects a WindowedValueCoder here, so we wrap the actual
    # encoding in a WindowedValueCoder.
    step.encoding = self._get_cloud_encoding(
        coders.WindowedValueCoder(element_coder))
    step.add_property(
        PropertyNames.OUTPUT_INFO,
        [{PropertyNames.USER_NAME: (
            '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
          PropertyNames.ENCODING: step.encoding,
          PropertyNames.OUTPUT_NAME: PropertyNames.OUT}])

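# A minimal standalone sketch of the element encoding performed in run_Create
# above (illustrative value, outside the runner plumbing): the value is
# pickled by the coder and then base64-encoded for the service, and both
# steps can be reversed to recover the original element.
import base64
from apache_beam import coders

element_coder = coders.PickleCoder()
encoded_for_service = base64.b64encode(element_coder.encode(('key', 42)))
assert element_coder.decode(base64.b64decode(encoded_for_service)) == ('key', 42)
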
class SolveDoFn(beam.DoFn):
    PREV_TIMESTAMP = BagStateSpec(name="timestamp_state", coder=coders.PickleCoder())
    PREV_ELEMENTS = BagStateSpec(name="elements_state", coder=coders.PickleCoder())
    PREV_MODEL = BagStateSpec(name="model_state", coder=coders.PickleCoder())
    PREV_SAMPLESET = BagStateSpec(name="sampleset_state", coder=coders.PickleCoder())

    def process(
        self,
        value,
        timestamp=beam.DoFn.TimestampParam,
        timestamp_state=beam.DoFn.StateParam(PREV_TIMESTAMP),
        elements_state=beam.DoFn.StateParam(PREV_ELEMENTS),
        model_state=beam.DoFn.StateParam(PREV_MODEL),
        sampleset_state=beam.DoFn.StateParam(PREV_SAMPLESET),
        algorithm=None,
        algorithm_options=None,
        map_fn=None,
        solve_fn=None,
        unmap_fn=None,
        solver=LocalSolver(exact=False),  # default solver
        initial_mtype=sawatabi.constants.MODEL_ISING,
    ):
        _, elements = value

        # Sort by the event time.
        # When sorting a list of tuples, the first element of each tuple is used
        # as the sort key by default, so a plain `sorted` is enough.
        sorted_elements = sorted(elements)

        # Materialize the state generators into lists.
        timestamp_state_as_list = list(timestamp_state.read())
        elements_state_as_list = list(elements_state.read())
        model_state_as_list = list(model_state.read())
        sampleset_state_as_list = list(sampleset_state.read())

        # Extract the previous timestamp, elements, model, and sampleset from state.
        if len(timestamp_state_as_list) == 0:
            prev_timestamp = -1.0
        else:
            prev_timestamp = timestamp_state_as_list[-1]
        if len(elements_state_as_list) == 0:
            prev_elements = []
        else:
            prev_elements = elements_state_as_list[-1]
        if len(model_state_as_list) == 0:
            prev_model = sawatabi.model.LogicalModel(mtype=initial_mtype)
        else:
            prev_model = model_state_as_list[-1]
        if len(sampleset_state_as_list) == 0:
            prev_sampleset = None
        else:
            prev_sampleset = sampleset_state_as_list[-1]

        # Sometimes, when the sliding window algorithm is used for bounded data
        # (such as a local file), we may receive an outdated event whose timestamp
        # is older than the timestamp of a previously processed event.
        if float(timestamp) < float(prev_timestamp):
            yield (
                f"The received event is outdated: Timestamp is {timestamp.to_utc_datetime()}, "
                + f"while an event with timestamp of {prev_timestamp.to_utc_datetime()} has already been processed."
            )
            return

        # Algorithm specific operations
        # Incremental: Append the current window to all the previous data.
        if algorithm == sawatabi.constants.ALGORITHM_INCREMENTAL:
            sorted_elements.extend(prev_elements)
            sorted_elements = sorted(sorted_elements)
        # Partial: Merge the current window with the specified data.
        elif algorithm == sawatabi.constants.ALGORITHM_PARTIAL:
            filter_fn = algorithm_options["filter_fn"]
            filtered = filter(filter_fn, prev_elements)
            sorted_elements = list(filtered) + sorted_elements
            sorted_elements = sorted(sorted_elements)

        # Resolve outgoing elements in this iteration
        def resolve_outgoing(prev_elements, sorted_elements):
            outgoing = []
            for p in prev_elements:
                if p[0] >= sorted_elements[0][0]:
                    break
                outgoing.append(p)
            return outgoing

        outgoing = resolve_outgoing(prev_elements, sorted_elements)

        # Resolve incoming elements in this iteration
        def resolve_incoming(prev_elements, sorted_elements):
            incoming = []
            if len(prev_elements) == 0:
                incoming = sorted_elements
            else:
                for v in reversed(sorted_elements):
                    if v[0] <= prev_elements[-1][0]:
                        break
                    incoming.insert(0, v)
            return incoming

        incoming = resolve_incoming(prev_elements, sorted_elements)

        # Clear the BagState so we hold only the latest state, then
        # register the new timestamp and elements in the states.
        timestamp_state.clear()
        timestamp_state.add(timestamp)
        elements_state.clear()
        elements_state.add(sorted_elements)

        # Map problem input to the model
        try:
            model = map_fn(prev_model, prev_sampleset, sorted_elements, incoming, outgoing)
        except Exception as e:
            yield f"Failed to map: {e}\n{traceback.format_exc()}"
            return

        # Clear the BagState so we hold only the latest state, then
        # register the new model in the state.
        model_state.clear()
        model_state.add(model)

        # Algorithm specific operations
        # Attenuation: Update scale based on data timestamp.
        if algorithm == sawatabi.constants.ALGORITHM_ATTENUATION:
            model.to_physical()  # Resolve removed interactions. TODO: Deal with placeholders.
            ref_timestamp = model._interactions_array[algorithm_options["attenuation.key"]]
            min_ts = min(ref_timestamp)
            max_ts = max(ref_timestamp)
            min_scale = algorithm_options["attenuation.min_scale"]
            if min_ts < max_ts:
                for i, t in enumerate(ref_timestamp):
                    new_scale = (1.0 - min_scale) / (max_ts - min_ts) * (t - min_ts) + min_scale
                    model._interactions_array["scale"][i] = new_scale

        # Solve and unmap to the solution
        try:
            sampleset = solve_fn(solver, model, prev_sampleset, sorted_elements, incoming, outgoing)
        except Exception as e:
            yield f"Failed to solve: {e}\n{traceback.format_exc()}"
            return

        # Clear the BagState so we hold only the latest state, then
        # register the new sampleset in the state.
        sampleset_state.clear()
        sampleset_state.add(sampleset)

        try:
            yield unmap_fn(sampleset, sorted_elements, incoming, outgoing)
        except Exception as e:
            yield f"Failed to unmap: {e}\n{traceback.format_exc()}"

def test_equality(self):
    self.assertEqual(coders.PickleCoder(), coders.PickleCoder())
    self.assertEqual(coders.Base64PickleCoder(), coders.Base64PickleCoder())
    self.assertNotEqual(coders.Base64PickleCoder(), coders.PickleCoder())
    self.assertNotEqual(coders.Base64PickleCoder(), object())