def find_all(pcoll, regex, group=0, outputEmpty=True):
  """
  Returns the matches if a portion of the line matches the Regex. By default,
  a list of group 0 matches is returned, including empty items. To get all
  groups, pass the `Regex.ALL` flag in the `group` parameter, which returns
  the groups in tuple format.

  Args:
    regex: the regular expression string or (re.compile) pattern.
    group: (optional) name of the group, it can be an integer or a string
      value.
    outputEmpty: (optional) Should empty be output. True to output empties
      and false if not.
  """
  regex = Regex._regex_compile(regex)

  def _process(element):
    matches = regex.finditer(element)
    if group == Regex.ALL:
      yield [(m.group(), m.groups()[0]) for m in matches
             if outputEmpty or m.groups()[0]]
    else:
      yield [m.group(group) for m in matches if outputEmpty or m.group(group)]

  return pcoll | FlatMap(_process)
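# --- Usage sketch (not part of the transform above) ---
# A hedged example of how find_all might be applied in a pipeline, assuming
# these helpers are exposed as in Apache Beam's `beam.Regex` utility class;
# the sample strings below are made up for illustration.
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['a b c', 'd e'])
      # Emits one list per element, e.g. ['a', 'b', 'c'] for the first line.
      | beam.Regex.find_all(r'[a-z]+')
      | beam.Map(print))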
def expand(self, pcoll):
  class ReifyTimestamps(DoFn):
    def process(self, element, timestamp=DoFn.TimestampParam):
      yield element[0], TimestampedValue(element[1], timestamp)

  class RestoreTimestamps(DoFn):
    def process(self, element, window=DoFn.WindowParam):
      # Pass the current window since _IdentityWindowFn wouldn't know how
      # to generate it.
      yield windowed_value.WindowedValue(
          (element[0], element[1].value), element[1].timestamp, [window])

  windowing_saved = pcoll.windowing

  # The linter is confused.
  # pylint: disable=abstract-class-instantiated
  result = (
      pcoll
      | ParDo(ReifyTimestamps())
      | 'IdentityWindow' >> WindowInto(
          _IdentityWindowFn(windowing_saved.windowfn.get_window_coder()),
          trigger=AfterCount(1),
          accumulation_mode=AccumulationMode.DISCARDING,
          timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST,
      )
      | GroupByKey()
      | 'ExpandIterable' >> FlatMap(
          lambda e: [(e[0], value) for value in e[1]])
      | ParDo(RestoreTimestamps()))
  result._windowing = windowing_saved
  return result
def expand(self, pcoll):
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case we can use a trivial trigger driver
    # and avoid the (expensive) window param.
    globally_windowed = window.GlobalWindows.windowed_value(None)
    MIN_TIMESTAMP = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      if timestamp == MIN_TIMESTAMP:
        timestamp = None
      return key, (value, timestamp)

    def restore_timestamps(element):
      key, values = element
      return [
          globally_windowed.with_value((key, value)) if timestamp is None else
          window.GlobalWindows.windowed_value((key, value), timestamp)
          for (value, timestamp) in values
      ]
  else:
    # typing: All conditional function variants must have identical signatures
    def reify_timestamps(  # type: ignore[misc]
        element, timestamp=DoFn.TimestampParam, window=DoFn.WindowParam):
      key, value = element
      # Transport the window as part of the value and restore it later.
      return key, windowed_value.WindowedValue(value, timestamp, [window])

    def restore_timestamps(element):
      key, windowed_values = element
      return [wv.with_value((key, wv.value)) for wv in windowed_values]

  ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

  # TODO(BEAM-8104) Using global window as one of the standard window.
  # This is to mitigate the Dataflow Java Runner Harness limitation to
  # accept only standard coders.
  ungrouped._windowing = Windowing(
      window.GlobalWindows(),
      triggerfn=Always(),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  result = (
      ungrouped
      | GroupByKey()
      | FlatMap(restore_timestamps).with_output_types(Any))
  result._windowing = windowing_saved
  return result
def expand(self, pcoll):
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case we can use a trivial trigger driver
    # and avoid the (expensive) window param.
    globally_windowed = window.GlobalWindows.windowed_value(None)
    window_fn = window.GlobalWindows()
    MIN_TIMESTAMP = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      if timestamp == MIN_TIMESTAMP:
        timestamp = None
      return key, (value, timestamp)

    def restore_timestamps(element):
      key, values = element
      return [
          globally_windowed.with_value((key, value)) if timestamp is None else
          window.GlobalWindows.windowed_value((key, value), timestamp)
          for (value, timestamp) in values
      ]
  else:
    # The linter is confused.
    # hash(1) is used to force "runtime" selection of _IdentityWindowFn
    # pylint: disable=abstract-class-instantiated
    cls = hash(1) and _IdentityWindowFn
    window_fn = cls(windowing_saved.windowfn.get_window_coder())

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      return key, TimestampedValue(value, timestamp)

    def restore_timestamps(element, window=DoFn.WindowParam):
      # Pass the current window since _IdentityWindowFn wouldn't know how
      # to generate it.
      key, values = element
      return [
          windowed_value.WindowedValue((key, value.value), value.timestamp,
                                       [window]) for value in values
      ]

  ungrouped = pcoll | Map(reify_timestamps)
  ungrouped._windowing = Windowing(
      window_fn,
      triggerfn=AfterCount(1),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  result = (ungrouped | GroupByKey() | FlatMap(restore_timestamps))
  result._windowing = windowing_saved
  return result
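# --- Usage sketch (not part of the expand() variants above) ---
# A hedged illustration: expand() implementations like the ones above back
# Beam's Reshuffle/ReshufflePerKey transforms, which are applied as ordinary
# transforms to redistribute elements while preserving timestamps and the
# original windowing. The element values here are made up.
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('key', i) for i in range(5)])
      | beam.Reshuffle()  # materialization point; elements pass through unchanged
      | beam.Map(print))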
def expand(self, pvalue):
  keyed_pc = (pvalue | 'AssignKey' >> Map(lambda x: (uuid.uuid4(), x)))
  if keyed_pc.windowing.windowfn.is_merging():
    raise ValueError(
        'Transform ReadAllFiles cannot be used in the presence '
        'of merging windows')
  if not isinstance(keyed_pc.windowing.triggerfn, DefaultTrigger):
    raise ValueError(
        'Transform ReadAllFiles cannot be used in the presence '
        'of non-trivial triggers')

  return (
      keyed_pc
      | 'GroupByKey' >> GroupByKey()
      # Using FlatMap below due to the possibility of key collisions.
      | 'DropKey' >> FlatMap(lambda k_values: k_values[1]))
def all_matches(pcoll, regex):
  """
  Returns all matches (groups) if zero or more characters at the beginning
  of the string match the regular expression.

  Args:
    regex: the regular expression string or (re.compile) pattern.
  """
  regex = Regex._regex_compile(regex)

  def _process(element):
    m = regex.match(element)
    if m:
      yield [m.group(ix) for ix in range(m.lastindex + 1)]

  return pcoll | FlatMap(_process)
def find(pcoll, regex, group=0):
  """
  Returns the matches if a portion of the line matches the Regex. Returns
  the entire group (group 0) by default. Group can be an integer value or a
  string value.

  Args:
    regex: the regular expression string or (re.compile) pattern.
    group: (optional) name of the group, it can be an integer or a string
      value.
  """
  regex = Regex._regex_compile(regex)

  def _process(element):
    r = regex.search(element)
    if r:
      yield r.group(group)

  return pcoll | FlatMap(_process)
def matches_kv(pcoll, regex, keyGroup, valueGroup=0):
  """
  Returns the KV pairs if the string matches the regular expression, deriving
  the key and value from the specified groups of the regular expression.

  Args:
    regex: the regular expression string or (re.compile) pattern.
    keyGroup: The Regex group to use as the key. Can be int or str.
    valueGroup: (optional) The Regex group to use as the value. Can be int or
      str. The default value "0" returns the entire matched string.
  """
  regex = Regex._regex_compile(regex)

  def _process(element):
    match = regex.match(element)
    if match:
      yield (match.group(keyGroup), match.group(valueGroup))

  return pcoll | FlatMap(_process)
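# --- Usage sketch (not part of matches_kv above) ---
# A hedged example of extracting key/value pairs with named groups, assuming
# the helper is exposed as beam.Regex.matches_kv; the input lines are made up.
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['cpu=93', 'mem=71'])
      # Named groups work because match.group() accepts a group name as well
      # as an index, so this emits ('cpu', '93') and ('mem', '71').
      | beam.Regex.matches_kv(r'(?P<k>\w+)=(?P<v>\w+)', 'k', 'v')
      | beam.Map(print))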
def split(pcoll, regex, outputEmpty=False):
  """
  Returns the list of strings produced by splitting the element on the
  regular expression. It will not output empty items by default.

  Args:
    regex: the regular expression string or (re.compile) pattern.
    outputEmpty: (optional) Should empty be output. True to output empties
      and false if not.
  """
  regex = Regex._regex_compile(regex)
  outputEmpty = bool(outputEmpty)

  def _process(element):
    r = regex.split(element)
    if r and not outputEmpty:
      r = list(filter(None, r))
    yield r

  return pcoll | FlatMap(_process)
def find_kv(pcoll, regex, keyGroup, valueGroup=0):
  """
  Returns the matches if a portion of the line matches the Regex. Returns the
  specified groups as the key and value pair.

  Args:
    regex: the regular expression string or (re.compile) pattern.
    keyGroup: The Regex group to use as the key. Can be int or str.
    valueGroup: (optional) The Regex group to use as the value. Can be int or
      str. The default value "0" returns the entire matched string.
  """
  regex = Regex._regex_compile(regex)

  def _process(element):
    matches = regex.finditer(element)
    if matches:
      for match in matches:
        yield (match.group(keyGroup), match.group(valueGroup))

  return pcoll | FlatMap(_process)
def matches(pcoll, regex, group=0):
  """
  Returns the matches (group 0 by default) if zero or more characters at the
  beginning of the string match the regular expression. To match the entire
  string, add a "$" sign at the end of the regex expression.

  Group can be an integer value or a string value.

  Args:
    regex: the regular expression string or (re.compile) pattern.
    group: (optional) name/number of the group, it can be an integer or a
      string value. Defaults to 0, meaning the entire matched string will be
      returned.
  """
  regex = Regex._regex_compile(regex)

  def _process(element):
    m = regex.match(element)
    if m:
      yield m.group(group)

  return pcoll | FlatMap(_process)
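# --- Note on matches vs. find (hedged, illustrative) ---
# matches() uses re.match, so the pattern must succeed at the start of the
# element, while find() uses re.search and accepts a hit anywhere in the
# element. The plain-re snippet below mirrors that difference; the sample
# string is made up.
import re

pattern = re.compile(r'[0-9]+')
print(pattern.match('abc 123'))   # None: no digits at the start of the string
print(pattern.search('abc 123'))  # a Match object for '123' found mid-string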
def test_avro_it(self):
  num_records = self.test_pipeline.get_option('records')
  num_records = int(num_records) if num_records else 1000000

  # Seed a `PCollection` with indices that will each be FlatMap'd into
  # `batch_size` records, to avoid having a too-large list in memory at
  # the outset
  batch_size = self.test_pipeline.get_option('batch-size')
  batch_size = int(batch_size) if batch_size else 10000

  # pylint: disable=range-builtin-not-iterating
  batches = range(int(num_records / batch_size))

  def batch_indices(start):
    # pylint: disable=range-builtin-not-iterating
    return range(start * batch_size, (start + 1) * batch_size)

  # A `PCollection` with `num_records` avro records
  records_pcoll = \
      self.test_pipeline \
      | 'create-batches' >> Create(batches) \
      | 'expand-batches' >> FlatMap(batch_indices) \
      | 'create-records' >> Map(record)

  fastavro_output = '/'.join([self.output, 'fastavro'])
  avro_output = '/'.join([self.output, 'avro'])

  self.addCleanup(delete_files, [self.output + '*'])

  # pylint: disable=expression-not-assigned
  records_pcoll \
      | 'write_fastavro' >> WriteToAvro(
          fastavro_output,
          self.SCHEMA,
          use_fastavro=True
      )

  # pylint: disable=expression-not-assigned
  records_pcoll \
      | 'write_avro' >> WriteToAvro(
          avro_output,
          self.SCHEMA,
          use_fastavro=False
      )

  result = self.test_pipeline.run()
  result.wait_until_finish()
  assert result.state == PipelineState.DONE

  fastavro_read_pipeline = TestPipeline(is_integration_test=True)

  fastavro_records = \
      fastavro_read_pipeline \
      | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
      | 'read-fastavro' >> ReadAllFromAvro(use_fastavro=True) \
      | Map(lambda rec: (rec['number'], rec))

  avro_records = \
      fastavro_read_pipeline \
      | 'create-avro' >> Create(['%s*' % avro_output]) \
      | 'read-avro' >> ReadAllFromAvro(use_fastavro=False) \
      | Map(lambda rec: (rec['number'], rec))

  def check(elem):
    v = elem[1]

    def assertEqual(l, r):
      if l != r:
        raise BeamAssertException('Assertion failed: %s == %s' % (l, r))

    assertEqual(v.keys(), ['avro', 'fastavro'])
    avro_values = v['avro']
    fastavro_values = v['fastavro']
    assertEqual(avro_values, fastavro_values)
    assertEqual(len(avro_values), 1)

  # pylint: disable=expression-not-assigned
  {
      'avro': avro_records,
      'fastavro': fastavro_records
  } \
      | CoGroupByKey() \
      | Map(check)

  fastavro_read_pipeline.run().wait_until_finish()
  assert result.state == PipelineState.DONE
def test_avro_it(self):
  num_records = self.test_pipeline.get_option('records')
  num_records = int(num_records) if num_records else 1000000
  fastavro_output = '/'.join([self.output, 'fastavro'])

  # Seed a `PCollection` with indices that will each be FlatMap'd into
  # `batch_size` records, to avoid having a too-large list in memory at
  # the outset
  batch_size = self.test_pipeline.get_option('batch-size')
  batch_size = int(batch_size) if batch_size else 10000

  # pylint: disable=bad-option-value
  batches = range(int(num_records / batch_size))

  def batch_indices(start):
    # pylint: disable=bad-option-value
    return range(start * batch_size, (start + 1) * batch_size)

  # A `PCollection` with `num_records` avro records
  records_pcoll = \
      self.test_pipeline \
      | 'create-batches' >> Create(batches) \
      | 'expand-batches' >> FlatMap(batch_indices) \
      | 'create-records' >> Map(record)

  # pylint: disable=expression-not-assigned
  records_pcoll \
      | 'write_fastavro' >> WriteToAvro(
          fastavro_output,
          parse_schema(json.loads(self.SCHEMA_STRING)),
      )
  result = self.test_pipeline.run()
  result.wait_until_finish()

  fastavro_pcoll = self.test_pipeline \
      | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
      | 'read-fastavro' >> ReadAllFromAvro()

  mapped_fastavro_pcoll = fastavro_pcoll | "map_fastavro" >> Map(
      lambda x: (x['number'], x))
  mapped_record_pcoll = records_pcoll | "map_record" >> Map(
      lambda x: (x['number'], x))

  def validate_record(elem):
    v = elem[1]

    def assertEqual(l, r):
      if l != r:
        raise BeamAssertException('Assertion failed: %s == %s' % (l, r))

    assertEqual(sorted(v.keys()), ['fastavro', 'record_pcoll'])
    record_pcoll_values = v['record_pcoll']
    fastavro_values = v['fastavro']
    assertEqual(record_pcoll_values, fastavro_values)
    assertEqual(len(record_pcoll_values), 1)

  {
      "record_pcoll": mapped_record_pcoll,
      "fastavro": mapped_fastavro_pcoll
  } | CoGroupByKey() | Map(validate_record)

  result = self.test_pipeline.run()
  result.wait_until_finish()

  self.addCleanup(delete_files, [self.output])
  assert result.state == PipelineState.DONE
def main():
  project = 'chromeperf'
  options = PipelineOptions()
  options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
  options.view_as(GoogleCloudOptions).project = project
  bq_export_options = options.view_as(BqExportOptions)

  p = beam.Pipeline(options=options)
  entities_read = Metrics.counter('main', 'entities_read')
  failed_entity_transforms = Metrics.counter('main',
                                             'failed_entity_transforms')

  """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.rows_test`
  (revision INT64 NOT NULL,
   value FLOAT64 NOT NULL,
   std_error FLOAT64,
   `timestamp` TIMESTAMP NOT NULL,
   test STRING NOT NULL,
   master STRING,
   bot STRING,
   properties STRING)
  PARTITION BY DATE(`timestamp`);
  """  # pylint: disable=pointless-string-statement
  bq_row_schema = {
      'fields': [
          {'name': 'revision', 'type': 'INT64', 'mode': 'REQUIRED'},
          {'name': 'value', 'type': 'FLOAT', 'mode': 'REQUIRED'},
          {'name': 'std_error', 'type': 'FLOAT', 'mode': 'NULLABLE'},
          {'name': 'timestamp', 'type': 'TIMESTAMP', 'mode': 'REQUIRED'},
          {'name': 'test', 'type': 'STRING', 'mode': 'REQUIRED'},
          {'name': 'master', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'bot', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'properties', 'type': 'STRING', 'mode': 'NULLABLE'},
      ]
  }

  def RowEntityToRowDict(entity):
    entities_read.inc()
    try:
      d = {
          'revision': entity.key.id,
          'value': FloatHack(entity['value']),
          'std_error': FloatHack(entity.get('error')),
          'timestamp': entity['timestamp'].isoformat(),
          'test': entity.key.parent.name,
      }
      # Add the expando properties as a JSON-encoded dict.
      properties = {}
      for key, value in entity.items():
        if key in d or key in ['parent_test', 'error']:
          # skip properties with dedicated columns.
          continue
        if isinstance(value, float):
          value = FloatHack(value)
        properties[key] = value
      d['properties'] = json.dumps(properties) if properties else None
      # Add columns derived from test: master, bot.
      test_path_parts = d['test'].split('/', 2)
      if len(test_path_parts) >= 3:
        d['master'] = test_path_parts[0]
        d['bot'] = test_path_parts[1]
      return [d]
    except KeyError:
      logging.getLogger().exception('Failed to convert Row')
      failed_entity_transforms.inc()
      return []

  row_query_params = dict(project=project, kind='Row')
  row_entities = (
      p
      | 'ReadFromDatastore(Row)' >> ReadTimestampRangeFromDatastore(
          row_query_params,
          time_range_provider=bq_export_options.GetTimeRangeProvider(),
          step=datetime.timedelta(minutes=5)))

  row_dicts = (
      row_entities | 'ConvertEntityToRow(Row)' >> FlatMap(RowEntityToRowDict))

  table_name = '{}:chromeperf_dashboard_data.rows{}'.format(
      project, bq_export_options.table_suffix)
  _ = row_dicts | 'WriteToBigQuery(rows)' >> WriteToPartitionedBigQuery(
      table_name, bq_row_schema)

  result = p.run()
  result.wait_until_finish()
  PrintCounters(result)
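# --- Illustrative example (not part of main() above) ---
# How RowEntityToRowDict derives the master and bot columns from a Row's test
# path using str.split with maxsplit=2; the path below is hypothetical.
test_path = 'ChromiumPerf/linux-perf/speedometer2/Total'
parts = test_path.split('/', 2)
# parts == ['ChromiumPerf', 'linux-perf', 'speedometer2/Total']
master, bot = parts[0], parts[1]  # the full path stays in the 'test' column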
def main():
  project = 'chromeperf'
  options = PipelineOptions()
  options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
  options.view_as(GoogleCloudOptions).project = project
  bq_export_options = options.view_as(BqExportOptions)

  p = beam.Pipeline(options=options)
  entities_read = Metrics.counter('main', 'entities_read')
  failed_entity_transforms = Metrics.counter('main',
                                             'failed_entity_transforms')
  row_conflicts = Metrics.counter('main', 'row_conflicts')
  multiple_histograms_for_row = Metrics.counter('main',
                                                'multiple_histograms_for_row')
  orphaned_histogram = Metrics.counter('main', 'orphaned_histogram')

  """
  CREATE TABLE `chromeperf.chromeperf_dashboard_rows.<MASTER>`
  (revision INT64 NOT NULL,
   value FLOAT64 NOT NULL,
   std_error FLOAT64,
   `timestamp` TIMESTAMP NOT NULL,
   master STRING NOT NULL,
   bot STRING NOT NULL,
   measurement STRING,
   test STRING NOT NULL,
   properties STRING,
   sample_values ARRAY<FLOAT64>)
  PARTITION BY DATE(`timestamp`)
  CLUSTER BY master, bot, measurement;
  """  # pylint: disable=pointless-string-statement
  bq_row_schema = {
      'fields': [
          {'name': 'revision', 'type': 'INT64', 'mode': 'REQUIRED'},
          {'name': 'value', 'type': 'FLOAT', 'mode': 'REQUIRED'},
          {'name': 'std_error', 'type': 'FLOAT', 'mode': 'NULLABLE'},
          {'name': 'timestamp', 'type': 'TIMESTAMP', 'mode': 'REQUIRED'},
          {'name': 'master', 'type': 'STRING', 'mode': 'REQUIRED'},
          {'name': 'bot', 'type': 'STRING', 'mode': 'REQUIRED'},
          {'name': 'measurement', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'test', 'type': 'STRING', 'mode': 'REQUIRED'},
          {'name': 'properties', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'sample_values', 'type': 'FLOAT', 'mode': 'REPEATED'},
      ]
  }

  def RowEntityToRowDict(entity):
    entities_read.inc()
    try:
      d = {
          'revision': entity.key.id,
          'value': FloatHack(entity['value']),
          'std_error': FloatHack(entity.get('error')),
          'timestamp': entity['timestamp'].isoformat(),
          'test': entity.key.parent.name,
      }
      # Add the expando properties as a JSON-encoded dict.
      properties = {}
      for key, value in entity.items():
        if key in d or key in ['parent_test', 'error']:
          # skip properties with dedicated columns.
          continue
        if isinstance(value, float):
          value = FloatHack(value)
        properties[key] = value
      d['properties'] = json.dumps(properties) if properties else None
      # Add columns derived from test: master, bot.
      test_path_parts = d['test'].split('/', 2)
      if len(test_path_parts) >= 3:
        d['master'] = test_path_parts[0]
        d['bot'] = test_path_parts[1]
        d['measurement'] = '/'.join(test_path_parts[2:])
      return [d]
    except KeyError:
      logging.getLogger().exception('Failed to convert Row')
      failed_entity_transforms.inc()
      return []

  row_query_params = dict(project=project, kind='Row')
  row_entities = (
      p
      | 'ReadFromDatastore(Row)' >> ReadTimestampRangeFromDatastore(
          row_query_params,
          time_range_provider=bq_export_options.GetTimeRangeProvider(),
          step=datetime.timedelta(minutes=5)))

  row_dicts = (
      row_entities | 'ConvertEntityToDict(Row)' >> FlatMap(RowEntityToRowDict))

  # The sample_values are not found in the Row entity. So we have to fetch all
  # the corresponding Histogram entities and join them with our collection of
  # Rows (by using test + revision as the join key). We also need to unpack
  # the sample values arrays out of the zlib-compressed JSON stored in the
  # Histogram's "data" property.
  def HistogramEntityToDict(entity):
    """Returns dicts with keys: 'test', 'revision', 'sample_values'."""
    entities_read.inc()
    try:
      data = entity['data']
    except KeyError:
      logging.getLogger().exception('Histogram missing "data" field')
      failed_entity_transforms.inc()
      return []
    try:
      json_str = zlib.decompress(data)
    except zlib.error:
      logging.getLogger().exception('Histogram data not valid zlib: %r', data)
      failed_entity_transforms.inc()
      return []
    try:
      data_dict = json.loads(json_str)
    except json.JSONDecodeError:
      logging.getLogger().exception('Histogram data not valid json.')
      failed_entity_transforms.inc()
      return []
    sample_values = data_dict.get('sampleValues', [])
    if not isinstance(sample_values, list):
      logging.getLogger().exception(
          'Histogram data.sampleValues not valid list.')
      failed_entity_transforms.inc()
      return []
    count = len(sample_values)
    sample_values = [v for v in sample_values if v is not None]
    if len(sample_values) != count:
      logging.getLogger().warn(
          'Histogram data.sampleValues contains null: %r', entity.key)
    for v in sample_values:
      if not isinstance(v, (int, float)):
        logging.getLogger().exception(
            'Histogram data.sampleValues contains non-numeric: %r', v)
        failed_entity_transforms.inc()
        return []
    try:
      return [{
          'test': entity['test'].name,
          'revision': entity['revision'],
          'sample_values': sample_values,
      }]
    except KeyError:
      logging.getLogger().exception(
          'Histogram missing test or revision field/s')
      failed_entity_transforms.inc()
      return []

  histogram_query_params = dict(project=project, kind='Histogram')
  histogram_entities = (
      p
      | 'ReadFromDatastore(Histogram)' >> ReadTimestampRangeFromDatastore(
          histogram_query_params,
          time_range_provider=bq_export_options.GetTimeRangeProvider(),
          step=datetime.timedelta(minutes=5)))

  histogram_dicts = (
      histogram_entities
      | 'ConvertEntityToDict(Histogram)' >> FlatMap(HistogramEntityToDict))

  def TestRevision(element):
    return (element['test'], element['revision'])

  rows_with_key = (
      row_dicts | 'WithKeys(Row)' >> beam.WithKeys(TestRevision))
  histograms_with_key = (
      histogram_dicts | 'WithKeys(Histogram)' >> beam.WithKeys(TestRevision))

  def MergeRowAndSampleValues(element):
    group_key, join_values = element
    rows, histograms = join_values
    if len(rows) == 0:
      orphaned_histogram.inc()
      logging.getLogger().error("No Row for Histogram(s) (%r)", group_key)
      return []
    elif len(rows) > 1:
      row_conflicts.inc()
      logging.getLogger().error("Multiple rows (%d) for %r", len(rows),
                                group_key)
      return rows
    row = rows[0]
    if len(histograms) > 1:
      # We'll merge these, so this isn't an error.
      multiple_histograms_for_row.inc()
    elif len(histograms) == 0:
      # No sample values to annotate the row with. This is common.
      return [row]
    # Merge multiple histograms' values into a single row.
    row['sample_values'] = list(
        itertools.chain.from_iterable(h['sample_values'] for h in histograms))
    return [row]

  joined_and_annotated = (
      (rows_with_key, histograms_with_key)
      | beam.CoGroupByKey()
      | beam.FlatMap(MergeRowAndSampleValues))

  def TableNameFn(unused_element):
    return '{project}:{dataset}.rows{suffix}'.format(
        project=project,
        dataset=bq_export_options.dataset.get(),
        suffix=bq_export_options.table_suffix)

  _ = (
      joined_and_annotated
      | 'WriteToBigQuery(rows)' >> WriteToPartitionedBigQuery(
          TableNameFn,
          bq_row_schema,
          additional_bq_parameters={
              'clustering': {'fields': ['master', 'bot', 'measurement']}
          }))

  result = p.run()
  result.wait_until_finish()
  PrintCounters(result)
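# --- Illustrative example (not part of main() above) ---
# A hedged sketch of the element shape MergeRowAndSampleValues receives from
# beam.CoGroupByKey() over (rows_with_key, histograms_with_key); every value
# below is made up.
element = (
    ('ChromiumPerf/linux-perf/speedometer2/Total', 12345),  # (test, revision)
    (
        [{'revision': 12345, 'value': 2.5,
          'test': 'ChromiumPerf/linux-perf/speedometer2/Total'}],   # rows
        [{'test': 'ChromiumPerf/linux-perf/speedometer2/Total',
          'revision': 12345, 'sample_values': [2.4, 2.6]}],         # histograms
    ),
)
# With exactly one row and one histogram, the merged output is the row dict
# annotated with 'sample_values': [2.4, 2.6].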