def __init__(self):
    """Create the metrics this DoFn reports, namespaced by its class.

    Registers counters named 'words', 'word_lengths' and 'empty_lines' and a
    distribution named 'word_len_dist'.
    """
    super(WordExtractingDoFn, self).__init__()
    namespace = self.__class__
    self.words_counter = Metrics.counter(namespace, 'words')
    self.word_lengths_counter = Metrics.counter(namespace, 'word_lengths')
    self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
    self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')
def __init__(self, pattern):
    """Remember the filter pattern and create the match counters.

    Args:
      pattern: value stored on ``self.pattern``.
    """
    super(FilterTextFn, self).__init__()
    self.pattern = pattern
    # Custom metrics are tracked while the pipeline runs and are exposed by
    # the monitoring system of the runner executing it. The two counters
    # below track the number of matched and unmatched words.
    metric_namespace = self.__class__
    self.matched_words = Metrics.counter(metric_namespace, 'matched_words')
    # NOTE(review): 'umatched' looks like a typo for 'unmatched'. Both the
    # attribute and the metric name are kept unchanged because code outside
    # this method may refer to them.
    self.umatched_words = Metrics.counter(metric_namespace, 'umatched_words')
def select_split(cumulative_splits, kv, unused_num_partitions):
    """Select split for an `(id, _)` tuple using a hash of `id`.

    Args:
      cumulative_splits: iterable of `(name, p)` pairs. The first pair whose
        `p` is greater than the scaled hash of `id` is selected.
      kv: `(id, _)` tuple; only `id` is used. A text `id` is UTF-8 encoded
        before hashing because `hashlib` only accepts bytes-like input.
      unused_num_partitions: ignored.

    Returns:
      The index within `cumulative_splits` of the selected split.

    Raises:
      AssertionError: if no entry of `cumulative_splits` has a `p` greater
        than the scaled hash.
    """
    key, _ = kv
    if not isinstance(key, bytes):
        key = key.encode('utf-8')
    m = hashlib.md5(key)
    # Scale the digest by the number of values a digest of this size can take
    # so that it can be compared against the cumulative thresholds.
    r = int(m.hexdigest(), 16) / (2 ** (8 * m.digest_size))
    for i, (name, p) in enumerate(cumulative_splits):
        if r < p:
            Metrics.counter('select_split', name).inc()
            return i
    # Raised explicitly rather than via `assert False`, which is stripped
    # under `python -O` and would make this function silently return None.
    raise AssertionError(
        'no split selected: hash value %r is not below any cumulative '
        'threshold' % (r,))
def filter_invalid_notes(min_pitch, max_pitch, kv):
    """Remove notes whose pitch is outside `[min_pitch, max_pitch]`.

    Args:
      min_pitch: lowest pitch to keep (inclusive).
      max_pitch: highest pitch to keep (inclusive).
      kv: `(key, ns_str)` pair where `ns_str` is a serialized NoteSequence.

    Returns:
      `(key, serialized NoteSequence)` with out-of-range notes removed.
    """
    key, serialized = kv
    ns = music_pb2.NoteSequence.FromString(serialized)

    kept_notes = []
    for note in ns.notes:
        if min_pitch <= note.pitch <= max_pitch:
            kept_notes.append(note)

    # Only rewrite the proto (and count the event) when something was dropped.
    if len(kept_notes) < len(ns.notes):
        del ns.notes[:]
        ns.notes.extend(kept_notes)
        Metrics.counter('filter_invalid_notes', 'out_of_range_pitch').inc()

    return key, ns.SerializeToString()
def __init__(self):
    """Create a counter, a distribution and a gauge namespaced by this class."""
    namespace = self.__class__
    self.total_metric = Metrics.counter(namespace, 'total_values')
    self.dist_metric = Metrics.distribution(namespace, 'distribution_values')
    # TODO(ajamato): Add a verifier for gauge once it is supported by the SDKs
    # and runners.
    self.latest_metric = Metrics.gauge(namespace, 'latest_value')
def __init__(self):
    """Create the transaction counter and the per-transaction distributions."""
    super(BitcoinTxnCountDoFn, self).__init__()
    namespace = self.__class__
    self.txn_counter = Metrics.counter(namespace, 'txns')
    self.inputs_dist = Metrics.distribution(namespace, 'inputs_per_txn')
    self.outputs_dist = Metrics.distribution(namespace, 'outputs_per_txn')
    self.output_amts_dist = Metrics.distribution(namespace, 'output_amts')
    self.txn_amts_dist = Metrics.distribution(namespace, 'txn_amts')
def repl(*args):
    """Increment a counter for the element's value, then delegate to `f`.

    `args[1]` is the `(key, value)` element and `args[2]` is the metric
    namespace. The counter is incremented by every index of `value`; the
    wrapped callable `f` is then invoked with the unchanged arguments.
    """
    namespace = args[2]
    counter = Metrics.counter(namespace, counter_name)
    _, value = args[1]
    for index in range(len(value)):
        counter.inc(index)
    return f(*args)
def __init__(self, pattern):
    """Store the pattern and create the word metrics.

    Args:
      pattern: value stored on ``self.pattern``.
    """
    self.pattern = pattern
    # Custom metrics track values in the pipeline as it runs: one counts
    # unmatched words, the other records the distribution of word lengths in
    # the input PCollection.
    namespace = self.__class__
    self.word_len_dist = Metrics.distribution(namespace, 'word_len_dist')
    self.unmatched_words = Metrics.counter(namespace, 'unmatched_words')
def prepare_image_transforms(element, image_columns):
    """Replace each image URI in a row with base64-encoded JPEG bytes.

    Args:
      element: one input row, as a dict. It is modified in place.
      image_columns: list of columns that are image paths.

    Returns:
      `element`, where each non-empty image path has been replaced by the
      URL-safe base64 encoding of the image converted to RGB and saved as
      JPEG. Columns whose path is empty are left untouched. Returns None as
      soon as one image cannot be read; columns processed before that point
      have already been replaced in `element`.
    """
    import base64
    import io
    from PIL import Image
    from tensorflow.python.lib.io import file_io as tf_file_io
    from apache_beam.metrics import Metrics

    img_error_count = Metrics.counter('main', 'ImgErrorCount')
    img_missing_count = Metrics.counter('main', 'ImgMissingCount')

    for name in image_columns:
        uri = element[name]
        if not uri:
            img_missing_count.inc()
            continue
        try:
            # Image data is binary, so read it in bytes mode.
            with tf_file_io.FileIO(uri, 'rb') as f:
                img = Image.open(f).convert('RGB')
        # A variety of different calling libraries throw different exceptions
        # here. They all correspond to an unreadable file so we treat them
        # equivalently.
        # pylint: disable=broad-except
        except Exception as e:
            logging.exception('Error processing image %s: %s', uri, str(e))
            img_error_count.inc()
            return

        # Convert to desired format and output. io.BytesIO is used instead of
        # cStringIO because it exists on both Python 2 and Python 3.
        output = io.BytesIO()
        img.save(output, 'jpeg')
        element[name] = base64.urlsafe_b64encode(output.getvalue())
    return element
def process(self, input_example):
    """Split one audio/NoteSequence example and yield the resulting pieces.

    For the 'test' split the record is processed with `min_length=0` and
    `max_length=-1`; otherwise the configured minimum and maximum lengths are
    used. A counter is incremented for every yielded example.

    Args:
      input_example: example whose 'id', 'audio' and 'sequence' features are
        read.

    Yields:
      Each example returned by `split_audio_and_label_data.process_record`.
    """
    features = input_example.features.feature
    tf.logging.info('Splitting %s', features['id'].bytes_list.value[0])

    wav_data = features['audio'].bytes_list.value[0]
    ns = music_pb2.NoteSequence.FromString(
        features['sequence'].bytes_list.value[0])

    Metrics.counter('split_wav', 'read_midi_wav_to_split').inc()

    if self._split == 'test':
        # For the 'test' split, use the full length audio and midi.
        counter_name = 'full_example'
        split_examples = split_audio_and_label_data.process_record(
            wav_data,
            ns,
            ns.id,
            min_length=0,
            max_length=-1,
            sample_rate=self._sample_rate)
    else:
        counter_name = 'split_example'
        split_examples = split_audio_and_label_data.process_record(
            wav_data, ns, ns.id, self._min_length, self._max_length,
            self._sample_rate)

    for example in split_examples:
        Metrics.counter('split_wav', counter_name).inc()
        yield example
def __init__(self, project_id, instance_id, table_id):
    """Constructor of the Write connector of Bigtable.

    Args:
      project_id(str): GCP Project to write the Rows to.
      instance_id(str): GCP Instance to write the Rows to.
      table_id(str): GCP Table to write the `DirectRows` to.
    """
    super(_BigTableWriteFn, self).__init__()
    self.beam_options = dict(
        project_id=project_id,
        instance_id=instance_id,
        table_id=table_id,
    )
    # Both start out unset; this constructor does not create them.
    self.table = None
    self.batcher = None
    self.written = Metrics.counter(self.__class__, 'Written Row')
def process(self, input_example):
    """Split one audio/NoteSequence example and yield the resulting pieces.

    When `self._chunk_files` is false the record is processed with
    `min_length=0` and `max_length=-1`; otherwise the configured minimum and
    maximum lengths are used. If an AssertionError is raised while chunking,
    the offending input example is written to `self._output_directory` and
    the error is re-raised.

    Args:
      input_example: example whose 'id', 'audio' and 'sequence' features are
        read.

    Yields:
      Each example returned by `split_audio_and_label_data.process_record`.

    Raises:
      AssertionError: re-raised after the failing example has been saved.
    """
    tf.logging.info('Splitting %s',
                    input_example.features.feature['id'].bytes_list.value[0])

    wav_data = input_example.features.feature['audio'].bytes_list.value[0]

    ns = music_pb2.NoteSequence.FromString(
        input_example.features.feature['sequence'].bytes_list.value[0])

    Metrics.counter('split_wav', 'read_midi_wav_to_split').inc()

    if not self._chunk_files:
        split_examples = split_audio_and_label_data.process_record(
            wav_data,
            ns,
            ns.id,
            min_length=0,
            max_length=-1,
            sample_rate=self._sample_rate)

        for example in split_examples:
            Metrics.counter('split_wav', 'full_example').inc()
            yield example
    else:
        try:
            split_examples = split_audio_and_label_data.process_record(
                wav_data, ns, ns.id, self._min_length, self._max_length,
                self._sample_rate)

            for example in split_examples:
                Metrics.counter('split_wav', 'split_example').inc()
                yield example
        except AssertionError:
            # hashlib only accepts bytes-like input; encode a text id so that
            # this handler cannot fail with a TypeError that would hide the
            # AssertionError being reported.
            ns_id = ns.id
            if not isinstance(ns_id, bytes):
                ns_id = ns_id.encode('utf-8')
            output_file = 'badexample-' + hashlib.md5(ns_id).hexdigest() + '.proto'
            output_path = os.path.join(self._output_directory, output_file)
            tf.logging.error('Exception processing %s. Writing file to %s',
                             ns.id, output_path)
            with tf.gfile.Open(output_path, 'w') as f:
                f.write(input_example.SerializeToString())
            raise
def __init__(self):
    """Create the counter named by `counter_name` and log its metric name."""
    self.counter = Metrics.counter(self.__class__, counter_name)
    # Pass the value as a logging argument instead of pre-formatting with
    # `%`: formatting is then skipped when INFO is disabled, and a tuple-like
    # value cannot be mistaken for multiple format arguments.
    _LOGGER.info('counter: %s', self.counter.metric_name)
def __init__(self, key_cols, val_col):
    """Store the column configuration and create the null-row counter.

    Args:
      key_cols: value stored on ``self.key_cols``.
      val_col: value stored on ``self.val_col``.
    """
    self.key_cols, self.val_col = key_cols, val_col
    # Counts the rows with missing values.
    self.null_row_count = Metrics.counter(self.__class__, 'null_row')
def __init__(self):
    """Create the word-count metrics in the 'main' namespace."""
    namespace = 'main'
    self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')
    self.word_length_counter = Metrics.counter(namespace, 'word_lengths')
    self.word_counter = Metrics.counter(namespace, 'total_words')
    self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
def __init__(self):
    """Create the metrics this object reports, namespaced by its class.

    Registers counters named 'words', 'word_lengths' and 'empty_lines' and a
    distribution named 'word_len_dist'.
    """
    namespace = self.__class__
    self.words_counter = Metrics.counter(namespace, 'words')
    self.word_lengths_counter = Metrics.counter(namespace, 'word_lengths')
    self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
    self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')
def _process_ns(self, ns):
    """Filter a NoteSequence, extract examples from it and re-check them.

    When `self._filters` is set, the sequence is dropped (and a
    'filtered-*' or 'quantize-failed' counter is incremented) if it fails any
    of the configured checks. Surviving sequences are converted to tensors
    with `self._config.data_converter`; each extracted example is converted
    back to a NoteSequence and re-encoded to check that it can be reproduced.

    Args:
      ns: the NoteSequence to process.

    Yields:
      `(example_ns, ns.id)` for each extracted example that was not skipped.
    """
    if self._filters:
        if ns.total_time > self._filters['max_total_time']:
            logging.info('Skipping %s: total_time=%f', ns.id, ns.total_time)
            beam_metrics.counter('ExtractExamplesDoFn', 'filtered-too-long').inc()
            return
        if len(ns.notes) > self._filters['max_num_notes']:
            logging.info('Skipping %s: num_notes=%d', ns.id, len(ns.notes))
            beam_metrics.counter(
                'ExtractExamplesDoFn', 'filtered-too-many-notes').inc()
            return

        # The quantized copy is only used for the filter statistics below; the
        # tensors are extracted from the unquantized `ns`.
        try:
            qns = note_seq.quantize_note_sequence(ns, steps_per_quarter=16)
        except (note_seq.BadTimeSignatureError,
                note_seq.NonIntegerStepsPerBarError,
                note_seq.NegativeTimeError):
            beam_metrics.counter('ExtractExamplesDoFn', 'quantize-failed').inc()
            return

        # Collect the distinct velocities and positions (start step modulo 16)
        # of the notes matching the 'is_drum' filter, and whether every note
        # in the sequence is a drum.
        vels = set()
        metric_positions = set()
        drums_only = True
        for note in qns.notes:
            drums_only &= note.is_drum
            if ((self._filters['is_drum'] is None or
                 note.is_drum == self._filters['is_drum'])
                    and note.velocity > 0):
                vels.add(note.velocity)
                metric_positions.add(note.quantized_start_step % 16)

        if len(vels) < self._filters['min_velocities']:
            beam_metrics.counter(
                'ExtractExamplesDoFn', 'filtered-min-velocities').inc()
            return
        if len(metric_positions) < self._filters['min_metric_positions']:
            beam_metrics.counter(
                'ExtractExamplesDoFn', 'filtered-min-metric-positions').inc()
            return
        if self._filters['drums_only'] and not drums_only:
            beam_metrics.counter(
                'ExtractExamplesDoFn', 'filtered-drums-only').inc()
            return

    beam_metrics.counter('ExtractExamplesDoFn', 'unfiltered-sequences').inc()
    logging.info('Converting %s to tensors', ns.id)
    extracted_examples = self._config.data_converter.to_tensors(ns)
    if not extracted_examples.outputs:
        beam_metrics.counter('ExtractExamplesDoFn', 'empty-extractions').inc()
        return
    beam_metrics.counter('ExtractExamplesDoFn', 'extracted-examples').inc(
        len(extracted_examples.outputs))
    for _, outputs, controls, _ in zip(*extracted_examples):
        if controls.size:
            example_ns = self._config.data_converter.from_tensors(
                [outputs], [controls])[0]
        else:
            example_ns = self._config.data_converter.from_tensors([outputs])[0]
        # Try to re-encode.
        # TODO(adarob): For now we filter and count examples that cannot be
        # re-extracted, but ultimately the converter should filter these or avoid
        # producing them all together.
        reextracted_examples = self._config.data_converter.to_tensors(
            example_ns).inputs
        assert len(reextracted_examples) <= 1
        if not reextracted_examples:
            logging.warning(
                'Extracted example NoteSequence does not reproduce example. '
                'Skipping: %s', example_ns)
            beam_metrics.counter('ExtractExamplesDoFn', 'empty-reextraction').inc()
            continue
        # Extra checks if the code returns multiple segments.
        # TODO(fjord): should probably make this recursive for cases with more
        # than 1 level of hierarchy.
        if isinstance(outputs, list):
            if len(outputs) != len(reextracted_examples[0]):
                logging.warning(
                    'Re-extracted example tensor has different number of segments. '
                    'ID: %s. original %d, reextracted %d. Skipping.', ns.id,
                    len(outputs), len(reextracted_examples[0]))
                beam_metrics.counter(
                    'ExtractExamplesDoFn', 'different-reextraction-count').inc()
                continue
            # NOTE(review): a per-segment mismatch is only logged and counted;
            # the example still appears to be yielded below -- confirm that
            # this is intended.
            for i in range(len(outputs)):
                if not np.array_equal(reextracted_examples[0][i], outputs[i]):
                    logging.warning(
                        'Re-extracted example tensor does not equal original example. '
                        'ID: %s. Index %d. NoteSequence: %s', ns.id, i, example_ns)
                    beam_metrics.counter(
                        'ExtractExamplesDoFn', 'different-reextraction').inc()
        yield example_ns, ns.id
def __init__(self, vals):
    """Store `vals` and create the 'outputs' counter in the 'main' namespace.

    Args:
      vals: value stored on ``self._vals``.
    """
    self._output_counter = Metrics.counter('main', 'outputs')
    self._vals = vals
def process(self, kv):
    """Turn one serialized NoteSequence into cropped training examples.

    Every augmentation function is applied `self._num_replications` times.
    Each augmented performance is encoded; encodings shorter than 2048 are
    discarded and longer ones are cropped to a random window of length 2048.
    When `self._melody` is set, a melody is inferred from the cropped
    performance and encoded alongside it, and when `self._noisy` is also set
    the 'performance' entry is a randomly transposed copy.

    Args:
      kv: `(key, ns_str)` pair. `key` seeds the random number generator and
        `ns_str` is a serialized NoteSequence proto.

    Yields:
      The result of `generator_utils.to_example` for every cropped example.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    if not isinstance(key, bytes):
        # hashlib only accepts bytes-like input.
        key = key.encode('utf-8')
    m = hashlib.md5(key)
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = music_pb2.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they
    # are no longer needed.
    del ns.control_changes[:]

    for _ in range(self._num_replications):
        for augment_fn in self._augment_fns:
            # Augment and encode the performance.
            try:
                augmented_performance_sequence = augment_fn(ns)
            except DataAugmentationError:
                Metrics.counter(
                    'extract_examples', 'augment_performance_failed').inc()
                continue
            seq = self._encode_performance_fn(augmented_performance_sequence)
            # feed in performance as both input/output to music transformer
            # chopping sequence into length 2048 (throw out shorter sequences)
            if len(seq) >= 2048:
                max_offset = len(seq) - 2048
                offset = random.randrange(max_offset + 1)
                cropped_seq = seq[offset:offset + 2048]

                example_dict = {
                    'inputs': cropped_seq,
                    'targets': cropped_seq
                }

                if self._melody:
                    # decode truncated performance sequence for melody inference
                    decoded_midi = self._decode_performance_fn(cropped_seq)
                    decoded_ns = mm.midi_io.midi_file_to_note_sequence(
                        decoded_midi)

                    # extract melody from cropped performance sequence
                    melody_instrument = melody_inference.infer_melody_for_sequence(
                        decoded_ns,
                        melody_interval_scale=2.0,
                        rest_prob=0.1,
                        instantaneous_non_max_pitch_prob=1e-15,
                        instantaneous_non_empty_rest_prob=0.0,
                        instantaneous_missing_pitch_prob=1e-15)

                    # remove non-melody notes from score
                    score_sequence = copy.deepcopy(decoded_ns)
                    score_notes = [
                        note for note in score_sequence.notes
                        if note.instrument == melody_instrument
                    ]
                    del score_sequence.notes[:]
                    score_sequence.notes.extend(score_notes)

                    # encode melody
                    encode_score_fn = self._encode_score_fns['melody']
                    example_dict['melody'] = encode_score_fn(score_sequence)
                    # make sure performance input also matches targets; needed
                    # for compatibility of both perf and (mel & perf)
                    # autoencoders

                    if self._noisy:
                        # randomly sample a pitch shift to construct noisy
                        # performance
                        all_pitches = [x.pitch for x in decoded_ns.notes]
                        min_val = min(all_pitches)
                        max_val = max(all_pitches)
                        # Build a real list that excludes 0 so a transposition
                        # always happens. `range` objects have no `remove()`
                        # on Python 3, so the original remove-from-range idiom
                        # raised AttributeError there.
                        transpose_range = [
                            shift
                            for shift in range(-(min_val - 21),
                                               108 - max_val + 1)
                            if shift != 0
                        ]
                        transpose_amount = random.choice(transpose_range)
                        augmented_ns, _ = sequences_lib.transpose_note_sequence(
                            decoded_ns,
                            transpose_amount,
                            min_allowed_pitch=21,
                            max_allowed_pitch=108,
                            in_place=False)
                        aug_seq = self._encode_performance_fn(augmented_ns)
                        example_dict['performance'] = aug_seq
                    else:
                        example_dict['performance'] = example_dict['targets']
                    del example_dict['inputs']

                Metrics.counter('extract_examples', 'encoded_example').inc()
                Metrics.distribution(
                    'extract_examples', 'performance_length_in_seconds').update(
                        int(augmented_performance_sequence.total_time))

                yield generator_utils.to_example(example_dict)
import json import logging import os import random import sys import apache_beam as beam from apache_beam.metrics import Metrics import six import textwrap from tensorflow.python.lib.io import file_io from tensorflow_transform import coders from tensorflow_transform.beam import impl as tft from tensorflow_transform.beam import tft_beam_io from tensorflow_transform.tf_metadata import metadata_io img_error_count = Metrics.counter('main', 'ImgErrorCount') # Files SCHEMA_FILE = 'schema.json' FEATURES_FILE = 'features.json' TRANSFORMED_METADATA_DIR = 'transformed_metadata' RAW_METADATA_DIR = 'raw_metadata' TRANSFORM_FN_DIR = 'transform_fn' # Individual transforms TARGET_TRANSFORM = 'target' IMAGE_TRANSFORM = 'image_to_vec' def parse_arguments(argv):
def __init__(self, image_uri_key: str):
    """Store the image URI key and create the good/bad image counters.

    Args:
      image_uri_key: value stored on ``self.image_uri_key``.
    """
    super().__init__()
    self.image_uri_key = image_uri_key
    namespace = self.__class__
    self.image_good_counter = Metrics.counter(namespace, 'image_good')
    self.image_bad_counter = Metrics.counter(namespace, 'image_bad')
def start_bundle(self):
    """Create the 'elementsplusone' counter, namespaced by this class."""
    namespace = self.__class__
    self.count = Metrics.counter(namespace, 'elementsplusone')
def __init__(self, pattern):
    """Store the pattern and create the matched/unmatched word counters.

    Args:
      pattern: value stored on ``self.pattern``.
    """
    super(FilterTextFn, self).__init__()
    self.pattern = pattern
    self.matched_words = Metrics.counter(self.__class__, 'matched_words')
    self.unmatched_words = Metrics.counter(self.__class__, 'unmatched_words')
def preprocess_data(input_example, hparams, process_for_training):
    """Preprocess example using data.preprocess_data.

    Args:
      input_example: example whose 'audio', 'sequence', 'id' and
        'velocity_range' features are read.
      hparams: forwarded to `data.preprocess_data`.
      process_for_training: forwarded to `data.preprocess_data` as
        `is_training`.

    Returns:
      A `tf.train.Example` built from the preprocessed tensors.
    """

    def _float_feature(array):
        # Flattened array stored as a float list.
        return tf.train.Feature(
            float_list=tf.train.FloatList(value=array.flatten()))

    def _int64_feature(scalar):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[scalar]))

    def _bytes_feature(blob):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[blob]))

    raw = input_example.features.feature
    # Run the preprocessing in a fresh graph.
    with tf.Graph().as_default():
        audio = tf.constant(raw['audio'].bytes_list.value[0])
        sequence = tf.constant(raw['sequence'].bytes_list.value[0])
        sequence_id = tf.constant(raw['id'].bytes_list.value[0])
        velocity_range = tf.constant(raw['velocity_range'].bytes_list.value[0])

        input_tensors = data.preprocess_data(
            sequence_id, sequence, audio, velocity_range, hparams,
            is_training=process_for_training)

        with tf.Session() as sess:
            preprocessed = sess.run(input_tensors)

    feature = {
        'spec': _float_feature(preprocessed.spec),
        'spectrogram_hash': _int64_feature(preprocessed.spectrogram_hash),
        'labels': _float_feature(preprocessed.labels),
        'label_weights': _float_feature(preprocessed.label_weights),
        'length': _int64_feature(preprocessed.length),
        'onsets': _float_feature(preprocessed.onsets),
        'offsets': _float_feature(preprocessed.offsets),
        'velocities': _float_feature(preprocessed.velocities),
        'sequence_id': _bytes_feature(preprocessed.sequence_id),
        'note_sequence': _bytes_feature(preprocessed.note_sequence),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    Metrics.counter('preprocess_data', 'preprocess_example').inc()
    return example
def __init__(self):
    """Create the counter named by `counter_name` and log its metric name."""
    self.counter = Metrics.counter(self.__class__, counter_name)
    # Pass the value as a logging argument instead of pre-formatting with
    # `%`: formatting is then skipped when INFO is disabled, and a tuple-like
    # value cannot be mistaken for multiple format arguments.
    logging.info('counter: %s', self.counter.metric_name)
try: from apache_beam.options.pipeline_options import PipelineOptions except ImportError: from apache_beam.utils.pipeline_options import PipelineOptions except ImportError: from apache_beam.utils.options import PipelineOptions from PIL import Image import tensorflow as tf from tensorflow.contrib.slim.python.slim.nets import inception_v3 as inception from tensorflow.python.framework import errors from tensorflow.python.lib.io import file_io slim = tf.contrib.slim error_count = Metrics.counter('main', 'errorCount') missing_label_count = Metrics.counter('main', 'missingLabelCount') csv_rows_count = Metrics.counter('main', 'csvRowsCount') labels_count = Metrics.counter('main', 'labelsCount') labels_without_ids = Metrics.counter('main', 'labelsWithoutIds') existing_file = Metrics.counter('main', 'existingFile') non_existing_file = Metrics.counter('main', 'nonExistingFile') skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine') embedding_good = Metrics.counter('main', 'embedding_good') embedding_bad = Metrics.counter('main', 'embedding_bad') incompatible_image = Metrics.counter('main', 'incompatible_image') invalid_uri = Metrics.counter('main', 'invalid_file_name') unlabeled_image = Metrics.counter('main', 'unlabeled_image') unknown_label = Metrics.counter('main', 'unknown_label')
def __init__(self, namespace):
    """Store the namespace and create the counter labelled `self.LABEL`.

    Args:
      namespace: metric namespace, stored on ``self.namespace``.
    """
    self.namespace = namespace
    label = self.LABEL
    self.counter = Metrics.counter(self.namespace, label)
def main():
    """Export Datastore Row entities, annotated with samples, to BigQuery.

    Builds a Beam pipeline that reads `Row` and `Histogram` entities from
    Datastore, converts both to dicts, joins them on `(test, revision)`, adds
    the histograms' sample values to the matching row and writes the result
    to a partitioned BigQuery table. The pipeline is run to completion and
    its result is passed to `PrintCounters`.
    """
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms')
    row_conflicts = Metrics.counter('main', 'row_conflicts')
    multiple_histograms_for_row = Metrics.counter(
        'main', 'multiple_histograms_for_row')
    orphaned_histogram = Metrics.counter('main', 'orphaned_histogram')

    """
    CREATE TABLE `chromeperf.chromeperf_dashboard_rows.<MASTER>`
    (revision INT64 NOT NULL,
     value FLOAT64 NOT NULL,
     std_error FLOAT64,
     `timestamp` TIMESTAMP NOT NULL,
     master STRING NOT NULL,
     bot STRING NOT NULL,
     measurement STRING,
     test STRING NOT NULL,
     properties STRING,
     sample_values ARRAY<FLOAT64>)
    PARTITION BY DATE(`timestamp`)
    CLUSTER BY master, bot, measurement;
    """  # pylint: disable=pointless-string-statement
    bq_row_schema = {
        'fields': [
            {
                'name': 'revision',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name': 'value',
                'type': 'FLOAT',
                'mode': 'REQUIRED'
            },
            {
                'name': 'std_error',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            },
            {
                'name': 'timestamp',
                'type': 'TIMESTAMP',
                'mode': 'REQUIRED'
            },
            {
                'name': 'master',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'bot',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'measurement',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'test',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'properties',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'sample_values',
                'type': 'FLOAT',
                'mode': 'REPEATED'
            },
        ]
    }

    def RowEntityToRowDict(entity):
        """Returns a one-element list with the row dict, or [] on KeyError."""
        entities_read.inc()
        try:
            d = {
                'revision': entity.key.id,
                'value': FloatHack(entity['value']),
                'std_error': FloatHack(entity.get('error')),
                'timestamp': entity['timestamp'].isoformat(),
                'test': entity.key.parent.name,
            }
            # Add the expando properties as a JSON-encoded dict.
            properties = {}
            for key, value in entity.items():
                if key in d or key in ['parent_test', 'error']:
                    # skip properties with dedicated columns.
                    continue
                if isinstance(value, float):
                    value = FloatHack(value)
                properties[key] = value
            d['properties'] = json.dumps(properties) if properties else None
            # Add columns derived from test: master, bot.
            test_path_parts = d['test'].split('/', 2)
            if len(test_path_parts) >= 3:
                d['master'] = test_path_parts[0]
                d['bot'] = test_path_parts[1]
                d['measurement'] = '/'.join(test_path_parts[2:])
            return [d]
        except KeyError:
            logging.getLogger().exception('Failed to convert Row')
            failed_entity_transforms.inc()
            return []

    row_query_params = dict(project=project, kind='Row')
    row_entities = (
        p
        | 'ReadFromDatastore(Row)' >> ReadTimestampRangeFromDatastore(
            row_query_params,
            time_range_provider=bq_export_options.GetTimeRangeProvider(),
            step=datetime.timedelta(minutes=5)))

    row_dicts = (row_entities
                 | 'ConvertEntityToDict(Row)' >> FlatMap(RowEntityToRowDict))

    # The sample_values are not found in the Row entity. So we have to fetch
    # all the corresponding Histogram entities and join them with our
    # collection of Rows (by using test + revision as the join key). We also
    # need to unpack the sample values arrays out of the zlib-compressed JSON
    # stored in the Histogram's "data" property.
    def HistogramEntityToDict(entity):
        """Returns dicts with keys: 'test', 'revision', 'sample_values'."""
        entities_read.inc()
        try:
            data = entity['data']
        except KeyError:
            logging.getLogger().exception('Histogram missing "data" field')
            failed_entity_transforms.inc()
            return []
        try:
            json_str = zlib.decompress(data)
        except zlib.error:
            logging.getLogger().exception('Histogram data not valid zlib: %r', data)
            failed_entity_transforms.inc()
            return []
        try:
            data_dict = json.loads(json_str)
        except json.JSONDecodeError:
            logging.getLogger().exception('Histogram data not valid json.')
            failed_entity_transforms.inc()
            return []
        sample_values = data_dict.get('sampleValues', [])
        if not isinstance(sample_values, list):
            # No exception is being handled here, so log with error() rather
            # than exception(), which would append an empty traceback.
            logging.getLogger().error(
                'Histogram data.sampleValues not valid list.')
            failed_entity_transforms.inc()
            return []
        count = len(sample_values)
        sample_values = [v for v in sample_values if v is not None]
        if len(sample_values) != count:
            # Logger.warn is a deprecated alias of Logger.warning.
            logging.getLogger().warning(
                'Histogram data.sampleValues contains null: %r', entity.key)
        for v in sample_values:
            if not isinstance(v, (int, float)):
                # Not inside an exception handler: use error(), see above.
                logging.getLogger().error(
                    'Histogram data.sampleValues contains non-numeric: %r', v)
                failed_entity_transforms.inc()
                return []
        try:
            return [{
                'test': entity['test'].name,
                'revision': entity['revision'],
                'sample_values': sample_values,
            }]
        except KeyError:
            logging.getLogger().exception(
                'Histogram missing test or revision field/s')
            failed_entity_transforms.inc()
            return []

    histogram_query_params = dict(project=project, kind='Histogram')
    histogram_entities = (
        p
        | 'ReadFromDatastore(Histogram)' >> ReadTimestampRangeFromDatastore(
            histogram_query_params,
            time_range_provider=bq_export_options.GetTimeRangeProvider(),
            step=datetime.timedelta(minutes=5)))

    histogram_dicts = (
        histogram_entities
        | 'ConvertEntityToDict(Histogram)' >> FlatMap(HistogramEntityToDict))

    def TestRevision(element):
        """Returns the (test, revision) join key of a row or histogram dict."""
        return (element['test'], element['revision'])

    rows_with_key = (row_dicts | 'WithKeys(Row)' >> beam.WithKeys(TestRevision))
    histograms_with_key = (
        histogram_dicts | 'WithKeys(Histogram)' >> beam.WithKeys(TestRevision))

    def MergeRowAndSampleValues(element):
        """Returns the row(s) of one join group, annotated with sample values."""
        group_key, join_values = element
        rows, histograms = join_values
        if len(rows) == 0:
            orphaned_histogram.inc()
            logging.getLogger().error("No Row for Histogram(s) (%r)", group_key)
            return []
        elif len(rows) > 1:
            # Conflicting rows are passed through without sample values.
            row_conflicts.inc()
            logging.getLogger().error("Multiple rows (%d) for %r", len(rows),
                                      group_key)
            return rows

        row = rows[0]
        if len(histograms) > 1:
            # We'll merge these, so this isn't an error.
            multiple_histograms_for_row.inc()
        elif len(histograms) == 0:
            # No sample values to annotate the row with. This is common.
            return [row]
        # Merge multiple histogram's values into a single row.
        row['sample_values'] = list(
            itertools.chain.from_iterable(h['sample_values'] for h in histograms))
        return [row]

    joined_and_annotated = ((rows_with_key, histograms_with_key)
                            | beam.CoGroupByKey()
                            | beam.FlatMap(MergeRowAndSampleValues))

    def TableNameFn(unused_element):
        """Returns the destination table name; the element is ignored."""
        return '{project}:{dataset}.rows{suffix}'.format(
            project=project,
            dataset=bq_export_options.dataset.get(),
            suffix=bq_export_options.table_suffix)

    _ = (joined_and_annotated
         | 'WriteToBigQuery(rows)' >> WriteToPartitionedBigQuery(
             TableNameFn,
             bq_row_schema,
             additional_bq_parameters={
                 'clustering': {
                     'fields': ['master', 'bot', 'measurement']
                 }
             }))

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
def __init__(self):
    """Create the 'word_lengths' counter in the 'main' namespace."""
    namespace, name = 'main', 'word_lengths'
    self.word_length_counter = Metrics.counter(namespace, name)
def combine_matching_seqs(ns_ids):
    """Collapse a `(ns, ids)` group into `ns` with the ids comma-joined.

    Args:
      ns_ids: `(ns, ids)` pair; `ids` is an iterable of strings.

    Returns:
      `ns`, with its `id` set to the ids joined by commas.
    """
    sequence, matching_ids = ns_ids
    beam_metrics.counter('ExtractExamplesDoFn', 'unique-examples').inc()
    sequence.id = ','.join(matching_ids)
    return sequence
def __init__(self):
    """Create the 'total_words' counter in the 'main' namespace."""
    namespace, name = 'main', 'total_words'
    self.word_counter = Metrics.counter(namespace, name)
def __init__(self, number_of_counters, number_of_operations):
    """Create `number_of_counters` counters named 'name-0', 'name-1', ...

    Args:
      number_of_counters: how many counters to create in the
        'do-not-publish' namespace.
      number_of_operations: value stored on ``self.number_of_operations``.
    """
    self.number_of_operations = number_of_operations
    self.counters = [
        Metrics.counter('do-not-publish', 'name-{}'.format(index))
        for index in range(number_of_counters)
    ]
def __init__(self):
    """Create the 'empty_lines' counter in the 'main' namespace."""
    namespace, name = 'main', 'empty_lines'
    self.empty_line_counter = Metrics.counter(namespace, name)
import apache_beam as beam from apache_beam.metrics import Metrics try: from apache_beam.utils.pipeline_options import PipelineOptions except ImportError: from apache_beam.utils.options import PipelineOptions from PIL import Image import tensorflow as tf from tensorflow.contrib.slim.python.slim.nets import inception_v3 as inception from tensorflow.python.framework import errors from tensorflow.python.lib.io import file_io slim = tf.contrib.slim error_count = Metrics.counter('main', 'errorCount') missing_label_count = Metrics.counter('main', 'missingLabelCount') csv_rows_count = Metrics.counter('main', 'csvRowsCount') labels_count = Metrics.counter('main', 'labelsCount') labels_without_ids = Metrics.counter('main', 'labelsWithoutIds') existing_file = Metrics.counter('main', 'existingFile') non_existing_file = Metrics.counter('main', 'nonExistingFile') skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine') embedding_good = Metrics.counter('main', 'embedding_good') embedding_bad = Metrics.counter('main', 'embedding_bad') incompatible_image = Metrics.counter('main', 'incompatible_image') invalid_uri = Metrics.counter('main', 'invalid_file_name') unlabeled_image = Metrics.counter('main', 'unlabeled_image') unknown_label = Metrics.counter('main', 'unknown_label')
def main():
    """Export Datastore Row entities to a partitioned BigQuery table.

    Builds a Beam pipeline that reads `Row` entities from Datastore, converts
    each one to a dict matching the BigQuery schema and writes the dicts to
    BigQuery. The pipeline is run to completion and its result is passed to
    `PrintCounters`.
    """
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms')

    """
    CREATE TABLE `chromeperf.chromeperf_dashboard_data.rows_test`
    (revision INT64 NOT NULL,
     value FLOAT64 NOT NULL,
     std_error FLOAT64,
     `timestamp` TIMESTAMP NOT NULL,
     test STRING NOT NULL,
     master STRING,
     bot STRING,
     properties STRING)
    PARTITION BY DATE(`timestamp`);
    """  # pylint: disable=pointless-string-statement
    # (name, type, mode) of every column, in schema order.
    schema_columns = [
        ('revision', 'INT64', 'REQUIRED'),
        ('value', 'FLOAT', 'REQUIRED'),
        ('std_error', 'FLOAT', 'NULLABLE'),
        ('timestamp', 'TIMESTAMP', 'REQUIRED'),
        ('test', 'STRING', 'REQUIRED'),
        ('master', 'STRING', 'NULLABLE'),
        ('bot', 'STRING', 'NULLABLE'),
        ('properties', 'STRING', 'NULLABLE'),
    ]
    bq_row_schema = {
        'fields': [
            {'name': column, 'type': column_type, 'mode': mode}
            for column, column_type, mode in schema_columns
        ]
    }

    def RowEntityToRowDict(entity):
        """Returns a one-element list with the row dict, or [] on KeyError."""
        entities_read.inc()
        try:
            row = {
                'revision': entity.key.id,
                'value': FloatHack(entity['value']),
                'std_error': FloatHack(entity.get('error')),
                'timestamp': entity['timestamp'].isoformat(),
                'test': entity.key.parent.name,
            }
            # Every property without a dedicated column goes into a
            # JSON-encoded dict of expando properties.
            extras = {}
            for name, raw_value in entity.items():
                if name in row or name in ['parent_test', 'error']:
                    continue
                if isinstance(raw_value, float):
                    raw_value = FloatHack(raw_value)
                extras[name] = raw_value
            if extras:
                row['properties'] = json.dumps(extras)
            else:
                row['properties'] = None
            # Columns derived from the test path: master, bot.
            path_parts = row['test'].split('/', 2)
            if len(path_parts) >= 3:
                row['master'] = path_parts[0]
                row['bot'] = path_parts[1]
            return [row]
        except KeyError:
            logging.getLogger().exception('Failed to convert Row')
            failed_entity_transforms.inc()
            return []

    row_query_params = dict(project=project, kind='Row')
    read_rows = ReadTimestampRangeFromDatastore(
        row_query_params,
        time_range_provider=bq_export_options.GetTimeRangeProvider(),
        step=datetime.timedelta(minutes=5))
    row_entities = p | 'ReadFromDatastore(Row)' >> read_rows

    row_dicts = (
        row_entities
        | 'ConvertEntityToRow(Row)' >> FlatMap(RowEntityToRowDict))

    table_name = '{}:chromeperf_dashboard_data.rows{}'.format(
        project, bq_export_options.table_suffix)

    write_rows = WriteToPartitionedBigQuery(table_name, bq_row_schema)
    _ = row_dicts | 'WriteToBigQuery(rows)' >> write_rows

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
def __init__(self, count):
    """Store `count` and create the 'recordsRead' counter.

    Args:
      count: value stored on ``self._count``.
    """
    namespace = self.__class__
    self.records_read = Metrics.counter(namespace, 'recordsRead')
    self._count = count
def __setstate__(self, options):
    """Restore this object from `options`.

    The options become ``self.beam_options``; the table and batcher are
    reset to None and the 'Written Row' counter is created again.

    Args:
      options: state to restore, stored on ``self.beam_options``.
    """
    self.beam_options = options
    self.table = self.batcher = None
    self.written = Metrics.counter(self.__class__, 'Written Row')
import uuid from google.cloud.proto.datastore.v1 import entity_pb2 from google.cloud.proto.datastore.v1 import query_pb2 from googledatastore import helper as datastore_helper, PropertyFilter import apache_beam as beam from apache_beam.io import ReadFromText from apache_beam.io.google_cloud_platform.datastore.v1.datastoreio import ReadFromDatastore from apache_beam.io.google_cloud_platform.datastore.v1.datastoreio import WriteToDatastore from apache_beam.metrics import Metrics from apache_beam.utils.pipeline_options import GoogleCloudOptions from apache_beam.utils.pipeline_options import PipelineOptions from apache_beam.utils.pipeline_options import SetupOptions empty_line_counter = Metrics.counter('main', 'empty_lines') word_length_counter = Metrics.counter('main', 'word_lengths') word_counter = Metrics.counter('main', 'total_words') class WordExtractingDoFn(beam.DoFn): """Parse each line of input text into words.""" def process(self, element): """Returns an iterator over words in contents of Cloud Datastore entity. The element is a line of text. If the line is blank, note that, too. Args: element: the input element to be processed Returns: The processed element. """ content_value = element.properties.get('content', None)
def __init__(self):
    """Initialise the base class and register the parse-error counter."""
    super(ParseEventFn, self).__init__()
    # Counter is namespaced by the concrete class of this instance.
    metric_namespace = self.__class__
    self.num_parse_errors = Metrics.counter(
        metric_namespace, 'num_parse_errors')
""" import apache_beam as beam from apache_beam.io import tfrecordio from apache_beam.metrics import Metrics import cStringIO import logging import os from PIL import Image from . import _inceptionlib from . import _util error_count = Metrics.counter('main', 'errorCount') rows_count = Metrics.counter('main', 'rowsCount') skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine') embedding_good = Metrics.counter('main', 'embedding_good') embedding_bad = Metrics.counter('main', 'embedding_bad') incompatible_image = Metrics.counter('main', 'incompatible_image') invalid_uri = Metrics.counter('main', 'invalid_file_name') unlabeled_image = Metrics.counter('main', 'unlabeled_image') class ExtractLabelIdsDoFn(beam.DoFn): """Extracts (uri, label_ids) tuples from CSV rows. """ def start_bundle(self, context=None): self.label_to_id_map = {}
def __init__(self, pattern):
    """Store the filter pattern and register the match counters.

    Args:
        pattern: Kept on the instance as ``pattern`` for later use.
    """
    self.pattern = pattern
    # Custom metrics track values in the pipeline while it runs. Two
    # counters are created here: one for matched and one for unmatched
    # words.
    metric_namespace = self.__class__
    self.matched_words = Metrics.counter(metric_namespace, 'matched_words')
    # The spelling "umatched" is part of the attribute and metric name and
    # is kept exactly as-is so existing readers of either keep working.
    self.umatched_words = Metrics.counter(
        metric_namespace, 'umatched_words')
def __init__(self, allow_errors):
    """Record the error policy and register the feature-extraction counters.

    Args:
        allow_errors: Stored as ``_allow_errors``; this constructor does
            not interpret it.
    """
    metric_namespace = self.__class__
    self._allow_errors = allow_errors
    # One counter for extractions and a separate one for their errors.
    self._counter = Metrics.counter(
        metric_namespace, 'ml-extract-features')
    self._error_counter = Metrics.counter(
        metric_namespace, 'ml-extract-features-errors')
from google.cloud.proto.datastore.v1 import entity_pb2 from google.cloud.proto.datastore.v1 import query_pb2 from googledatastore import helper as datastore_helper, PropertyFilter import apache_beam as beam from apache_beam.io import ReadFromText from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore from apache_beam.metrics import Metrics from apache_beam.metrics.metric import MetricsFilter from apache_beam.utils.pipeline_options import GoogleCloudOptions from apache_beam.utils.pipeline_options import PipelineOptions from apache_beam.utils.pipeline_options import SetupOptions empty_line_counter = Metrics.counter('main', 'empty_lines') word_length_counter = Metrics.counter('main', 'word_lengths') word_counter = Metrics.counter('main', 'total_words') class WordExtractingDoFn(beam.DoFn): """Parse each line of input text into words.""" def process(self, element): """Returns an iterator over words in contents of Cloud Datastore entity. The element is a line of text. If the line is blank, note that, too. Args: element: the input element to be processed Returns: The processed element. """
def __init__(self):
    """Register the ``total_bytes.count`` counter in the ``pardo`` namespace."""
    metric_namespace = 'pardo'
    metric_name = 'total_bytes.count'
    self.counter = Metrics.counter(metric_namespace, metric_name)
def __init__(self):
    """Initialise the base class and register the word-related counters."""
    super(WordExtractingDoFn, self).__init__()
    # All three counters share the concrete class as their namespace.
    metric_namespace = self.__class__
    self.words_counter = Metrics.counter(metric_namespace, 'words')
    self.word_lengths_counter = Metrics.counter(
        metric_namespace, 'word_lengths')
    self.empty_line_counter = Metrics.counter(
        metric_namespace, 'empty_lines')
import logging import os import sys from datetime import datetime import numpy as np import apache_beam as beam from apache_beam.metrics import Metrics from tensorflow_transform import coders from trainer.config import PROJECT_ID, DATA_DIR, TFRECORD_DIR, NUM_LABELS from trainer.util import schema, read_image logging.warning('running preprocess') partition_train = Metrics.counter('partition', 'train') partition_validation = Metrics.counter('partition', 'validation') partition_test = Metrics.counter('partition', 'test') examples_failed = Metrics.counter('build', 'failed') def build_example((key, label, img_bytes)): """Build a dictionary that contains all the features and label to store as TFRecord Args: raw_in: raw data to build the example from Returns: dict: A dictionary of features
def inc_counter(self, name):
    """Increment the counter called ``name`` under this class's name.

    Args:
        name: Metric name of the counter to increment.
    """
    # The namespace is the class *name* (a string), not the class object.
    metric_namespace = self.__class__.__name__
    counter = Metrics.counter(metric_namespace, name)
    counter.inc()
def process(self, kv):
    """Turn one keyed, serialized NoteSequence into encoded examples.

    The module-level random number generator is seeded from an MD5 hash of
    the key, so hop times and random crops are reproducible for a given key.

    Args:
        kv: A `(key, ns_str)` pair. `key` may be text or bytes; text is
            UTF-8 encoded before hashing. `ns_str` is parsed with
            `music_pb2.NoteSequence.FromString`.

    Yields:
        The result of `generator_utils.to_example` for each augmented
        sequence whose performance (and, when score encoders are
        configured, every score encoding) is non-empty.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    # hashlib needs bytes: encode text keys, pass bytes keys through as-is.
    key_bytes = key if isinstance(key, bytes) else key.encode('utf-8')
    m = hashlib.md5(key_bytes)
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = music_pb2.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they
    # are no longer needed.
    del ns.control_changes[:]

    if (self._min_hop_size_seconds and
            ns.total_time < self._min_hop_size_seconds):
        Metrics.counter('extract_examples', 'sequence_too_short').inc()
        return

    sequences = []
    for _ in range(self._num_replications):
        if self._max_hop_size_seconds:
            if self._max_hop_size_seconds == self._min_hop_size_seconds:
                # Split using fixed hop size.
                sequences += sequences_lib.split_note_sequence(
                    ns, self._max_hop_size_seconds)
            else:
                # Sample random hop positions such that each segment size is
                # within the specified range.
                hop_times = [0.0]
                while (hop_times[-1] <=
                       ns.total_time - self._min_hop_size_seconds):
                    if (hop_times[-1] + self._max_hop_size_seconds <
                            ns.total_time):
                        # It's important that we get a valid hop size here,
                        # since the remainder of the sequence is too long.
                        max_offset = min(
                            self._max_hop_size_seconds,
                            ns.total_time - self._min_hop_size_seconds -
                            hop_times[-1])
                    else:
                        # It's okay if the next hop time is invalid (in which
                        # case we'll just stop).
                        max_offset = self._max_hop_size_seconds
                    offset = random.uniform(
                        self._min_hop_size_seconds, max_offset)
                    hop_times.append(hop_times[-1] + offset)
                # Split at the chosen hop times (ignoring zero and the final
                # invalid time).
                sequences += sequences_lib.split_note_sequence(
                    ns, hop_times[1:-1])
        else:
            sequences += [ns]

    for performance_sequence in sequences:
        if self._encode_score_fns:
            # We need to extract a score.
            if not self._absolute_timing:
                # Beats are required to extract a score with metric timing.
                beats = [
                    ta for ta in performance_sequence.text_annotations
                    if (ta.annotation_type ==
                        music_pb2.NoteSequence.TextAnnotation.BEAT)
                    and ta.time <= performance_sequence.total_time
                ]
                if len(beats) < 2:
                    Metrics.counter(
                        'extract_examples', 'not_enough_beats').inc()
                    continue

                # Ensure the sequence starts and ends on a beat.
                performance_sequence = sequences_lib.extract_subsequence(
                    performance_sequence,
                    start_time=min(beat.time for beat in beats),
                    end_time=max(beat.time for beat in beats))

                # Infer beat-aligned chords (only for relative timing).
                try:
                    chord_inference.infer_chords_for_sequence(
                        performance_sequence,
                        chord_change_prob=0.25,
                        chord_note_concentration=50.0,
                        add_key_signatures=True)
                except chord_inference.ChordInferenceError:
                    Metrics.counter(
                        'extract_examples', 'chord_inference_failed').inc()
                    continue

            # Infer melody regardless of relative/absolute timing.
            try:
                melody_instrument = (
                    melody_inference.infer_melody_for_sequence(
                        performance_sequence,
                        melody_interval_scale=2.0,
                        rest_prob=0.1,
                        instantaneous_non_max_pitch_prob=1e-15,
                        instantaneous_non_empty_rest_prob=0.0,
                        instantaneous_missing_pitch_prob=1e-15))
            except melody_inference.MelodyInferenceError:
                Metrics.counter(
                    'extract_examples', 'melody_inference_failed').inc()
                continue

            if not self._absolute_timing:
                # Now rectify detected beats to occur at fixed tempo.
                # TODO(iansimon): also include the alignment
                score_sequence, unused_alignment = (
                    sequences_lib.rectify_beats(
                        performance_sequence, beats_per_minute=SCORE_BPM))
            else:
                # Score uses same timing as performance.
                score_sequence = copy.deepcopy(performance_sequence)

            # Remove melody notes from performance.
            performance_notes = [
                note for note in performance_sequence.notes
                if note.instrument != melody_instrument
            ]
            del performance_sequence.notes[:]
            performance_sequence.notes.extend(performance_notes)

            # Remove non-melody notes from score.
            score_notes = [
                note for note in score_sequence.notes
                if note.instrument == melody_instrument
            ]
            del score_sequence.notes[:]
            score_sequence.notes.extend(score_notes)

            # Remove key signatures and beat/chord annotations from
            # performance.
            del performance_sequence.key_signatures[:]
            del performance_sequence.text_annotations[:]

            Metrics.counter('extract_examples', 'extracted_score').inc()

        for augment_fn in self._augment_fns:
            # Augment and encode the performance.
            try:
                augmented_performance_sequence = augment_fn(
                    performance_sequence)
            except DataAugmentationError:
                Metrics.counter(
                    'extract_examples', 'augment_performance_failed').inc()
                continue
            example_dict = {
                'targets': self._encode_performance_fn(
                    augmented_performance_sequence)
            }
            if not example_dict['targets']:
                Metrics.counter(
                    'extract_examples', 'skipped_empty_targets').inc()
                continue

            if (self._random_crop_length and
                    len(example_dict['targets']) > self._random_crop_length):
                # Take a random crop of the encoded performance.
                max_offset = (
                    len(example_dict['targets']) - self._random_crop_length)
                offset = random.randrange(max_offset + 1)
                example_dict['targets'] = example_dict['targets'][
                    offset:offset + self._random_crop_length]

            if self._encode_score_fns:
                # Augment the extracted score.
                try:
                    augmented_score_sequence = augment_fn(score_sequence)
                except DataAugmentationError:
                    Metrics.counter(
                        'extract_examples', 'augment_score_failed').inc()
                    continue

                # Apply all score encoding functions.
                skip = False
                for name, encode_score_fn in self._encode_score_fns.items():
                    example_dict[name] = encode_score_fn(
                        augmented_score_sequence)
                    if not example_dict[name]:
                        Metrics.counter(
                            'extract_examples',
                            'skipped_empty_%s' % name).inc()
                        skip = True
                        break
                if skip:
                    continue

            Metrics.counter('extract_examples', 'encoded_example').inc()
            Metrics.distribution(
                'extract_examples', 'performance_length_in_seconds').update(
                    int(augmented_performance_sequence.total_time))

            yield generator_utils.to_example(example_dict)
def process(self, kv):
    """Turn one keyed, serialized NoteSequence into encoded examples.

    The module-level random number generator is seeded from an MD5 hash of
    the key, so the sampled hop times are reproducible for a given key.

    Args:
        kv: A `(key, ns_str)` pair. `key` may be text or bytes; text is
            UTF-8 encoded before hashing. `ns_str` is parsed with
            `music_pb2.NoteSequence.FromString`.

    Yields:
        The result of `generator_utils.to_example` for each augmented
        sequence whose performance (and, when score encoders are
        configured, every score encoding) is non-empty.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    # hashlib rejects text on Python 3, so encode text keys; bytes keys are
    # hashed unchanged and therefore keep their previous seed.
    key_bytes = key if isinstance(key, bytes) else key.encode('utf-8')
    m = hashlib.md5(key_bytes)
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = music_pb2.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they
    # are no longer needed.
    del ns.control_changes[:]

    if (self._min_hop_size_seconds and
            ns.total_time < self._min_hop_size_seconds):
        Metrics.counter('extract_examples', 'sequence_too_short').inc()
        return

    sequences = []
    for _ in range(self._num_replications):
        if self._max_hop_size_seconds:
            if self._max_hop_size_seconds == self._min_hop_size_seconds:
                # Split using fixed hop size.
                sequences += sequences_lib.split_note_sequence(
                    ns, self._max_hop_size_seconds)
            else:
                # Sample random hop positions such that each segment size is
                # within the specified range.
                hop_times = [0.0]
                while (hop_times[-1] <=
                       ns.total_time - self._min_hop_size_seconds):
                    if (hop_times[-1] + self._max_hop_size_seconds <
                            ns.total_time):
                        # It's important that we get a valid hop size here,
                        # since the remainder of the sequence is too long.
                        max_offset = min(
                            self._max_hop_size_seconds,
                            ns.total_time - self._min_hop_size_seconds -
                            hop_times[-1])
                    else:
                        # It's okay if the next hop time is invalid (in which
                        # case we'll just stop).
                        max_offset = self._max_hop_size_seconds
                    offset = random.uniform(
                        self._min_hop_size_seconds, max_offset)
                    hop_times.append(hop_times[-1] + offset)
                # Split at the chosen hop times (ignoring zero and the final
                # invalid time).
                sequences += sequences_lib.split_note_sequence(
                    ns, hop_times[1:-1])
        else:
            sequences += [ns]

    for performance_sequence in sequences:
        if self._encode_score_fns:
            # We need to extract a score.
            if not self._absolute_timing:
                # Beats are required to extract a score with metric timing.
                beats = [
                    ta for ta in performance_sequence.text_annotations
                    if (ta.annotation_type ==
                        music_pb2.NoteSequence.TextAnnotation.BEAT)
                    and ta.time <= performance_sequence.total_time
                ]
                if len(beats) < 2:
                    Metrics.counter(
                        'extract_examples', 'not_enough_beats').inc()
                    continue

                # Ensure the sequence starts and ends on a beat.
                performance_sequence = sequences_lib.extract_subsequence(
                    performance_sequence,
                    start_time=min(beat.time for beat in beats),
                    end_time=max(beat.time for beat in beats))

                # Infer beat-aligned chords (only for relative timing).
                try:
                    chord_inference.infer_chords_for_sequence(
                        performance_sequence,
                        chord_change_prob=0.25,
                        chord_note_concentration=50.0,
                        add_key_signatures=True)
                except chord_inference.ChordInferenceError:
                    Metrics.counter(
                        'extract_examples', 'chord_inference_failed').inc()
                    continue

            # Infer melody regardless of relative/absolute timing.
            try:
                melody_instrument = (
                    melody_inference.infer_melody_for_sequence(
                        performance_sequence,
                        melody_interval_scale=2.0,
                        rest_prob=0.1,
                        instantaneous_non_max_pitch_prob=1e-15,
                        instantaneous_non_empty_rest_prob=0.0,
                        instantaneous_missing_pitch_prob=1e-15))
            except melody_inference.MelodyInferenceError:
                Metrics.counter(
                    'extract_examples', 'melody_inference_failed').inc()
                continue

            if not self._absolute_timing:
                # Now rectify detected beats to occur at fixed tempo.
                # TODO(iansimon): also include the alignment
                score_sequence, unused_alignment = (
                    sequences_lib.rectify_beats(
                        performance_sequence, beats_per_minute=SCORE_BPM))
            else:
                # Score uses same timing as performance.
                score_sequence = copy.deepcopy(performance_sequence)

            # Remove melody notes from performance.
            performance_notes = [
                note for note in performance_sequence.notes
                if note.instrument != melody_instrument
            ]
            del performance_sequence.notes[:]
            performance_sequence.notes.extend(performance_notes)

            # Remove non-melody notes from score.
            score_notes = [
                note for note in score_sequence.notes
                if note.instrument == melody_instrument
            ]
            del score_sequence.notes[:]
            score_sequence.notes.extend(score_notes)

            # Remove key signatures and beat/chord annotations from
            # performance.
            del performance_sequence.key_signatures[:]
            del performance_sequence.text_annotations[:]

            Metrics.counter('extract_examples', 'extracted_score').inc()

        for augment_fn in self._augment_fns:
            # Augment and encode the performance.
            try:
                augmented_performance_sequence = augment_fn(
                    performance_sequence)
            except DataAugmentationError:
                Metrics.counter(
                    'extract_examples', 'augment_performance_failed').inc()
                continue
            example_dict = {
                'targets': self._encode_performance_fn(
                    augmented_performance_sequence)
            }
            if not example_dict['targets']:
                Metrics.counter(
                    'extract_examples', 'skipped_empty_targets').inc()
                continue

            if self._encode_score_fns:
                # Augment the extracted score.
                try:
                    augmented_score_sequence = augment_fn(score_sequence)
                except DataAugmentationError:
                    Metrics.counter(
                        'extract_examples', 'augment_score_failed').inc()
                    continue

                # Apply all score encoding functions.
                skip = False
                for name, encode_score_fn in self._encode_score_fns.items():
                    example_dict[name] = encode_score_fn(
                        augmented_score_sequence)
                    if not example_dict[name]:
                        Metrics.counter(
                            'extract_examples',
                            'skipped_empty_%s' % name).inc()
                        skip = True
                        break
                if skip:
                    continue

            Metrics.counter('extract_examples', 'encoded_example').inc()
            Metrics.distribution(
                'extract_examples', 'performance_length_in_seconds').update(
                    int(augmented_performance_sequence.total_time))

            yield generator_utils.to_example(example_dict)