def __init__(self):
    """Create the counter, distribution and gauge metrics for this class."""
    owner = self.__class__
    self.total_metric = Metrics.counter(owner, 'total_values')
    self.dist_metric = Metrics.distribution(owner, 'distribution_values')
    # TODO(ajamato): Add a verifier for gauge once it is supported by the SDKs
    # and runners.
    self.latest_metric = Metrics.gauge(owner, 'latest_value')
def __init__(self):
    """Initialise the base class and register the transaction metrics.

    One counter ('txns') and four distributions are created, all namespaced
    by this instance's class.
    """
    super(BitcoinTxnCountDoFn, self).__init__()
    namespace = self.__class__
    new_distribution = Metrics.distribution
    self.txn_counter = Metrics.counter(namespace, 'txns')
    self.inputs_dist = new_distribution(namespace, 'inputs_per_txn')
    self.outputs_dist = new_distribution(namespace, 'outputs_per_txn')
    self.output_amts_dist = new_distribution(namespace, 'output_amts')
    self.txn_amts_dist = new_distribution(namespace, 'txn_amts')
def __init__(self,
             min_batch_size=1,
             max_batch_size=1000,
             target_batch_overhead=.1,
             target_batch_duration_secs=1,
             clock=time.time):
    """Validate the batching targets and initialise the estimator state.

    Args:
      min_batch_size: lower bound on the batch size; must not exceed
        max_batch_size.
      max_batch_size: upper bound on the batch size.
      target_batch_overhead: a falsy value (0 or None) skips range
        validation; otherwise it must satisfy 0 < value <= 1.
      target_batch_duration_secs: a falsy value (0 or None) skips range
        validation; otherwise it must be positive.
      clock: zero-argument callable stored on the instance; defaults to
        time.time.

    Raises:
      ValueError: if the size bounds are inverted, a target is out of range,
        or neither target is positive.
    """
    if min_batch_size > max_batch_size:
        raise ValueError(
            "Minimum (%s) must not be greater than maximum (%s)" % (
                min_batch_size, max_batch_size))
    if target_batch_overhead and not 0 < target_batch_overhead <= 1:
        raise ValueError(
            "target_batch_overhead (%s) must be between 0 and 1" % (
                target_batch_overhead))
    if target_batch_duration_secs and target_batch_duration_secs <= 0:
        raise ValueError(
            "target_batch_duration_secs (%s) must be positive" % (
                target_batch_duration_secs))
    # After the checks above each target is either falsy or strictly
    # positive, so a truthiness test is equivalent to comparing their maximum
    # with zero -- but it does not raise TypeError on Python 3 when a target
    # is None, which the checks above deliberately tolerate.
    if not (target_batch_overhead or target_batch_duration_secs):
        raise ValueError("At least one of target_batch_overhead or "
                         "target_batch_duration_secs must be positive.")
    self._min_batch_size = min_batch_size
    self._max_batch_size = max_batch_size
    self._target_batch_overhead = target_batch_overhead
    self._target_batch_duration_secs = target_batch_duration_secs
    self._clock = clock
    self._data = []
    self._ignore_next_timing = False
    self._size_distribution = Metrics.distribution(
        'BatchElements', 'batch_size')
    self._time_distribution = Metrics.distribution(
        'BatchElements', 'msec_per_batch')
    # Beam distributions only accept integer values, so we use this to
    # accumulate under-reported values until they add up to whole milliseconds.
    # (Milliseconds are chosen because that's conventionally used elsewhere in
    # profiling-style counters.)
    self._remainder_msecs = 0
def __init__(self):
    """Initialise the base class and create the word metrics.

    Three counters and one distribution are registered under this
    instance's class.
    """
    super(WordExtractingDoFn, self).__init__()
    namespace = self.__class__
    new_counter = Metrics.counter
    self.words_counter = new_counter(namespace, 'words')
    self.word_lengths_counter = new_counter(namespace, 'word_lengths')
    self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
    self.empty_line_counter = new_counter(namespace, 'empty_lines')
def __init__(self, pattern):
    """Store the pattern and create the custom metrics.

    Args:
      pattern: value kept on the instance as `self.pattern`.
    """
    self.pattern = pattern
    # Custom metrics are updated while the pipeline runs: a distribution of
    # word lengths and a counter of unmatched words.
    namespace = self.__class__
    self.word_len_dist = Metrics.distribution(namespace, 'word_len_dist')
    self.unmatched_words = Metrics.counter(namespace, 'unmatched_words')
def __init__(self, pattern):
    """Initialise the base class, keep the pattern and create two counters.

    Args:
      pattern: value kept on the instance as `self.pattern`.
    """
    super(FilterTextFn, self).__init__()
    self.pattern = pattern
    # Custom metrics track values while the pipeline runs and are surfaced by
    # the runner's monitoring system. These two count matched and unmatched
    # words.
    # NOTE(review): 'umatched' (sic) is kept as-is; the attribute name and the
    # metric name are both visible outside this method.
    namespace = self.__class__
    self.matched_words = Metrics.counter(namespace, 'matched_words')
    self.umatched_words = Metrics.counter(namespace, 'umatched_words')
def select_split(cumulative_splits, kv, unused_num_partitions):
    """Select split for an `(id, _)` tuple using a hash of `id`.

    Args:
      cumulative_splits: iterable of `(name, p)` pairs; the first pair whose
        `p` is greater than the hashed value wins.
      kv: `(key, value)` pair; only `key` is used. Text keys are UTF-8
        encoded before hashing, bytes keys are hashed as given.
      unused_num_partitions: ignored.

    Returns:
      The index into `cumulative_splits` of the selected split.

    Raises:
      AssertionError: if no split threshold exceeds the hashed value.
    """
    key, _ = kv
    # hashlib only accepts bytes-like input on Python 3.
    if not isinstance(key, bytes):
        key = key.encode('utf-8')
    m = hashlib.md5(key)
    # Scale the digest by 2**(digest bits); assumes true division (Python 3 or
    # `from __future__ import division`).
    r = int(m.hexdigest(), 16) / (2 ** (8 * m.digest_size))
    for i, (name, p) in enumerate(cumulative_splits):
        if r < p:
            Metrics.counter('select_split', name).inc()
            return i
    # Raised explicitly (rather than `assert False`) so the failure is not
    # silently dropped when Python runs with -O.
    raise AssertionError('no split selected for hashed value %r' % r)
def filter_invalid_notes(min_pitch, max_pitch, kv):
    """Drop notes whose pitch is outside [min_pitch, max_pitch].

    Args:
      min_pitch: lowest pitch kept (inclusive).
      max_pitch: highest pitch kept (inclusive).
      kv: `(key, serialized NoteSequence)` pair.

    Returns:
      `(key, serialized NoteSequence)` with out-of-range notes removed. The
      'out_of_range_pitch' counter is incremented once when any note was
      dropped.
    """
    key, ns_str = kv
    sequence = music_pb2.NoteSequence.FromString(ns_str)
    kept = []
    for note in sequence.notes:
        if min_pitch <= note.pitch <= max_pitch:
            kept.append(note)
    if len(sequence.notes) > len(kept):
        del sequence.notes[:]
        sequence.notes.extend(kept)
        Metrics.counter('filter_invalid_notes', 'out_of_range_pitch').inc()
    return key, sequence.SerializeToString()
def repl(*args):
    """Update the counter from the element's value, then delegate to `f`.

    `args[2]` is used as the counter namespace and `args[1]` is unpacked as a
    two-item element whose second item drives the increments: the counter is
    incremented by every index in `range(len(value))`.
    """
    namespace = args[2]
    index_counter = Metrics.counter(namespace, counter_name)
    _, value = args[1]
    for step in range(len(value)):
        index_counter.inc(step)
    return f(*args)
def prepare_image_transforms(element, image_columns):
    """Replace each image URI in a row with its base64-encoded JPEG bytes.

    Args:
      element: one input row, as a dict
      image_columns: list of columns that are image paths

    Returns:
      element, where each non-empty image path has been replaced by the
      url-safe base64 encoding of the image re-saved as JPEG. Columns with an
      empty path are left unchanged and counted as missing. If any image
      cannot be read, the error is logged and counted and None is returned.
    """
    import base64
    import io
    from PIL import Image
    from tensorflow.python.lib.io import file_io as tf_file_io
    from apache_beam.metrics import Metrics

    img_error_count = Metrics.counter('main', 'ImgErrorCount')
    img_missing_count = Metrics.counter('main', 'ImgMissingCount')

    for name in image_columns:
        uri = element[name]
        if not uri:
            img_missing_count.inc()
            continue
        try:
            # Image data is binary: read it as bytes so it is never decoded
            # as text.
            with tf_file_io.FileIO(uri, 'rb') as f:
                img = Image.open(f).convert('RGB')
        # A variety of different calling libraries throw different exceptions
        # here. They all correspond to an unreadable file so we treat them
        # equivalently.
        # pylint: disable=broad-except
        except Exception as e:
            logging.exception('Error processing image %s: %s', uri, str(e))
            img_error_count.inc()
            return

        # Convert to desired format and output. io.BytesIO is available on
        # both Python 2 and 3, unlike cStringIO.
        output = io.BytesIO()
        img.save(output, 'jpeg')
        element[name] = base64.urlsafe_b64encode(output.getvalue())
    return element
def process(self, input_example):
    """Split one example's audio and note sequence and yield the pieces.

    For the 'test' split the record is processed with min_length=0 and
    max_length=-1; otherwise the instance's configured lengths are used. A
    per-example counter is incremented for every yielded example.

    Args:
      input_example: example whose 'id', 'audio' and 'sequence' features each
        hold one bytes value.

    Yields:
      Each example produced by split_audio_and_label_data.process_record.
    """
    features = input_example.features.feature
    tf.logging.info('Splitting %s', features['id'].bytes_list.value[0])

    wav_data = features['audio'].bytes_list.value[0]
    ns = music_pb2.NoteSequence.FromString(
        features['sequence'].bytes_list.value[0])

    Metrics.counter('split_wav', 'read_midi_wav_to_split').inc()

    if self._split == 'test':
        # For the 'test' split, use the full length audio and midi.
        example_counter_name = 'full_example'
        split_examples = split_audio_and_label_data.process_record(
            wav_data,
            ns,
            ns.id,
            min_length=0,
            max_length=-1,
            sample_rate=self._sample_rate)
    else:
        example_counter_name = 'split_example'
        split_examples = split_audio_and_label_data.process_record(
            wav_data, ns, ns.id, self._min_length, self._max_length,
            self._sample_rate)

    for example in split_examples:
        Metrics.counter('split_wav', example_counter_name).inc()
        yield example
def __init__(self, project_id, instance_id, table_id):
    """Constructor of the Write connector of Bigtable.

    Args:
      project_id(str): GCP Project in which to write the Rows
      instance_id(str): GCP Instance in which to write the Rows
      table_id(str): GCP Table in which to write the `DirectRows`
    """
    super(_BigTableWriteFn, self).__init__()
    # The three identifiers are kept together in one options dict.
    self.beam_options = dict(
        project_id=project_id,
        instance_id=instance_id,
        table_id=table_id,
    )
    self.table = None
    self.batcher = None
    self.written = Metrics.counter(self.__class__, 'Written Row')
def process(self, input_example):
    """Split one example's audio and note sequence and yield the pieces.

    When chunking is disabled the record is processed with min_length=0 and
    max_length=-1. Otherwise the configured lengths are used, and if
    processing raises AssertionError the offending input is written to
    `self._output_directory` before the error is re-raised.

    Args:
      input_example: example whose 'id', 'audio' and 'sequence' features each
        hold one bytes value.

    Yields:
      Each example produced by split_audio_and_label_data.process_record.

    Raises:
      AssertionError: re-raised after the bad example has been dumped.
    """
    tf.logging.info('Splitting %s',
                    input_example.features.feature['id'].bytes_list.value[0])

    wav_data = input_example.features.feature['audio'].bytes_list.value[0]

    ns = music_pb2.NoteSequence.FromString(
        input_example.features.feature['sequence'].bytes_list.value[0])

    Metrics.counter('split_wav', 'read_midi_wav_to_split').inc()

    if not self._chunk_files:
        split_examples = split_audio_and_label_data.process_record(
            wav_data,
            ns,
            ns.id,
            min_length=0,
            max_length=-1,
            sample_rate=self._sample_rate)

        for example in split_examples:
            Metrics.counter('split_wav', 'full_example').inc()
            yield example
    else:
        try:
            split_examples = split_audio_and_label_data.process_record(
                wav_data, ns, ns.id, self._min_length, self._max_length,
                self._sample_rate)

            for example in split_examples:
                Metrics.counter('split_wav', 'split_example').inc()
                yield example
        except AssertionError:
            # hashlib only accepts bytes on Python 3; encoding here keeps a
            # TypeError from masking the AssertionError being reported.
            ns_id = ns.id
            if not isinstance(ns_id, bytes):
                ns_id = ns_id.encode('utf-8')
            output_file = 'badexample-' + hashlib.md5(ns_id).hexdigest() + '.proto'
            output_path = os.path.join(self._output_directory, output_file)
            tf.logging.error('Exception processing %s. Writing file to %s',
                             ns.id, output_path)
            with tf.gfile.Open(output_path, 'w') as f:
                f.write(input_example.SerializeToString())
            raise
def __init__(self, namespace):
    """Keep the namespace and create the counter named by `self.LABEL`.

    Args:
      namespace: metric namespace, stored as `self.namespace`.
    """
    self.namespace = namespace
    label = self.LABEL
    self.counter = Metrics.counter(self.namespace, label)
def __init__(self):
    """Create three counters and one distribution in the 'main' namespace."""
    namespace = 'main'
    new_counter = Metrics.counter
    self.empty_line_counter = new_counter(namespace, 'empty_lines')
    self.word_length_counter = new_counter(namespace, 'word_lengths')
    self.word_counter = new_counter(namespace, 'total_words')
    self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
def __init__(self, count):
    """Create the 'recordsRead' counter and remember `count`.

    Args:
      count: value stored as `self._count`.
    """
    namespace = self.__class__
    self.records_read = Metrics.counter(namespace, 'recordsRead')
    self._count = count
def __setstate__(self, options):
    """Recreate the 'generate_row' counter when state is restored.

    Args:
      options: the pickled state; not used by this method.
    """
    namespace = self.__class__
    self.generate_row = Metrics.counter(namespace, 'generate_row')
def __init__(self):
    """Create the word counters and the word-length distribution."""
    namespace = self.__class__
    new_counter = Metrics.counter
    self.words_counter = new_counter(namespace, "words")
    self.word_lengths_counter = new_counter(namespace, "word_lengths")
    self.word_lengths_dist = Metrics.distribution(namespace, "word_len_dist")
    self.empty_line_counter = new_counter(namespace, "empty_lines")
def __init__(self):
    """Initialise the base class and create 'invalid_instance_counter'."""
    super(FlattenFn, self).__init__()
    namespace = self.__class__
    self.instance_counter = Metrics.counter(
        namespace, 'invalid_instance_counter')
def start_bundle(self):
    """Create the 'elementsplusone' counter and store it as `self.count`."""
    namespace = self.__class__
    self.count = Metrics.counter(namespace, 'elementsplusone')
def __setstate__(self, options):
    """Restore the instance from its pickled options.

    Args:
      options: stored as `self.beam_options`.

    The table, batcher and service-call metric are reset to None and the
    'Written Row' counter is recreated.
    """
    self.beam_options = options
    self.table = self.batcher = self.service_call_metric = None
    self.written = Metrics.counter(self.__class__, 'Written Row')
""" import apache_beam as beam from apache_beam.io import tfrecordio from apache_beam.metrics import Metrics import cStringIO import logging import os from PIL import Image from . import _inceptionlib from . import _util error_count = Metrics.counter('main', 'errorCount') rows_count = Metrics.counter('main', 'rowsCount') skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine') embedding_good = Metrics.counter('main', 'embedding_good') embedding_bad = Metrics.counter('main', 'embedding_bad') incompatible_image = Metrics.counter('main', 'incompatible_image') invalid_uri = Metrics.counter('main', 'invalid_file_name') unlabeled_image = Metrics.counter('main', 'unlabeled_image') class ExtractLabelIdsDoFn(beam.DoFn): """Extracts (uri, label_ids) tuples from CSV rows. """ def start_bundle(self, context=None): self.label_to_id_map = {}
def __init__(self):
    """Create the word counters and the word-length distribution."""
    namespace = self.__class__
    new_counter = Metrics.counter
    self.words_counter = new_counter(namespace, 'words')
    self.word_lengths_counter = new_counter(namespace, 'word_lengths')
    self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
    self.empty_line_counter = new_counter(namespace, 'empty_lines')
def __init__(self, number_of_counters, number_of_operations):
    """Store the operation count and create the requested counters.

    Args:
      number_of_counters: how many counters to create; they are named
        'name-0', 'name-1', ... in the 'do-not-publish' namespace.
      number_of_operations: value stored as `self.number_of_operations`.
    """
    self.number_of_operations = number_of_operations
    self.counters = [
        Metrics.counter('do-not-publish', 'name-{}'.format(index))
        for index in range(number_of_counters)
    ]
def process(self, kv):
    """Extract encoded performance (and optional score) examples.

    Args:
      kv: `(key, ns_str)` pair. `key` seeds the random number generator via
        its MD5 digest (text keys are UTF-8 encoded first); `ns_str` is a
        serialized NoteSequence.

    Yields:
      `generator_utils.to_example(example_dict)` for every sequence/augmenter
      combination that is not skipped. Skips are recorded in
      'extract_examples' counters.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    # hashlib only accepts bytes-like input on Python 3.
    key_bytes = key if isinstance(key, bytes) else key.encode('utf-8')
    m = hashlib.md5(key_bytes)
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = music_pb2.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they
    # are no longer needed.
    del ns.control_changes[:]

    if (self._min_hop_size_seconds and
            ns.total_time < self._min_hop_size_seconds):
        Metrics.counter('extract_examples', 'sequence_too_short').inc()
        return

    sequences = []
    for _ in range(self._num_replications):
        if self._max_hop_size_seconds:
            if self._max_hop_size_seconds == self._min_hop_size_seconds:
                # Split using fixed hop size.
                sequences += sequences_lib.split_note_sequence(
                    ns, self._max_hop_size_seconds)
            else:
                # Sample random hop positions such that each segment size is
                # within the specified range.
                hop_times = [0.0]
                while hop_times[-1] <= ns.total_time - self._min_hop_size_seconds:
                    if hop_times[-1] + self._max_hop_size_seconds < ns.total_time:
                        # It's important that we get a valid hop size here,
                        # since the remainder of the sequence is too long.
                        max_offset = min(
                            self._max_hop_size_seconds,
                            ns.total_time - self._min_hop_size_seconds -
                            hop_times[-1])
                    else:
                        # It's okay if the next hop time is invalid (in which
                        # case we'll just stop).
                        max_offset = self._max_hop_size_seconds
                    offset = random.uniform(
                        self._min_hop_size_seconds, max_offset)
                    hop_times.append(hop_times[-1] + offset)
                # Split at the chosen hop times (ignoring zero and the final
                # invalid time).
                sequences += sequences_lib.split_note_sequence(
                    ns, hop_times[1:-1])
        else:
            sequences += [ns]

    for performance_sequence in sequences:
        if self._encode_score_fns:
            # We need to extract a score.
            if not self._absolute_timing:
                # Beats are required to extract a score with metric timing.
                beats = [
                    ta for ta in performance_sequence.text_annotations
                    if (ta.annotation_type ==
                        music_pb2.NoteSequence.TextAnnotation.BEAT)
                    and ta.time <= performance_sequence.total_time
                ]
                if len(beats) < 2:
                    Metrics.counter(
                        'extract_examples', 'not_enough_beats').inc()
                    continue

                # Ensure the sequence starts and ends on a beat.
                performance_sequence = sequences_lib.extract_subsequence(
                    performance_sequence,
                    start_time=min(beat.time for beat in beats),
                    end_time=max(beat.time for beat in beats)
                )

                # Infer beat-aligned chords (only for relative timing).
                try:
                    chord_inference.infer_chords_for_sequence(
                        performance_sequence,
                        chord_change_prob=0.25,
                        chord_note_concentration=50.0,
                        add_key_signatures=True)
                except chord_inference.ChordInferenceError:
                    Metrics.counter(
                        'extract_examples', 'chord_inference_failed').inc()
                    continue

            # Infer melody regardless of relative/absolute timing.
            try:
                melody_instrument = melody_inference.infer_melody_for_sequence(
                    performance_sequence,
                    melody_interval_scale=2.0,
                    rest_prob=0.1,
                    instantaneous_non_max_pitch_prob=1e-15,
                    instantaneous_non_empty_rest_prob=0.0,
                    instantaneous_missing_pitch_prob=1e-15)
            except melody_inference.MelodyInferenceError:
                Metrics.counter(
                    'extract_examples', 'melody_inference_failed').inc()
                continue

            if not self._absolute_timing:
                # Now rectify detected beats to occur at fixed tempo.
                # TODO(iansimon): also include the alignment
                score_sequence, unused_alignment = sequences_lib.rectify_beats(
                    performance_sequence, beats_per_minute=SCORE_BPM)
            else:
                # Score uses same timing as performance.
                score_sequence = copy.deepcopy(performance_sequence)

            # Remove melody notes from performance.
            performance_notes = []
            for note in performance_sequence.notes:
                if note.instrument != melody_instrument:
                    performance_notes.append(note)
            del performance_sequence.notes[:]
            performance_sequence.notes.extend(performance_notes)

            # Remove non-melody notes from score.
            score_notes = []
            for note in score_sequence.notes:
                if note.instrument == melody_instrument:
                    score_notes.append(note)
            del score_sequence.notes[:]
            score_sequence.notes.extend(score_notes)

            # Remove key signatures and beat/chord annotations from
            # performance.
            del performance_sequence.key_signatures[:]
            del performance_sequence.text_annotations[:]

            Metrics.counter('extract_examples', 'extracted_score').inc()

        for augment_fn in self._augment_fns:
            # Augment and encode the performance.
            try:
                augmented_performance_sequence = augment_fn(
                    performance_sequence)
            except DataAugmentationError:
                Metrics.counter(
                    'extract_examples', 'augment_performance_failed').inc()
                continue
            example_dict = {
                'targets': self._encode_performance_fn(
                    augmented_performance_sequence)
            }
            if not example_dict['targets']:
                Metrics.counter(
                    'extract_examples', 'skipped_empty_targets').inc()
                continue

            if self._encode_score_fns:
                # Augment the extracted score.
                try:
                    augmented_score_sequence = augment_fn(score_sequence)
                except DataAugmentationError:
                    Metrics.counter(
                        'extract_examples', 'augment_score_failed').inc()
                    continue

                # Apply all score encoding functions.
                skip = False
                for name, encode_score_fn in self._encode_score_fns.items():
                    example_dict[name] = encode_score_fn(
                        augmented_score_sequence)
                    if not example_dict[name]:
                        Metrics.counter('extract_examples',
                                        'skipped_empty_%s' % name).inc()
                        skip = True
                        break
                if skip:
                    continue

            Metrics.counter('extract_examples', 'encoded_example').inc()
            Metrics.distribution(
                'extract_examples', 'performance_length_in_seconds').update(
                    int(augmented_performance_sequence.total_time))

            yield generator_utils.to_example(example_dict)
from google.cloud.proto.datastore.v1 import entity_pb2 from google.cloud.proto.datastore.v1 import query_pb2 from googledatastore import helper as datastore_helper, PropertyFilter import apache_beam as beam from apache_beam.io import ReadFromText from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore from apache_beam.metrics import Metrics from apache_beam.metrics.metric import MetricsFilter from apache_beam.utils.pipeline_options import GoogleCloudOptions from apache_beam.utils.pipeline_options import PipelineOptions from apache_beam.utils.pipeline_options import SetupOptions empty_line_counter = Metrics.counter('main', 'empty_lines') word_length_counter = Metrics.counter('main', 'word_lengths') word_counter = Metrics.counter('main', 'total_words') class WordExtractingDoFn(beam.DoFn): """Parse each line of input text into words.""" def process(self, element): """Returns an iterator over words in contents of Cloud Datastore entity. The element is a line of text. If the line is blank, note that, too. Args: element: the input element to be processed Returns: The processed element. """
def __init__(self):
    """Initialise the base class and create the three word counters."""
    super(WordExtractingDoFn, self).__init__()
    namespace = self.__class__
    new_counter = Metrics.counter
    self.words_counter = new_counter(namespace, 'words')
    self.word_lengths_counter = new_counter(namespace, 'word_lengths')
    self.empty_line_counter = new_counter(namespace, 'empty_lines')
def __init__(self):
    """Create the 'total_bytes.count' counter in the 'pardo' namespace."""
    namespace, name = 'pardo', 'total_bytes.count'
    self.counter = Metrics.counter(namespace, name)
def __init__(self, pattern):
    """Keep the pattern and create the matched/unmatched word counters.

    Args:
      pattern: value kept on the instance as `self.pattern`.
    """
    self.pattern = pattern
    # Custom metrics are updated while the pipeline runs; these two count
    # matched and unmatched words.
    # NOTE(review): 'umatched' (sic) is kept as-is; the attribute name and the
    # metric name are both visible outside this method.
    namespace = self.__class__
    self.matched_words = Metrics.counter(namespace, 'matched_words')
    self.umatched_words = Metrics.counter(namespace, 'umatched_words')
import uuid from google.cloud.proto.datastore.v1 import entity_pb2 from google.cloud.proto.datastore.v1 import query_pb2 from googledatastore import helper as datastore_helper, PropertyFilter import apache_beam as beam from apache_beam.io import ReadFromText from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore from apache_beam.metrics import Metrics from apache_beam.utils.pipeline_options import GoogleCloudOptions from apache_beam.utils.pipeline_options import PipelineOptions from apache_beam.utils.pipeline_options import SetupOptions empty_line_counter = Metrics.counter('main', 'empty_lines') word_length_counter = Metrics.counter('main', 'word_lengths') word_counter = Metrics.counter('main', 'total_words') class WordExtractingDoFn(beam.DoFn): """Parse each line of input text into words.""" def process(self, element): """Returns an iterator over words in contents of Cloud Datastore entity. The element is a line of text. If the line is blank, note that, too. Args: element: the input element to be processed Returns: The processed element. """
import datetime import numpy as np import apache_beam as beam from apache_beam.io import filesystem from apache_beam.io import tfrecordio from apache_beam.metrics import Metrics import tensorflow as tf from tensorflow_transform.tf_metadata import dataset_schema from tensorflow_transform import coders from config import PROJECT_ID, DATA_DIR, TFRECORD_DIR from util import schema partition_train = Metrics.counter('partition', 'train') partition_validation = Metrics.counter('partition', 'validation') partition_test = Metrics.counter('partition', 'test') examples_failed = Metrics.counter('build', 'failed') def buildExample(raw_input): ''' Build a dictionary that contains all the features&label to store as TFRecord Args: tuple: a tuple containing the data to build the example from Returns: a dictionary of features ''' try: elements = raw_input.split(',')
def __init__(self):
    """Initialise the base class and create 'calc_error_counter'."""
    super(CalcFreqFn, self).__init__()
    namespace = self.__class__
    self.error_counter = Metrics.counter(namespace, 'calc_error_counter')
def process(self, inputs):
    """Generates the BLS periodogram for a light curve.

    Args:
      inputs: A dict with a "light_curve" entry (whose `light_curve` field
        provides `time`, `flux` and `norm_curve`) and a "kepler_id" entry
        (used only to name the error counter).

    Yields:
      The same `inputs` dict with "periodogram" set to a
      `bls_pb2.Periodogram` holding one result per (period, nbins) pair.
      Nothing is yielded if any fit raises ValueError; the failure is
      recorded in a "bls-error-<kepler_id>" counter instead.
    """
    Metrics.counter(self.__class__.__name__, "inputs-seen").inc()

    # Unpack the light curve. The builtin `float` is the dtype the removed
    # `np.float` alias referred to.
    lc = inputs["light_curve"]
    time = np.array(lc.light_curve.time, dtype=float)
    flux = np.array(lc.light_curve.flux, dtype=float)
    norm_curve = np.array(lc.light_curve.norm_curve, dtype=float)
    flux /= norm_curve  # Normalize flux.

    # Fit periodogram.
    bls = box_least_squares.BoxLeastSquares(time, flux, capacity=self.max_nbins)
    results = []
    # Builtin zip works on Python 2 and 3 (itertools.izip is Python 2 only).
    for period, nbins in zip(self.all_periods, self.all_nbins):
        bin_width = period / nbins

        # Compute the minimum number of bins for a transit.
        duration_min = 0
        if self.duration_density_max:
            duration_min = self.duration_min_fraction * _max_duration(
                period, density_star=self.duration_density_max)
        if self.duration_min_days:
            duration_min = max(self.duration_min_days, duration_min)
        width_min = int(np.maximum(1, np.floor(duration_min / bin_width)))

        # Compute the maximum number of bins for a transit.
        if self.duration_density_min:
            duration_max = _max_duration(
                period, density_star=self.duration_density_min)
            width_max = int(np.ceil(duration_max / bin_width))
        else:
            width_max = int(np.ceil(0.25 * nbins))

        weight_min = self.weight_min_factor * width_min / nbins
        weight_max = 1
        options = bls_pb2.BlsOptions(
            width_min=width_min,
            width_max=width_max,
            weight_min=weight_min,
            weight_max=weight_max)
        try:
            result = bls.fit(period, nbins, options)
        except ValueError:
            # One failed fit drops the whole input.
            Metrics.counter(self.__class__.__name__, "bls-error-{}".format(
                inputs["kepler_id"])).inc()
            return
        results.append(result)

    inputs["periodogram"] = bls_pb2.Periodogram(results=results)

    yield inputs
def __init__(self):
    """Create the 'generate_row' counter namespaced by this class."""
    namespace = self.__class__
    self.generate_row = Metrics.counter(namespace, 'generate_row')
def __init__(self, bootstrap_servers, topic, expansion_service=None):
    """Store the connection settings and create the 'elements_sum' counter.

    Args:
      bootstrap_servers: stored as `self.bootstrap_servers`.
      topic: stored as `self.topic`.
      expansion_service: stored as `self.expansion_service`; defaults to
        None.
    """
    self.bootstrap_servers, self.topic = bootstrap_servers, topic
    self.expansion_service = expansion_service
    self.sum_counter = Metrics.counter('source', 'elements_sum')
def __init__(self):
    """Create the 'Print Row' counter, namespaced by the class name."""
    # Imported locally, as in the original.
    from apache_beam.metrics import Metrics

    class_name = self.__class__.__name__
    self.print_row = Metrics.counter(class_name, 'Print Row')
def __init__(self):
    """Create the counter named by the enclosing `counter_name` and log it."""
    self.counter = Metrics.counter(self.__class__, counter_name)
    # Pass the value as a logging argument: formatting is deferred until the
    # record is emitted, and a tuple-valued name cannot break the format.
    logging.info('counter: %s', self.counter.metric_name)
def CreateAggregatorsDict(namespace="main"):
    """Creates metrics dict.

    Args:
      namespace: metric namespace used for every counter.

    Returns:
      A dict mapping each name in `CONFIG_` to a counter of that name.
    """
    aggregators = {}
    for name in CONFIG_:
        aggregators[name] = Metrics.counter(namespace, name)
    return aggregators
def process(self, kv):
    """Extract encoded performance (and optional score) examples.

    Args:
      kv: `(key, ns_str)` pair. `key` is UTF-8 encoded and its MD5 digest
        seeds the random number generator; `ns_str` is a serialized
        NoteSequence.

    Yields:
      `generator_utils.to_example(example_dict)` for every sequence/augmenter
      combination that is not skipped. Skips are recorded in
      'extract_examples' counters.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    m = hashlib.md5(key.encode('utf-8'))
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = note_seq.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they
    # are no longer needed.
    del ns.control_changes[:]

    # Sequences shorter than the minimum hop size produce no examples.
    if (self._min_hop_size_seconds and
            ns.total_time < self._min_hop_size_seconds):
        Metrics.counter('extract_examples', 'sequence_too_short').inc()
        return

    sequences = []
    for _ in range(self._num_replications):
        if self._max_hop_size_seconds:
            if self._max_hop_size_seconds == self._min_hop_size_seconds:
                # Split using fixed hop size.
                sequences += sequences_lib.split_note_sequence(
                    ns, self._max_hop_size_seconds)
            else:
                # Sample random hop positions such that each segment size is
                # within the specified range.
                hop_times = [0.0]
                while hop_times[-1] <= ns.total_time - self._min_hop_size_seconds:
                    if hop_times[-1] + self._max_hop_size_seconds < ns.total_time:
                        # It's important that we get a valid hop size here,
                        # since the remainder of the sequence is too long.
                        max_offset = min(
                            self._max_hop_size_seconds,
                            ns.total_time - self._min_hop_size_seconds -
                            hop_times[-1])
                    else:
                        # It's okay if the next hop time is invalid (in which
                        # case we'll just stop).
                        max_offset = self._max_hop_size_seconds
                    offset = random.uniform(
                        self._min_hop_size_seconds, max_offset)
                    hop_times.append(hop_times[-1] + offset)
                # Split at the chosen hop times (ignoring zero and the final
                # invalid time).
                sequences += sequences_lib.split_note_sequence(
                    ns, hop_times[1:-1])
        else:
            # No hop size configured: use the whole sequence once per
            # replication.
            sequences += [ns]

    for performance_sequence in sequences:
        if self._encode_score_fns:
            # We need to extract a score.
            if not self._absolute_timing:
                # Beats are required to extract a score with metric timing.
                beats = [
                    ta for ta in performance_sequence.text_annotations
                    if ta.annotation_type == BEAT
                    and ta.time <= performance_sequence.total_time
                ]
                if len(beats) < 2:
                    Metrics.counter(
                        'extract_examples', 'not_enough_beats').inc()
                    continue

                # Ensure the sequence starts and ends on a beat.
                performance_sequence = sequences_lib.extract_subsequence(
                    performance_sequence,
                    start_time=min(beat.time for beat in beats),
                    end_time=max(beat.time for beat in beats)
                )

                # Infer beat-aligned chords (only for relative timing).
                try:
                    chord_inference.infer_chords_for_sequence(
                        performance_sequence,
                        chord_change_prob=0.25,
                        chord_note_concentration=50.0,
                        add_key_signatures=True)
                except chord_inference.ChordInferenceError:
                    Metrics.counter(
                        'extract_examples', 'chord_inference_failed').inc()
                    continue

            # Infer melody regardless of relative/absolute timing.
            try:
                melody_instrument = melody_inference.infer_melody_for_sequence(
                    performance_sequence,
                    melody_interval_scale=2.0,
                    rest_prob=0.1,
                    instantaneous_non_max_pitch_prob=1e-15,
                    instantaneous_non_empty_rest_prob=0.0,
                    instantaneous_missing_pitch_prob=1e-15)
            except melody_inference.MelodyInferenceError:
                Metrics.counter(
                    'extract_examples', 'melody_inference_failed').inc()
                continue

            if not self._absolute_timing:
                # Now rectify detected beats to occur at fixed tempo.
                # TODO(iansimon): also include the alignment
                score_sequence, unused_alignment = sequences_lib.rectify_beats(
                    performance_sequence, beats_per_minute=SCORE_BPM)
            else:
                # Score uses same timing as performance.
                score_sequence = copy.deepcopy(performance_sequence)

            # Remove melody notes from performance.
            performance_notes = []
            for note in performance_sequence.notes:
                if note.instrument != melody_instrument:
                    performance_notes.append(note)
            del performance_sequence.notes[:]
            performance_sequence.notes.extend(performance_notes)

            # Remove non-melody notes from score.
            score_notes = []
            for note in score_sequence.notes:
                if note.instrument == melody_instrument:
                    score_notes.append(note)
            del score_sequence.notes[:]
            score_sequence.notes.extend(score_notes)

            # Remove key signatures and beat/chord annotations from
            # performance.
            del performance_sequence.key_signatures[:]
            del performance_sequence.text_annotations[:]

            Metrics.counter('extract_examples', 'extracted_score').inc()

        for augment_fn in self._augment_fns:
            # Augment and encode the performance.
            try:
                augmented_performance_sequence = augment_fn(
                    performance_sequence)
            except DataAugmentationError:
                Metrics.counter(
                    'extract_examples', 'augment_performance_failed').inc()
                continue
            example_dict = {
                'targets': self._encode_performance_fn(
                    augmented_performance_sequence)
            }
            if not example_dict['targets']:
                Metrics.counter(
                    'extract_examples', 'skipped_empty_targets').inc()
                continue

            if (self._random_crop_length and
                    len(example_dict['targets']) > self._random_crop_length):
                # Take a random crop of the encoded performance.
                max_offset = len(example_dict['targets']) - self._random_crop_length
                offset = random.randrange(max_offset + 1)
                example_dict['targets'] = example_dict['targets'][
                    offset:offset + self._random_crop_length]

            if self._encode_score_fns:
                # Augment the extracted score.
                try:
                    augmented_score_sequence = augment_fn(score_sequence)
                except DataAugmentationError:
                    Metrics.counter(
                        'extract_examples', 'augment_score_failed').inc()
                    continue

                # Apply all score encoding functions. An empty encoding from
                # any of them discards the whole example.
                skip = False
                for name, encode_score_fn in self._encode_score_fns.items():
                    example_dict[name] = encode_score_fn(
                        augmented_score_sequence)
                    if not example_dict[name]:
                        Metrics.counter('extract_examples',
                                        'skipped_empty_%s' % name).inc()
                        skip = True
                        break
                if skip:
                    continue

            Metrics.counter('extract_examples', 'encoded_example').inc()
            Metrics.distribution(
                'extract_examples', 'performance_length_in_seconds').update(
                    int(augmented_performance_sequence.total_time))

            yield generator_utils.to_example(example_dict)
def __init__(self):
    """Initialise the base class and create the CSV format-error counter."""
    super(FormatCsvRowFn, self).__init__()
    namespace = self.__class__
    self.num_parse_errors = Metrics.counter(
        namespace, 'num_format_csv_row_errors')
def process(self, kv):
    """Extract fixed-length (2048-token) performance examples.

    Args:
      kv: `(key, ns_str)` pair. `key` seeds the random number generator via
        its MD5 digest (text keys are UTF-8 encoded first); `ns_str` is a
        serialized NoteSequence.

    Yields:
      `generator_utils.to_example(example_dict)` for every replication and
      augmenter whose encoded performance has at least 2048 tokens. With
      `self._melody` set, the example carries 'melody', 'performance' and
      'targets'; otherwise 'inputs' and 'targets'.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    # hashlib only accepts bytes-like input on Python 3.
    key_bytes = key if isinstance(key, bytes) else key.encode('utf-8')
    m = hashlib.md5(key_bytes)
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = note_seq.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they
    # are no longer needed.
    del ns.control_changes[:]

    for _ in range(self._num_replications):
        for augment_fn in self._augment_fns:
            # Augment and encode the performance.
            try:
                augmented_performance_sequence = augment_fn(ns)
            except DataAugmentationError:
                Metrics.counter(
                    'extract_examples', 'augment_performance_failed').inc()
                continue
            seq = self._encode_performance_fn(augmented_performance_sequence)
            # feed in performance as both input/output to music transformer
            # chopping sequence into length 2048 (throw out shorter sequences)
            if len(seq) >= 2048:
                max_offset = len(seq) - 2048
                offset = random.randrange(max_offset + 1)
                cropped_seq = seq[offset:offset + 2048]

                example_dict = {
                    'inputs': cropped_seq,
                    'targets': cropped_seq
                }

                if self._melody:
                    # decode truncated performance sequence for melody
                    # inference
                    decoded_midi = self._decode_performance_fn(cropped_seq)
                    decoded_ns = note_seq.midi_io.midi_file_to_note_sequence(
                        decoded_midi)

                    # extract melody from cropped performance sequence
                    melody_instrument = melody_inference.infer_melody_for_sequence(
                        decoded_ns,
                        melody_interval_scale=2.0,
                        rest_prob=0.1,
                        instantaneous_non_max_pitch_prob=1e-15,
                        instantaneous_non_empty_rest_prob=0.0,
                        instantaneous_missing_pitch_prob=1e-15)

                    # remove non-melody notes from score
                    score_sequence = copy.deepcopy(decoded_ns)
                    score_notes = []
                    for note in score_sequence.notes:
                        if note.instrument == melody_instrument:
                            score_notes.append(note)
                    del score_sequence.notes[:]
                    score_sequence.notes.extend(score_notes)

                    # encode melody
                    encode_score_fn = self._encode_score_fns['melody']
                    example_dict['melody'] = encode_score_fn(score_sequence)
                    # make sure performance input also matches targets; needed
                    # for compatibility of both perf and (mel & perf)
                    # autoencoders

                    if self._noisy:
                        # randomly sample a pitch shift to construct noisy
                        # performance
                        all_pitches = [x.pitch for x in decoded_ns.notes]
                        min_val = min(all_pitches)
                        max_val = max(all_pitches)
                        # Materialise the range: on Python 3 a range object
                        # has no remove(), so the call below would raise
                        # AttributeError instead of dropping the zero shift.
                        transpose_range = list(
                            range(-(min_val - 21), 108 - max_val + 1))
                        try:
                            transpose_range.remove(0)  # make sure you transpose
                        except ValueError:
                            pass
                        transpose_amount = random.choice(transpose_range)
                        augmented_ns, _ = sequences_lib.transpose_note_sequence(
                            decoded_ns, transpose_amount, min_allowed_pitch=21,
                            max_allowed_pitch=108, in_place=False)
                        aug_seq = self._encode_performance_fn(augmented_ns)
                        example_dict['performance'] = aug_seq
                    else:
                        example_dict['performance'] = example_dict['targets']
                    del example_dict['inputs']

                Metrics.counter('extract_examples', 'encoded_example').inc()
                Metrics.distribution(
                    'extract_examples', 'performance_length_in_seconds').update(
                        int(augmented_performance_sequence.total_time))

                yield generator_utils.to_example(example_dict)
def __init__(self, vals):
    """Keep `vals` and create the 'outputs' counter in the 'main' namespace.

    Args:
      vals: value stored as `self._vals`.
    """
    self._vals = vals
    namespace, name = 'main', 'outputs'
    self._output_counter = Metrics.counter(namespace, name)
def __init__(self):
    """Create the runtime start/end distributions in the 'pardo' namespace."""
    namespace = 'pardo'
    new_distribution = Metrics.distribution
    self.runtime_start = new_distribution(namespace, 'runtime.start')
    self.runtime_end = new_distribution(namespace, 'runtime.end')
def __init__(self):
  """Initializes the base class, then creates the parse-error counter."""
  super(ParseEventFn, self).__init__()
  # The counter is namespaced by the concrete class of this instance.
  metric_namespace = self.__class__
  self.num_parse_errors = Metrics.counter(
      metric_namespace, 'num_parse_errors')
def combine_matching_seqs(ns_ids):
  """Tags a sequence with the comma-joined ids that map to it.

  Args:
    ns_ids: A `(sequence, ids)` pair, where `ids` is an iterable of strings.

  Returns:
    The same sequence object, with its `id` attribute overwritten by the
    comma-joined ids. Also increments the 'unique-examples' counter.
  """
  sequence, matching_ids = ns_ids
  beam_metrics.counter('ExtractExamplesDoFn', 'unique-examples').inc()
  joined_ids = ','.join(matching_ids)
  sequence.id = joined_ids
  return sequence
# PipelineOptions lives at different module paths depending on the installed
# apache_beam; try each known location in turn. The lookups are nested so that
# the last fallback is actually reached when the first two imports both fail
# (two sibling `except ImportError` clauses would never run the second one).
try:
  try:
    from apache_beam.options.pipeline_options import PipelineOptions
  except ImportError:
    from apache_beam.utils.pipeline_options import PipelineOptions
except ImportError:
  from apache_beam.utils.options import PipelineOptions
from PIL import Image
import tensorflow as tf
from tensorflow.contrib.slim.python.slim.nets import inception_v3 as inception
from tensorflow.python.framework import errors
from tensorflow.python.lib.io import file_io

slim = tf.contrib.slim

# Module-level counters, all registered under the 'main' namespace.
error_count = Metrics.counter('main', 'errorCount')
missing_label_count = Metrics.counter('main', 'missingLabelCount')
csv_rows_count = Metrics.counter('main', 'csvRowsCount')
labels_count = Metrics.counter('main', 'labelsCount')
labels_without_ids = Metrics.counter('main', 'labelsWithoutIds')
existing_file = Metrics.counter('main', 'existingFile')
non_existing_file = Metrics.counter('main', 'nonExistingFile')
skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine')
embedding_good = Metrics.counter('main', 'embedding_good')
embedding_bad = Metrics.counter('main', 'embedding_bad')
incompatible_image = Metrics.counter('main', 'incompatible_image')
# Note: this counter is reported under the name 'invalid_file_name'.
invalid_uri = Metrics.counter('main', 'invalid_file_name')
unlabeled_image = Metrics.counter('main', 'unlabeled_image')
unknown_label = Metrics.counter('main', 'unknown_label')
def main():
  """Builds and runs the pipeline exporting 'Job' entities to BigQuery.

  The pipeline reads 'Job' entities from Datastore for the time range given
  by the export options, converts each entity to a row dict, and writes the
  rows to the partitioned `jobs` table. It then waits for the run to finish
  and prints the pipeline counters.
  """
  project = 'chromeperf'
  options = PipelineOptions()
  options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
  options.view_as(GoogleCloudOptions).project = project
  bq_export_options = options.view_as(BqExportOptions)

  p = beam.Pipeline(options=options)
  # Counters incremented from ConvertEntity below.
  entities_read = Metrics.counter('main', 'entities_read')
  failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms')

  # Read 'Job' entities from datastore.
  job_entities = (
      p
      | 'ReadFromDatastore(Job)' >> ReadTimestampRangeFromDatastore(
          {
              'project': project,
              'kind': 'Job'
          },
          time_range_provider=bq_export_options.GetTimeRangeProvider(),
          timestamp_property='created'))

  def ConvertEntity(entity):
    """Returns a one-row list for `entity`, or [] if it cannot be converted."""
    entities_read.inc()
    try:
      row_dict = JobEntityToRowDict(entity)
    except UnconvertibleJobError:
      # Log with traceback, count the failure, and emit nothing so that one
      # bad entity does not fail the whole pipeline.
      logging.getLogger().exception('Failed to convert Job')
      failed_entity_transforms.inc()
      return []
    return [row_dict]

  job_dicts = (job_entities
               | 'ConvertEntityToRow(Job)' >> beam.FlatMap(ConvertEntity))

  # The string statement below records the table DDL that the schema dict
  # after it mirrors; it has no runtime effect.
  """ CREATE TABLE `chromeperf.chromeperf_dashboard_data.jobs` (id INT64 NOT NULL, arguments STRING NOT NULL, bug_id INT64, comparison_mode STRING, gerrit STRUCT<server STRING, change_id STRING>, name STRING, tags STRING, user_email STRING, create_time TIMESTAMP NOT NULL, start_time TIMESTAMP, update_time TIMESTAMP NOT NULL, started BOOLEAN NOT NULL, done BOOLEAN NOT NULL, cancelled BOOLEAN NOT NULL, cancel_reason STRING, task STRING, exception STRING, exception_details STRING, difference_count INT64, retry_count INT64 NOT NULL, benchmark_arguments STRUCT<benchmark STRING, story STRING, story_tags STRING, chart STRING, statistic STRING>, use_execution_engine BOOLEAN NOT NULL, completed BOOLEAN NOT NULL, failed BOOLEAN NOT NULL, running BOOLEAN NOT NULL, configuration STRING) PARTITION BY DATE(`create_time`); """  # pylint: disable=pointless-string-statement
  bq_job_schema = {
      'fields': [
          {
              'name': 'id',
              'type': 'INT64',
              'mode': 'REQUIRED'
          },
          {
              'name': 'arguments',
              'type': 'STRING',
              'mode': 'REQUIRED'
          },
          {
              'name': 'bug_id',
              'type': 'INT64',
              'mode': 'NULLABLE'
          },
          {
              'name': 'comparison_mode',
              'type': 'STRING',
              'mode': 'NULLABLE'
          },
          {
              'name': 'gerrit',
              'type': 'RECORD',
              'mode': 'NULLABLE',
              'fields': [
                  {
                      'name': 'server',
                      'type': 'STRING',
                      'mode': 'NULLABLE'
                  },
                  {
                      'name': 'change_id',
                      'type': 'STRING',
                      'mode': 'NULLABLE'
                  },
              ]
          },
          {
              'name': 'name',
              'type': 'STRING',
              'mode': 'NULLABLE'
          },
          {
              'name': 'tags',
              'type': 'STRING',
              'mode': 'NULLABLE'
          },
          {
              'name': 'user_email',
              'type': 'STRING',
              'mode': 'NULLABLE'
          },
          {
              'name': 'create_time',
              'type': 'TIMESTAMP',
              'mode': 'REQUIRED'
          },
          {
              'name': 'start_time',
              'type': 'TIMESTAMP',
              'mode': 'NULLABLE'
          },
          {
              'name': 'update_time',
              'type': 'TIMESTAMP',
              'mode': 'REQUIRED'
          },
          {
              'name': 'started',
              'type': 'BOOLEAN',
              'mode': 'REQUIRED'
          },
          {
              'name': 'done',
              'type': 'BOOLEAN',
              'mode': 'REQUIRED'
          },
          {
              'name': 'cancelled',
              'type': 'BOOLEAN',
              'mode': 'REQUIRED'
          },
          {
              'name': 'cancel_reason',
              'type': 'STRING',
              'mode': 'NULLABLE'
          },
          {
              'name': 'task',
              'type': 'STRING',
              'mode': 'NULLABLE'
          },
          {
              'name': 'exception',
              'type': 'STRING',
              'mode': 'NULLABLE'
          },
          {
              'name': 'exception_details',
              'type': 'STRING',
              'mode': 'NULLABLE'
          },
          {
              'name': 'difference_count',
              'type': 'INT64',
              'mode': 'NULLABLE'
          },
          {
              'name': 'retry_count',
              'type': 'INT64',
              'mode': 'REQUIRED'
          },
          {
              'name': 'benchmark_arguments',
              'type': 'RECORD',
              'mode': 'NULLABLE',
              'fields': [
                  {
                      'name': 'benchmark',
                      'type': 'STRING',
                      'mode': 'NULLABLE'
                  },
                  {
                      'name': 'story',
                      'type': 'STRING',
                      'mode': 'NULLABLE'
                  },
                  {
                      'name': 'story_tags',
                      'type': 'STRING',
                      'mode': 'NULLABLE'
                  },
                  {
                      'name': 'chart',
                      'type': 'STRING',
                      'mode': 'NULLABLE'
                  },
                  {
                      'name': 'statistic',
                      'type': 'STRING',
                      'mode': 'NULLABLE'
                  },
              ]
          },
          {
              'name': 'use_execution_engine',
              'type': 'BOOLEAN',
              'mode': 'REQUIRED'
          },
          {
              'name': 'completed',
              'type': 'BOOLEAN',
              'mode': 'REQUIRED'
          },
          {
              'name': 'failed',
              'type': 'BOOLEAN',
              'mode': 'REQUIRED'
          },
          {
              'name': 'running',
              'type': 'BOOLEAN',
              'mode': 'REQUIRED'
          },
          {
              'name': 'configuration',
              'type': 'STRING',
              'mode': 'NULLABLE'
          },
      ]
  }

  # 'dataset' may be a RuntimeValueProvider, so we have to defer calculating
  # the table name until runtime. The simplest way to do this is by passing a
  # function for the table name rather than a string.
  def TableNameFn(unused_element):
    """Returns the '<project>:<dataset>.jobs<suffix>' destination table."""
    return '{}:{}.jobs{}'.format(project, bq_export_options.dataset.get(),
                                 bq_export_options.table_suffix)

  _ = job_dicts | 'WriteToBigQuery(jobs)' >> WriteToPartitionedBigQuery(
      TableNameFn, bq_job_schema, element_to_yyyymmdd_fn=_JobToYYYYMMDD)

  result = p.run()
  result.wait_until_finish()
  PrintCounters(result)
from apache_beam.metrics import Metrics
# PipelineOptions is taken from whichever of these two module paths the
# installed apache_beam provides.
try:
  from apache_beam.utils.pipeline_options import PipelineOptions
except ImportError:
  from apache_beam.utils.options import PipelineOptions
from PIL import Image
import tensorflow as tf
from tensorflow.contrib.slim.python.slim.nets import inception_v3 as inception
from tensorflow.python.framework import errors
from tensorflow.python.lib.io import file_io

from google.cloud.ml.io import SaveFeatures

slim = tf.contrib.slim

# Module-level counters, all registered under the 'main' namespace.
error_count = Metrics.counter('main', 'errorCount')
missing_label_count = Metrics.counter('main', 'missingLabelCount')
csv_rows_count = Metrics.counter('main', 'csvRowsCount')
labels_count = Metrics.counter('main', 'labelsCount')
labels_without_ids = Metrics.counter('main', 'labelsWithoutIds')
existing_file = Metrics.counter('main', 'existingFile')
non_existing_file = Metrics.counter('main', 'nonExistingFile')
skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine')
embedding_good = Metrics.counter('main', 'embedding_good')
embedding_bad = Metrics.counter('main', 'embedding_bad')
incompatible_image = Metrics.counter('main', 'incompatible_image')
# Note: this counter is reported under the name 'invalid_file_name'.
invalid_uri = Metrics.counter('main', 'invalid_file_name')
unlabeled_image = Metrics.counter('main', 'unlabeled_image')
unknown_label = Metrics.counter('main', 'unknown_label')


def _is_production_tensorflow():
def __setstate__(self, options):
  """Rebuilds instance state from `options` (pickle `__setstate__` hook).

  Only the options are restored; the table and batcher handles are reset to
  None and the 'Written Row' counter is created afresh.
  """
  written_counter = Metrics.counter(self.__class__, 'Written Row')
  self.written = written_counter
  self.batcher = None
  self.table = None
  self.beam_options = options
def _process_ns(self, ns):
  """Filters a NoteSequence and yields the examples extracted from it.

  When `self._filters` is set, sequences that are too long, have too many
  notes, cannot be quantized, or fail the velocity / metric-position /
  drums-only checks are dropped, each with its own counter. Surviving
  sequences are converted to tensors; every extracted example is converted
  back to a NoteSequence and re-extracted as a round-trip check.

  Args:
    ns: The NoteSequence to process.

  Yields:
    `(example_ns, ns.id)` pairs, one per extracted example that survives the
    re-extraction checks.
  """
  if self._filters:
    if ns.total_time > self._filters['max_total_time']:
      logging.info('Skipping %s: total_time=%f', ns.id, ns.total_time)
      beam_metrics.counter('ExtractExamplesDoFn', 'filtered-too-long').inc()
      return
    if len(ns.notes) > self._filters['max_num_notes']:
      logging.info('Skipping %s: num_notes=%d', ns.id, len(ns.notes))
      beam_metrics.counter(
          'ExtractExamplesDoFn', 'filtered-too-many-notes').inc()
      return
    try:
      qns = note_seq.quantize_note_sequence(ns, steps_per_quarter=16)
    except (note_seq.BadTimeSignatureError,
            note_seq.NonIntegerStepsPerBarError,
            note_seq.NegativeTimeError):
      beam_metrics.counter('ExtractExamplesDoFn', 'quantize-failed').inc()
      return
    # Collect the distinct velocities and distinct start positions (modulo 16
    # quantized steps) of the notes matching the 'is_drum' filter, and track
    # whether every note in the sequence is a drum note.
    vels = set()
    metric_positions = set()
    drums_only = True
    for note in qns.notes:
      drums_only &= note.is_drum
      if ((self._filters['is_drum'] is None or
           note.is_drum == self._filters['is_drum'])
          and note.velocity > 0):
        vels.add(note.velocity)
        metric_positions.add(note.quantized_start_step % 16)
    if len(vels) < self._filters['min_velocities']:
      beam_metrics.counter(
          'ExtractExamplesDoFn', 'filtered-min-velocities').inc()
      return
    if len(metric_positions) < self._filters['min_metric_positions']:
      beam_metrics.counter(
          'ExtractExamplesDoFn', 'filtered-min-metric-positions').inc()
      return
    if self._filters['drums_only'] and not drums_only:
      beam_metrics.counter(
          'ExtractExamplesDoFn', 'filtered-drums-only').inc()
      return

  beam_metrics.counter('ExtractExamplesDoFn', 'unfiltered-sequences').inc()
  logging.info('Converting %s to tensors', ns.id)
  extracted_examples = self._config.data_converter.to_tensors(ns)
  if not extracted_examples.outputs:
    beam_metrics.counter('ExtractExamplesDoFn', 'empty-extractions').inc()
    return
  beam_metrics.counter('ExtractExamplesDoFn', 'extracted-examples').inc(
      len(extracted_examples.outputs))
  # Only the second and third fields of each extracted example (outputs and
  # controls) are used here.
  for _, outputs, controls, _ in zip(*extracted_examples):
    if controls.size:
      example_ns = self._config.data_converter.from_tensors(
          [outputs], [controls])[0]
    else:
      example_ns = self._config.data_converter.from_tensors([outputs])[0]
    # Try to re-encode.
    # TODO(adarob): For now we filter and count examples that cannot be
    # re-extracted, but ultimately the converter should filter these or avoid
    # producing them all together.
    reextracted_examples = self._config.data_converter.to_tensors(
        example_ns).inputs
    assert len(reextracted_examples) <= 1
    if not reextracted_examples:
      logging.warning(
          'Extracted example NoteSequence does not reproduce example. '
          'Skipping: %s', example_ns)
      beam_metrics.counter('ExtractExamplesDoFn', 'empty-reextraction').inc()
      continue
    # Extra checks if the code returns multiple segments.
    # TODO(fjord): should probably make this recursive for cases with more
    # than 1 level of hierarchy.
    if isinstance(outputs, list):
      if len(outputs) != len(reextracted_examples[0]):
        logging.warning(
            'Re-extracted example tensor has different number of segments. '
            'ID: %s. original %d, reextracted %d. Skipping.', ns.id,
            len(outputs), len(reextracted_examples[0]))
        beam_metrics.counter(
            'ExtractExamplesDoFn', 'different-reextraction-count').inc()
        continue
      # NOTE(review): a mismatching segment is only logged and counted here;
      # the example still appears to be yielded below -- confirm intended.
      for i in range(len(outputs)):
        if not np.array_equal(reextracted_examples[0][i], outputs[i]):
          logging.warning(
              'Re-extracted example tensor does not equal original example. '
              'ID: %s. Index %d. NoteSequence: %s', ns.id, i, example_ns)
          beam_metrics.counter(
              'ExtractExamplesDoFn', 'different-reextraction').inc()
    yield example_ns, ns.id
def __init__(self, namespace):
  """Stores `namespace` and creates its runtime distribution metric."""
  self.namespace = namespace
  runtime_metric = Metrics.distribution(self.namespace, RUNTIME_LABEL)
  self.runtime = runtime_metric