def __init__(self):
   """Create the counter, distribution and gauge metrics for this class."""
   metric_ns = self.__class__
   self.total_metric = Metrics.counter(metric_ns, 'total_values')
   self.dist_metric = Metrics.distribution(metric_ns, 'distribution_values')
   # TODO(ajamato): Add a verifier for gauge once it is supported by the SDKs
   # and runners.
   self.latest_metric = Metrics.gauge(metric_ns, 'latest_value')
예제 #2
0
 def __init__(self):
   """Initialise the DoFn and register its transaction metrics."""
   super(BitcoinTxnCountDoFn, self).__init__()
   metric_ns = self.__class__
   # One counter for transactions, plus four distributions.
   self.txn_counter = Metrics.counter(metric_ns, 'txns')
   self.inputs_dist = Metrics.distribution(metric_ns, 'inputs_per_txn')
   self.outputs_dist = Metrics.distribution(metric_ns, 'outputs_per_txn')
   self.output_amts_dist = Metrics.distribution(metric_ns, 'output_amts')
   self.txn_amts_dist = Metrics.distribution(metric_ns, 'txn_amts')
예제 #3
0
파일: util.py 프로젝트: JavierRoger/beam
  def __init__(self,
               min_batch_size=1,
               max_batch_size=1000,
               target_batch_overhead=.1,
               target_batch_duration_secs=1,
               clock=time.time):
    """Validate the batching targets and initialise the estimator state.

    Args:
      min_batch_size: lower batch-size bound; must not exceed max_batch_size.
      max_batch_size: upper batch-size bound.
      target_batch_overhead: when truthy, must lie in (0, 1]. A falsy value
        (0 or None) skips the range check.
      target_batch_duration_secs: when truthy, must be positive. A falsy value
        (0 or None) skips the range check.
      clock: zero-argument callable stored as the time source; defaults to
        time.time.

    Raises:
      ValueError: if the size bounds are inverted, a target is out of range,
        or neither target is set.
    """
    if min_batch_size > max_batch_size:
      raise ValueError("Minimum (%s) must not be greater than maximum (%s)" % (
          min_batch_size, max_batch_size))
    if target_batch_overhead and not 0 < target_batch_overhead <= 1:
      raise ValueError("target_batch_overhead (%s) must be between 0 and 1" % (
          target_batch_overhead))
    if target_batch_duration_secs and target_batch_duration_secs <= 0:
      raise ValueError("target_batch_duration_secs (%s) must be positive" % (
          target_batch_duration_secs))
    # Truthiness test instead of max(0, ...): the checks above already reject
    # negative values, and max() cannot compare None with numbers on Python 3,
    # so passing None to disable one target used to raise TypeError.
    if not (target_batch_overhead or target_batch_duration_secs):
      raise ValueError("At least one of target_batch_overhead or "
                       "target_batch_duration_secs must be positive.")
    self._min_batch_size = min_batch_size
    self._max_batch_size = max_batch_size
    self._target_batch_overhead = target_batch_overhead
    self._target_batch_duration_secs = target_batch_duration_secs
    self._clock = clock
    self._data = []
    self._ignore_next_timing = False

    self._size_distribution = Metrics.distribution(
        'BatchElements', 'batch_size')
    self._time_distribution = Metrics.distribution(
        'BatchElements', 'msec_per_batch')
    # Beam distributions only accept integer values, so we use this to
    # accumulate under-reported values until they add up to whole milliseconds.
    # (Milliseconds are chosen because that's conventionally used elsewhere in
    # profiling-style counters.)
    self._remainder_msecs = 0
예제 #4
0
 def __init__(self):
   """Initialise the DoFn and register its word and line metrics."""
   super(WordExtractingDoFn, self).__init__()
   metric_ns = self.__class__
   self.words_counter = Metrics.counter(metric_ns, 'words')
   self.word_lengths_counter = Metrics.counter(metric_ns, 'word_lengths')
   self.word_lengths_dist = Metrics.distribution(metric_ns, 'word_len_dist')
   self.empty_line_counter = Metrics.counter(metric_ns, 'empty_lines')
예제 #5
0
 def __init__(self, pattern):
   """Store the pattern and create the custom metrics.

   Args:
     pattern: value kept on the instance as `self.pattern`.
   """
   self.pattern = pattern
   # Custom metrics track values while the pipeline runs: a counter for
   # unmatched words and a distribution of word lengths in the input
   # PCollection.
   metric_ns = self.__class__
   self.word_len_dist = Metrics.distribution(metric_ns, 'word_len_dist')
   self.unmatched_words = Metrics.counter(metric_ns, 'unmatched_words')
예제 #6
0
 def __init__(self, pattern):
   """Initialise the DoFn, store the pattern and create the word counters.

   Args:
     pattern: value kept on the instance as `self.pattern`.
   """
   super(FilterTextFn, self).__init__()
   self.pattern = pattern
   # Custom metrics track values while the pipeline runs and are exposed by
   # the monitoring system of the runner. These two count matched and
   # unmatched words.
   # NOTE(review): 'umatched' looks like a typo for 'unmatched'; both the
   # attribute and the metric name are kept because other code may rely on
   # them -- confirm before renaming.
   metric_ns = self.__class__
   self.matched_words = Metrics.counter(metric_ns, 'matched_words')
   self.umatched_words = Metrics.counter(metric_ns, 'umatched_words')
예제 #7
0
def select_split(cumulative_splits, kv, unused_num_partitions):
  """Select split for an `(id, _)` tuple using a hash of `id`.

  Args:
    cumulative_splits: sequence of `(name, p)` pairs; the first pair whose `p`
      exceeds the hash fraction is chosen. Presumably the `p` values are
      increasing cumulative probabilities ending at 1.0 -- confirm with the
      caller.
    kv: `(id, _)` tuple; only the id is used. Text ids are UTF-8 encoded
      before hashing, bytes ids are hashed as-is.
    unused_num_partitions: not used by this function.

  Returns:
    Index into `cumulative_splits` of the selected split.

  Raises:
    AssertionError: if no threshold in `cumulative_splits` exceeds the hash
      fraction.
  """
  key, _ = kv
  if not isinstance(key, bytes):
    # hashlib only accepts bytes; a text key raises TypeError on Python 3.
    key = key.encode('utf-8')
  m = hashlib.md5(key)
  # Scale the digest by 2**(bits in digest) to get a fraction of the range.
  r = int(m.hexdigest(), 16) / (2 ** (8 * m.digest_size))
  for i, (name, p) in enumerate(cumulative_splits):
    if r < p:
      Metrics.counter('select_split', name).inc()
      return i
  # Raised explicitly so the check survives `python -O`, which strips assert
  # statements and would otherwise let this function return None.
  raise AssertionError('no split selected for hash fraction %s' % r)
예제 #8
0
def filter_invalid_notes(min_pitch, max_pitch, kv):
  """Drop notes whose pitch lies outside [min_pitch, max_pitch].

  Args:
    min_pitch: lowest pitch to keep (inclusive).
    max_pitch: highest pitch to keep (inclusive).
    kv: `(key, ns_str)` pair where `ns_str` is a serialized NoteSequence.

  Returns:
    `(key, serialized NoteSequence)`. The 'out_of_range_pitch' counter in the
    'filter_invalid_notes' namespace is incremented once when at least one
    note was removed.
  """
  key, serialized = kv
  sequence = music_pb2.NoteSequence.FromString(serialized)

  in_range = []
  for note in sequence.notes:
    if min_pitch <= note.pitch <= max_pitch:
      in_range.append(note)

  if len(sequence.notes) > len(in_range):
    # Something was filtered out: replace the note list with the kept notes.
    del sequence.notes[:]
    sequence.notes.extend(in_range)
    Metrics.counter('filter_invalid_notes', 'out_of_range_pitch').inc()

  return key, sequence.SerializeToString()
예제 #9
0
 def repl(*args):
   """Bump the counter once per index of the element's value, then call `f`.

   Reads the namespace from `args[2]` and the `(_, value)` element from
   `args[1]`; returns whatever `f(*args)` returns.
   """
   namespace = args[2]
   counter = Metrics.counter(namespace, counter_name)
   _, value = args[1]
   # Each index is used as the increment: 0, 1, ..., len(value) - 1.
   for index in range(len(value)):
     counter.inc(index)
   return f(*args)
예제 #10
0
def prepare_image_transforms(element, image_columns):
  """Replace each image path in a row with its base64-encoded JPEG bytes.

  Args:
    element: one input row, as a dict. It is modified in place.
    image_columns: list of columns that are image paths.

  Returns:
    element, where each non-empty image path has been replaced by the
    URL-safe base64 encoding of the image re-saved as JPEG. Columns with an
    empty path are left untouched. Returns None as soon as one image cannot
    be read.
  """
  import base64
  import io
  import logging
  from PIL import Image
  from tensorflow.python.lib.io import file_io as tf_file_io
  from apache_beam.metrics import Metrics

  img_error_count = Metrics.counter('main', 'ImgErrorCount')
  img_missing_count = Metrics.counter('main', 'ImgMissingCount')

  for name in image_columns:
    uri = element[name]
    if not uri:
      # Empty path: count it and keep the column as it is.
      img_missing_count.inc()
      continue
    try:
      # NOTE(review): mode 'r' may need to be 'rb' for binary image data on
      # Python 3 -- confirm against the FileIO version in use.
      with tf_file_io.FileIO(uri, 'r') as f:
        img = Image.open(f).convert('RGB')

    # A variety of different calling libraries throw different exceptions here.
    # They all correspond to an unreadable file so we treat them equivalently.
    # pylint: disable=broad-except
    except Exception as e:
      logging.exception('Error processing image %s: %s', uri, str(e))
      img_error_count.inc()
      return

    # Convert to desired format and output. io.BytesIO is used instead of the
    # Python-2-only cStringIO module; it is available on Python 2.6+ and 3.
    output = io.BytesIO()
    img.save(output, 'jpeg')
    element[name] = base64.urlsafe_b64encode(output.getvalue())

  return element
예제 #11
0
  def process(self, input_example):
    """Split one example holding audio and a NoteSequence into examples.

    Reads the 'id', 'audio' and 'sequence' features of `input_example`. For
    the 'test' split the record is processed with min_length=0 and
    max_length=-1; otherwise the configured min/max lengths are used.

    Args:
      input_example: example proto with 'id', 'audio' and 'sequence' bytes
        features.

    Yields:
      Every example returned by split_audio_and_label_data.process_record.
    """
    features = input_example.features.feature
    tf.logging.info('Splitting %s', features['id'].bytes_list.value[0])

    wav_data = features['audio'].bytes_list.value[0]
    sequence = music_pb2.NoteSequence.FromString(
        features['sequence'].bytes_list.value[0])

    Metrics.counter('split_wav', 'read_midi_wav_to_split').inc()

    if self._split != 'test':
      chunks = split_audio_and_label_data.process_record(
          wav_data, sequence, sequence.id, self._min_length, self._max_length,
          self._sample_rate)
      for chunk in chunks:
        Metrics.counter('split_wav', 'split_example').inc()
        yield chunk
      return

    # For the 'test' split, use the full length audio and midi.
    full_length = split_audio_and_label_data.process_record(
        wav_data,
        sequence,
        sequence.id,
        min_length=0,
        max_length=-1,
        sample_rate=self._sample_rate)
    for full_example in full_length:
      Metrics.counter('split_wav', 'full_example').inc()
      yield full_example
예제 #12
0
파일: bigtableio.py 프로젝트: eralmas7/beam
 def __init__(self, project_id, instance_id, table_id):
   """Constructor of the Write connector of Bigtable.

   Args:
     project_id(str): GCP Project to write the Rows to
     instance_id(str): GCP Instance to write the Rows to
     table_id(str): GCP Table to write the `DirectRows` to
   """
   super(_BigTableWriteFn, self).__init__()
   # The connection identifiers are kept together in one options dict.
   self.beam_options = dict(
       project_id=project_id, instance_id=instance_id, table_id=table_id)
   # table and batcher start unset.
   self.table = self.batcher = None
   self.written = Metrics.counter(self.__class__, 'Written Row')
예제 #13
0
  def process(self, input_example):
    """Split one example holding audio and a NoteSequence into examples.

    Reads the 'id', 'audio' and 'sequence' features of `input_example`. When
    `self._chunk_files` is falsy the record is processed with min_length=0
    and max_length=-1; otherwise the configured min/max lengths are used.

    Args:
      input_example: example proto with 'id', 'audio' and 'sequence' bytes
        features.

    Yields:
      Every example returned by split_audio_and_label_data.process_record.

    Raises:
      AssertionError: re-raised from the chunked path after the offending
        input has been written to `self._output_directory`.
    """
    tf.logging.info('Splitting %s',
                    input_example.features.feature['id'].bytes_list.value[0])

    wav_data = input_example.features.feature['audio'].bytes_list.value[0]

    ns = music_pb2.NoteSequence.FromString(
        input_example.features.feature['sequence'].bytes_list.value[0])

    Metrics.counter('split_wav', 'read_midi_wav_to_split').inc()

    if not self._chunk_files:
      split_examples = split_audio_and_label_data.process_record(
          wav_data,
          ns,
          ns.id,
          min_length=0,
          max_length=-1,
          sample_rate=self._sample_rate)

      for example in split_examples:
        Metrics.counter('split_wav', 'full_example').inc()
        yield example
    else:
      try:
        split_examples = split_audio_and_label_data.process_record(
            wav_data, ns, ns.id, self._min_length, self._max_length,
            self._sample_rate)

        for example in split_examples:
          Metrics.counter('split_wav', 'split_example').inc()
          yield example
      except AssertionError:
        # hashlib only accepts bytes. Hashing a text id raises TypeError on
        # Python 3, which would replace the AssertionError being handled and
        # skip writing the bad example.
        ns_id = ns.id
        if not isinstance(ns_id, bytes):
          ns_id = ns_id.encode('utf-8')
        output_file = 'badexample-' + hashlib.md5(ns_id).hexdigest() + '.proto'
        output_path = os.path.join(self._output_directory, output_file)
        tf.logging.error('Exception processing %s. Writing file to %s',
                         ns.id, output_path)
        # Keep a copy of the failing input for later inspection, then re-raise.
        with tf.gfile.Open(output_path, 'w') as f:
          f.write(input_example.SerializeToString())
        raise
예제 #14
0
 def __init__(self, namespace):
     """Remember the namespace and create the LABEL counter inside it.

     Args:
       namespace: metrics namespace, stored as `self.namespace`.
     """
     self.namespace = namespace
     metric = Metrics.counter(self.namespace, self.LABEL)
     self.counter = metric
예제 #15
0
 def __init__(self):
     """Create the word and line metrics in the 'main' namespace."""
     namespace = 'main'
     self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')
     self.word_length_counter = Metrics.counter(namespace, 'word_lengths')
     self.word_counter = Metrics.counter(namespace, 'total_words')
     self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
예제 #16
0
파일: snippets.py 프로젝트: zhujk/beam
 def __init__(self, count):
     """Store `count` and create the 'recordsRead' counter.

     Args:
       count: value kept on the instance as `self._count`.
     """
     self._count = count
     self.records_read = Metrics.counter(self.__class__, 'recordsRead')
예제 #17
0
 def __setstate__(self, options):
   """Recreate the 'generate_row' counter; `options` is not used here."""
   row_counter = Metrics.counter(self.__class__, 'generate_row')
   self.generate_row = row_counter
예제 #18
0
 def __init__(self):
     """Register the word and line metrics under this class."""
     metric_ns = self.__class__
     self.words_counter = Metrics.counter(metric_ns, "words")
     self.word_lengths_counter = Metrics.counter(metric_ns, "word_lengths")
     self.word_lengths_dist = Metrics.distribution(metric_ns, "word_len_dist")
     self.empty_line_counter = Metrics.counter(metric_ns, "empty_lines")
예제 #19
0
 def __init__(self):
     """Initialise the DoFn and create its 'invalid_instance_counter'."""
     super(FlattenFn, self).__init__()
     counter_label = 'invalid_instance_counter'
     self.instance_counter = Metrics.counter(self.__class__, counter_label)
 def start_bundle(self):
   """Bind `self.count` to the 'elementsplusone' counter of this class."""
   bundle_counter = Metrics.counter(self.__class__, 'elementsplusone')
   self.count = bundle_counter
예제 #21
0
파일: snippets.py 프로젝트: eralmas7/beam
 def __init__(self, count):
   """Store `count` and create the 'recordsRead' counter.

   Args:
     count: value kept on the instance as `self._count`.
   """
   self._count = count
   self.records_read = Metrics.counter(self.__class__, 'recordsRead')
예제 #22
0
 def __init__(self):
   """Create the word and line metrics in the 'main' namespace."""
   namespace = 'main'
   self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')
   self.word_length_counter = Metrics.counter(namespace, 'word_lengths')
   self.word_counter = Metrics.counter(namespace, 'total_words')
   self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
예제 #23
0
 def __setstate__(self, options):
     """Restore `beam_options` from the given state and reset the handles.

     Args:
       options: state object, stored as `self.beam_options`.
     """
     self.beam_options = options
     # table, batcher and service_call_metric all start unset.
     self.table = self.batcher = self.service_call_metric = None
     self.written = Metrics.counter(self.__class__, 'Written Row')
예제 #24
0
"""


import apache_beam as beam
from apache_beam.io import tfrecordio
from apache_beam.metrics import Metrics
import cStringIO
import logging
import os
from PIL import Image

from . import _inceptionlib
from . import _util


# Module-level counters, all registered under the 'main' namespace.
# NOTE: the variable `invalid_uri` reports under the metric name
# 'invalid_file_name'; every other metric name matches its variable name.
error_count = Metrics.counter('main', 'errorCount')
rows_count = Metrics.counter('main', 'rowsCount')
skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine')
embedding_good = Metrics.counter('main', 'embedding_good')
embedding_bad = Metrics.counter('main', 'embedding_bad')
incompatible_image = Metrics.counter('main', 'incompatible_image')
invalid_uri = Metrics.counter('main', 'invalid_file_name')
unlabeled_image = Metrics.counter('main', 'unlabeled_image')


class ExtractLabelIdsDoFn(beam.DoFn):
  """Extracts (uri, label_ids) tuples from CSV rows.
  """

  def start_bundle(self, context=None):
    # Reset the label -> id mapping to an empty dict; `context` is not used
    # in the visible body.
    self.label_to_id_map = {}
예제 #25
0
파일: wordcount.py 프로젝트: eralmas7/beam
 def __init__(self):
   """Register the word and line metrics under this class."""
   metric_ns = self.__class__
   self.words_counter = Metrics.counter(metric_ns, 'words')
   self.word_lengths_counter = Metrics.counter(metric_ns, 'word_lengths')
   self.word_lengths_dist = Metrics.distribution(metric_ns, 'word_len_dist')
   self.empty_line_counter = Metrics.counter(metric_ns, 'empty_lines')
예제 #26
0
 def __init__(self, number_of_counters, number_of_operations):
     """Create `number_of_counters` counters named 'name-0', 'name-1', ...

     Args:
       number_of_counters: how many counters to create in the
         'do-not-publish' namespace.
       number_of_operations: stored as `self.number_of_operations`.
     """
     self.number_of_operations = number_of_operations
     self.counters = [
         Metrics.counter('do-not-publish', 'name-{}'.format(index))
         for index in range(number_of_counters)
     ]
예제 #27
0
  def process(self, kv):
    """Turn one serialized NoteSequence into zero or more encoded examples.

    The sequence is deserialized, sustain is applied, and it is split into
    hops (at a fixed size or at random hop times). Every resulting sequence is
    then augmented and encoded; when score encoders are configured a
    melody-based score is extracted first. Skipped inputs are tallied in
    counters under the 'extract_examples' namespace.

    Args:
      kv: `(key, ns_str)` pair. `key` seeds the global `random` module (text
        keys are UTF-8 encoded before hashing) and `ns_str` is a serialized
        NoteSequence proto.

    Yields:
      The result of `generator_utils.to_example` for each encoded example.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    # hashlib only accepts bytes; a text key raises TypeError on Python 3.
    key_bytes = key if isinstance(key, bytes) else key.encode('utf-8')
    m = hashlib.md5(key_bytes)
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = music_pb2.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they are
    # no longer needed.
    del ns.control_changes[:]

    if (self._min_hop_size_seconds and
        ns.total_time < self._min_hop_size_seconds):
      Metrics.counter('extract_examples', 'sequence_too_short').inc()
      return

    sequences = []
    for _ in range(self._num_replications):
      if self._max_hop_size_seconds:
        if self._max_hop_size_seconds == self._min_hop_size_seconds:
          # Split using fixed hop size.
          sequences += sequences_lib.split_note_sequence(
              ns, self._max_hop_size_seconds)
        else:
          # Sample random hop positions such that each segment size is within
          # the specified range.
          hop_times = [0.0]
          while hop_times[-1] <= ns.total_time - self._min_hop_size_seconds:
            if hop_times[-1] + self._max_hop_size_seconds < ns.total_time:
              # It's important that we get a valid hop size here, since the
              # remainder of the sequence is too long.
              max_offset = min(
                  self._max_hop_size_seconds,
                  ns.total_time - self._min_hop_size_seconds - hop_times[-1])
            else:
              # It's okay if the next hop time is invalid (in which case we'll
              # just stop).
              max_offset = self._max_hop_size_seconds
            offset = random.uniform(self._min_hop_size_seconds, max_offset)
            hop_times.append(hop_times[-1] + offset)
          # Split at the chosen hop times (ignoring zero and the final invalid
          # time).
          sequences += sequences_lib.split_note_sequence(ns, hop_times[1:-1])
      else:
        sequences += [ns]

    for performance_sequence in sequences:
      if self._encode_score_fns:
        # We need to extract a score.
        if not self._absolute_timing:
          # Beats are required to extract a score with metric timing.
          beats = [
              ta for ta in performance_sequence.text_annotations
              if (ta.annotation_type ==
                  music_pb2.NoteSequence.TextAnnotation.BEAT)
              and ta.time <= performance_sequence.total_time
          ]
          if len(beats) < 2:
            Metrics.counter('extract_examples', 'not_enough_beats').inc()
            continue

          # Ensure the sequence starts and ends on a beat.
          performance_sequence = sequences_lib.extract_subsequence(
              performance_sequence,
              start_time=min(beat.time for beat in beats),
              end_time=max(beat.time for beat in beats)
          )

          # Infer beat-aligned chords (only for relative timing).
          try:
            chord_inference.infer_chords_for_sequence(
                performance_sequence,
                chord_change_prob=0.25,
                chord_note_concentration=50.0,
                add_key_signatures=True)
          except chord_inference.ChordInferenceError:
            Metrics.counter('extract_examples', 'chord_inference_failed').inc()
            continue

        # Infer melody regardless of relative/absolute timing.
        try:
          melody_instrument = melody_inference.infer_melody_for_sequence(
              performance_sequence,
              melody_interval_scale=2.0,
              rest_prob=0.1,
              instantaneous_non_max_pitch_prob=1e-15,
              instantaneous_non_empty_rest_prob=0.0,
              instantaneous_missing_pitch_prob=1e-15)
        except melody_inference.MelodyInferenceError:
          Metrics.counter('extract_examples', 'melody_inference_failed').inc()
          continue

        if not self._absolute_timing:
          # Now rectify detected beats to occur at fixed tempo.
          # TODO(iansimon): also include the alignment
          score_sequence, unused_alignment = sequences_lib.rectify_beats(
              performance_sequence, beats_per_minute=SCORE_BPM)
        else:
          # Score uses same timing as performance.
          score_sequence = copy.deepcopy(performance_sequence)

        # Remove melody notes from performance.
        performance_notes = []
        for note in performance_sequence.notes:
          if note.instrument != melody_instrument:
            performance_notes.append(note)
        del performance_sequence.notes[:]
        performance_sequence.notes.extend(performance_notes)

        # Remove non-melody notes from score.
        score_notes = []
        for note in score_sequence.notes:
          if note.instrument == melody_instrument:
            score_notes.append(note)
        del score_sequence.notes[:]
        score_sequence.notes.extend(score_notes)

        # Remove key signatures and beat/chord annotations from performance.
        del performance_sequence.key_signatures[:]
        del performance_sequence.text_annotations[:]

        Metrics.counter('extract_examples', 'extracted_score').inc()

      for augment_fn in self._augment_fns:
        # Augment and encode the performance.
        try:
          augmented_performance_sequence = augment_fn(performance_sequence)
        except DataAugmentationError:
          Metrics.counter(
              'extract_examples', 'augment_performance_failed').inc()
          continue
        example_dict = {
            'targets': self._encode_performance_fn(
                augmented_performance_sequence)
        }
        if not example_dict['targets']:
          Metrics.counter('extract_examples', 'skipped_empty_targets').inc()
          continue

        if self._encode_score_fns:
          # Augment the extracted score.
          try:
            augmented_score_sequence = augment_fn(score_sequence)
          except DataAugmentationError:
            Metrics.counter('extract_examples', 'augment_score_failed').inc()
            continue

          # Apply all score encoding functions.
          skip = False
          for name, encode_score_fn in self._encode_score_fns.items():
            example_dict[name] = encode_score_fn(augmented_score_sequence)
            if not example_dict[name]:
              Metrics.counter('extract_examples',
                              'skipped_empty_%s' % name).inc()
              skip = True
              break
          if skip:
            continue

        Metrics.counter('extract_examples', 'encoded_example').inc()
        # The length is truncated to whole seconds before being recorded.
        Metrics.distribution(
            'extract_examples', 'performance_length_in_seconds').update(
                int(augmented_performance_sequence.total_time))

        yield generator_utils.to_example(example_dict)
예제 #28
0
from google.cloud.proto.datastore.v1 import entity_pb2
from google.cloud.proto.datastore.v1 import query_pb2
from googledatastore import helper as datastore_helper, PropertyFilter

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.utils.pipeline_options import GoogleCloudOptions
from apache_beam.utils.pipeline_options import PipelineOptions
from apache_beam.utils.pipeline_options import SetupOptions

# Module-level counters, all registered under the 'main' namespace.
empty_line_counter = Metrics.counter('main', 'empty_lines')
word_length_counter = Metrics.counter('main', 'word_lengths')
word_counter = Metrics.counter('main', 'total_words')


class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""

  def process(self, element):
    """Returns an iterator over words in contents of Cloud Datastore entity.

    The element is a line of text.  If the line is blank, note that, too.

    NOTE(review): only this docstring is visible here; the description is
    carried over unchanged and should be checked against the implementation.

    Args:
      element: the input element to be processed

    Returns:
      The processed element.
    """
예제 #29
0
 def __init__(self):
   """Initialise the DoFn and register its three counters."""
   super(WordExtractingDoFn, self).__init__()
   metric_ns = self.__class__
   self.words_counter = Metrics.counter(metric_ns, 'words')
   self.word_lengths_counter = Metrics.counter(metric_ns, 'word_lengths')
   self.empty_line_counter = Metrics.counter(metric_ns, 'empty_lines')
예제 #30
0
 def __init__(self):
   """Create the 'total_bytes.count' counter in the 'pardo' namespace."""
   bytes_counter = Metrics.counter('pardo', 'total_bytes.count')
   self.counter = bytes_counter
예제 #31
0
파일: snippets.py 프로젝트: ssisk/beam
 def __init__(self, pattern):
   """Store the pattern and create the matched/unmatched word counters.

   Args:
     pattern: value kept on the instance as `self.pattern`.
   """
   self.pattern = pattern
   # Custom metrics track values while the pipeline runs; these two count
   # matched and unmatched words.
   # NOTE(review): 'umatched' looks like a typo for 'unmatched'; both the
   # attribute and the metric name are kept because other code may rely on
   # them -- confirm before renaming.
   metric_ns = self.__class__
   self.matched_words = Metrics.counter(metric_ns, 'matched_words')
   self.umatched_words = Metrics.counter(metric_ns, 'umatched_words')
예제 #32
0
import uuid

from google.cloud.proto.datastore.v1 import entity_pb2
from google.cloud.proto.datastore.v1 import query_pb2
from googledatastore import helper as datastore_helper, PropertyFilter

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore
from apache_beam.metrics import Metrics
from apache_beam.utils.pipeline_options import GoogleCloudOptions
from apache_beam.utils.pipeline_options import PipelineOptions
from apache_beam.utils.pipeline_options import SetupOptions

# Module-level counters, all registered under the 'main' namespace.
empty_line_counter = Metrics.counter('main', 'empty_lines')
word_length_counter = Metrics.counter('main', 'word_lengths')
word_counter = Metrics.counter('main', 'total_words')


class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""

  def process(self, element):
    """Returns an iterator over words in contents of Cloud Datastore entity.

    The element is a line of text.  If the line is blank, note that, too.

    NOTE(review): only this docstring is visible here; the description is
    carried over unchanged and should be checked against the implementation.

    Args:
      element: the input element to be processed

    Returns:
      The processed element.
    """
예제 #33
0
import datetime
import numpy as np

import apache_beam as beam
from apache_beam.io import filesystem
from apache_beam.io import tfrecordio
from apache_beam.metrics import Metrics

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform import coders

from config import PROJECT_ID, DATA_DIR, TFRECORD_DIR
from util import schema

# Module-level counters: one per partition name under the 'partition'
# namespace, plus a 'failed' counter under the 'build' namespace.
partition_train = Metrics.counter('partition', 'train')
partition_validation = Metrics.counter('partition', 'validation')
partition_test = Metrics.counter('partition', 'test')
examples_failed = Metrics.counter('build', 'failed')


def buildExample(raw_input):
    '''
    Build a dictionary that contains all the features&label to store as TFRecord
    Args:
      tuple: a tuple containing the data to build the example from
    Returns:
      a dictionary of features
    '''
    try:
        elements = raw_input.split(',')
예제 #34
0
 def __init__(self):
     """Initialise the DoFn and create its 'calc_error_counter'."""
     super(CalcFreqFn, self).__init__()
     counter_label = 'calc_error_counter'
     self.error_counter = Metrics.counter(self.__class__, counter_label)
예제 #35
0
    def process(self, inputs):
        """Generates the BLS periodogram for a light curve.

        Args:
          inputs: A mapping. `inputs["light_curve"].light_curve` supplies the
            `time`, `flux` and `norm_curve` sequences; `inputs["kepler_id"]`
            is read only to name the error counter when a fit fails.

        Yields:
          The same `inputs` object with `inputs["periodogram"]` set to a
          `bls_pb2.Periodogram` holding one result per (period, nbins) pair.
          Nothing is yielded if any fit raises ValueError.
        """
        Metrics.counter(self.__class__.__name__, "inputs-seen").inc()

        # Unpack the light curve. The builtin `float` is the dtype the removed
        # `np.float` alias referred to.
        lc = inputs["light_curve"]
        time = np.array(lc.light_curve.time, dtype=float)
        flux = np.array(lc.light_curve.flux, dtype=float)
        norm_curve = np.array(lc.light_curve.norm_curve, dtype=float)
        flux /= norm_curve  # Normalize flux.

        # Fit periodogram.
        bls = box_least_squares.BoxLeastSquares(time,
                                                flux,
                                                capacity=self.max_nbins)
        results = []
        # Builtin zip works on Python 2 and 3; itertools.izip is Python 2 only.
        for period, nbins in zip(self.all_periods, self.all_nbins):
            bin_width = period / nbins

            # Compute the minimum number of bins for a transit.
            duration_min = 0
            if self.duration_density_max:
                duration_min = self.duration_min_fraction * _max_duration(
                    period, density_star=self.duration_density_max)
            if self.duration_min_days:
                duration_min = max(self.duration_min_days, duration_min)
            width_min = int(np.maximum(1, np.floor(duration_min / bin_width)))

            # Compute the maximum number of bins for a transit.
            if self.duration_density_min:
                duration_max = _max_duration(
                    period, density_star=self.duration_density_min)
                width_max = int(np.ceil(duration_max / bin_width))
            else:
                # Without a density bound, cap the width at a quarter of the
                # bins.
                width_max = int(np.ceil(0.25 * nbins))

            weight_min = self.weight_min_factor * width_min / nbins
            weight_max = 1

            options = bls_pb2.BlsOptions(width_min=width_min,
                                         width_max=width_max,
                                         weight_min=weight_min,
                                         weight_max=weight_max)
            try:
                result = bls.fit(period, nbins, options)
            except ValueError:
                # One failed fit drops the whole input; the counter name
                # carries the kepler_id of the failing input.
                Metrics.counter(self.__class__.__name__, "bls-error-{}".format(
                    inputs["kepler_id"])).inc()
                return

            results.append(result)

        inputs["periodogram"] = bls_pb2.Periodogram(results=results)

        yield inputs
예제 #36
0
 def __init__(self):
   """Create the 'generate_row' counter for this class."""
   row_counter = Metrics.counter(self.__class__, 'generate_row')
   self.generate_row = row_counter
예제 #37
0
 def __init__(self, bootstrap_servers, topic, expansion_service=None):
     """Store the connection settings and create the 'elements_sum' counter.

     Args:
       bootstrap_servers: stored as `self.bootstrap_servers`.
       topic: stored as `self.topic`.
       expansion_service: stored as `self.expansion_service`; defaults to
         None.
     """
     self.sum_counter = Metrics.counter('source', 'elements_sum')
     self.expansion_service = expansion_service
     self.topic = topic
     self.bootstrap_servers = bootstrap_servers
예제 #38
0
 def __init__(self):
   """Create the 'Print Row' counter, namespaced by the class name."""
   from apache_beam.metrics import Metrics
   namespace = self.__class__.__name__
   self.print_row = Metrics.counter(namespace, 'Print Row')
예제 #39
0
 def start_bundle(self):
     """Bind `self.count` to the 'elementsplusone' counter of this class."""
     bundle_counter = Metrics.counter(self.__class__, 'elementsplusone')
     self.count = bundle_counter
예제 #40
0
 def __init__(self):
     """Create the counter named by `counter_name` and log its metric name."""
     metric = Metrics.counter(self.__class__, counter_name)
     self.counter = metric
     logging.info('counter: %s' % metric.metric_name)
예제 #41
0
 def __init__(self):
   """Create the counter named by `counter_name` and log its metric name."""
   metric = Metrics.counter(self.__class__, counter_name)
   self.counter = metric
   logging.info('counter: %s' % metric.metric_name)
예제 #42
0
def CreateAggregatorsDict(namespace="main"):
    """Build one counter per name found in CONFIG_.

    Args:
      namespace: metric namespace handed to every counter; defaults to "main".

    Returns:
      A dict mapping each name obtained by iterating CONFIG_ to the object
      returned by Metrics.counter(namespace, name).
    """
    aggregators = {}
    for metric_name in CONFIG_:
        aggregators[metric_name] = Metrics.counter(namespace, metric_name)
    return aggregators
예제 #43
0
  def process(self, kv):
    """Turn one (key, serialized NoteSequence) pair into encoded examples.

    The global `random` generator is seeded from the MD5 digest of the key, so
    every random choice made below (hop times, crop offsets) is reproducible
    for a given key. The statement order therefore matters: reordering calls
    that consume randomness would change the output.

    Steps, in order:
      1. Deserialize the NoteSequence, apply sustain control changes and drop
         all control changes.
      2. If a minimum hop size is configured and the sequence is shorter than
         it, count 'sequence_too_short' and stop.
      3. For each replication, split the sequence with a fixed hop size, with
         randomly sampled hop times, or not at all, depending on the
         configured min/max hop sizes.
      4. For each resulting sequence, when score encoders are configured,
         derive a score sequence (beats, chords and melody inference) and
         strip the melody from the performance.
      5. For each augmentation function, encode the performance (and score),
         optionally take a random crop, and yield the example.

    Every skipped or failed item increments a counter in the
    'extract_examples' namespace instead of raising.

    Args:
      kv: a 2-item sequence `(key, ns_str)`; `key` must support
        `.encode('utf-8')` and `ns_str` is passed to
        `note_seq.NoteSequence.FromString`.

    Yields:
      The result of `generator_utils.to_example(example_dict)` for every
      successfully encoded example.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    m = hashlib.md5(key.encode('utf-8'))
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = note_seq.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they are
    # no longer needed.
    del ns.control_changes[:]

    if (self._min_hop_size_seconds and
        ns.total_time < self._min_hop_size_seconds):
      Metrics.counter('extract_examples', 'sequence_too_short').inc()
      return

    sequences = []
    for _ in range(self._num_replications):
      if self._max_hop_size_seconds:
        if self._max_hop_size_seconds == self._min_hop_size_seconds:
          # Split using fixed hop size.
          sequences += sequences_lib.split_note_sequence(
              ns, self._max_hop_size_seconds)
        else:
          # Sample random hop positions such that each segment size is within
          # the specified range.
          hop_times = [0.0]
          while hop_times[-1] <= ns.total_time - self._min_hop_size_seconds:
            if hop_times[-1] + self._max_hop_size_seconds < ns.total_time:
              # It's important that we get a valid hop size here, since the
              # remainder of the sequence is too long.
              max_offset = min(
                  self._max_hop_size_seconds,
                  ns.total_time - self._min_hop_size_seconds - hop_times[-1])
            else:
              # It's okay if the next hop time is invalid (in which case we'll
              # just stop).
              max_offset = self._max_hop_size_seconds
            offset = random.uniform(self._min_hop_size_seconds, max_offset)
            hop_times.append(hop_times[-1] + offset)
          # Split at the chosen hop times (ignoring zero and the final invalid
          # time).
          sequences += sequences_lib.split_note_sequence(ns, hop_times[1:-1])
      else:
        # No maximum hop size configured: use the whole sequence unsplit.
        sequences += [ns]

    for performance_sequence in sequences:
      if self._encode_score_fns:
        # We need to extract a score.
        if not self._absolute_timing:
          # Beats are required to extract a score with metric timing.
          beats = [
              ta for ta in performance_sequence.text_annotations
              if ta.annotation_type == BEAT
              and ta.time <= performance_sequence.total_time
          ]
          if len(beats) < 2:
            Metrics.counter('extract_examples', 'not_enough_beats').inc()
            continue

          # Ensure the sequence starts and ends on a beat.
          performance_sequence = sequences_lib.extract_subsequence(
              performance_sequence,
              start_time=min(beat.time for beat in beats),
              end_time=max(beat.time for beat in beats)
          )

          # Infer beat-aligned chords (only for relative timing).
          try:
            chord_inference.infer_chords_for_sequence(
                performance_sequence,
                chord_change_prob=0.25,
                chord_note_concentration=50.0,
                add_key_signatures=True)
          except chord_inference.ChordInferenceError:
            Metrics.counter('extract_examples', 'chord_inference_failed').inc()
            continue

        # Infer melody regardless of relative/absolute timing.
        try:
          melody_instrument = melody_inference.infer_melody_for_sequence(
              performance_sequence,
              melody_interval_scale=2.0,
              rest_prob=0.1,
              instantaneous_non_max_pitch_prob=1e-15,
              instantaneous_non_empty_rest_prob=0.0,
              instantaneous_missing_pitch_prob=1e-15)
        except melody_inference.MelodyInferenceError:
          Metrics.counter('extract_examples', 'melody_inference_failed').inc()
          continue

        if not self._absolute_timing:
          # Now rectify detected beats to occur at fixed tempo.
          # TODO(iansimon): also include the alignment
          score_sequence, unused_alignment = sequences_lib.rectify_beats(
              performance_sequence, beats_per_minute=SCORE_BPM)
        else:
          # Score uses same timing as performance.
          score_sequence = copy.deepcopy(performance_sequence)

        # Remove melody notes from performance.
        performance_notes = []
        for note in performance_sequence.notes:
          if note.instrument != melody_instrument:
            performance_notes.append(note)
        del performance_sequence.notes[:]
        performance_sequence.notes.extend(performance_notes)

        # Remove non-melody notes from score.
        score_notes = []
        for note in score_sequence.notes:
          if note.instrument == melody_instrument:
            score_notes.append(note)
        del score_sequence.notes[:]
        score_sequence.notes.extend(score_notes)

        # Remove key signatures and beat/chord annotations from performance.
        del performance_sequence.key_signatures[:]
        del performance_sequence.text_annotations[:]

        Metrics.counter('extract_examples', 'extracted_score').inc()

      for augment_fn in self._augment_fns:
        # Augment and encode the performance.
        try:
          augmented_performance_sequence = augment_fn(performance_sequence)
        except DataAugmentationError:
          Metrics.counter(
              'extract_examples', 'augment_performance_failed').inc()
          continue
        example_dict = {
            'targets': self._encode_performance_fn(
                augmented_performance_sequence)
        }
        if not example_dict['targets']:
          Metrics.counter('extract_examples', 'skipped_empty_targets').inc()
          continue

        if (self._random_crop_length and
            len(example_dict['targets']) > self._random_crop_length):
          # Take a random crop of the encoded performance.
          max_offset = len(example_dict['targets']) - self._random_crop_length
          offset = random.randrange(max_offset + 1)
          example_dict['targets'] = example_dict['targets'][
              offset:offset + self._random_crop_length]

        if self._encode_score_fns:
          # Augment the extracted score.
          try:
            augmented_score_sequence = augment_fn(score_sequence)
          except DataAugmentationError:
            Metrics.counter('extract_examples', 'augment_score_failed').inc()
            continue

          # Apply all score encoding functions.
          # `skip` carries the inner loop's "empty encoding" result out to the
          # augmentation loop, which is the one that must `continue`.
          skip = False
          for name, encode_score_fn in self._encode_score_fns.items():
            example_dict[name] = encode_score_fn(augmented_score_sequence)
            if not example_dict[name]:
              Metrics.counter('extract_examples',
                              'skipped_empty_%s' % name).inc()
              skip = True
              break
          if skip:
            continue

        Metrics.counter('extract_examples', 'encoded_example').inc()
        Metrics.distribution(
            'extract_examples', 'performance_length_in_seconds').update(
                int(augmented_performance_sequence.total_time))

        yield generator_utils.to_example(example_dict)
 def __init__(self):
     """Run the parent initializer, then create the CSV-format error counter."""
     super(FormatCsvRowFn, self).__init__()
     owner = self.__class__
     self.num_parse_errors = Metrics.counter(owner, 'num_format_csv_row_errors')
예제 #45
0
  def process(self, kv):
    """Turn one (key, serialized NoteSequence) pair into cropped examples.

    The global `random` generator is seeded from the MD5 digest of the key so
    that crop offsets and transpositions are reproducible per key.

    For every replication and every augmentation function the performance is
    augmented and encoded. Encodings shorter than 2048 events are dropped;
    longer ones are randomly cropped to exactly 2048 events. Without
    `self._melody` the crop is emitted as both 'inputs' and 'targets'. With
    `self._melody` the crop is decoded, its melody is inferred and encoded
    under 'melody', and 'performance' holds either the targets themselves or,
    when `self._noisy` is set, a randomly transposed re-encoding; 'inputs' is
    removed in that case.

    Args:
      kv: a 2-item sequence `(key, ns_str)`. `key` may be text or bytes;
        `ns_str` is passed to `note_seq.NoteSequence.FromString`.

    Yields:
      The result of `generator_utils.to_example(example_dict)` for every
      encoded example.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    # hashlib.md5 only accepts bytes-like input; a text key would raise
    # TypeError, so encode it first. Non-text keys are passed through as-is.
    key_bytes = key.encode('utf-8') if isinstance(key, str) else key
    m = hashlib.md5(key_bytes)
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = note_seq.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they are
    # no longer needed.
    del ns.control_changes[:]

    for _ in range(self._num_replications):
      for augment_fn in self._augment_fns:
        # Augment and encode the performance.
        try:
          augmented_performance_sequence = augment_fn(ns)
        except DataAugmentationError:
          Metrics.counter(
              'extract_examples', 'augment_performance_failed').inc()
          continue
        seq = self._encode_performance_fn(augmented_performance_sequence)
        # feed in performance as both input/output to music transformer
        # chopping sequence into length 2048 (throw out shorter sequences)
        if len(seq) >= 2048:
          max_offset = len(seq) - 2048
          offset = random.randrange(max_offset + 1)
          cropped_seq = seq[offset:offset + 2048]

          example_dict = {
              'inputs': cropped_seq,
              'targets': cropped_seq
          }

          if self._melody:
            # decode truncated performance sequence for melody inference
            decoded_midi = self._decode_performance_fn(cropped_seq)
            decoded_ns = note_seq.midi_io.midi_file_to_note_sequence(
                decoded_midi)

            # extract melody from cropped performance sequence
            melody_instrument = melody_inference.infer_melody_for_sequence(
                decoded_ns,
                melody_interval_scale=2.0,
                rest_prob=0.1,
                instantaneous_non_max_pitch_prob=1e-15,
                instantaneous_non_empty_rest_prob=0.0,
                instantaneous_missing_pitch_prob=1e-15)

            # remove non-melody notes from score
            score_sequence = copy.deepcopy(decoded_ns)
            score_notes = []
            for note in score_sequence.notes:
              if note.instrument == melody_instrument:
                score_notes.append(note)
            del score_sequence.notes[:]
            score_sequence.notes.extend(score_notes)

            # encode melody
            encode_score_fn = self._encode_score_fns['melody']
            example_dict['melody'] = encode_score_fn(score_sequence)
            # make sure performance input also matches targets; needed for
            # compatibility of both perf and (mel & perf) autoencoders

            if self._noisy:
              # randomly sample a pitch shift to construct noisy performance
              all_pitches = [x.pitch for x in decoded_ns.notes]
              min_val = min(all_pitches)
              max_val = max(all_pitches)
              # A range object is immutable (it has no .remove()), so build
              # the candidate shifts as a list and leave out 0 directly; this
              # makes sure a real transposition is always chosen.
              # NOTE(review): if the pitches already span 21..108 the list is
              # empty and random.choice raises IndexError -- confirm whether
              # such sequences can occur.
              transpose_candidates = [
                  amount
                  for amount in range(-(min_val - 21), 108 - max_val + 1)
                  if amount != 0
              ]
              transpose_amount = random.choice(transpose_candidates)
              augmented_ns, _ = sequences_lib.transpose_note_sequence(
                  decoded_ns, transpose_amount, min_allowed_pitch=21,
                  max_allowed_pitch=108, in_place=False)
              aug_seq = self._encode_performance_fn(augmented_ns)
              example_dict['performance'] = aug_seq
            else:
              example_dict['performance'] = example_dict['targets']
            del example_dict['inputs']

          Metrics.counter('extract_examples', 'encoded_example').inc()
          Metrics.distribution(
              'extract_examples', 'performance_length_in_seconds').update(
                  int(augmented_performance_sequence.total_time))

          yield generator_utils.to_example(example_dict)
예제 #46
0
 def __init__(self, vals):
   """Keep `vals` and create the 'outputs' counter in the 'main' namespace.

   Args:
     vals: stored unchanged on ``self._vals``.
   """
   outputs_counter = Metrics.counter('main', 'outputs')
   self._vals, self._output_counter = vals, outputs_counter
예제 #47
0
 def __init__(self):
     """Create the 'total_bytes.count' counter in the 'pardo' namespace."""
     total_bytes = Metrics.counter('pardo', 'total_bytes.count')
     self.counter = total_bytes
 def __init__(self):
   """Create the runtime start/end distributions in the 'pardo' namespace."""
   namespace = 'pardo'
   self.runtime_start = Metrics.distribution(namespace, 'runtime.start')
   self.runtime_end = Metrics.distribution(namespace, 'runtime.end')
예제 #49
0
 def __init__(self):
     """Create the two runtime distributions ('runtime.start', 'runtime.end').

     Both are registered under the 'pardo' namespace, start first.
     """
     self.runtime_start, self.runtime_end = (
         Metrics.distribution('pardo', 'runtime.start'),
         Metrics.distribution('pardo', 'runtime.end'),
     )
예제 #50
0
 def __init__(self):
   """Run the parent initializer, then create the parse-error counter."""
   super(ParseEventFn, self).__init__()
   owner = self.__class__
   self.num_parse_errors = Metrics.counter(owner, 'num_parse_errors')
예제 #51
0
 def __init__(self, vals):
   """Store the given values and set up the output counter.

   Args:
     vals: stored unchanged on ``self._vals``.
   """
   self._vals = vals
   # Counter registered under namespace 'main' with name 'outputs'.
   namespace, counter_name = 'main', 'outputs'
   self._output_counter = Metrics.counter(namespace, counter_name)
예제 #52
0
파일: snippets.py 프로젝트: eralmas7/beam
 def __init__(self, pattern):
   """Store the pattern and create the two word counters.

   Args:
     pattern: stored unchanged on ``self.pattern``.
   """
   self.pattern = pattern
   # Custom metrics track values in the pipeline while it runs. Two counters
   # are created, both namespaced by this object's class. The second keeps the
   # existing 'umatched_words' spelling for both the attribute and the metric
   # name, since either may be referenced elsewhere.
   owner = self.__class__
   self.matched_words = Metrics.counter(owner, 'matched_words')
   self.umatched_words = Metrics.counter(owner, 'umatched_words')
예제 #53
0
def combine_matching_seqs(ns_ids):
  """Label a sequence with the comma-joined ids and count it as unique.

  Args:
    ns_ids: a 2-item sequence `(ns, ids)`; `ids` must be an iterable of
      strings, and `ns` must accept assignment to its `id` attribute.

  Returns:
    The same `ns` object, with `id` set to the ids joined by ','.
  """
  sequence, matching_ids = ns_ids
  unique_counter = beam_metrics.counter(
      'ExtractExamplesDoFn', 'unique-examples')
  unique_counter.inc()
  sequence.id = ','.join(matching_ids)
  return sequence
예제 #54
0
  try:
    from apache_beam.options.pipeline_options import PipelineOptions
  except ImportError:
    from apache_beam.utils.pipeline_options import PipelineOptions
except ImportError:
  from apache_beam.utils.options import PipelineOptions
from PIL import Image
import tensorflow as tf

from tensorflow.contrib.slim.python.slim.nets import inception_v3 as inception
from tensorflow.python.framework import errors
from tensorflow.python.lib.io import file_io

# Short alias for the slim module exposed under tf.contrib.
slim = tf.contrib.slim

# Module-level counters, all created once at import time under the 'main'
# metric namespace. Note that the Python variable name and the metric name do
# not always match (e.g. `invalid_uri` is reported as 'invalid_file_name').
error_count = Metrics.counter('main', 'errorCount')
missing_label_count = Metrics.counter('main', 'missingLabelCount')
csv_rows_count = Metrics.counter('main', 'csvRowsCount')
labels_count = Metrics.counter('main', 'labelsCount')
labels_without_ids = Metrics.counter('main', 'labelsWithoutIds')
existing_file = Metrics.counter('main', 'existingFile')
non_existing_file = Metrics.counter('main', 'nonExistingFile')
skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine')
embedding_good = Metrics.counter('main', 'embedding_good')
embedding_bad = Metrics.counter('main', 'embedding_bad')
incompatible_image = Metrics.counter('main', 'incompatible_image')
invalid_uri = Metrics.counter('main', 'invalid_file_name')
unlabeled_image = Metrics.counter('main', 'unlabeled_image')
unknown_label = Metrics.counter('main', 'unknown_label')

예제 #55
0
def main():
    """Build and run the pipeline that exports 'Job' entities to BigQuery.

    The pipeline reads 'Job' entities from Datastore for the 'chromeperf'
    project, converts each entity to a row dict with JobEntityToRowDict
    (counting entities read and conversions that failed), and writes the rows
    through WriteToPartitionedBigQuery using the schema defined below. It then
    runs the pipeline, blocks until it finishes, and passes the result to
    PrintCounters.
    """
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    # Counters updated from inside ConvertEntity below.
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main',
                                               'failed_entity_transforms')

    # Read 'Job' entities from datastore.
    job_entities = (
        p
        | 'ReadFromDatastore(Job)' >> ReadTimestampRangeFromDatastore(
            {
                'project': project,
                'kind': 'Job'
            },
            time_range_provider=bq_export_options.GetTimeRangeProvider(),
            timestamp_property='created'))

    def ConvertEntity(entity):
        # Used with beam.FlatMap: returns a one-element list on success and an
        # empty list when the entity cannot be converted, so failed entities
        # are dropped (after being logged and counted) instead of failing the
        # pipeline.
        entities_read.inc()
        try:
            row_dict = JobEntityToRowDict(entity)
        except UnconvertibleJobError:
            logging.getLogger().exception('Failed to convert Job')
            failed_entity_transforms.inc()
            return []
        return [row_dict]

    job_dicts = (job_entities
                 | 'ConvertEntityToRow(Job)' >> beam.FlatMap(ConvertEntity))

    # The string statement below has no runtime effect; it records the SQL
    # table definition that corresponds to bq_job_schema.
    """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.jobs`
  (id INT64 NOT NULL,
   arguments STRING NOT NULL,
   bug_id INT64,
   comparison_mode STRING,
   gerrit STRUCT<server STRING, change_id STRING>,
   name STRING,
   tags STRING,
   user_email STRING,
   create_time TIMESTAMP NOT NULL,
   start_time TIMESTAMP,
   update_time TIMESTAMP NOT NULL,
   started BOOLEAN NOT NULL,
   done BOOLEAN NOT NULL,
   cancelled BOOLEAN NOT NULL,
   cancel_reason STRING,
   task STRING,
   exception STRING,
   exception_details STRING,
   difference_count INT64,
   retry_count INT64 NOT NULL,
   benchmark_arguments STRUCT<benchmark STRING, story STRING,
                              story_tags STRING, chart STRING,
                              statistic STRING>,
   use_execution_engine BOOLEAN NOT NULL,
   completed BOOLEAN NOT NULL,
   failed BOOLEAN NOT NULL,
   running BOOLEAN NOT NULL,
   configuration STRING)
  PARTITION BY DATE(`create_time`);
  """  # pylint: disable=pointless-string-statement
    # Schema passed to WriteToPartitionedBigQuery; one entry per column, with
    # nested 'fields' for the two RECORD columns.
    bq_job_schema = {
        'fields': [
            {
                'name': 'id',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name': 'arguments',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'bug_id',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'comparison_mode',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name':
                'gerrit',
                'type':
                'RECORD',
                'mode':
                'NULLABLE',
                'fields': [
                    {
                        'name': 'server',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                    {
                        'name': 'change_id',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                ]
            },
            {
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'tags',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'user_email',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'create_time',
                'type': 'TIMESTAMP',
                'mode': 'REQUIRED'
            },
            {
                'name': 'start_time',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            },
            {
                'name': 'update_time',
                'type': 'TIMESTAMP',
                'mode': 'REQUIRED'
            },
            {
                'name': 'started',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'done',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'cancelled',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'cancel_reason',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'task',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'exception',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'exception_details',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'difference_count',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'retry_count',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name':
                'benchmark_arguments',
                'type':
                'RECORD',
                'mode':
                'NULLABLE',
                'fields': [
                    {
                        'name': 'benchmark',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                    {
                        'name': 'story',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                    {
                        'name': 'story_tags',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                    {
                        'name': 'chart',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                    {
                        'name': 'statistic',
                        'type': 'STRING',
                        'mode': 'NULLABLE'
                    },
                ]
            },
            {
                'name': 'use_execution_engine',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'completed',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'failed',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'running',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'configuration',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
        ]
    }

    # 'dataset' may be a RuntimeValueProvider, so we have to defer calculating
    # the table name until runtime.  The simplest way to do this is by passing a
    # function for the table name rather than a string.
    def TableNameFn(unused_element):
        return '{}:{}.jobs{}'.format(project, bq_export_options.dataset.get(),
                                     bq_export_options.table_suffix)

    _ = job_dicts | 'WriteToBigQuery(jobs)' >> WriteToPartitionedBigQuery(
        TableNameFn, bq_job_schema, element_to_yyyymmdd_fn=_JobToYYYYMMDD)

    # Run the pipeline, block until it completes, then report the counters.
    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
예제 #56
0
from apache_beam.metrics import Metrics
try:
  from apache_beam.utils.pipeline_options import PipelineOptions
except ImportError:
  from apache_beam.utils.options import PipelineOptions
from PIL import Image
import tensorflow as tf

from tensorflow.contrib.slim.python.slim.nets import inception_v3 as inception
from tensorflow.python.framework import errors
from tensorflow.python.lib.io import file_io
from google.cloud.ml.io import SaveFeatures

# Short alias for the slim module exposed under tf.contrib.
slim = tf.contrib.slim

# Module-level counters, all created once at import time under the 'main'
# metric namespace. Note that the Python variable name and the metric name do
# not always match (e.g. `invalid_uri` is reported as 'invalid_file_name').
error_count = Metrics.counter('main', 'errorCount')
missing_label_count = Metrics.counter('main', 'missingLabelCount')
csv_rows_count = Metrics.counter('main', 'csvRowsCount')
labels_count = Metrics.counter('main', 'labelsCount')
labels_without_ids = Metrics.counter('main', 'labelsWithoutIds')
existing_file = Metrics.counter('main', 'existingFile')
non_existing_file = Metrics.counter('main', 'nonExistingFile')
skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine')
embedding_good = Metrics.counter('main', 'embedding_good')
embedding_bad = Metrics.counter('main', 'embedding_bad')
incompatible_image = Metrics.counter('main', 'incompatible_image')
invalid_uri = Metrics.counter('main', 'invalid_file_name')
unlabeled_image = Metrics.counter('main', 'unlabeled_image')
unknown_label = Metrics.counter('main', 'unknown_label')

def _is_production_tensorflow():
예제 #57
0
파일: bigtableio.py 프로젝트: eralmas7/beam
 def __setstate__(self, options):
   """Restore state from `options` and reset the per-instance members.

   Args:
     options: stored unchanged on ``self.beam_options``.

   ``table`` and ``batcher`` are reset to ``None`` and the 'Written Row'
   counter is created again, namespaced by this object's class.
   """
   self.beam_options = options
   self.table = self.batcher = None
   owner = self.__class__
   self.written = Metrics.counter(owner, 'Written Row')
예제 #58
0
  def _process_ns(self, ns):
    """Filter a NoteSequence, convert it to tensors and yield the examples.

    When `self._filters` is truthy the sequence is rejected (with a counter
    increment and no output) if it is too long, has too many notes, cannot be
    quantized, or has too few distinct velocities / metric positions, or if
    drums-only is required and a non-drum note is present.

    Each extracted example is converted back to a NoteSequence and
    re-extracted as a consistency check. Examples that cannot be re-extracted,
    or whose segment count differs, are counted and skipped; segment content
    mismatches are only logged and counted.

    Args:
      ns: the NoteSequence to process.

    Yields:
      `(example_ns, ns.id)` tuples, one per accepted example.
    """
    def _counter(name):
      # All counters of this method live in the same namespace.
      return beam_metrics.counter('ExtractExamplesDoFn', name)

    filters = self._filters
    if filters:
      if ns.total_time > filters['max_total_time']:
        logging.info('Skipping %s: total_time=%f', ns.id, ns.total_time)
        _counter('filtered-too-long').inc()
        return
      if len(ns.notes) > filters['max_num_notes']:
        logging.info('Skipping %s: num_notes=%d', ns.id, len(ns.notes))
        _counter('filtered-too-many-notes').inc()
        return

      quantize_errors = (
          note_seq.BadTimeSignatureError,
          note_seq.NonIntegerStepsPerBarError,
          note_seq.NegativeTimeError,
      )
      try:
        quantized = note_seq.quantize_note_sequence(ns, steps_per_quarter=16)
      except quantize_errors:
        _counter('quantize-failed').inc()
        return

      # Collect the distinct velocities and 16th-step positions of the notes
      # that match the drum filter, and track whether every note is a drum.
      velocities = set()
      positions = set()
      drums_only = True
      for note in quantized.notes:
        drums_only &= note.is_drum
        wanted_drum = filters['is_drum']
        drum_matches = wanted_drum is None or note.is_drum == wanted_drum
        if drum_matches and note.velocity > 0:
          velocities.add(note.velocity)
          positions.add(note.quantized_start_step % 16)

      if len(velocities) < filters['min_velocities']:
        _counter('filtered-min-velocities').inc()
        return
      if len(positions) < filters['min_metric_positions']:
        _counter('filtered-min-metric-positions').inc()
        return
      if filters['drums_only'] and not drums_only:
        _counter('filtered-drums-only').inc()
        return

    _counter('unfiltered-sequences').inc()
    logging.info('Converting %s to tensors', ns.id)
    converter = self._config.data_converter
    extracted_examples = converter.to_tensors(ns)
    if not extracted_examples.outputs:
      _counter('empty-extractions').inc()
      return
    _counter('extracted-examples').inc(len(extracted_examples.outputs))

    for _, outputs, controls, _ in zip(*extracted_examples):
      # Controls are only passed back to the converter when present.
      if controls.size:
        tensor_args = ([outputs], [controls])
      else:
        tensor_args = ([outputs],)
      example_ns = converter.from_tensors(*tensor_args)[0]

      # Try to re-encode.
      # TODO(adarob): For now we filter and count examples that cannot be
      # re-extracted, but ultimately the converter should filter these or avoid
      # producing them all together.
      reextracted_examples = converter.to_tensors(example_ns).inputs
      assert len(reextracted_examples) <= 1
      if not reextracted_examples:
        logging.warning(
            'Extracted example NoteSequence does not reproduce example. '
            'Skipping: %s', example_ns)
        _counter('empty-reextraction').inc()
        continue

      # Extra checks if the code returns multiple segments.
      # TODO(fjord): should probably make this recursive for cases with more
      # than 1 level of hierarchy.
      if isinstance(outputs, list):
        reextracted = reextracted_examples[0]
        if len(outputs) != len(reextracted):
          logging.warning(
              'Re-extracted example tensor has different number of segments. '
              'ID: %s. original %d, reextracted %d. Skipping.', ns.id,
              len(outputs), len(reextracted))
          _counter('different-reextraction-count').inc()
          continue
        # Content mismatches are reported but the example is still emitted.
        for i, original_segment in enumerate(outputs):
          if not np.array_equal(reextracted[i], original_segment):
            logging.warning(
                'Re-extracted example tensor does not equal original example. '
                'ID: %s. Index %d. NoteSequence: %s', ns.id, i, example_ns)
            _counter('different-reextraction').inc()
      yield example_ns, ns.id
예제 #59
0
 def __init__(self, namespace):
   """Store the namespace and create its runtime distribution.

   Args:
     namespace: stored on ``self.namespace`` and used as the namespace of the
       distribution named by RUNTIME_LABEL.
   """
   self.namespace = namespace
   runtime_distribution = Metrics.distribution(self.namespace, RUNTIME_LABEL)
   self.runtime = runtime_distribution