def expand(self, pcoll):
    if self._preserve_sample_order:
        return (pcoll
                | 'GetSampleIds' >> beam.Map(self._get_sample_ids)
                | 'RemoveDuplicates' >> beam.Distinct()
                | 'Combine' >> beam.combiners.ToList()
                | 'ExtractUniqueSampleIds' >> beam.ParDo(
                    self._extract_unique_sample_ids))
    else:
        return (pcoll
                | 'GetSampleIds' >> beam.FlatMap(self._get_sample_ids)
                | 'RemoveDuplicates' >> beam.Distinct()
                | 'Combine' >> beam.combiners.ToList()
                | 'SortSampleIds' >> beam.ParDo(sorted))
Example #2
def ReadAndShuffleData(pcoll, filepatterns):
  """Read a train or test dataset from disk and shuffle it."""
  # NOTE: we pass filepatterns as a tuple instead of two args, as the current
  # version of beam assumes that if the first arg to a ptransform_fn is a
  # string, then that string is the label.
  neg_filepattern, pos_filepattern = filepatterns

  # Read from each file pattern and create a tuple of the review text and the
  # correct label.
  negative_examples = (
      pcoll
      | 'ReadNegativeExamples' >> beam.io.ReadFromText(neg_filepattern)
      | 'PairWithZero' >> beam.Map(lambda review: (review, 0)))
  positive_examples = (
      pcoll
      | 'ReadPositiveExamples' >> beam.io.ReadFromText(pos_filepattern)
      | 'PairWithOne' >> beam.Map(lambda review: (review, 1)))
  all_examples = (
      [negative_examples, positive_examples] | 'Merge' >> beam.Flatten())

  # Shuffle the data.  Note that the data does in fact contain duplicate reviews
  # for reasons that are unclear.  This means that NUM_TRAIN_INSTANCES and
  # NUM_TEST_INSTANCES are slightly wrong for the preprocessed data.
  # pylint: disable=no-value-for-parameter
  shuffled_examples = (
      all_examples
      | 'Distinct' >> beam.Distinct()
      | 'Shuffle' >> Shuffle())

  # Put the data in the format that can be accepted directly by tf.Transform.
  return shuffled_examples | 'MakeInstances' >> beam.Map(
      lambda p: {REVIEW_KEY: p[0], LABEL_KEY: p[1]})
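A minimal usage sketch for the transform above (not from the original module): it assumes ReadAndShuffleData is wrapped with @beam.ptransform_fn, as the NOTE about label handling implies, and the file patterns are placeholders.

import apache_beam as beam

# Hypothetical invocation: both file patterns travel in a single tuple so that
# beam's label handling (see the NOTE above) is not triggered.
with beam.Pipeline() as p:
  train_data = p | 'ReadTrain' >> ReadAndShuffleData(
      ('data/train/neg/*.txt', 'data/train/pos/*.txt'))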
Example #3
def _CreateCategoricalDict(pcoll, existing_dict_pairs):
    """
    For a specific column, creates a new "categorical dict"
    mapping values to unique ints.
    """
    existing_max_value = (existing_dict_pairs
                          | "just values" >> beam.Map(lambda r: r[1])
                          | "get max" >> beam.combiners.Top.Of(1)
                          | "extract" >> beam.FlatMap(lambda r: r))

    new_pairs = (
        pcoll
        | "filter for unseen" >> beam.Filter(
            lambda row, existing: row not in existing,
            existing=beam.pvalue.AsDict(existing_dict_pairs),
        )
        | beam.Distinct()
        | "group into single list" >> beam.combiners.ToList()
        | "append unique values" >> beam.FlatMap(
            lambda row, max_v: [(key, max_v + 1 + i)
                                for i, key in enumerate(row)],
            max_v=beam.pvalue.AsSingleton(existing_max_value, default_value=0),
        ))

    return [new_pairs, existing_dict_pairs
            ] | "combine new/existing" >> beam.Flatten()
Example #4
def run(argv=None, save_main_session=True):
    '''Main entry point; defines and runs the wordcount pipeline.'''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    processed_users = (lines | 'splits' >> beam.Map(split_and_lower)
                       | 'noNum' >> beam.Map(no_num_format)
                       | 'formatOut' >> beam.Map(format_output))
    processed_users | 'uniqueUser' >> beam.Distinct(
    ) | 'writeUnique' >> WriteToText(known_args.output,
                                     file_name_suffix='.csv')

    schema = avro.schema.parse(open("user.avsc", "rb").read())
    processed_users | 'avro_write' >> beam.io.avroio.WriteToAvro(
        'output_avro', schema, file_name_suffix='.avro')

    result = p.run()
    result.wait_until_finish()

    # Read the Avro output back only after the pipeline has finished writing it.
    reader = DataFileReader(open("output_avro-00000-of-00001.avro", "rb"),
                            DatumReader())
    for user in reader:
        print(user)
    reader.close()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
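Example #5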
    def test_distinct(self):
        expected = [1, 2, 3]

        inputs = [1, 1, 2, 3]

        with TestPipeline() as p:
            actual = (p | beam.Create(inputs) | beam.Distinct())

            assert_that(actual, equal_to(expected))
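A small companion sketch (not from the original test file): beam.Distinct also deduplicates composite elements such as key/value tuples, the pattern several later examples rely on. It reuses the TestPipeline/assert_that helpers imported for the test above; the IP/agent values are placeholders.

    def test_distinct_on_tuples(self):
        # Hypothetical companion test: duplicate (ip, agent) tuples collapse
        # to a single occurrence each.
        inputs = [('10.0.0.1', 'curl'), ('10.0.0.1', 'curl'),
                  ('10.0.0.2', 'wget')]
        expected = [('10.0.0.1', 'curl'), ('10.0.0.2', 'wget')]

        with TestPipeline() as p:
            actual = p | beam.Create(inputs) | beam.Distinct()

            assert_that(actual, equal_to(expected))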
Example #6
def _sample_examples(pipeline):
    seeds = range(FLAGS.num_examples)
    examples = (
        pipeline
        | "Create" >> beam.Create(seeds)
        | "SampleExamples" >> beam.Map(sample_example, sampler=sampler)
        | "Format" >> beam.Map(lambda ex: "%s\t%s" % (ex[0], ex[1])))
    if not FLAGS.allow_duplicates:
        examples = examples | "RemoveDuplicates" >> beam.Distinct()
    _ = examples | "WriteExamples" >> beam.io.WriteToText(FLAGS.output)
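Example #7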
def user_agent_threshold(pipeline, file_path):
    with pipeline as p:
        return (p
                | beam.io.ReadFromText(file_path)
                | 'ParseNetworkLogs' >> beam.ParDo(ParseNetworkLogs())
                | 'MapIPsToUserAgents' >>
                beam.Map(lambda elem: (elem['source.ip'], elem['agent.id']))
                | 'GetDistinct' >> beam.Distinct()
                | 'Group' >> beam.combiners.Count.PerKey()
                | 'Filter' >> beam.Filter(lambda entry: entry[1] > 1))
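A brief usage sketch (not part of the original snippet; the log path is a placeholder): the pipeline runs when the with-block inside the function exits, and the returned PCollection holds the IPs seen with more than one user agent.

# Hypothetical invocation with a placeholder input path.
suspicious_ips = user_agent_threshold(
    beam.Pipeline(), 'gs://example-bucket/network-logs*.json')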
Example #8
    def _add_metadata(
            self, rows: beam.pvalue.PCollection[Row]
    ) -> beam.pvalue.PCollection[Row]:
        """Add ip metadata to a collection of roundtrip rows.

    Args:
      rows: beam.PCollection[Row]

    Returns:
      PCollection[Row]
      The same rows as above with with additional metadata columns added.
    """

        # PCollection[Tuple[DateIpKey,Row]]
        rows_keyed_by_ip_and_date = (
            rows
            | 'key by ips and dates' >> beam.Map(lambda row: (make_date_ip_key(
                row), row)).with_output_types(Tuple[DateIpKey, Row]))

        # PCollection[DateIpKey]
        # pylint: disable=no-value-for-parameter
        ips_and_dates = (rows_keyed_by_ip_and_date
                         | 'get ip and date keys per row' >>
                         beam.Keys().with_output_types(DateIpKey))

        # PCollection[DateIpKey]
        deduped_ips_and_dates = (
            # pylint: disable=no-value-for-parameter
            ips_and_dates
            | 'dedup' >> beam.Distinct().with_output_types(DateIpKey))

        # PCollection[Tuple[date,List[ip]]]
        grouped_ips_by_dates = (
            deduped_ips_and_dates | 'group by date' >>
            beam.GroupByKey().with_output_types(Tuple[str, Iterable[str]]))

        # PCollection[Tuple[DateIpKey,Row]]
        ips_with_metadata = (grouped_ips_by_dates
                             | 'get ip metadata' >> beam.FlatMapTuple(
                                 self._add_ip_metadata).with_output_types(
                                     Tuple[DateIpKey, Row]))

        # PCollection[Tuple[Tuple[date,ip],Dict[input_name_key,List[Row]]]]
        grouped_metadata_and_rows = (
            ({
                IP_METADATA_PCOLLECTION_NAME: ips_with_metadata,
                ROWS_PCOLLECION_NAME: rows_keyed_by_ip_and_date
            }) | 'group by keys' >> beam.CoGroupByKey())

        # PCollection[Row]
        rows_with_metadata = (
            grouped_metadata_and_rows | 'merge metadata with rows' >>
            beam.FlatMapTuple(merge_metadata_with_rows).with_output_types(Row))

        return rows_with_metadata
Example #9
def expand(self, pcoll):
    return (
        pcoll
        | "Start" >> beam.FlatMap(_start_stage, self.specs_by_target)
        | "CreateTasks" >> beam.FlatMapTuple(_copy_tasks)
        # prevent undesirable fusion
        # https://stackoverflow.com/a/54131856/809705
        | "Reshuffle" >> beam.Reshuffle()
        | "CopyChunks" >> beam.MapTuple(_copy_chunk)
        # prepare inputs for the next stage (if any)
        | "Finish" >> beam.Distinct())
Example #10
def expand(self, pcollection):
    return (
        pcollection
        | 'FilterNAs' >> beam.ParDo(keep_only_non_nulls_request_kv)
        | 'ExtractCrmId' >> beam.Map(
            lambda elt: (elt['UniqueIdentifier'],
                         parse_all_request_kv(elt['AllRequestKv'])))
        | 'Deduplicate' >> beam.Distinct()
        | 'FormatJoinData' >> beam.Map(
            lambda elt: (elt[0], {'FreewheelId': elt[1]})))
Example #11
def _TrackDistinctSliceKeys(  # pylint: disable=invalid-name
    slice_keys_and_values: beam.PCollection[types.SlicedRecordBatch]
) -> beam.pvalue.PCollection[int]:
  """Gathers slice key telemetry post slicing."""

  return (slice_keys_and_values
          | 'ExtractSliceKeys' >> beam.Keys()
          | 'RemoveDuplicates' >> beam.Distinct()
          | 'Size' >> beam.combiners.Count.Globally()
          | 'IncrementCounter' >> beam.Map(
              lambda x: _increment_counter('num_distinct_slice_keys', x)))
Example #12
def _TrackDistinctSliceKeys(  # pylint: disable=invalid-name
    slice_keys_and_values: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
    """Gathers slice key telemetry post slicing."""
    def increment_counter(element):  # pylint: disable=invalid-name
        num_distinct_slice_keys = beam.metrics.Metrics.counter(
            constants.METRICS_NAMESPACE, 'num_distinct_slice_keys')
        num_distinct_slice_keys.inc(element)
        return element

    return (slice_keys_and_values
            | 'ExtractSliceKeys' >> beam.Keys()
            | 'RemoveDuplicates' >> beam.Distinct()
            | 'Size' >> beam.combiners.Count.Globally()
            | 'IncrementCounter' >> beam.Map(increment_counter))
Example #13
def run(save_main_session=True):
    """main entry point"""

    opts = MyOptions()
    opts.view_as(
        SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=opts) as pipeline:
        (
                pipeline
                | 'read lines' >> beam.io.ReadFromText(opts.input)
                | 'remove duplicates line' >> beam.Distinct()
                | ExtractCSVLine()
                | 'Write results' >> beam.io.WriteToText(opts.output)
        )
Example #14
def distinct(test=None):
    # [START distinct]
    import apache_beam as beam

    with beam.Pipeline() as pipeline:
        unique_elements = (pipeline
                           | 'Create produce' >> beam.Create([
                               '🥕',
                               '🥕',
                               '🍆',
                               '🍅',
                               '🍅',
                               '🍅',
                           ])
                           | 'Deduplicate elements' >> beam.Distinct()
                           | beam.Map(print))
        # [END distinct]
        if test:
            test(unique_elements)
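Example #15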
def filter_by_std_dev(pipeline, file_path):
    with pipeline as p:

        def std_dev_map(entry, means_and_counts):
            # Creating variables probably slows the pipeline a lot
            # but keep it for readability for now.
            ip, entries = entry
            mean = means_and_counts[ip][0]
            count = means_and_counts[ip][1]
            # Use numpy for array-wise operations
            # TODO: Loading python for variance is probably not the best way to go
            # TODO: Feels like this can be a separate CombinePerKey
            variance = np.sum(np.square(np.array(entries) - mean)) / count
            return ip, np.sqrt(variance if variance > 0 else 0)

        def filter_by_stddev(entry, means_and_counts, std_devs):
            # Refraining from creating new variables during pipeline
            return entry[1] > \
                   (means_and_counts[entry[0]][0] + std_devs[entry[0]])

        fields = (p
                  | 'ReadInputText' >> beam.io.ReadFromText(file_path)
                  | 'ParseLogs' >> NetworkUsage())

        # Combine mean and count to a single pipeline
        mean_and_count_per_key = beam.pvalue.AsDict(
            fields
            | 'GetMeanAndCountPerKey' >> beam.CombinePerKey(MeanAndCount()))

        # TODO: This part needs improvements
        std_dev_per_key = beam.pvalue.AsDict(
            fields
            | 'GroupByKey' >> beam.GroupByKey()
            | 'GetDifferencesPerKey' >> beam.Map(std_dev_map,
                                                 mean_and_count_per_key))

        return (fields
                | 'FilterZeroBytes' >> beam.Filter(lambda entry: entry[1] > 0)
                | 'FilterByStdDev' >> beam.Filter(
                    filter_by_stddev, mean_and_count_per_key, std_dev_per_key)
                | 'GetIPs' >> beam.Keys()
                | 'GetDistinctIPs' >> beam.Distinct())
Example #16
def run(argv=None):
    """Main entry point"""
    parser = argparse.ArgumentParser()
    # parser.add_argument('--project', type=str, required=False, help='project')
    parser.add_argument(
        '--records',
        dest='records',
        type=int,
        # default='gs://dataflow-samples/shakespeare/kinglear.txt',
        default=10,  # gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt
        help='Number of records to be generated')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='./',
                        help='Output file to write results to.')
    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Store the CLI arguments to variables
    # project_id = known_args.project

    # Setup the dataflow pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = True
    # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    # google_cloud_options.project = project_id

    save_main_session = True
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    print(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as pipeline:
        total = pipeline | 'Create plant counts' >> beam.Create([
            ('1', 3),
            ('1', 2),
            ('2', 1),
            ('3', 4),
            ('4', 5),
            ('4', 3),
        ]) | 'Distinct' >> beam.Distinct() | beam.Map(print)
Example #17
    def expand(self, uri_to_content):

        # Compute the total number of documents, and prepare a singleton
        # PCollection to use as side input.
        total_documents = (uri_to_content
                           | 'GetUris 1' >> beam.Keys()
                           | 'GetUniqueUris' >> beam.Distinct()
                           | 'CountUris' >> beam.combiners.Count.Globally())

        # Create a collection of pairs mapping a URI to each of the words
        # in the document associated with that URI.

        def split_into_words(uri_line):
            (uri, line) = uri_line
            return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

        uri_to_words = (uri_to_content
                        | 'SplitWords' >> beam.FlatMap(split_into_words))

        # Compute a mapping from each word to the total number of documents
        # in which it appears.
        word_to_doc_count = (
            uri_to_words
            | 'GetUniqueWordsPerDoc' >> beam.Distinct()
            | 'GetWords' >> beam.Values()
            | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

        # Compute a mapping from each URI to the total number of words in the
        # document associated with that URI.
        uri_to_word_total = (
            uri_to_words
            | 'GetUris 2' >> beam.Keys()
            | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

        # Count, for each (URI, word) pair, the number of occurrences of that word
        # in the document associated with the URI.
        uri_and_word_to_count = (
            uri_to_words
            | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

        # Adjust the above collection to a mapping from (URI, word) pairs to counts
        # into an isomorphic mapping from URI to (word, count) pairs, to prepare
        # for a join by the URI key.
        def shift_keys(uri_word_count):
            return (uri_word_count[0][0], (uri_word_count[0][1],
                                           uri_word_count[1]))

        uri_to_word_and_count = (uri_and_word_to_count
                                 | 'ShiftKeys' >> beam.Map(shift_keys))

        # Perform a CoGroupByKey (a sort of pre-join) on the prepared
        # uri_to_word_total and uri_to_word_and_count tagged by 'word totals' and
        # 'word counts' strings. This yields a mapping from URI to a dictionary
        # that maps the above mentioned tag strings to an iterable containing the
        # word total for that URI and word and count respectively.
        #
        # A diagram (in which '[]' just means 'iterable'):
        #
        #   URI: {'word totals': [count],  # Total words within this URI's document.
        #         'word counts': [(word, count),  # Counts of specific words
        #                         (word, count),  # within this URI's document.
        #                         ... ]}
        uri_to_word_and_count_and_total = (
            {
                'word totals': uri_to_word_total,
                'word counts': uri_to_word_and_count
            }
            | 'CoGroupByUri' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, term frequency) pair for each
        # URI. A word's term frequency for a document is simply the number of times
        # that word occurs in the document divided by the total number of words in
        # the document.

        def compute_term_frequency(uri_count_and_total):
            (uri, count_and_total) = uri_count_and_total
            word_and_count = count_and_total['word counts']
            # We have an iterable for one element that we want extracted.
            [word_total] = count_and_total['word totals']
            for word, count in word_and_count:
                yield word, (uri, float(count) / word_total)

        word_to_uri_and_tf = (
            uri_to_word_and_count_and_total
            | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency))

        # Compute a mapping from each word to its document frequency.
        # A word's document frequency in a corpus is the number of
        # documents in which the word appears divided by the total
        # number of documents in the corpus.
        #
        # This calculation uses a side input, a Dataflow-computed auxiliary value
        # presented to each invocation of our MapFn lambda. The second argument to
        # the function (called total---note that the first argument is a tuple)
        # receives the value we listed after the lambda in Map(). Additional side
        # inputs (and ordinary Python values, too) can be provided to MapFns and
        # DoFns in this way.
        def div_word_count_by_total(word_count, total):
            (word, count) = word_count
            return (word, float(count) / total)

        word_to_df = (
            word_to_doc_count
            | 'ComputeDocFrequencies' >> beam.Map(
                div_word_count_by_total, AsSingleton(total_documents)))

        # Join the term frequency and document frequency collections,
        # each keyed on the word.
        word_to_uri_and_tf_and_df = (
            {
                'tf': word_to_uri_and_tf,
                'df': word_to_df
            }
            | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
        # There are a variety of definitions of TF-IDF
        # ("term frequency - inverse document frequency") score; here we use a
        # basic version that is the term frequency divided by the log of the
        # document frequency.

        def compute_tf_idf(word_tf_and_df):
            (word, tf_and_df) = word_tf_and_df
            [docf] = tf_and_df['df']
            for uri, tf in tf_and_df['tf']:
                yield word, (uri, tf * math.log(1 / docf))

        word_to_uri_and_tfidf = (
            word_to_uri_and_tf_and_df
            | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

        return word_to_uri_and_tfidf
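Example #18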
def pipeline(config_map, dataset_config_map, preprocess_example_fn,
             input_tensors_to_example_fn):
    """Pipeline for dataset creation."""
    tf.flags.mark_flags_as_required(['output_directory'])

    pipeline_options = beam.options.pipeline_options.PipelineOptions(
        FLAGS.pipeline_options.split(','))

    config = config_map[FLAGS.config]
    hparams = config.hparams
    hparams.parse(FLAGS.hparams)

    datasets = dataset_config_map[FLAGS.dataset_config]

    if tf.gfile.Exists(FLAGS.output_directory):
        raise ValueError('Output directory %s already exists!' %
                         FLAGS.output_directory)
    tf.gfile.MakeDirs(FLAGS.output_directory)
    with tf.gfile.Open(os.path.join(FLAGS.output_directory, 'config.txt'),
                       'w') as f:
        f.write('\n\n'.join([
            'min_length: {}'.format(FLAGS.min_length),
            'max_length: {}'.format(FLAGS.max_length),
            'sample_rate: {}'.format(FLAGS.sample_rate),
            'preprocess_examples: {}'.format(FLAGS.preprocess_examples),
            'preprocess_train_example_multiplier: {}'.format(
                FLAGS.preprocess_train_example_multiplier),
            'config: {}'.format(FLAGS.config),
            'hparams: {}'.format(hparams.to_json(sort_keys=True)),
            'dataset_config: {}'.format(FLAGS.dataset_config),
            'datasets: {}'.format(datasets),
        ]))

    with beam.Pipeline(options=pipeline_options) as p:
        for dataset in datasets:
            if isinstance(dataset.path, (list, tuple)):
                # If dataset.path is a list, then it's a list of sources to mix together
                # to form new examples. First, do the mixing, then pass the results to
                # the rest of the pipeline.
                id_exs = []
                sourceid_to_exids = []
                for source_id, stem_path in enumerate(dataset.path):
                    if dataset.num_mixes is None:
                        raise ValueError(
                            'If path is a list, num_mixes must not be None: {}'
                            .format(dataset))
                    stem_p = p | 'tfrecord_list_%s_%d' % (
                        dataset.name, source_id) >> (beam.Create(
                            data.generate_sharded_filenames(stem_path)))

                    # Note that we do not specify a coder when reading here.
                    # This is so that the hashing in key_example below can work directly
                    # on the serialized version instead of having to re-serialize it.
                    # Also, deserializing with a coder and then re-serializing does not
                    # always generate the same hash for the same example (likely due to
                    # the map fields in tf.train.Example). This is important when reading
                    # the same dataset multiple times to mix it with itself.
                    stem_p |= 'read_tfrecord_%s_%d' % (
                        dataset.name, source_id) >> (
                            beam.io.tfrecordio.ReadAllFromTFRecord())
                    stem_p |= 'shuffle_stems_%s_%d' % (
                        dataset.name, source_id) >> (beam.Reshuffle())

                    # Key all examples with a hash.
                    def key_example(ex):
                        return (hashlib.sha256(ex).hexdigest(), ex)

                    stem_p |= 'add_id_key_%s_%d' % (
                        dataset.name, source_id) >> (beam.Map(key_example))
                    id_exs.append(stem_p)

                    # Create a list of source_id to example id.
                    def sourceid_to_exid(id_ex, source_id):
                        return (source_id, id_ex[0])

                    sourceid_to_exids.append(
                        stem_p | 'key_%s_%d' % (dataset.name, source_id) >>
                        (beam.Map(sourceid_to_exid, source_id=source_id)))

                # ('example_hash', serialized_example)
                id_exs = (
                    id_exs
                    | 'id_exs_flatten_%s' % dataset.name >> beam.Flatten()
                    | 'id_exs_distinct_%s' % dataset.name >> beam.Distinct())

                # ('source_id, 'example_hash')
                sourceid_to_exids = (sourceid_to_exids
                                     | 'sourceid_to_exids_flatten_%s' %
                                     dataset.name >> beam.Flatten())

                # Pass the list of source id to example IDs to generate_mixes,
                # which will create mixes by selecting random IDs from each source
                # (with replacement). This is represented as a list of example IDs
                # to Mix IDs.
                # Note: beam.Create([0]) is just a single dummy value to allow the
                # sourceid_to_exids to be passed in as a python list so we can do the
                # sampling with numpy.
                exid_to_mixids = (
                    p
                    | 'create_dummy_%s' % dataset.name >> beam.Create([0])
                    | 'generate_mixes_%s' % dataset.name >> beam.Map(
                        create_dataset_lib.generate_mixes,
                        num_mixes=dataset.num_mixes,
                        sourceid_to_exids=beam.pvalue.AsList(
                            sourceid_to_exids)))

                # Create a list of (Mix ID, Full Example proto). Note: Examples may be
                # present in more than one mix. Then, group by Mix ID.
                def mixid_to_exs(id_ex, exid_to_mixids):
                    exid, ex = id_ex
                    for mixid in exid_to_mixids[exid]:
                        yield mixid, ex

                mixid_exs = (
                    id_exs
                    | 'mixid_to_exs_%s' % dataset.name >> beam.FlatMap(
                        mixid_to_exs,
                        exid_to_mixids=beam.pvalue.AsSingleton(exid_to_mixids))
                    | 'group_by_key_%s' % dataset.name >> beam.GroupByKey())
                # Take these groups of Examples, mix their audio and sequences to return
                # a single new Example. Then, carry on with the rest of the pipeline
                # like normal.
                split_p = (mixid_exs
                           | 'mix_examples_%s' % dataset.name >> beam.Map(
                               mix_examples, FLAGS.sample_rate,
                               FLAGS.load_audio_with_librosa))
            else:
                if dataset.num_mixes is not None:
                    raise ValueError(
                        'If path is not a list, num_mixes must be None: {}'.
                        format(dataset))
                split_p = p | 'tfrecord_list_%s' % dataset.name >> beam.Create(
                    data.generate_sharded_filenames(dataset.path))
                split_p |= 'read_tfrecord_%s' % dataset.name >> (
                    beam.io.tfrecordio.ReadAllFromTFRecord(
                        coder=beam.coders.ProtoCoder(tf.train.Example)))
            split_p |= 'shuffle_input_%s' % dataset.name >> beam.Reshuffle()
            split_p |= 'split_wav_%s' % dataset.name >> beam.FlatMap(
                split_wav,
                min_length=FLAGS.min_length,
                max_length=FLAGS.max_length,
                sample_rate=FLAGS.sample_rate,
                debug_output_directory=FLAGS.output_directory,
                split_example=dataset.process_for_training,
                load_audio_with_librosa=FLAGS.load_audio_with_librosa)
            if FLAGS.preprocess_examples:
                if dataset.process_for_training:
                    mul_name = 'preprocess_multiply_%dx_%s' % (
                        FLAGS.preprocess_train_example_multiplier,
                        dataset.name)
                    split_p |= mul_name >> beam.FlatMap(
                        multiply_example,
                        FLAGS.preprocess_train_example_multiplier)
                split_p |= 'preprocess_%s' % dataset.name >> beam.Map(
                    preprocess_data, preprocess_example_fn,
                    input_tensors_to_example_fn, hparams,
                    dataset.process_for_training)
            split_p |= 'shuffle_output_%s' % dataset.name >> beam.Reshuffle()
            split_p |= 'write_%s' % dataset.name >> beam.io.WriteToTFRecord(
                os.path.join(FLAGS.output_directory,
                             '%s.tfrecord' % dataset.name),
                coder=beam.coders.ProtoCoder(tf.train.Example))
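Example #19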
    # This function computes the schema of the Parquet file to be written,
    # applying the column renames to the original schema.
    def getSchema():
        df_schema = pyarrow.Schema.from_pandas(
            pd.read_parquet(user_options.schema_source.get()))
        for (key, value) in ast.literal_eval(
                user_options.rename_columns.get()).items():
            df_schema = df_schema.set(
                df_schema.get_field_index(key),
                pyarrow.field(value,
                              df_schema.types[df_schema.get_field_index(key)]))
        return df_schema

    # Read the source Parquet files and build the dictionary that maps the
    # columns to be renamed.
    map_rename_cols = (
        p | "Read for rename cols" >> ReadFromParquet(user_options.url_raw)
        | "Map rename cols" >> beam.Map(mapRenameCols)
        | "Rename cols to string" >> beam.Map(str)
        | "Deduplicate elements" >> beam.Distinct())
    # Read the data from the source files.
    data = (p
            | "Read parquet for data" >> ReadFromParquet(user_options.url_raw))
    # Apply the column-renaming function, receiving the mapping computed in
    # the previous step as a side input.
    rename_data = (data | "Rename columns" >> beam.Map(
        reColumns, rename_cols=AsList(map_rename_cols)))
    # Write the data to the destination path, obtaining the schema from
    # getSchema().
    _ = (rename_data | "Write to storage TRN" >> WriteToParquet(
        user_options.url_trn, schema=getSchema(), file_name_suffix=".parquet"))

print("End Pipeline")
def run_pipeline(root, input_note_events, input_ratings, output_path, vocab,
                 section_markers, cur_augmentation_config):
    """Create beam pipeline to generate TF examples.

  Args:
    root: beam.Pipeline root.
    input_note_events: Path to csv of notes.
    input_ratings: Path to csv of ratings.
    output_path: Directory path to write output to.
    vocab: List of tokens in the vocabulary.
    section_markers: Dict of markers as accepted by note sectioning.
    cur_augmentation_config: AugmentationConfig dataclass instance, defines the
      kinds of augmentations to apply.
  """

    # Load and process ratings:
    raw_ratings = data_lib.read_raw_ratings(root, input_ratings)

    ratings = (raw_ratings
               | "GetLabels" >> beam.Map(data_lib.convert_ratings)
               | "GroupRatingsByNoteId" >> beam.GroupByKey()
               | "UnpackRatings" >> beam.Map(lambda x: (x[0], list(x[1]))))

    # Load and process notes:
    notes = data_lib.read_filter_notes(root, input_note_events)

    note_partitions = (
        raw_ratings
        | "PartitionMap" >>
        (beam.Map(lambda x: (str(x.note_id), x.partition))).with_output_types(
            Tuple[str, str])
        | "DedupPartitionMap" >> beam.Distinct())

    # Join.
    non_rated_notes, rated_notes = (
        ({
            "ratings": ratings,
            "notes": notes,
            "note_partition": note_partitions
        })
        | "Join" >> beam.CoGroupByKey().with_output_types(
            Tuple[str, Dict[str, Any]])
        | "SplitRated" >> beam.Partition(
            lambda x, n_part: int(bool(x[1]["ratings"])), 2))

    # Downsample non-rated.
    non_rated_notes = data_lib.downsample(non_rated_notes, _N_DOWNSAMPLE.value,
                                          _RANDOM_SEED.value)

    # Process notes.
    features_and_labels = (
        (non_rated_notes, rated_notes)
        | beam.Flatten()
        | "ReshuffleJoin" >> beam.Reshuffle()
        | "ProcessAPData" >> beam.ParDo(data_lib.ProcessAPData(),
                                        section_markers)
        | "FilterAPData" >> beam.Filter(data_lib.filter_by_labels)
        | "ReshuffleForSubjectId" >> beam.Reshuffle()
        | "RekeyBySubjectId" >> beam.Map(lambda x: (x[1].subject_id, x[1]))
        | "GroupBySubjectId" >> beam.GroupByKey()
        | "OneNoteIdPerRatedSubjectId" >> beam.ParDo(
            data_lib.OneNoteIdPerRatedSubjectId(), seed=_RANDOM_SEED.value)
        | "RekeyByNoteId" >> beam.Map(lambda x: (x.note_id, x))
        | "ApplyAugmentations" >> beam.ParDo(data_lib.ApplyAugmentations(),
                                             cur_augmentation_config,
                                             _RANDOM_SEED.value)
        | "GetFeaturesAndLabels" >> beam.ParDo(
            data_lib.ProcessFeaturesAndLabels(vocab, _MAX_SEQ_LENGTH.value))
        | "ReshuffleFeaturesAndLabels" >> beam.Reshuffle())

    # Convert and save tf examples:
    data_lib.convert_and_save_tf_examples(features_and_labels, output_path,
                                          _DEBUG_OUTPUT.value)
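Example #21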
pipeline_options = PipelineOptions(
    # runner='DataflowRunner',
    project='gcp-nyc',
    # job_name='unique-job-name',
    temp_location=f'gs://{BUCKET}/temp',
    region='us-central1')

with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file[pattern] into a PCollection.
    read_gbq = p | 'Read' >> beam.io.Read(
        beam.io.BigQuerySource(query=QUERY, use_standard_sql=True))
    aliquot_count = (
        read_gbq
        | 'ExtractAliquot' >> beam.Map(lambda elt: elt['aliquot_barcode'])
        | 'Deduplicate' >> beam.Distinct()
        | 'ReplaceWithOnes' >> beam.Map(lambda elt: 1)
        | 'Sum' >> beam.CombineGlobally(sum))

    cpg_observation_count = (
        read_gbq
        | 'ExtractCpG' >> beam.Map(lambda elt: (elt['CpG_probe_id'], 1))
        | 'CombineCpG' >> beam.CombinePerKey(sum))
    # counts = (
    #     lines
    #     | 'Split' >>
    #     (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
    #     | 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
    #     | 'GroupAndSum' >> beam.CombinePerKey(sum))
    #
Example #22
def expand(self, pcoll):
    return (pcoll
            | beam.Distinct()
            | beam.Map(lambda r: (None, r))
            | beam.GroupByKey()
            | beam.ParDo(_PairWithIndexNumberDoFn()))
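A hypothetical sketch of the _PairWithIndexNumberDoFn referenced above (its real implementation is not shown in this example), consistent with how it is used: the GroupByKey collects every distinct value under the single None key, and the DoFn then pairs each value with a running index.

class _PairWithIndexNumberDoFn(beam.DoFn):
    # Hypothetical implementation, inferred from the usage above.
    def process(self, element):
        _, values = element  # element is (None, iterable of distinct values)
        for index, value in enumerate(values):
            yield value, index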
Example #23
def run(in_pcol, job_config):
    # load 5 seconds of audio and get STFT
    stft = (in_pcol
            | aio.GcsLoadBinary()
            | audio.LoadAudio(offset=10, duration=5)
            | audio.GetSTFT())
    # get magnitude of audio
    magnitude = (stft | "Get magnitude" >> beam.ParDo(
        transforms.GetMagnitude()).with_outputs())
    # map the result to a key (the KlioMessage element)
    # so we can group all results by key
    magnitude_key = (
        magnitude.spectrogram
        | "element to spec" >> beam.Map(transforms.create_key_from_element))
    # get nearest neighbors and map the result to a key (the KlioMessage element)
    nn_filter = (
        magnitude.spectrogram
        | "Get nn filter" >> beam.ParDo(transforms.FilterNearestNeighbors())
        | "element to filter" >> beam.Map(transforms.create_key_from_element))
    # map together the full magnitude with its filter by key  (the KlioMessage element)
    merge = ({
        "full": magnitude_key,
        "nnfilter": nn_filter
    }
             | "merge" >> beam.CoGroupByKey())
    # calc the difference between full magnitude and the filter
    net = merge | beam.Map(transforms.subtract_filter_from_full)
    # create a mask from the filter minus the difference of full & filter
    first_mask = ({
        "first": nn_filter,
        "second": net,
        "full": magnitude_key
    }
                  | "first mask group" >> beam.CoGroupByKey()
                  |
                  "first mask" >> beam.ParDo(transforms.GetSoftMask(margin=2)))
    # create another mask from the difference of full & filter minus the filter
    second_mask = (
        {
            "first": net,
            "second": nn_filter,
            "full": magnitude_key
        }
        | "second mask group" >> beam.CoGroupByKey()
        | "second mask" >> beam.ParDo(transforms.GetSoftMask(margin=10)))
    # plot the full magnitude spectrogram
    magnitude_out = (magnitude.spectrogram
                     | "full spec" >> audio.GetSpec()
                     | "plot full spec" >> audio.SpecToPlot(
                         title="Full Spectrogam for {element}", y_axis="log")
                     | "save full" >> aio.GcsUploadPlot(suffix="-full"))
    # plot the first mask (background) spectrogram
    background_out = (
        first_mask
        | "background spec" >> audio.GetSpec()
        | "plot background spec" >> audio.SpecToPlot(
            title="Background Spectrogam for {element}", y_axis="log")
        | "save background" >> aio.GcsUploadPlot(suffix="-background"))
    # plot the second mask (foreground) spectrogram
    foreground_out = (
        second_mask
        | "foreground spec" >> audio.GetSpec()
        | "plot forground spec" >> audio.SpecToPlot(
            title="Foreground Spectrogam for {element}", y_axis="log")
        | "save foreground" >> aio.GcsUploadPlot(suffix="-foreground"))
    # flatten all outputs into one PCollection, then remove duplicates
    out_pcol = ((magnitude_out, background_out, foreground_out)
                | "flatten output paths" >> beam.Flatten()
                | "remove dups" >> beam.Distinct())
    return out_pcol
Example #24
def run(argv=None):
    """Main entry point"""
    parser = argparse.ArgumentParser()
    # parser.add_argument('--project', type=str, required=False, help='project')
    parser.add_argument(
        '--records',
        dest='records',
        type=int,
        # default='gs://dataflow-samples/shakespeare/kinglear.txt',
        default=10,  # gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt
        help='Number of records to be generated')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='./',
                        help='Output file to write results to.')
    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Store the CLI arguments to variables
    # project_id = known_args.project

    # Setup the dataflow pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = True
    # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    # google_cloud_options.project = project_id

    save_main_session = True
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    print(pipeline_args)

    SCHEMA = {
        "namespace": "example.avro",
        "type": "record",
        "name": "User",
        "fields": [
            {"name": "ACNO",
             "type": ["null", {"logicalType": "char", "type": "string",
                               "maxLength": 20}]},
            {"name": "NUM_OF_MTHS_PD_30", "type": ["null", "int", "string"]},
        ] + [
            # FIELD_1 .. FIELD_10 all share the same nullable float type.
            {"name": "FIELD_%d" % i,
             "type": ["null", {"logicalType": "char", "type": "float",
                               "maxLength": 20}]}
            for i in range(1, 11)
        ],
    }

    rec_cnt = known_args.records
    with beam.Pipeline(options=pipeline_options) as p:
        left_pcol_name = 'p1'
        file = p | 'read_source' >> beam.io.ReadFromAvro(
            "./data/Curr_account.avro") | beam.Distinct()
        file2 = p | 'read_source2' >> beam.io.ReadFromAvro(
            "./data/Prev_account.avro")
        p1 = file | 'filter fields' >> beam.Filter(
            lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0)
        p2 = file2 | 'filter fields2' >> beam.Filter(
            lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0)
        # P1_1 = p1 | "write" >> beam.io.WriteToText('./data.csv')
        # P2_2 = p2 | "write2" >> beam.io.WriteToText('./data2.csv')

        right_pcol_name = 'p2'

        join_keys = {
            left_pcol_name: [
                'ACNO'
                # 't1_col_B'
            ],
            right_pcol_name: [
                'ACNO'
                # 't2_col_B'
            ]
        }

        pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2}
        test_pipeline = pipelines_dictionary | 'left join' >> Join(
            left_pcol_name=left_pcol_name,
            left_pcol=p1,
            right_pcol_name=right_pcol_name,
            right_pcol=p2,
            join_type='left',
            join_keys=join_keys)
        test_pipeline | 'add 1 to NUM_OF_MTHS_PD_30' >> beam.Map(
            add_one) | "write4" >> beam.io.WriteToText('./data4.csv')
        print(type(test_pipeline))
        compressIdc = True
        use_fastavro = True
        #

        test_pipeline | 'write_fastavro' >> WriteToAvro(
            known_args.output,
            parse_schema(SCHEMA),
            use_fastavro=use_fastavro,
            file_name_suffix='.avro',
            codec=('deflate' if compressIdc else 'null'),
        )
    # The with-block above already runs the pipeline and waits for it to
    # finish, so no explicit p.run() / wait_until_finish() is needed here.
Example #25
def dataflow_pipeline_run(BUCKET_NAME, options):

    with beam.Pipeline(options=options) as p:
        picks = p | 'ReadPickups' >> beam.io.ReadFromText(
            'gs://{}/pickups*.csv'.format(BUCKET_NAME))
        depls = p | 'ReadDeployments' >> beam.io.ReadFromText(
            'gs://{}/deployments*.csv'.format(BUCKET_NAME))
        rides = p | 'ReadRides' >> beam.io.ReadFromText(
            'gs://{}/rides*.csv'.format(BUCKET_NAME))

        picks | 'PickupsCountBeforeDedup' >> beam.combiners.Count.Globally(
        ) | 'PickupsNameIt1' >> beam.Map(
            lambda x: (x, 'PickupsBeforeDedup')
        ) | 'PickupsPrintCount1' >> beam.io.WriteToText(
            'gs://{}/etl_logs/pickups_before_dedup.txt'.format(BUCKET_NAME),
            num_shards=1,
            shard_name_template="")
        depls | 'DeploymentsCountBeforeDedup' >> beam.combiners.Count.Globally(
        ) | 'DeploymentsNameIt1' >> beam.Map(
            lambda x: (x, 'DeploymentsBeforeDedup')
        ) | 'DeploymentsPrintCount1' >> beam.io.WriteToText(
            'gs://{}/etl_logs/deployments_before_dedup.txt'.format(
                BUCKET_NAME),
            num_shards=1,
            shard_name_template="")
        rides | 'RidesCountBeforeDedup' >> beam.combiners.Count.Globally(
        ) | 'RidesNameIt1' >> beam.Map(
            lambda x: (x, 'RidesBeforeDedup')
        ) | 'RidesPrintCount1' >> beam.io.WriteToText(
            'gs://{}/etl_logs/rides_before_dedup.txt'.format(BUCKET_NAME),
            num_shards=1,
            shard_name_template="")

        picks_dedup = picks | 'DeDupPickups' >> beam.Distinct()
        depls_dedup = depls | 'DeDupDeployments' >> beam.Distinct()
        rides_dedup = rides | 'DeDupRides' >> beam.Distinct()

        picks_dedup | 'PickupsCountAfterDedup' >> beam.combiners.Count.Globally(
        ) | 'PickupsNameIt2' >> beam.Map(
            lambda x: (x, 'PickupsAfterDedup')
        ) | 'PickupsPrintCount2' >> beam.io.WriteToText(
            'gs://{}/etl_logs/pickups_after_dedup.txt'.format(BUCKET_NAME),
            num_shards=1,
            shard_name_template="")
        depls_dedup | 'DeploymentsCountAfterDedup' >> beam.combiners.Count.Globally(
        ) | 'DeploymentsNameIt2' >> beam.Map(
            lambda x: (x, 'DeploymentsAfterDedup')
        ) | 'DeploymentsPrintCount2' >> beam.io.WriteToText(
            'gs://{}/etl_logs/deployments_after_dedup.txt'.format(BUCKET_NAME),
            num_shards=1,
            shard_name_template="")
        rides_dedup | 'RidesCountAfterDedup' >> beam.combiners.Count.Globally(
        ) | 'RidesNameIt2' >> beam.Map(lambda x: (
            x, 'RidesAfterDedup')) | 'RidesPrintCount2' >> beam.io.WriteToText(
                'gs://{}/etl_logs/rides_after_dedup.txt'.format(BUCKET_NAME),
                num_shards=1,
                shard_name_template="")

        picks_dedup | 'WritePickups' >> beam.io.WriteToText(
            'gs://{}/final_pickups.csv'.format(BUCKET_NAME),
            num_shards=1,
            shard_name_template="")
        depls_dedup | 'WriteDeployments' >> beam.io.WriteToText(
            'gs://{}/final_deployments.csv'.format(BUCKET_NAME),
            num_shards=1,
            shard_name_template="")
        rides_dedup | 'WriteRides' >> beam.io.WriteToText(
            'gs://{}/final_rides.csv'.format(BUCKET_NAME),
            num_shards=1,
            shard_name_template="")