def expand(self, pcoll):
    if self._preserve_sample_order:
        return (pcoll
                | 'GetSampleIds' >> beam.Map(self._get_sample_ids)
                | 'RemoveDuplicates' >> beam.Distinct()
                | 'Combine' >> beam.combiners.ToList()
                | 'ExtractUniqueSampleIds' >> beam.ParDo(
                    self._extract_unique_sample_ids))
    else:
        return (pcoll
                | 'GetSampleIds' >> beam.FlatMap(self._get_sample_ids)
                | 'RemoveDuplicates' >> beam.Distinct()
                | 'Combine' >> beam.combiners.ToList()
                | 'SortSampleIds' >> beam.ParDo(sorted))
def ReadAndShuffleData(pcoll, filepatterns):
    """Read a train or test dataset from disk and shuffle it."""
    # NOTE: we pass filepatterns as a tuple instead of two args, as the current
    # version of beam assumes that if the first arg to a ptransform_fn is a
    # string, then that string is the label.
    neg_filepattern, pos_filepattern = filepatterns

    # Read from each file pattern and create a tuple of the review text and the
    # correct label.
    negative_examples = (
        pcoll
        | 'ReadNegativeExamples' >> beam.io.ReadFromText(neg_filepattern)
        | 'PairWithZero' >> beam.Map(lambda review: (review, 0)))
    positive_examples = (
        pcoll
        | 'ReadPositiveExamples' >> beam.io.ReadFromText(pos_filepattern)
        | 'PairWithOne' >> beam.Map(lambda review: (review, 1)))
    all_examples = (
        [negative_examples, positive_examples] | 'Merge' >> beam.Flatten())

    # Shuffle the data. Note that the data does in fact contain duplicate
    # reviews for reasons that are unclear. This means that NUM_TRAIN_INSTANCES
    # and NUM_TEST_INSTANCES are slightly wrong for the preprocessed data.
    # pylint: disable=no-value-for-parameter
    shuffled_examples = (
        all_examples
        | 'Distinct' >> beam.Distinct()
        | 'Shuffle' >> Shuffle())

    # Put the data in the format that can be accepted directly by tf.Transform.
    return shuffled_examples | 'MakeInstances' >> beam.Map(
        lambda p: {REVIEW_KEY: p[0], LABEL_KEY: p[1]})
def _CreateCategoricalDict(pcoll, existing_dict_pairs):
    """For a specific column, creates a new "categorical dict" mapping values
    to unique ints."""
    existing_max_value = (
        existing_dict_pairs
        | "just values" >> beam.Map(lambda r: r[1])
        | "get max" >> beam.combiners.Top.Of(1)
        | "extract" >> beam.FlatMap(lambda r: r))
    new_pairs = (
        pcoll
        | "filter for unseen" >> beam.Filter(
            lambda row, existing: row not in existing,
            existing=beam.pvalue.AsDict(existing_dict_pairs),
        )
        | beam.Distinct()
        | "group into single list" >> beam.combiners.ToList()
        | "append unique values" >> beam.FlatMap(
            lambda row, max_v: [(key, max_v + 1 + i)
                                for i, key in enumerate(row)],
            max_v=beam.pvalue.AsSingleton(existing_max_value, default_value=0),
        ))
    return [new_pairs, existing_dict_pairs
            ] | "combine new/existing" >> beam.Flatten()
def run(argv=None, save_main_session=True):
    '''Main entry point; defines and runs the wordcount pipeline.'''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    processed_users = (lines
                       | 'splits' >> beam.Map(split_and_lower)
                       | 'noNum' >> beam.Map(no_num_format)
                       | 'formatOut' >> beam.Map(format_output))

    (processed_users
     | 'uniqueUser' >> beam.Distinct()
     | 'writeUnique' >> WriteToText(known_args.output, file_name_suffix='.csv'))

    schema = avro.schema.parse(open("user.avsc", "rb").read())
    processed_users | 'avro_write' >> beam.io.avroio.WriteToAvro(
        'output_avro', schema, file_name_suffix='.avro')

    reader = DataFileReader(open("output_avro-00000-of-00001.avro", "rb"),
                            DatumReader())
    for user in reader:
        print(user)
    reader.close()

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
def test_distinct(self):
    expected = [1, 2, 3]
    inputs = [1, 1, 2, 3]
    with TestPipeline() as p:
        actual = (p | beam.Create(inputs) | beam.Distinct())
        assert_that(actual, equal_to(expected))
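A minimal companion sketch, not part of the original test suite, reusing the same TestPipeline / assert_that helpers as the test above: beam.Distinct() also deduplicates composite elements such as (key, value) tuples, because deduplication works on each element's encoded form rather than on any single field.

def test_distinct_on_tuples(self):
    # Hypothetical companion test (an assumption, not original code).
    expected = [('a', 1), ('b', 2)]
    inputs = [('a', 1), ('a', 1), ('b', 2)]
    with TestPipeline() as p:
        actual = (p | beam.Create(inputs) | beam.Distinct())
        assert_that(actual, equal_to(expected))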
def _sample_examples(pipeline):
    seeds = range(FLAGS.num_examples)
    examples = (
        pipeline
        | "Create" >> beam.Create(seeds)
        | "SampleExamples" >> beam.Map(sample_example, sampler=sampler)
        | "Format" >> beam.Map(lambda ex: "%s\t%s" % (ex[0], ex[1])))
    if not FLAGS.allow_duplicates:
        examples = examples | "RemoveDuplicates" >> beam.Distinct()
    _ = examples | "WriteExamples" >> beam.io.WriteToText(FLAGS.output)
def user_agent_threshold(pipeline, file_path):
    with pipeline as p:
        return (p
                | beam.io.ReadFromText(file_path)
                | 'ParseNetworkLogs' >> beam.ParDo(ParseNetworkLogs())
                | 'MapIPsToUserAgents' >> beam.Map(
                    lambda elem: (elem['source.ip'], elem['agent.id']))
                | 'GetDistinct' >> beam.Distinct()
                | 'Group' >> beam.combiners.Count.PerKey()
                | 'Filter' >> beam.Filter(lambda entry: entry[1] > 1))
def _add_metadata(
        self, rows: beam.pvalue.PCollection[Row]
) -> beam.pvalue.PCollection[Row]:
    """Add ip metadata to a collection of roundtrip rows.

    Args:
      rows: beam.PCollection[Row]

    Returns:
      PCollection[Row]
      The same rows as above with additional metadata columns added.
    """
    # PCollection[Tuple[DateIpKey, Row]]
    rows_keyed_by_ip_and_date = (
        rows
        | 'key by ips and dates' >> beam.Map(
            lambda row: (make_date_ip_key(row), row)).with_output_types(
                Tuple[DateIpKey, Row]))

    # PCollection[DateIpKey]
    # pylint: disable=no-value-for-parameter
    ips_and_dates = (
        rows_keyed_by_ip_and_date
        | 'get ip and date keys per row' >> beam.Keys().with_output_types(
            DateIpKey))

    # PCollection[DateIpKey]
    deduped_ips_and_dates = (
        # pylint: disable=no-value-for-parameter
        ips_and_dates | 'dedup' >> beam.Distinct().with_output_types(DateIpKey))

    # PCollection[Tuple[date, List[ip]]]
    grouped_ips_by_dates = (
        deduped_ips_and_dates
        | 'group by date' >> beam.GroupByKey().with_output_types(
            Tuple[str, Iterable[str]]))

    # PCollection[Tuple[DateIpKey, Row]]
    ips_with_metadata = (
        grouped_ips_by_dates
        | 'get ip metadata' >> beam.FlatMapTuple(
            self._add_ip_metadata).with_output_types(Tuple[DateIpKey, Row]))

    # PCollection[Tuple[Tuple[date, ip], Dict[input_name_key, List[Row]]]]
    grouped_metadata_and_rows = (
        {
            IP_METADATA_PCOLLECTION_NAME: ips_with_metadata,
            ROWS_PCOLLECION_NAME: rows_keyed_by_ip_and_date
        } | 'group by keys' >> beam.CoGroupByKey())

    # PCollection[Row]
    rows_with_metadata = (
        grouped_metadata_and_rows
        | 'merge metadata with rows' >> beam.FlatMapTuple(
            merge_metadata_with_rows).with_output_types(Row))

    return rows_with_metadata
def expand(self, pcoll):
    return (
        pcoll
        | "Start" >> beam.FlatMap(_start_stage, self.specs_by_target)
        | "CreateTasks" >> beam.FlatMapTuple(_copy_tasks)
        # prevent undesirable fusion
        # https://stackoverflow.com/a/54131856/809705
        | "Reshuffle" >> beam.Reshuffle()
        | "CopyChunks" >> beam.MapTuple(_copy_chunk)
        # prepare inputs for the next stage (if any)
        | "Finish" >> beam.Distinct())
def expand(self, pcollection):
    return (
        pcollection
        | 'FilterNAs' >> beam.ParDo(keep_only_non_nulls_request_kv)
        | 'ExtractCrmId' >> beam.Map(
            lambda elt: (elt['UniqueIdentifier'],
                         parse_all_request_kv(elt['AllRequestKv'])))
        | 'Deduplicate' >> beam.Distinct()
        | 'FormatJoinData' >> beam.Map(
            lambda elt: (elt[0], {'FreewheelId': elt[1]})))
def _TrackDistinctSliceKeys(  # pylint: disable=invalid-name
    slice_keys_and_values: beam.PCollection[types.SlicedRecordBatch]
) -> beam.pvalue.PCollection[int]:
    """Gathers slice key telemetry post slicing."""
    return (slice_keys_and_values
            | 'ExtractSliceKeys' >> beam.Keys()
            | 'RemoveDuplicates' >> beam.Distinct()
            | 'Size' >> beam.combiners.Count.Globally()
            | 'IncrementCounter' >> beam.Map(
                lambda x: _increment_counter('num_distinct_slice_keys', x)))
def _TrackDistinctSliceKeys(  # pylint: disable=invalid-name
    slice_keys_and_values: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
    """Gathers slice key telemetry post slicing."""

    def increment_counter(element):  # pylint: disable=invalid-name
        num_distinct_slice_keys = beam.metrics.Metrics.counter(
            constants.METRICS_NAMESPACE, 'num_distinct_slice_keys')
        num_distinct_slice_keys.inc(element)
        return element

    return (slice_keys_and_values
            | 'ExtractSliceKeys' >> beam.Keys()
            | 'RemoveDuplicates' >> beam.Distinct()
            | 'Size' >> beam.combiners.Count.Globally()
            | 'IncrementCounter' >> beam.Map(increment_counter))
def run(save_main_session=True):
    """main entry point"""
    opts = MyOptions()
    opts.view_as(SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=opts) as pipeline:
        (
            pipeline
            | 'read lines' >> beam.io.ReadFromText(opts.input)
            | 'remove duplicates line' >> beam.Distinct()
            | ExtractCSVLine()
            | 'Write results' >> beam.io.WriteToText(opts.output)
        )
def distinct(test=None):
    # [START distinct]
    import apache_beam as beam

    with beam.Pipeline() as pipeline:
        unique_elements = (
            pipeline
            | 'Create produce' >> beam.Create([
                '🥕',
                '🥕',
                '🍆',
                '🍅',
                '🍅',
                '🍅',
            ])
            | 'Deduplicate elements' >> beam.Distinct()
            | beam.Map(print))
    # [END distinct]
    if test:
        test(unique_elements)
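A related sketch, not taken from any of the snippets here: beam.Distinct() deduplicates whole elements, so when only one element per key is wanted, one simple option under that assumption is to key the elements and pick a deterministic representative per key with CombinePerKey.

import apache_beam as beam

# Sketch: keep one value per key instead of deduplicating whole elements.
with beam.Pipeline() as pipeline:
    one_per_key = (
        pipeline
        | beam.Create([('🥕', 3), ('🥕', 2), ('🍅', 1)])
        | 'PickOnePerKey' >> beam.CombinePerKey(min)  # any deterministic pick
        | beam.Map(print))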
def filter_by_std_dev(pipeline, file_path):
    with pipeline as p:

        def std_dev_map(entry, means_and_counts):
            # Creating variables probably slows the pipeline a lot
            # but keep it for readability for now.
            ip, entries = entry
            mean = means_and_counts[ip][0]
            count = means_and_counts[ip][1]
            # Use numpy for array-wise operations
            # TODO: Loading python for variance is probably not the best way to go
            # TODO: Feels like this can be a separate CombinePerKey
            variance = np.sum(np.square(np.array(entries) - mean)) / count
            return ip, np.sqrt(variance if variance > 0 else 0)

        def filter_by_stddev(entry, means_and_counts, std_devs):
            # Refraining from creating new variables during pipeline
            return entry[1] > \
                (means_and_counts[entry[0]][0] + std_devs[entry[0]])

        fields = (p
                  | 'ReadInputText' >> beam.io.ReadFromText(file_path)
                  | 'ParseLogs' >> NetworkUsage())

        # Combine mean and count to a single pipeline
        mean_and_count_per_key = beam.pvalue.AsDict(
            fields
            | 'GetMeanAndCountPerKey' >> beam.CombinePerKey(MeanAndCount()))

        # TODO: This part needs improvements
        std_dev_per_key = beam.pvalue.AsDict(
            fields
            | 'GroupByKey' >> beam.GroupByKey()
            | 'GetDifferencesPerKey' >> beam.Map(std_dev_map,
                                                 mean_and_count_per_key))

        return (fields
                | 'FilterZeroBytes' >> beam.Filter(lambda entry: entry[1] > 0)
                | 'FilterByStdDev' >> beam.Filter(
                    filter_by_stddev, mean_and_count_per_key, std_dev_per_key)
                | 'GetIPs' >> beam.Keys()
                | 'GetDistinctIPs' >> beam.Distinct())
def run(argv=None):
    """Main entry point"""
    parser = argparse.ArgumentParser()
    # parser.add_argument('--project', type=str, required=False, help='project')
    parser.add_argument(
        '--records',
        dest='records',
        type=int,
        # default='gs://dataflow-samples/shakespeare/kinglear.txt',
        default='10',  # gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt
        help='Number of records to be generated')
    parser.add_argument(
        '--output',
        dest='output',
        required=False,
        default='./',
        help='Output file to write results to.')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Store the CLI arguments to variables
    # project_id = known_args.project

    # Setup the dataflow pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = True
    # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    # google_cloud_options.project = project_id
    save_main_session = True
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    print(pipeline_args)

    with beam.Pipeline() as pipeline:
        total = (
            pipeline
            | 'Create plant counts' >> beam.Create([
                ('1', 3),
                ('1', 2),
                ('2', 1),
                ('3', 4),
                ('4', 5),
                ('4', 3),
            ])
            | 'Distinct' >> beam.Distinct()
            | beam.Map(print))
def expand(self, uri_to_content):
    # Compute the total number of documents, and prepare a singleton
    # PCollection to use as side input.
    total_documents = (
        uri_to_content
        | 'GetUris 1' >> beam.Keys()
        | 'GetUniqueUris' >> beam.Distinct()
        | 'CountUris' >> beam.combiners.Count.Globally())

    # Create a collection of pairs mapping a URI to each of the words
    # in the document associated with that URI.
    def split_into_words(uri_line):
        (uri, line) = uri_line
        return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

    uri_to_words = (
        uri_to_content | 'SplitWords' >> beam.FlatMap(split_into_words))

    # Compute a mapping from each word to the total number of documents
    # in which it appears.
    word_to_doc_count = (
        uri_to_words
        | 'GetUniqueWordsPerDoc' >> beam.Distinct()
        | 'GetWords' >> beam.Values()
        | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

    # Compute a mapping from each URI to the total number of words in the
    # document associated with that URI.
    uri_to_word_total = (
        uri_to_words
        | 'GetUris 2' >> beam.Keys()
        | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

    # Count, for each (URI, word) pair, the number of occurrences of that word
    # in the document associated with the URI.
    uri_and_word_to_count = (
        uri_to_words
        | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

    # Adjust the above collection from a mapping from (URI, word) pairs to
    # counts into an isomorphic mapping from URI to (word, count) pairs, to
    # prepare for a join by the URI key.
    def shift_keys(uri_word_count):
        return (uri_word_count[0][0], (uri_word_count[0][1], uri_word_count[1]))

    uri_to_word_and_count = (
        uri_and_word_to_count | 'ShiftKeys' >> beam.Map(shift_keys))

    # Perform a CoGroupByKey (a sort of pre-join) on the prepared
    # uri_to_word_total and uri_to_word_and_count tagged by 'word totals' and
    # 'word counts' strings. This yields a mapping from URI to a dictionary
    # that maps the above mentioned tag strings to an iterable containing the
    # word total for that URI and word and count respectively.
    #
    # A diagram (in which '[]' just means 'iterable'):
    #
    #   URI: {'word totals': [count],  # Total words within this URI's document.
    #         'word counts': [(word, count),  # Counts of specific words
    #                         (word, count),  # within this URI's document.
    #                         ...]}
    uri_to_word_and_count_and_total = (
        {
            'word totals': uri_to_word_total,
            'word counts': uri_to_word_and_count
        } | 'CoGroupByUri' >> beam.CoGroupByKey())

    # Compute a mapping from each word to a (URI, term frequency) pair for each
    # URI. A word's term frequency for a document is simply the number of times
    # that word occurs in the document divided by the total number of words in
    # the document.
    def compute_term_frequency(uri_count_and_total):
        (uri, count_and_total) = uri_count_and_total
        word_and_count = count_and_total['word counts']
        # We have an iterable for one element that we want extracted.
        [word_total] = count_and_total['word totals']
        for word, count in word_and_count:
            yield word, (uri, float(count) / word_total)

    word_to_uri_and_tf = (
        uri_to_word_and_count_and_total
        | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency))

    # Compute a mapping from each word to its document frequency.
    # A word's document frequency in a corpus is the number of
    # documents in which the word appears divided by the total
    # number of documents in the corpus.
    #
    # This calculation uses a side input, a Dataflow-computed auxiliary value
    # presented to each invocation of our MapFn lambda. The second argument to
    # the function (called total---note that the first argument is a tuple)
    # receives the value we listed after the lambda in Map(). Additional side
    # inputs (and ordinary Python values, too) can be provided to MapFns and
    # DoFns in this way.
    def div_word_count_by_total(word_count, total):
        (word, count) = word_count
        return (word, float(count) / total)

    word_to_df = (
        word_to_doc_count
        | 'ComputeDocFrequencies' >> beam.Map(
            div_word_count_by_total, AsSingleton(total_documents)))

    # Join the term frequency and document frequency collections,
    # each keyed on the word.
    word_to_uri_and_tf_and_df = (
        {
            'tf': word_to_uri_and_tf,
            'df': word_to_df
        } | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

    # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
    # There are a variety of definitions of TF-IDF
    # ("term frequency - inverse document frequency") score; here we use a
    # basic version that is the term frequency divided by the log of the
    # document frequency.
    def compute_tf_idf(word_tf_and_df):
        (word, tf_and_df) = word_tf_and_df
        [docf] = tf_and_df['df']
        for uri, tf in tf_and_df['tf']:
            yield word, (uri, tf * math.log(1 / docf))

    word_to_uri_and_tfidf = (
        word_to_uri_and_tf_and_df
        | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

    return word_to_uri_and_tfidf
def pipeline(config_map, dataset_config_map, preprocess_example_fn,
             input_tensors_to_example_fn):
    """Pipeline for dataset creation."""
    tf.flags.mark_flags_as_required(['output_directory'])

    pipeline_options = beam.options.pipeline_options.PipelineOptions(
        FLAGS.pipeline_options.split(','))

    config = config_map[FLAGS.config]
    hparams = config.hparams
    hparams.parse(FLAGS.hparams)

    datasets = dataset_config_map[FLAGS.dataset_config]

    if tf.gfile.Exists(FLAGS.output_directory):
        raise ValueError(
            'Output directory %s already exists!' % FLAGS.output_directory)
    tf.gfile.MakeDirs(FLAGS.output_directory)
    with tf.gfile.Open(
            os.path.join(FLAGS.output_directory, 'config.txt'), 'w') as f:
        f.write('\n\n'.join([
            'min_length: {}'.format(FLAGS.min_length),
            'max_length: {}'.format(FLAGS.max_length),
            'sample_rate: {}'.format(FLAGS.sample_rate),
            'preprocess_examples: {}'.format(FLAGS.preprocess_examples),
            'preprocess_train_example_multiplier: {}'.format(
                FLAGS.preprocess_train_example_multiplier),
            'config: {}'.format(FLAGS.config),
            'hparams: {}'.format(hparams.to_json(sort_keys=True)),
            'dataset_config: {}'.format(FLAGS.dataset_config),
            'datasets: {}'.format(datasets),
        ]))

    with beam.Pipeline(options=pipeline_options) as p:
        for dataset in datasets:
            if isinstance(dataset.path, (list, tuple)):
                # If dataset.path is a list, then it's a list of sources to mix
                # together to form new examples. First, do the mixing, then
                # pass the results to the rest of the pipeline.
                id_exs = []
                sourceid_to_exids = []
                for source_id, stem_path in enumerate(dataset.path):
                    if dataset.num_mixes is None:
                        raise ValueError(
                            'If path is a list, num_mixes must not be None: {}'
                            .format(dataset))
                    stem_p = p | 'tfrecord_list_%s_%d' % (
                        dataset.name, source_id) >> (
                            beam.Create(
                                data.generate_sharded_filenames(stem_path)))

                    # Note that we do not specify a coder when reading here.
                    # This is so that the hashing in key_example below can work
                    # directly on the serialized version instead of having to
                    # re-serialize it. Also, deserializing with a coder and
                    # then re-serializing does not always generate the same
                    # hash for the same example (likely due to the map fields
                    # in tf.train.Example). This is important when reading the
                    # same dataset multiple times to mix it with itself.
                    stem_p |= 'read_tfrecord_%s_%d' % (
                        dataset.name, source_id) >> (
                            beam.io.tfrecordio.ReadAllFromTFRecord())
                    stem_p |= 'shuffle_stems_%s_%d' % (
                        dataset.name, source_id) >> (beam.Reshuffle())

                    # Key all examples with a hash.
                    def key_example(ex):
                        return (hashlib.sha256(ex).hexdigest(), ex)

                    stem_p |= 'add_id_key_%s_%d' % (
                        dataset.name, source_id) >> (beam.Map(key_example))
                    id_exs.append(stem_p)

                    # Create a list of source_id to example id.
                    def sourceid_to_exid(id_ex, source_id):
                        return (source_id, id_ex[0])

                    sourceid_to_exids.append(
                        stem_p | 'key_%s_%d' % (dataset.name, source_id) >> (
                            beam.Map(sourceid_to_exid, source_id=source_id)))

                # ('example_hash', serialized_example)
                id_exs = (
                    id_exs
                    | 'id_exs_flatten_%s' % dataset.name >> beam.Flatten()
                    | 'id_exs_distinct_%s' % dataset.name >> beam.Distinct())

                # ('source_id', 'example_hash')
                sourceid_to_exids = (
                    sourceid_to_exids
                    | 'sourceid_to_exids_flatten_%s' % dataset.name >>
                    beam.Flatten())

                # Pass the list of source id to example IDs to generate_mixes,
                # which will create mixes by selecting random IDs from each
                # source (with replacement). This is represented as a list of
                # example IDs to Mix IDs.
                # Note: beam.Create([0]) is just a single dummy value to allow
                # the sourceid_to_exids to be passed in as a python list so we
                # can do the sampling with numpy.
                exid_to_mixids = (
                    p
                    | 'create_dummy_%s' % dataset.name >> beam.Create([0])
                    | 'generate_mixes_%s' % dataset.name >> beam.Map(
                        create_dataset_lib.generate_mixes,
                        num_mixes=dataset.num_mixes,
                        sourceid_to_exids=beam.pvalue.AsList(
                            sourceid_to_exids)))

                # Create a list of (Mix ID, Full Example proto). Note: Examples
                # may be present in more than one mix. Then, group by Mix ID.
                def mixid_to_exs(id_ex, exid_to_mixids):
                    exid, ex = id_ex
                    for mixid in exid_to_mixids[exid]:
                        yield mixid, ex

                mixid_exs = (
                    id_exs
                    | 'mixid_to_exs_%s' % dataset.name >> beam.FlatMap(
                        mixid_to_exs,
                        exid_to_mixids=beam.pvalue.AsSingleton(exid_to_mixids))
                    | 'group_by_key_%s' % dataset.name >> beam.GroupByKey())

                # Take these groups of Examples, mix their audio and sequences
                # to return a single new Example. Then, carry on with the rest
                # of the pipeline like normal.
                split_p = (
                    mixid_exs
                    | 'mix_examples_%s' % dataset.name >> beam.Map(
                        mix_examples, FLAGS.sample_rate,
                        FLAGS.load_audio_with_librosa))
            else:
                if dataset.num_mixes is not None:
                    raise ValueError(
                        'If path is not a list, num_mixes must be None: {}'
                        .format(dataset))
                split_p = p | 'tfrecord_list_%s' % dataset.name >> beam.Create(
                    data.generate_sharded_filenames(dataset.path))
                split_p |= 'read_tfrecord_%s' % dataset.name >> (
                    beam.io.tfrecordio.ReadAllFromTFRecord(
                        coder=beam.coders.ProtoCoder(tf.train.Example)))

            split_p |= 'shuffle_input_%s' % dataset.name >> beam.Reshuffle()
            split_p |= 'split_wav_%s' % dataset.name >> beam.FlatMap(
                split_wav,
                min_length=FLAGS.min_length,
                max_length=FLAGS.max_length,
                sample_rate=FLAGS.sample_rate,
                debug_output_directory=FLAGS.output_directory,
                split_example=dataset.process_for_training,
                load_audio_with_librosa=FLAGS.load_audio_with_librosa)
            if FLAGS.preprocess_examples:
                if dataset.process_for_training:
                    mul_name = 'preprocess_multiply_%dx_%s' % (
                        FLAGS.preprocess_train_example_multiplier, dataset.name)
                    split_p |= mul_name >> beam.FlatMap(
                        multiply_example,
                        FLAGS.preprocess_train_example_multiplier)
                split_p |= 'preprocess_%s' % dataset.name >> beam.Map(
                    preprocess_data, preprocess_example_fn,
                    input_tensors_to_example_fn, hparams,
                    dataset.process_for_training)
            split_p |= 'shuffle_output_%s' % dataset.name >> beam.Reshuffle()
            split_p |= 'write_%s' % dataset.name >> beam.io.WriteToTFRecord(
                os.path.join(FLAGS.output_directory,
                             '%s.tfrecord' % dataset.name),
                coder=beam.coders.ProtoCoder(tf.train.Example))
# This function computes the schema of the parquet file to be written,
# applying the column renaming to the original schema.
def getSchema():
    df_schema = pyarrow.Schema.from_pandas(
        pd.read_parquet(user_options.schema_source.get()))
    for (key, value) in ast.literal_eval(
            user_options.rename_columns.get()).items():
        df_schema = df_schema.set(
            df_schema.get_field_index(key),
            pyarrow.field(value,
                          df_schema.types[df_schema.get_field_index(key)]))
    return df_schema


# Read the source parquet files and compute the dictionary with the mapping of
# the columns to rename.
map_rename_cols = (
    p
    | "Read for rename cols" >> ReadFromParquet(user_options.url_raw)
    | "Map rename cols" >> beam.Map(mapRenameCols)
    | "Rename cols to string" >> beam.Map(str)
    | "Deduplicate elements" >> beam.Distinct())

# Read the data from the source files.
data = (p | "Read parquet for data" >> ReadFromParquet(user_options.url_raw))

# Apply the column-renaming function, taking the result of the previous step as
# the mapping.
rename_data = (data | "Rename columns" >> beam.Map(
    reColumns, rename_cols=AsList(map_rename_cols)))

# Write the data to the destination path, obtaining the schema from getSchema().
_ = (rename_data | "Write to storage TRN" >> WriteToParquet(
    user_options.url_trn, schema=getSchema(), file_name_suffix=".parquet"))

print("End Pipeline")
def run_pipeline(root, input_note_events, input_ratings, output_path, vocab,
                 section_markers, cur_augmentation_config):
    """Create beam pipeline to generate TF examples.

    Args:
      root: beam.Pipeline root.
      input_note_events: Path to csv of notes.
      input_ratings: Path to csv of ratings.
      output_path: Directory path to write output to.
      vocab: List of tokens in the vocabulary.
      section_markers: Dict of markers as accepted by note sectioning.
      cur_augmentation_config: AugmentationConfig dataclass instance, defines
        the kinds of augmentations to apply.
    """
    # Load and process ratings:
    raw_ratings = data_lib.read_raw_ratings(root, input_ratings)
    ratings = (raw_ratings
               | "GetLabels" >> beam.Map(data_lib.convert_ratings)
               | "GroupRatingsByNoteId" >> beam.GroupByKey()
               | "UnpackRatings" >> beam.Map(lambda x: (x[0], list(x[1]))))

    # Load and process notes:
    notes = data_lib.read_filter_notes(root, input_note_events)

    note_partitions = (
        raw_ratings
        | "PartitionMap" >> (beam.Map(
            lambda x: (str(x.note_id), x.partition))).with_output_types(
                Tuple[str, str])
        | "DedupPartitionMap" >> beam.Distinct())

    # Join.
    non_rated_notes, rated_notes = (
        {
            "ratings": ratings,
            "notes": notes,
            "note_partition": note_partitions
        }
        | "Join" >> beam.CoGroupByKey().with_output_types(
            Tuple[str, Dict[str, Any]])
        | "SplitRated" >> beam.Partition(
            lambda x, n_part: int(bool(x[1]["ratings"])), 2))

    # Downsample non-rated.
    non_rated_notes = data_lib.downsample(non_rated_notes, _N_DOWNSAMPLE.value,
                                          _RANDOM_SEED.value)

    # Process notes.
    features_and_labels = (
        (non_rated_notes, rated_notes)
        | beam.Flatten()
        | "ReshuffleJoin" >> beam.Reshuffle()
        | "ProcessAPData" >> beam.ParDo(data_lib.ProcessAPData(),
                                        section_markers)
        | "FilterAPData" >> beam.Filter(data_lib.filter_by_labels)
        | "ReshuffleForSubjectId" >> beam.Reshuffle()
        | "RekeyBySubjectId" >> beam.Map(lambda x: (x[1].subject_id, x[1]))
        | "GroupBySubjectId" >> beam.GroupByKey()
        | "OneNoteIdPerRatedSubjectId" >> beam.ParDo(
            data_lib.OneNoteIdPerRatedSubjectId(), seed=_RANDOM_SEED.value)
        | "RekeyByNoteId" >> beam.Map(lambda x: (x.note_id, x))
        | "ApplyAugmentations" >> beam.ParDo(data_lib.ApplyAugmentations(),
                                             cur_augmentation_config,
                                             _RANDOM_SEED.value)
        | "GetFeaturesAndLabels" >> beam.ParDo(
            data_lib.ProcessFeaturesAndLabels(vocab, _MAX_SEQ_LENGTH.value))
        | "ReshuffleFeaturesAndLabels" >> beam.Reshuffle())

    # Convert and save tf examples:
    data_lib.convert_and_save_tf_examples(features_and_labels, output_path,
                                          _DEBUG_OUTPUT.value)
pipeline_options = PipelineOptions(
    # runner='DataflowRunner',
    project='gcp-nyc',
    # job_name='unique-job-name',
    temp_location=f'gs://{BUCKET}/temp',
    region='us-central1')

with beam.Pipeline(options=pipeline_options) as p:
    # Read the text file[pattern] into a PCollection.
    read_gbq = p | 'Read' >> beam.io.Read(
        beam.io.BigQuerySource(query=QUERY, use_standard_sql=True))

    aliquot_count = (
        read_gbq
        | 'ExtractAliquot' >> beam.Map(lambda elt: elt['aliquot_barcode'])
        | 'Deduplicate' >> beam.Distinct()
        | 'ReplaceWithOnes' >> beam.Map(lambda elt: 1)
        | 'Sum' >> beam.CombineGlobally(sum))

    cpg_observation_count = (
        read_gbq
        | 'ExtractCpG' >> beam.Map(lambda elt: (elt['CpG_probe_id'], 1))
        | 'CombineCpG' >> beam.CombinePerKey(sum))

    # counts = (
    #     lines
    #     | 'Split' >>
    #     (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
    #     | 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
    #     | 'GroupAndSum' >> beam.CombinePerKey(sum))
def expand(self, pcoll):
    return (pcoll
            | beam.Distinct()
            | beam.Map(lambda r: (None, r))
            | beam.GroupByKey()
            | beam.ParDo(_PairWithIndexNumberDoFn()))
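The _PairWithIndexNumberDoFn implementation is not shown above. A minimal sketch of a DoFn in that spirit, which enumerates the distinct values gathered under the single None key, might look like the following; the behavior here is an assumption, not the original code.

import apache_beam as beam


class _PairWithIndexNumberDoFn(beam.DoFn):
    """Hypothetical sketch: assign a sequential index to each distinct value.

    After the GroupByKey above, each element is (None, iterable of distinct
    values), so enumerating the iterable yields (value, index) pairs.
    """

    def process(self, element):
        _unused_key, values = element
        for index, value in enumerate(values):
            yield value, index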
def run(in_pcol, job_config):
    # load 5 seconds of audio and get STFT
    stft = (in_pcol
            | aio.GcsLoadBinary()
            | audio.LoadAudio(offset=10, duration=5)
            | audio.GetSTFT())

    # get magnitude of audio
    magnitude = (stft
                 | "Get magnitude" >> beam.ParDo(
                     transforms.GetMagnitude()).with_outputs())

    # map the result to a key (the KlioMessage element)
    # so we can group all results by key
    magnitude_key = (
        magnitude.spectrogram
        | "element to spec" >> beam.Map(transforms.create_key_from_element))

    # get nearest neighbors and map the result to a key (the KlioMessage element)
    nn_filter = (
        magnitude.spectrogram
        | "Get nn filter" >> beam.ParDo(transforms.FilterNearestNeighbors())
        | "element to filter" >> beam.Map(transforms.create_key_from_element))

    # map together the full magnitude with its filter by key (the KlioMessage element)
    merge = ({
        "full": magnitude_key,
        "nnfilter": nn_filter
    } | "merge" >> beam.CoGroupByKey())

    # calc the difference between full magnitude and the filter
    net = merge | beam.Map(transforms.subtract_filter_from_full)

    # create a mask from the filter minus the difference of full & filter
    first_mask = ({
        "first": nn_filter,
        "second": net,
        "full": magnitude_key
    }
                  | "first mask group" >> beam.CoGroupByKey()
                  | "first mask" >> beam.ParDo(
                      transforms.GetSoftMask(margin=2)))

    # create another mask from the difference of full & filter minus the filter
    second_mask = ({
        "first": net,
        "second": nn_filter,
        "full": magnitude_key
    }
                   | "second mask group" >> beam.CoGroupByKey()
                   | "second mask" >> beam.ParDo(
                       transforms.GetSoftMask(margin=10)))

    # plot the full magnitude spectrogram
    magnitude_out = (magnitude.spectrogram
                     | "full spec" >> audio.GetSpec()
                     | "plot full spec" >> audio.SpecToPlot(
                         title="Full Spectrogram for {element}", y_axis="log")
                     | "save full" >> aio.GcsUploadPlot(suffix="-full"))

    # plot the first mask (background) spectrogram
    background_out = (
        first_mask
        | "background spec" >> audio.GetSpec()
        | "plot background spec" >> audio.SpecToPlot(
            title="Background Spectrogram for {element}", y_axis="log")
        | "save background" >> aio.GcsUploadPlot(suffix="-background"))

    # plot the second mask (foreground) spectrogram
    foreground_out = (
        second_mask
        | "foreground spec" >> audio.GetSpec()
        | "plot foreground spec" >> audio.SpecToPlot(
            title="Foreground Spectrogram for {element}", y_axis="log")
        | "save foreground" >> aio.GcsUploadPlot(suffix="-foreground"))

    # flatten all outputs into one PCollection, then remove duplicates
    out_pcol = ((magnitude_out, background_out, foreground_out)
                | "flatten output paths" >> beam.Flatten()
                | "remove dups" >> beam.Distinct())

    return out_pcol
def run(argv=None):
    """Main entry point"""
    parser = argparse.ArgumentParser()
    # parser.add_argument('--project', type=str, required=False, help='project')
    parser.add_argument(
        '--records',
        dest='records',
        type=int,
        # default='gs://dataflow-samples/shakespeare/kinglear.txt',
        default='10',  # gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt
        help='Number of records to be generated')
    parser.add_argument(
        '--output',
        dest='output',
        required=False,
        default='./',
        help='Output file to write results to.')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Store the CLI arguments to variables
    # project_id = known_args.project

    # Setup the dataflow pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = True
    # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    # google_cloud_options.project = project_id
    save_main_session = True
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    print(pipeline_args)

    SCHEMA = {
        "namespace": "example.avro",
        "type": "record",
        "name": "User",
        "fields": [
            {"name": "ACNO", "type": ["null", {
                "logicalType": "char", "type": "string", "maxLength": 20}]},
            {"name": "NUM_OF_MTHS_PD_30", "type": ["null", 'int', 'string']},
            {"name": "FIELD_1", "type": ["null", {
                "logicalType": "char", "type": "float", "maxLength": 20}]},
            {"name": "FIELD_2", "type": ["null", {
                "logicalType": "char", "type": "float", "maxLength": 20}]},
            {"name": "FIELD_3", "type": ["null", {
                "logicalType": "char", "type": "float", "maxLength": 20}]},
            {"name": "FIELD_4", "type": ["null", {
                "logicalType": "char", "type": "float", "maxLength": 20}]},
            {"name": "FIELD_5", "type": ["null", {
                "logicalType": "char", "type": "float", "maxLength": 20}]},
            {"name": "FIELD_6", "type": ["null", {
                "logicalType": "char", "type": "float", "maxLength": 20}]},
            {"name": "FIELD_7", "type": ["null", {
                "logicalType": "char", "type": "float", "maxLength": 20}]},
            {"name": "FIELD_8", "type": ["null", {
                "logicalType": "char", "type": "float", "maxLength": 20}]},
            {"name": "FIELD_9", "type": ["null", {
                "logicalType": "char", "type": "float", "maxLength": 20}]},
            {"name": "FIELD_10", "type": ["null", {
                "logicalType": "char", "type": "float", "maxLength": 20}]},
        ]
    }

    rec_cnt = known_args.records
    with beam.Pipeline(options=pipeline_options) as p:
        left_pcol_name = 'p1'
        file = p | 'read_source' >> beam.io.ReadFromAvro(
            "./data/Curr_account.avro") | beam.Distinct()
        file2 = p | 'read_source2' >> beam.io.ReadFromAvro(
            "./data/Prev_account.avro")

        p1 = file | 'filter fields' >> beam.Filter(
            lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0)
        p2 = file2 | 'filter fields2' >> beam.Filter(
            lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0)
        # P1_1 = p1 | "write" >> beam.io.WriteToText('./data.csv')
        # P2_2 = p2 | "write2" >> beam.io.WriteToText('./data2.csv')

        right_pcol_name = 'p2'

        join_keys = {
            left_pcol_name: [
                'ACNO'
                # 't1_col_B'
            ],
            right_pcol_name: [
                'ACNO'
                # 't2_col_B'
            ]
        }

        pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2}
        test_pipeline = pipelines_dictionary | 'left join' >> Join(
            left_pcol_name=left_pcol_name,
            left_pcol=p1,
            right_pcol_name=right_pcol_name,
            right_pcol=p2,
            join_type='left',
            join_keys=join_keys)

        test_pipeline | 'add 1 to NUM_OF_MTHS_PD_30' >> beam.Map(
            add_one) | "write4" >> beam.io.WriteToText('./data4.csv')
        print(type(test_pipeline))

        compressIdc = True
        use_fastavro = True
        # test_pipeline | 'write_fastavro' >>
        WriteToAvro(
            known_args.output,
            parse_schema(SCHEMA),
            use_fastavro=use_fastavro,
            file_name_suffix='.avro',
            codec=('deflate' if compressIdc else 'null'),
        )

    result = p.run()
    result.wait_until_finish()
def dataflow_pipeline_run(BUCKET_NAME, options):
    with beam.Pipeline(options=options) as p:
        picks = p | 'ReadPickups' >> beam.io.ReadFromText(
            'gs://{}/pickups*.csv'.format(BUCKET_NAME))
        depls = p | 'ReadDeployments' >> beam.io.ReadFromText(
            'gs://{}/deployments*.csv'.format(BUCKET_NAME))
        rides = p | 'ReadRides' >> beam.io.ReadFromText(
            'gs://{}/rides*.csv'.format(BUCKET_NAME))

        (picks
         | 'PickupsCountBeforeDedup' >> beam.combiners.Count.Globally()
         | 'PickupsNameIt1' >> beam.Map(lambda x: (x, 'PickupsBeforeDedup'))
         | 'PickupsPrintCount1' >> beam.io.WriteToText(
             'gs://{}/etl_logs/pickups_before_dedup.txt'.format(BUCKET_NAME),
             num_shards=1,
             shard_name_template=""))
        (depls
         | 'DeploymentsCountBeforeDedup' >> beam.combiners.Count.Globally()
         | 'DeploymentsNameIt1' >> beam.Map(
             lambda x: (x, 'DeploymentsBeforeDedup'))
         | 'DeploymentsPrintCount1' >> beam.io.WriteToText(
             'gs://{}/etl_logs/deployments_before_dedup.txt'.format(
                 BUCKET_NAME),
             num_shards=1,
             shard_name_template=""))
        (rides
         | 'RidesCountBeforeDedup' >> beam.combiners.Count.Globally()
         | 'RidesNameIt1' >> beam.Map(lambda x: (x, 'RidesBeforeDedup'))
         | 'RidesPrintCount1' >> beam.io.WriteToText(
             'gs://{}/etl_logs/rides_before_dedup.txt'.format(BUCKET_NAME),
             num_shards=1,
             shard_name_template=""))

        picks_dedup = picks | 'DeDupPickups' >> beam.Distinct()
        depls_dedup = depls | 'DeDupDeployments' >> beam.Distinct()
        rides_dedup = rides | 'DeDupRides' >> beam.Distinct()

        (picks_dedup
         | 'PickupsCountAfterDedup' >> beam.combiners.Count.Globally()
         | 'PickupsNameIt2' >> beam.Map(lambda x: (x, 'PickupsAfterDedup'))
         | 'PickupsPrintCount2' >> beam.io.WriteToText(
             'gs://{}/etl_logs/pickups_after_dedup.txt'.format(BUCKET_NAME),
             num_shards=1,
             shard_name_template=""))
        (depls_dedup
         | 'DeploymentsCountAfterDedup' >> beam.combiners.Count.Globally()
         | 'DeploymentsNameIt2' >> beam.Map(
             lambda x: (x, 'DeploymentsAfterDedup'))
         | 'DeploymentsPrintCount2' >> beam.io.WriteToText(
             'gs://{}/etl_logs/deployments_after_dedup.txt'.format(BUCKET_NAME),
             num_shards=1,
             shard_name_template=""))
        (rides_dedup
         | 'RidesCountAfterDedup' >> beam.combiners.Count.Globally()
         | 'RidesNameIt2' >> beam.Map(lambda x: (x, 'RidesAfterDedup'))
         | 'RidesPrintCount2' >> beam.io.WriteToText(
             'gs://{}/etl_logs/rides_after_dedup.txt'.format(BUCKET_NAME),
             num_shards=1,
             shard_name_template=""))

        picks_dedup | 'WritePickups' >> beam.io.WriteToText(
            'gs://{}/final_pickups.csv'.format(BUCKET_NAME),
            num_shards=1,
            shard_name_template="")
        depls_dedup | 'WriteDeployments' >> beam.io.WriteToText(
            'gs://{}/final_deployments.csv'.format(BUCKET_NAME),
            num_shards=1,
            shard_name_template="")
        rides_dedup | 'WriteRides' >> beam.io.WriteToText(
            'gs://{}/final_rides.csv'.format(BUCKET_NAME),
            num_shards=1,
            shard_name_template="")