def expand(self, pcoll): top_k = self._spec.top_k frequency_threshold = self._spec.frequency_threshold assert top_k is None or top_k >= 0 assert frequency_threshold is None or frequency_threshold >= 0 # Creates a PCollection of (count, element) pairs, then iterates over # this to create a single element PCollection containing this list of # pairs in sorted order by decreasing counts (and by values for equal # counts). counts = ( pcoll | 'FlattenValueToList' >> beam.Map(_flatten_value_to_list) | 'CountWithinList' >> # Specification of with_output_types allows for combiner optimizations. (beam.FlatMap(lambda lst: six.iteritems(collections.Counter(lst))). with_output_types(KV[common.PRIMITIVE_TYPE, int])) | 'CountGlobally' >> beam.CombinePerKey(sum)) counts = (counts | 'FilterProblematicStrings' >> beam.Filter(lambda kv: kv[ 0] and '\n' not in kv[0] and '\r' not in kv[0]) | 'SwapElementsAndCounts' >> beam.KvSwap()) # Filter is cheaper than TopK computation and the two commute, so # filter first. if frequency_threshold is not None: counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold >> beam.Filter(lambda kv: kv[0] >= frequency_threshold)) if top_k is not None: counts = (counts | 'Top(%s)' % top_k >> beam.transforms.combiners.Top.Largest(top_k) | 'FlattenList' >> beam.FlatMap(lambda lst: lst)) # Performance optimization to obviate reading from finely sharded files # via AsIter. By forcing all data into a single group we end up reading # from a single file. # @beam.ptransform_fn def Reshard(pcoll): # pylint: disable=invalid-name return (pcoll | 'PairWithNone' >> beam.Map(lambda x: (None, x)) | 'GroupByNone' >> beam.GroupByKey() | 'ExtractValues' >> beam.FlatMap(lambda x: x[1])) counts |= 'ReshardToOneGroup' >> Reshard() # pylint: disable=no-value-for-parameter # Using AsIter instead of AsList below in order to reduce max memory # usage (due to AsList caching). def order_by_decreasing_counts(ignored, counts_iter, store_frequency): """Sort the vocabulary by frequency count.""" del ignored counts = list(counts_iter) if not counts: counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')] counts.sort(reverse=True) # Largest first. if store_frequency: # Returns ['count1 element1', ... ] return [ '{} {}'.format(count, element) for count, element in counts ] else: return [element for _, element in counts] vocabulary_file = os.path.join(self._temp_assets_dir, self._spec.vocab_filename) vocab_is_written = (pcoll.pipeline | 'Prepare' >> beam.Create([None]) | 'OrderByDecreasingCounts' >> beam.FlatMap( order_by_decreasing_counts, counts_iter=beam.pvalue.AsIter(counts), store_frequency=self._spec.store_frequency) | 'WriteToFile' >> beam.io.WriteToText( vocabulary_file, shard_name_template='')) # Return the vocabulary path. wait_for_vocabulary_transform = ( pcoll.pipeline | 'CreatePath' >> beam.Create([[vocabulary_file]]) # Ensure that the analysis returns only after the file is written. | 'WaitForVocabularyFile' >> beam.Map( lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written))) return wait_for_vocabulary_transform
def expand(self, pcoll):
    return pcoll | "IsAuction" >> beam.Filter(is_auction)
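# A minimal, self-contained sketch (not from the original source) of how a
# Filter-wrapping PTransform like the one above is typically applied. The class
# and predicate names here (JustAuctions, is_auction) are illustrative
# assumptions; only the beam.Filter / expand() pattern comes from the snippet.
import apache_beam as beam


def is_auction(event):
    # Hypothetical predicate: keep only events tagged as auctions.
    return event.get('type') == 'auction'


class JustAuctions(beam.PTransform):
    def expand(self, pcoll):
        return pcoll | "IsAuction" >> beam.Filter(is_auction)


if __name__ == '__main__':
    with beam.Pipeline() as p:
        _ = (p
             | beam.Create([{'type': 'auction', 'id': 1}, {'type': 'bid', 'id': 2}])
             | JustAuctions()
             | beam.Map(print))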
        fields["DEP_AIRPORT_LON"] = airport_timezones[dep_airport_id][1]
        fields["DEP_AIRPORT_TZOFFSET"] = deptz
        fields["ARR_AIRPORT_LAT"] = airport_timezones[arr_airport_id][0]
        fields["ARR_AIRPORT_LON"] = airport_timezones[arr_airport_id][1]
        fields["ARR_AIRPORT_TZOFFSET"] = arrtz
        yield json.dumps(fields)
    except KeyError as e:
        logging.exception(" Ignoring " + line + " because airport is not known")


if __name__ == '__main__':
    with beam.Pipeline('DirectRunner') as pipeline:
        airports = (pipeline
                    | 'airports:read' >> beam.io.ReadFromText('airports.csv.gz')
                    | beam.Filter(lambda line: "United States" in line)
                    | 'airports:fields' >> beam.Map(lambda line: next(csv.reader([line])))
                    | 'airports:tz' >> beam.Map(
                        lambda fields: (fields[0], addtimezone(fields[21], fields[26]))))

        flights = (pipeline
                   | 'flights:read' >> beam.io.ReadFromText('flights_sample.json')
                   | 'flights:tzcorr' >> beam.FlatMap(tz_correct, beam.pvalue.AsDict(airports)))

        flights | beam.io.textio.WriteToText('all_flights')
def expand(self, inputs): pcoll, = inputs if self._top_k is not None and self._top_k < 0: raise ValueError( 'top_k for VocabularyImpl should be >= 0 or None, got ' '{}.'.format(self._top_k)) if self._frequency_threshold is not None and self._frequency_threshold < 0: raise ValueError( 'frequency_threshold for VocabularyImpl should be >= 0 or None, ' 'got {}.'.format(self._frequency_threshold)) # Create a PCollection of (count, element) pairs, then iterates over # this to create a single element PCollection containing this list of # pairs in sorted order by decreasing counts (and by values for equal # counts). def is_problematic_string(kv): string, _ = kv # Ignore counts. return string and b'\n' not in string and b'\r' not in string if (self._vocab_ordering_type == tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION): flatten_map_fn = ( _flatten_positive_label_weights_total_weights_and_counts) # count_and_means is a pcollection that contains a # _CountAndWeightsMeansAccumulator where: # `weighted_mean` is the weighted mean of positive labels # for all features. # `count` is the count for all features. # `weights_mean` is the mean of the weights for all features. count_and_means = ( pcoll | 'SumBatchCountAndWeightsMeans' >> beam.Map(_count_and_means) | 'ComputeCountAndWeightsMeansGlobally' >> beam.CombineGlobally(CountAndWeightsMeansCombineFn())) # CountAndWeightsMeansCombineFn returns a tuple of the form: # (feature,_CountAndWeightsMeansAccumulator) where: # `feature` is a single string, which is the word in the vocabulary # whose mutual information with the label is being computed. # `weighted_mean` is the weighted mean of y positive given x. # `count` is the count of weights for a feature. # `weights_mean` is the mean of the weights for a feature. combine_transform = ( 'ComputeCountAndWeightsMeansPerUniqueWord' >> beam.CombinePerKey(CountAndWeightsMeansCombineFn()) | 'CalculateMutualInformationPerUniqueWord' >> beam.Map( _calculate_mutual_information, global_accumulator=beam.pvalue.AsSingleton( count_and_means), use_adjusted_mutual_info=self._use_adjusted_mutual_info)) elif (self._vocab_ordering_type == tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY): flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples combine_transform = beam.CombinePerKey(sum) else: flatten_map_fn = _flatten_value_to_list combine_transform = beam.combiners.Count.PerElement() raw_counts = ( pcoll | 'FlattenStringsAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn) | 'CountPerString' >> combine_transform | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string) | 'SwapStringsAndCounts' >> beam.KvSwap()) counts = ( raw_counts | 'ApplyFrequencyThresholdAndTopK' >> ( _ApplyFrequencyThresholdAndTopK( # pylint: disable=no-value-for-parameter self._frequency_threshold, self._top_k))) return counts | 'WriteVocabFile' >> ( _WriteVocabFile( # pylint: disable=no-value-for-parameter self._base_temp_dir, self._vocab_filename, self._store_frequency))
    'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP'
]


def make_string(array):
    # Format each (playerID, value) tuple as "playerID,value" (rounded to 2 places).
    return ['{},{}'.format(tup[0], round(tup[1], 2)) for tup in array]


print('\n-----Starting Pipeline-----\n\n')

pipeline = beam.Pipeline('DirectRunner')

(pipeline
 | beam.io.ReadFromText('headless_battingext.csv')
 | beam.Map(lambda line: next(csv.reader([line])))
 | beam.Map(lambda d_array: dict(zip(header, d_array)))
 | beam.Map(lambda d_dict: (d_dict['playerID'], int(d_dict['HR'])))
 | beam.combiners.Count.PerKey()
 | beam.Filter(lambda d_tup: int(d_tup[1]) >= 20)
 | beam.combiners.ToList()
 | beam.Map(lambda tup: sorted(tup, key=lambda tup: tup[1], reverse=True))
 | beam.Map(make_string)
 | beam.Map(lambda t_array: ['playerID,SEASONS'] + t_array)
 | beam.FlatMap(lambda x: x)
 | beam.io.WriteToText('output', num_shards=1))

result = pipeline.run()
result.wait_until_finish()

print('\n\n-----Ending Pipeline-----\n')
def pipeline(root): """Beam pipeline. Args: root: the root of the pipeline. """ stage1_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage1') stage2_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage2') # Create a collection of conformers with duplicate information equivalent_files = gfile.glob(FLAGS.input_equivalent_glob) equivalent_conformers = ( root | 'CreateEquivInputs' >> beam.Create(equivalent_files) | 'ParseEquiv' >> beam.FlatMap(parse_equivalent_file)) # Merge by bond_topology_id merged_results = ( (stage1_matched_conformers, stage2_matched_conformers, equivalent_conformers) | 'FlattenAllConformers' >> beam.Flatten() | 'GroupByCID' >> beam.GroupBy(lambda c: c.conformer_id) | 'MergeConformers' >> beam.ParDo(MergeConformersFn()).with_outputs( MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT, main='conformers')) merged_conformers = merged_results['conformers'] # Write out the merge conflicts _ = (merged_results[MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT] | 'ConflictsCSVFormat' >> beam.Map(csv_format) | 'ConflictsReshuffle' >> beam.Reshuffle() | 'WriteConflictsCSV' >> beam.io.WriteToText( FLAGS.output_stem + '_conflicts', header=csv_format(smu_utils_lib.MERGE_CONFLICT_FIELDS), num_shards=1, file_name_suffix='.csv')) # Get the bond length distributions unused_bond_length_dists_pcoll = ( merged_conformers | 'FilterForBondLengths' >> beam.Filter( smu_utils_lib.should_include_in_standard) | 'ExtractBondLengths' >> beam.FlatMap( extract_bond_lengths, dist_sig_digits=3, unbonded_max=2.0) | 'CountBondLengths' >> beam.combiners.Count.PerElement() | 'ToListBondLengths' >> beam.combiners.ToList() | 'WriteBondLengths' >> beam.ParDo( write_bond_lengths, filename=f'{FLAGS.output_stem}_bond_lengths.csv')) # Various per conformer processing update_results = ( merged_conformers | 'UpdateConformers' >> beam.ParDo(UpdateConformerFn()).with_outputs( UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH, main='conformers')) updated_conformers = update_results['conformers'] # Output SMILES mismatches _ = ( update_results[UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH] | 'ReshuffleSmilesOutput' >> beam.Reshuffle() | 'SmilesCSVFormat' >> beam.Map(csv_format) | 'WriteSmilesCSV' >> beam.io.WriteToText( FLAGS.output_stem + '_smiles_compare', header= 'conformer_id,compare,smiles_given,smiles_with_h,smiles_without_h', num_shards=1, file_name_suffix='.csv')) # Process duplicate information final_conformers = ( updated_conformers | 'KeyedForDuplicates' >> beam.FlatMap(generate_keyed_conformers_for_duplicates) | 'DupGroupByKey' >> beam.GroupByKey() | 'MergeDupInfo' >> beam.MapTuple(merge_duplicate_information)) # Pull the stats of various sorts write to a file _ = (final_conformers | 'ExtractStats' >> beam.FlatMap(conformer_to_stat_values) | 'CountStats' >> beam.combiners.Count.PerElement() | 'StatsCSVFormat' >> beam.MapTuple(lambda x, c: f'{x[0]},{x[1]},{c}') | 'WriteStatsCSV' >> beam.io.WriteToText( FLAGS.output_stem + '_stats', header='primary_key,secondary_key,count', num_shards=1, file_name_suffix='.csv')) # Generate the summary by bond topology. 
    bare_bt_summaries = (
        root
        | 'BondTopologyInput' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateBareBTSummaries' >>
        beam.FlatMap(bond_topology_summaries_from_csv))
    real_bt_summaries = (
        final_conformers
        | 'GenerateBTSummaries' >> beam.FlatMap(to_keyed_bond_topology_summary))
    _ = ((bare_bt_summaries, real_bt_summaries)
         | 'FlattenAllBTSummaries' >> beam.Flatten()
         | 'FinishBTSummary' >> CombineAndWriteBondTopologySummary())

    # Make the filtered versions of the dataset.
    complete_conformers = (
        final_conformers
        | 'MakeComplete' >> beam.Map(make_complete_conformer))
    standard_conformers = (
        final_conformers
        | 'MakeStandard' >> beam.FlatMap(make_standard_conformer))

    # Write the complete and standard conformers as binary protobuf in TFRecord.
    for id_str, collection in [['complete', complete_conformers],
                               ['standard', standard_conformers]]:
        _ = (collection
             | ('TFRecordReshuffle_' + id_str) >> beam.Reshuffle()
             | ('WriteTFRecord_' + id_str) >> beam.io.tfrecordio.WriteToTFRecord(
                 f'{FLAGS.output_stem}_{id_str}_tfrecord',
                 coder=beam.coders.ProtoCoder(dataset_pb2.Conformer),
                 num_shards=FLAGS.output_shards))

    # Write the complete and standard conformers as JSON.
    # Bit of a hack here: the slowest part of the whole pipeline is writing out
    # the JSON for the complete conformers, so we hard code a tripling of the
    # shards to get more parallelism.
    for id_str, collection, num_shards in [
        ['complete', complete_conformers, FLAGS.output_shards * 3],
        ['standard', standard_conformers, FLAGS.output_shards]]:
        _ = (collection
             | ('JSONReshuffle_' + id_str) >> beam.Reshuffle()
             | ('ToJSON_' + id_str) >> beam.Map(conformer_to_json)
             | ('WriteJSON_' + id_str) >> beam.io.WriteToText(
                 f'{FLAGS.output_stem}_{id_str}_json',
                 num_shards=num_shards,
                 file_name_suffix='.json.gz'))
# Set up options for the pipeline.
options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
custom_options = options.view_as(CustomPipelineOptions)
google_cloud_options.project = 'freightwaves-engineering-prod'
google_cloud_options.job_name = f"clean-intra-bk-{datetime.now().strftime('%Y%m%d%H%M%S')}"
google_cloud_options.staging_location = 'gs://fw-etl-tmp-prod/'
google_cloud_options.temp_location = 'gs://fw-etl-tmp-prod/'
options.view_as(StandardOptions).runner = 'DataflowRunner'
# options.view_as(StandardOptions).runner = 'DirectRunner'

# Create the pipeline object.
p = beam.Pipeline(options=options)

# Define the pipeline steps.
out = (
    p
    | "Input" >> beam.io.ReadFromText(
        f"gs://fw-etl-raw-prod/inttra/{custom_options.file_to_clean}")
    | "Remove Invalid Imos" >> beam.Filter(is_valid_imo)
    | "Remove Empty Strings" >> beam.Map(replace_empty_str)
    | "Output" >> beam.io.WriteToText(
        f"gs://fw-etl-load-prod/inttra/{custom_options.file_to_clean}",
        shard_name_template='')
    # | beam.Map(print)
)

# Run the pipeline.
result = p.run()
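# The snippet above references a CustomPipelineOptions class that is not shown.
# A minimal sketch of how such a class is commonly defined (the --file_to_clean
# flag name is taken from the usage above; the help text is an illustrative
# assumption):
from apache_beam.options.pipeline_options import PipelineOptions


class CustomPipelineOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        # Custom flag consumed above as custom_options.file_to_clean.
        parser.add_argument(
            '--file_to_clean',
            type=str,
            help='Name of the raw file to clean, relative to the inttra/ prefix.')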
def test_row_coder_in_pipeline(self):
    with TestPipeline() as p:
        res = (p
               | beam.Create(self.PEOPLE)
               | beam.Filter(lambda person: person.name == "Jon Snow"))
        assert_that(res, equal_to([self.JON_SNOW]))
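# For context, a self-contained sketch of the fixtures the test above relies on
# (PEOPLE, JON_SNOW, and a schema'd Person type). The exact field set is an
# assumption; the point being illustrated is the NamedTuple-plus-RowCoder
# registration pattern.
import typing

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


class Person(typing.NamedTuple):
    name: str
    age: int


# Registering RowCoder lets Beam encode Person elements as schema'd rows.
beam.coders.registry.register_coder(Person, beam.coders.RowCoder)

JON_SNOW = Person(name="Jon Snow", age=23)
PEOPLE = [JON_SNOW, Person(name="Daenerys Targaryen", age=25)]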
        element['company_name'] = 'default-name-' + element['company_id']
        # Async return: emit the combined name/id key.
        yield (element['company_name'] + '_' + element['company_id'])


main = (
    p
    | 'data source ' >> beam.io.ReadFromMongoDB(
        uri='mongodb://localhost:27017',
        db='conekta',
        coll='data_stagin',
        projection={
            'company_name': 1,
            'company_id': 1
        }))

prov = (main
        | 'filter by company identifier' >> beam.Filter(lambda row: len(row['company_id']) > 24)
        | 'prepare data' >> beam.ParDo(PrepareDataProv())
        | 'count per provider' >> beam.combiners.Count.PerElement()
        | 'split unique fields' >> beam.Map(lambda row: row[0].split('_'))
        | 'prepare the record' >> beam.Map(lambda row: {
            'id': row[1],
            'company_name': row[0]
        })
        | 'print' >> beam.Map(imprime)
        | 'Writing to DB table' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config))

p.run().wait_until_finish()
def transform_data(train_data_file, eval_data_file, transformed_train_data_base, transformed_eval_data_base, transformed_metadata_dir): """Transform the cleaned data and write out as a TFRecord of Example protos. Read in the cleaned data using the CSV reader, and transform it using a preprocessing pipeline that scales numeric data and coverts categorical data from strings to int64 values indices, by creating a vocabulary for each category. Args: train_data_file: File containing training data eval_data_file: File containing evaluation data transformed_train_data_base: Base filename for transformed training data shards transformed_eval_data_base: Base filename for cleaned evaluation data shards transformed_metadata_dir: Directory where metadata for transformed data should be written. """ raw_data_schema = { key: dataset_schema.ColumnSchema( dataset_schema.LogicalColumnSchema( dataset_schema.Domain(tf.string), dataset_schema.LogicalShape([])), dataset_schema.FixedColumnRepresentation()) for key in CATEGORICAL_COLUMNS } raw_data_schema.update({ key: dataset_schema.ColumnSchema( dataset_schema.LogicalColumnSchema( dataset_schema.Domain(tf.float32), dataset_schema.LogicalShape([])), dataset_schema.FixedColumnRepresentation()) for key in NUMERIC_COLUMNS }) raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema( dataset_schema.LogicalColumnSchema(dataset_schema.Domain(tf.string), dataset_schema.LogicalShape([])), dataset_schema.FixedColumnRepresentation()) raw_data_schema = dataset_schema.Schema(raw_data_schema) raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema) def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" outputs = {} # Scale numeric columns to have range [0, 1]. for key in NUMERIC_COLUMNS: outputs[key] = tft.scale_to_0_1(inputs[key]) # For all categorical columns except the label column, we use # tft.string_to_int which computes the set of unique values and uses this # to convert the strings to indices. for key in CATEGORICAL_COLUMNS: outputs[key] = tft.string_to_int(inputs[key]) # Update outputs of both kinds to convert from shape (batch,), i.e. a batch # of scalars, to shape (batch, 1), i.e. a batch of vectors of length 1. # This is needed so the output can be easily wrapped in `FeatureColumn`s. for key in NUMERIC_COLUMNS + CATEGORICAL_COLUMNS: outputs[key] = tft.map(lambda x: tf.expand_dims(x, -1), outputs[key]) # For the label column we provide the mapping from string to index. def convert_label(label): table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K']) return table.lookup(label) outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN]) return outputs # The "with" block will create a pipeline, and run that pipeline at the exit # of the block. with beam.Pipeline() as p: # Create a coder to read the census data with the schema. To do this we # need to list all columns in order since the schema doesn't specify the # order of columns in the csv. ordered_columns = [ 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'label' ] converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema) # Read in raw data and convert using CSV converter. Note that we apply some # Beam transformations here, which will not be encoded in the TF graph since # we don't do the from within tf.Transform's methods (AnalyzeDataset, # TransformDataset etc.). 
        # These transformations are just to get data into a format that the CSV
        # converter can read, in particular removing empty lines and removing
        # spaces after commas.
        raw_data = (
            p
            | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
            | 'FilterTrainData' >> beam.Filter(lambda line: line)
            | 'FixCommasTrainData' >> beam.Map(lambda line: line.replace(', ', ','))
            | 'DecodeTrainData' >> beam.Map(converter.decode))

        # Combine data and schema into a dataset tuple.  Note that we already used
        # the schema to read the CSV data, but we also need it to interpret
        # raw_data.
        raw_dataset = (raw_data, raw_data_metadata)
        transformed_dataset, transform_fn = (
            raw_dataset | beam_impl.AnalyzeAndTransformDataset(
                preprocessing_fn,
                output_dir=os.path.join(tempfile.mkdtemp())))
        transformed_data, transformed_metadata = transformed_dataset

        _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
            transformed_train_data_base,
            coder=example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema))

        # Now apply transform function to eval data.  In this case we also remove
        # the header line from the CSV file and the trailing period at the end of
        # each line.
        raw_eval_data = (
            p
            | 'ReadEvalData' >> textio.ReadFromText(eval_data_file)
            | 'FilterEvalData' >> beam.Filter(
                lambda line: line and line != '|1x3 Cross validator')
            | 'FixCommasEvalData' >> beam.Map(lambda line: line.replace(', ', ','))
            | 'RemoveTrailingPeriodsEvalData' >> beam.Map(lambda line: line[:-1])
            | 'DecodeEvalData' >> beam.Map(converter.decode))

        raw_eval_dataset = (raw_eval_data, raw_data_metadata)
        transformed_eval_dataset = (
            (raw_eval_dataset, transform_fn) | beam_impl.TransformDataset())

        # Don't need transformed data schema, it's the same as before.
        transformed_eval_data, _ = transformed_eval_dataset

        _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
            transformed_eval_data_base,
            coder=example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema))

        _ = (transformed_metadata | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
            transformed_metadata_dir, pipeline=p))
def process_hub(self, hub_name, pk, bkey_list, field_list, foreign_keys=None):
    ext_field_list = \
        [CONST_BK_FIELD, CONST_SOURCE_FIELD, CONST_LOADDTM_FIELD, CONST_STATUS_FIELD] + \
        field_list

    with beam.Pipeline(options=self.pipeline_options) as p:
        # First set up a stream for the data.
        data = read_file(
            p,
            hub_name,
            self.get_psa_location('public.{0}'.format(hub_name)) + '*',
            pk)

        index = None
        try:
            # Also set up a stream for the index.
            index = read_file(
                p,
                '{0}index'.format(hub_name),
                self.get_source_index('hub_{0}*'.format(hub_name)),
                pk)
        except IOError:
            logging.info("Could not open index, maybe doesn't exist")
            # Create an empty pcollection, so we can at least run.
            index = p | beam.Create([])

        # Generate business keys, checksum, dv_source, load_dtm.
        preproc_data = data | 'preprocess_' + hub_name >> \
            beam.Map(add_hub_dv_details, bkey_list, self.source)

        if foreign_keys:
            preproc_data = self.resolve_foreign_keys(
                hub_name=hub_name,
                pk=pk,
                data=preproc_data,
                foreign_keys=foreign_keys,
                pipeline=p)

        # Group with index to be able to identify new, updated, deleted.
        merge = ({
            'data': preproc_data,
            'index': index
        }) | 'grouped_by_' + pk >> beam.CoGroupByKey()

        # Extract the data out of the records (still has index/data dict in there).
        extract = merge \
            | 'filter_' + hub_name >> beam.Filter(filter_data_rows) \
            | 'extract_' + hub_name >> beam.Map(extract_data)

        # Write them out to disk in the loading area.
        extract | 'Write_' + hub_name >> beam.io.Write(
            CsvFileSink(
                self.get_loading_location('public.{0}'.format(hub_name)),
                header=ext_field_list))

        # Update the index.
        updated_index = merge | 'updated_index_' + hub_name >> beam.Map(
            hub_select_index_or_data, pk)
        updated_index | 'Write_index_' + hub_name >> beam.io.Write(
            CsvFileSink(
                self.get_target_index('hub_{0}'.format(hub_name)),
                header=[CONST_BK_FIELD, CONST_CKSUM_FIELD, pk]))
)

#########################################
# Writing to file system the dictionary #
#########################################
(weather
 | "weather:cleaning" >> beam.Map(lambda counter: '%s, %s' % (counter[0], counter[1]))
 | 'weather:write' >> beam.io.textio.WriteToText('weather_dictionary'))

#####################################
# Starting Pipeline for the flights #
#####################################
flights = (pipeline
           | 'flights:read' >> beam.io.ReadFromText('flights_large.csv')
           | 'flights:removeduplicates' >> beam.RemoveDuplicates()
           | 'flights:lines' >> beam.Map(lambda line: next(csv.reader([line])))
           | 'flight:remove heads' >> beam.Filter(lambda row: row[0] != 'Date')
           | 'flights:fields' >> beam.Map(lambda fields: (
               (str(fields[5]) + '-' + str(fields[0]) + '-' + str(hour_(fields[7]))),
               fields[0],
               fields[7],
               fields[1],
               fields[5],
               georefe(fields[16], fields[15]),
               str(fields[5]) + '-->' + str(fields[3]),
               delaymarker(fields[8]),
               fields[18].ljust(10, '0')))
           | 'flights:adding temperature' >> beam.FlatMap(temp_dict, beam.pvalue.AsDict(weather))
           # Tuple unpacking in lambda arguments is Python 2 only; unpack explicitly instead.
           | 'flights:compact' >> beam.Map(
               lambda data_temp: '{},{}'.format(','.join(data_temp[0]), data_temp[1])))

flights | 'flights:write' >> beam.io.textio.WriteToText('flights_full_details')
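# A small, self-contained sketch (not part of the original script) of the
# side-input pattern used above: a keyed PCollection is passed to FlatMap via
# beam.pvalue.AsDict and received as a plain dict argument. The element shapes
# and the lookup key here are illustrative assumptions.
import apache_beam as beam


def join_temp(flight, weather_dict):
    # flight is (station_day_hour_key, *rest); look the key up in the side dict.
    key = flight[0]
    if key in weather_dict:
        yield flight, weather_dict[key]


if __name__ == '__main__':
    with beam.Pipeline() as p:
        weather = p | 'weather' >> beam.Create([('KJFK-2020-01-01-10', '3.5')])
        flights = p | 'flights' >> beam.Create([('KJFK-2020-01-01-10', 'AA100')])
        _ = (flights
             | beam.FlatMap(join_temp, beam.pvalue.AsDict(weather))
             | beam.Map(print))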
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of results from the story migration. Returns: PCollection. A PCollection of results from the story migration. """ unmigrated_story_models = ( self.pipeline | 'Get all non-deleted story models' >> (ndb_io.GetModels(story_models.StoryModel.get_all())) # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Add story keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter lambda story_model: story_model.id)) story_summary_models = ( self.pipeline | 'Get all non-deleted story summary models' >> (ndb_io.GetModels(story_models.StorySummaryModel.get_all())) # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Add story summary keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter lambda story_summary_model: story_summary_model.id)) topics = ( self.pipeline | 'Get all non-deleted topic models' >> (ndb_io.GetModels(topic_models.TopicModel.get_all())) | 'Transform model into domain object' >> beam.Map( topic_fetchers.get_topic_from_model) # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Add topic keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter lambda topic: topic.id)) topic_id_to_topic = beam.pvalue.AsDict(topics) migrated_story_results = ( unmigrated_story_models | 'Transform and migrate model' >> beam.MapTuple( self._migrate_story, topic_id_to_topic=topic_id_to_topic)) migrated_stories = ( migrated_story_results | 'Filter oks' >> beam.Filter(lambda result_item: result_item.is_ok()) | 'Unwrap ok' >> beam.Map(lambda result_item: result_item.unwrap())) migrated_story_job_run_results = ( migrated_story_results | 'Generate results for migration' >> (job_result_transforms.ResultsToJobRunResults('STORY PROCESSED'))) story_changes = (unmigrated_story_models | 'Generate story changes' >> beam.FlatMapTuple( self._generate_story_changes)) story_objects_list = ( { 'story_model': unmigrated_story_models, 'story_summary_model': story_summary_models, 'story': migrated_stories, 'story_change': story_changes } | 'Merge objects' >> beam.CoGroupByKey() | 'Get rid of ID' >> beam.Values() # pylint: disable=no-value-for-parameter | 'Remove unmigrated stories' >> beam.Filter( lambda x: len(x['story_change']) > 0 and len(x['story']) > 0) | 'Reorganize the story objects' >> beam.Map( lambda objects: { 'story_model': objects['story_model'][0], 'story_summary_model': objects['story_summary_model'][0], 'story': objects['story'][0], 'story_change': objects['story_change'][0] })) story_objects_list_job_run_results = ( story_objects_list | 'Transform story objects into job run results' >> (job_result_transforms.CountObjectsToJobRunResult('STORY MIGRATED') )) cache_deletion_job_run_results = ( story_objects_list | 'Delete story from cache' >> beam.Map(lambda story_objects: self._delete_story_from_cache( story_objects['story'])) | 'Generate results for cache deletion' >> (job_result_transforms.ResultsToJobRunResults('CACHE DELETION'))) story_models_to_put = ( story_objects_list | 'Generate story models to put' >> beam.FlatMap(lambda story_objects: self._update_story( story_objects['story_model'], story_objects['story'], story_objects['story_change'], ))) story_summary_models_to_put = ( story_objects_list | 'Generate story summary models to put' >> beam.Map(lambda story_objects: 
            self._update_story_summary(
                story_objects['story'],
                story_objects['story_summary_model'])))

    unused_put_results = (
        (story_models_to_put, story_summary_models_to_put)
        | 'Merge models' >> beam.Flatten()
        | 'Put models into the datastore' >> ndb_io.PutModels())

    return (
        (cache_deletion_job_run_results, migrated_story_job_run_results,
         story_objects_list_job_run_results)
        | beam.Flatten())
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from generating SkillOpportunityModel. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from generating SkillOpportunityModel. """ question_skill_link_models = ( self.pipeline | 'Get all non-deleted QuestionSkillLinkModels' >> (ndb_io.GetModels( question_models.QuestionSkillLinkModel.get_all( include_deleted=False))) | 'Group QuestionSkillLinkModels by skill ID' >> beam.GroupBy(lambda n: n.skill_id)) skills = ( self.pipeline | 'Get all non-deleted SkillModels' >> (ndb_io.GetModels( skill_models.SkillModel.get_all(include_deleted=False))) | 'Get skill object from model' >> beam.Map( skill_fetchers.get_skill_from_model) | 'Group skill objects by skill ID' >> beam.GroupBy(lambda m: m.id)) skills_with_question_counts = ( { 'skill': skills, 'question_skill_links': question_skill_link_models } | 'Merge by skill ID' >> beam.CoGroupByKey() # Pylint disable is needed because pylint is not able to correctly # detect that the value is passed through the pipe. | 'Remove skill IDs' >> beam.Values() # pylint: disable=no-value-for-parameter # We are using itertools.chain.from_iterable to flatten # question_skill_links from a 2D list into a 1D list. | 'Flatten skill and question_skill_links' >> beam.Map( lambda object: { 'skill': list(object['skill'][0])[0], 'question_skill_links': list( itertools.chain.from_iterable(object[ 'question_skill_links'])) })) opportunities_results = ( skills_with_question_counts | beam.Map(lambda object: self._create_skill_opportunity_model( object['skill'], object['question_skill_links']))) unused_put_result = ( opportunities_results | 'Filter the results with OK status' >> beam.Filter(lambda result: result.is_ok()) | 'Fetch the models to be put' >> beam.Map(lambda result: result.unwrap()) | 'Put models into the datastore' >> ndb_io.PutModels()) return (opportunities_results | 'Transform Results to JobRunResults' >> (job_result_transforms.ResultsToJobRunResults()))
def test_bad_types(self):
    p = TestPipeline()
    evens = None  # pylint: disable=unused-variable

    # [START type_hints_missing_define_numbers]
    numbers = p | beam.Create(['1', '2', '3'])
    # [END type_hints_missing_define_numbers]

    # Consider the following code.
    # pylint: disable=expression-not-assigned
    # pylint: disable=unused-variable
    # [START type_hints_missing_apply]
    evens = numbers | beam.Filter(lambda x: x % 2 == 0)
    # [END type_hints_missing_apply]

    # Now suppose numbers was defined as [snippet above].
    # When running this pipeline, you'd get a runtime error,
    # possibly on a remote machine, possibly very late.
    with self.assertRaises(TypeError):
        p.run()

    # To catch this early, we can assert what types we expect.
    with self.assertRaises(typehints.TypeCheckError):
        # [START type_hints_takes]
        p.options.view_as(TypeOptions).pipeline_type_check = True
        evens = numbers | beam.Filter(lambda x: x % 2 == 0).with_input_types(int)
        # [END type_hints_takes]

    # Type hints can be declared on DoFns and callables as well, rather
    # than where they're used, to be more self contained.
    with self.assertRaises(typehints.TypeCheckError):
        # [START type_hints_do_fn]
        @beam.typehints.with_input_types(int)
        class FilterEvensDoFn(beam.DoFn):
            def process(self, element):
                if element % 2 == 0:
                    yield element

        evens = numbers | beam.ParDo(FilterEvensDoFn())
        # [END type_hints_do_fn]

    words = p | 'words' >> beam.Create(['a', 'bb', 'c'])

    # One can assert outputs and apply them to transforms as well.
    # Helps document the contract and checks it at pipeline construction time.
    # [START type_hints_transform]
    T = beam.typehints.TypeVariable('T')

    @beam.typehints.with_input_types(T)
    @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
    class MyTransform(beam.PTransform):
        def expand(self, pcoll):
            return pcoll | beam.Map(lambda x: (len(x), x))

    words_with_lens = words | MyTransform()
    # [END type_hints_transform]

    # pylint: disable=expression-not-assigned
    with self.assertRaises(typehints.TypeCheckError):
        words_with_lens | beam.Map(lambda x: x).with_input_types(
            beam.typehints.Tuple[int, int])
def run(argv=None): """Main entry point; defines and runs the hourly_team_score pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from') parser.add_argument('--subscription', type=str, help='Pub/Sub subscription to read from') parser.add_argument('--dataset', type=str, required=True, help='BigQuery Dataset to write tables to. ' 'Must already exist.') parser.add_argument('--table_name', type=str, default='game_stats', help='The BigQuery table name. Should not already exist.') parser.add_argument('--fixed_window_duration', type=int, default=60, help='Numeric value of fixed window duration for user ' 'analysis, in minutes') parser.add_argument('--session_gap', type=int, default=5, help='Numeric value of gap between user sessions, ' 'in minutes') parser.add_argument('--user_activity_window_duration', type=int, default=30, help='Numeric value of fixed window for finding mean of ' 'user session duration, in minutes') args, pipeline_args = parser.parse_known_args(argv) if args.topic is None and args.subscription is None: parser.print_usage() print(sys.argv[0] + ': error: one of --topic or --subscription is required') sys.exit(1) options = PipelineOptions(pipeline_args) # We also require the --project option to access --dataset if options.view_as(GoogleCloudOptions).project is None: parser.print_usage() print(sys.argv[0] + ': error: argument --project is required') sys.exit(1) fixed_window_duration = args.fixed_window_duration * 60 session_gap = args.session_gap * 60 user_activity_window_duration = args.user_activity_window_duration * 60 # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). options.view_as(SetupOptions).save_main_session = True # Enforce that this pipeline is always run in streaming mode options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=options) as p: # Read game events from Pub/Sub using custom timestamps, which # are extracted from the data elements, and parse the data. if args.subscription: scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub( subscription=args.subscription) else: scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub( topic=args.topic) raw_events = ( scores | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8')) | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn()) | 'AddEventTimestamps' >> beam.Map( lambda elem: beam.window.TimestampedValue(elem, elem['timestamp']))) # Extract username/score pairs from the event stream user_events = ( raw_events | 'ExtractUserScores' >> beam.Map( lambda elem: (elem['user'], elem['score']))) # Calculate the total score per user over fixed windows, and cumulative # updates for late data spammers_view = ( user_events | 'UserFixedWindows' >> beam.WindowInto( beam.window.FixedWindows(fixed_window_duration)) # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate. # These might be robots/spammers. | 'CalculateSpammyUsers' >> CalculateSpammyUsers() # Derive a view from the collection of spammer users. It will be used as # a side input in calculating the team score sums, below | 'CreateSpammersView' >> beam.CombineGlobally( beam.combiners.ToDictCombineFn()).as_singleton_view()) # [START filter_and_calc] # Calculate the total score per team over fixed windows, and emit cumulative # updates for late data. Uses the side input derived above --the set of # suspected robots-- to filter out scores from those users from the sum. 
# Write the results to BigQuery. (raw_events # pylint: disable=expression-not-assigned | 'WindowIntoFixedWindows' >> beam.WindowInto( beam.window.FixedWindows(fixed_window_duration)) # Filter out the detected spammer users, using the side input derived above | 'FilterOutSpammers' >> beam.Filter( lambda elem, spammers: elem['user'] not in spammers, spammers_view) # Extract and sum teamname/score pairs from the event data. | 'ExtractAndSumScore' >> ExtractAndSumScore('team') # [END filter_and_calc] | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict()) | 'WriteTeamScoreSums' >> WriteToBigQuery( args.table_name + '_teams', args.dataset, { 'team': 'STRING', 'total_score': 'INTEGER', 'window_start': 'STRING', 'processing_time': 'STRING', }, options.view_as(GoogleCloudOptions).project)) # [START session_calc] # Detect user sessions-- that is, a burst of activity separated by a gap # from further activity. Find and record the mean session lengths. # This information could help the game designers track the changing user # engagement as their set of game changes. (user_events # pylint: disable=expression-not-assigned | 'WindowIntoSessions' >> beam.WindowInto( beam.window.Sessions(session_gap), timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW) # For this use, we care only about the existence of the session, not any # particular information aggregated over it, so we can just group by key # and assign a "dummy value" of None. | beam.CombinePerKey(lambda _: None) # Get the duration of the session | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity()) # [END session_calc] # [START rewindow] # Re-window to process groups of session sums according to when the # sessions complete | 'WindowToExtractSessionMean' >> beam.WindowInto( beam.window.FixedWindows(user_activity_window_duration)) # Find the mean session duration in each window | beam.CombineGlobally(beam.combiners.MeanCombineFn()).without_defaults() | 'FormatAvgSessionLength' >> beam.Map( lambda elem: {'mean_duration': float(elem)}) | 'WriteAvgSessionLength' >> WriteToBigQuery( args.table_name + '_sessions', args.dataset, { 'mean_duration': 'FLOAT', }, options.view_as(GoogleCloudOptions).project))
def run(args, pipeline_args): # INSERT YOUR CODE HERE key_field_index = 0 if args.director_copies_sold or args.director_dollars_sold: key_field_index = 5 def SplitLine(line): # split to extract each field in the .csv file line_modified = line.replace(', ', '_') return line_modified.split(',') def PairWithCopies(fields): id = fields[key_field_index] purchase_method = fields[12] amount = fields[11] return (id, (amount if purchase_method == 'buy' else 0, amount if purchase_method == 'rent' else 0)) def PairWithRevenue(fields): id = fields[key_field_index] revenue = int(fields[9]) if fields[12] == 'buy' else int(fields[10]) return (id, revenue) def PairWithTransaction(fields): movie_id = fields[0] user_name = fields[13] date_time = fields[14] return ((user_name, date_time), movie_id) def Sum(group): from operator import add buy_tot = 0 rent_tot = 0 (id, records) = group for record in records: (buy_amt, rent_amt) = record buy_tot = buy_tot + int(buy_amt) rent_tot = rent_tot + int(rent_amt) return (id, buy_tot, rent_tot) def Permute(transaction): ((user_name, date_time), movie_list) = transaction li = [] position = 0 for movie_id in movie_list: if len(movie_list) > 1: for movie_id_other in movie_list: if (movie_id_other != movie_id): li.append(((movie_id, movie_id_other), 1)) else: li.append(((movie_id, None), 0)) return li def ChangeKey(movie_combination): #print(movie_combination) (movie_id, movie_id_other), count = movie_combination return (movie_id, (movie_id_other, count)) def Sort(movie_and_list): from operator import itemgetter (movie_id, purchased_together_tuples) = movie_and_list highest_list = [] sorted_list = sorted(purchased_together_tuples, key=itemgetter(1), reverse=True) if sorted_list[0][1] == 0: highest_list.append(('None', str(0))) else: i = 0 while i < len( sorted_list) and sorted_list[i][1] == sorted_list[0][1]: highest_list.append(sorted_list[i]) i = i + 1 return (movie_id, highest_list) def FormatMovieNumbers(result): (id, buy_tot, rent_tot) = result return '%s\t%s\t%s' % (id, str(buy_tot), str(rent_tot)) def FormatMovieRevenue(result): (id, revenue_tot) = result return '%s\t%s' % (id, str(revenue_tot)) def FormatHighestList(result): movie_id, highest_list = result li = [] #print(movie_id) li.append(str(movie_id)) for highest_movie in highest_list: #print(highest_movie) li.append(str(highest_movie[0])) frequency = highest_list[0][1] li.append(str(frequency)) result_formatted = '\t'.join(li) return result_formatted with beam.Pipeline(options=PipelineOptions(pipeline_args)) as pipeline: lines = pipeline | beam.io.ReadFromText(args.input) fields = (lines | 'Split' >> beam.Map(SplitLine)) filtered_fields = ( fields | 'Filter' >> beam.Filter(lambda field: args.genre is None and field is not None or args.genre is not None and field[4] == args.genre)) if args.copies_sold or args.director_copies_sold: movie_numbers = (filtered_fields | 'PairWithCopies' >> beam.Map(PairWithCopies) | 'GroupAndSum' >> beam.GroupByKey() | 'MergeAmount' >> beam.Map(Sum) | 'FormatRenvenue' >> beam.Map(FormatMovieNumbers)) movie_numbers | 'WriteMovieNumbers' >> beam.io.WriteToText( args.output) if args.dollars_sold or args.director_dollars_sold: movie_revenue = (filtered_fields | 'PairWithRevenue' >> beam.Map(PairWithRevenue) | 'CombineRevenue' >> beam.CombinePerKey(sum) | 'FormatRevenue' >> beam.Map(FormatMovieRevenue)) movie_revenue | 'WriteMovieRevenue' >> beam.io.WriteToText( args.output) if args.purchased_together: highest_list = ( filtered_fields | 'PairWithTrnasaction' >> beam.Map( 
                    PairWithTransaction)  # (user_name, date_time), movie_id
                | 'GroupByTransaction' >> beam.GroupByKey()
                | 'Permute' >> beam.FlatMap(Permute)  # (movie_id, movie_id_other), 1
                | 'CombineMovieCombo' >> beam.CombinePerKey(sum)
                | 'ChangeKey' >> beam.Map(ChangeKey)  # movie_id, (movie_id_other, count)
                | 'GroupByMovie' >> beam.GroupByKey()  # movie_id, [(movie_id_other, count), ...]
                | 'SortList' >> beam.Map(Sort)
                | 'FormatHighestList' >> beam.Map(FormatHighestList))
            highest_list | 'WriteHighestList' >> beam.io.WriteToText(args.output)

    pass
pipeline = beam.Pipeline(argv=argv)

side = (
    pipeline
    | 'read roster' >> beam.io.ReadFromText('gs://justinminsk_bucket/retrosheet/roster')
    | beam.Map(lambda line: next(csv.reader([line])))
    | beam.Map(lambda array: (array[0], array[2] + ' ' + array[1]))
)

(
    pipeline
    | beam.io.ReadFromText('gs://justinminsk_bucket/retrosheet/events')
    | beam.Map(lambda line: next(csv.reader([line])))
    | beam.Filter(lambda tuple: int(tuple[2]) == 23)
    | beam.Map(lambda tuple: dict(zip(header, tuple)))
    | beam.Map(lambda dict: (dict['playerID'], int(dict['HRTotal'])))
    | beam.combiners.Count.PerKey()
    | beam.Map(lambda tuple, d: (tuple[0], tuple[1], d[tuple[0].split(' ')[0]]),
               beam.pvalue.AsDict(side))
    | beam.combiners.ToList()
    | beam.Map(make_string)
    | beam.Map(lambda array: ['playerID,Name,HRTotal'] + array)
    | beam.FlatMap(lambda x: x)
    | beam.io.WriteToText('gs://justinminsk_bucket/retrosheet/Minsk', num_shards=1)
)

result = pipeline.run()
result.wait_until_finish()
    return name, 0


def return_tuple(element):
    thisTuple = element.split(',')
    return (thisTuple[0], thisTuple[1:])


p1 = beam.Pipeline()

card_defaulter = (
    p1
    | beam.io.ReadFromText('cards.txt', skip_header_lines=1)
    | beam.Map(default_score)
    | beam.CombinePerKey(sum)
    | beam.Filter(lambda x: x[1] > 0)
    # | beam.io.WriteToText('./output/card_skip')
)

medical_loan_defaulter = (
    p1
    | 'Read_medical' >> beam.io.ReadFromText('loan.txt', skip_header_lines=1)
    | 'Split Row' >> beam.Map(lambda row: row.split(','))
    | 'Filter medical loan' >> beam.Filter(
        lambda element: (element[5]).rstrip().lstrip() == 'Medical Loan')
    | 'Calculate late payment' >> beam.Map(calculate_late_payment)
    | 'Make key value pairs' >> beam.Map(lambda elements: (
        elements[0] + ', ' + elements[1] + ' ' + elements[2], int(elements[9])))
    | 'Group medical loan based on month' >> beam.CombinePerKey(sum)
    | 'Check for medical loan defaulter' >> beam.Filter(lambda element: element[1] >= 3)
def run(): address_scd = """SELECT * FROM `automatic-asset-253215.CORE.IM_CUSTOMER_ADDRESS_SCD`""" upd_addrorg_data = """SELECT CAST(a.HSN_ACCT_NUM AS INT64) AS CUSTOMER_ID, a.ADDRESS_NAME AS ADDR_NAME, 'CLIC' AS ETL_SOURCE_SYSTEM, a.FILE_SET_DATE AS ETL_END_EFFECTIVE_DT, '0' AS ETL_CURRENT_IND, CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS UPD_BATCH_NBR FROM `automatic-asset-253215.STAGE.STG_CLIC_CUSTADDRORG` a""" primary_pipeline_1 = 'p1' p1 = p | 'AddressSCD Table' >> beam.io.Read( beam.io.BigQuerySource(query=address_scd, use_standard_sql=True)) join_pipeline_1 = 'j1' j1 = p | 'AddressORG Table' >> beam.io.Read( beam.io.BigQuerySource(query=upd_addrorg_data, use_standard_sql=True)) common_key = {'CUSTOMER_ID', 'ADDR_NAME', 'ETL_SOURCE_SYSTEM'} pipelines_dictionary_1 = {primary_pipeline_1: p1, join_pipeline_1: j1} p1j1 = (pipelines_dictionary_1 | 'Updating addrs Fields' >> LeftJoin( primary_pipeline_1, p1, join_pipeline_1, j1, common_key)) ins_addrorg_data = """SELECT (srg_key.MAX_VALUE_KEY + ROW_NUMBER() OVER()) AS CUSTOMER_ADDRESS_KEY, '' AS CUSTOMER_KEY, CAST(a.HSN_ACCT_NUM AS INT64) AS CUSTOMER_ID, a.ADDRESS_NAME AS ADDR_NAME, 'CLIC' AS ETL_SOURCE_SYSTEM, CAST(a.ROW_CREATED_DATE AS TIMESTAMP) AS SOURCE_CREATE_DT, a.ADDRESS_LINE_1 AS ADDR_LINE1_TXT, a.ADDRESS_LINE_2 AS ADDR_LINE2_TXT, a.CITY AS CITY_NAME, a.STATE AS STATE_CODE, a.COUNTRY AS COUNTRY_CODE, SUBSTR(a.ZIP_CODE,1,5) AS POSTAL_ZIP, SUBSTR(a.ZIP_CODE,6,9) AS POSTAL_ZIP4, CASE WHEN a.DISABLE_CLEANSING_FLAG = 'N' THEN 1 ELSE 0 END AS ADDR_CLEANSING_IND, CASE WHEN a.FRAUD_BAD_ACCT_FLAG = 'Y' THEN 1 ELSE 0 END AS ADDR_FRAUD_IND, CASE WHEN a.AGENT_VERIFIED_ADDRESS = 'Y' THEN 1 ELSE 0 END AS ADDR_QAS_VERIFIED_IND, a.ADDRESS_TYPE_CODE AS ADDR_TYPE_CODE, a.SHIP_TO_FIRST_NAME AS SHIPTO_FIRST_NAME, a.SHIP_TO_LAST_NAME AS SHIPTO_LAST_NAME, TIMESTAMP_ADD(a.FILE_SET_DATE, INTERVAL 1 DAY) AS ETL_BEGIN_EFFECTIVE_DT, CAST('2099-12-31 00:00:00' AS TIMESTAMP) AS ETL_END_EFFECTIVE_DT, '1' AS ETL_CURRENT_IND, '2' AS ETL_VERSION_NBR, --should be a sequntial number '0' AS VOID_IND, CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS UPD_BATCH_NBR, CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS INS_BATCH_NBR FROM `automatic-asset-253215.STAGE.STG_CLIC_CUSTADDRORG` a, `automatic-asset-253215.STAGE.STG_CLIC_SURROGKEYS` srg_key WHERE srg_key.TABLE_NAME = "IM_CUSTOMER_ADDRESS_SCD" """ Attribute_ref_query = """SELECT CUSTOMER_KEY, CUSTOMER_ID FROM `automatic-asset-253215.CORE.IM_CUSTOMER_ATTRIBUTE_REF` b""" lookup_data = p1 | 'Get Cust_Ids ' >> beam.Map(lambda row: (str(row[ 'CUSTOMER_ID']) + row['ADDR_NAME'] + row['ETL_SOURCE_SYSTEM'], row)) primary_pipeline_2 = 'p2' p2 = (p | 'Read from addrorg' >> beam.io.Read( beam.io.BigQuerySource(query=ins_addrorg_data, use_standard_sql=True)) | 'Lookup' >> beam.Map(lookup, AsDict(lookup_data)) | 'Filter' >> beam.ParDo(filter_out_nones)) join_pipeline_2 = 'j2' j2 = p | 'Read From Attribute Ref Table' >> beam.io.Read( beam.io.BigQuerySource(query=Attribute_ref_query, use_standard_sql=True)) common_key = 'CUSTOMER_ID' pipelines_dictionary_2 = {primary_pipeline_2: p2, join_pipeline_2: j2} p2j2 = (pipelines_dictionary_2 | 'Left join' >> LeftJoin2(primary_pipeline_2, p2, join_pipeline_2, j2, common_key) | 'Filter Nulls' >> beam.Filter(filter_null)) ((p1j1, p2j2) | 'Merge PCollections' >> beam.Flatten() | 'Write to IM_CUSTOMER_ADDRESS_SCD' >> beam.io.WriteToBigQuery( output_table, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, 
        create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER))

p.run().wait_until_finish()
def run_job( output_loc, policies, true_rewards, num_trials, num_contexts, num_logs, slate_depth, cut_off, dataflow_args, ): def init_target_policies_fn(): target_policies = [(p[0], p[1]) for p in policies.items() if p[0] != "logging_policy"] return target_policies target_policy_names = list(zip(*init_target_policies_fn()))[0] pipeline_options = PipelineOptions(dataflow_args) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline = beam.Pipeline(options=pipeline_options) _init_env_fn = partial( init_env_fn, num_logs=num_logs, num_contexts=num_contexts, true_rewards=true_rewards, policies=policies, depth=slate_depth, ) _init_estimators_fn = partial(init_estimators_fn, cutoffs=cut_off) _flatten_onpolicy_results = partial(flatten_onpolicy_results, cutoffs=cut_off) for trial in range(num_trials): logs = pipeline | "LogSimulation[T-{}]".format(trial) >> BeamRankerSimulator(num_logs, _init_env_fn) ( logs | "FilterLoggingPolicyLog[T-{}]".format(trial) >> beam.Filter(lambda x: x[0] == "logging_policy") | "AddPredictions[T-{}]".format(trial) >> beam.Map(lambda x: addTargetPolicies(x[1], target_policy_names, policies=policies)) | "ListwiseMetricRunner[T-{}]".format(trial) >> BeamListwiseMetricRunner( _init_estimators_fn, init_target_policies_fn, max_cutoff=max(cut_off), ) | "WriteToFile[T-{}]".format(trial) >> beam.io.WriteToText( join(output_loc, "trial-{}-results".format(trial)), file_name_suffix=".json", coder=JsonCoder, ) ) ( logs | "SumRewards[T-{}]".format(trial) >> beam.FlatMap(lambda l: [(l[0] + ":" + str(c), sum(l[1].slate_rewards[:c])) for c in cut_off]) | "ComputeMean[T-{}]".format(trial) >> beam.transforms.combiners.Mean.PerKey() | "GroupAll[T-{}]".format(trial) >> GroupAll() | "FlattenResultIntoSingleMap[T-{}]".format(trial) >> beam.Map(_flatten_onpolicy_results) | "WriteToOnPolicyFile[T-{}]".format(trial) >> beam.io.WriteToText( join(output_loc, "trial-{}-onpolicy".format(trial)), file_name_suffix=".json", coder=JsonCoder, ) ) results = pipeline.run() results.wait_until_finish()
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# beam-playground:
#   name: Filter
#   description: Task from katas to implement a filter function that filters out odd numbers.
#   multifile: false
#   categories:
#     - Filtering

import apache_beam as beam

from log_elements import LogElements

with beam.Pipeline() as p:
    (p
     | beam.Create(range(1, 11))
     | beam.Filter(lambda num: num % 2 == 0)
     | LogElements())
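# The kata's LogElements helper is imported from log_elements and not shown
# here. A minimal stand-in with the same observable behaviour (print each
# element and pass it through) could look like this sketch; the real helper in
# the katas also supports an optional prefix, which is omitted here.
import apache_beam as beam


class LogElements(beam.PTransform):
    """Prints every element of the PCollection and re-emits it unchanged."""

    def expand(self, pcoll):
        def log(element):
            print(element)
            return element

        return pcoll | beam.Map(log)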
def FormatText(elem):
    return elem[0] + ' has received ' + str(elem[1]) + ' marks'


p1 = beam.Pipeline()

input_collection = (
    p1
    | beam.io.ReadFromText('../Apache_Beam_Data/students_marks.txt')
    | beam.Map(SplitRow))

US_pipeline = (
    input_collection
    | beam.Filter(lambda record: FilterBasedonCountry('US', record))
    | "Composite Transformation for US" >> MyTransform()
    | 'Writing results to US File' >> beam.io.WriteToText('output/US_Result'))

India_pipeline = (
    input_collection
    | beam.Filter(lambda record: FilterBasedonCountry('IN', record))
    | "Composite Transformation for IN" >> MyTransform()
    | 'Writing results to India File' >> beam.io.WriteToText('output/IN_Result'))

p1.run()

print('')
print("US Result: ")
print(os.system('cat output/US*'))
def transform_data(train_data_file, test_data_file, working_dir, pipeline): def pre_processing_fun(inputs): outputs = {} for fea in NUMERIC_FEATURE_KEYS: outputs[fea] = tft.scale_to_0_1(inputs[fea]) for fea in CATEGORICAL_FEATURE_KEYS: outputs[fea] = tft.string_to_int(inputs[fea]) def convert_label(label): table = lookup.index_table_from_tensor(['>50K', '<=50K']) return table.lookup(label) outputs[LABEL_KEY] = tft.apply_function(convert_label, inputs[LABEL_KEY]) return outputs converter = csv_coder.CsvCoder(ORDERED_COLUMNS, RAW_DATA_META.schema) ''' Transform and save train data ''' raw_train_data = ( pipeline | "Read raw train input" >> beam.io.textio.ReadFromText(train_data_file) | "Filter train line" >> beam.Filter(lambda x: x) | "Fix commas train data" >> beam.Map(lambda x: x.replace(', ', ',')) | "Decode train as csv" >> beam.Map(converter.decode)) raw_train_dataset = (raw_train_data, RAW_DATA_META) transformed_train_dataset, transform_fn = ( raw_train_dataset | beam_impl.AnalyzeAndTransformDataset(pre_processing_fun)) transformed_train_data, transformed_train_meta = transformed_train_dataset # Save transformed training data (transformed_train_data | "Save transformed train data" >> beam.io.tfrecordio.WriteToTFRecord( os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE), coder=example_proto_coder.ExampleProtoCoder( transformed_train_meta.schema))) ''' Transform and save test data ''' raw_test_data = ( pipeline | "Read raw test input" >> beam.io.textio.ReadFromText(test_data_file) | "Filter test line" >> beam.Filter(lambda x: x) | "Fix commas test data" >> beam.Map(lambda x: x.replace(', ', ',')) | "Decode test as csv" >> beam.Map(converter.decode)) raw_test_dataset = (raw_test_data, RAW_DATA_META) transformed_test_dataset = (raw_test_dataset, transform_fn) | beam_impl.TransformDataset() transformed_test_data, _ = transformed_test_dataset # Save transformed test data (transformed_test_data | "Save transformed test data" >> beam.io.tfrecordio.WriteToTFRecord( os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE), coder=example_proto_coder.ExampleProtoCoder( transformed_train_meta.schema))) ''' Save transform function ''' (transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(working_dir))
def expand(self, pcoll):
    return pcoll | "IsBid" >> beam.Filter(is_bid)
        format='[%(asctime)s][%(name)s][%(levelname)s] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO)
    return log_file


if __name__ == "__main__":
    args, log_file = init(), config()

    logging.info('Starting job...')
    logging.info(f'Input: "{args.input}"')
    logging.info(f'Output: "{args.output}"')

    with beam.Pipeline(runner="DirectRunner") as pipeline:
        (pipeline
         | 'Read Data' >> beam.io.ReadFromText(args.input)
         | 'Parse JSON' >> beam.ParDo(JSONParser())
         | 'Remove Invalid' >> beam.Filter(lambda data: 'id' in data)
         | 'Key/Value Pair' >> beam.Map(lambda data: (data['id'], data))
         | 'Group by Key' >> beam.GroupByKey()
         | 'Remove Duplicates' >> beam.Map(lambda data: data[1][0])
         | 'Show IDs' >> beam.ParDo(Printer())
         | 'Parse Dates' >> beam.ParDo(DateParser())
         | 'Write Output' >> beam.io.WriteToParquet(
             f'{args.output}/{uuid4()}',
             schema.jokes(),
             codec='snappy',
             file_name_suffix='.snappy.parquet'))

    logging.info(f'Job finished... Log file saved at "{log_file}"')
def expand(self, pcoll):
    return pcoll | "IsPerson" >> beam.Filter(is_person)
! pip install apache-beam

import apache_beam as beam

# ! {(mkdir data)}

p1 = beam.Pipeline()


def SplitRow(element):
    return element.split(',')


def filtering(record):
    return record[3] == "Accounts"


attendance_count = (
    p1
    | "Read from data" >> beam.io.ReadFromText('dept_data.txt')
    | "Split data" >> beam.Map(lambda line: (line.split(",")))
    | "Filter" >> beam.Filter(filtering)
    | "Adding Key and value" >> beam.Map(lambda l: (l[1], 1))
    | "Combine by key" >> beam.CombinePerKey(sum)
    | "Write_File" >> beam.io.WriteToText('data/sample_data12121')
)

p1.run()

!head -n 20 data/*
import apache_beam as beam


def SplitRow(element):
    return element.split(',')


def filtering(record):
    return record[3] == 'Accounts'


def listing(record):
    return (record[1], 1)


p1 = beam.Pipeline()

attendance_count = (
    p1
    | "Read" >> beam.io.ReadFromText('dept-data.txt')
    | "Split" >> beam.Map(lambda element: element.split(','))
    | "Filter" >> beam.Filter(lambda record: record[3] == 'Accounts')
    | "Map Name" >> beam.Map(lambda record: (record[1], 1))
    | "Combine name" >> beam.CombinePerKey(sum)
    | "Write to beam" >> beam.io.WriteToText('data/output_new2')
)

p1.run()
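# Note: SplitRow, filtering, and listing are defined above but the pipeline
# inlines equivalent lambdas instead. A variant that reuses the named helpers
# (same behaviour, assuming the same dept-data.txt layout) would read:
#
# attendance_count = (
#     p1
#     | "Read" >> beam.io.ReadFromText('dept-data.txt')
#     | "Split" >> beam.Map(SplitRow)
#     | "Filter" >> beam.Filter(filtering)
#     | "Map Name" >> beam.Map(listing)
#     | "Combine name" >> beam.CombinePerKey(sum)
#     | "Write to beam" >> beam.io.WriteToText('data/output_new2'))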
        skip_header_lines=1)
    | "Of text to list (rainfall)" >> beam.Map(text_to_list, delimeter=',')
    | "Create key uf-year-month" >> beam.Map(key_uf_year_month)
    | "Sum of rainfall cases" >> beam.CombinePerKey(sum)
    | "Rounding rain results" >> beam.Map(round_results)
    # | "Show rain results" >> beam.Map(print)
)

results = (
    # (rain, dengue)
    # | "Join the dengue and rainfall pcollections" >> beam.Flatten()
    # | "Group" >> beam.GroupByKey()
    # | "Show final results" >> beam.Map(print)
    ({'chuvas': rain, 'dengue': dengue})
    | "Merge results" >> beam.CoGroupByKey()
    | "Filter data" >> beam.Filter(filter_fields)
    | "Unpack element" >> beam.Map(descompct_element)
    | "Build a csv row" >> beam.Map(buil_csv)
    # | "Show final results" >> beam.Map(print)
)

header = 'UF;YEAR;MONTH;RAINFALL;DENGUE'

results | "Write to csv" >> WriteToText(
    './basedb/resultado', file_name_suffix='.csv', header=header)

pipeline.run()