def process_hub(self, hub_name, pk, bkey_list, field_list, foreign_keys=None):
    ext_field_list = \
        field_list + [CONST_BK_FIELD, CONST_SOURCE_FIELD,
                      CONST_LOADDTM_FIELD, CONST_STATUS_FIELD]

    with beam.Pipeline(options=self.pipeline_options) as p:
        # First set up a stream for the data
        data = read_file(
            p,
            hub_name,
            self.get_psa_location('public.{0}'.format(hub_name)) + '*',
            pk)

        index = None
        try:
            # Also set up a stream for the index
            index = read_file(
                p,
                '{0}index'.format(hub_name),
                self.get_index('hub_{0}*'.format(hub_name)),
                pk)
        except IOError:
            logging.info("Could not open index, maybe doesn't exist")
            # Create an empty pcollection, so we can at least run
            index = p | beam.Create([])

        # Generate business keys, checksum, dv_source, load_dtm
        preproc_data = data | 'preprocess_' + hub_name >> \
            beam.Map(add_hub_dv_details, bkey_list, self.source)

        if foreign_keys:
            preproc_data = self.resolve_foreign_keys(hub_name=hub_name,
                                                     pk=pk,
                                                     data=preproc_data,
                                                     foreign_keys=foreign_keys,
                                                     pipeline=p)

        # Group with index to be able to identify new, updated, deleted
        merge = ({'data': preproc_data, 'index': index}) \
            | 'grouped_by_' + pk >> beam.CoGroupByKey()

        # Extract the data out of the records (still has index/data dict in there)
        extract = merge \
            | 'filter_' + hub_name >> beam.Filter(filter_data_rows) \
            | 'extract_' + hub_name >> beam.Map(extract_data)

        # Write them out to disk in the loading area
        extract | 'Write_' + hub_name >> beam.io.Write(
            CsvFileSink(self.get_loading_location('public.{0}'.format(hub_name)),
                        header=ext_field_list))

        # Update the index
        updated_index = merge | 'updated_index_' + hub_name >> beam.Map(
            hub_select_index_or_data, pk)
        updated_index | 'Write_index_' + hub_name >> beam.io.Write(
            CsvFileSink(self.get_index('hub_{0}'.format(hub_name)),
                        header=[CONST_BK_FIELD, CONST_CKSUM_FIELD, pk]))
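# read_file, add_hub_dv_details, filter_data_rows, extract_data and
# hub_select_index_or_data are helpers defined elsewhere in this module.
# The sketch below shows roughly what add_hub_dv_details is assumed to do
# (derive the business key, a change-detection checksum and the audit
# columns); the _sketch suffix, hashing and timestamp choices are
# illustrative only, not the module's actual implementation.
import datetime
import hashlib


def add_hub_dv_details_sketch(element, bkey_list, source):
    """Hypothetical equivalent of add_hub_dv_details for one (pk, record) pair."""
    pk, record = element
    # Business key: the configured key columns joined into a single string.
    record[CONST_BK_FIELD] = '|'.join(str(record.get(k, '')) for k in bkey_list)
    # Checksum over the record, later compared against the stored index.
    payload = '|'.join('{0}={1}'.format(k, v) for k, v in sorted(record.items()))
    record[CONST_CKSUM_FIELD] = hashlib.md5(payload.encode('utf-8')).hexdigest()
    # Audit columns: originating system and load timestamp.
    record[CONST_SOURCE_FIELD] = source
    record[CONST_LOADDTM_FIELD] = datetime.datetime.utcnow().isoformat()
    return pk, record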
def process_link(self, link_name, bkey_list, field_list, foreign_keys):
    ext_field_list = \
        [LINK_KEY, CONST_SOURCE_FIELD, CONST_LOADDTM_FIELD, CONST_STATUS_FIELD] + \
        field_list

    keys = [t[1] for t in foreign_keys]
    generated_pk_name = '|'.join(keys)

    with beam.Pipeline(options=self.pipeline_options) as p:
        # First set up a stream for the data
        data = read_file(
            p,
            link_name,
            self.get_psa_location('public.{0}'.format(link_name)) + '*')

        index = None
        try:
            # Also set up a stream for the index
            index = read_file(
                p,
                '{0}index'.format(link_name),
                self.get_source_index('link_{0}*'.format(link_name)),
                LINK_KEY)
        except IOError:
            logging.info("Could not open index, maybe doesn't exist")
            # Create an empty pcollection, so we can at least run
            index = p | beam.Create([])

        # Generate the link key, checksum, dv_source, load_dtm
        preproc_data = data | 'preprocess_' + link_name >> \
            beam.Map(add_link_dv_details, foreign_keys, self.source)
        # preproc_data | 'print_{0}'.format(link_name) >> beam.Map(print_index)

        preproc_data = self.resolve_foreign_keys(hub_name=link_name,
                                                 pk=LINK_KEY,
                                                 data=preproc_data,
                                                 foreign_keys=foreign_keys,
                                                 pipeline=p)

        # Group with index to be able to identify new, updated, deleted
        merge = ({'data': preproc_data, 'index': index}) \
            | 'grouped_by_' + generated_pk_name >> beam.CoGroupByKey()

        # Extract the data out of the records (still has index/data dict in there)
        extract = merge \
            | 'filter_' + link_name >> beam.Filter(filter_data_rows) \
            | 'extract_' + link_name >> beam.Map(extract_data)

        # Write them out to disk in the loading area
        extract | 'Write_' + link_name >> beam.io.Write(
            CsvFileSink(self.get_loading_location('public.{0}'.format(link_name)),
                        header=ext_field_list))

        # Update the index
        updated_index = merge | 'updated_index_' + link_name >> beam.Map(
            link_select_index_or_data, LINK_KEY)
        updated_index | 'Write_index_' + link_name >> beam.io.Write(
            CsvFileSink(self.get_target_index('link_{0}'.format(link_name)),
                        header=[CONST_CKSUM_FIELD, LINK_KEY]))
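# filter_data_rows, extract_data and link_select_index_or_data consume the
# CoGroupByKey output, i.e. (key, {'data': [...], 'index': [...]}) elements.
# A minimal sketch of the assumed behaviour; the real implementations live
# elsewhere in this module, hence the _sketch suffixes.
def filter_data_rows_sketch(element):
    """Keep keys that arrived in the data stream and are new or changed."""
    _, grouped = element
    data_recs = list(grouped['data'])
    index_recs = list(grouped['index'])
    if not data_recs:
        return False  # index-only key: nothing to load this run
    if not index_recs:
        return True   # key not seen before: new row
    # Changed when the freshly computed checksum differs from the indexed one.
    return data_recs[0][CONST_CKSUM_FIELD] != index_recs[0][CONST_CKSUM_FIELD]


def extract_data_sketch(element):
    """Unwrap the CoGroupByKey dict and return just the data record."""
    _, grouped = element
    return list(grouped['data'])[0]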
def process_table(self, hub_name, pk, field_list):
    ext_field_list = \
        field_list + [CONST_LOADDTM_FIELD, CONST_STATUS_FIELD]

    with beam.Pipeline(options=self.pipeline_options) as p:
        # First set up a stream for the data
        data = read_file(
            p,
            hub_name,
            self.get_staging_location('public.{0}'.format(hub_name)) + '*',
            pk)

        index = None
        try:
            # Also set up a stream for the index
            index = read_file(
                p,
                '{0}index'.format(hub_name),
                self.get_source_index('hub_{0}*'.format(hub_name)),
                pk)
        except IOError:
            logging.info("Could not open index, maybe doesn't exist")
            # Create an empty pcollection, so we can at least run
            index = p | beam.Create([])

        # Generate the checksum for change detection
        preproc_data = data | 'preprocess_' + hub_name >> \
            beam.Map(add_cksum)

        # Group with index to be able to identify new, updated, deleted
        merge = ({'data': preproc_data, 'index': index}) \
            | 'grouped_by_' + pk >> beam.CoGroupByKey()

        # Extract the data out of the records (still has index/data dict in there)
        extract = merge \
            | 'filter_' + hub_name >> beam.Filter(filter_unchanged_rows) \
            | 'extract_' + hub_name >> beam.Map(extract_data)

        # Write them out to disk in the PSA
        extract | 'Write_' + hub_name >> beam.io.Write(
            CsvFileSink(self.get_psa_location('public.{0}'.format(hub_name)),
                        header=ext_field_list))
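# add_cksum is assumed to only stamp a change-detection checksum onto each
# record at this stage, and filter_unchanged_rows is assumed to behave like
# filter_data_rows above (pass new or changed keys through). Hypothetical
# sketch under those assumptions:
import hashlib


def add_cksum_sketch(element):
    """Attach a checksum to one (pk, record) pair for change detection."""
    pk, record = element
    payload = '|'.join('{0}={1}'.format(k, v) for k, v in sorted(record.items()))
    record[CONST_CKSUM_FIELD] = hashlib.md5(payload.encode('utf-8')).hexdigest()
    return pk, record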
def process_link(self, link_name, field_list, foreign_keys):
    ext_field_list = field_list + [CONST_LOADDTM_FIELD, CONST_STATUS_FIELD]
    generated_pk_name = '|'.join(foreign_keys)

    with beam.Pipeline(options=self.pipeline_options) as p:
        # First set up a stream for the data
        data = read_file(
            p,
            link_name,
            self.get_staging_location('public.{0}'.format(link_name)) + '*')

        # Generate the link checksum and key the records for the merge with the index
        preproc_data = data | 'preprocess_' + link_name >> \
            beam.Map(add_link_cksum, foreign_keys)

        index = None
        try:
            # Also set up a stream for the index
            index = read_file(
                p,
                '{0}index'.format(link_name),
                self.get_index('link_{0}*'.format(link_name)),
                LINK_KEY)
        except IOError:
            logging.info("Could not open index, maybe doesn't exist")
            # Create an empty pcollection, so we can at least run
            index = p | beam.Create([])

        # Group with index to be able to identify new, updated, deleted
        merge = ({'data': preproc_data, 'index': index}) \
            | 'grouped_by_' + generated_pk_name >> beam.CoGroupByKey()

        # Extract the data out of the records (still has index/data dict in there)
        extract = merge \
            | 'filter_' + link_name >> beam.Filter(filter_unchanged_rows) \
            | 'extract_' + link_name >> beam.Map(extract_data)

        # Write them out to disk in the PSA
        extract | 'Write_' + link_name >> beam.io.Write(
            CsvFileSink(self.get_psa_location('public.{0}'.format(link_name)),
                        header=ext_field_list))
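# add_link_cksum is assumed to key each record on the combination of its
# foreign key columns (so the CoGroupByKey above groups on the compound link
# key) and to attach a checksum. Hypothetical sketch, which also assumes that
# read_file without a pk argument yields plain record dicts:
import hashlib


def add_link_cksum_sketch(record, foreign_keys):
    """Return (link_key, record) keyed on the joined foreign key columns."""
    link_key = '|'.join(str(record.get(fk, '')) for fk in foreign_keys)
    payload = '|'.join('{0}={1}'.format(k, v) for k, v in sorted(record.items()))
    record[CONST_CKSUM_FIELD] = hashlib.md5(payload.encode('utf-8')).hexdigest()
    return link_key, record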
def run(self):
    pipeline_options = PipelineOptions(self.pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Helper: read a CSV extract and key each record dict on the given
        # primary key column, yielding a PCollection of (key, record) pairs.
        def read_data_file(label, file_pattern, pk, add_source=False,
                           dictionary_output=True):
            return (p
                    | 'Read: %s' % label >> beam.io.Read(
                        CsvFileSource(file_pattern,
                                      add_source=add_source,
                                      dictionary_output=dictionary_output))
                    | 'Key: %s' % label >> beam.Map(lambda x: (x[pk], x)))

        # Helper: read an index file and reduce it to (pk, bk) tuples.
        def read_index_file(label, file_pattern, pk, bk, add_source=False,
                            dictionary_output=True):
            return (p
                    | 'Read: %s' % label >> beam.io.Read(
                        CsvFileSource(file_pattern,
                                      add_source=add_source,
                                      dictionary_output=dictionary_output))
                    | 'Tuple: %s' % label >> beam.Map(lambda x: (x[pk], x[bk])))

        for key, value in self.indexes.items():
            self.pcols[value['name']] = read_index_file(
                key, self.get_index_location(key) + '*',
                value['pk'], value['bk'])
            # self.pcols[value['name']] | 'print3' + value['name'] >> beam.Map(print_line)

        for key, value in self.hubs.items():
            self.pcols[value['name']] = read_data_file(
                key,
                self.get_data_location(key, value.get('include_date', None)) + '*',
                value['pk'],
                add_source=True)
            self.pcols['full_' + value['name']] = \
                self.pcols[value['name']] | 'full_' + key >> beam.Map(generate_bk, value['bk'])
            self.pcols['bk_' + value['name']] = \
                self.pcols['full_' + value['name']] | 'bk_' + key >> beam.Map(filter_bk)
            self.pcols['extract_' + value['name']] = \
                self.pcols['full_' + value['name']] | 'extract_' + value['name'] >> beam.FlatMap(extract_data_rec)
            self.pcols['extract_' + value['name']] | 'Write_full_' + value['name'] >> beam.io.Write(
                CsvFileSink(self.get_staging_location(value['name'], 'current'),
                            header=value['header']))
            # self.pcols['bk_' + value['name']] | 'print' + value['name'] >> beam.Map(print_line)

        for (index, data, intersection, new, removed, header) in self.intersections:
            self.pcols[intersection] = \
                ({index: self.pcols[index],
                  data: self.pcols['bk_' + data]}) | intersection >> beam.CoGroupByKey()
            self.pcols[new] = self.pcols[intersection] | new + '_' + data >> beam.FlatMap(check_missing, index)
            self.pcols[removed] = self.pcols[intersection] | removed + '_' + data >> beam.FlatMap(check_missing, data)

            updated_index = self.pcols[intersection] | 'updatedindex_' + data >> beam.FlatMap(grab_from_either, index, data)
            updated_index | 'Write_' + index >> beam.io.Write(
                CsvTupleFileSink(self.get_index_location(index), header=header))

            self.pcols[new] | 'new_' + data >> beam.FlatMap(grab_from_either, index, data) \
                | 'Write_new_' + data >> beam.io.Write(
                    CsvTupleFileSink(self.get_staging_location(data, 'new'), header=header))
            self.pcols[removed] | 'removed_' + data >> beam.FlatMap(grab_from_either, index, data) \
                | 'Write_removed_' + data >> beam.io.Write(
                    CsvTupleFileSink(self.get_staging_location(data, 'removed'), header=header))
            # self.pcols[intersection] | 'print2' + data >> beam.Map(print_line)

        for key, value in self.links.items():
            first = True
            for pk, lkey, bk, nk, dataset in zip(value['pk'], value['lkey'],
                                                 value['bk'], value['nk'],
                                                 value['data']):
                if first:
                    self.pcols[key] = read_data_file(
                        key,
                        self.get_data_location(key, value.get('include_date', None)) + '*',
                        pk,
                        add_source=True)
                    self.pcols['grouped_by_' + pk] = \
                        ({key: self.pcols[key],
                          bk: self.pcols[dataset]}) | 'grouped_by_' + pk >> beam.CoGroupByKey()
                    last = 'remap_' + pk
                    self.pcols[last] = \
                        self.pcols['grouped_by_' + pk] | 'remap_' + pk >> beam.FlatMap(co_group_op_1, key, bk, nk, lkey)
                    # self.pcols[last] | 'print2' + data >> beam.Map(print_line)
                else:
                    self.pcols['grouped_by_' + pk] = \
                        ({last: self.pcols[last],
                          bk: self.pcols[dataset]}) | 'grouped_by_' + pk >> beam.CoGroupByKey()
                    new_last = 'remap_' + pk
                    self.pcols[new_last] = \
                        self.pcols['grouped_by_' + pk] | 'remap_' + pk >> beam.FlatMap(co_group_op_1, last, bk, nk, lkey)
                    # self.pcols[new_last] | 'print2' + data >> beam.Map(print_line)
                    last = new_last
                first = False

            self.pcols['extract_' + key] = \
                self.pcols[last] | 'extract_' + key >> beam.FlatMap(extract_data_rec)
            self.pcols['extract_' + key] | 'Write_' + key >> beam.io.Write(
                CsvFileSink(self.get_staging_location(key, 'current'),
                            header=value['header']))
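# run() expects self.pipeline_args to hold the Beam-specific command line
# flags. Below is a hypothetical driver showing one way to produce them; the
# class name ExtractPipeline, its constructor arguments and the flag names
# are assumptions, not part of this module.
import argparse


def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-path', required=True,
                        help='Root location of the source extracts.')
    parser.add_argument('--staging-path', required=True,
                        help='Root location for the staging output.')
    # Anything not recognised here (e.g. --runner, --project) is handed to Beam.
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline = ExtractPipeline(known_args.input_path,
                               known_args.staging_path,
                               pipeline_args)
    pipeline.run()


if __name__ == '__main__':
    main()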