Example #1
    def process_hub(self,
                    hub_name,
                    pk,
                    bkey_list,
                    field_list,
                    foreign_keys=None):
        ext_field_list = \
            field_list + [CONST_BK_FIELD, CONST_SOURCE_FIELD, CONST_LOADDTM_FIELD, CONST_STATUS_FIELD]

        with beam.Pipeline(options=self.pipeline_options) as p:
            # First set up a stream for the data
            data = read_file(
                p, hub_name,
                self.get_psa_location('public.{0}'.format(hub_name)) + '*', pk)

            index = None
            try:
                # Also set up a stream for the index
                index = read_file(p, '{0}index'.format(hub_name),
                                  self.get_index('hub_{0}*'.format(hub_name)),
                                  pk)
            except IOError:
                logging.info("Could not open index, maybe doesn't exist")
                # create an empty pcollection, so we can at least run
                index = p | beam.Create([])

            # Generate business keys, checksum, dv_source, load_dtm
            preproc_data = data | 'preprocess_' + hub_name >> \
                beam.Map(add_hub_dv_details, bkey_list, self.source)

            if foreign_keys:
                # Resolve foreign keys on the preprocessed records so the
                # resolved values feed the merge below
                preproc_data = self.resolve_foreign_keys(
                    hub_name=hub_name,
                    pk=pk,
                    data=preproc_data,
                    foreign_keys=foreign_keys,
                    pipeline=p)

            # Group with index to be able to identify new, updated, deleted
            merge = ({
                'data': preproc_data,
                'index': index
            }) | 'grouped_by_' + pk >> beam.CoGroupByKey()

            # Extract the data out of the records (still has index/data dict in there)
            extract = merge \
                | 'filter_' + hub_name >> beam.Filter(filter_data_rows) \
                | 'extract_' + hub_name >> beam.Map(extract_data)

            # Write them out to disk in loading area
            extract | 'Write_' + hub_name >> beam.io.Write(
                CsvFileSink(self.get_loading_location(
                    'public.{0}'.format(hub_name)),
                            header=ext_field_list))

            # Update the index
            updated_index = merge | 'updated_index_' + hub_name >> beam.Map(
                hub_select_index_or_data, pk)
            updated_index | 'Write_index_' + hub_name >> beam.io.Write(
                CsvFileSink(self.get_index('hub_{0}'.format(hub_name)),
                            header=[CONST_BK_FIELD, CONST_CKSUM_FIELD, pk]))

    def process_link(self, link_name, bkey_list, field_list, foreign_keys):
        ext_field_list = \
            [LINK_KEY, CONST_SOURCE_FIELD, CONST_LOADDTM_FIELD, CONST_STATUS_FIELD] + \
            field_list

        keys = [t[1] for t in foreign_keys]
        generated_pk_name = '|'.join(keys)

        with beam.Pipeline(options=self.pipeline_options) as p:
            data = read_file(
                p, link_name,
                self.get_psa_location('public.{0}'.format(link_name)) + '*')

            index = None
            try:
                # Also set up a stream for the index
                index = read_file(
                    p, '{0}index'.format(link_name),
                    self.get_source_index('link_{0}*'.format(link_name)),
                    LINK_KEY)
            except IOError:
                logging.info("Could not open index, maybe doesn't exist")
                # create an empty pcollection, so we can at least run
                index = p | beam.Create([])

            preproc_data = data | 'preprocess_' + link_name >> \
                beam.Map(add_link_dv_details, foreign_keys, self.source)

            # preproc_data | 'print_{0}'.format(link_name) >> beam.Map(print_index)

            preproc_data = self.resolve_foreign_keys(hub_name=link_name,
                                                     pk=LINK_KEY,
                                                     data=preproc_data,
                                                     foreign_keys=foreign_keys,
                                                     pipeline=p)

            # Group with index to be able to identify new, updated, deleted
            merge = ({
                'data': preproc_data,
                'index': index
            }) | 'grouped_by_' + generated_pk_name >> beam.CoGroupByKey()

            # Extract the data out of the records (still has index/data dict in there)
            extract = merge \
                | 'filter_' + link_name >> beam.Filter(filter_data_rows) \
                | 'extract_' + link_name >> beam.Map(extract_data)

            # Write them out to disk in staging
            extract | 'Write_' + link_name >> beam.io.Write(
                CsvFileSink(self.get_loading_location(
                    'public.{0}'.format(link_name)),
                            header=ext_field_list))

            # Update the index
            updated_index = merge | 'updated_index_' + link_name >> beam.Map(
                link_select_index_or_data, LINK_KEY)
            updated_index | 'Write_index_' + link_name >> beam.io.Write(
                CsvFileSink(self.get_target_index(
                    'link_{0}'.format(link_name)),
                            header=[CONST_CKSUM_FIELD, LINK_KEY]))

    def process_table(self, hub_name, pk, field_list):
        ext_field_list = \
            field_list + [CONST_LOADDTM_FIELD, CONST_STATUS_FIELD]

        with beam.Pipeline(options=self.pipeline_options) as p:
            # First set up a stream for the data
            data = read_file(
                p, hub_name,
                self.get_staging_location('public.{0}'.format(hub_name)) + '*',
                pk)

            index = None
            try:
                # Also set up a stream for the index
                index = read_file(
                    p, '{0}index'.format(hub_name),
                    self.get_source_index('hub_{0}*'.format(hub_name)), pk)
            except IOError:
                logging.info("Could not open index, maybe doesn't exist")
                # create an empty pcollection, so we can at least run
                index = p | beam.Create([])

            # Generate business keys, checksum, dv_source, load_dtm
            preproc_data = data | 'preprocess_' + hub_name >> \
                beam.Map(add_cksum)

            # Group with index to be able to identify new, updated, deleted
            merge = ({
                'data': preproc_data,
                'index': index
            }) | 'grouped_by_' + pk >> beam.CoGroupByKey()

            # Extract the data out of the records (still has index/data dict in there)
            extract = merge \
                | 'filter_' + hub_name >> beam.Filter(filter_unchanged_rows) \
                | 'extract_' + hub_name >> beam.Map(extract_data)

            # Write them out to disk in staging
            extract | 'Write_' + hub_name >> beam.io.Write(
                CsvFileSink(self.get_psa_location(
                    'public.{0}'.format(hub_name)),
                            header=ext_field_list))

    def process_link(self, link_name, field_list, foreign_keys):
        ext_field_list = field_list + [CONST_LOADDTM_FIELD, CONST_STATUS_FIELD]
        generated_pk_name = '|'.join(foreign_keys)

        with beam.Pipeline(options=self.pipeline_options) as p:
            data = read_file(
                p, link_name,
                self.get_staging_location('public.{0}'.format(link_name)) +
                '*')

            preproc_data = data | 'preprocess_' + link_name >> \
                beam.Map(add_link_cksum, foreign_keys)

            index = None
            try:
                # Also set up a stream for the index
                index = read_file(
                    p, '{0}index'.format(link_name),
                    self.get_index('link_{0}*'.format(link_name)), LINK_KEY)
            except IOError:
                logging.info("Could not open index, maybe doesn't exist")
                # create an empty pcollection, so we can at least run
                index = p | beam.Create([])

            # Group with index to be able to identify new, updated, deleted
            merge = ({
                'data': preproc_data,
                'index': index
            }) | 'grouped_by_' + generated_pk_name >> beam.CoGroupByKey()

            # Extract the data out of the records (still has index/data dict in there)
            extract = merge \
                | 'filter_' + link_name >> beam.Filter(filter_unchanged_rows) \
                | 'extract_' + link_name >> beam.Map(extract_data)

            # Write them out to disk in staging
            extract | 'Write_' + link_name >> beam.io.Write(
                CsvFileSink(self.get_psa_location(
                    'public.{0}'.format(link_name)),
                            header=ext_field_list))
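
The hub, link, and table methods above all follow the same core pattern: key the incoming data and the previously written index by the same primary key, CoGroupByKey the two streams, and keep only the rows that are new or whose checksum changed. The snippet below is a minimal, self-contained sketch of that merge step on the DirectRunner; the inline records, the 'cksum' field name, and the keep_changed() helper are hypothetical stand-ins for preproc_data, the index, and filter_data_rows.

import apache_beam as beam

CKSUM = 'cksum'  # hypothetical checksum field name


def keep_changed(element):
    # element is (key, {'data': [...], 'index': [...]}) after CoGroupByKey;
    # keep keys that carry data and are either new or have a changed checksum.
    _, grouped = element
    data_rows = list(grouped['data'])
    index_rows = list(grouped['index'])
    if not data_rows:
        return False   # index-only key: nothing to load this run
    if not index_rows:
        return True    # key not in the index yet: new row
    return data_rows[0][CKSUM] != index_rows[0][CKSUM]


with beam.Pipeline() as p:  # DirectRunner by default
    data = p | 'data' >> beam.Create([
        ('1', {CKSUM: 'aaa', 'name': 'unchanged'}),
        ('2', {CKSUM: 'ccc', 'name': 'changed'}),
        ('3', {CKSUM: 'ddd', 'name': 'new'}),
    ])
    index = p | 'index' >> beam.Create([
        ('1', {CKSUM: 'aaa'}),
        ('2', {CKSUM: 'bbb'}),
    ])
    merge = ({'data': data, 'index': index}
             | 'grouped_by_pk' >> beam.CoGroupByKey())
    (merge
     | 'filter' >> beam.Filter(keep_changed)
     | 'extract' >> beam.Map(lambda kv: list(kv[1]['data'])[0])
     | 'print' >> beam.Map(print))  # emits only the 'changed' and 'new' rows
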
Example #5
    def run(self):
        pipeline_options = PipelineOptions(self.pipeline_args)
        pipeline_options.view_as(SetupOptions).save_main_session = True
        with beam.Pipeline(options=pipeline_options) as p:
            # Helper: read records with CsvFileSource and key each row by the
            # given primary key, producing a PCollection of (key, row) pairs.
            def read_data_file(label, file_pattern, pk, add_source=False, dictionary_output=True):
                return (p
                    | 'Read: %s' % label >> beam.io.Read(CsvFileSource(file_pattern, 
                                                                       add_source=add_source,
                                                                       dictionary_output=dictionary_output))
                    | 'Key: %s' % label >> beam.Map(lambda x: (x[pk], x)))

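            # Helper: like read_data_file, but emit (primary key, business key)
            # tuples for the index instead of full rows.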
            def read_index_file(label, file_pattern, pk, bk, add_source=False, dictionary_output=True):
                return (p
                    | 'Read: %s' % label >> beam.io.Read(CsvFileSource(file_pattern, 
                                                                       add_source=add_source,
                                                                       dictionary_output=dictionary_output))
                    | 'Tuple: %s' % label >> beam.Map(lambda x: (x[pk], x[bk])))

            for key, value in self.indexes.items():
                self.pcols[value['name']] = read_index_file(key,
                    self.get_index_location(key) + '*',
                    value['pk'],
                    value['bk'])
                # self.pcols[value['name']] | 'print3' + value['name'] >> beam.Map(print_line) 

            for key, value in self.hubs.items():
                self.pcols[value['name']] = read_data_file(key,
                    self.get_data_location(key, value.get('include_date', None)) + '*',
                    value['pk'],
                    add_source=True)
                self.pcols['full_' + value['name']] = \
                    self.pcols[value['name']] | 'full_' + key >> beam.Map(generate_bk, value['bk'])
                self.pcols['bk_' + value['name']] = \
                    self.pcols['full_' + value['name']] | 'bk_' + key >> beam.Map(filter_bk)

                self.pcols['extract_' + value['name']] = \
                    self.pcols['full_' + value['name']] | 'extract_' + value['name'] >> beam.FlatMap(extract_data_rec)

                self.pcols['extract_' + value['name']] | 'Write_full_' + value['name'] >> beam.io.Write(
                    CsvFileSink(self.get_staging_location(value['name'], 'current'), header=value['header']))

                # self.pcols['bk_' + value['name']] | 'print' + value['name'] >> beam.Map(print_line) 

            for (index, data, intersection, new, removed, header) in self.intersections:
                self.pcols[intersection] = \
                    ({index: self.pcols[index], 
                        data: self.pcols['bk_' + data]}) | intersection >> beam.CoGroupByKey()
                self.pcols[new] = self.pcols[intersection] | new + '_' + data >> beam.FlatMap(check_missing, index)
                self.pcols[removed] = self.pcols[intersection] | removed + '_' + data >> beam.FlatMap(check_missing, data)

                updated_index = self.pcols[intersection] | 'updatedindex_' + data >> beam.FlatMap(grab_from_either, index, data)
                updated_index | 'Write_' + index >> beam.io.Write(
                    CsvTupleFileSink(self.get_index_location(index), header=header))

                self.pcols[new] | 'new_' + data >> beam.FlatMap(grab_from_either, index, data) \
                    | 'Write_new_' + data >> beam.io.Write(
                        CsvTupleFileSink(self.get_staging_location(data, 'new'), header=header))
                self.pcols[removed] | 'removed_' + data >> beam.FlatMap(grab_from_either, index, data) \
                    | 'Write_removed_' + data >> beam.io.Write(
                        CsvTupleFileSink(self.get_staging_location(data, 'removed'), header=header))

                # self.pcols[intersection] | 'print2' + data >> beam.Map(print_line) 

            for key, value in self.links.items():
                first = True
                for pk, lkey, bk, nk, dataset in zip(value['pk'], value['lkey'], value['bk'], value['nk'], value['data']):
                    if first:
                        self.pcols[key] = read_data_file(key,
                            self.get_data_location(key, value.get('include_date', None)) + '*',
                            pk,
                            add_source=True)
                        self.pcols['grouped_by_' + pk] = \
                            ({key: self.pcols[key], bk: self.pcols[dataset]}) | 'grouped_by_' + pk >> beam.CoGroupByKey()
                        last = 'remap_' + pk
                        self.pcols[last] = \
                            self.pcols['grouped_by_' + pk] | 'remap_' + pk >> beam.FlatMap(co_group_op_1, key, bk, nk, lkey)
                        # self.pcols[last] | 'print2' + data >> beam.Map(print_line)
                    else:
                        self.pcols['grouped_by_' + pk] = \
                            ({last: self.pcols[last], bk: self.pcols[dataset]}) | 'grouped_by_' + pk >> beam.CoGroupByKey()
                        new_last = 'remap_' + pk
                        self.pcols[new_last] = \
                            self.pcols['grouped_by_' + pk] | 'remap_' + pk >> beam.FlatMap(co_group_op_1, last, bk, nk, lkey)
                        # self.pcols[new_last] | 'print2' + data >> beam.Map(print_line) 
                        last = new_last
                    first = False

                self.pcols['extract_' + key] = \
                    self.pcols[last] | 'extract_' + key >> beam.FlatMap(extract_data_rec)

                self.pcols['extract_' + key] | 'Write_' + key >> beam.io.Write(
                    CsvFileSink(self.get_staging_location(key, 'current'), header=value['header']))
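
In run() above, each entry in self.intersections relies on the same co-group trick: after CoGroupByKey, a key that appears on only one side of the index/data pair is either new or removed. Below is a minimal sketch of that step on the DirectRunner; find_missing() is a hypothetical stand-in for check_missing, and the inline tuples stand in for the keyed index and data PCollections.

import apache_beam as beam


def find_missing(element, side):
    # After CoGroupByKey the element is (key, {'index': [...], 'data': [...]});
    # emit the key when the named side has no entries for it.
    key, grouped = element
    if not list(grouped[side]):
        yield key


with beam.Pipeline() as p:  # DirectRunner by default
    index = p | 'index' >> beam.Create([('1', 'bk1'), ('2', 'bk2')])
    data = p | 'data' >> beam.Create([('2', {'id': '2'}), ('3', {'id': '3'})])
    intersection = ({'index': index, 'data': data}
                    | 'intersect' >> beam.CoGroupByKey())
    # Keys missing from the index are new; keys missing from the data were removed.
    new = intersection | 'new' >> beam.FlatMap(find_missing, 'index')
    removed = intersection | 'removed' >> beam.FlatMap(find_missing, 'data')
    new | 'print_new' >> beam.Map(lambda k: print('new:', k))              # -> '3'
    removed | 'print_removed' >> beam.Map(lambda k: print('removed:', k))  # -> '1'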