Ejemplo n.º 1
0
    def fill_data_gaps(self, **context) -> None:
        """Replace missing reference dates with a placeholder and push the rows to Redis.

        Records are read with ``self.get_list_redis(context['redis_key'])``.
        Every ``None`` in the last column is replaced by ``'01-01-1900'`` and
        each resulting row is pushed, as ``str(tuple)``, onto the Redis list
        named ``context['current_dag_name']``. When there are no records,
        nothing is pushed and no Redis connection is opened.

        Example:
            input:  ('ef57132e-0d25-4e75-bee6-158c01c5b360', 50000, None)
            output: ('ef57132e-0d25-4e75-bee6-158c01c5b360', 50000, '01-01-1900')

        Args:
            **context: must provide ``redis_key`` (where the records are read
                from) and ``current_dag_name`` (the Redis list that receives
                the completed rows).
        """
        list_records = self.get_list_redis(context['redis_key'])
        df = pd.DataFrame(data=list_records)
        if df.empty:
            # An empty frame has no columns, so selecting the last one below
            # would raise IndexError; there is nothing to push either way.
            self.log.warning(f"No records found for {context['redis_key']}; nothing to fill")
            return

        # The reference date is assumed to be the last field of every record.
        col_date_ref = df.columns[-1]
        # Assign the result back instead of using inplace=True on the column
        # selection: that form is chained assignment, which warns on
        # pandas 2.x and leaves `df` unchanged under Copy-on-Write.
        df[col_date_ref] = df[col_date_ref].replace(to_replace=[None], value='01-01-1900')
        records = [tuple(row) for row in df.to_numpy()]

        # Queue every push on one pipeline so they go out in a single round trip.
        pipe = RedisHook(self.redis_conn_id).get_conn().pipeline()
        for row in records:
            pipe.lpush(context['current_dag_name'], str(row))
        pipe.execute()
        self.log.info(f"\nSample rows:\n{df.head(5)}")
Ejemplo n.º 2
0
    def split_id_by_date(self, **context) -> None:
        """Split the stored records into one Redis list per date.

        Records are read with ``self.get_list_redis(context['redis_key'])``.
        For every date in ``context['list_current_dates']``, the records whose
        last field contains that date are pushed, as ``str(row)``, onto the
        Redis list named ``<current_dag_name>_<date>``.

        Example (redis keys):
            input: file_name_000000000000000
            output: file_name_000000000000000_01-01-1900, file_name_000000000000000_24-12-2020

        Args:
            **context: must provide ``redis_key``, ``current_dag_name`` and
                ``list_current_dates`` (an iterable of date strings).
        """
        self.log.info('Spliting IDs by date ...')
        pipe = RedisHook(self.redis_conn_id).get_conn().pipeline()
        list_records = self.get_list_redis(context['redis_key'])

        for date in context['list_current_dates']:
            # Containment test against the last field of each record; this
            # assumes that field is never None (TypeError otherwise).
            list_item = [item for item in list_records if date in item[-1]]

            name_redis_key = f"{context['current_dag_name']}_{date}"
            for row in list_item:
                pipe.lpush(name_redis_key, str(row))
            # Flush once per date so each key is fully written before it is logged.
            pipe.execute()
            self.log.info(f'{date} - Storaged at redis {name_redis_key} = {len(list_item)}')