import logging
import os
import re
from datetime import datetime

import pandas as pd

# Project-level modules used throughout this section (`dataset`, `util`) are
# assumed to be imported elsewhere in the codebase.


def adjust_type(df, tb_name):
    logger = logging.getLogger('adjust_type')
    for meta in dataset.consult(tb_name)['fields'].values():
        nome_banco = meta['nome_banco']
        tipo = meta['tipo']
        date_format = meta['date_format']
        replace_null = meta['replace_null']
        if tipo == 'date':
            try:
                df[nome_banco] = df[nome_banco].apply(
                    lambda x: pd.to_datetime(replace_nan(x, replace_null),
                                             format=date_format,
                                             errors='coerce'))
                logger.info(f'type date success in {nome_banco}')
            except ValueError as ve:
                logger.error(f'{ve}: type date not recognized in {nome_banco}')
                raise
        elif tipo == 'int':
            try:
                df[nome_banco] = df[nome_banco].apply(
                    lambda x: replace_nan(x, replace_null)).astype(int)
                logger.info(f'type int success in {nome_banco}')
            except ValueError as ve:
                logger.error(f'{ve}: type int not recognized in {nome_banco}')
                raise
        elif tipo == 'float':
            try:
                df[nome_banco] = df[nome_banco].apply(
                    lambda x: replace_nan(x, replace_null)).astype(float)
                logger.info(f'type float success in {nome_banco}')
            except ValueError as ve:
                logger.error(f'{ve}: type float not recognized in {nome_banco}')
                raise
    return df
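
# `replace_nan` is used above but not defined in this section. A minimal
# sketch of the assumed behaviour (swap missing values for the field's
# configured `replace_null` default); this is a hypothetical reconstruction,
# not the project's actual helper:
def replace_nan(value, replace_null):
    # pd.isna covers None and float('nan'); empty strings are treated as
    # missing as well.
    if pd.isna(value) or value == '':
        return replace_null
    return value
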

def validator(df, tb_name):
    l_validator = []
    l_val = []
    for meta in dataset.consult(tb_name)['fields'].values():
        nome_banco = meta['nome_banco']
        nullable = meta['nullable']
        replace_null = meta['replace_null']
        tipo = meta['tipo']
        date_format = meta['date_format']
        regex = meta['regex']
        try:
            # Fields whose nome_banco parses as an integer (unnamed or
            # positional placeholders) are skipped; only named fields fall
            # through to validation.
            int(nome_banco)
        except ValueError:
            try:
                if (valnull(nullable, replace_null, df[nome_banco])
                        and valtype(tipo, date_format, replace_null, df[nome_banco])
                        and valregex(regex, nullable, replace_null, df[nome_banco])):
                    l_validator.append(True)
                    l_val.append({nome_banco: True})
                else:
                    l_validator.append(False)
                    l_val.append({nome_banco: False})
            except Exception:
                # Any error during validation marks the column invalid instead
                # of aborting the whole run.
                l_validator.append(False)
                l_val.append({nome_banco: False})
    if False in l_validator:
        return False, l_val
    else:
        return True, l_val
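
# `valnull`, `valtype` and `valregex` are project helpers not shown in this
# section. As an illustration of the expected contract, here is a minimal
# sketch of `valnull` alone, assuming a non-nullable column without a
# replacement default must contain no missing values (hypothetical
# reconstruction, not the project's actual implementation):
def valnull(nullable, replace_null, series):
    # Nullable columns, or columns with a configured default, always pass.
    if nullable or replace_null is not None:
        return True
    return not series.isna().any()
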

def transient2raw(self):
    tb_cols = [
        col['nome_banco']
        for col in dataset.consult(self.table_nm)['fields'].values()
    ]
    self.logger.info(f'List of columns dictionary {tb_cols}')
    # Source columns are positional integer labels; keep only those that have
    # a matching entry in the metadata column list.
    lcol = [x for x in self.df.columns if x < len(tb_cols)]
    dict_cols = dict(zip(lcol, tb_cols))
    self.logger.info(f'Create Dictionary Cols {self.table_nm}')
    self.df_out.append(self.df.rename(columns=dict_cols))
    self.logger.info(f'Rename Cols {self.table_nm}')
    return self.df_out
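
# Illustrative note: `lcol` compares column labels against an integer, which
# only works when the transient file was read with header=None so pandas
# labels the columns 0, 1, 2, ... A toy demonstration of the same
# positional-rename step (data and names below are made up):
raw = pd.DataFrame([[1, 'a'], [2, 'b']])            # columns are 0 and 1
renamed = raw.rename(columns=dict(zip([0, 1], ['id', 'nome'])))
print(list(renamed.columns))                        # ['id', 'nome']
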

def execute(self, context):
    ldatalake = [file['name'] for file in self.client.list_blobs()]
    self.logger.info('Create Datalake List')
    llocal = [
        os.path.join(currentpath, file).replace(self.source, '')[1:]
        for currentpath, folders, files in os.walk(self.source)
        for file in files
        if file != 'diff.txt' and 'meta' not in file
    ]
    self.logger.info(f'Create Local List {llocal}')
    # Incremental tables are shipped only when missing from the datalake;
    # full-load tables are always reprocessed.
    ldiffinc = [
        file for file in llocal
        if dataset.consult(file)['typerun'] == 'increment'
        and file not in ldatalake
    ]
    ldifffull = [
        file for file in llocal
        if dataset.consult(file)['typerun'] == 'full'
    ]
    ldiff = ldiffinc + ldifffull
    self.logger.info(f'Generate Diff List {ldiff}')
    return util.chunklist(ldiff, self.num_workers)
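
# `util.chunklist` is not defined in this section. A minimal sketch, assuming
# it splits the diff list into `num_workers` buckets so each downstream worker
# gets its own share (hypothetical reconstruction of the helper):
def chunklist(lst, n):
    # Round-robin distribution; buckets may be empty when len(lst) < n.
    return [lst[i::n] for i in range(n)]
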

def _comparedb(self, ldiff):
    lvalidate = []
    dictmax = {}
    for file in set(ldiff):
        tbname = f"bt_{dataset.tbname(file)}"
        tprun = dataset.consult(file)['typerun']
        if tprun == 'increment':
            try:
                maxval = dictmax[tbname]
            except KeyError:
                # Cache the max competence date per table to avoid repeating
                # the query for every file of the same table.
                conn = self.engine.connect()
                selectmax = (f'select max(mes_competencia) as mes_competencia '
                             f'from dictas.{tbname}')
                maxval = conn.execute(selectmax).fetchall()[0][0]
                dictmax.update({tbname: maxval})
            filedate = datetime.strptime(
                re.search(r"(\d+)", file.split('/')[1].replace('-', '')).group(),
                '%m%Y').date()
            # Incremental files are only valid when newer than what the
            # database already holds.
            if filedate > maxval:
                lvalidate.append(file)
        else:
            lvalidate.append(file)
    self.logger.info(f'Diff validate {lvalidate}')
    return lvalidate
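
# Illustrative only: the regex/strptime pair above pulls an MMYYYY competence
# date out of the second path segment. The file name below is a hypothetical
# example consistent with that parsing, not a real path from the project:
example = 'tabela/01-2023.csv'
digits = re.search(r"(\d+)", example.split('/')[1].replace('-', '')).group()
print(datetime.strptime(digits, '%m%Y').date())  # 2023-01-01
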

def check_exists(nmfile):
    if dataset.consult(nmfile)['typerun'] == 'full':
        return 'replace'
    else:
        return 'append'
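
# Hedged usage sketch: 'replace' and 'append' match the `if_exists` modes of
# pandas' DataFrame.to_sql, which is presumably where these return values are
# consumed (the variables below are illustrative, not from this section):
df.to_sql(tbname, engine, if_exists=check_exists(nmfile), index=False)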