import json
import os
import sys
from io import StringIO

import pandas as pd
from botocore.exceptions import NoCredentialsError

# Project modules (conexao, con, log, var, fdt) are assumed to be imported elsewhere in the package.


def envia_arquivo_s3(param_df, identificador_arquivo):
    """Upload the dataframe processed in step 1 to the Akron bucket."""
    s3_resource = conexao.connect_s3().s3_client_akron_upload.resource('s3')
    try:
        date = '2020-05-04'  # TODO: replace hardcoded date with datetime.date_time_zn().data
        bucket_name = 'pagseguro-akron-qa'
        # Parquet output is disabled for now: the parquet library has issues on local Windows.
        # object_key = f'upload_arq/upload_{identificador_arquivo}-{date}.snappy.parquet'
        object_key = f'upload_arq/upload_{identificador_arquivo}-{date}.csv'
        csv_buffer = StringIO()
        param_df.to_csv(csv_buffer, sep=';', index=False)
        # param_df.to_parquet(csv_buffer, engine='pyarrow', compression='snappy')
        s3_resource.Object(bucket_name, object_key).put(Body=csv_buffer.getvalue())
        log.logger.info('Upload success to S3 ...')
        return True
    except FileNotFoundError:
        log.logger.error('The file was not found')
        return False
    except NoCredentialsError:
        log.logger.error('Credentials not available')
        return False
    except Exception as err:
        log.logger.error(err)
        return False
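# Minimal usage sketch (added for illustration, not part of the original pipeline):
# builds a tiny dataframe and pushes it through envia_arquivo_s3. The column names and
# the 'exemplo' identifier are assumptions; the function only runs when called explicitly.
def _exemplo_envia_arquivo_s3():
    sample_df = pd.DataFrame({'id': [1, 2], 'valor': [10.5, 20.0]})
    ok = envia_arquivo_s3(sample_df, 'exemplo')
    log.logger.info('upload example finished: %s', ok)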
def busca_arquivo_s3(identificador_arquivo):
    """Fetch a CSV file from S3 and return its contents as a StringIO buffer."""
    date = '2020-05-04'  # TODO: replace hardcoded date with datetime.date_time_zn().data
    bucket_name = 'meu-time-qa'  # bucket name
    object_key = f'repositorio/{identificador_arquivo}-{date}.csv'  # adjust the repository prefix if needed
    conn_s3 = conexao.connect_s3().s3_client
    try:
        log.logger.info('extract datasets from S3....')
        csv_obj = conn_s3.get_object(Bucket=bucket_name, Key=object_key)
    except Exception as err:
        log.logger.error(err)
        raise
    # Reading the S3 object body is slow for large files.
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')
    dfpag = StringIO(csv_string)
    return dfpag
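# Usage sketch (illustrative, not part of the original module): the buffer returned by
# busca_arquivo_s3 can be fed straight into pandas. The 'exemplo' identifier and the ';'
# separator are assumptions about how the stored file was written.
def _exemplo_busca_arquivo_s3():
    buffer = busca_arquivo_s3('exemplo')
    df = pd.read_csv(buffer, sep=';')
    log.logger.info('downloaded %s rows from S3', df.shape[0])
    return df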
def read_jsonl(file, bucket, prefix_output):
    """Read a local JSONL file, save valid rows as parquet, and upload the results to S3."""
    try:
        file_input = f'./raw_files/{file}.jsonl'
        parquet_file_input = '{}/{}.snappy.parquet'.format(var.output_file, file)
        parquet_file_output = '{}.snappy.parquet'.format(file)
        file_error_input = '{}/fail/{}.txt'.format(var.output_file, file)
        file_error_output = 'fail/{}.txt'.format(file)
        path_output = prefix_output.format(file, fdt.datetime_str.year, fdt.datetime_str.month,
                                           fdt.datetime_str.day, parquet_file_output)
        path_output_error = prefix_output.format(file, fdt.datetime_str.year, fdt.datetime_str.month,
                                                 fdt.datetime_str.day, file_error_output)

        print('reading and processing dataset.. {}'.format(file))
        log.logger.info('reading and processing dataset.. {}'.format(file))

        with open(file_input) as fh:
            crude_file = fh.read()

        # Split lines that parse as JSON from the ones that do not.
        result = {"success": [], "error": []}
        for i in crude_file.splitlines():
            try:
                result['success'].append(json.loads(i))
            except Exception:
                log.logger.warning('Error in json loads {}'.format(i))
                result['error'].append(i)

        df_success = pd.DataFrame(result['success'])
        df_success.columns = map(str.lower, df_success.columns)
        df_error = pd.DataFrame(result['error'])

        try:
            print('saving data into file.. {} : {} rows'.format(file, df_success.shape[0]))
            log.logger.info('saving data into file.. {} : {} rows'.format(file, df_success.shape[0]))
            df_success.to_parquet(parquet_file_input, engine='pyarrow', compression='snappy')
        except Exception as erro:
            print('fail to save data on file')
            log.logger.error('fail to save data on file')
            log.logger.error(erro)
            sys.exit()

        try:
            df_error.to_csv(file_error_input, sep=';', index=False, header=False,
                            doublequote=False, escapechar='"')
        except Exception:
            pass

        try:
            try:
                print('trying connect to AWS S3...')
                log.logger.info('trying connect to AWS S3...')
                s3_resource = con.connect_s3()
                print('connected to AWS S3...')
                log.logger.info('connected to AWS S3...')
            except Exception as erro:
                print('failed try to connect to AWS S3')
                log.logger.error('failed try to connect to AWS S3')
                log.logger.error(erro)
                sys.exit()

            print('saving dataset to AWS S3...')
            log.logger.info('saving dataset to AWS S3...')
            s3_resource.meta.client.upload_file(Filename=parquet_file_input, Bucket=bucket, Key=path_output)
            try:
                s3_resource.meta.client.upload_file(Filename=file_error_input, Bucket=bucket, Key=path_output_error)
            except Exception:
                pass
        except Exception as erro:
            print('fail to save file to AWS S3')
            log.logger.error('fail to save file to AWS S3')
            log.logger.error(erro)
            sys.exit()

        try:
            print('Cleaning files from path out')
            log.logger.info('Cleaning files from path out')
            os.remove(parquet_file_input)
            os.remove(file_error_input)
        except Exception as erro:
            print(erro)
            log.logger.error(erro)
    except Exception as erro:
        log.logger.error(erro)

    print('read and process dataset success')
    log.logger.info('read and process dataset success')
    sys.exit()
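# Usage sketch (illustrative): read_jsonl expects a prefix template with five positional
# placeholders (dataset name, year, month, day, output file name). The dataset name, bucket
# and template below are assumptions used only to show the call shape.
def _exemplo_read_jsonl():
    prefix = '{}/year={}/month={}/day={}/{}'
    read_jsonl('eventos', 'meu-bucket-qa', prefix)  # note: the function calls sys.exit() when it finishes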
def read_csv(file, bucket, prefix_output):
    """Read a local CSV file in chunks, save it as parquet, and upload the result to S3."""
    try:
        file_input = f'./raw_files/{file}.csv'
        parquet_file_input = '{}/{}.snappy.parquet'.format(var.output_file, file)
        parquet_file_output = '{}.snappy.parquet'.format(file)
        path_output = prefix_output.format(file, fdt.datetime_str.year, fdt.datetime_str.month,
                                           fdt.datetime_str.day, parquet_file_output)

        print('reading and processing dataset.. {}'.format(file))
        log.logger.info('reading and processing dataset.. {}'.format(file))

        # Collect chunks and concatenate them (DataFrame.append was removed in pandas 2.0).
        chunks = []
        for df in pd.read_csv(file_input, sep=',', dtype=str, chunksize=1000):
            df.columns = map(str.lower, df.columns)
            chunks.append(df)
        new_df = pd.concat(chunks, ignore_index=True)

        try:
            print('saving data into file.. {} : {} rows'.format(file, new_df.shape[0]))
            log.logger.info('saving data into file.. {} : {} rows'.format(file, new_df.shape[0]))
            new_df.to_parquet(parquet_file_input, engine='pyarrow', compression='snappy')
            print('data saved on file')
            log.logger.info('data saved on file')
        except Exception as erro:
            print('fail to save data on file')
            log.logger.error('fail to save data on file')
            log.logger.error(erro)
            sys.exit()

        try:
            try:
                print('trying connect to AWS S3...')
                log.logger.info('trying connect to AWS S3...')
                s3_resource = con.connect_s3()
                print('connected to AWS S3...')
                log.logger.info('connected to AWS S3...')
            except Exception as erro:
                print('failed try to connect to AWS S3')
                log.logger.error('failed try to connect to AWS S3')
                log.logger.error(erro)
                sys.exit()

            print('saving dataset to AWS S3...')
            log.logger.info('saving dataset to AWS S3...')
            s3_resource.meta.client.upload_file(Filename=parquet_file_input, Bucket=bucket, Key=path_output)
        except Exception as erro:
            print('fail to save file to AWS S3')
            log.logger.error('fail to save file to AWS S3')
            log.logger.error(erro)
            sys.exit()

        try:
            print('Cleaning files from path out')
            log.logger.info('Cleaning files from path out')
            # read_csv only writes the parquet file; there is no error file to remove here.
            os.remove(parquet_file_input)
        except Exception as erro:
            print(erro)
            log.logger.error(erro)
    except Exception as erro:
        log.logger.error(erro)

    print('read and process dataset success')
    log.logger.info('read and process dataset success')
    sys.exit()
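# Usage sketch (illustrative): same calling convention as read_jsonl, but the source file is
# ./raw_files/<name>.csv. The dataset name, bucket and prefix template are assumptions; with
# these values the parquet file lands under clientes/year=.../month=.../day=.../clientes.snappy.parquet.
def _exemplo_read_csv():
    prefix = '{}/year={}/month={}/day={}/{}'
    read_csv('clientes', 'meu-bucket-qa', prefix)  # note: the function calls sys.exit() when it finishes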