def envia_arquivo_s3(param_df, identificador_arquivo):
    # upload to the akron bucket - dataframe already cleaned in step 1
    s3_resource = conexao.connect_s3().s3_client_akron_upload.resource('s3')

    try:
        date = '2020-05-04'  # hardcoded; originally datetime.date_time_zn().data
        bucket_name = 'pagseguro-akron-qa'
        # object_key = f'upload_arq/upload_{identificador_arquivo}-{date}.snappy.parquet'  # parquet output disabled: lib issue on local Windows
        object_key = f'upload_arq/upload_{identificador_arquivo}-{date}.csv'
        csv_buffer = StringIO()
        param_df.to_csv(csv_buffer, sep=";", index=False)
        # param_df.to_parquet(csv_buffer, engine='pyarrow', compression='snappy')  # parquet output disabled: lib issue on local Windows
        s3_resource.Object(bucket_name,
                           object_key).put(Body=csv_buffer.getvalue())

        log.logger.info('Upload success to S3 ...')
        return True
    except FileNotFoundError:
        log.logger.info("The file was not found")
        return False
    except NoCredentialsError:
        log.logger.info("Credentials not available")
        return False
    except Exception as err:
        log.logger.error(err)
        return False
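The commented-out parquet lines above show the upload the author intended before hitting a local Windows library issue. A minimal sketch of that path, assuming pyarrow is installed and using a plain boto3 resource instead of the project's conexao helper; the function name envia_parquet_s3, the bucket and the key layout simply mirror the placeholders above:

# Hedged sketch: write the dataframe as snappy parquet into an in-memory buffer
# and put it to S3. Parquet is binary, so BytesIO is used instead of StringIO.
import io

import boto3
import pandas as pd


def envia_parquet_s3(param_df: pd.DataFrame, identificador_arquivo: str, date: str) -> bool:
    s3_resource = boto3.resource('s3')  # assumption: default credentials chain, not the conexao helper
    buffer = io.BytesIO()
    param_df.to_parquet(buffer, engine='pyarrow', compression='snappy')
    object_key = f'upload_arq/upload_{identificador_arquivo}-{date}.snappy.parquet'
    s3_resource.Object('pagseguro-akron-qa', object_key).put(Body=buffer.getvalue())
    return True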
def busca_arquivo_s3(identificador_arquivo):
    date = '2020-05-04'  # hardcoded; originally datetime.date_time_zn().data
    bucket_name = 'meu-time-qa'  # bucket name
    object_key = f'repositorio/{identificador_arquivo}-{date}.csv'  # adjust the repository name as needed
    conn_s3 = conexao.connect_s3().s3_client
    try:
        log.logger.info('extract datasets from S3....')
        csv_obj = conn_s3.get_object(Bucket=bucket_name, Key=object_key)
        #print(csv_obj)
    except Exception as err:
        log.logger.error(err)
        raise err

    # read the S3 object body (can be slow for large files)
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')
    dfpag = StringIO(csv_string)
    return dfpag
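The function returns a StringIO buffer rather than a DataFrame, so the caller still has to parse it. A hypothetical usage, assuming the stored CSV uses the same ';' separator as envia_arquivo_s3:

import pandas as pd

buffer = busca_arquivo_s3('pagamentos')          # 'pagamentos' is a made-up identifier
dfpag = pd.read_csv(buffer, sep=';', dtype=str)  # assumption: ';'-separated CSV, as in the upload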
Example #3
def read_jsonl(file, bucket, prefix_output):
    try:
        file_input = f'./raw_files/{file}.jsonl'
        parquet_file_input = '{}/{}.snappy.parquet'.format(var.output_file, file)
        parquet_file_output = '{}.snappy.parquet'.format(file)
        file_error_input = '{}/fail/{}.txt'.format(var.output_file, file)
        file_error_output = 'fail/{}.txt'.format(file)
        path_output = prefix_output.format(file, fdt.datetime_str.year, fdt.datetime_str.month, fdt.datetime_str.day, parquet_file_output)
        path_output_error = prefix_output.format(file, fdt.datetime_str.year, fdt.datetime_str.month, fdt.datetime_str.day, file_error_output)
        print('reading and processing dataset.. {}'.format(file))
        log.logger.info('reading and processing dataset.. {}'.format(file))
        with open(file_input) as raw_jsonl:
            crude_file = raw_jsonl.read()
        result = {
                "success": [],
                "error": []
                }
        for i in crude_file.splitlines():
            try:
                result['success'].append(json.loads(i))
            except json.JSONDecodeError:
                log.logger.warning('Error in json loads {}'.format(i))
                result['error'].append(i)
        df_success = pd.DataFrame(result['success'])
        df_success.columns = map(str.lower, df_success.columns)
        df_error = pd.DataFrame(result['error'])
        try:
            print('saving data into file.. {} : {} rows'.format(file, df_success.shape[0]))
            log.logger.info('saving data into file.. {} : {} rows'.format(file, df_success.shape[0]))
            df_success.to_parquet(parquet_file_input, engine='pyarrow', compression='snappy')
        except Exception as erro:
            print('fail to save data on file')
            log.logger.error('fail to save data on file')
            log.logger.error(erro)
            sys.exit()
        try:
            # the error-rows file is optional; ignore failures writing it
            df_error.to_csv(file_error_input, sep=';', index=False, header=False, doublequote=False, escapechar='"')
        except Exception:
            pass
        try:
            try:
                print('trying to connect to AWS S3...')
                log.logger.info('trying to connect to AWS S3...')
                s3_resource = con.connect_s3()
                print('connected to AWS S3...')
                log.logger.info('connected to AWS S3...')
            except Exception as erro:
                print('failed to connect to AWS S3')
                log.logger.error('failed to connect to AWS S3')
                log.logger.error(erro)
                sys.exit()
            print('saving dataset to AWS S3...')
            log.logger.info('saving dataset to AWS S3...')
            s3_resource.meta.client.upload_file(Filename=parquet_file_input, Bucket=bucket, Key=path_output)
            try:
                # the error file may not exist when every line parsed successfully
                s3_resource.meta.client.upload_file(Filename=file_error_input, Bucket=bucket, Key=path_output_error)
            except Exception:
                pass
        except Exception as erro:
            print('fail to save file to AWS S3')
            log.logger.error('fail to save file to AWS S3')
            log.logger.error(erro)
            sys.exit()
        try:
            print('Cleaning files from path out')
            log.logger.info('Cleaning files from path out')
            os.remove(parquet_file_input)
            os.remove(file_error_input)
        except Exception as erro:
            print(erro) 
            log.logger.error(erro)
    except Exception as erro:
        log.logger.error(erro)
        print('fail to read and process dataset')
        log.logger.error('fail to read and process dataset')
        sys.exit()
    print('read and process dataset success')
    log.logger.info('read and process dataset success')
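read_jsonl expects prefix_output to be a format string with five '{}' slots, filled with the dataset name, year, month, day, and the output filename (see the two path_output assignments above). A hypothetical call; the bucket and prefix values below are placeholders:

read_jsonl(
    file='transactions',                               # reads ./raw_files/transactions.jsonl
    bucket='my-data-lake-qa',                          # placeholder bucket name
    prefix_output='raw/{}/year={}/month={}/day={}/{}'  # five slots: file, year, month, day, filename
)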
Example #4
def read_csv(file, bucket, prefix_output):
    try:
        new_df = pd.DataFrame()
        file_input = f'./raw_files/{file}.csv'
        parquet_file_input = '{}/{}.snappy.parquet'.format(
            var.output_file, file)
        parquet_file_output = '{}.snappy.parquet'.format(file)
        path_output = prefix_output.format(file, fdt.datetime_str.year,
                                           fdt.datetime_str.month,
                                           fdt.datetime_str.day,
                                           parquet_file_output)
        print('reading and processing dataset.. {}'.format(file))
        log.logger.info('reading and processing dataset.. {}'.format(file))
        for df in pd.read_csv(file_input, sep=',', dtype=str, chunksize=1000):
            df.columns = map(str.lower, df.columns)
            new_df = pd.concat([new_df, df])  # DataFrame.append was removed in pandas 2.x
            try:
                print('saving data into file.. {} : {} rows'.format(
                    file, new_df.shape[0]))
                log.logger.info('saving data into file.. {} : {} rows'.format(
                    file, new_df.shape[0]))
                new_df.to_parquet(parquet_file_input,
                                  engine='pyarrow',
                                  compression='snappy')
                print('data saved on file')
                log.logger.info('data saved on file')
            except Exception as erro:
                print('fail to save data on file')
                log.logger.error('fail to save data on file')
                log.logger.error(erro)
                sys.exit()
        try:
            try:
                print('trying to connect to AWS S3...')
                log.logger.info('trying to connect to AWS S3...')
                s3_resource = con.connect_s3()
                print('connected to AWS S3...')
                log.logger.info('connected to AWS S3...')
            except Exception as erro:
                print('failed to connect to AWS S3')
                log.logger.error('failed to connect to AWS S3')
                log.logger.error(erro)
                sys.exit()
            print('saving dataset to AWS S3...')
            log.logger.info('saving dataset to AWS S3...')
            s3_resource.meta.client.upload_file(Filename=parquet_file_input,
                                                Bucket=bucket,
                                                Key=path_output)
        except Exception as erro:
            print('fail to save file to AWS S3')
            log.logger.error('fail to save file to AWS S3')
            log.logger.error(erro)
            sys.exit()
        try:
            print('Cleaning files from path out')
            log.logger.info('Cleaning files from path out')
            os.remove(parquet_file_input)  # no error file is created in read_csv, so only the parquet is removed
        except Exception as erro:
            print(erro)
            log.logger.error(erro)
    except Exception as erro:
        log.logger.error(erro)
        print('fail to read and process dataset')
        log.logger.error('fail to read and process dataset')
        sys.exit()
    print('read and process dataset success')
    log.logger.info('read and process dataset success')