Example #1
import os
import time
import zipfile

# split_csv, write_to_log, EXCEL_ROW_LIMIT, and logger are assumed to be
# defined in the surrounding download module.
def split_and_zip_csvs(zipfile_path, source_path, source_name, download_job):
    zipped_csvs = None  # bind the name so the finally block cannot raise NameError
    try:
        # Split CSV into separate files
        log_time = time.time()
        split_csvs = split_csv(
            source_path,
            row_limit=EXCEL_ROW_LIMIT,
            output_path=os.path.dirname(source_path),
            output_name_template='{}_%s.csv'.format(source_name))
        write_to_log(
            message='Splitting csvs took {} seconds'.format(time.time() -
                                                            log_time),
            download_job=download_job)

        # Zip the split CSVs into one zipfile
        log_time = time.time()
        zipped_csvs = zipfile.ZipFile(zipfile_path,
                                      'a',
                                      compression=zipfile.ZIP_DEFLATED,
                                      allowZip64=True)
        for split_csv_part in split_csvs:
            zipped_csvs.write(split_csv_part, os.path.basename(split_csv_part))

        write_to_log(
            message='Writing to zipfile took {} seconds'.format(time.time() -
                                                                log_time),
            download_job=download_job)
    except Exception as e:
        logger.error(e)
        raise  # bare raise preserves the original traceback
    finally:
        if zipped_csvs:
            zipped_csvs.close()
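
For context, here is a minimal usage sketch; the paths and the download_job object are hypothetical, and the function assumes the source CSV already exists on disk:

# Hypothetical call: split /tmp/downloads/awards.csv at the Excel row limit and
# collect the pieces into /tmp/downloads/awards.zip
split_and_zip_csvs(
    zipfile_path='/tmp/downloads/awards.zip',
    source_path='/tmp/downloads/awards.csv',
    source_name='awards',
    download_job=download_job)  # tracking object consumed by write_to_log

Opening the archive with a with-statement (zipfile.ZipFile supports the context-manager protocol) would make the explicit finally/close unnecessary.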
Example #2
    # Relies on the surrounding module's imports (os, shutil, subprocess, tempfile,
    # zipfile, datetime/date, Django settings) and download helpers such as
    # split_csv, csv_row_count, and generate_raw_quoted_query.
    def create_local_file(self, award_type, source, agency_code, generate_since):
        """Generate the complete file from the SQL query and S3 bucket deletion files, then zip it locally"""
        logger.info('Generating CSV file with creations and modifications')

        # Create file paths and working directory
        working_dir = settings.CSV_LOCAL_PATH + 'delta_gen/'
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)
        source_name = '{}_{}_delta'.format(award_type, VALUE_MAPPINGS['transactions']['download_name'])
        source_path = os.path.join(working_dir, '{}.csv'.format(source_name))

        # Create a unique temporary file with the raw query
        raw_quoted_query = generate_raw_quoted_query(source.row_emitter(None))  # None requests all headers
        csv_query_annotated = self.apply_annotations_to_sql(raw_quoted_query, source.human_names)
        (temp_sql_file, temp_sql_file_path) = tempfile.mkstemp(prefix='bd_sql_', dir='/tmp')
        with open(temp_sql_file_path, 'w') as sql_file:  # clearer name than the builtin-shadowing 'file'
            sql_file.write('\\copy ({}) To STDOUT with CSV HEADER'.format(csv_query_annotated))

        # Generate the csv with \copy: pipe the SQL file into psql, writing query
        # output to source_path and stopping on the first error
        cat_command = subprocess.Popen(['cat', temp_sql_file_path], stdout=subprocess.PIPE)
        subprocess.check_output(['psql', '-o', source_path, os.environ['DOWNLOAD_DATABASE_URL'], '-v',
                                 'ON_ERROR_STOP=1'], stdin=cat_command.stdout, stderr=subprocess.STDOUT)

        # Append deleted rows to the end of the file
        self.add_deletion_records(source_path, working_dir, award_type, agency_code, source, generate_since)
        if csv_row_count(source_path, has_header=True) > 0:
            # Split CSV into separate files
            split_csvs = split_csv(source_path, row_limit=EXCEL_ROW_LIMIT, output_path=os.path.dirname(source_path),
                                   output_name_template='{}_%s.csv'.format(source_name))

            # Zip the split CSVs into one zipfile
            zipfile_path = '{}{}_{}_Delta_{}.zip'.format(settings.CSV_LOCAL_PATH, agency_code, award_type,
                                                         datetime.strftime(date.today(), '%Y%m%d'))
            logger.info('Creating compressed file: {}'.format(os.path.basename(zipfile_path)))
            zipped_csvs = zipfile.ZipFile(zipfile_path, 'a', compression=zipfile.ZIP_DEFLATED, allowZip64=True)
            for split_csv_part in split_csvs:
                zipped_csvs.write(split_csv_part, os.path.basename(split_csv_part))
            zipped_csvs.close()  # finalize the archive before the working directory is removed
        else:
            zipfile_path = None

        os.close(temp_sql_file)
        os.remove(temp_sql_file_path)
        shutil.rmtree(working_dir)

        return zipfile_path
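
The psql pattern in Example #2 generalizes beyond delta files. Below is a minimal self-contained sketch of the same technique, streaming an arbitrary query to CSV through \copy; the function name and all parameters are placeholders, not part of the original module:

import os
import subprocess
import tempfile


def query_to_csv(database_url, query, output_path):
    """Write a SQL query's result to a CSV file via psql's \\copy meta-command."""
    fd, sql_path = tempfile.mkstemp(prefix='query_to_csv_', suffix='.sql')
    try:
        with open(sql_path, 'w') as sql_file:
            sql_file.write('\\copy ({}) To STDOUT with CSV HEADER'.format(query))
        with open(sql_path) as sql_file:
            # -o redirects psql's query output to output_path; ON_ERROR_STOP=1
            # makes psql exit non-zero on the first error, so check_output raises
            subprocess.check_output(
                ['psql', '-o', output_path, database_url, '-v', 'ON_ERROR_STOP=1'],
                stdin=sql_file, stderr=subprocess.STDOUT)
    finally:
        os.close(fd)
        os.remove(sql_path)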