def generate_export_query(source_query, limit, source, columns, file_format):
    if limit:
        source_query = source_query[:limit]
    query_annotated = apply_annotations_to_sql(generate_raw_quoted_query(source_query), source.columns(columns))
    options = FILE_FORMATS[file_format]["options"]
    return r"\COPY ({}) TO STDOUT {}".format(query_annotated, options)
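
# A minimal, self-contained sketch of the pattern above: an options clause is looked up per
# file format and appended to a client-side \COPY meta-command. The EXAMPLE_FILE_FORMATS
# contents are illustrative assumptions, not the project's actual FILE_FORMATS configuration.
EXAMPLE_FILE_FORMATS = {
    "csv": {"options": "WITH CSV HEADER"},
    "tsv": {"options": r"WITH CSV DELIMITER E'\t' HEADER"},
}


def example_export_statement(annotated_sql, file_format):
    # Wrap an already-annotated SELECT so psql streams the result to STDOUT
    options = EXAMPLE_FILE_FORMATS[file_format]["options"]
    return r"\COPY ({}) TO STDOUT {}".format(annotated_sql, options)


# example_export_statement('SELECT 1 AS "one"', "csv")
# -> '\\COPY (SELECT 1 AS "one") TO STDOUT WITH CSV HEADER'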
def create_local_file(self, award_type, source, agency_code, generate_since):
    """ Generate complete file from SQL query and S3 bucket deletion files, then zip it locally """
    logger.info("Generating CSV file with creations and modifications")

    # Create file paths and working directory
    timestamp = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S%f")
    working_dir = f"{settings.CSV_LOCAL_PATH}_{agency_code}_delta_gen_{timestamp}/"
    if not os.path.exists(working_dir):
        os.mkdir(working_dir)
    agency_str = "All" if agency_code == "all" else agency_code
    source_name = f"FY(All)_{agency_str}_{award_type}_Delta_{datetime.strftime(date.today(), '%Y%m%d')}"
    source_path = os.path.join(working_dir, "{}.csv".format(source_name))

    # Create a unique temporary file with the raw query
    raw_quoted_query = generate_raw_quoted_query(source.row_emitter(None))  # None requests all headers

    # The raw query is a union of two other queries, each in parentheses. To do replacement we need to split out
    # each query, apply annotations to each of those, then recombine in a UNION
    csv_query_annotated = (
        "("
        + apply_annotations_to_sql(_top_level_split(raw_quoted_query, "UNION")[0].strip()[1:-1], source.human_names)
        + ") UNION ("
        + apply_annotations_to_sql(_top_level_split(raw_quoted_query, "UNION")[1].strip()[1:-1], source.human_names)
        + ")"
    )

    (temp_sql_file, temp_sql_file_path) = tempfile.mkstemp(prefix="bd_sql_", dir="/tmp")

    with open(temp_sql_file_path, "w") as file:
        file.write("\\copy ({}) To STDOUT with CSV HEADER".format(csv_query_annotated))

    logger.info("Generated temp SQL file {}".format(temp_sql_file_path))

    # Generate the csv with \copy
    cat_command = subprocess.Popen(["cat", temp_sql_file_path], stdout=subprocess.PIPE)
    try:
        subprocess.check_output(
            ["psql", "-o", source_path, os.environ["DOWNLOAD_DATABASE_URL"], "-v", "ON_ERROR_STOP=1"],
            stdin=cat_command.stdout,
            stderr=subprocess.STDOUT,
        )
    except subprocess.CalledProcessError as e:
        logger.exception(e.output)
        raise e

    # Append deleted rows to the end of the file
    if not self.debugging_skip_deleted:
        self.add_deletion_records(source_path, working_dir, award_type, agency_code, source, generate_since)
    if count_rows_in_delimited_file(source_path, has_header=True, safe=True) > 0:
        # Split the CSV into multiple files and zip it up
        zipfile_path = "{}{}.zip".format(settings.CSV_LOCAL_PATH, source_name)
        logger.info("Creating compressed file: {}".format(os.path.basename(zipfile_path)))
        split_and_zip_data_files(zipfile_path, source_path, source_name, "csv")
    else:
        zipfile_path = None

    os.close(temp_sql_file)
    os.remove(temp_sql_file_path)
    shutil.rmtree(working_dir)

    return zipfile_path
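
# Minimal sketch of the \copy-through-psql step above. Feeding the SQL file straight to
# psql's stdin makes the extra `cat` process unnecessary; the connection string and paths
# here are placeholders.
import os
import subprocess


def run_copy_file(sql_file_path, output_path, database_url):
    with open(sql_file_path, "rb") as sql_file:
        subprocess.check_output(
            ["psql", "-o", output_path, database_url, "-v", "ON_ERROR_STOP=1"],
            stdin=sql_file,
            stderr=subprocess.STDOUT,
        )


# run_copy_file("/tmp/bd_sql_xyz", "/tmp/out.csv", os.environ["DOWNLOAD_DATABASE_URL"])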
def generate_export_query(source_query, limit, source, columns, file_format, generate_export_query_function):
    if limit:
        source_query = source_query[:limit]
    query_annotated = apply_annotations_to_sql(generate_raw_quoted_query(source_query), source.columns(columns))
    options = FILE_FORMATS[file_format]["options"]
    return generate_export_query_function(source, query_annotated, options)
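
# Sketch of a callable matching the (source, query_annotated, options) signature that the
# parameterized generate_export_query above delegates to. This body is an assumption that
# simply mirrors the \COPY string built by the earlier, hard-coded version.
def example_generate_export_query(source, query_annotated, options):
    return r"\COPY ({}) TO STDOUT {}".format(query_annotated, options)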
def generate_temp_query_file(source_query, limit, source, download_job, columns):
    if limit:
        source_query = source_query[:limit]

    csv_query_annotated = apply_annotations_to_sql(generate_raw_quoted_query(source_query), source.columns(columns))

    write_to_log(
        message="Creating PSQL Query: {}".format(csv_query_annotated), download_job=download_job, is_debug=True
    )

    # Create a unique temporary file to hold the raw query, using \copy
    (temp_sql_file, temp_sql_file_path) = tempfile.mkstemp(prefix="bd_sql_", dir="/tmp")

    with open(temp_sql_file_path, "w") as file:
        file.write(r"\copy ({}) To STDOUT with CSV HEADER".format(csv_query_annotated))

    return temp_sql_file, temp_sql_file_path
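
# Self-contained demo of the tempfile.mkstemp contract relied on above: the returned
# descriptor is already open, so the caller owns both closing it and removing the path
# (create_local_file does this with os.close/os.remove after psql finishes).
import os
import tempfile

fd, path = tempfile.mkstemp(prefix="bd_sql_", dir="/tmp")
try:
    with os.fdopen(fd, "w") as f:  # fdopen adopts the descriptor, so no separate os.close
        f.write(r"\copy (SELECT 1) To STDOUT with CSV HEADER")
finally:
    os.remove(path)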
def fetch_all_category_counts(filters, category_to_model_dict):
    loop = asyncio.new_event_loop()
    results = {}
    for k, v in category_to_model_dict.items():
        queryset = matview_search_filter(filters, v).annotate(count=Count("*")).values("count")
        sql = generate_raw_quoted_query(queryset)

        # Django refuses to provide a viable option to exclude "GROUP BY ..." so it is stripped before running the SQL
        remove_groupby_string_index = sql.find("GROUP BY")
        if remove_groupby_string_index != -1:  # guard: str.find returns -1 when absent, which would chop the last char
            sql = sql[:remove_groupby_string_index]
        results[k] = asyncio.ensure_future(async_run_select(sql), loop=loop)

    all_statements = asyncio.gather(*[value for value in results.values()])
    loop.run_until_complete(all_statements)
    loop.close()

    return {k: v.result()[0]["count"] for k, v in results.items()}
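
# Minimal, self-contained version of the fan-out pattern above, written with asyncio.run
# instead of manual loop management (ensure_future's loop argument was removed in Python
# 3.10). fake_run_select stands in for async_run_select, assumed to return rows as dicts.
import asyncio


async def fake_run_select(sql):
    await asyncio.sleep(0)  # stand-in for a real database round trip
    return [{"count": 42}]


async def _gather_counts(sql_by_category):
    rows_per_category = await asyncio.gather(*(fake_run_select(sql) for sql in sql_by_category.values()))
    return {k: rows[0]["count"] for k, rows in zip(sql_by_category, rows_per_category)}


def gather_counts(sql_by_category):
    return asyncio.run(_gather_counts(sql_by_category))


# gather_counts({"awards": "SELECT ...", "agencies": "SELECT ..."})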
def create_local_file(self, award_type, source, agency_code, generate_since):
    """ Generate complete file from SQL query and S3 bucket deletion files, then zip it locally """
    logger.info("Generating CSV file with creations and modifications")

    # Create file paths and working directory
    timestamp = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S%f")
    working_dir = "{}_{}_delta_gen_{}/".format(settings.CSV_LOCAL_PATH, agency_code, timestamp)
    if not os.path.exists(working_dir):
        os.mkdir(working_dir)
    source_name = "{}_{}_Delta_{}".format(agency_code, award_type, datetime.strftime(date.today(), "%Y%m%d"))
    source_path = os.path.join(working_dir, "{}.csv".format(source_name))

    # Create a unique temporary file with the raw query
    raw_quoted_query = generate_raw_quoted_query(source.row_emitter(None))  # None requests all headers
    csv_query_annotated = self.apply_annotations_to_sql(raw_quoted_query, source.human_names)

    (temp_sql_file, temp_sql_file_path) = tempfile.mkstemp(prefix="bd_sql_", dir="/tmp")

    with open(temp_sql_file_path, "w") as file:
        file.write("\\copy ({}) To STDOUT with CSV HEADER".format(csv_query_annotated))

    logger.info("Generated temp SQL file {}".format(temp_sql_file_path))

    # Generate the csv with \copy
    cat_command = subprocess.Popen(["cat", temp_sql_file_path], stdout=subprocess.PIPE)
    try:
        subprocess.check_output(
            ["psql", "-o", source_path, os.environ["DOWNLOAD_DATABASE_URL"], "-v", "ON_ERROR_STOP=1"],
            stdin=cat_command.stdout,
            stderr=subprocess.STDOUT,
        )
    except subprocess.CalledProcessError as e:
        logger.exception(e.output)
        raise e

    # Append deleted rows to the end of the file
    self.add_deletion_records(source_path, working_dir, award_type, agency_code, source, generate_since)
    if count_rows_in_csv_file(source_path, has_header=True, safe=True) > 0:
        # Split the CSV into multiple files and zip it up
        zipfile_path = "{}{}.zip".format(settings.CSV_LOCAL_PATH, source_name)
        logger.info("Creating compressed file: {}".format(os.path.basename(zipfile_path)))
        split_and_zip_csvs(zipfile_path, source_path, source_name)
    else:
        zipfile_path = None

    os.close(temp_sql_file)
    os.remove(temp_sql_file_path)
    shutil.rmtree(working_dir)

    return zipfile_path
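
# Hypothetical, self-contained sketch of the split-and-zip step: chunk a CSV into numbered
# parts (repeating the header in each) and store them in a single zip archive. The real
# split_and_zip_csvs helper is project-internal; this only illustrates the technique.
import csv
import io
import zipfile


def split_and_zip(zip_path, csv_path, base_name, rows_per_part=1_000_000):
    with open(csv_path, newline="") as f, zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        reader = csv.reader(f)
        header = next(reader)
        part, rows = 1, []
        for row in reader:
            rows.append(row)
            if len(rows) == rows_per_part:
                _write_part(zf, base_name, part, header, rows)
                part, rows = part + 1, []
        if rows or part == 1:  # always emit at least one part
            _write_part(zf, base_name, part, header, rows)


def _write_part(zf, base_name, part, header, rows):
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(header)
    writer.writerows(rows)
    zf.writestr("{}_{}.csv".format(base_name, part), buf.getvalue())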