def create_local_file(self, award_type, source, agency_code, generate_since):
    """Generate complete file from SQL query and S3 bucket deletion files, then zip it locally.

    Builds an annotated \\copy query from the download source, runs it through psql into a
    working-directory CSV, optionally appends deletion records, and zips the result.

    Returns the path of the created zip file, or None when the generated CSV had no data rows.
    Raises subprocess.CalledProcessError if psql fails.
    """
    logger.info("Generating CSV file with creations and modifications")

    # Create file paths and working directory
    timestamp = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S%f")
    working_dir = f"{settings.CSV_LOCAL_PATH}_{agency_code}_delta_gen_{timestamp}/"
    if not os.path.exists(working_dir):
        os.mkdir(working_dir)
    agency_str = "All" if agency_code == "all" else agency_code
    source_name = f"FY(All)_{agency_str}_{award_type}_Delta_{datetime.strftime(date.today(), '%Y%m%d')}"
    source_path = os.path.join(working_dir, "{}.csv".format(source_name))

    # Create a unique temporary file with the raw query
    raw_quoted_query = generate_raw_quoted_query(source.row_emitter(None))  # None requests all headers

    # The raw query is a union of two other queries, each in parentheses. To do replacement we need to split out
    # each query, apply annotations to each of those, then recombine in a UNION
    csv_query_annotated = (
        "("
        + apply_annotations_to_sql(_top_level_split(raw_quoted_query, "UNION")[0].strip()[1:-1], source.human_names)
        + ") UNION ("
        + apply_annotations_to_sql(_top_level_split(raw_quoted_query, "UNION")[1].strip()[1:-1], source.human_names)
        + ")"
    )

    (temp_sql_file, temp_sql_file_path) = tempfile.mkstemp(prefix="bd_sql_", dir="/tmp")
    # Guarantee the temp SQL file and the working directory are removed even when generation
    # fails partway (previously they leaked on any exception, e.g. a psql failure)
    try:
        with open(temp_sql_file_path, "w") as file:
            file.write("\\copy ({}) To STDOUT with CSV HEADER".format(csv_query_annotated))

        logger.info("Generated temp SQL file {}".format(temp_sql_file_path))

        # Generate the csv with \copy, feeding the SQL file directly to psql's stdin.
        # (This replaces an intermediate `cat` Popen whose pipe was never closed and whose
        # process was never waited on, leaving a zombie behind.)
        with open(temp_sql_file_path, "rb") as sql_file:
            try:
                subprocess.check_output(
                    ["psql", "-o", source_path, os.environ["DOWNLOAD_DATABASE_URL"], "-v", "ON_ERROR_STOP=1"],
                    stdin=sql_file,
                    stderr=subprocess.STDOUT,
                )
            except subprocess.CalledProcessError as e:
                logger.exception(e.output)
                raise  # bare raise preserves the original traceback for the caller

        # Append deleted rows to the end of the file
        if not self.debugging_skip_deleted:
            self.add_deletion_records(source_path, working_dir, award_type, agency_code, source, generate_since)
        if count_rows_in_delimited_file(source_path, has_header=True, safe=True) > 0:
            # Split the CSV into multiple files and zip it up
            zipfile_path = "{}{}.zip".format(settings.CSV_LOCAL_PATH, source_name)
            logger.info("Creating compressed file: {}".format(os.path.basename(zipfile_path)))
            split_and_zip_data_files(zipfile_path, source_path, source_name, "csv")
        else:
            zipfile_path = None
    finally:
        # Cleanup: close the mkstemp descriptor, delete the temp SQL file, remove the working dir
        os.close(temp_sql_file)
        os.remove(temp_sql_file_path)
        shutil.rmtree(working_dir)

    return zipfile_path
def test_top_level_split_nesting_logic():
    """The splitter must only honor delimiters that sit outside any parentheses."""
    cases = [
        ("sdfoing SPLT noivijf", "SPLT", ["sdfoing ", " noivijf"]),
        ("() blah SP pojf", "SP", ["() blah ", " pojf"]),
        ("(SP) blah SP pojf", "SP", ["(SP) blah ", " pojf"]),
        ("(SP) blah SP (pojf SP) SP", "SP", ["(SP) blah ", " (pojf SP) SP"]),
        ("((SP)) blah ()SP (pojf SP) SP", "SP", ["((SP)) blah ()", " (pojf SP) SP"]),
    ]
    for text, delimiter, expected in cases:
        assert _top_level_split(text, delimiter) == expected
def test_top_level_split_unsplittable():
    """An input that never contains the delimiter must raise rather than return."""
    text_without_delimiter = "sdoiaghioweg oijfewiwje efoiwnovisper"
    with pytest.raises(Exception):
        _top_level_split(text_without_delimiter, "not in this")
def test_top_level_split_quote_logic():
    """Delimiters inside double-quoted sections must not trigger a split."""
    # Unquoted delimiter splits normally
    assert _top_level_split("oinodsg *** vsdoij", "***") == ["oinodsg ", " vsdoij"]
    # Quoted occurrence is skipped; only the bare occurrence splits
    assert _top_level_split('oinodsg "***"*** vsdoij', "***") == ['oinodsg "***"', " vsdoij"]