def test_move_file_into_non_existing_directory(self):
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        dest_dirpath = tmpdir / "destination/directory"

        # Create a test file in tmpdir
        tmp_file_path = tmpdir / "test_file.txt"
        with open(tmp_file_path, "w+") as f:
            f.write("Test file.")

        self.assertIn("test_file.txt", os.listdir(tmpdir))

        # Move the test file, then check it was moved with its content intact
        move(tmp_file_path, dest_dirpath=dest_dirpath)

        self.assertNotIn("test_file.txt", os.listdir(tmpdir))
        self.assertIn("test_file.txt", os.listdir(dest_dirpath))
        with open(dest_dirpath / "test_file.txt", "r") as f:
            self.assertEqual(f.read(), "Test file.")
def extract_xmls_from_zipfile(zipfile: Union[None, dict]) -> Union[None, dict]:
    if zipfile:
        logger = prefect.context.get("logger")
        logger.info(f"Extracting zipfile {zipfile['full_name']}.")
        message_type = get_message_type(zipfile["full_name"])

        # Handle ERS3 messages and acknowledgement statuses
        if message_type in ["ERS3", "ERS3_ACK"]:
            with ZipFile(zipfile["input_dir"] / zipfile["full_name"]) as zipobj:
                xml_filenames = zipobj.namelist()
                xml_messages = []
                for xml_filename in xml_filenames:
                    with zipobj.open(xml_filename, mode="r") as f:
                        xml_messages.append(f.read().decode("utf-8"))
            zipfile["xml_messages"] = xml_messages
            return zipfile

        # Handle FLUX messages (move them to non_treated_directory
        # as there is currently no parser available)
        elif message_type in ["UN"]:
            logger.info(
                f"Skipping zipfile {zipfile['full_name']} of type {message_type}, "
                "which is currently not handled. "
                f"Moving {zipfile['full_name']} to non-treated directory."
            )
            move(
                zipfile["input_dir"] / zipfile["full_name"],
                zipfile["non_treated_dir"],
                if_exists="replace",
            )

        # Move unexpected file types to error directory
        else:
            logger.error(
                f"Unexpected message type '{message_type}' ({zipfile}). "
                f"Moving {zipfile['full_name']} to error directory."
            )
            move(
                zipfile["input_dir"] / zipfile["full_name"],
                zipfile["error_dir"],
                if_exists="replace",
            )
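# `get_message_type` is used above but not defined in this section. Below is a
# minimal, hypothetical sketch assuming the transmission format ("ERS3",
# "ERS3_ACK", "UN", ...) is encoded as a prefix of the zipfile name; the real
# naming convention is not shown here and may differ.
KNOWN_MESSAGE_TYPES = ["ERS3_ACK", "ERS3", "UN"]  # longest first, so "ERS3_ACK" wins over "ERS3"


def get_message_type(zipfile_name: str) -> str:
    # Return the first known type that prefixes the filename, or a sentinel
    # value that routes the file to the error directory.
    for message_type in KNOWN_MESSAGE_TYPES:
        if zipfile_name.startswith(message_type):
            return message_type
    return "UNKNOWN"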
def test_move_file_already_exists(self):
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        dest_dirpath = tmpdir / "destination/directory"
        os.makedirs(dest_dirpath)

        # Create a test file in tmpdir and in dest_dirpath
        tmp_file_path = tmpdir / "test_file.txt"
        with open(tmp_file_path, "w+") as f:
            f.write("New file.")
        with open(dest_dirpath / "test_file.txt", "w+") as f:
            f.write("Original file.")

        self.assertIn("test_file.txt", os.listdir(tmpdir))
        self.assertIn("test_file.txt", os.listdir(dest_dirpath))

        # Move the test file and check the result
        with self.assertRaises(shutil.Error):  # Raises an error by default...
            move(tmp_file_path, dest_dirpath)
        move(tmp_file_path, dest_dirpath, if_exists="replace")  # ...unless specified

        self.assertNotIn("test_file.txt", os.listdir(tmpdir))
        self.assertIn("test_file.txt", os.listdir(dest_dirpath))
        with open(dest_dirpath / "test_file.txt", "r") as f:
            self.assertEqual(f.read(), "New file.")

        # Test the if_exists argument
        with self.assertRaises(ValueError):
            move(tmp_file_path, dest_dirpath, if_exists="unexpected")
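# The two tests above pin down the expected behaviour of the `move` helper: it
# creates the destination directory if needed, refuses to overwrite an existing
# file by default (raising shutil.Error), replaces it when if_exists="replace",
# and raises ValueError for any other if_exists value. The helper itself is not
# shown in this section; the sketch below is a hypothetical implementation
# consistent with those tests, not the project's actual code (the "raise"
# default value in particular is a guess).
import os
import shutil
from pathlib import Path


def move(filepath: Path, dest_dirpath: Path, if_exists: str = "raise"):
    if if_exists not in ("raise", "replace"):
        raise ValueError(f"Unexpected value for if_exists: '{if_exists}'.")
    # Create the destination directory (and parents) if it does not exist yet
    os.makedirs(dest_dirpath, exist_ok=True)
    dest_filepath = dest_dirpath / filepath.name
    if dest_filepath.exists():
        if if_exists == "raise":
            raise shutil.Error(f"Destination path '{dest_filepath}' already exists.")
        os.remove(dest_filepath)
    shutil.move(str(filepath), str(dest_filepath))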
def extract_zipfiles(
    input_dir: pathlib.Path,
    treated_dir: pathlib.Path,
    non_treated_dir: pathlib.Path,
    error_dir: pathlib.Path,
) -> List[dict]:
    """Scans input_dir, in which ERS zipfiles are expected to be arranged in a
    hierarchy of folders like: year / month / zipfiles.

    Returns the found zipfiles along with the corresponding paths of:

    - the input_directory where the zipfile is located
    - the treated_directory where the zipfile will be transferred after
      integration into the monitorfish database
    - the non_treated_directory where the zipfile will be transferred if it is
      a FLUX type of message (currently not handled)
    - the error_directory if an error occurs during the treatment of messages
    """
    logger = prefect.context.get("logger")

    expected_months = [
        "1", "2", "3", "4", "5", "6", "7", "8", "9",
        "01", "02", "03", "04", "05", "06", "07", "08", "09",
        "10", "11", "12",
    ]
    expected_years = list(map(str, range(2000, 2050)))

    years = os.listdir(input_dir)
    res = []
    n = 0

    for year in years:
        if year not in expected_years:
            logger.warning(f"Unexpected year {year}. Skipping directory.")
            continue

        logger.info(f"Starting extraction of ERS messages for year {year}.")
        months = os.listdir(input_dir / year)

        for month in months:
            if month not in expected_months:
                logger.warning(f"Unexpected month {month}. Skipping directory.")
                continue

            logger.info(f"Starting extraction of ERS messages for {year}/{month}.")
            zipfile_input_dir = input_dir / year / month
            zipfile_treated_dir = treated_dir / year / month
            zipfile_non_treated_dir = non_treated_dir / year / month
            zipfile_error_dir = error_dir / year / month

            files = os.listdir(zipfile_input_dir)
            zipfiles = list(filter(lambda s: s[-4:] == ".zip", files))
            non_zipfiles = list(filter(lambda s: s[-4:] != ".zip", files))

            if len(non_zipfiles) > 0:
                logger.warning(
                    f"Non zip files found in {year} / {month}. "
                    "Moving files to error directory."
                )
                for non_zipfile in non_zipfiles:
                    move(
                        zipfile_input_dir / non_zipfile,
                        zipfile_error_dir,
                        if_exists="replace",
                    )

            for zipfile in zipfiles:
                res.append(
                    {
                        "full_name": zipfile,
                        "input_dir": zipfile_input_dir,
                        "treated_dir": zipfile_treated_dir,
                        "non_treated_dir": zipfile_non_treated_dir,
                        "error_dir": zipfile_error_dir,
                    }
                )
                n += 1
                # Return early after 200 zipfiles to keep the batch size bounded
                if n == 200:
                    return res
    return res
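# A minimal sketch of how the two extraction steps above could be chained,
# assuming they are registered as Prefect (0.x / 1.x) tasks elsewhere in the
# project; the flow definition is not part of this section and the directory
# paths below are placeholders.
from pathlib import Path

from prefect import Flow

with Flow("Extract ERS messages") as flow:
    zipfiles = extract_zipfiles(
        input_dir=Path("/ers/input"),
        treated_dir=Path("/ers/treated"),
        non_treated_dir=Path("/ers/non_treated"),
        error_dir=Path("/ers/errors"),
    )
    # extract_xmls_from_zipfile runs once per zipfile dict returned above
    zipfiles_with_xmls = extract_xmls_from_zipfile.map(zipfiles)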
def load_ers(cleaned_data: List[dict]):
    """Loads ERS data into the public.ers and public.ers_messages tables.

    Args:
        cleaned_data (list): list of dictionaries (output of `clean` task)
    """
    schema = "public"
    ers_table_name = "ers"
    ers_messages_table_name = "ers_messages"
    engine = create_engine("monitorfish_remote")
    logger = prefect.context.get("logger")

    # Check that the ers tables exist
    get_table(ers_table_name, schema, engine, logger)
    ers_messages_table = get_table(ers_messages_table_name, schema, engine, logger)

    # Drop None entries (zipfiles that were skipped upstream)
    cleaned_data = list(filter(None, cleaned_data))

    for ers in cleaned_data:
        with engine.begin() as connection:
            parsed = ers["parsed"]
            parsed_with_xml = ers["parsed_with_xml"]

            # Drop rows for which the operation number already exists in the
            # ers_messages table
            parsed = drop_rows_already_in_table(
                df=parsed,
                df_column_name="operation_number",
                table=ers_messages_table,
                table_column_name="operation_number",
                connection=connection,
                logger=logger,
            )

            parsed_with_xml = drop_rows_already_in_table(
                df=parsed_with_xml,
                df_column_name="operation_number",
                table=ers_messages_table,
                table_column_name="operation_number",
                connection=connection,
                logger=logger,
            )

            if len(parsed_with_xml) > 0:
                n_lines = len(parsed_with_xml)
                logger.info(f"Inserting {n_lines} messages in ers_messages table.")
                parsed_with_xml.to_sql(
                    name=ers_messages_table_name,
                    con=connection,
                    schema=schema,
                    index=False,
                    method=psql_insert_copy,
                    if_exists="append",
                )

            if len(parsed) > 0:
                n_lines = len(parsed)
                logger.info(f"Inserting {n_lines} messages in ers table.")
                # Serialize dicts to prepare for insertion as json in database
                parsed["value"] = parsed.value.map(to_json)
                parsed.to_sql(
                    name=ers_table_name,
                    con=connection,
                    schema=schema,
                    index=False,
                    method=psql_insert_copy,
                    if_exists="append",
                )

            # Finally, move the zipfile to the error directory if some of its
            # messages failed to parse, or to the treated directory otherwise
            if ers["batch_generated_errors"]:
                logger.error(
                    "Errors occurred during parsing of some of the messages. "
                    f"Moving {ers['full_name']} to error directory."
                )
                move(
                    ers["input_dir"] / ers["full_name"],
                    ers["error_dir"],
                    if_exists="replace",
                )
            else:
                move(
                    ers["input_dir"] / ers["full_name"],
                    ers["treated_dir"],
                    if_exists="replace",
                )
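# `drop_rows_already_in_table` is what makes the loading above idempotent: a
# zipfile whose operation numbers were already inserted contributes no new
# rows. Its implementation is not shown in this section; below is a minimal
# hypothetical sketch, assuming SQLAlchemy 1.4+ select() syntax and that the
# function simply filters the DataFrame against the existing column values.
from sqlalchemy import select


def drop_rows_already_in_table(
    df, df_column_name, table, table_column_name, connection, logger
):
    # Fetch the values already present in the target table column...
    existing = {
        row[0] for row in connection.execute(select(table.c[table_column_name]))
    }
    # ...and keep only the DataFrame rows whose value is not among them
    filtered_df = df[~df[df_column_name].isin(existing)]
    n_dropped = len(df) - len(filtered_df)
    if n_dropped > 0:
        logger.info(f"Dropping {n_dropped} rows already present in the table.")
    return filtered_df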