Beispiel #1
0
    def test_move_file_into_non_existing_directory(self):
        """Check that move() creates missing destination directories."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            destination = tmp_path / "destination/directory"

            # Write a small test file into the temporary directory.
            source_file = tmp_path / "test_file.txt"
            source_file.write_text("Test file.")
            self.assertIn("test_file.txt", os.listdir(tmp_path))

            # Move it into the (not-yet-existing) destination and verify.
            move(source_file, dest_dirpath=destination)
            self.assertNotIn("test_file.txt", os.listdir(tmp_path))
            self.assertIn("test_file.txt", os.listdir(destination))
            self.assertEqual(
                (destination / "test_file.txt").read_text(), "Test file."
            )
Beispiel #2
0
def extract_xmls_from_zipfile(zipfile: Union[None, dict]) -> Union[None, dict]:
    """Extract the XML messages contained in an ERS zipfile.

    Args:
        zipfile: dictionary describing the zipfile to process, with keys
            "full_name", "input_dir", "non_treated_dir" and "error_dir"
            (as produced by `extract_zipfiles`), or a falsy value (no-op).

    Returns:
        For ERS3 / ERS3_ACK zipfiles, the input dictionary with an added
        "xml_messages" key (list of XML strings). Otherwise None: FLUX ("UN")
        zipfiles are moved to the non-treated directory, unexpected message
        types are moved to the error directory.
    """
    if zipfile:
        logger = prefect.context.get("logger")

        logger.info(f"Extracting zipfile {zipfile['full_name']}.")
        message_type = get_message_type(zipfile["full_name"])

        # Handle ERS3 messages and acknowledgement statuses
        if message_type in ["ERS3", "ERS3_ACK"]:
            with ZipFile(zipfile["input_dir"] / zipfile["full_name"]) as zipobj:
                xml_messages = []
                for xml_filename in zipobj.namelist():
                    with zipobj.open(xml_filename, mode="r") as f:
                        xml_messages.append(f.read().decode("utf-8"))
                zipfile["xml_messages"] = xml_messages
                return zipfile

        # Handle FLUX messages (move them to non_treated_directory
        # as there is currently no parser available)
        elif message_type in ["UN"]:
            logger.info(
                f"Skipping zipfile {zipfile['full_name']} of type {message_type} "
                + "which is currently not handled. "
                + f"Moving {zipfile['full_name']} to non-treated directory."
            )
            move(
                zipfile["input_dir"] / zipfile["full_name"],
                zipfile["non_treated_dir"],
                if_exists="replace",
            )

        # Move unexpected file types to error directory
        else:
            # Log the filename rather than the whole dict, for consistency
            # with the messages logged by the other branches.
            logger.error(
                f"Unexpected message type '{message_type}' "
                + f"({zipfile['full_name']}). "
                + f"Moving {zipfile['full_name']} to error directory."
            )
            move(
                zipfile["input_dir"] / zipfile["full_name"],
                zipfile["error_dir"],
                if_exists="replace",
            )
Beispiel #3
0
    def test_move_file_already_exists(self):
        """Moving onto an existing file raises unless if_exists='replace'."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            destination = tmp_path / "destination/directory"
            os.makedirs(destination)

            # Create a file with the same name in both source and destination.
            source_file = tmp_path / "test_file.txt"
            source_file.write_text("New file.")
            (destination / "test_file.txt").write_text("Original file.")

            self.assertIn("test_file.txt", os.listdir(tmp_path))
            self.assertIn("test_file.txt", os.listdir(destination))

            # By default a name clash raises shutil.Error...
            with self.assertRaises(shutil.Error):
                move(source_file, destination)

            # ...unless if_exists="replace" is specified.
            move(source_file, destination, if_exists="replace")

            self.assertNotIn("test_file.txt", os.listdir(tmp_path))
            self.assertIn("test_file.txt", os.listdir(destination))

            self.assertEqual(
                (destination / "test_file.txt").read_text(), "New file."
            )

            # An unknown if_exists value is rejected with ValueError.
            with self.assertRaises(ValueError):
                move(source_file, destination, if_exists="unexpected")
Beispiel #4
0
def extract_zipfiles(
    input_dir: pathlib.Path,
    treated_dir: pathlib.Path,
    non_treated_dir: pathlib.Path,
    error_dir: pathlib.Path,
    max_zipfiles: int = 200,
) -> List[dict]:
    """Scans input_dir, in which ers zipfiles are expected to be arranged in a
    hierarchy of folders like: year / month / zipfiles.

    Returns found zipfiles along with the corresponding paths of:

    - input_directory where the zipfile is located
    - treated_directory where the zipfile will be transfered after integration
      into the monitorfish database
    - non_treated_directory where the zipfile will be transfered if it is a
      FLUX type of message (currently not handled)
    - error_directory if an error occurs during the treatment of messages.

    Non-zip files found alongside zipfiles are moved to the error directory.

    Args:
        input_dir: root directory scanned for year/month/zipfile hierarchies.
        treated_dir: destination root for successfully treated zipfiles.
        non_treated_dir: destination root for unhandled (FLUX) zipfiles.
        error_dir: destination root for files that cannot be treated.
        max_zipfiles: maximum number of zipfiles returned per call (batching
            limit; defaults to 200, the previously hard-coded value).

    Returns:
        List of dicts with keys "full_name", "input_dir", "treated_dir",
        "non_treated_dir" and "error_dir".
    """

    logger = prefect.context.get("logger")

    # Accept both non-padded ("1") and zero-padded ("01") month directories.
    expected_months = {str(m) for m in range(1, 13)} | {
        f"{m:02d}" for m in range(1, 13)
    }
    expected_years = {str(y) for y in range(2000, 2050)}

    res = []

    for year in os.listdir(input_dir):
        if year not in expected_years:
            logger.warning(f"Unexpected year {year}. Skipping directory.")
            continue

        logger.info(f"Starting extraction of ERS messages for year {year}.")

        for month in os.listdir(input_dir / year):
            if month not in expected_months:
                logger.warning(f"Unexpected month {month}. Skipping directory.")
                continue

            logger.info(f"Starting extraction of ERS messages for {year}/{month}.")

            zipfile_input_dir = input_dir / year / month
            zipfile_treated_dir = treated_dir / year / month
            zipfile_non_treated_dir = non_treated_dir / year / month
            zipfile_error_dir = error_dir / year / month

            files = os.listdir(zipfile_input_dir)

            zipfiles = [f for f in files if f.endswith(".zip")]
            non_zipfiles = [f for f in files if not f.endswith(".zip")]

            if non_zipfiles:
                # Trailing space added so the two sentences of the log
                # message do not run together when concatenated.
                logger.warning(
                    f"Non zip files found in {year} / {month}. "
                    + "Moving files to error_directory."
                )
                for non_zipfile in non_zipfiles:
                    move(
                        zipfile_input_dir / non_zipfile,
                        zipfile_error_dir,
                        if_exists="replace",
                    )

            for zipfile in zipfiles:
                res.append(
                    {
                        "full_name": zipfile,
                        "input_dir": zipfile_input_dir,
                        "treated_dir": zipfile_treated_dir,
                        "non_treated_dir": zipfile_non_treated_dir,
                        "error_dir": zipfile_error_dir,
                    }
                )
                # Stop early once the batching limit is reached.
                if len(res) == max_zipfiles:
                    return res

    return res
Beispiel #5
0
def load_ers(cleaned_data: List[dict]):
    """Loads ERS data into public.ers and public.ers_messages tables.

    Each element of `cleaned_data` is processed in its own transaction:
    messages whose operation number already exists in ers_messages are
    dropped, the remaining ones are appended to the ers_messages and ers
    tables, and the source zipfile is then moved to the treated directory
    (or to the error directory if parsing errors occurred for the batch).

    Args:
        cleaned_data (list) : list of dictionaries (output of `clean` task)
            with keys "parsed", "parsed_with_xml", "batch_generated_errors",
            "full_name", "input_dir", "treated_dir" and "error_dir".
    """
    schema = "public"
    ers_table_name = "ers"
    ers_messages_table_name = "ers_messages"
    engine = create_engine("monitorfish_remote")
    logger = prefect.context.get("logger")

    # Check that ers tables exist (get_table raises otherwise)
    get_table(ers_table_name, schema, engine, logger)
    ers_messages_table = get_table(ers_messages_table_name, schema, engine, logger)

    # Drop falsy entries (e.g. zipfiles that yielded no messages) — clearer
    # and cheaper than filter(lambda x: True if x else False, ...).
    cleaned_data = [ers for ers in cleaned_data if ers]

    for ers in cleaned_data:

        # One transaction per zipfile: either all of its messages are
        # inserted and the file is moved, or nothing is committed.
        with engine.begin() as connection:
            parsed = ers["parsed"]
            parsed_with_xml = ers["parsed_with_xml"]

            # Drop rows for which the operation number already exists in the
            # ers_messages database

            parsed = drop_rows_already_in_table(
                df=parsed,
                df_column_name="operation_number",
                table=ers_messages_table,
                table_column_name="operation_number",
                connection=connection,
                logger=logger,
            )

            parsed_with_xml = drop_rows_already_in_table(
                df=parsed_with_xml,
                df_column_name="operation_number",
                table=ers_messages_table,
                table_column_name="operation_number",
                connection=connection,
                logger=logger,
            )

            if len(parsed_with_xml) > 0:
                n_lines = len(parsed_with_xml)
                logger.info(f"Inserting {n_lines} messages in ers_messages table.")

                parsed_with_xml.to_sql(
                    name=ers_messages_table_name,
                    con=connection,
                    schema=schema,
                    index=False,
                    method=psql_insert_copy,
                    if_exists="append",
                )

            if len(parsed) > 0:
                n_lines = len(parsed)
                logger.info(f"Inserting {n_lines} messages in ers table.")

                # Serialize dicts to prepare for insertion as json in database
                parsed["value"] = parsed.value.map(to_json)

            if len(parsed) > 0:
                parsed.to_sql(
                    name=ers_table_name,
                    con=connection,
                    schema=schema,
                    index=False,
                    method=psql_insert_copy,
                    if_exists="append",
                )

            if ers["batch_generated_errors"]:
                logger.error(
                    "Errors occurred during parsing of some of the messages. "
                    f"Moving {ers['full_name']} to error directory."
                )
                move(
                    ers["input_dir"] / ers["full_name"],
                    ers["error_dir"],
                    if_exists="replace",
                )
            else:
                move(
                    ers["input_dir"] / ers["full_name"],
                    ers["treated_dir"],
                    if_exists="replace",
                )