Example #1
    def test_clear_rt_deployment(self):

        # TEST 'clear-files' status. Process and results are identical to the 'renamed' status
        preexisting_files = PipelineFileCollection()

        existing_file1 = PipelineFile(PREV_NC_RT, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_NC_RT)))

        existing_file2 = PipelineFile(PREV_PNG_TRANSECT, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_PNG_TRANSECT)))

        preexisting_files.update([existing_file1, existing_file2])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)


        handler = self.run_handler(MISSION_STATUS_CLR)

        # Process should result in: input file unhandled, preexisting files deleted

        nc = handler.file_collection.filter_by_attribute_id('file_type', FileType.NETCDF)
        self.assertEqual(nc[0].publish_type, PipelineFilePublishType.DELETE_UNHARVEST)
        self.assertTrue(nc[0].is_deleted)

        png = handler.file_collection.filter_by_attribute_id('file_type', FileType.PNG)
        self.assertEqual(png[0].publish_type, PipelineFilePublishType.DELETE_ONLY)
        self.assertTrue(png[0].is_deleted)
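
The setup idiom above (seed the unit test's temporary upload location with UPLOAD_ONLY files, then run the handler) recurs in most of the examples on this page. A minimal consolidated sketch of that idiom, assuming the usual aodncore.pipeline imports and a test-provided config object; the helper name and arguments are illustrative only:

# assumed imports (paths not verified here):
# from aodncore.pipeline import PipelineFile, PipelineFileCollection, PipelineFilePublishType
import os

def seed_upload_location(config, src_paths, dest_prefix):
    """Upload 'preexisting' files to the test's temporary storage location (illustrative helper)."""
    collection = PipelineFileCollection(
        PipelineFile(p, dest_path=os.path.join(dest_prefix, os.path.basename(p)))
        for p in src_paths
    )
    # UPLOAD_ONLY pushes the files to storage without harvesting them
    collection.set_publish_types(PipelineFilePublishType.UPLOAD_ONLY)
    broker = get_storage_broker(config.pipeline_config['global']['upload_uri'])
    broker.upload(collection)
    return collection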
Example #2
    def setUp(self):
        self.logger = get_pipeline_logger('unittest')

        self.dummy_input_file = 'dummy.input_file'
        incoming_file_path = os.path.join(
            self.config.pipeline_config['watch']['incoming_dir'],
            os.path.basename(self.temp_nc_file))
        safe_copy_file(self.temp_nc_file, incoming_file_path)

        celery_request = type('DummyRequest', (object, ),
                              {'id': 'NO_REQUEST_ID'})()
        self.state_manager = IncomingFileStateManager(
            incoming_file_path,
            pipeline_name='UNITTEST',
            config=self.config,
            logger=self.logger,
            celery_request=celery_request)
        self.state_manager.handler = MagicMock(
            file_basename=self.dummy_input_file,
            error_cleanup_regexes=[r'test.*'])

        previous_file_same_name = PipelineFile(
            self.temp_nc_file,
            dest_path='dummy.input_file.40c4ec0d-c9db-498d-84f9-01011330086e')
        nc = PipelineFile(GOOD_NC, dest_path=os.path.basename(GOOD_NC))
        png = PipelineFile(INVALID_PNG,
                           dest_path=os.path.basename(INVALID_PNG))
        ico = PipelineFile(TEST_ICO, dest_path=os.path.basename(TEST_ICO))
        unknown = PipelineFile(UNKNOWN_FILE_TYPE,
                               dest_path=os.path.basename(UNKNOWN_FILE_TYPE))
        existing_collection = PipelineFileCollection(
            [previous_file_same_name, nc, png, ico, unknown])
        self.state_manager.error_broker.upload(existing_collection)
Example #3
def get_harvest_collection(delete=False,
                           late_deletion=False,
                           with_store=False,
                           already_stored=False):
    pf_bad = PipelineFile(BAD_NC,
                          is_deletion=delete,
                          late_deletion=late_deletion)
    pf_empty = PipelineFile(EMPTY_NC,
                            is_deletion=delete,
                            late_deletion=late_deletion)
    pf_good = PipelineFile(GOOD_NC,
                           is_deletion=delete,
                           late_deletion=late_deletion)

    collection = PipelineFileCollection([pf_bad, pf_empty, pf_good])

    if with_store:
        publish_type = PipelineFilePublishType.DELETE_UNHARVEST if delete else PipelineFilePublishType.HARVEST_UPLOAD
    else:
        publish_type = PipelineFilePublishType.UNHARVEST_ONLY if delete else PipelineFilePublishType.HARVEST_ONLY

    for pipeline_file in collection:
        pipeline_file.is_stored = already_stored
        pipeline_file.dest_path = os.path.join(
            'DUMMY', os.path.basename(pipeline_file.src_path))
        pipeline_file.publish_type = publish_type

    return collection
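
A hedged sketch of how get_harvest_collection might be exercised in a test; the method name and assertions below follow only from the helper's code above and are assumptions, not taken from the real test suite:

    def test_harvest_collection_defaults(self):
        # with no flags set, every file is marked HARVEST_ONLY, not yet stored,
        # and destined for the 'DUMMY' prefix
        collection = get_harvest_collection()
        for pipeline_file in collection:
            self.assertEqual(pipeline_file.publish_type,
                             PipelineFilePublishType.HARVEST_ONLY)
            self.assertFalse(pipeline_file.is_stored)
            self.assertTrue(pipeline_file.dest_path.startswith('DUMMY/'))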
Example #4
    def test_cleanup(self):
        nc = PipelineFile(GOOD_NC, dest_path=os.path.basename(GOOD_NC))
        png = PipelineFile(INVALID_PNG,
                           dest_path=os.path.basename(INVALID_PNG))
        ico = PipelineFile(TEST_ICO, dest_path=os.path.basename(TEST_ICO))
        unknown = PipelineFile(UNKNOWN_FILE_TYPE,
                               dest_path=os.path.basename(UNKNOWN_FILE_TYPE))
        existing_collection = PipelineFileCollection([nc, png, ico, unknown])
        self.state_manager.error_broker.upload(existing_collection)

        self.state_manager.move_to_processing()

        actual_error_files_before_cleanup = [
            v.dest_path for v in self.state_manager.error_broker.query()
        ]
        expected_error_files_before_cleanup = [
            'good.nc', 'test.unknown_file_extension', 'test.ico', 'invalid.png'
        ]
        self.assertCountEqual(expected_error_files_before_cleanup,
                              actual_error_files_before_cleanup)

        self.state_manager.success_exit_policies.append(
            ExitPolicy.DELETE_CUSTOM_REGEXES_FROM_ERROR_STORE)
        self.state_manager.move_to_success()

        actual_error_files_after_cleanup = [
            v.dest_path for v in self.state_manager.error_broker.query()
        ]
        expected_error_files_after_cleanup = ['good.nc', 'invalid.png']
        self.assertCountEqual(expected_error_files_after_cleanup,
                              actual_error_files_after_cleanup)
Example #5
    def test_rt_update(self):
        """ test the update of a realtime mission:
         the update consists of:
         - deletion of previous netCDF
         - deletion of transect png files
         - harvest of new netCDF
         - overwriting of other files
        """
        # create some PipelineFiles to represent the existing files on 'S3'
        preexisting_files = PipelineFileCollection()

        existing_file1 = PipelineFile(PREV_NC_RT, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_NC_RT)))

        existing_file2 = PipelineFile(PREV_PNG_TRANSECT, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_PNG_TRANSECT)))
        existing_file3 = PipelineFile(PREV_PNG_MISSION, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_PNG_MISSION)))

        preexisting_files.update([existing_file1, existing_file2, existing_file3])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        # run the handler
        handler = self.run_handler(GOOD_ZIP_RT)

        nc = handler.file_collection.filter_by_attribute_id('file_type', FileType.NETCDF)
        for n in nc:
            if n.name == os.path.basename(PREV_NC_RT):
                self.assertEqual(n.publish_type, PipelineFilePublishType.DELETE_UNHARVEST)
                self.assertTrue(n.is_deleted)
            else:
                self.assertEqual(n.publish_type, PipelineFilePublishType.HARVEST_UPLOAD)
                self.assertTrue(n.is_harvested)
                self.assertTrue(n.is_stored)

        pngs = handler.file_collection.filter_by_attribute_id('file_type', FileType.PNG)
        for png in pngs:
            if png.name == os.path.basename(PREV_PNG_MISSION):
                self.assertTrue(png.is_overwrite)
            else:
                self.assertTrue(png.is_uploaded)

        # no update of the HarvestMission list in this case
        csv = handler.file_collection.filter_by_attribute_id('file_type', FileType.CSV)
        self.assertEqual(len(csv), 0)
Example #6
    def get_remote_metadata_from_zip(self, remote_pfile):
        """Fetch the metadata.csv file from a RemotePipelineFile zip and wrap it in a
        PipelineFile."""
        # TODO: remove the first try statement when pipeline
        remote_collection = RemotePipelineFileCollection(remote_pfile)
        try:
            download = self.state_query.download
        except AttributeError:
            download = self.state_query._storage_broker.download

        download(remote_collection, self.temp_dir)
        dest_folder = os.path.join(self.temp_dir,
                                   os.path.dirname(remote_pfile.dest_path))
        local_zipfile = os.path.join(self.temp_dir, remote_pfile.dest_path)

        extract_zip(local_zipfile, dest_folder)

        local_metadata_file = [
            os.path.join(dest_folder, x) for x in os.listdir(dest_folder)
            if "metadata" in x
        ]
        try:
            return PipelineFile(local_metadata_file[0])
        except IndexError:
            return None
Example #7
    def test_setup_upload_location_push_older_file(self):
        """
        Test case: Check that the creation date of the incoming *.nc.gz is older than the one already on storage
                   NO_ACTION  *nc.gz
        """
        # create some PipelineFiles to represent the existing files on 'S3'
        preexisting_files = PipelineFileCollection()

        existing_file = PipelineFile(PREV_NC_GZ_STORAGE,
                                     dest_path=os.path.join(
                                         'IMOS/OceanCurrent/GSLA/DM00/2018/',
                                         os.path.basename(PREV_NC_GZ_STORAGE)))

        preexisting_files.update([existing_file])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(
            PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(
            self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        # run the handler on the new file with an older creation date
        self.run_handler_with_exception(InvalidFileNameError,
                                        OLDER_CREATION_DATE_NC_GZ)
Example #8
    def test_setup_upload_location_push_newer_file_bad_prefix(self):
        """
        Test case: Check that the creation date of the incoming *.nc.gz is newer than the one already on storage
                   HARVEST_UPLOAD the content of the *nc.gz

                   BUT check that
                   we don't delete files whose paths don't start with a valid GSLA_PREFIX_PATH.
                   In this case the global variable is patched to an empty value to check this behaviour
        """
        # create some PipelineFiles to represent the existing files on 'S3'
        preexisting_files = PipelineFileCollection()

        existing_file = PipelineFile(PREV_NC_GZ_STORAGE,
                                     dest_path=os.path.join(
                                         'IMOS/OceanCurrent/GSLA/DM00/2018/',
                                         os.path.basename(PREV_NC_GZ_STORAGE)))

        preexisting_files.update([existing_file])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(
            PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(
            self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        # run the handler
        self.run_handler_with_exception(
            AttributeValidationError,
            NEWER_CREATION_DATE_NC_GZ,
            allowed_dest_path_regexes=["IMOS/OceanCurrent/GSLA"])
Example #9
    def preprocess(self):
        """
        Pre-processing to handle the conversion of BUFR CSV files to NetCDF
        :return:
        """
        csv_file = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.CSV)

        if csv_file:
            csv_file = csv_file[0]
            csv_file.publish_type = PipelineFilePublishType.ARCHIVE_ONLY

            profiles = parse_bufr_file(csv_file.src_path)
            profiles = return_unique_profiles(
                profiles)  # check for duplicate profiles within BUFR file
            for profile in profiles:
                profile = fzf_vessel_get_info(
                    profile)  # fuzzy search finder for vessel name
                profile = xbt_line_get_info(
                    profile, self.xbt_line_vocab_url
                )  # get hard coded info per xbt line
                netcdf_filepath = netcdf_writer(
                    profile, self.temp_dir)  # convert BUFR to NetCDF

                # publish
                nc_file = PipelineFile(
                    netcdf_filepath,
                    file_update_callback=self._file_update_callback)
                nc_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD

                self.file_collection.add(nc_file)
Example #10
    def set_deployment_status(self, input_file, message):
        """
        Write the message to the Harvestmission.csv file, which is ingested by the RT pipeline.
        Update the anfog_rt.harvest_listing table after ingestion of the csv file by the RT pipeline.
        Note that to be consistent with the messages available in the production DB,
        dashes need to be replaced by underscores, e.g. delayed-mode => delayed_mode
        :return: Harvestmission.csv updated with the deployment-specific status
        """
        name = os.path.basename(input_file)
        deployment = AnfogFileClassifier.get_deployment_code(input_file)
        platform = AnfogFileClassifier.get_platform(name)

        listing_path = os.path.join(self.products_dir,
                                    AnfogFileClassifier.MISSION_LISTING)
        with open(listing_path, 'w') as f:
            f.write('deployment_name, platform_type, status' + os.linesep)
            row = "%s,%s,%s" % (deployment, platform, message.replace(
                '-', '_').lower())
            f.write(row)

        product = PipelineFile(listing_path)
        product.publish_type = PipelineFilePublishType.HARVEST_ONLY
        product.check_type = PipelineFileCheckType.FORMAT_CHECK
        product.dest_path = os.path.join(self.upload_destination,
                                         os.path.basename(listing_path))
        self.file_collection.add(product)
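
For reference, the Harvestmission.csv written by set_deployment_status contains a single header row and one data row; a hypothetical example (the deployment code matches the paths used elsewhere on this page, the platform value is purely illustrative):

    deployment_name, platform_type, status
    TwoRocks20180503a,<platform_type>,delayed_mode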
Example #11
    def test_good_dm_file_with_compliance_check(self):
        # this is tested as an update to avoid raising an invalid input file error because of missing ancillary material
        preexisting_file = PipelineFileCollection()

        existing_file = PipelineFile(GOOD_NC, dest_path=os.path.join(
            'IMOS/ANFOG/slocum_glider/TwoRocks20180503a/', os.path.basename(GOOD_NC)))

        preexisting_file.update([existing_file])

        # set the files to UPLOAD_ONLY
        preexisting_file.set_publish_types(PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_file)

        handler = self.run_handler(GOOD_NC, check_params={'checks': ['cf', 'imos:1.4']})

        f = handler.file_collection.filter_by_attribute_id('file_type', FileType.NETCDF)
        # self.assertEqual(f[0].check_type, PipelineFileCheckType.NC_COMPLIANCE_CHECK)
        self.assertEqual(f[0].publish_type, PipelineFilePublishType.HARVEST_UPLOAD)

        self.assertEqual(f[0].dest_path,
                         'IMOS/ANFOG/slocum_glider/TwoRocks20180503a/'
                         'IMOS_ANFOG_BCEOPSTUV_20180503T080042Z_SL210_FV01_timeseries_END-20180505T054942Z.nc')
        self.assertTrue(f[0].is_checked)
        self.assertTrue(f[0].is_stored)
        self.assertTrue(f[0].is_harvested)
Example #12
    def test_download_remotepipelinefilecollection(self):
        state_query = StateQuery(storage_broker=self.storage_broker,
                                 wfs_broker=self.wfs_broker)
        pipeline_file = PipelineFile(GOOD_NC, dest_path='dest/path/1.nc')
        self.storage_broker.upload(pipeline_file)

        remote_file = RemotePipelineFile.from_pipelinefile(pipeline_file)
        state_query.download(remote_file, local_path=self.temp_dir)

        self.assertTrue(os.path.exists(remote_file.local_path))
Example #13
    def test_deletion_rt_after_dm_upload(self):
        """test deletion of RT mission at upload of related DM version"""
        preexisting_files = PipelineFileCollection()

        existing_file1 = PipelineFile(PREV_NC_RT, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_NC_RT)))

        existing_file2 = PipelineFile(PREV_PNG_TRANSECT, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_PNG_TRANSECT)))
        existing_file3 = PipelineFile(PREV_PNG_MISSION, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_PNG_MISSION)))

        preexisting_files.update([existing_file1, existing_file2, existing_file3])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        # run the handler
        handler = self.run_handler(GOOD_ZIP_DM, check_params={'checks': ['cf', 'imos:1.4']})

        nc = handler.file_collection.filter_by_attribute_id('file_type', FileType.NETCDF)
        for n in nc:
            if n.name == os.path.basename(PREV_NC_RT):
                self.assertEqual(n.publish_type, PipelineFilePublishType.DELETE_UNHARVEST)
                self.assertTrue(n.is_deleted)
            elif re.match(AnfogFileClassifier.DM_REGEX, n.name):
                self.assertEqual(n.publish_type, PipelineFilePublishType.HARVEST_UPLOAD)
                self.assertTrue(n.is_harvested)
                self.assertTrue(n.is_stored)
            else:
                self.assertEqual(n.publish_type, PipelineFilePublishType.ARCHIVE_ONLY)
                self.assertTrue(n.is_archived)

        pngs = handler.file_collection.filter_by_attribute_id('file_type', FileType.PNG)
        for png in pngs:
            self.assertTrue(png.is_deleted)
Example #14
    def schedule_file_removal(self, remote_pipeline_file):
        """schedule a file to be removed."""
        filename = remote_pipeline_file.name
        file_to_remove = PipelineFile(
            filename,
            is_deletion=True,
            dest_path=self.dest_path_function(filename),
        )
        file_to_remove.publish_type = PipelineFilePublishType.DELETE_UNHARVEST
        self.file_collection.add(file_to_remove)
        logger.info(
            NRT_FILE_REMOVAL_MSG.format(file=file_to_remove,
                                        ptype=file_to_remove.publish_type))
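
A sketch of how schedule_file_removal might be driven, reusing only constructs shown elsewhere on this page; the handler instance and its dest_path_function are assumed to exist:

# 'handler' is assumed to be an instance of the class defining schedule_file_removal()
published_file = PipelineFile(GOOD_NC, dest_path='dest/path/1.nc')
remote_file = RemotePipelineFile.from_pipelinefile(published_file)
handler.schedule_file_removal(remote_file)
# the removal is now queued in handler.file_collection with publish_type DELETE_UNHARVEST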
Example #15
    def test_overwrite_same_file(self, mock_callsign):
        # check that files with same name are overwritten
        preexisting_files = PipelineFileCollection()

        existing_file1 = PipelineFile(
            GOOD_NC,
            dest_path=os.path.join(
                'IMOS/SOOP/SOOP-BA/VKAD_Antarctic-Discovery/Antarctic-Discovery_20160116-20160129/',
                os.path.basename(GOOD_NC)))

        existing_file2 = PipelineFile(
            CSV,
            dest_path=os.path.join(
                'IMOS/SOOP/SOOP-BA/VKAD_Antarctic-Discovery/Antarctic-Discovery_20160116-20160129/',
                os.path.basename(CSV)))

        preexisting_files.update([existing_file1, existing_file2])
        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(
            self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        # run the handler
        handler = self.run_handler(GOOD_ZIP)
        nc = handler.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)
        self.assertEqual(nc[0].publish_type,
                         PipelineFilePublishType.HARVEST_UPLOAD)
        self.assertEqual(nc[0].is_deleted, False)

        csvs = handler.file_collection.filter_by_attribute_id(
            'file_type', FileType.CSV)
        for csv in csvs:
            if csv.name == os.path.basename(CSV):
                self.assertEqual(csv.publish_type,
                                 PipelineFilePublishType.UPLOAD_ONLY)
                self.assertEqual(csv.is_deleted, False)
Example #16
    def preprocess(self):
        """
        Files to be deleted as found in 'soop_trv_duplicate_url' wfs layer
        """
        files_to_delete = self.state_query.query_wfs_urls_for_layer(
            'soop_trv_duplicate_url')

        for f in files_to_delete:
            file_to_delete = PipelineFile(
                os.path.basename(f),
                is_deletion=True,
                dest_path=f,
                file_update_callback=self._file_update_callback)
            file_to_delete.publish_type = PipelineFilePublishType.DELETE_UNHARVEST
            self.file_collection.add(file_to_delete)
Example #17
    def test_renamed_rt_deployment(self):
        # test deletion of RT files when deployment renamed or when cleaning files on S3
        preexisting_files = PipelineFileCollection()

        existing_file1 = PipelineFile(PREV_NC_RT, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_NC_RT)))

        existing_file2 = PipelineFile(PREV_PNG_TRANSECT, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_PNG_TRANSECT)))
        existing_file3 = PipelineFile(PREV_PNG_MISSION, dest_path=os.path.join(
            'IMOS/ANFOG/REALTIME/slocum_glider/TwoRocks20180503a/', os.path.basename(PREV_PNG_MISSION)))

        preexisting_files.update([existing_file1, existing_file2, existing_file3])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        handler = self.run_handler(MISSION_STATUS_RENAMED)

        # Process should result in: input file unhandled, preexisting files deleted, csv file harvested
        csv = handler.file_collection.filter_by_attribute_id('file_type', FileType.CSV)
        self.assertEqual(csv[0].publish_type, PipelineFilePublishType.HARVEST_ONLY)
        self.assertTrue(csv[0].is_harvested)

        nc = handler.file_collection.filter_by_attribute_id('file_type', FileType.NETCDF)
        self.assertEqual(nc[0].publish_type, PipelineFilePublishType.DELETE_UNHARVEST)
        self.assertTrue(nc[0].is_deleted)

        pngs = handler.file_collection.filter_by_attribute_id('file_type', FileType.PNG)
        for png in pngs:
            self.assertEqual(png.publish_type, PipelineFilePublishType.DELETE_ONLY)
            self.assertTrue(png.is_deleted)
Example #18
    def preprocess(self):
        """ Preprocessing for NRT and DM files
           - NRT: generate a NetCDF file based on the input text file.
             Set the input file publish_type property to 'archive'
           - DM file collection: update the check_type and publish_type properties for non-NetCDF files.
             These files are not checked or harvested, but uploaded to S3

        """
        if self.custom_params is not None and self.custom_params.get(
                'ship_callsign_ls'):
            self.ship_callsign_ls = self.custom_params['ship_callsign_ls']
        else:
            self.ship_callsign_ls = ship_callsign_list()

        # Delayed mode file submitted as a zip archive
        if self.file_extension == '.zip':
            nc_file = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.NETCDF)
            if len(nc_file) != 1:
                raise InvalidInputFileError(
                    "Expecting one netCDF file in ZIP archive '{zip}'".format(
                        zip=os.path.basename(self.input_file)))

            # first process the NetCDF file to set the destination path for the file collection
            nc = nc_file[0]
            nc.dest_path = self.dest_path(nc.src_path)
            nc_dir_path = os.path.dirname(nc.dest_path)

            # SOOP-CO2 DM and FRMAP .txt, .pdf and/or .xml files.
            # Set check type to NONEMPTY and publish type to UPLOAD_ONLY
            non_nc_files = PipelineFileCollection(
                f for f in self.file_collection
                if f.file_type is not FileType.NETCDF)
            for non_nc in non_nc_files:
                non_nc.check_type = PipelineFileCheckType.FORMAT_CHECK
                non_nc.publish_type = PipelineFilePublishType.UPLOAD_ONLY
                non_nc.dest_path = os.path.join(nc_dir_path, non_nc.name)

        elif self.input_file.endswith('dat.txt'):
            # Single realtime text file (*dat.txt)
            rt_file = self.file_collection[0]
            rt_file.publish_type = PipelineFilePublishType.ARCHIVE_ONLY

            nrt_nc_file_path = soop_co2_nrt_nc_generator.process_co2_rt(
                rt_file, self.products_dir, self.ship_callsign_ls)
            nrt_nc_file = PipelineFile(nrt_nc_file_path)
            self.file_collection.add(nrt_nc_file)
            nrt_nc_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
Example #19
def get_notification_data():
    collection = PipelineFileCollection(PipelineFile(GOOD_NC))
    collection_headers, collection_data = collection.get_table_data()

    data = {
        'input_file': 'good.nc',
        'processing_result': 'HANDLER_SUCCESS',
        'handler_start_time': '2017-10-23 16:05',
        'checks': None,
        'collection_headers': collection_headers,
        'collection_data': collection_data,
        'error_details': '',
        'upload_dir': None
    }

    return data
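
A minimal, hypothetical check of the dictionary returned by get_notification_data; the test name and assertions are assumptions, not part of the real suite:

    def test_notification_data_keys(self):
        data = get_notification_data()
        self.assertEqual(data['input_file'], 'good.nc')
        self.assertEqual(data['processing_result'], 'HANDLER_SUCCESS')
        # headers and rows come straight from PipelineFileCollection.get_table_data()
        self.assertIn('collection_headers', data)
        self.assertIn('collection_data', data)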
Example #20
    def test_setup_upload_location_push_newer_yearly_file(self):
        """
        Test case: Check that the creation date of the incoming yearly *.nc.gz is newer than the one already on storage
                   UPLOAD_ONLY the new incoming *.nc.gz
                   DELETE_ONLY the previous *.nc.gz
                   NO_ACTION on the nc inside the *.nc.gz
        """
        # create some PipelineFiles to represent the existing files on 'S3'
        preexisting_files = PipelineFileCollection()

        existing_file = PipelineFile(
            PREV_YEARLY_NC_GZ_STORAGE,
            dest_path=os.path.join(
                'IMOS/OceanCurrent/GSLA/DM00/yearfiles',
                os.path.basename(PREV_YEARLY_NC_GZ_STORAGE)))

        preexisting_files.update([existing_file])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(
            PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(
            self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        # run the handler
        handler = self.run_handler(GOOD_YEARLY_FILE_DM00)

        nc_file = handler.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)[0]
        self.assertEqual(nc_file.publish_type,
                         PipelineFilePublishType.NO_ACTION)

        nc_gz_file = handler.file_collection.filter_by_attribute_id(
            'file_type', FileType.GZIP)[0]
        self.assertEqual(nc_gz_file.publish_type,
                         PipelineFilePublishType.UPLOAD_ONLY)

        nc_gz_delete = handler.file_collection.filter_by_attribute_value(
            'name', os.path.basename(PREV_YEARLY_NC_GZ_STORAGE))[0]
        self.assertEqual(nc_gz_delete.publish_type,
                         PipelineFilePublishType.DELETE_ONLY)
Example #21
    def _cleanup_previous_version(self, product_filename):
        """Identify any previously published version(s) of the given product file and mark them for deletion.
        Ignores cases where the previous version has exactly the same file name, as this will simply be overwritten.

        :param product_filename: File name of the newly generated product
        """
        product_type = get_product_type(product_filename)
        for old_product_url in self.old_product_files.get(product_type, []):
            if os.path.basename(old_product_url) != product_filename:
                # Add the previous version as a "late deletion". It will be deleted during the handler's `publish`
                # step after (and only if) all new files have been successfully published.
                old_file = PipelineFile(
                    old_product_url,
                    dest_path=old_product_url,
                    is_deletion=True,
                    late_deletion=True,
                    file_update_callback=self._file_update_callback)
                old_file.publish_type = PipelineFilePublishType.DELETE_UNHARVEST
                self.file_collection.add(old_file)
Example #22
    def test_setup_upload_location_push_file_newer_creation_date(self):
        """
        Test case: Check that the creation date of the new *.nc is newer than the one already on storage
        """
        # create some PipelineFiles to represent the existing files on 'S3'
        preexisting_files = PipelineFileCollection()

        existing_file = PipelineFile(
            GOOD_NC_FV01, dest_path=AcornHandler.dest_path(GOOD_NC_FV01))
        preexisting_files.update([existing_file])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(
            PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(
            self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        # create a new file based on GOOD_NC_FV01 and modify it with a newer creation date
        # we patch the global variable from the handler in order to use the temporary broker file
        nc_file_new_creation_date_path = os.path.join(
            self.temp_dir, os.path.basename(GOOD_NC_FV01))
        shutil.copyfile(GOOD_NC_FV01, nc_file_new_creation_date_path)
        with Dataset(nc_file_new_creation_date_path, mode='r+') as nc_obj:
            delta_time = timedelta(1, 1, 1)
            new_time = datetime.strptime(nc_obj.date_created,
                                         '%Y-%m-%dT%H:%M:%SZ') + delta_time
            nc_obj.date_created = datetime.strftime(new_time,
                                                    '%Y-%m-%dT%H:%M:%SZ')

        # run the handler on the new file with a newer creation date
        handler = self.handler_class(nc_file_new_creation_date_path,
                                     include_regexes=[r'IMOS_ACORN_.*\.nc'])
        handler.opendap_root = broker.prefix
        handler.run()

        nc_file = handler.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)[0]
        self.assertEqual(nc_file.publish_type,
                         PipelineFilePublishType.HARVEST_UPLOAD)
Example #23
    def preprocess(self):
        """Check that every input file is valid according to the include/exclude regex patterns. Any non-matching
        file will be left with publish_type UNSET after the _resolve step.

        If there are any netCDF files from burst-sampling instruments in the collection, create the burst-averaged
        version of each and add them to the collection.

        :return: None
        """
        self.logger.info(
            "Checking for invalid files and adjusting check/publish properties."
        )

        invalid_files = self.file_collection.filter_by_attribute_id(
            'publish_type', PipelineFilePublishType.UNSET)
        if invalid_files:
            raise InvalidFileNameError(
                "File name(s) don't match the pattern expected for this upload location: {names}"
                .format(names=invalid_files.get_attribute_list('name')))

        # Burst-processing for FV01 files with burst-sampling global attributes
        burst_files = (self.file_collection.filter_by_attribute_id(
            'file_type',
            FileType.NETCDF).filter_by_attribute_regex('name', r'.*_FV01_'))
        for f in burst_files:
            with Dataset(f.src_path, mode='r') as D:
                has_interval = hasattr(D, 'instrument_burst_interval')
                has_duration = hasattr(D, 'instrument_burst_duration')
                is_adcp = ('DIST_ALONG_BEAMS' in D.dimensions
                           or 'HEIGHT_ABOVE_SENSOR' in D.dimensions)
            if not (has_interval and has_duration) or is_adcp:
                continue

            self.logger.info("Burst-processing {f.name}".format(f=f))
            product_path = create_burst_average_netcdf(f.src_path,
                                                       self.products_dir)
            product_file = PipelineFile(
                product_path, file_update_callback=self._file_update_callback)
            product_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
            self.file_collection.add(product_file)
Example #24
    def test_setup_upload_location_push_file_older_creation_date(self):
        """
        Test case: Check that the creation date of the new *.nc is older than the one already on storage
        """
        # create some PipelineFiles to represent the existing files on 'S3'
        preexisting_files = PipelineFileCollection()

        existing_file = PipelineFile(
            GOOD_NC_FV01, dest_path=AcornHandler.dest_path(GOOD_NC_FV01))
        preexisting_files.update([existing_file])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(
            PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(
            self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        # create a new file based on GOOD_NC_FV01. Modify it with an older creation date
        nc_file_old_creation_date_path = os.path.join(
            self.temp_dir, os.path.basename(GOOD_NC_FV01))
        shutil.copyfile(GOOD_NC_FV01, nc_file_old_creation_date_path)
        with Dataset(nc_file_old_creation_date_path, mode='r+') as nc_obj:
            delta_time = timedelta(1, 1, 1)
            new_time = datetime.strptime(nc_obj.date_created,
                                         '%Y-%m-%dT%H:%M:%SZ') - delta_time
            nc_obj.date_created = datetime.strftime(new_time,
                                                    '%Y-%m-%dT%H:%M:%SZ')

        # run the handler on the new file with an older creation date
        handler = self.handler_class(nc_file_old_creation_date_path,
                                     include_regexes=[r'IMOS_ACORN_.*\.nc'])
        handler.opendap_root = broker.prefix
        handler.run()

        self.assertIsInstance(handler.error, InvalidFileContentError)
Example #25
    def test_dstg(self):
        preexisting_file = PipelineFileCollection()
        existing_file = PipelineFile(DSTG, dest_path=os.path.join(
            'Department_of_Defence/DSTG/slocum_glider/TalismanSaberB20130706/', os.path.basename(DSTG)))

        preexisting_file.update([existing_file])

        # set the files to UPLOAD_ONLY
        preexisting_file.set_publish_types(PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_file)

        # test processing of DSTG and NRL NetCDF files
        handler = self.run_handler(DSTG)

        f = handler.file_collection[0]
        self.assertEqual(f.publish_type, PipelineFilePublishType.HARVEST_UPLOAD)
        self.assertEqual(f.dest_path,
                         'Department_of_Defence/DSTG/slocum_glider/TalismanSaberB20130706/' + f.name)
        self.assertTrue(f.is_stored)
        self.assertTrue(f.is_harvested)
Example #26
    def test_setup_upload_location_push_same_file(self):
        """
        Test case: Push same file twice to $INCOMING_DIR
                   HARVEST_UPLOAD the incoming *.nc.gz
                   NO_ACTION on the nc inside the *.nc.gz
        """
        # create some PipelineFiles to represent the existing files on 'S3'
        preexisting_files = PipelineFileCollection()

        existing_file = PipelineFile(PREV_NC_GZ_STORAGE,
                                     dest_path=os.path.join(
                                         'IMOS/OceanCurrent/GSLA/DM00/2018/',
                                         os.path.basename(PREV_NC_GZ_STORAGE)))

        preexisting_files.update([existing_file])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(
            PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(
            self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        # run the handler by uploading again the same file
        handler = self.run_handler(PREV_NC_GZ_STORAGE)

        nc_file = handler.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)[0]
        self.assertEqual(nc_file.publish_type,
                         PipelineFilePublishType.NO_ACTION)

        nc_gz_file = handler.file_collection.filter_by_attribute_id(
            'file_type', FileType.GZIP)[0]
        self.assertEqual(nc_gz_file.publish_type,
                         PipelineFilePublishType.HARVEST_UPLOAD)
Example #27
with open(GETFEATURE_FILE) as f:
    TEST_GETFEATURE_JSON = f.read()

with open(GETFEATURE_OLD_PRODUCTS_FILE) as f:
    TEST_GETFEATURE_OLD_PRODUCTS_JSON = f.read()

with open(GETFEATURE_EMPTY_FILE) as f:
    TEST_GETFEATURE_EMPTY_JSON = f.read()

# Create collection of input files for the products
# These will be uploaded to the mocked equivalent of S3 (where the real input files will be)
features = json.loads(TEST_GETFEATURE_JSON)['features']
INPUT_FILE_COLLECTION = PipelineFileCollection()
for f in features:
    pf = PipelineFile(
            os.path.join(TEST_ROOT, os.path.basename(f['properties']['url'])),
            dest_path=f['properties']['url']
    )
    pf.publish_type = PipelineFilePublishType.UPLOAD_ONLY
    INPUT_FILE_COLLECTION.add(pf)


class TestMooringsProductsHandler(HandlerTestCase):
    def setUp(self):
        self.handler_class = MooringsProductsHandler
        upload_broker = get_storage_broker(self.config.pipeline_config['global']['upload_uri'])
        upload_broker.upload(INPUT_FILE_COLLECTION)
        super().setUp()

    @patch('aodncore.util.wfs.WebFeatureService')
    def test_all_products(self, mock_webfeatureservice):
        mock_webfeatureservice().getfeature().getvalue.side_effect = [TEST_GETFEATURE_JSON,
Example #28
    def get_previous_version(self, previous_file_list, path, input_file_name):
        """
            Find the previous version of each incoming file based on its type/extension and
            add it to the file collection with the correct publish type.
            The extension can be: '.inf', '.nc.png', '.pitch.csv', '.roll.csv', '.gps.csv'
            inputs: previous_file_list : dictionary containing the file listing (full path) and metadata from the destination
                    input_file_name : file basename
                    path : full destination path
        """

        if not previous_file_list:
            return

        files_to_delete = PipelineFileCollection()

        try:
            extension = ALLOWED_CONTENT_EXTENSIONS.match(
                input_file_name).groupdict()['extension']
        except KeyError:
            raise ValueError(
                "unable to determine extension from file name {infile}".format(
                    infile=input_file_name))

        # get list of previous file basenames to search through
        basenames = {os.path.basename(f) for f in previous_file_list}

        this_extension_pattern = re.compile(
            r".*\.{ext}$".format(ext=extension))
        if input_file_name not in basenames:
            previous_file = [
                f for f in previous_file_list
                if this_extension_pattern.match(f)
            ]

            if extension == 'nc':
                if len(previous_file) != 1:
                    raise ValueError(
                        "Expected exactly 1 previous version of the netCDF file, found {n}. Aborting"
                        .format(n=len(previous_file)))
            else:
                # if the uploaded file has the same name as the published file => no action, the file will be
                # overwritten; otherwise sort files per wildcard and work out which one to delete.
                # The previous file wildcard can be:
                # '.inf', '.nc.png', '.pitch.csv', '.roll.csv', '.gps.csv'
                if len(previous_file) > 1:
                    raise ValueError(
                        "Found more than one previous version with the extension '{ext}'. Aborting"
                        .format(ext=extension))
                elif len(previous_file) == 0:
                    return

            prev_file = previous_file[0]
            dest_path = os.path.join(path, os.path.basename(prev_file))
            self.logger.info(
                "adding deletion of previous file '{dest_path}'".format(
                    dest_path=dest_path))

            file_to_delete = PipelineFile(prev_file,
                                          is_deletion=True,
                                          dest_path=dest_path)

            if extension == 'nc':
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_UNHARVEST
            else:
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_ONLY

            files_to_delete.add(file_to_delete)

        return files_to_delete
Example #29
    def preprocess(self):

        # if the input file is a NetCDF, create a .nc.gz and harvest/upload it.
        # historically, files were always sent as *.nc.gz, but as of April 2021 files might be pushed as *.nc.
        # to be consistent, we transform this .nc into a .nc.gz
        if self.file_type is FileType.NETCDF:
            self.file_collection.set_publish_types(
                PipelineFilePublishType.NO_ACTION)

            gzip_path = os.path.join(self.temp_dir, self.file_basename + '.gz')
            with open(self.input_file,
                      'rb') as f_in, gzip.open(gzip_path, 'wb') as gz_out:
                gz_out.writelines(f_in)

            # publish
            self.add_to_collection(
                gzip_path, publish_type=PipelineFilePublishType.HARVEST_UPLOAD)

        if self.file_type is FileType.GZIP:
            # add nc_gz file to collection (not by default)
            self.file_collection.add(self.input_file_object)
            netcdf_file_gz_collection = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.GZIP)
            netcdf_file_gz = netcdf_file_gz_collection[0]
            netcdf_file_gz.publish_type = PipelineFilePublishType.HARVEST_UPLOAD  # default

            # some GSLA files are gzipped, so gunzip them before checking them
            # if the uploaded file is a GZIP, check that it contains a NetCDF
            netcdf_collection = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.NETCDF)
            if len(netcdf_collection) != 1:
                raise InvalidInputFileError(
                    "Expecting one netCDF file in GZIP archive '{gzip}'".
                    format(gzip=os.path.basename(self.input_file)))

        netcdf_file_gz = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.GZIP)[0]
        netcdf_file = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)[0]
        # set the dest_path of the .gz file based on the gunzipped file
        netcdf_file_gz.dest_path = self.dest_path(netcdf_file.src_path)
        # Nothing to do with *.nc. Talend can harvest *.nc.gz. Set to NO_ACTION
        netcdf_file.publish_type = PipelineFilePublishType.NO_ACTION

        # we don't know the product type (DM00 or DM01) of the file already
        # on S3, which we need in order to deduce its path; we get the product
        # type from the incoming file instead
        result_previous_version_creation_date = self.get_previous_version_creation_date(
            netcdf_file.src_path)
        """ default values:
        by default we push to storage the file that landed in the pipeline (i.e. *.nc.gz) """
        push_new_file = True
        remove_previous_version = False

        # compare creation dates with file already on storage
        if result_previous_version_creation_date:
            new_file_creation_date = get_creation_date(netcdf_file.name)
            if result_previous_version_creation_date > new_file_creation_date:
                push_new_file = False
            elif result_previous_version_creation_date == new_file_creation_date:
                push_new_file = True
            else:
                remove_previous_version = True
                previous_file_path = self.get_previous_version_object(
                    netcdf_file.src_path)

        if push_new_file:
            if GSLA_REGEX_YEARLY.match(netcdf_file.name):
                # yearly file should never be harvested
                netcdf_file_gz.publish_type = PipelineFilePublishType.UPLOAD_ONLY
        else:
            raise InvalidFileNameError(
                "file name: \"{filename}\"  creation date is older than file already on "
                "storage".format(filename=netcdf_file_gz.name))

        # deletion of the previous file
        if remove_previous_version:
            previous_file_name = os.path.basename(previous_file_path)
            file_to_delete = PipelineFile(
                previous_file_name,
                is_deletion=True,
                dest_path=previous_file_path,
                file_update_callback=self._file_update_callback)

            if GSLA_REGEX_YEARLY.match(netcdf_file.name):
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_ONLY
            else:
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_UNHARVEST

            self.file_collection.add(file_to_delete)
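
The .nc to .nc.gz conversion at the top of this preprocess step is plain gzip compression; an isolated, self-contained sketch of that single transformation, with names illustrative only:

import gzip
import os

def gzip_netcdf(src_path, out_dir):
    """Compress a .nc file into <out_dir>/<basename>.nc.gz (illustrative helper)."""
    gz_path = os.path.join(out_dir, os.path.basename(src_path) + '.gz')
    with open(src_path, 'rb') as f_in, gzip.open(gz_path, 'wb') as gz_out:
        gz_out.writelines(f_in)
    return gz_path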
Example #30
    def test_delete_previous_file(self, mock_callsign):
        # create some PipelineFiles to represent the existing files on 'S3'
        preexisting_files = PipelineFileCollection()

        existing_file1 = PipelineFile(
            PREV_NC,
            dest_path=os.path.join(
                'IMOS/SOOP/SOOP-BA/VKAD_Antarctic-Discovery/Antarctic-Discovery_20160116-20160129/',
                os.path.basename(PREV_NC)))

        existing_file2 = PipelineFile(
            CSV,
            dest_path=os.path.join(
                'IMOS/SOOP/SOOP-BA/VKAD_Antarctic-Discovery/Antarctic-Discovery_20160116-20160129/',
                os.path.basename(CSV)))
        existing_file3 = PipelineFile(
            PNG,
            dest_path=os.path.join(
                'IMOS/SOOP/SOOP-BA/VKAD_Antarctic-Discovery/Antarctic-Discovery_20160116-20160129/',
                os.path.basename(PNG)))
        preexisting_files.update(
            [existing_file1, existing_file2, existing_file3])

        # set the files to UPLOAD_ONLY
        preexisting_files.set_publish_types(
            PipelineFilePublishType.UPLOAD_ONLY)

        # upload the 'preexisting_files' collection to the unit test's temporary upload location
        broker = get_storage_broker(
            self.config.pipeline_config['global']['upload_uri'])
        broker.upload(preexisting_files)

        # run the handler
        handler = self.run_handler(GOOD_ZIP)

        # add some tests to make sure the previous files were handled appropriately, e.g.
        # - they were added as deletions
        # - they were successfully deleted
        # - they were the *only* ones deleted
        nc_files = handler.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)

        for nc in nc_files:
            if nc.name == os.path.basename(PREV_NC):
                self.assertEqual(nc.publish_type,
                                 PipelineFilePublishType.DELETE_UNHARVEST)
                self.assertEqual(nc.is_deleted, True)
            else:
                self.assertEqual(nc.is_deleted, False)
        csvs = handler.file_collection.filter_by_attribute_id(
            'file_type', FileType.CSV)
        for csv in csvs:
            if csv.name == os.path.basename(CSV):
                self.assertEqual(csv.publish_type,
                                 PipelineFilePublishType.UPLOAD_ONLY)
                self.assertEqual(csv.is_deleted, False)

        pngs = handler.file_collection.filter_by_attribute_id(
            'file_type', FileType.PNG)
        for png in pngs:
            if png.name == os.path.basename(PNG):
                self.assertEqual(png.is_deleted, True)