    def test_read_file(self):
        """test reading files"""

        contents = util.read_file(
            resolve_test_data_path('data/20040709.ECC.2Z.2ZL1.NOAA-CMDL.csv'))

        self.assertIsInstance(contents, str)

        contents = util.read_file(
            resolve_test_data_path('data/wmo_acronym_vertical_sm.jpg'))

        self.assertIsInstance(contents, str)

        with self.assertRaises(FileNotFoundError):
            contents = util.read_file('404file.dat')

    def test_mixed_run_report(self):
        """
        Test that passing and failing files are written to the run report
        when a mixture of the two is processed
        """

        infile_root = resolve_test_data_path('data/general/pass_and_fail')

        agency = 'MSC'

        expected_passes = set()
        expected_fails = set()

        with report.OperatorReport() as error_bank:
            run_report = report.RunReport(SANDBOX_DIR)

            for infile in os.listdir(infile_root):
                fullpath = os.path.join(infile_root, infile)

                try:
                    contents = util.read_file(fullpath)
                    ecsv = parser.ExtendedCSV(contents, error_bank)
                except (parser.MetadataValidationError,
                        parser.NonStandardDataError):
                    expected_fails.add(fullpath)
                    run_report.write_failing_file(fullpath, agency)
                    continue

                try:
                    ecsv.validate_metadata_tables()
                    ecsv.validate_dataset_tables()
                    data_record = models.DataRecord(ecsv)
                    data_record.filename = infile

                    expected_passes.add(fullpath)
                    run_report.write_passing_file(fullpath, agency)
                except (parser.MetadataValidationError,
                        parser.NonStandardDataError):
                    expected_fails.add(fullpath)
                    run_report.write_failing_file(fullpath, agency)

        self.assertEquals(len(expected_passes), 6)
        self.assertEquals(len(expected_fails), 4)

        output_path = os.path.join(SANDBOX_DIR, 'run_report')
        self.assertTrue(os.path.exists(output_path))

        with open(output_path) as output:
            lines = output.read().splitlines()
            self.assertEquals(lines[0], agency)
            self.assertEquals(len(lines),
                              len(expected_passes) + len(expected_fails) + 1)

            for line in lines[1:]:
                if line.startswith('Pass'):
                    target = line[6:].strip()
                    self.assertIn(target, expected_passes)
                elif line.startswith('Fail'):
                    target = line[6:].strip()
                    self.assertIn(target, expected_fails)

    def test_non_extcsv_run_report(self):
        """Test that an unparseable file is written to the run report"""

        filename = 'not-an-ecsv.dat'
        infile = resolve_test_data_path('data/general/{}'.format(filename))
        contents = util.read_file(infile)

        agency = 'UNKNOWN'

        with report.OperatorReport() as error_bank:
            run_report = report.RunReport(SANDBOX_DIR)

            try:
                _ = parser.ExtendedCSV(contents, error_bank)
                raise AssertionError(
                    'Parsing of {} did not fail'.format(infile))
            except (parser.MetadataValidationError,
                    parser.NonStandardDataError):
                output_path = os.path.join(SANDBOX_DIR, 'run_report')

                run_report.write_failing_file(infile, agency)

                self.assertTrue(os.path.exists(output_path))
                with open(output_path) as output:
                    lines = output.read().splitlines()
                    self.assertEquals(len(lines), 2)

                    self.assertEquals(lines[0], agency)
                    self.assertEquals(lines[1], 'Fail: {}'.format(infile))

    def test_failing_run_report(self):
        """Test that a failing file is written to the run report"""

        filename = 'ecsv-missing-instrument-name.csv'
        infile = resolve_test_data_path('data/general/{}'.format(filename))
        contents = util.read_file(infile)

        ecsv = None
        # Agency typically filled in with FTP username for failing files.
        agency = 'rmda'

        with report.OperatorReport() as error_bank:
            run_report = report.RunReport(SANDBOX_DIR)

            try:
                ecsv = parser.ExtendedCSV(contents, error_bank)
                ecsv.validate_metadata_tables()
                agency = ecsv.extcsv['DATA_GENERATION']['Agency']

                ecsv.validate_dataset_tables()
                raise AssertionError(
                    'Parsing of {} did not fail'.format(infile))
            except (parser.MetadataValidationError,
                    parser.NonStandardDataError):
                output_path = os.path.join(SANDBOX_DIR, 'run_report')

                run_report.write_failing_file(infile, agency)

        self.assertTrue(os.path.exists(output_path))
        with open(output_path) as output:
            lines = output.read().splitlines()
            self.assertEquals(len(lines), 2)

            self.assertEquals(lines[0], agency)
            self.assertEquals(lines[1], 'Fail: {}'.format(infile))

    def test_passing_run_report(self):
        """Test that a passing file is written to the run report"""

        filename = '20080101.Kipp_Zonen.UV-S-E-T.000560.PMOD-WRC.csv'
        infile = resolve_test_data_path('data/general/{}'.format(filename))
        contents = util.read_file(infile)

        run_report = report.RunReport(SANDBOX_DIR)
        with report.OperatorReport() as error_bank:
            ecsv = parser.ExtendedCSV(contents, error_bank)

            ecsv.validate_metadata_tables()
            ecsv.validate_dataset_tables()
            data_record = models.DataRecord(ecsv)
            data_record.filename = filename

            agency = ecsv.extcsv['DATA_GENERATION']['Agency']
            output_path = os.path.join(SANDBOX_DIR, 'run_report')

            run_report.write_passing_file(infile, agency)

        self.assertTrue(os.path.exists(output_path))
        with open(output_path) as output:
            lines = output.read().splitlines()
            self.assertEquals(len(lines), 2)

            self.assertEquals(lines[0], agency)
            self.assertEquals(lines[1], 'Pass: {}'.format(infile))

    def test_warning_operator_report(self):
        """Test that file warnings are written in the operator report"""

        filename = 'ecsv-trailing-commas.csv'
        infile = resolve_test_data_path('data/general/{}'.format(filename))
        contents = util.read_file(infile)

        with report.OperatorReport(SANDBOX_DIR) as op_report:
            ecsv = parser.ExtendedCSV(contents, op_report)

            # Some warnings are encountered during parsing.
            ecsv.validate_metadata_tables()
            ecsv.validate_dataset_tables()
            data_record = models.DataRecord(ecsv)
            data_record.filename = filename

            agency = ecsv.extcsv['DATA_GENERATION']['Agency']

            today = datetime.now().strftime('%Y-%m-%d')
            output_path = os.path.join(SANDBOX_DIR,
                                       'operator-report-{}.csv'.format(today))

            op_report.add_message(200)  # File passes validation
            op_report.write_passing_file(infile, ecsv, data_record)

        self.assertTrue(os.path.exists(output_path))
        with open(output_path) as output:
            reader = csv.reader(output)
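            # Skip the operator report's header row before reading entries.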
            next(reader)

            expected_warnings = len(ecsv.warnings)
            for _ in range(expected_warnings):
                report_line = next(reader)

                self.assertEquals(report_line[0], 'P')
                self.assertEquals(report_line[1], 'Warning')
                self.assertIn(agency, report_line)
                self.assertIn(os.path.basename(infile), report_line)

            report_line = next(reader)
            self.assertEquals(report_line[0], 'P')
            self.assertEquals(report_line[1], 'Warning')
            self.assertEquals(report_line[2], '200')
            self.assertIn(agency, report_line)
            self.assertIn(os.path.basename(infile), report_line)

            with self.assertRaises(StopIteration):
                next(reader)

    def test_passing_operator_report(self):
        """Test that a passing file is written in the operator report"""

        filename = '20080101.Kipp_Zonen.UV-S-E-T.000560.PMOD-WRC.csv'
        infile = resolve_test_data_path('data/general/{}'.format(filename))
        contents = util.read_file(infile)

        with report.OperatorReport(SANDBOX_DIR) as op_report:
            ecsv = parser.ExtendedCSV(contents, op_report)

            ecsv.validate_metadata_tables()
            ecsv.validate_dataset_tables()
            data_record = models.DataRecord(ecsv)
            data_record.filename = filename

            agency = ecsv.extcsv['DATA_GENERATION']['Agency']

            today = datetime.now().strftime('%Y-%m-%d')
            output_path = os.path.join(SANDBOX_DIR,
                                       'operator-report-{}.csv'.format(today))

            op_report.add_message(200)  # File passes validation
            op_report.write_passing_file(infile, ecsv, data_record)

        self.assertTrue(os.path.exists(output_path))
        with open(output_path) as output:
            reader = csv.reader(output)
            next(reader)

            report_line = next(reader)
            self.assertEquals(report_line[0], 'P')
            self.assertEquals(report_line[2], '200')
            self.assertIn(agency, report_line)
            self.assertIn(os.path.basename(infile), report_line)

            with self.assertRaises(StopIteration):
                next(reader)

    def test_run_report_multiple_agencies(self):
        """Test that files in the run report are grouped by agency"""

        infile_root = resolve_test_data_path('data/general/agencies')

        expected_passes = {}
        expected_fails = {}
        agency_aliases = {
            'msc': 'MSC',
            'casiap': 'CAS-IAP',
            'mlcd-lu': 'MLCD-LU',
            'dwd-mohp': 'DWD-MOHp'
        }

        with report.OperatorReport() as error_bank:
            run_report = report.RunReport(SANDBOX_DIR)

            for dirpath, dirnames, filenames in os.walk(infile_root):
                for infile in filenames:
                    fullpath = os.path.join(dirpath, infile)
                    # Agency inferred from directory name.
                    agency = os.path.basename(dirpath)

                    try:
                        contents = util.read_file(fullpath)
                        ecsv = parser.ExtendedCSV(contents, error_bank)
                    except (parser.MetadataValidationError,
                            parser.NonStandardDataError):
                        if agency not in expected_passes:
                            expected_passes[agency] = set()
                        if agency not in expected_fails:
                            expected_fails[agency] = set()
                        expected_fails[agency].add(fullpath)
                        run_report.write_failing_file(fullpath, agency)
                        continue

                    try:
                        ecsv.validate_metadata_tables()
                        agency = ecsv.extcsv['DATA_GENERATION']['Agency']

                        if agency not in expected_passes:
                            expected_passes[agency] = set()
                        if agency not in expected_fails:
                            expected_fails[agency] = set()

                        ecsv.validate_dataset_tables()
                        data_record = models.DataRecord(ecsv)
                        data_record.filename = infile

                        expected_passes[agency].add(fullpath)
                        run_report.write_passing_file(fullpath, agency)
                    except (parser.MetadataValidationError,
                            parser.NonStandardDataError):
                        agency = agency_aliases[agency]
                        if agency not in expected_passes:
                            expected_passes[agency] = set()
                        if agency not in expected_fails:
                            expected_fails[agency] = set()

                        expected_fails[agency].add(fullpath)
                        run_report.write_failing_file(fullpath, agency)

        self.assertEquals(len(expected_passes['CAS-IAP']), 1)
        self.assertEquals(len(expected_passes['DWD-MOHp']), 2)
        self.assertEquals(len(expected_passes['MLCD-LU']), 3)
        self.assertEquals(len(expected_passes['MSC']), 4)

        self.assertEquals(len(expected_fails['CAS-IAP']), 0)
        self.assertEquals(len(expected_fails['DWD-MOHp']), 1)
        self.assertEquals(len(expected_fails['MLCD-LU']), 0)
        self.assertEquals(len(expected_fails['MSC']), 1)

        output_path = os.path.join(SANDBOX_DIR, 'run_report')
        self.assertTrue(os.path.exists(output_path))

        with open(output_path) as output:
            lines = output.read().splitlines()
            curr_agency = None

            for line in lines:
                if line.startswith('Pass'):
                    target = line[6:]
                    self.assertIn(target, expected_passes[curr_agency])
                elif line.startswith('Fail'):
                    target = line[6:]
                    self.assertIn(target, expected_fails[curr_agency])
                elif line.strip() != '':
                    curr_agency = line.strip()
                    self.assertIn(line, agency_aliases.values())

    def test_mixed_operator_report(self):
        """
        Test that passing and failing files are written to the operator report
        when a mixture of the two is processed
        """

        infile_root = resolve_test_data_path('data/general/pass_and_fail')

        warnings = {}
        errors = {}

        expected_warnings = {}
        expected_errors = {}

        agency = 'UNKNOWN'

        with report.OperatorReport(SANDBOX_DIR) as op_report:
            for infile in os.listdir(infile_root):
                fullpath = os.path.join(infile_root, infile)

                warnings[fullpath] = 0
                errors[fullpath] = 0

                try:
                    contents = util.read_file(fullpath)
                    ecsv = parser.ExtendedCSV(contents, op_report)
                except (parser.MetadataValidationError,
                        parser.NonStandardDataError) as err:
                    expected_errors[fullpath] = len(err.errors)

                    op_report.add_message(209)
                    op_report.write_failing_file(fullpath, agency)
                    continue

                try:
                    ecsv.validate_metadata_tables()
                    agency = ecsv.extcsv['DATA_GENERATION']['Agency']

                    ecsv.validate_dataset_tables()
                    data_record = models.DataRecord(ecsv)
                    data_record.filename = infile

                    expected_warnings[fullpath] = len(ecsv.warnings)
                    expected_errors[fullpath] = 0
                    op_report.write_passing_file(fullpath, ecsv, data_record)
                except (parser.MetadataValidationError,
                        parser.NonStandardDataError):
                    expected_warnings[fullpath] = len(ecsv.warnings)
                    expected_errors[fullpath] = len(ecsv.errors)

                    op_report.add_message(209)
                    op_report.write_failing_file(fullpath, agency, ecsv)

        today = datetime.now().strftime('%Y-%m-%d')
        output_path = os.path.join(SANDBOX_DIR,
                                   'operator-report-{}.csv'.format(today))

        self.assertTrue(os.path.exists(output_path))
        with open(output_path) as output:
            reader = csv.reader(output)
            next(reader)

            for line in reader:
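                # Column 12 of an operator report row holds the input file
                # path; columns 0-2 hold the status flag, severity and
                # message code checked below.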
                if expected_errors[line[12]] == 0:
                    self.assertEquals(line[0], 'P')
                    self.assertEquals(line[1], 'Warning')
                else:
                    self.assertEquals(line[0], 'F')

                if line[2] == '200':
                    self.assertEquals(expected_errors[line[12]], 0)
                elif line[2] == '209':
                    self.assertGreater(expected_errors[line[12]], 0)
                elif line[1] == 'Warning':
                    warnings[line[12]] += 1
                elif line[1] == 'Error':
                    errors[line[12]] += 1

        self.assertEquals(warnings, expected_warnings)
        self.assertEquals(errors, expected_errors)

    def test_failing_operator_report(self):
        """Test that a failing file is written in the operator report"""

        filename = 'ecsv-missing-instrument-name.csv'
        infile = resolve_test_data_path('data/general/{}'.format(filename))
        contents = util.read_file(infile)

        ecsv = None
        agency = 'UNKNOWN'

        with report.OperatorReport(SANDBOX_DIR) as op_report:
            try:
                ecsv = parser.ExtendedCSV(contents, op_report)
                ecsv.validate_metadata_tables()
                agency = ecsv.extcsv['DATA_GENERATION']['Agency']

                ecsv.validate_dataset_tables()
                raise AssertionError(
                    'Parsing of {} did not fail'.format(infile))
            except (parser.MetadataValidationError,
                    parser.NonStandardDataError):
                op_report.add_message(209)
                op_report.write_failing_file(infile, agency, ecsv)

        today = datetime.now().strftime('%Y-%m-%d')
        output_path = os.path.join(SANDBOX_DIR,
                                   'operator-report-{}.csv'.format(today))

        self.assertTrue(os.path.exists(output_path))
        with open(output_path) as output:
            reader = csv.reader(output)
            next(reader)

            warnings = 0
            errors = 0

            expected_warnings = len(ecsv.warnings)
            expected_errors = len(ecsv.errors)
            for _ in range(expected_warnings + expected_errors):
                report_line = next(reader)
                self.assertEquals(report_line[0], 'F')

                if report_line[1] == 'Warning':
                    warnings += 1
                elif report_line[1] == 'Error':
                    errors += 1

            self.assertEquals(warnings, expected_warnings)
            self.assertEquals(errors, expected_errors)

            report_line = next(reader)
            self.assertEquals(report_line[0], 'F')
            self.assertEquals(report_line[1], 'Error')
            self.assertEquals(report_line[2], '209')
            self.assertIn(agency, report_line)
            self.assertIn(os.path.basename(infile), report_line)

            with self.assertRaises(StopIteration):
                next(reader)

    def process_data(self, infile, verify_only=False, bypass=False):
        """
        process incoming data record

        :param infile: incoming filepath
        :param verify_only: perform verification only (no ingest)
        :param bypass: skip permission prompts

        :returns: `bool` of processing result
        """

        # detect incoming data file
        data = None
        self.data_record = None
        self.search_engine = search.SearchIndex()

        LOGGER.info('Processing file {}'.format(infile))
        LOGGER.info('Detecting file')
        if not is_text_file(infile):
            self.status = 'failed'
            self.code = 'NonStandardDataError'
            self.message = 'binary file detected'
            LOGGER.error('Unknown file: {}'.format(self.message))
            return False

        try:
            data = read_file(infile)
        except UnicodeDecodeError as err:
            self.status = 'failed'
            self.code = 'NonStandardDataError'
            self.message = err
            LOGGER.error('Unknown file: {}'.format(err))
            return False

        LOGGER.info('Parsing data record')
        ecsv = ExtendedCSV(data)

        try:
            LOGGER.info('Validating Extended CSV')
            ecsv.validate_metadata()
            LOGGER.info('Valid Extended CSV')
        except NonStandardDataError as err:
            self.status = 'failed'
            self.code = 'NonStandardDataError'
            self.message = err
            LOGGER.error('Invalid Extended CSV: {}'.format(err))
            return False
        except MetadataValidationError as err:
            self.status = 'failed'
            self.code = 'MetadataValidationError'
            self.message = err
            LOGGER.error('Invalid Extended CSV: {}'.format(err.errors))
            return False

        LOGGER.info('Data is valid Extended CSV')

        self.data_record = DataRecord(ecsv)
        self.data_record.ingest_filepath = infile
        self.data_record.filename = os.path.basename(infile)
        self.data_record.url = self.data_record.get_waf_path(
            config.WDR_WAF_BASEURL)
        self.process_end = datetime.utcnow()

        LOGGER.debug('Verifying if URN already exists')
        results = self.registry.query_by_field(DataRecord, self.data_record,
                                               'identifier')

        if results:
            msg = 'Data exists'
            self.status = 'failed'
            self.code = 'ProcessingError'
            self.message = msg
            LOGGER.error(msg)
            return False


#        domains_to_check = [
#            'content_category',
#            'data_generation_agency',
#            'platform_type',
#            'platform_id',
#            'platform_name',
#            'platform_country',
#            'instrument_name',
#            'instrument_model'
#        ]

#        for domain_to_check in domains_to_check:
#            value = getattr(self.data_record, domain_to_check)
#            domain = getattr(DataRecord, domain_to_check)
#
#            if value not in self.registry.query_distinct(domain):
#                msg = 'value {} not in domain {}'.format(value,
#                                                         domain_to_check)
#                LOGGER.error(msg)
#                # raise ProcessingError(msg)

        LOGGER.info('Verifying data record against core metadata fields')

        LOGGER.debug('Validating project')
        self.projects = self.registry.query_distinct(Project.identifier)
        if self.data_record.content_class not in self.projects:
            msg = 'Project {} not found in registry'.format(
                self.data_record.content_class)
            LOGGER.error(msg)
            raise ProcessingError(msg)
        else:
            LOGGER.debug('Matched with project: {}'.format(
                self.data_record.content_class))

        LOGGER.debug('Validating dataset')
        self.datasets = self.registry.query_distinct(Dataset.identifier)
        if self.data_record.content_category not in self.datasets:
            msg = 'Dataset {} not found in registry'.format(
                self.data_record.content_category)
            LOGGER.error(msg)
            raise ProcessingError(msg)
        else:
            LOGGER.debug('Matched with dataset: {}'.format(
                self.data_record.content_category))

        LOGGER.debug('Validating contributor')
        self.contributors = self.registry.query_distinct(
            Contributor.identifier)
        file_contributor = '{}:{}'.format(
            self.data_record.data_generation_agency,
            self.data_record.content_class)
        if file_contributor not in self.contributors:
            msg = 'Contributor {} not found in registry'.format(
                file_contributor)
            LOGGER.error(msg)
            raise ProcessingError(msg)
        else:
            LOGGER.debug(
                'Matched with contributor: {}'.format(file_contributor))

        # TODO: consider adding and checking #PLATFORM_Type
        LOGGER.debug('Validating station data')
        station = {
            'identifier': self.data_record.platform_id,
            'name': self.data_record.platform_name,
            'country_id': self.data_record.platform_country
        }

        LOGGER.debug('Validating station id...')
        results = self.registry.query_multiple_fields(Station, station,
                                                      ['identifier'])
        if results:
            LOGGER.debug('Validated with id: {}'.format(
                self.data_record.platform_id))
        else:
            msg = 'Station {} not found in registry'.format(
                self.data_record.platform_id)
            LOGGER.error(msg)
            raise ProcessingError(msg)

        LOGGER.debug('Validating station name...')
        fields = ['identifier', 'name']
        results = self.registry.query_multiple_fields(Station, station, fields)
        if results:
            LOGGER.debug('Validated with name: {} for id: {}'.format(
                self.data_record.platform_name, self.data_record.platform_id))
        else:
            msg = 'Station name: {} did not match data for id: {}'.format(
                self.data_record.platform_name, self.data_record.platform_id)
            LOGGER.error(msg)
            raise ProcessingError(msg)

        LOGGER.debug('Validating station country...')
        fields = ['identifier', 'country_id']
        results = self.registry.query_multiple_fields(Station, station, fields)
        if results:
            LOGGER.debug('Validated with country: {} for id: {}'.format(
                self.data_record.platform_country,
                self.data_record.platform_id))
        else:
            msg = 'Station country: {} did not match data for id: {}'.format(
                self.data_record.platform_country,
                self.data_record.platform_id)
            LOGGER.error(msg)
            raise ProcessingError(msg)

        LOGGER.debug('Validating instrument')
        self.instruments = self.registry.query_distinct(Instrument.identifier)
        instrument_added = False
        instrument = [
            self.data_record.instrument_name,
            self.data_record.instrument_model,
            self.data_record.instrument_number, self.data_record.platform_id,
            self.data_record.content_category
        ]
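        # The instrument identifier concatenates name, model, serial number,
        # station (platform) id and dataset, delimited by colons.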
        instrument_id = ':'.join(instrument)
        if instrument_id not in self.instruments:
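            # Retry with the serial number passed through int(), which strips
            # leading zeros (the older identifier format).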
            instrument[2] = str(int(instrument[2]))
            old_instrument_id = ':'.join(instrument)
            if old_instrument_id not in self.instruments:
                msg = 'Instrument {} not found in registry'.format(
                    instrument_id)
                LOGGER.warning(msg)
                LOGGER.debug('Checking for new serial number...')
                instrument_added = self.new_serial(instrument_id, verify_only)
                if not instrument_added:
                    if bypass:
                        LOGGER.info('Bypass mode. Skipping permission check.')
                        ins_data = self.get_instrument_data(instrument_id)
                        instrument = Instrument(ins_data)
                        self.registry.save(instrument)
                        LOGGER.info('Instrument successfully added.')
                        instrument_added = True
                    else:
                        response = input('No instrument with a new serial '
                                         'number found. Add new instrument? '
                                         '(y/n)\n')
                        if response == 'y':
                            ins_data = self.get_instrument_data(instrument_id)
                            instrument = Instrument(ins_data)
                            self.registry.save(instrument)
                            LOGGER.info('Instrument successfully added.')
                            instrument_added = True
                        else:
                            msg = 'Instrument data for id:{} does not match '\
                                  'existing records.'.format(instrument_id)
                            LOGGER.error(msg)
                            raise ProcessingError(msg)
                LOGGER.debug('Updating instruments list.')
                self.instruments = self.registry.\
                    query_distinct(Instrument.identifier)
            else:
                instrument_id = old_instrument_id

        if instrument_added and verify_only:
            LOGGER.debug('Skipping location check due to instrument '
                         'not being added in verification mode.')
        else:
            LOGGER.debug('Matched with instrument: {}'.format(instrument_id))
            LOGGER.debug('Checking instrument location...')
            location = {
                'identifier': instrument_id,
                'x': self.data_record.x,
                'y': self.data_record.y,
                'z': self.data_record.z
            }
            results = self.registry.query_multiple_fields(Instrument, location)
            if results:
                LOGGER.debug('Instrument location validated.')
            else:
                msg = 'Instrument location does not match database records.'
                LOGGER.error(msg)
                raise ProcessingError(msg)

        LOGGER.debug('Validating agency deployment')
        deployment_id = ':'.join([
            self.data_record.platform_id,
            self.data_record.data_generation_agency,
            self.data_record.content_class
        ])
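        # Deployment identifiers follow station_id:agency:project, the three
        # fields joined above.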
        data = {
            'identifier': deployment_id,
            'station_id': self.data_record.platform_id,
            'contributor_id': file_contributor,
            'start_date': self.data_record.timestamp_date,
            'end_date': self.data_record.timestamp_date
        }
        deployment = self.registry.query_multiple_fields(
            Deployment, data, ['identifier'])
        if deployment:
            if deployment.start_date > self.data_record.timestamp_date:
                deployment.start_date = self.data_record.timestamp_date
                self.registry.save()
                LOGGER.debug('Deployment start date updated.')
            elif deployment.end_date < self.data_record.timestamp_date:
                deployment.end_date = self.data_record.timestamp_date
                self.registry.save()
                LOGGER.debug('Deployment end date updated.')
            LOGGER.debug('Deployment validated')
        else:
            LOGGER.warning('Deployment not found')
            if bypass:
                LOGGER.info('Bypass mode. Skipping permission check')
                deployment = Deployment(data)
                self.registry.save(deployment)
                LOGGER.warning('Deployment {} added'.format(
                    deployment.identifier))
            else:
                response = input('Deployment {} not found. '
                                 'Add? (y/n)\n'.format(deployment_id))
                if response == 'y':
                    deployment = Deployment(data)
                    self.registry.save(deployment)
                    LOGGER.warning('Deployment {} added'.format(
                        deployment.identifier))
                else:
                    msg = 'Deployment {} not added. Skipping file.'.format(
                        deployment_id)
                    LOGGER.error(msg)
                    raise ProcessingError(msg)
        LOGGER.info('Data record is valid and verified')

        if verify_only:  # do not save or index
            LOGGER.info('Verification mode detected. NOT saving to registry')
            return True

        LOGGER.info('Saving data record CSV to registry')
        self.registry.save(self.data_record)

        LOGGER.info('Saving data record CSV to WAF')
        waf_filepath = self.data_record.get_waf_path(config.WDR_WAF_BASEDIR)
        os.makedirs(os.path.dirname(waf_filepath), exist_ok=True)
        shutil.copy2(self.data_record.ingest_filepath, waf_filepath)

        LOGGER.info('Indexing data record search engine')
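        # Re-index only when the record is new to the search index or its
        # data generation version is newer than the version already indexed.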
        version = self.search_engine.get_record_version(self.data_record.es_id)
        if version:
            if version < self.data_record.data_generation_version:
                self.search_engine.index_data_record(
                    self.data_record.__geo_interface__)
        else:
            self.search_engine.index_data_record(
                self.data_record.__geo_interface__)
        return True

    def test_ecsv(self):
        """test Extended CSV handling"""

        # good file
        contents = util.read_file(
            resolve_test_data_path('data/20040709.ECC.2Z.2ZL1.NOAA-CMDL.csv'))

        ecsv = parser.ExtendedCSV(contents)
        self.assertIsInstance(ecsv, parser.ExtendedCSV)
        self.assertEqual('20040709.ECC.2Z.2ZL1.NOAA-CMDL.csv',
                         ecsv.gen_woudc_filename())

        # good file, missing instrument number
        contents = util.read_file(
            resolve_test_data_path('data/ecsv-missing-instrument-number.csv'))

        ecsv = parser.ExtendedCSV(contents)
        self.assertIsInstance(ecsv, parser.ExtendedCSV)
        self.assertEqual('20111101.Brewer.MKIII.na.RMDA.csv',
                         ecsv.gen_woudc_filename())

        # good file, space in instrument name
        contents = util.read_file(
            resolve_test_data_path('data/ecsv-space-in-instrument-name.csv'))
        ecsv = parser.ExtendedCSV(contents)
        self.assertEqual('20111101.Brewer-foo.MKIII.na.RMDA.csv',
                         ecsv.gen_woudc_filename())

        ecsv = parser.ExtendedCSV(contents)
        self.assertIsInstance(ecsv, parser.ExtendedCSV)

        self.assertTrue(
            set(DOMAINS['metadata_tables'].keys()).issubset(
                set(ecsv.extcsv.keys())))
        ecsv.validate_metadata()

        # good file, test special characters
        contents = util.read_file(
            resolve_test_data_path('data/Brewer229_Daily_SEP2016.493'))

        ecsv = parser.ExtendedCSV(contents)
        self.assertIsInstance(ecsv, parser.ExtendedCSV)

        self.assertTrue(
            set(DOMAINS['metadata_tables'].keys()).issubset(
                set(ecsv.extcsv.keys())))
        ecsv.validate_metadata()

        self.assertEqual(ecsv.extcsv['PLATFORM']['Name'], 'Río Gallegos')

        # bad file (not an ecsv)
        contents = util.read_file(
            resolve_test_data_path('data/not-an-ecsv.dat'))

        ecsv = parser.ExtendedCSV(contents)
        self.assertIsInstance(ecsv, parser.ExtendedCSV)

        with self.assertRaises(parser.NonStandardDataError):
            ecsv.validate_metadata()

        # bad file (missing table)
        contents = util.read_file(
            resolve_test_data_path('data/ecsv-missing-location-table.csv'))

        ecsv = parser.ExtendedCSV(contents)
        self.assertIsInstance(ecsv, parser.ExtendedCSV)

        with self.assertRaises(parser.MetadataValidationError):
            ecsv.validate_metadata()

        # bad file (missing data - LOCATION.Height)
        contents = util.read_file(
            resolve_test_data_path('data/ecsv-missing-location-height.csv'))

        ecsv = parser.ExtendedCSV(contents)
        self.assertIsInstance(ecsv, parser.ExtendedCSV)

        self.assertTrue(
            set(DOMAINS['metadata_tables'].keys()).issubset(
                set(ecsv.extcsv.keys())))

        with self.assertRaises(parser.MetadataValidationError):
            ecsv.validate_metadata()

        # bad file (invalid location latitude)
        contents = util.read_file(
            resolve_test_data_path('data/ecsv-invalid-location-latitude.csv'))

        ecsv = parser.ExtendedCSV(contents)
        self.assertIsInstance(ecsv, parser.ExtendedCSV)

        with self.assertRaises(parser.MetadataValidationError):
            ecsv.validate_metadata()

        # bad file (invalid location longitude)
        contents = util.read_file(
            resolve_test_data_path('data/ecsv-invalid-location-longitude.csv'))

        ecsv = parser.ExtendedCSV(contents)
        self.assertIsInstance(ecsv, parser.ExtendedCSV)

        with self.assertRaises(parser.MetadataValidationError):
            ecsv.validate_metadata()


def orchestrate(source,
                working_dir,
                metadata_only=False,
                verify_only=False,
                bypass=False):
    """
    Core orchestration workflow

    :param source: Path to input file or directory tree containing them.
    :param working_dir: Output directory for log and report files.
    :param metadata_only: `bool` of whether to verify only the
                          common metadata tables.
    :param verify_only: `bool` of whether to verify the file for correctness
                        without processing.
    :param bypass: `bool` of whether to skip permission prompts for adding
                   new records.
    :returns: void
    """

    files_to_process = []

    if os.path.isfile(source):
        fullpath = Path(source).parent.resolve()
        parent_dir = os.path.basename(str(fullpath))

        # Use parent dir to guess the contributor acronym during processing
        # runs, where the parent path is the contributor's FTP name.
        files_to_process = [(source, parent_dir)]
    elif os.path.isdir(source):
        for root, dirs, files in os.walk(source):
            parent_dir = os.path.basename(root)

            for f in files:
                fullpath = os.path.join(root, f)
                files_to_process.append((fullpath, parent_dir))

    files_to_process.sort()

    passed = []
    failed = []

    registry = Registry()
    search_engine = SearchIndex()

    with OperatorReport(working_dir) as op_report, \
         click.progressbar(files_to_process, label='Processing files') as run_:  # noqa

        run_report = RunReport(working_dir)

        for file_to_process, contributor in run_:
            click.echo('Processing filename: {}'.format(file_to_process))

            LOGGER.info('Detecting file')
            if not is_text_file(file_to_process):
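                # Message 1 flags a non-text (binary) file; the second value
                # returned by add_message says whether it counts as an error.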
                _, is_error = op_report.add_message(1)
                if is_error:
                    op_report.write_failing_file(file_to_process, contributor)
                    run_report.write_failing_file(file_to_process, contributor)

                    failed.append(file_to_process)
                    continue

            try:
                contents = read_file(file_to_process)

                LOGGER.info('Parsing data record')
                extcsv = ExtendedCSV(contents, op_report)

                LOGGER.info('Validating Extended CSV')
                extcsv.validate_metadata_tables()
                contributor = extcsv.extcsv['DATA_GENERATION']['Agency']

                if not metadata_only:
                    extcsv.validate_dataset_tables()
                LOGGER.info('Valid Extended CSV')

                p = Process(registry, search_engine, op_report)
                data_record = p.validate(extcsv,
                                         bypass=bypass,
                                         metadata_only=metadata_only)

                if data_record is None:
                    click.echo('Not ingesting')
                    failed.append(file_to_process)

                    op_report.write_failing_file(file_to_process, contributor,
                                                 extcsv)
                    run_report.write_failing_file(file_to_process, contributor)
                else:
                    data_record.ingest_filepath = file_to_process
                    data_record.filename = os.path.basename(file_to_process)
                    data_record.url = \
                        data_record.get_waf_path(config.WDR_WAF_BASEURL)
                    data_record.output_filepath = \
                        data_record.get_waf_path(config.WDR_WAF_BASEDIR)

                    if verify_only:
                        click.echo('Verified but not ingested')
                    else:
                        p.persist()
                        click.echo('Ingested successfully')

                    op_report.write_passing_file(file_to_process, extcsv,
                                                 data_record)
                    run_report.write_passing_file(file_to_process, contributor)

                    passed.append(file_to_process)

            except UnicodeDecodeError as err:
                LOGGER.error('Unknown file format: {}'.format(err))

                click.echo('Not ingested')
                failed.append(file_to_process)

                op_report.write_failing_file(file_to_process, contributor)
                run_report.write_failing_file(file_to_process, contributor)
            except NonStandardDataError as err:
                LOGGER.error('Invalid Extended CSV: {}'.format(err.errors))

                click.echo('Not ingested')
                failed.append(file_to_process)

                op_report.write_failing_file(file_to_process, contributor)
                run_report.write_failing_file(file_to_process, contributor)
            except MetadataValidationError as err:
                LOGGER.error('Invalid Extended CSV: {}'.format(err.errors))

                click.echo('Not ingested')
                failed.append(file_to_process)

                op_report.write_failing_file(file_to_process, contributor)
                run_report.write_failing_file(file_to_process, contributor)
            except Exception as err:
                click.echo('Processing failed: {}'.format(err))
                failed.append(file_to_process)

                op_report.write_failing_file(file_to_process, contributor)
                run_report.write_failing_file(file_to_process, contributor)

    registry.close_session()

    for name, _ in files_to_process:
        if name in passed:
            click.echo('Pass: {}'.format(name))
        elif name in failed:
            click.echo('Fail: {}'.format(name))

    click.echo('({}/{} files passed)'.format(len(passed),
                                             len(files_to_process)))