Example #1
def test_checks_records_error():
    setup_main_database()
    with tempfile.TemporaryDirectory() as tmpdir:
        source = EmptySource(tmpdir)
        metadata_db = MetadataDB(tmpdir)
        metadata_db.create_session_metadata("Test", True,
                                            "http://www.test.com",
                                            "2018-01-01-10-00-00")
        metadata_db.add_filestatus({
            'filename': 'test1.json',
            'url': 'http://www.test.com',
            'data_type': 'record_package'
        })

        # store details
        source_session_id = database.start_store("test_source",
                                                 "2018-01-01-10-00-00", True,
                                                 metadata_db)
        for data in metadata_db.list_filestatus():
            with database.add_file(source_session_id, data) as database_file:
                database_file.insert_record({'record': 'totally'}, {
                    'version': '0.1-does-not-exist',
                    'extensions': []
                })
        database.end_store(source_session_id)

        record_id = 1
        # Hard-coding the ID is not ideal: it relies on the DB assigning 1 to
        # this new row, but that is a safe assumption on a fresh test database.

        # Test
        assert not database.is_record_check_done(record_id)
        assert not database.is_check_done(source_session_id)

        # check!
        for data in metadata_db.list_filestatus():
            checks.check_file(source, source_session_id, data)

        # Test
        assert database.is_record_check_done(record_id)
        assert database.is_check_done(source_session_id)

        with database.get_engine().begin() as connection:
            s = sa.sql.select([database.record_check_error_table])
            result = connection.execute(s)
            data = result.fetchone()

        assert ('The schema version in your data is not valid. Accepted values:'
                in data['error'])
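
A note on EmptySource: this test (and the next one) uses an EmptySource helper defined elsewhere in the test suite. A minimal sketch of what it plausibly looks like, assuming it is simply a Source subclass (see Example #3) with nothing to download; the attribute values are taken from the test setup, and the exact definition is an assumption:

class EmptySource(Source):
    source_id = 'test_source'
    publisher_name = 'Test'
    url = 'http://www.test.com'

    def gather_all_download_urls(self):
        # Nothing to gather; the tests add file statuses to the metadata
        # database by hand instead.
        return []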
Example #2
def test_checks_releases():
    setup_main_database()
    with tempfile.TemporaryDirectory() as tmpdir:
        source = EmptySource(tmpdir)
        metadata_db = MetadataDB(tmpdir)
        metadata_db.create_session_metadata("Test", True,
                                            "http://www.test.com",
                                            "2018-01-01-10-00-00")
        metadata_db.add_filestatus({
            'filename': 'test1.json',
            'url': 'http://www.test.com',
            'data_type': 'release_package'
        })

        # store details
        source_session_id = database.start_store("test_source",
                                                 "2018-01-01-10-00-00", True,
                                                 metadata_db)
        for data in metadata_db.list_filestatus():
            with database.add_file(source_session_id, data) as database_file:
                database_file.insert_release({'release': 'totally'},
                                             {'extensions': []})
        database.end_store(source_session_id)

        release_id = 1
        # Hard-coding the ID is not ideal: it relies on the DB assigning 1 to
        # this new row, but that is a safe assumption on a fresh test database.

        # Test
        assert not database.is_release_check_done(release_id)

        # check!
        for data in metadata_db.list_filestatus():
            checks.check_file(source, source_session_id, data)

        # Test
        assert database.is_release_check_done(release_id)

        with database.get_engine().begin() as connection:
            s = sa.sql.select([database.release_check_table])
            result = connection.execute(s)
            data = result.fetchone()

        assert data['cove_output']['file_type'] == 'json'
        assert len(data['cove_output']['validation_errors']) > 0
Example #3
class Source:
    publisher_name = None
    url = None
    output_directory = None
    source_id = None
    sample = False
    data_version = None
    """It is possible to pass extra arguments.

    This specifies a list of the extra arguments possible. Each item in the list should be a dict with the keys:
      *  name - a name compatible with argparse. Names should be unique across all sources, so include a prefix of some kind.
      *  help - a help string for argparse
    """
    argument_definitions = []
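    # For example, a subclass might declare (a hypothetical definition, shown
    # only to illustrate the expected shape described in the docstring above):
    #
    #     argument_definitions = [{
    #         'name': '--examplesource-start-page',
    #         'help': 'The page of the publisher API to start gathering from',
    #     }]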

    def __init__(self,
                 base_dir,
                 remove_dir=False,
                 publisher_name=None,
                 url=None,
                 sample=False,
                 data_version=None,
                 new_version=False):

        self.base_dir = base_dir
        self.sample = sample

        self.publisher_name = publisher_name or self.publisher_name
        if not self.publisher_name:
            raise AttributeError('A publisher name needs to be specified')

        # Make sure the output directory is fully specified, including sample bit (if applicable)
        self.output_directory = self.output_directory or self.source_id
        if not self.output_directory:
            raise AttributeError('An output directory needs to be specified')

        if self.sample and not self.output_directory.endswith('_sample'):
            self.output_directory += '_sample'

        # Load all versions if possible, pick an existing one or set a new one.
        all_versions = sorted(os.listdir(os.path.join(base_dir, self.output_directory)), reverse=True)\
            if os.path.exists(os.path.join(base_dir, self.output_directory)) else []

        if self.data_version:
            pass
        elif data_version and data_version in all_versions:  # Version specified is valid
            self.data_version = data_version
        elif data_version:  # Version specified is invalid!
            raise AttributeError('A version was specified that does not exist')
        elif new_version or len(all_versions) == 0:  # New Version
            self.data_version = datetime.datetime.utcnow().strftime(
                '%Y-%m-%d-%H-%M-%S')
        elif len(all_versions) > 0:  # Get the latest version to resume
            self.data_version = all_versions[0]
        else:  # Should not happen...
            raise AttributeError(
                'The version is unavailable on the output directory')

        # Build full directory, make sure it exists
        self.full_directory = os.path.join(base_dir, self.output_directory,
                                           self.data_version)

        exists = os.path.exists(self.full_directory)

        try:
            if exists and remove_dir:
                os.rmdir(self.full_directory)
                exists = False

            if not exists:
                os.makedirs(self.full_directory)
        except OSError:
            raise RuntimeError(
                "Error: Write permission is needed on the directory specified (or project dir). %s"
                % self.full_directory)

        # Misc

        self.url = url or self.url

        self.metadata_db = MetadataDB(self.full_directory)

        self.metadata_db.create_session_metadata(
            publisher_name=self.publisher_name,
            sample=self.sample,
            url=self.url,
            data_version=self.data_version)

        self.logger = logging.getLogger('ocdskingfisher.source')

    """Returns an array with objects for each url.

    The return objects includes url,filename,type and more."""

    def gather_all_download_urls(self):
        raise NotImplementedError()

    def set_arguments(self, arguments):
        pass

    def run_gather(self):
        self.logger.info("Starting run_gather")

        metadata = self.metadata_db.get_session()

        if metadata['gather_success']:
            return

        self.metadata_db.update_session_gather_start()

        try:
            for info in self.gather_all_download_urls():
                if self.metadata_db.has_filestatus_filename(info['filename']):
                    if not self.metadata_db.compare_filestatus_to_database(
                            info):
                        raise Exception(
                            "Tried to add the file " + info['filename'] +
                            " but it clashed with a file already in the list!")
                else:
                    self.metadata_db.add_filestatus(info)
        except Exception as e:
            error = repr(e)
            stacktrace = traceback.format_exception(*sys.exc_info())
            self.metadata_db.update_session_gather_end(False, error,
                                                       stacktrace)
            return

        self.metadata_db.update_session_gather_end(True, None, None)

    def is_gather_finished(self):
        metadata = self.metadata_db.get_session()
        return bool(metadata['gather_finished_datetime'])

    def run_fetch(self):
        self.logger.info("Starting run_fetch")

        metadata = self.metadata_db.get_session()

        if metadata['fetch_success']:
            return

        if not metadata['gather_success']:
            msg = 'Can not run fetch without a successful gather!'
            if metadata['gather_errors']:
                msg += ' Gather errors: ' + metadata['gather_errors']
            raise Exception(msg)

        self.metadata_db.update_session_fetch_start()

        data = self.metadata_db.get_next_filestatus_to_fetch()
        while data:

            self.logger.info("Starting run_fetch for file " + data['filename'])
            self.metadata_db.update_filestatus_fetch_start(data['filename'])
            try:
                response = self.save_url(
                    data['filename'], data,
                    os.path.join(self.full_directory, data['filename']))
                if response.additional_files:
                    for info in response.additional_files:
                        if self.metadata_db.has_filestatus_filename(
                                info['filename']):
                            if not self.metadata_db.compare_filestatus_to_database(
                                    info):
                                response.errors.append(
                                    "Tried to add the file " +
                                    info['filename'] +
                                    " but it clashed with a file already in the list!"
                                )
                        else:
                            self.metadata_db.add_filestatus(info)
                self.metadata_db.update_filestatus_fetch_end(
                    data['filename'], response.errors, response.warnings)

            except Exception as e:
                self.metadata_db.update_filestatus_fetch_end(
                    data['filename'], [repr(e)])

            data = self.metadata_db.get_next_filestatus_to_fetch()

        self.metadata_db.update_session_fetch_end()

    def is_fetch_finished(self):
        metadata = self.metadata_db.get_session()
        return bool(metadata['fetch_finished_datetime'])

    """Uploads the fetched data as record rows to the Database"""

    def run_store(self):
        self.logger.info("Starting run_store")
        metadata = self.metadata_db.get_session()

        if not metadata['fetch_success']:
            raise Exception('Can not run store without a successful fetch')

        if database.is_store_done(self.source_id, self.data_version,
                                  self.sample):
            return

        source_session_id = database.start_store(self.source_id,
                                                 self.data_version,
                                                 self.sample, self.metadata_db)

        for data in self.metadata_db.list_filestatus():

            if data['data_type'].startswith('meta'):
                continue

            if database.is_store_file_done(source_session_id, data):
                continue

            with database.add_file(source_session_id, data) as database_file:

                self.logger.info("Starting run_store for file " +
                                 data['filename'])

                try:
                    with open(os.path.join(self.full_directory,
                                           data['filename']),
                              encoding=data['encoding']) as f:
                        file_json_data = json.load(f)
                except Exception:
                    # TODO better way of dealing with this?
                    raise

                objects_list = []
                if data['data_type'] in ('record_package_list_in_results',
                                         'release_package_list_in_results'):
                    objects_list.extend(file_json_data['results'])
                elif data['data_type'] in ('record_package_list',
                                           'release_package_list'):
                    objects_list.extend(file_json_data)
                else:
                    objects_list.append(file_json_data)

                del file_json_data

                for json_data in objects_list:
                    error_msg = ''
                    if not isinstance(json_data, dict):
                        # Raise straight away: the checks below assume a dict.
                        raise Exception(
                            "Can not process data in file {} as JSON is not an object"
                            .format(data['filename']))

                    if data['data_type'] == 'release':
                        data_list = [json_data]
                    elif data['data_type'] in ('release_package',
                                               'release_package_list_in_results',
                                               'release_package_list'):
                        if 'releases' not in json_data:
                            error_msg = "Release list not found in file {}".format(
                                data['filename'])
                        elif not isinstance(json_data['releases'], list):
                            error_msg = "Release list which is not a list found in file {}".format(
                                data['filename'])
                        # .get avoids a KeyError masking the error message above
                        data_list = json_data.get('releases')
                    elif data['data_type'] in ('record_package',
                                               'record_package_list_in_results',
                                               'record_package_list'):
                        if 'records' not in json_data:
                            error_msg = "Record list not found in file {}".format(
                                data['filename'])
                        elif not isinstance(json_data['records'], list):
                            error_msg = "Record list which is not a list found in file {}".format(
                                data['filename'])
                        # .get avoids a KeyError masking the error message above
                        data_list = json_data.get('records')
                    else:
                        error_msg = "data_type not a known type"

                    if error_msg:
                        raise Exception(error_msg)
                    package_data = {}
                    if not data['data_type'] == 'release':
                        for key, value in json_data.items():
                            if key not in ('releases', 'records'):
                                package_data[key] = value

                    for row in data_list:
                        if not isinstance(row, dict):
                            raise Exception(
                                "Row in data is not an object {}".format(
                                    data['filename']))

                        if data['data_type'] in ('record_package',
                                                 'record_package_list_in_results',
                                                 'record_package_list'):
                            database_file.insert_record(row, package_data)
                        else:
                            database_file.insert_release(row, package_data)

        database.end_store(source_session_id)

    def save_url(self, file_name, data, file_path):
        save_content_response = save_content(data['url'], file_path)
        return self.SaveUrlResult(errors=save_content_response.errors,
                                  warnings=save_content_response.warnings)

    class SaveUrlResult:
        def __init__(self, additional_files=None, errors=None, warnings=None):
            # None defaults avoid the shared-mutable-default-argument pitfall.
            self.additional_files = additional_files or []
            self.errors = errors or []
            self.warnings = warnings or []

    def run_check(self, override_schema_version=None):
        self.logger.info("Starting run_check")
        if not database.is_store_done(self.source_id, self.data_version,
                                      self.sample):
            raise Exception('Can not run check without a successful store')

        source_session_id = database.get_id_of_store(self.source_id,
                                                     self.data_version,
                                                     self.sample)

        for data in self.metadata_db.list_filestatus():

            if data['data_type'].startswith('meta'):
                continue

            self.logger.info("Starting run_check for file " + data['filename'])

            check_file(self, source_session_id, data, override_schema_version)

    """Called with data to check before checks are run, so any problems can be fixed (See Australia)"""

    def before_check_data(self, data, override_schema_version=None):
        return data

    """Gather, Fetch, Store and Check data from this publisher."""

    def run_all(self):
        self.run_gather()
        self.run_fetch()
        self.run_store()
        self.run_check()

    def force_fetch_to_gather(self):
        self.logger.info("Starting force_fetch_to_gather")
        self.metadata_db.force_fetch_to_gather()
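
As a usage sketch: a concrete scraper subclasses Source, sets its identifiers and implements gather_all_download_urls; gather, fetch, store and check are then inherited from the base class. The publisher below is hypothetical, and the encoding and priority keys are assumptions inferred from the fields SourceStatus reads in Example #4:

class ExamplePublisherSource(Source):
    source_id = 'example_publisher'
    publisher_name = 'Example Publisher'
    url = 'http://example.com'

    def gather_all_download_urls(self):
        # One release package per API page; the keys mirror the filestatus
        # fields stored via metadata_db.add_filestatus above.
        return [{
            'filename': 'page1.json',
            'url': 'http://example.com/api/releases?page=1',
            'data_type': 'release_package',
            'encoding': 'utf-8',
            'priority': 1,
        }]

source = ExamplePublisherSource('/tmp/ocds', sample=True, new_version=True)
source.run_all()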
Example #4
class SourceStatus:
    def __init__(self,
                 base_dir,
                 source_id,
                 output_directory=None,
                 sample=False,
                 data_version=None):

        self.base_dir = base_dir
        self.source_id = source_id
        self.sample = sample

        # Make sure the output directory is fully specified, including sample bit (if applicable)
        self.output_directory = output_directory or source_id
        if not self.output_directory:
            raise AttributeError('An output directory needs to be specified')

        if self.sample and not self.output_directory.endswith('_sample'):
            self.output_directory += '_sample'

        # Load all versions if possible, pick an existing one or set a new one.
        all_versions = sorted(os.listdir(os.path.join(base_dir, self.output_directory)), reverse=True)\
            if os.path.exists(os.path.join(base_dir, self.output_directory)) else []

        if data_version and data_version in all_versions:  # Version specified is valid
            self.data_version = data_version
        elif data_version:  # Version specified is invalid!
            raise AttributeError('A version was specified that does not exist')
        elif len(all_versions) > 0:  # Get the latest version to resume
            self.data_version = all_versions[0]
        else:  # No versions exist, so there is nothing to resume
            raise AttributeError(
                'The source and/or version is unavailable on the output directory'
            )

        # Build full directory, make sure it exists
        self.full_directory = os.path.join(base_dir, self.output_directory,
                                           self.data_version)
        if not os.path.exists(self.full_directory):
            raise AttributeError('Full Directory does not exist!')

        # Misc
        self.metadata_db = MetadataDB(self.full_directory)

    def is_gather_finished(self):
        metadata = self.metadata_db.get_session()
        return bool(metadata['gather_finished_datetime'])

    def get_gather_progress_as_text(self):

        out = []
        metadata = self.metadata_db.get_session()

        if metadata['gather_start_datetime']:
            out.append("Started " +
                       metadata['gather_start_datetime'].strftime("%c"))

        return "\n".join(out)

    def is_fetch_finished(self):
        metadata = self.metadata_db.get_session()
        return bool(metadata['fetch_finished_datetime'])

    def get_fetch_progress_as_text(self):

        file_statuses = self.metadata_db.list_filestatus()
        count_finished = 0
        current_out = []

        for file_status in file_statuses:
            if file_status['fetch_finished_datetime'] is not None:
                count_finished += 1

            if file_status['fetch_start_datetime'] and file_status[
                    'fetch_finished_datetime'] is None:
                current_out.append("Filename: " + file_status['filename'])
                current_out.append("URL: " + file_status['url'])
                current_out.append("Data Type: " + file_status['data_type'])
                current_out.append("Encoding: " + file_status['encoding'])
                current_out.append("Priority: " + str(file_status['priority']))
                current_out.append(
                    "Started: " +
                    file_status['fetch_start_datetime'].strftime("%c"))

        out = "Finished " + str(count_finished) + " out of " + str(
            len(file_statuses)) + " files.\n"
        if current_out:
            return out + "\nIn Progress:\n" + "\n".join(current_out)
        else:
            return out

    def is_store_finished(self):
        return database.is_store_done(self.source_id, self.data_version,
                                      self.sample)

    def get_store_progress_as_text(self):
        return 'Store is in progress'

    def is_check_finished(self):
        source_session_id = database.get_id_of_store(self.source_id,
                                                     self.data_version,
                                                     self.sample)
        return database.is_check_done(source_session_id)

    def get_check_progress_as_text(self):
        return 'Check is in progress'
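
A short usage sketch for SourceStatus, e.g. from a status command in a CLI; the base directory and source id are placeholders:

status = SourceStatus('/tmp/ocds', 'example_publisher', sample=True)

if not status.is_gather_finished():
    print(status.get_gather_progress_as_text())
elif not status.is_fetch_finished():
    print(status.get_fetch_progress_as_text())
elif not status.is_store_finished():
    print(status.get_store_progress_as_text())
elif not status.is_check_finished():
    print(status.get_check_progress_as_text())
else:
    print('All stages finished for version ' + status.data_version)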