Example #1
def delete_records(context, database, table):
    """Deletes all records from repository"""

    LOGGER.info('Deleting all records')

    repo = repository.Repository(database, context, table=table)
    repo.delete(constraint={'where': '', 'values': []})
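A minimal way to drive this helper from a script might look like the sketch below; the SQLite URL and table name are placeholders, and the StaticContext is created the same way as in the later examples.

# Hypothetical usage sketch -- the SQLite URL and table name are placeholders
import pycsw.core.config

context = pycsw.core.config.StaticContext()
delete_records(context, 'sqlite:////tmp/records.db', 'records')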
Example #2
    def delete_records(self, constraint={'where': '', 'values': []}):
        if self.context is None:
            print("[WARN] pycsw config not available. NOT DELETING RECORDS")
            return
        repo = repository.Repository(self.database,
                                     self.context,
                                     table=self.table)
        repo.delete(constraint=constraint)
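The mutable default argument above is shared across calls, a common Python pitfall. A sketch of the same method with an immutable default (behaviour otherwise unchanged, attribute names assumed from the example):

    # Sketch only: same behaviour as above, but with an immutable default argument
    def delete_records(self, constraint=None):
        if constraint is None:
            constraint = {'where': '', 'values': []}
        if self.context is None:
            print("[WARN] pycsw config not available. NOT DELETING RECORDS")
            return
        repo = repository.Repository(self.database, self.context, table=self.table)
        repo.delete(constraint=constraint)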
Example #3
def rebuild_db_indexes(context, database, table):
    """Rebuild database indexes"""

    LOGGER.info('Rebuilding database %s, table %s', database, table)
    repos = repository.Repository(database, context, table=table)
    connection = repos.engine.connect()
    connection.autocommit = True
    connection.execute('REINDEX %s' % table)
    connection.close()
    LOGGER.info('Done')
Example #4
def export_records(context, database, table, xml_dirpath):
    """Export metadata records from database to directory of files"""
    repo = repository.Repository(database, context, table=table)

    LOGGER.info('Querying database %s, table %s ....', database, table)
    records = repo.session.query(repo.dataset)

    LOGGER.info('Found %d records\n', records.count())

    LOGGER.info('Exporting records\n')

    dirpath = os.path.abspath(xml_dirpath)

    exported_files = set()

    if not os.path.exists(dirpath):
        LOGGER.info('Directory %s does not exist.  Creating...', dirpath)
        try:
            os.makedirs(dirpath)
        except OSError as err:
            LOGGER.exception('Could not create directory')
            raise RuntimeError('Could not create %s %s' %
                               (dirpath, err)) from err

    for record in records.all():
        identifier = \
            getattr(record,
                    context.md_core_model['mappings']['pycsw:Identifier'])

        LOGGER.info('Processing %s', identifier)

        # sanitize identifier
        identifier = util.secure_filename(identifier)
        # write to XML document
        filename = os.path.join(dirpath, '%s.xml' % identifier)
        try:
            LOGGER.info('Writing to file %s', filename)
            if hasattr(record.xml, 'decode'):
                str_xml = record.xml.decode('utf-8')
            else:
                str_xml = record.xml
            with open(filename, 'w') as xml:
                xml.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                xml.write(str_xml)
        except Exception as err:
            # Something went wrong so skip over this file but log an error
            LOGGER.exception('Error writing %s to disk', filename)
            # If we wrote a partial file or created an empty file make sure it is removed
            if os.path.exists(filename):
                os.remove(filename)
            continue
        else:
            exported_files.add(filename)

    return tuple(exported_files)
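Assuming the same kind of static context, the exporter could be invoked as sketched below; the database URL and output directory are placeholders. It returns the tuple of files it managed to write.

# Hypothetical usage sketch -- database URL and output directory are placeholders
import pycsw.core.config

context = pycsw.core.config.StaticContext()
exported = export_records(context, 'sqlite:////tmp/records.db', 'records', '/tmp/export')
print('%d records exported' % len(exported))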
Example #5
def export_record_table_csv(context, database, table, mappings, xml_dirpath):
    """Export record table from database to csv file"""
    import csv
    from datetime import datetime

    repo = repository.Repository(database, context, table=table)

    LOGGER.info('Querying database %s, table %s ....', database, table)
    records = repo.session.query(repo.dataset)

    LOGGER.info('Found %d records\n', records.count())

    LOGGER.info('Exporting records\n')

    dirpath = os.path.abspath(xml_dirpath)

    if not os.path.exists(dirpath):
        LOGGER.info('Directory %s does not exist.  Creating...', dirpath)
        try:
            os.makedirs(dirpath)
        except OSError as err:
            raise RuntimeError('Could not create %s %s' % (dirpath, err))

    is_headers_written = False
    filename = os.path.join(dirpath, '%s.csv' % str(datetime.now()))
    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile,
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)

        for record in records.all():
            if mappings is None:
                model = context.md_core_model
            else:
                model = import_model_from_file(mappings)

            headers = []
            row = []
            map_dict = model['mappings']
            for k, v in map_dict.items():
                field_value = getattr(record, model['mappings'][k], "")

                if is_headers_written is False:
                    headers.append(model['mappings'][k])

                row.append(field_value)

            if is_headers_written is False:
                csvwriter.writerow(headers)

            csvwriter.writerow(row)
            is_headers_written = True
Example #6
def load_records(context,
                 database,
                 table,
                 xml_dirpath,
                 recursive=False,
                 force_update=False):
    """Load metadata records from directory of files to database"""
    repo = repository.Repository(database, context, table=table)

    file_list = []

    if os.path.isfile(xml_dirpath):
        file_list.append(xml_dirpath)
    elif recursive:
        for root, dirs, files in os.walk(xml_dirpath):
            for mfile in files:
                if mfile.endswith('.xml'):
                    file_list.append(os.path.join(root, mfile))
    else:
        for rec in glob(os.path.join(xml_dirpath, '*.xml')):
            file_list.append(rec)

    total = len(file_list)
    counter = 0

    for recfile in sorted(file_list):
        counter += 1
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)
        # read document
        try:
            exml = etree.parse(recfile, context.parser)
        except Exception as err:
            LOGGER.warning('XML document is not well-formed: %s', str(err))
            continue

        record = metadata.parse_record(context, exml, repo)

        for rec in record:
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                LOGGER.info('Inserted')
            except RuntimeError as err:
                if force_update:
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated')
                else:
                    LOGGER.warning('ERROR: not inserted %s', err)
Example #7
    def __init__(self, repository_database_uri, ows_url: str = '',
                 public_s3_url: str = ''):
        self.collections = []
        self.ows_url = ows_url
        self.public_s3_url = public_s3_url

        logger.debug('Setting up static context')
        self.context = pycsw.core.config.StaticContext()

        logger.debug('Initializing pycsw repository')
        self.repo = repository.Repository(repository_database_uri,
                                          self.context, table='records')
        logger.debug('Loading collection level metadata identifiers')
        for clm in os.listdir(COLLECTION_LEVEL_METADATA):
            self.collections.append(os.path.splitext(clm)[0])
Example #8
def export_records(context, database, table, mappings, xml_dirpath):
    """Export metadata records from database to directory of files"""
    repo = repository.Repository(database, context, table=table)

    LOGGER.info('Querying database %s, table %s ....', database, table)
    records = repo.session.query(repo.dataset)

    LOGGER.info('Found %d records\n', records.count())

    LOGGER.info('Exporting records\n')

    dirpath = os.path.abspath(xml_dirpath)

    if not os.path.exists(dirpath):
        LOGGER.info('Directory %s does not exist.  Creating...', dirpath)
        try:
            os.makedirs(dirpath)
        except OSError as err:
            raise RuntimeError('Could not create %s %s' % (dirpath, err))

    for record in records.all():
        if mappings is None:
            model = context.md_core_model
        else:
            model = import_model_from_file(mappings)

        identifier = getattr(record, model['mappings']['pycsw:Identifier'])
        xml_field = getattr(record, model['mappings']['pycsw:XML'])

        LOGGER.info('Processing %s', identifier)
        if identifier.find(':') != -1:  # it's a URN
            # sanitize identifier
            LOGGER.info(' Sanitizing identifier')
            identifier = identifier.split(':')[-1]

        # write to XML document
        filename = os.path.join(dirpath, '%s.xml' % identifier)
        try:
            LOGGER.info('Writing to file %s', filename)
            with open(filename, 'w') as xml:
                xml.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                xml.write(xml_field)
        except Exception as err:
            raise RuntimeError("Error writing to %s" % filename, err)
Example #9
def optimize_db(context, database, table):
    """Optimize database"""
    from sqlalchemy.exc import ArgumentError, OperationalError

    LOGGER.info('Optimizing database %s', database)
    repos = repository.Repository(database, context, table=table)
    connection = repos.engine.connect()
    try:
        # PostgreSQL
        connection.execution_options(isolation_level="AUTOCOMMIT")
        connection.execute('VACUUM ANALYZE')
    except (ArgumentError, OperationalError):
        # SQLite
        connection.autocommit = True
        connection.execute('VACUUM')
        connection.execute('ANALYZE')
    finally:
        connection.close()
        LOGGER.info('Done')
Example #10
def gen_sitemap(context, database, table, url, output_file):
    """generate an XML sitemap from all records in repository"""

    # get configuration and init repo connection
    repos = repository.Repository(database, context, table=table)

    # write out sitemap document
    urlset = etree.Element(util.nspath_eval('sitemap:urlset',
                                            context.namespaces),
                           nsmap=context.namespaces)

    schema_loc = util.nspath_eval('xsi:schemaLocation', context.namespaces)

    urlset.attrib[schema_loc] = \
        '%s http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd' % \
        context.namespaces['sitemap']

    # get all records
    count, records = repos.query(constraint={}, maxrecords=99999999)

    LOGGER.info('Found %s records', count)

    for rec in records:
        url_ = etree.SubElement(
            urlset, util.nspath_eval('sitemap:url', context.namespaces))
        uri = '%s?service=CSW&version=2.0.2&request=GetRepositoryItem&id=%s' % \
            (url,
             getattr(rec,
                     context.md_core_model['mappings']['pycsw:Identifier']))
        etree.SubElement(url_,
                         util.nspath_eval('sitemap:loc',
                                          context.namespaces)).text = uri

    # write to file
    LOGGER.info('Writing to %s', output_file)
    with open(output_file, 'wb') as ofile:
        ofile.write(
            etree.tostring(urlset,
                           pretty_print=1,
                           encoding='utf8',
                           xml_declaration=1))
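A hedged usage sketch for the sitemap generator; the CSW endpoint URL, database URL and output path are placeholders.

# Hypothetical usage sketch -- CSW endpoint, database URL and output file are placeholders
import pycsw.core.config

context = pycsw.core.config.StaticContext()
gen_sitemap(context, 'sqlite:////tmp/records.db', 'records',
            'https://example.org/csw', '/tmp/sitemap.xml')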
Example #11
def refresh_harvested_records(context, database, table, url):
    """refresh / harvest all non-local records in repository"""
    from owslib.csw import CatalogueServiceWeb

    # get configuration and init repo connection
    repos = repository.Repository(database, context, table=table)

    # get all harvested records
    count, records = repos.query(constraint={
        'where': "mdsource != 'local'",
        'values': []
    })

    if int(count) > 0:
        LOGGER.info('Refreshing %s harvested records', count)
        csw = CatalogueServiceWeb(url)

        for rec in records:
            source = \
                getattr(rec,
                        context.md_core_model['mappings']['pycsw:Source'])
            schema = \
                getattr(rec,
                        context.md_core_model['mappings']['pycsw:Schema'])
            identifier = \
                getattr(rec,
                        context.md_core_model['mappings']['pycsw:Identifier'])

            LOGGER.info('Harvesting %s (identifier = %s) ...', source,
                        identifier)
            # TODO: find a smarter way of catching this
            if schema == 'http://www.isotc211.org/2005/gmd':
                schema = 'http://www.isotc211.org/schemas/2005/gmd/'
            try:
                csw.harvest(source, schema)
                LOGGER.info(csw.response)
            except Exception as err:
                LOGGER.exception('Could not harvest')
    else:
        LOGGER.info('No harvested records')
Example #12
    def __init__(self, config: ConfigParser):
        """
        constructor

        :param config: ConfigParser pycsw configuration dict

        :returns: `pycsw.ogc.api.API` instance
        """

        self.config = config

        log.setup_logger(self.config)

        if self.config['server']['url'].startswith('${'):
            LOGGER.debug(
                f"Server URL is an environment variable: {self.config['server']['url']}"
            )
            url_ = match_env_var(self.config['server']['url'])
        else:
            url_ = self.config['server']['url']

        LOGGER.debug(f'Server URL: {url_}')
        self.config['server']['url'] = url_.rstrip('/')

        self.context = StaticContext()

        LOGGER.debug('Setting maxrecords')
        try:
            self.maxrecords = int(self.config['server']['maxrecords'])
        except KeyError:
            self.maxrecords = 10
        LOGGER.debug(f'maxrecords: {self.maxrecords}')

        repo_filter = None
        if self.config.has_option('repository', 'filter'):
            repo_filter = self.config.get('repository', 'filter')

        self.orm = 'sqlalchemy'
        from pycsw.core import repository
        try:
            LOGGER.info('Loading default repository')
            self.repository = repository.Repository(
                self.config.get('repository', 'database'),
                self.context,
                # self.environ.get('local.app_root', None),
                None,
                self.config.get('repository', 'table'),
                repo_filter)
            LOGGER.debug(f'Repository loaded {self.repository.dbtype}')
        except Exception as err:
            msg = f'Could not load repository {err}'
            LOGGER.exception(msg)
            raise

        self.query_mappings = {
            'type': self.repository.dataset.type,
            'recordUpdated': self.repository.dataset.insert_date,
            'title': self.repository.dataset.title,
            'description': self.repository.dataset.abstract,
            'keywords': self.repository.dataset.keywords,
            'anytext': self.repository.dataset.anytext,
            'bbox': self.repository.dataset.wkt_geometry
        }
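A minimal configuration sketch for this constructor, using only the options the code above reads; the values are placeholders and a real deployment will likely need more options (for example logging settings consumed by log.setup_logger). The class name API follows the docstring.

# Hypothetical configuration sketch for the constructor above; values are placeholders
from configparser import ConfigParser

config = ConfigParser()
config['server'] = {'url': 'https://example.org/csw', 'maxrecords': '10'}
config['repository'] = {'database': 'sqlite:////tmp/records.db', 'table': 'records'}
api = API(config)  # class name per the docstring: pycsw.ogc.api.API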
Example #13
def load(pycsw_config, ckan_url):

    database = pycsw_config.get('repository', 'database')
    table_name = pycsw_config.get('repository', 'table', 'records')

    context = pycsw.core.config.StaticContext()
    repo = repository.Repository(database, context, table=table_name)

    log.info('Started gathering CKAN datasets identifiers: {0}'.format(
        str(datetime.datetime.now())))

    query = 'api/search/dataset?qjson={"fl":"id,metadata_modified,extras_harvest_object_id,extras_metadata_source", "q":"harvest_object_id:[\\"\\" TO *]", "limit":1000, "start":%s}'

    start = 0

    gathered_records = {}

    while True:
        url = ckan_url + query % start

        response = requests.get(url)
        listing = response.json()
        if not isinstance(listing, dict):
            raise RuntimeError('Wrong API response: %s' % listing)
        results = listing.get('results')
        if not results:
            break
        for result in results:
            gathered_records[result['id']] = {
                'metadata_modified': result['metadata_modified'],
                'harvest_object_id': result['extras']['harvest_object_id'],
                'source': result['extras'].get('metadata_source')
            }

        start = start + 1000
        log.debug('Gathered %s' % start)

    log.info('Gather finished ({0} datasets): {1}'.format(
        len(gathered_records.keys()), str(datetime.datetime.now())))

    existing_records = {}

    query = repo.session.query(repo.dataset.ckan_id,
                               repo.dataset.ckan_modified)
    for row in query:
        existing_records[row[0]] = row[1]
    repo.session.close()

    new = set(gathered_records) - set(existing_records)
    deleted = set(existing_records) - set(gathered_records)
    changed = set()

    for key in set(gathered_records) & set(existing_records):
        if gathered_records[key]['metadata_modified'] > existing_records[key]:
            changed.add(key)

    for ckan_id in deleted:
        try:
            repo.session.begin()
            repo.session.query(
                repo.dataset.ckan_id).filter_by(ckan_id=ckan_id).delete()
            log.info('Deleted %s' % ckan_id)
            repo.session.commit()
        except Exception:
            repo.session.rollback()
            raise
Example #14
    def dispatch(self, writer=sys.stdout, write_headers=True):
        ''' Handle incoming HTTP request '''

        if self.requesttype == 'GET':
            self.kvp = self.normalize_kvp(self.kvp)
            if (('version' in self.kvp and self.kvp['version'] == '2.0.2')
                    or ('acceptversions' in self.kvp
                        and '2.0.2' in self.kvp['acceptversions'])):
                self.request_version = '2.0.2'
        elif self.requesttype == 'POST':
            if self.request.find('2.0.2') != -1:
                self.request_version = '2.0.2'

        if (not isinstance(self.kvp, str) and 'mode' in self.kvp
                and self.kvp['mode'] == 'sru'):
            self.mode = 'sru'
            self.request_version = '2.0.2'
            LOGGER.debug('SRU mode detected; processing request.')
            self.kvp = self.sru().request_sru2csw(self.kvp)

        if (not isinstance(self.kvp, str) and 'mode' in self.kvp
                and self.kvp['mode'] == 'oaipmh'):
            self.mode = 'oaipmh'
            self.request_version = '2.0.2'
            LOGGER.debug('OAI-PMH mode detected; processing request.')
            self.oaiargs = dict((k, v) for k, v in self.kvp.items() if k)
            self.kvp = self.oaipmh().request(self.kvp)

        if self.request_version == '2.0.2':
            self.iface = csw2.Csw2(server_csw=self)
            self.context.set_model('csw')

        # configure transaction support, if specified in config
        self._gen_manager()

        # generate domain model
        # NOTE: We should probably avoid this sort of mutable state for WSGI
        if 'GetDomain' not in self.context.model['operations']:
            self.context.model['operations']['GetDomain'] = \
            self.context.gen_domains()

        # generate distributed search model, if specified in config
        if self.config.has_option('server', 'federatedcatalogues'):
            LOGGER.debug('Configuring distributed search.')

            self.context.model['constraints']['FederatedCatalogues'] = \
            {'values': []}

            for fedcat in \
            self.config.get('server', 'federatedcatalogues').split(','):
                self.context.model\
                ['constraints']['FederatedCatalogues']['values'].append(fedcat)

        for key, value in self.outputschemas.items():
            self.context.model['operations']['GetRecords']['parameters'][
                'outputSchema']['values'].append(value.NAMESPACE)
            self.context.model['operations']['GetRecordById']['parameters'][
                'outputSchema']['values'].append(value.NAMESPACE)
            if 'Harvest' in self.context.model['operations']:
                self.context.model['operations']['Harvest']['parameters'][
                    'ResourceType']['values'].append(value.NAMESPACE)

        LOGGER.debug('Setting MaxRecordDefault')
        if self.config.has_option('server', 'maxrecords'):
            self.context.model['constraints']['MaxRecordDefault']['values'] = \
            [self.config.get('server', 'maxrecords')]

        # load profiles
        if self.config.has_option('server', 'profiles'):
            self.profiles = pprofile.load_profiles(
                os.path.join('pycsw', 'plugins', 'profiles'), pprofile.Profile,
                self.config.get('server', 'profiles'))

            for prof in self.profiles['plugins'].keys():
                tmp = self.profiles['plugins'][prof](self.context.model,
                                                     self.context.namespaces,
                                                     self.context)

                key = tmp.outputschema  # to ref by outputschema
                self.profiles['loaded'][key] = tmp
                self.profiles['loaded'][key].extend_core(
                    self.context.model, self.context.namespaces, self.config)

            LOGGER.debug('Profiles loaded: %s.' %
                         self.profiles['loaded'].keys())

        # init repository
        # look for tablename, set 'records' as default
        if not self.config.has_option('repository', 'table'):
            self.config.set('repository', 'table', 'records')

        repo_filter = None
        if self.config.has_option('repository', 'filter'):
            repo_filter = self.config.get('repository', 'filter')

        if (self.config.has_option('repository', 'source')
                and self.config.get('repository', 'source') == 'geonode'):

            # load geonode repository
            from pycsw.plugins.repository.geonode import geonode_

            try:
                self.repository = \
                geonode_.GeoNodeRepository(self.context)
                LOGGER.debug('GeoNode repository loaded (geonode): %s.' % \
                self.repository.dbtype)
            except Exception as err:
                self.response = self.iface.exceptionreport(
                    'NoApplicableCode', 'service',
                    'Could not load repository (geonode): %s' % str(err))

        elif (self.config.has_option('repository', 'source')
              and self.config.get('repository', 'source') == 'odc'):

            # load odc repository
            from pycsw.plugins.repository.odc import odc

            try:
                self.repository = \
                odc.OpenDataCatalogRepository(self.context)
                LOGGER.debug('OpenDataCatalog repository loaded (odc): %s.' % \
                self.repository.dbtype)
            except Exception as err:
                self.response = self.iface.exceptionreport(
                    'NoApplicableCode', 'service',
                    'Could not load repository (odc): %s' % str(err))

        else:  # load default repository
            self.orm = 'sqlalchemy'
            from pycsw.core import repository
            try:
                self.repository = \
                repository.Repository(self.config.get('repository', 'database'),
                self.context, self.environ.get('local.app_root', None),
                self.config.get('repository', 'table'), repo_filter)
                LOGGER.debug('Repository loaded (local): %s.' \
                % self.repository.dbtype)
            except Exception as err:
                self.response = self.iface.exceptionreport(
                    'NoApplicableCode', 'service',
                    'Could not load repository (local): %s' % str(err))

        if self.requesttype == 'POST':
            LOGGER.debug(self.iface.version)
            self.kvp = self.iface.parse_postdata(self.request)

        error = 0

        if isinstance(self.kvp, str):  # it's an exception
            error = 1
            locator = 'service'
            text = self.kvp
            if (self.kvp.find('the document is not valid') != -1
                    or self.kvp.find('document not well-formed') != -1):
                code = 'NoApplicableCode'
            else:
                code = 'InvalidParameterValue'

        LOGGER.debug('HTTP Headers:\n%s.' % self.environ)
        LOGGER.debug('Parsed request parameters: %s' % self.kvp)

        if (not isinstance(self.kvp, str) and 'mode' in self.kvp
                and self.kvp['mode'] == 'opensearch'):
            self.mode = 'opensearch'
            LOGGER.debug('OpenSearch mode detected; processing request.')
            self.kvp['outputschema'] = 'http://www.w3.org/2005/Atom'

        if ((self.kvp == {'': ''} and self.request_version == '3.0.0')
                or (len(self.kvp) == 1 and 'config' in self.kvp)):
            LOGGER.debug('Turning on default csw30:Capabilities for base URL')
            self.kvp = {
                'service': 'CSW',
                'acceptversions': '3.0.0',
                'request': 'GetCapabilities'
            }
            if 'HTTP_ACCEPT' in self.environ and 'application/opensearchdescription+xml' in self.environ[
                    'HTTP_ACCEPT']:
                self.mode = 'opensearch'
                self.kvp['outputschema'] = 'http://www.w3.org/2005/Atom'

        if error == 0:
            # test for the basic keyword values (service, version, request)
            basic_options = ['service', 'request']
            if self.request_version == '2.0.2':
                basic_options.append('version')

            for k in basic_options:
                if k not in self.kvp:
                    if (k in ['version', 'acceptversions']
                            and 'request' in self.kvp
                            and self.kvp['request'] == 'GetCapabilities'):
                        pass
                    else:
                        error = 1
                        locator = k
                        code = 'MissingParameterValue'
                        text = 'Missing keyword: %s' % k
                        break

            # test each of the basic keyword values
            if error == 0:
                # test service
                if self.kvp['service'] != 'CSW':
                    error = 1
                    locator = 'service'
                    code = 'InvalidParameterValue'
                    text = 'Invalid value for service: %s.\
                    Value MUST be CSW' % self.kvp['service']

                # test version
                if ('version' in self.kvp
                        and util.get_version_integer(self.kvp['version']) !=
                        util.get_version_integer(self.request_version)
                        and self.kvp['request'] != 'GetCapabilities'):
                    error = 1
                    locator = 'version'
                    code = 'InvalidParameterValue'
                    text = 'Invalid value for version: %s.\
                    Value MUST be 2.0.2 or 3.0.0' % self.kvp['version']

                # check for GetCapabilities acceptversions
                if 'acceptversions' in self.kvp:
                    for vers in self.kvp['acceptversions'].split(','):
                        if (util.get_version_integer(vers) ==
                                util.get_version_integer(
                                    self.request_version)):
                            break
                        else:
                            error = 1
                            locator = 'acceptversions'
                            code = 'VersionNegotiationFailed'
                            text = ('Invalid parameter value in '
                                    'acceptversions: %s. Value MUST be '
                                    '2.0.2 or 3.0.0' %
                                    self.kvp['acceptversions'])

                # test request
                if self.kvp['request'] not in \
                    self.context.model['operations'].keys():
                    error = 1
                    locator = 'request'
                    if self.kvp['request'] in ['Transaction', 'Harvest']:
                        code = 'OperationNotSupported'
                        text = '%s operations are not supported' % \
                        self.kvp['request']
                    else:
                        code = 'InvalidParameterValue'
                        text = 'Invalid value for request: %s' % \
                        self.kvp['request']

        if error == 1:  # return an ExceptionReport
            self.response = self.iface.exceptionreport(code, locator, text)

        else:  # process per the request value

            if 'responsehandler' in self.kvp:
                # set flag to process asynchronously
                import threading
                self.async = True
                if ('requestid' not in self.kvp
                        or self.kvp['requestid'] is None):
                    import uuid
                    self.kvp['requestid'] = str(uuid.uuid4())

            if self.kvp['request'] == 'GetCapabilities':
                self.response = self.iface.getcapabilities()
            elif self.kvp['request'] == 'DescribeRecord':
                self.response = self.iface.describerecord()
            elif self.kvp['request'] == 'GetDomain':
                self.response = self.iface.getdomain()
            elif self.kvp['request'] == 'GetRecords':
                if self.async:  # process asynchronously
                    threading.Thread(target=self.iface.getrecords).start()
                    self.response = self.iface._write_acknowledgement()
                else:
                    self.response = self.iface.getrecords()
            elif self.kvp['request'] == 'GetRecordById':
                self.response = self.iface.getrecordbyid()
            elif self.kvp['request'] == 'GetRepositoryItem':
                self.response = self.iface.getrepositoryitem()
            elif self.kvp['request'] == 'Transaction':
                self.response = self.iface.transaction()
            elif self.kvp['request'] == 'Harvest':
                if self.async:  # process asynchronously
                    threading.Thread(target=self.iface.harvest).start()
                    self.response = self.iface._write_acknowledgement()
                else:
                    self.response = self.iface.harvest()
            else:
                self.response = self.iface.exceptionreport(
                    'InvalidParameterValue', 'request',
                    'Invalid request parameter: %s' % self.kvp['request'])

        if self.mode == 'sru':
            LOGGER.debug('SRU mode detected; processing response.')
            self.response = self.sru().response_csw2sru(
                self.response, self.environ)
        elif self.mode == 'opensearch':
            LOGGER.debug('OpenSearch mode detected; processing response.')
            self.response = self.opensearch().response_csw2opensearch(
                self.response, self.config)

        elif self.mode == 'oaipmh':
            LOGGER.debug('OAI-PMH mode detected; processing response.')
            self.response = self.oaipmh().response(
                self.response, self.oaiargs, self.repository,
                self.config.get('server', 'url'))

        return self._write_response()
Example #15
def load(pycsw_config, ckan_url):

    database = pycsw_config.get("repository", "database")
    table_name = pycsw_config.get("repository", "table", "records")

    context = pycsw.core.config.StaticContext()
    repo = repository.Repository(database, context, table=table_name)

    log.info("Started gathering CKAN datasets identifiers: {0}".format(
        str(datetime.datetime.now())))

    query = 'api/search/dataset?qjson={"fl":"id,metadata_modified,extras_harvest_object_id,' \
            'extras_metadata_source", "q":"harvest_object_id:[\\"\\" TO *]", "limit":1000, "start":%s}'

    start = 0

    gathered_records = {}

    while True:
        url = ckan_url + query % start

        response = requests.get(url)
        listing = response.json()
        if not isinstance(listing, dict):
            raise RuntimeError("Wrong API response: %s" % listing)
        results = listing.get("results")
        if not results:
            break
        for result in results:
            gathered_records[result["id"]] = {
                "metadata_modified": result["metadata_modified"],
                "harvest_object_id": result["extras"]["harvest_object_id"],
                "source": result["extras"].get("metadata_source"),
            }

        start = start + 1000
        log.debug("Gathered %s" % start)

    log.info("Gather finished ({0} datasets): {1}".format(
        len(gathered_records.keys()), str(datetime.datetime.now())))

    existing_records = {}

    query = repo.session.query(repo.dataset.ckan_id,
                               repo.dataset.ckan_modified)
    for row in query:
        existing_records[row[0]] = row[1]
    repo.session.close()

    new = set(gathered_records) - set(existing_records)
    deleted = set(existing_records) - set(gathered_records)
    changed = set()

    for key in set(gathered_records) & set(existing_records):
        if gathered_records[key]["metadata_modified"] > existing_records[key]:
            changed.add(key)

    for ckan_id in deleted:
        try:
            repo.session.begin()
            repo.session.query(
                repo.dataset.ckan_id).filter_by(ckan_id=ckan_id).delete()
            log.info("Deleted %s" % ckan_id)
            repo.session.commit()
        except Exception:
            repo.session.rollback()
            raise

    for ckan_id in new:
        ckan_info = gathered_records[ckan_id]
        record = get_record(context, repo, ckan_url, ckan_id, ckan_info)
        if not record:
            log.info("Skipped record %s" % ckan_id)
            continue
        try:
            repo.insert(record, "local", util.get_today_and_now())
            log.info("Inserted %s" % ckan_id)
        except Exception as err:
            log.error("ERROR: not inserted %s Error:%s" % (ckan_id, err))

    for ckan_id in changed:
        ckan_info = gathered_records[ckan_id]
        record = get_record(context, repo, ckan_url, ckan_id, ckan_info)
        if not record:
            continue
        update_dict = dict([(getattr(repo.dataset, key), getattr(record, key))
                            for key in record.__dict__.keys()
                            if key != "_sa_instance_state"])
        try:
            repo.session.begin()
            repo.session.query(
                repo.dataset).filter_by(ckan_id=ckan_id).update(update_dict)
            repo.session.commit()
            log.info("Changed %s" % ckan_id)
        except Exception as err:
            repo.session.rollback()
            raise RuntimeError("ERROR: %s" % str(err))
Example #16
def load_records(context,
                 database,
                 table,
                 xml_dirpath,
                 recursive=False,
                 force_update=False):
    """Load metadata records from directory of files to database"""
    from sqlalchemy.exc import DBAPIError

    repo = repository.Repository(database, context, table=table)

    file_list = []

    loaded_files = set()
    if os.path.isfile(xml_dirpath):
        file_list.append(xml_dirpath)
    elif recursive:
        for root, dirs, files in os.walk(xml_dirpath):
            for mfile in files:
                if mfile.endswith('.xml'):
                    file_list.append(os.path.join(root, mfile))
    else:
        for rec in glob(os.path.join(xml_dirpath, '*.xml')):
            file_list.append(rec)

    total = len(file_list)
    counter = 0

    for recfile in sorted(file_list):
        counter += 1
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)
        # read document
        try:
            exml = etree.parse(recfile, context.parser)
        except etree.XMLSyntaxError as err:
            LOGGER.error('XML document "%s" is not well-formed',
                         recfile,
                         exc_info=True)
            continue
        except Exception as err:
            LOGGER.exception('XML document "%s" is not well-formed', recfile)
            continue

        try:
            record = metadata.parse_record(context, exml, repo)
        except Exception as err:
            LOGGER.exception('Could not parse "%s" as an XML record', recfile)
            continue

        for rec in record:
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                loaded_files.add(recfile)
                LOGGER.info('Inserted %s', recfile)
            except Exception as err:
                if force_update:
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated %s', recfile)
                    loaded_files.add(recfile)
                else:
                    if isinstance(err, DBAPIError) and err.args:
                        # Pull a decent database error message and not the full SQL that was run
                        # since INSERT SQL statements are rather large.
                        LOGGER.error('ERROR: %s not inserted: %s',
                                     recfile,
                                     err.args[0],
                                     exc_info=True)
                    else:
                        LOGGER.error('ERROR: %s not inserted: %s',
                                     recfile,
                                     err,
                                     exc_info=True)

    return tuple(loaded_files)
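As with the exporter, the loader could be driven as sketched below; the database URL and XML directory are placeholders. It returns the tuple of files that were inserted or updated.

# Hypothetical usage sketch -- database URL and XML directory are placeholders
import pycsw.core.config

context = pycsw.core.config.StaticContext()
loaded = load_records(context, 'sqlite:////tmp/records.db', 'records',
                      '/tmp/metadata', recursive=True, force_update=True)
print('%d files loaded' % len(loaded))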
Example #17
def optimize_db(context, database, table):
    """Optimize database"""

    LOGGER.info('Optimizing database %s', database)
    repos = repository.Repository(database, context, table=table)
    repos.engine.connect().execute('VACUUM ANALYZE').close()
Example #18
    def dispatch(self, writer=sys.stdout, write_headers=True):
        """ Handle incoming HTTP request """

        if self.requesttype == 'GET':
            self.kvp = self.normalize_kvp(self.kvp)
            version_202 = ('version' in self.kvp and
                           self.kvp['version'] == '2.0.2')
            accept_version_202 = ('acceptversions' in self.kvp and
                                  '2.0.2' in self.kvp['acceptversions'])
            if version_202 or accept_version_202:
                self.request_version = '2.0.2'
        elif self.requesttype == 'POST':
            if self.request.find(b'2.0.2') != -1:
                self.request_version = '2.0.2'

        if (not isinstance(self.kvp, str) and 'mode' in self.kvp and
                self.kvp['mode'] == 'sru'):
            self.mode = 'sru'
            self.request_version = '2.0.2'
            LOGGER.info('SRU mode detected; processing request')
            self.kvp = self.sru().request_sru2csw(self.kvp)

        if (not isinstance(self.kvp, str) and 'mode' in self.kvp and
                self.kvp['mode'] == 'oaipmh'):
            self.mode = 'oaipmh'
            self.request_version = '2.0.2'
            LOGGER.info('OAI-PMH mode detected; processing request.')
            self.oaiargs = dict((k, v) for k, v in self.kvp.items() if k)
            self.kvp = self.oaipmh().request(self.kvp)

        if self.request_version == '2.0.2':
            self.iface = csw2.Csw2(server_csw=self)
            self.context.set_model('csw')

        # configure transaction support, if specified in config
        self._gen_manager()

        namespaces = self.context.namespaces
        ops = self.context.model['operations']
        constraints = self.context.model['constraints']
        # generate domain model
        # NOTE: We should probably avoid this sort of mutable state for WSGI
        if 'GetDomain' not in ops:
            ops['GetDomain'] = self.context.gen_domains()

        # generate distributed search model, if specified in config
        if self.config.has_option('server', 'federatedcatalogues'):
            LOGGER.info('Configuring distributed search')

            constraints['FederatedCatalogues'] = {'values': []}

            for fedcat in self.config.get('server',
                                          'federatedcatalogues').split(','):
                LOGGER.debug('federated catalogue: %s', fedcat)
                constraints['FederatedCatalogues']['values'].append(fedcat)

        for key, value in self.outputschemas.items():
            get_records_params = ops['GetRecords']['parameters']
            get_records_params['outputSchema']['values'].append(
                value.NAMESPACE)
            get_records_by_id_params = ops['GetRecordById']['parameters']
            get_records_by_id_params['outputSchema']['values'].append(
                value.NAMESPACE)
            if 'Harvest' in ops:
                harvest_params = ops['Harvest']['parameters']
                harvest_params['ResourceType']['values'].append(
                    value.NAMESPACE)

        LOGGER.info('Setting MaxRecordDefault')
        if self.config.has_option('server', 'maxrecords'):
            constraints['MaxRecordDefault']['values'] = [
                self.config.get('server', 'maxrecords')]

        # load profiles
        if self.config.has_option('server', 'profiles'):
            self.profiles = pprofile.load_profiles(
                os.path.join('pycsw', 'plugins', 'profiles'),
                pprofile.Profile,
                self.config.get('server', 'profiles')
            )

            for prof in self.profiles['plugins'].keys():
                tmp = self.profiles['plugins'][prof](self.context.model,
                                                     namespaces,
                                                     self.context)

                key = tmp.outputschema  # to ref by outputschema
                self.profiles['loaded'][key] = tmp
                self.profiles['loaded'][key].extend_core(self.context.model,
                                                         namespaces,
                                                         self.config)

            LOGGER.debug('Profiles loaded: %s' % list(self.profiles['loaded'].keys()))

        # init repository
        # look for tablename, set 'records' as default
        if not self.config.has_option('repository', 'table'):
            self.config.set('repository', 'table', 'records')

        repo_filter = None
        if self.config.has_option('repository', 'filter'):
            repo_filter = self.config.get('repository', 'filter')

        if self.config.has_option('repository', 'source'):  # load custom repository
            rs = self.config.get('repository', 'source')
            rs_modname, rs_clsname = rs.rsplit('.', 1)

            rs_mod = __import__(rs_modname, globals(), locals(), [rs_clsname])
            rs_cls = getattr(rs_mod, rs_clsname)

            try:
                self.repository = rs_cls(self.context, repo_filter)
                LOGGER.debug('Custom repository %s loaded (%s)', rs, self.repository.dbtype)
            except Exception as err:
                msg = 'Could not load custom repository'
                LOGGER.exception(msg)
                self.response = self.iface.exceptionreport(
                    'NoApplicableCode', 'service', msg)

        else:  # load default repository
            self.orm = 'sqlalchemy'
            from pycsw.core import repository
            try:
                LOGGER.info('Loading default repository')
                self.repository = repository.Repository(
                    self.config.get('repository', 'database'),
                    self.context,
                    self.environ.get('local.app_root', None),
                    self.config.get('repository', 'table'),
                    repo_filter
                )
                LOGGER.debug(
                    'Repository loaded (local): %s.' % self.repository.dbtype)
            except Exception as err:
                msg = 'Could not load repository (local)'
                LOGGER.exception(msg)
                self.response = self.iface.exceptionreport(
                    'NoApplicableCode', 'service', msg)

        if self.requesttype == 'POST':
            LOGGER.debug('HTTP POST request')
            LOGGER.debug('CSW version: %s', self.iface.version)
            self.kvp = self.iface.parse_postdata(self.request)

        error = 0

        if isinstance(self.kvp, str):  # it's an exception
            error = 1
            locator = 'service'
            text = self.kvp
            if (self.kvp.find('the document is not valid') != -1 or
                    self.kvp.find('document not well-formed') != -1):
                code = 'NoApplicableCode'
            else:
                code = 'InvalidParameterValue'

        LOGGER.debug('HTTP Headers:\n%s.', self.environ)
        LOGGER.debug('Parsed request parameters: %s', self.kvp)

        if (not isinstance(self.kvp, str) and 'mode' in self.kvp and
                self.kvp['mode'] == 'opensearch'):
            self.mode = 'opensearch'
            LOGGER.info('OpenSearch mode detected; processing request.')
            self.kvp['outputschema'] = 'http://www.w3.org/2005/Atom'

        if ((len(self.kvp) == 0 and self.request_version == '3.0.0') or
                (len(self.kvp) == 1 and 'config' in self.kvp)):
            LOGGER.info('Turning on default csw30:Capabilities for base URL')
            self.kvp = {
                'service': 'CSW',
                'acceptversions': '3.0.0',
                'request': 'GetCapabilities'
            }
            http_accept = self.environ.get('HTTP_ACCEPT', '')
            if 'application/opensearchdescription+xml' in http_accept:
                self.mode = 'opensearch'
                self.kvp['outputschema'] = 'http://www.w3.org/2005/Atom'

        if error == 0:
            # test for the basic keyword values (service, version, request)
            basic_options = ['service', 'request']
            request = self.kvp.get('request', '')
            own_version_integer = util.get_version_integer(
                self.request_version)
            if self.request_version == '2.0.2':
                basic_options.append('version')

            for k in basic_options:
                if k not in self.kvp:
                    if (k in ['version', 'acceptversions'] and
                            request == 'GetCapabilities'):
                        pass
                    else:
                        error = 1
                        locator = k
                        code = 'MissingParameterValue'
                        text = 'Missing keyword: %s' % k
                        break

            # test each of the basic keyword values
            if error == 0:
                # test service
                if self.kvp['service'] != 'CSW':
                    error = 1
                    locator = 'service'
                    code = 'InvalidParameterValue'
                    text = 'Invalid value for service: %s.\
                    Value MUST be CSW' % self.kvp['service']

                # test version
                kvp_version = self.kvp.get('version', '')
                try:
                    kvp_version_integer = util.get_version_integer(kvp_version)
                except Exception as err:
                    kvp_version_integer = 'invalid_value'
                if (request != 'GetCapabilities' and
                        kvp_version_integer != own_version_integer):
                    error = 1
                    locator = 'version'
                    code = 'InvalidParameterValue'
                    text = ('Invalid value for version: %s. Value MUST be '
                            '2.0.2 or 3.0.0' % kvp_version)

                # check for GetCapabilities acceptversions
                if 'acceptversions' in self.kvp:
                    for vers in self.kvp['acceptversions'].split(','):
                        vers_integer = util.get_version_integer(vers)
                        if vers_integer == own_version_integer:
                            break
                        else:
                            error = 1
                            locator = 'acceptversions'
                            code = 'VersionNegotiationFailed'
                            text = ('Invalid parameter value in '
                                    'acceptversions: %s. Value MUST be '
                                    '2.0.2 or 3.0.0' %
                                    self.kvp['acceptversions'])

                # test request
                if self.kvp['request'] not in \
                    self.context.model['operations']:
                    error = 1
                    locator = 'request'
                    if request in ['Transaction', 'Harvest']:
                        code = 'OperationNotSupported'
                        text = '%s operations are not supported' % request
                    else:
                        code = 'InvalidParameterValue'
                        text = 'Invalid value for request: %s' % request

        if error == 1:  # return an ExceptionReport
            LOGGER.error('basic service options error: %s, %s, %s', code, locator, text)
            self.response = self.iface.exceptionreport(code, locator, text)

        else:  # process per the request value

            if 'responsehandler' in self.kvp:
                # set flag to process asynchronously
                import threading
                self.async = True
                request_id = self.kvp.get('requestid', None)
                if request_id is None:
                    import uuid
                    self.kvp['requestid'] = str(uuid.uuid4())

            if self.kvp['request'] == 'GetCapabilities':
                self.response = self.iface.getcapabilities()
            elif self.kvp['request'] == 'DescribeRecord':
                self.response = self.iface.describerecord()
            elif self.kvp['request'] == 'GetDomain':
                self.response = self.iface.getdomain()
            elif self.kvp['request'] == 'GetRecords':
                if self.async:  # process asynchronously
                    threading.Thread(target=self.iface.getrecords).start()
                    self.response = self.iface._write_acknowledgement()
                else:
                    self.response = self.iface.getrecords()
            elif self.kvp['request'] == 'GetRecordById':
                self.response = self.iface.getrecordbyid()
            elif self.kvp['request'] == 'GetRepositoryItem':
                self.response = self.iface.getrepositoryitem()
            elif self.kvp['request'] == 'Transaction':
                self.response = self.iface.transaction()
            elif self.kvp['request'] == 'Harvest':
                if self.async:  # process asynchronously
                    threading.Thread(target=self.iface.harvest).start()
                    self.response = self.iface._write_acknowledgement()
                else:
                    self.response = self.iface.harvest()
            else:
                self.response = self.iface.exceptionreport(
                    'InvalidParameterValue', 'request',
                    'Invalid request parameter: %s' % self.kvp['request']
                )

        LOGGER.info('Request processed')
        if self.mode == 'sru':
            LOGGER.info('SRU mode detected; processing response.')
            self.response = self.sru().response_csw2sru(self.response,
                                                        self.environ)
        elif self.mode == 'opensearch':
            LOGGER.info('OpenSearch mode detected; processing response.')
            self.response = self.opensearch().response_csw2opensearch(
                self.response, self.config)

        elif self.mode == 'oaipmh':
            LOGGER.info('OAI-PMH mode detected; processing response.')
            self.response = self.oaipmh().response(
                self.response, self.oaiargs, self.repository,
                self.config.get('server', 'url')
            )

        return self._write_response()
Example #19
    def __init__(self, config: ConfigParser):
        """
        constructor

        :param config: ConfigParser pycsw configuration dict

        :returns: `pycsw.ogc.api.API` instance
        """

        self.config = config

        log.setup_logger(self.config)

        if self.config['server']['url'].startswith('${'):
            LOGGER.debug(f"Server URL is an environment variable: {self.config['server']['url']}")
            url_ = match_env_var(self.config['server']['url'])
        else:
            url_ = self.config['server']['url']

        LOGGER.debug(f'Server URL: {url_}')
        self.config['server']['url'] = url_.rstrip('/')

        self.context = StaticContext()

        LOGGER.debug('Setting maxrecords')
        try:
            self.maxrecords = int(self.config['server']['maxrecords'])
        except KeyError:
            self.maxrecords = 10
        LOGGER.debug(f'maxrecords: {self.maxrecords}')

        repo_filter = None
        if self.config.has_option('repository', 'filter'):
            repo_filter = self.config.get('repository', 'filter')

        custom_mappings_path = self.config.get('repository', 'mappings', fallback=None)
        if custom_mappings_path is not None:
            md_core_model = load_custom_repo_mappings(custom_mappings_path)
            if md_core_model is not None:
                self.context.md_core_model = md_core_model
            else:
                LOGGER.exception(
                    'Could not load custom mappings: %s', custom_mappings_path)

        self.orm = 'sqlalchemy'
        from pycsw.core import repository
        try:
            LOGGER.info('Loading default repository')
            self.repository = repository.Repository(
                self.config.get('repository', 'database'),
                self.context,
                # self.environ.get('local.app_root', None),
                None,
                self.config.get('repository', 'table'),
                repo_filter
            )
            LOGGER.debug(f'Repository loaded {self.repository.dbtype}')
        except Exception as err:
            msg = f'Could not load repository {err}'
            LOGGER.exception(msg)
            raise

        self.query_mappings = {
            'type': self.repository.dataset.type,
            'parentidentifier': self.repository.dataset.parentidentifier,
            'collections': self.repository.dataset.parentidentifier,
            'recordUpdated': self.repository.dataset.insert_date,
            'title': self.repository.dataset.title,
            'description': self.repository.dataset.abstract,
            'keywords': self.repository.dataset.keywords,
            'anytext': self.repository.dataset.anytext,
            'bbox': self.repository.dataset.wkt_geometry,
            'date': self.repository.dataset.date,
            'time_begin': self.repository.dataset.time_begin,
            'time_end': self.repository.dataset.time_end,
            'platform': self.repository.dataset.platform,
            'instrument': self.repository.dataset.instrument,
            'sensortype': self.repository.dataset.sensortype
        }
        if self.repository.dbtype == 'postgresql+postgis+native':
            self.query_mappings['bbox'] = self.repository.dataset.wkb_geometry