Esempio n. 1
0
def load_records(context,
                 database,
                 table,
                 xml_dirpath,
                 recursive=False,
                 force_update=False):
    """Load metadata records from a directory of XML files into the database.

    Files are processed in sorted path order. A file that cannot be parsed
    as XML is logged and skipped. Every record parsed from a readable file
    is inserted into the repository with source ``'local'``.

    :param context: passed to ``repository.Repository`` and
        ``metadata.parse_record``
    :param database: database argument for ``repository.Repository``
        (also shown in log messages)
    :param table: table argument for ``repository.Repository``
    :param xml_dirpath: directory searched for ``*.xml`` files
    :param recursive: when true, walk ``xml_dirpath`` and every
        subdirectory; otherwise only its top level is globbed
    :param force_update: when true, a record whose insert raises
        ``RuntimeError`` is updated instead of being reported
    """
    repo = repository.Repository(database, context, table=table)

    file_list = []
    if recursive:
        for root, _dirs, files in os.walk(xml_dirpath):
            file_list.extend(os.path.join(root, mfile)
                             for mfile in files
                             if mfile.endswith('.xml'))
    else:
        file_list.extend(glob(os.path.join(xml_dirpath, '*.xml')))

    total = len(file_list)

    for counter, recfile in enumerate(sorted(file_list), 1):
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)
        # read document
        try:
            exml = etree.parse(recfile)
        except Exception as err:
            # Logger.warning() is used because the warn() alias is
            # deprecated and no longer exists on Python 3.13+.
            LOGGER.warning('XML document is not well-formed: %s', str(err))
            continue

        for rec in metadata.parse_record(context, exml, repo):
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                LOGGER.info('Inserted')
            except RuntimeError as err:
                # NOTE(review): any RuntimeError from insert() is treated as
                # "record already exists" -- confirm against Repository.
                if force_update:
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated')
                else:
                    LOGGER.warning('ERROR: not inserted %s', err)
Esempio n. 2
0
def load_records(context, database, table, xml_dirpath, recursive=False, force_update=False):
    """Load metadata records from a directory of XML files into the database.

    Candidate files are collected first, then handled in sorted path order.
    Unparseable XML is logged and skipped. Each record produced by
    ``metadata.parse_record`` is inserted with source ``'local'``.

    :param context: forwarded to ``repository.Repository`` and
        ``metadata.parse_record``
    :param database: database argument for ``repository.Repository``
        (also shown in log messages)
    :param table: table argument for ``repository.Repository``
    :param xml_dirpath: directory searched for ``*.xml`` files
    :param recursive: descend into subdirectories when true; otherwise
        only the top level of ``xml_dirpath`` is globbed
    :param force_update: when true, fall back to ``repo.update`` if the
        insert raises ``RuntimeError``
    """
    repo = repository.Repository(database, context, table=table)

    file_list = []
    if recursive:
        for root, _dirs, files in os.walk(xml_dirpath):
            file_list.extend(os.path.join(root, mfile)
                             for mfile in files
                             if mfile.endswith('.xml'))
    else:
        file_list.extend(glob(os.path.join(xml_dirpath, '*.xml')))

    total = len(file_list)

    for counter, recfile in enumerate(sorted(file_list), 1):
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)
        # read document
        try:
            exml = etree.parse(recfile)
        except Exception as err:
            # warn() is a deprecated alias that Python 3.13 removed, so the
            # supported warning() method is used for both warnings below.
            LOGGER.warning('XML document is not well-formed: %s', str(err))
            continue

        for rec in metadata.parse_record(context, exml, repo):
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                LOGGER.info('Inserted')
            except RuntimeError as err:
                # NOTE(review): assumes RuntimeError from insert() means the
                # record already exists -- confirm against Repository.
                if force_update:
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated')
                else:
                    LOGGER.warning('ERROR: not inserted %s', err)
Esempio n. 3
0
def load_records(database, table, xml_dirpath, recursive=False):
    """Load metadata records from a directory of XML files into the database.

    Progress and errors are reported on standard output. A file that is
    not well-formed XML is reported and skipped. A record whose insert
    fails is reported and the run continues with the next record.

    :param database: database argument for ``repository.Repository``
        (also shown in the progress output)
    :param table: table argument for ``repository.Repository``
    :param xml_dirpath: directory searched for ``*.xml`` files
    :param recursive: when true, walk ``xml_dirpath`` and every
        subdirectory; otherwise only its top level is globbed
    """
    repo = repository.Repository(database, CONTEXT, table=table)

    file_list = []
    if recursive:
        for root, _dirs, files in os.walk(xml_dirpath):
            file_list.extend(os.path.join(root, mfile)
                             for mfile in files
                             if mfile.endswith('.xml'))
    else:
        file_list.extend(glob(os.path.join(xml_dirpath, '*.xml')))

    total = len(file_list)

    for counter, path in enumerate(file_list, 1):
        # print(...) with one pre-formatted argument emits the same text as
        # a Python 2 print statement and is also valid Python 3.
        print('Processing file %s (%d of %d)' % (path, counter, total))
        # read document
        try:
            doc = etree.parse(path)
        except Exception as err:
            print('XML document is not well-formed: %s' % str(err))
            continue

        for rec in metadata.parse_record(CONTEXT, doc, repo):
            print('Inserting %s %s into database %s, table %s ....' %
                  (rec.typename, rec.identifier, database, table))

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                print('Inserted')
            except Exception as err:
                print('ERROR: not inserted %s' % err)
def get_record(context, repo, ckan_url, ckan_id, ckan_info):
    """Fetch a CKAN harvest object and parse it into a metadata record.

    :param context: forwarded to ``metadata.parse_record``
    :param repo: forwarded to ``metadata.parse_record``
    :param ckan_url: base URL; ``harvest/object/<id>`` is appended to it
        directly, so it needs to end with ``/``
    :param ckan_id: identifier used in log messages, stored on the record
        as ``ckan_id`` and used as ``identifier`` when the parsed record
        has none
    :param ckan_info: mapping read for ``'source'``,
        ``'harvest_object_id'`` and ``'metadata_modified'``
    :return: the first record parsed from the fetched document, or
        ``None`` when the source is ``'arcgis'`` or when parsing or
        metadata extraction fails (failures are logged)
    """
    # Decide before touching the network: objects whose source is 'arcgis'
    # are never loaded, so fetching their document would be wasted work.
    if ckan_info['source'] == 'arcgis':
        return None

    url = ckan_url + 'harvest/object/%s' % ckan_info['harvest_object_id']
    response = requests.get(url)

    try:
        xml = etree.parse(io.BytesIO(response.content))
    except Exception as err:
        log.error('Could not pass xml doc from %s, Error: %s' % (ckan_id, err))
        return None

    try:
        # The broad except below also covers an empty parse result
        # (IndexError from the [0]).
        record = metadata.parse_record(context, xml, repo)[0]
    except Exception as err:
        log.error('Could not extract metadata from %s, Error: %s' % (ckan_id, err))
        return None

    if not record.identifier:
        record.identifier = ckan_id
    record.ckan_id = ckan_id
    record.ckan_modified = ckan_info['metadata_modified']

    return record


usage = '''
Manages the CKAN-pycsw integration
Esempio n. 5
0
def get_record(context, repo, ckan_url, ckan_id, ckan_info):
    """Fetch a CKAN harvest object and parse it into a metadata record.

    :param context: forwarded to ``metadata.parse_record``
    :param repo: forwarded to ``metadata.parse_record``
    :param ckan_url: base URL; ``harvest/object/<id>`` is appended to it
        directly, so it needs to end with ``/``
    :param ckan_id: identifier used in log messages, stored on the record
        as ``ckan_id`` and used as ``identifier`` when the parsed record
        has none
    :param ckan_info: mapping read for ``'source'``,
        ``'harvest_object_id'``, ``'organization'`` and
        ``'metadata_modified'``
    :return: the first record parsed from the fetched document, or
        ``None`` when the source is ``'arcgis'`` or when parsing or
        metadata extraction fails (failures are logged)
    """
    # Check the source first so that no HTTP request is issued for objects
    # that are going to be discarded anyway.
    if ckan_info['source'] == 'arcgis':
        return None

    url = ckan_url + 'harvest/object/%s' % ckan_info['harvest_object_id']
    response = requests.get(url)

    try:
        xml = etree.parse(io.BytesIO(response.content))
    except Exception as err:
        log.error('Could not pass xml doc from %s, Error: %s' % (ckan_id, err))
        return None

    try:
        # The broad except below also covers an empty parse result
        # (IndexError from the [0]).
        record = metadata.parse_record(context, xml, repo)[0]
    except Exception as err:
        log.error('Could not extract metadata from %s, Error: %s' %
                  (ckan_id, err))
        return None

    if not record.identifier:
        record.identifier = ckan_id
    record.ckan_id = ckan_id
    record.organization = ckan_info['organization']
    record.ckan_modified = ckan_info['metadata_modified']

    return record


usage = '''
def load(pycsw_config, ckan_url):
    """
    Take ISO 19139 XML data from a CKAN package and insert it into the PyCSW database.  This function
    runs selectively, meaning that it will only return packages for resources in the CKAN datastore
    database.  It builds a URL for querying the datastore, returns a list of the datastore resource IDs,
    builds URLs for querying the resources, runs a regular expression to determine what the
    package ID of a datastored resource is, builds a URL to scrape each package's ISO XML record and then
    inserts the XML as a record in the PyCSW database.

    @param pycsw_config: pycsw.cfg file that should have been configured upon installing
    PyCSW.  Should contain auth information about the database to connect to.
    @param ckan_url: e.g http://127.0.0.1:5000
    """

    def parse_datastore(ckan_url):
        """
        Scrape and return every resource ID in the datastore database, accessing the information through
        CKAN's REST API (a ``datastore_search`` on the ``_table_metadata`` resource).

        @param ckan_url: e.g. http://127.0.0.1:5000 (a trailing slash is added when missing)
        @return: a list of datastored resource object IDs, without the names listed in ``ignore_names``
        @raise RuntimeError: if the decoded JSON response is not a dict
        """
        api_query = 'api/3/action/datastore_search?resource_id=_table_metadata'
        ignore_names = ('_table_metadata', 'geography_columns', 'geometry_columns', 'spatial_ref_sys')
        # Normalise the join so the documented slash-less base URL also
        # yields a well-formed request URL.
        url = ckan_url.rstrip('/') + '/' + api_query
        listing = requests.get(url).json()
        if not isinstance(listing, dict):
            raise RuntimeError('Wrong API response: %s' % (listing,))
        return [
            result['name']
            for result in listing['result']['records']
            if result['name'] not in ignore_names
        ]

    def parse_resource(resource_id, ckan_url):
        """
        CKAN's search API doesn't allow querying packages by their resources.  Thankfully,
        each resource is returned with a URL which contains the package id between the
        paths "dataset" and "resource", (at least for datastore items) so we can use a RegEx
        to figure out what the package of a resource is.  This is not an ideal solution, but
        it's the cleanest way to solve the problem until the CKAN team decides to organize
        their data in a less authoritative manner.

        @param resource_id: the id of a datastored resource object
        @param ckan_url: e.g. http://127.0.0.1:5000 (a trailing slash is added when missing)
        @return: the text found between "dataset/" and "/resource" in the resource's URL
        @raise RuntimeError: if the decoded JSON response is not a dict
        @raise IndexError: if the resource URL contains no "dataset/.../resource" segment
        """
        api_query = 'api/3/action/resource_show?id=%s' % resource_id
        # Normalise the join so the documented slash-less base URL also
        # yields a well-formed request URL.
        url = ckan_url.rstrip('/') + '/' + api_query
        listing = requests.get(url).json()
        if not isinstance(listing, dict):
            raise RuntimeError('Wrong API response: %s' % (listing,))
        package_url = listing['result']['url']
        # Non-greedy capture of whatever sits between the two path markers.
        matches = re.findall(r'dataset/(.*?)/resource', package_url, re.DOTALL)
        # Indexing deliberately keeps the original IndexError on no match.
        return matches[0]

    def get_record(context, repo, ckan_url, ckan_id, ckan_info):
        """
        Hit the CKAN REST API for an ISO 19139 XML representation of a package with data
        uploaded into the datastore, then parse it into a metadata record.

        @param context: forwarded to metadata.parse_record (described upstream as vanilla-CKAN auth noise)
        @param repo: PyCSW repository (database), forwarded to metadata.parse_record
        @param ckan_url: base URL; 'package_iso/object/<id>' is appended to it directly,
        so it needs to end with '/' for the request URL to be well-formed
        @param ckan_id: Package ID (only used in the log messages visible here)
        @param ckan_info: Package data; its 'id' entry selects the object to fetch
        @return: None on every path visible here -- both failure branches log and return
        early, and the code shown ends after the record is parsed.
        NOTE(review): this listing looks truncated; presumably the parsed record is
        returned further down -- confirm against the full source.
        """
        query = ckan_url + 'package_iso/object/%s'
        url = query % ckan_info['id']
        response = requests.get(url)
        # Parse the response body as XML; a malformed document is logged and skipped.
        try:
            xml = etree.parse(io.BytesIO(response.content))
        except Exception, err:
            log.error('Could not pass xml doc from %s, Error: %s' % (ckan_id, err))
            return
        # Keep the first parsed record; the broad except also covers an empty result.
        try:
            record = metadata.parse_record(context, xml, repo)[0]
        except Exception, err:
            log.error('Could not extract metadata from %s, Error: %s' % (ckan_id, err))
            return
Esempio n. 7
0
def load(pycsw_config, ckan_url):
    """
    Take ISO 19139 XML data from a CKAN package and insert it into the PyCSW database.  This function
    runs selectively, meaning that it will only return packages for resources in the CKAN datastore
    database.  It builds a URL for querying the datastore, returns a list of the datastore resource IDs,
    builds URLs for querying the resources, runs a regular expression to determine what the
    package ID of a datastored resource is, builds a URL to scrape each package's ISO XML record and then
    inserts the XML as a record in the PyCSW database.

    @param pycsw_config: pycsw.cfg file that should have been configured upon installing
    PyCSW.  Should contain auth information about the database to connect to.
    @param ckan_url: e.g http://127.0.0.1:5000
    """
    def parse_datastore(ckan_url):
        """
        Scrape and return every resource ID in the datastore database, accessing the information through
        CKAN's REST API (a ``datastore_search`` on the ``_table_metadata`` resource).

        @param ckan_url: e.g. http://127.0.0.1:5000 (a trailing slash is added when missing)
        @return: a list of datastored resource object IDs, without the names listed in ``ignore_names``
        @raise RuntimeError: if the decoded JSON response is not a dict
        """
        api_query = 'api/3/action/datastore_search?resource_id=_table_metadata'
        ignore_names = (
            '_table_metadata', 'geography_columns', 'geometry_columns',
            'spatial_ref_sys',
        )
        # Normalise the join so the documented slash-less base URL also
        # yields a well-formed request URL.
        url = ckan_url.rstrip('/') + '/' + api_query
        listing = requests.get(url).json()
        if not isinstance(listing, dict):
            raise RuntimeError('Wrong API response: %s' % (listing,))
        return [
            result['name']
            for result in listing['result']['records']
            if result['name'] not in ignore_names
        ]

    def parse_resource(resource_id, ckan_url):
        """
        CKAN's search API doesn't allow querying packages by their resources.  Thankfully,
        each resource is returned with a URL which contains the package id between the
        paths "dataset" and "resource", (at least for datastore items) so we can use a RegEx
        to figure out what the package of a resource is.  This is not an ideal solution, but
        it's the cleanest way to solve the problem until the CKAN team decides to organize
        their data in a less authoritative manner.

        @param resource_id: the id of a datastored resource object
        @param ckan_url: e.g. http://127.0.0.1:5000 (a trailing slash is added when missing)
        @return: the text found between "dataset/" and "/resource" in the resource's URL, or
        None when the API reports a "Not Found Error" / "Authorization Error", when the
        result is empty, or when the URL has no such segment
        @raise RuntimeError: if the decoded JSON response is not a dict
        """
        api_query = 'api/3/action/resource_show?id=%s' % resource_id
        # Normalise the join so the documented slash-less base URL also
        # yields a well-formed request URL.
        url = ckan_url.rstrip('/') + '/' + api_query
        listing = requests.get(url).json()
        if not isinstance(listing, dict):
            raise RuntimeError('Wrong API response: %s' % (listing,))

        # skip Authorization Error, most likely due to deleted packages.
        if 'error' in listing:
            if listing['error']['__type'] in ('Not Found Error', 'Authorization Error'):
                return None
        log.info('listing is %r' % listing)

        if not listing['result']:
            return None
        package_url = listing['result']['url']

        # Non-greedy capture of whatever sits between the two path markers.
        matches = re.findall(r'dataset/(.*?)/resource', package_url, re.DOTALL)
        return matches[0] if matches else None

    def get_record(context, repo, ckan_url, ckan_id, ckan_info):
        """
        Hit the CKAN REST API for an ISO 19139 XML representation of a package with data
        uploaded into the datastore.

        @param context: Vanilla-CKAN auth noise
        @param repo: PyCSW repository (database)
        @param ckan_url: e.g. http://127.0.0.1:5000
        @param ckan_id: Package ID
        @param ckan_info: Package data
        @return: ISO 19139 XML data
        """
        query = ckan_url + 'package_iso/object/%s'
        url = query % ckan_info['id']
        response = requests.get(url)
        try:
            xml = etree.parse(io.BytesIO(response.content))
        except Exception, err:
            log.error('Could not pass xml doc from %s, Error: %s' %
                      (ckan_id, err))
            return
        try:
            record = metadata.parse_record(context, xml, repo)[0]
        except Exception, err:
            log.error('Could not extract metadata from %s, Error: %s' %
                      (ckan_id, err))
            return