Exemple #1
0
def main():
    """Convert Geogratis records for upload to a remote CKAN portal.

    Reads the target CKAN URL from geogratis.ini and validates the optional
    --since date argument (YYYY-MM-DD) before scanning.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')

    args = argparser.parse_args()
    factory = MetadataDatasetModelGeogratisFactory()

    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')

    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(
        remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')

    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's

    session = connect_to_database()
    last_id = 0
    scan_date = None
    setting = get_setting('last_conversion_run')
    if args.since != '':
        try:
            scan_date = datetime.fromtimestamp(
                time.mktime(time.strptime(args.since, '%Y-%m-%d')))
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        # 'except Exception, e' is Python-2-only syntax; 'as' works on 2.6+ too.
        # str(e) replaces the deprecated e.message attribute.
        except Exception as e:
            logging.error(str(e))
            session.close()
            exit()
Exemple #2
0
def main():

    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_url = ini_config.get('ckan', 'ckan.remote_portal')
    api_key = ini_config.get('ckan', 'ckan.api_key')
    user_agent = ini_config.get('ckan', 'ckan.user_agent')

    ckansite = ckanapi.RemoteCKAN(remote_url, apikey=api_key,
                                  user_agent=user_agent)

    session = connect_to_database()
    last_id = 0
    while True:
        package_stream = session.query(Packages).filter(Packages.id > last_id)
        package_stream = package_stream.filter(Packages.status.in_(["new", "update"])).\
                                               order_by(Packages.id).all()

        if len(package_stream) == 0:
            break
        else:
            for r in package_stream:
                sleep(60)
                print u'Processing dataset {0}'.format(r.id)
                try:
                    new_pkg_dict = json.loads(r.ckan_json.decode('utf-8'))
                except AttributeError as a:
                    print u'AttributeError {0}'.format(unicode(a))
                    continue
                is_new = False
                try:
                    pkg_info = ckansite.action.package_show(id=r.uuid)
                except ckanapi.NotFound:
                    is_new = True
                try:
                    if is_new:
                        ckansite.call_action('package_create', new_pkg_dict)
                    else:
                        ckansite.call_action('package_update', new_pkg_dict)
                    r.status = 'posted'
                    r.status_message = ''
                    r.latest_posted = datetime.now()
                    add_record(session, r)
                    continue
                except ckanapi.NotAuthorized as e:
                    print u'Not Authorized {0}'.format(unicode(e))
                    continue
                except ckanapi.CKANAPIError as c:
                    r.status = 'error'
                    r.status_message = u'CKAN API error {0}'.format(unicode(c))
                    add_record(session, r)
                    print r.status_message
                    continue
                except ckanapi.errors.ValidationError as v:
                    r.status = 'error'
                    r.status_message = u'Validation error {0}'.format(unicode(v.error_dict))
                    add_record(session, r)
                    print r.status_message
                    continue
            break
Exemple #3
0
def main():
    """Convert Geogratis records for upload to a remote CKAN portal.

    Reads the target CKAN URL from geogratis.ini and validates the optional
    --since date argument (YYYY-MM-DD) before scanning.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')

    args = argparser.parse_args()
    factory = MetadataDatasetModelGeogratisFactory()

    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')

    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')

    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's

    session = connect_to_database()
    last_id = 0
    scan_date = None
    setting = get_setting('last_conversion_run')
    if args.since != '':
        try:
            scan_date = datetime.fromtimestamp(time.mktime(time.strptime(args.since, '%Y-%m-%d')))
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        # 'except Exception, e' is Python-2-only syntax; 'as' works on 2.6+ too.
        except Exception as e:
            logging.error(str(e))
            session.close()
            exit()
def main():
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')

    factory = MetadataDatasetModelGeogratisFactory()

    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(
        remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')

    # Page through the datasets on

    session = connect_to_database()
    last_id = 0

    try:
        while True:
            ckan_records = ckan_portal.action.package_search(
                q=
                'extras_collection:geogratis AND extras_org_title_at_publication:"Natural Resources Canada"',
                rows=100,
                start=last_id)
            if not ckan_records:
                break
            else:
                for r in ckan_records['results']:
                    rp = session.query(GeogratisRecord).filter(
                        GeogratisRecord.uuid == r['name']).all()
                    if not rp:
                        print r['name']
            last_id += 100
    except Exception, e:
        print >> stderr, e.message
        pass
Exemple #5
0
    def load_naps(self):

        ns = Namespaces()
        gmd = ns.get_namespace('gmd')
        session = connect_to_database()

        for napid in self.napids:

            print '{0}Full NAP Record for {1}{2}'.format(
                Fore.GREEN, Fore.CYAN, napid)
            self.csw.getrecordbyid(id=[napid], outputschema=gmd)

            ec_rec = find_record_by_uuid(session, napid, query_class=ECRecord)

            if ec_rec is None:
                ec_rec = ECRecord(
                    uuid=self.csw.records[napid].identifier,
                    title=self.csw.records[napid].identification.title,
                    state='active',
                    nap_record=self.csw.records[napid].xml,
                    csw_scanned=datetime.now().isoformat())
            else:
                ec_rec.title = self.csw.records[napid].identification.title,
                ec_rec.state = 'active',
                ec_rec.nap_record = self.csw.records[napid].xml,
                ec_rec.csw_scanned = datetime.now().isoformat()

            add_record(session, ec_rec)

        session.close_all()
Exemple #6
0
def main():

    factory = MetadataDatasetModelGeogratisFactory()

    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's

    session = connect_to_database()
    last_id = 0
    while True:
        known_records = find_all_records(session, query_limit=10, limit_id=last_id)

        if len(known_records) == 0:
            break
        else:
            for geo_rec in known_records:
                print 'ID: {0} UUID: {1}'.format(geo_rec.id, geo_rec.uuid)
                try:
                    # In order to avoid multiple updates, only allow for one instance of an update per uuid.
                    # Previous updates are overridden with the latest update
                    pkg_update = find_record_by_uuid(session, geo_rec.uuid, query_class=Packages)
                    if pkg_update is None:
                        pkg_update = Packages()
                    pkg_update.status = 'new'
                    if geo_rec.state == 'active':
                        ckan_record = factory.create_model_ckan(geo_rec.uuid)
                        geogratis_record = factory.create_model_geogratis(geo_rec.uuid)
                        pkg_update.uuid = geo_rec.uuid

                        # Set the dataset for immediate release on the Registry
                        geogratis_record.portal_release_date = time.strftime("%Y-%m-%d")
                        geogratis_record.ready_to_publish = True

                        if not ckan_record is None:

                            if not geogratis_record.equals(ckan_record):
                                diffs = geogratis_record.compare(ckan_record, self_label="Geogratis", other_label="CKAN")
                                pkg_update.differences = "\n".join(item for item in diffs)
                                geo_rec.od_status = 'Needs Update'
                                pkg_update.ckan_json = json.dumps(geogratis_record.as_dict())
                                pkg_update.status = 'update'
                            else:
                                geo_rec.od_status = 'Current'
                        else:
                            pkg_update.ckan_json = json.dumps(geogratis_record.as_dict())
                            geo_rec.od_status = 'New Record'
                    else:
                        geo_rec.od_status = 'Ineligible'
                    pkg_update.last_comparison = datetime.now()
                    add_record(session, geo_rec)
                    if geo_rec.od_status == 'New Record' or geo_rec.od_status == "Needs Update":
                        add_record(session, pkg_update)
                    last_id = geo_rec.id
                except Exception, e:
                    logging.error(e.message)
    def create_model(self, uuid):
        """Build a dataset model from the stored bilingual Geogratis JSON for *uuid*."""
        db_session = connect_to_database()
        try:
            stored = find_record_by_uuid(db_session, uuid)
            english_json = json.loads(stored.json_record_en)
            french_json = json.loads(stored.json_record_fr)
        finally:
            db_session.close()

        # Convert whatever was retrieved, even if one language is missing.
        return self.convert_geogratis_json(english_json, french_json)
    def create_model_geogratis(self, uuid):
        """Load the English and French Geogratis JSON for *uuid* and convert it."""
        conn = connect_to_database()
        try:
            rec = find_record_by_uuid(conn, uuid)
            rec_en, rec_fr = json.loads(rec.json_record_en), json.loads(rec.json_record_fr)
        finally:
            conn.close()

        # Proceed with conversion even when one language's record is absent.
        return self.convert_geogratis_json(rec_en, rec_fr)
def main(since, dumpfile, scan_type):
    ini_config = ConfigParser()
    ini_config.read('harvester.ini')

    session = connect_to_database()
    last_id = 0

    while True:

        if args.monitor:
            last_run_setting = get_setting('last_conversion_' + scan_type)
            if last_run_setting.setting_value:
                package_stream = session.query(Packages).filter(Packages.id > last_id).\
                    filter(Packages.updated > last_run_setting.setting_value).\
                    filter(Packages.source == scan_type).\
                    order_by(Packages.id).limit(10).all()
            else:
                package_stream = session.query(Packages).filter(Packages.id > last_id).\
                    filter(Packages.source == scan_type).\
                    order_by(Packages.id).limit(10).all()
        elif args.since != '':
            package_stream = session.query(Packages).filter(Packages.id > last_id).\
                filter(Packages.updated > args.since).\
                filter(Packages.source == scan_type).\
                order_by(Packages.id).limit(10).all()
        else:
            package_stream = session.query(Packages).filter(Packages.id > last_id).\
                filter(Packages.source == scan_type).\
                order_by(Packages.id).limit(10).all()
        if len(package_stream) == 0:
            break
        else:
            if dumpfile != '':
                with open(dumpfile, 'a') as dfile:
                    for r in package_stream:
                        print u'Processing dataset {0}'.format(r.id)
                        dfile.write(r.ckan_json + '\n')
                        last_id = r.id
            else:
                for r in package_stream:
                    print r.ckan_json + '\n'
                    last_id = r.id

    session.close()
Exemple #10
0
def main(since, scan_type):
    """Prepare a conversion run for the given harvest source.

    :param since: only convert records updated after this date ('' = all)
    :param scan_type: 'gr' selects Geogratis, anything else selects EC
    """
    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')
    if scan_type == 'gr':
        factory = MetadataDatasetModelGeogratisFactory()
        setting = get_setting('last_conversion_gr')
        query_class = GeogratisRecord
        if setting is None:
            setting = Settings()
            setting.setting_name = 'last_conversion_gr'
    else:
        factory = MetadataDatasetModelECFactory()
        setting = get_setting('last_conversion_ec')
        query_class = ECRecord
        if setting is None:
            setting = Settings()
            setting.setting_name = 'last_conversion_ec'

    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's

    session = connect_to_database()
    last_id = 0
    scan_date = None

    if since != '':
        try:
            # Parse the 'since' parameter itself; the original read the global
            # args.since here, silently ignoring the function argument.
            scan_date = datetime.fromtimestamp(
                time.mktime(time.strptime(since, '%Y-%m-%d')))
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        except Exception as e:
            logging.error(str(e))
            session.close()
            exit()
def main():
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(
        remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')

    last_id = 0
    last_run_setting = get_setting('last_conversion_run')
    session = connect_to_database()

    while True:
        if args.monitor:

            geogratis_stream = session.query(GeogratisRecord).filter(GeogratisRecord.id > last_id)\
                .filter(GeogratisRecord.state == 'deleted')\
                .filter(GeogratisRecord.updated > last_run_setting.setting_value)\
                .order_by(GeogratisRecord.id).limit(10).all()
        else:
            geogratis_stream = session.query(GeogratisRecord).filter(GeogratisRecord.id > last_id)\
                .filter(GeogratisRecord.state == 'deleted')\
                .order_by(GeogratisRecord.id).limit(10).all()

        if len(geogratis_stream) == 0:
            break
        else:
            for r in geogratis_stream:

                # Determine if the record is already on the OD portal
                try:
                    ckan_portal.action.package_show(id=r.uuid)
                    # If the record does not exist, then a NotFound exception will be thrown
                    print u'{0}'.format(r.uuid)
                except NotFound, e:
                    pass
                last_id = r.id
def main():
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')

    last_id = 0
    last_run_setting = get_setting('last_conversion_run')
    session = connect_to_database()

    while True:
        if args.monitor:

            geogratis_stream = session.query(GeogratisRecord).filter(GeogratisRecord.id > last_id)\
                .filter(GeogratisRecord.state == 'deleted')\
                .filter(GeogratisRecord.updated > last_run_setting.setting_value)\
                .order_by(GeogratisRecord.id).limit(10).all()
        else:
            geogratis_stream = session.query(GeogratisRecord).filter(GeogratisRecord.id > last_id)\
                .filter(GeogratisRecord.state == 'deleted')\
                .order_by(GeogratisRecord.id).limit(10).all()

        if len(geogratis_stream) == 0:
            break
        else:
            for r in geogratis_stream:

                # Determine if the record is already on the OD portal
                try:
                    ckan_portal.action.package_show(id=r.uuid)
                    # If the record does not exist, then a NotFound exception will be thrown
                    print u'{0}'.format(r.uuid)
                except NotFound, e:
                    pass
                last_id = r.id
                       dest='outfile',
                       help='Write extracted CKAN JSONL to this file')
# Cap on the number of records to extract; 0 disables the cap.
argparser.add_argument('-m', '--maxrecords',
                       action='store',
                       default=0,
                       type=int,
                       dest='maxrecords',
                       help='Maximum number of records to retrieve. 0 means retrieve all')
# When set, records whose status is 'update' are skipped (new records only).
argparser.add_argument('-n', '--newonly',
                       action='store_true',
                       dest='newonly',
                       default=False,
                       help='Only extract new records')
args = argparser.parse_args()

# Page through the Packages table and dump each record's CKAN JSON to the output file.
session = connect_to_database()
last_id = 0  # sequential-ID paging cursor
jfile = open(args.outfile, mode='w')
rec_count = 1  # running count of records written; compared against args.maxrecords below
while True:
    known_records = find_all_records(session, query_class=Packages, query_limit=10, limit_id=last_id)
    if len(known_records) == 0:
        break
    else:
        for r in known_records:
            if args.newonly and r.status == 'update':
                continue
            if (r.status == 'new' or r.status == 'update') and r.package is not None:
                print >> jfile, r.package
                rec_count += 1
                if 0 < args.maxrecords < rec_count:
Exemple #14
0
def main(since='', start_index='', monitor=False):
    geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?alt=json&max-results=100'
    monitor_setting = get_setting(u'monitor_link')
    if monitor:
        if monitor_setting.setting_value is None:
            geog_url =\
                'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min=2001-01-01&alt=json&max-results=100'
        else:
            geog_url = monitor_setting.setting_value
    elif since != '':
        geog_url =\
            'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min={0}&alt=json&max-results=100'.format(since)
    elif start_index != '':
        geog_url =\
            'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst/?start-index={0}&alt=json&max-results=100'.\
            format(start_index)
    print ('{0}Scanning: {1}{2}'.format(Fore.GREEN, Fore.BLUE, geog_url))
    r = requests.get(geog_url)
    logging.info('HTTP Response Status {0}'.format(r.status_code))
    session = None
    try:
        session = connect_to_database()
        # Get the first page of the feed
        if r.status_code == 200:
            feed_page = r.json()

            # Save the monitor link for future use
            monitor_link = _get_link(feed_page, 'monitor')
            if monitor_link != '':

                monitor_setting.setting_value = monitor_link
                save_setting(monitor_setting)
                print  "{0}Next Monitor Link: {1}{2}".format(Fore.YELLOW, Fore.BLUE, monitor_setting.setting_value)
            next_link = _get_link(feed_page)

            print ('{0}{1} Records Found'.format(Fore.BLUE, feed_page['count']))

            if 'products' in feed_page:
                for product in feed_page['products']:
                    try:
                        save_geogratis_record(session, product['id'])
                    except Exception, e:
                        logging.error('{0} failed to load'.format(product['id']))
                        logging.error(e)

            # Keep polling until exhausted
            while next_link != '':
                geog_url = next_link
                r = requests.get(geog_url)
                feed_page = r.json()
                next_link = _get_link(feed_page)
                print '{0}Next page link: {1}{2}'.format(Fore.YELLOW, Fore.BLUE, next_link)
                if 'products' in feed_page:
                    for product in feed_page['products']:

                        # Don't crash on every call - log the error and continue
                        try:
                            save_geogratis_record(session, product['id'])
                        except Exception, e:
                            logging.error('{0} failed to load'.format(product['id']))
                            logging.error(e)
                save_setting(monitor_setting)
    def create_model(self, uuid):
        """Convert a harvested NAP (ISO 19115) file into an Open Data record.

        Loads the stored NAP XML for *uuid* from the local database, extracts
        the bilingual metadata fields via XPath, and returns a populated
        MetadataDatasetModel — or None when a mandatory field (title,
        subjects, topics, keywords) is missing.
        """
        # Get the previously harvested NAP XML
        session = connect_to_database()
        try:
            ec_rec = find_record_by_uuid(session, uuid, query_class=ECRecord)
            self.root = etree.fromstring(ec_rec.nap_record)
        finally:
            session.close()

        ds = MetadataDatasetModel()
        ds.owner_org = 'ec'
        ds.catalog_type = u'Geo Data | G\u00e9o'

        self.valid = True

        try:
            # Boilerplate fields for the Open Data record

            ds.author_email = "*****@*****.**"
            ds.language = "eng; CAN | fra; CAN"
            ds.owner_org = "ec"
            ds.department_number = "99"
            ds.catalog_type = u"Geo Data | G\u00e9o"
            ds.license_id = u"ca-ogl-lgo"
            ds.attribution = u"Contains information licensed under the Open Government Licence \u2013 Canada."
            ds.attribution_fra = u"Contient des informations autoris\u00e9es sous la Licence du gouvernement ouvert- Canada"
            ds.ready_to_publish = True
            ds.portal_release_date = ""
            ds.presentation_form = u"Document Digital | Document num\u00e9rique"
            ds.spatial_representation_type = "Vector | Vecteur"

            # Read in NAP fields and populate the OD dataset

            # UUID identifier

            ds.id = self._get_first_text(
                '/gmd:MD_Metadata/gmd:fileIdentifier/gco:CharacterString')

            # Title - English and French

            ds.title = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gco:CharacterString'
            )
            if len(ds.title) == 0:
                print(ds.id + 'No English Title Given')
                self.valid = False

            ds.title_fra = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gmd:PT_FreeText/gmd:textGroup/gmd:LocalisedCharacterString'
            )
            if len(ds.title_fra) == 0:
                print(ds.id + ' No French Title Given')
                self.valid = False

            # Description - English and French
            ds.notes = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:abstract/gco:CharacterString'
            ).replace(u"\u2019", "'")
            ds.notes_fra = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:abstract/gmd:PT_FreeText/gmd:textGroup/gmd:LocalisedCharacterString'
            ).replace(u"\u2019", "'")

            # Time Period Coverage - Start and End (optional)

            coverage_start_time = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod/gml:beginPosition'
            )
            if coverage_start_time is not None:
                # A bare year becomes January 1st of that year
                if len(coverage_start_time) == 4:
                    coverage_start_time = "%s-01-01" % coverage_start_time
                ds.time_period_coverage_start = coverage_start_time

            coverage_end_time = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod/gml:endPosition'
            ).strip()
            # The time period coverage end time is not always present - it's not mandatory
            # ('!=' replaces the removed-in-Py3 '<>' operator)
            if (coverage_end_time.lower() !=
                    u"ongoing") and (not len(coverage_end_time) == 0):
                # A bare year becomes December 31st of that year
                if len(coverage_end_time) == 4:
                    coverage_end_time = "%s-12-31" % coverage_end_time
                ds.time_period_coverage_end = coverage_end_time

            # Homepage and Endpoint URLs - English and French

            sup_text = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:supplementalInformation/gco:CharacterString'
            )
            urls_en = []
            if len(sup_text) > 0:
                urls_en = self._get_urls_from_string(sup_text)

            sup_text = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:supplementalInformation/gmd:PT_FreeText/gmd:textGroup/gmd:LocalisedCharacterString'
            )
            urls_fr = []
            if len(sup_text) > 0:
                urls_fr = self._get_urls_from_string(sup_text)

            if len(urls_en) > 0:
                ds.url = urls_en[0]
            if len(urls_fr) > 0:
                ds.url_fra = urls_fr[0]

            if len(urls_en) > 1:
                ds.endpoint_url = urls_en[1]
            if len(urls_fr) > 1:
                # NOTE(review): this overwrites url_fra set above; the parallel
                # English branch sets endpoint_url, so this likely should be
                # ds.endpoint_url_fra — confirm before changing.
                ds.url_fra = urls_fr[1]

            # GoC Subject

            topics_subjects = self._get_gc_subject_category(
                self.root.xpath(
                    '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:topicCategory/gmd:MD_TopicCategoryCode',
                    namespaces=self.nap_namespaces))

            ds.subject = topics_subjects['subjects']
            if len(ds.subject) == 0:
                self.valid = False
                print(ds.id + ' No GC Subjects')

            # GoC Topic

            ds.topic_category = topics_subjects['topics']
            if len(ds.topic_category) == 0:
                self.valid = False
                print(ds.id + ' No GC Topics')

            # Tags - English and French

            ds.keywords = []
            keywords_en = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString'
            )
            keywords_en = keywords_en.replace(';', ' ')
            if len(keywords_en) == 0:
                self.valid = False
                print(ds.id + ' No English Keywords')
            else:
                ds.keywords = keywords_en.split(',')
            ds.keywords_fra = []
            keywords_fr = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gmd:PT_FreeText/gmd:textGroup/gmd:LocalisedCharacterString'
            )
            keywords_fr = keywords_fr.replace(u"/u2019", "'").replace(";", " ")
            if len(keywords_fr) == 0:
                self.valid = False
                print(ds.id + ' No French Keywords')
            else:
                ds.keywords_fra = keywords_fr.split(',')

            # Spatial - Convert a bounding box into a GeoJSON polygon

            westLong = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox/gmd:westBoundLongitude/gco:Decimal'
            )

            eastLong = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox/gmd:eastBoundLongitude/gco:Decimal'
            )

            northLat = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox/gmd:northBoundLatitude/gco:Decimal'
            )

            southLat = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox/gmd:southBoundLatitude/gco:Decimal'
            )

            # convert these 4 points into a bounding box
            ds.spatial = '{\"type\": \"Polygon\", \"coordinates\": [[[%s, %s], [%s, %s], [%s, %s], [%s, %s], [%s, %s]]]}' % (
                westLong, northLat, eastLong, northLat, eastLong, southLat,
                westLong, southLat, westLong, northLat)

            # Data Published

            ds.date_published = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:date/gmd:CI_Date/gmd:date/gco:Date'
            )

            # Browse Graphic File Name

            try:
                ds.browse_graphic_url = self._get_first_text(
                    '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:graphicOverview/gmd:MD_BrowseGraphic/gmd:fileName/gco:CharacterString'
                )
                if len(ds.browse_graphic_url) == 0:
                    ds.browse_graphic_url = '/static/img/canada_default.png'
            # Best-effort fallback to the default thumbnail; narrowed from a
            # bare except so KeyboardInterrupt/SystemExit are not swallowed.
            except Exception:
                ds.browse_graphic_url = '/static/img/canada_default.png'

            # Frequency
            frequency_node = self.root.xpath(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:resourceMaintenance/gmd:MD_MaintenanceInformation/gmd:maintenanceAndUpdateFrequency/gmd:MD_MaintenanceFrequencyCode/@codeListValue',
                namespaces=self.nap_namespaces)
            if len(frequency_node) > 0:
                ds.maintenance_and_update_frequency = self._get_update_frequency(
                    frequency_node[0])
            else:
                ds.maintenance_and_update_frequency = self._get_update_frequency(
                    '')

            # Data Series Name, Issue Identification, DOI; These fields are not present in the EC ISO 19115 NAP files.

            ds.data_series_name = ''
            ds.data_series_name_fra = ''
            ds.data_series_issue_identification = ''
            ds.data_series_issue_identification_fra = ''
            ds.digital_object_identifier = ""

            # Load the Resources

            resources = self.root.xpath(
                '/gmd:MD_Metadata/gmd:distributionInfo/gmd:MD_Distribution/gmd:transferOptions/gmd:MD_DigitalTransferOptions/gmd:onLine',
                namespaces=self.nap_namespaces)
            od_resources = []
            for resource in resources:
                od_resource = MetadataResourcesModel()
                lang_code = resource.xpath('@xlink:role',
                                           namespaces=self.nap_namespaces)[0]
                if lang_code == "urn:xml:lang:eng-CAN":
                    od_resource.language = 'eng; CAN'
                elif lang_code == "urn:xml:lang:fra-CAN":
                    od_resource.language = 'fra; CAN'
                else:
                    od_resource.language = 'zxx; CAN'
                if len(
                        resource.xpath(
                            'gmd:CI_OnlineResource/gmd:name/gco:CharacterString',
                            namespaces=self.nap_namespaces)) > 0:
                    od_resource.name = resource.xpath(
                        'gmd:CI_OnlineResource/gmd:name/gco:CharacterString',
                        namespaces=self.nap_namespaces)[0].text
                else:
                    if lang_code == "urn:xml:lang:eng-CAN":
                        od_resource.name = "Dataset"
                    else:
                        od_resource.name = u"Donn\u00e9es"
                od_resource.name_fra = od_resource.name
                od_resource.resource_type = "file"
                od_resource.url = resource.xpath(
                    'gmd:CI_OnlineResource/gmd:linkage/gmd:URL',
                    namespaces=self.nap_namespaces)[0].text
                od_resource.size = ''
                od_resource.format = self._guess_resource_type(
                    od_resource.name)
                # Resources whose type cannot be guessed are dropped
                if od_resource.format != 'none':
                    od_resources.append(od_resource)
            ds.resources = od_resources

        except Exception as e:
            print("Failure: ", e)
            traceback.print_exc()

        if self.valid:
            ds.state = 'active'
            return ds
        else:
            return None
Exemple #16
0
def main():
    """Synchronize Geogratis metadata with the Open Data packages table.

    Pages through all known Geogratis records ten at a time — the
    sequential record id is tracked so SQLAlchemy never pulls the full
    table into memory.  For each active record the matching CKAN package
    is fetched and compared: the record's ``od_status`` is set to
    'New Record', 'Needs Update', 'Current' or 'Ineligible', and
    new/updated datasets are queued in the Packages table as serialized
    CKAN JSON ready for posting.
    """
    factory = MetadataDatasetModelGeogratisFactory()

    # Potentially a VERY large ORM query: page the reads and remember the
    # last sequential record id so everything is never in memory at once.
    session = connect_to_database()
    last_id = 0
    while True:
        known_records = find_all_records(session,
                                         query_limit=10,
                                         limit_id=last_id)
        if len(known_records) == 0:
            break
        for geo_rec in known_records:
            print('ID: {0} UUID: {1}'.format(geo_rec.id, geo_rec.uuid))
            try:
                # Only one pending update per uuid is kept; a newer
                # comparison overrides any previously queued update.
                pkg_update = find_record_by_uuid(session,
                                                 geo_rec.uuid,
                                                 query_class=Packages)
                if pkg_update is None:
                    pkg_update = Packages()
                pkg_update.status = 'new'
                if geo_rec.state == 'active':
                    ckan_record = factory.create_model_ckan(geo_rec.uuid)
                    geogratis_record = factory.create_model_geogratis(
                        geo_rec.uuid)
                    pkg_update.uuid = geo_rec.uuid

                    # Set the dataset for immediate release on the Registry
                    geogratis_record.portal_release_date = time.strftime(
                        "%Y-%m-%d")
                    geogratis_record.ready_to_publish = True

                    if ckan_record is not None:
                        if not geogratis_record.equals(ckan_record):
                            diffs = geogratis_record.compare(
                                ckan_record,
                                self_label="Geogratis",
                                other_label="CKAN")
                            pkg_update.differences = "\n".join(diffs)
                            geo_rec.od_status = 'Needs Update'
                            pkg_update.ckan_json = json.dumps(
                                geogratis_record.as_dict())
                            pkg_update.status = 'update'
                        else:
                            geo_rec.od_status = 'Current'
                    else:
                        # Not on CKAN yet: queue the full record as new.
                        pkg_update.ckan_json = json.dumps(
                            geogratis_record.as_dict())
                        geo_rec.od_status = 'New Record'
                else:
                    geo_rec.od_status = 'Ineligible'
                pkg_update.last_comparison = datetime.now()
                add_record(session, geo_rec)
                if geo_rec.od_status in ('New Record', 'Needs Update'):
                    add_record(session, pkg_update)
                last_id = geo_rec.id
            except Exception as e:
                # Log and keep going: one bad record must not stop the scan.
                logging.error(e)
def main(since, dumpfile):
    """Dump queued CKAN JSON for Geogratis packages to stdout or a file.

    Pages through the Packages table in id order, ten rows at a time,
    narrowed by the module-level command-line ``args``:

    * ``args.monitor`` - only rows updated since the last conversion run
    * ``args.since``   - only rows updated after the given date
    * ``args.new_only`` / ``args.update_only`` - restrict to packages
      that do not / do already exist on the portal

    :param since: unused; kept for interface compatibility (the value
        actually consulted is ``args.since``).
    :param dumpfile: path of a file to append the JSON to; when empty
        the JSON is printed to stdout instead.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')

    session = connect_to_database()
    last_id = 0

    while True:
        # Compose one query from the flags instead of enumerating every
        # combination in a duplicated if/elif pyramid.
        query = session.query(Packages).filter(Packages.id > last_id)

        if args.monitor:
            last_run_setting = get_setting('last_conversion_run')
            if last_run_setting.setting_value:
                query = query.filter(
                    Packages.updated > last_run_setting.setting_value)
        elif args.since != '':
            query = query.filter(Packages.updated > args.since)

        if args.new_only:
            # ``not Packages.existing`` raises TypeError in SQLAlchemy
            # (boolean value of a clause is undefined); ``~`` builds the
            # intended SQL NOT expression.
            query = query.filter(~Packages.existing)
        elif args.update_only:
            query = query.filter(Packages.existing)

        package_stream = query.order_by(Packages.id).limit(10).all()

        if len(package_stream) == 0:
            break
        if dumpfile != '':
            with open(dumpfile, 'a') as dfile:
                for r in package_stream:
                    print(u'Processing dataset {0}'.format(r.id))
                    dfile.write(r.ckan_json + '\n')
                    last_id = r.id
        else:
            for r in package_stream:
                print(r.ckan_json + '\n')
                last_id = r.id

    session.close()
Exemple #18
0
def main(since, dumpfile):
    """Dump queued CKAN JSON for Geogratis packages to stdout or a file.

    Pages through the Packages table in id order, ten rows at a time,
    narrowed by the module-level command-line ``args``:

    * ``args.monitor`` - only rows updated since the last conversion run
    * ``args.since``   - only rows updated after the given date
    * ``args.new_only`` / ``args.update_only`` - restrict to packages
      that do not / do already exist on the portal

    :param since: unused; kept for interface compatibility (the value
        actually consulted is ``args.since``).
    :param dumpfile: path of a file to append the JSON to; when empty
        the JSON is printed to stdout instead.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')

    session = connect_to_database()
    last_id = 0

    while True:
        # Compose one query from the flags instead of enumerating every
        # combination in a duplicated if/elif pyramid.
        query = session.query(Packages).filter(Packages.id > last_id)

        if args.monitor:
            last_run_setting = get_setting('last_conversion_run')
            if last_run_setting.setting_value:
                query = query.filter(
                    Packages.updated > last_run_setting.setting_value)
        elif args.since != '':
            query = query.filter(Packages.updated > args.since)

        if args.new_only:
            # ``not Packages.existing`` raises TypeError in SQLAlchemy
            # (boolean value of a clause is undefined); ``~`` builds the
            # intended SQL NOT expression.
            query = query.filter(~Packages.existing)
        elif args.update_only:
            query = query.filter(Packages.existing)

        package_stream = query.order_by(Packages.id).limit(10).all()

        if len(package_stream) == 0:
            break
        if dumpfile != '':
            with open(dumpfile, 'a') as dfile:
                for r in package_stream:
                    print(u'Processing dataset {0}'.format(r.id))
                    dfile.write(r.ckan_json + '\n')
                    last_id = r.id
        else:
            for r in package_stream:
                print(r.ckan_json + '\n')
                last_id = r.id

    session.close()
Exemple #19
0
    '-m',
    '--maxrecords',
    action='store',
    default=0,
    type=int,
    dest='maxrecords',
    help='Maximum number of records to retrieve. 0 means retrieve all')
argparser.add_argument('-n',
                       '--newonly',
                       action='store_true',
                       dest='newonly',
                       default=False,
                       help='Only extract new records')
args = argparser.parse_args()

session = connect_to_database()
last_id = 0
jfile = open(args.outfile, mode='w')
rec_count = 1
while True:
    known_records = find_all_records(session,
                                     query_class=Packages,
                                     query_limit=10,
                                     limit_id=last_id)
    if len(known_records) == 0:
        break
    else:
        for r in known_records:
            if args.newonly and r.status == 'update':
                continue
            if (r.status == 'new'
Exemple #20
0
def main():
    """Post new and updated Geogratis packages to the remote CKAN portal.

    Reads connection settings from geogratis.ini, walks the Packages
    table for rows whose status is 'new' or 'update', and pushes each one
    to CKAN with package_create (when package_show reports it missing) or
    package_update.  Each row's status becomes 'posted' on success or
    'error' (with a message) on failure.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_url = ini_config.get('ckan', 'ckan.remote_portal')
    api_key = ini_config.get('ckan', 'ckan.api_key')
    user_agent = ini_config.get('ckan', 'ckan.user_agent')

    ckansite = ckanapi.RemoteCKAN(remote_url,
                                  apikey=api_key,
                                  user_agent=user_agent)

    session = connect_to_database()
    last_id = 0
    while True:
        package_stream = session.query(Packages).filter(Packages.id > last_id)
        package_stream = package_stream.filter(Packages.status.in_(["new", "update"])).\
                                               order_by(Packages.id).all()

        if len(package_stream) == 0:
            break
        for r in package_stream:
            # Throttle so the remote portal is not hammered.
            sleep(60)
            print(u'Processing dataset {0}'.format(r.id))
            try:
                new_pkg_dict = json.loads(r.ckan_json.decode('utf-8'))
            except AttributeError as a:
                print(u'AttributeError {0}'.format(unicode(a)))
                continue
            # package_show tells us whether to create or update.
            is_new = False
            try:
                ckansite.action.package_show(id=r.uuid)
            except ckanapi.NotFound:
                is_new = True
            try:
                if is_new:
                    ckansite.call_action('package_create', new_pkg_dict)
                else:
                    ckansite.call_action('package_update', new_pkg_dict)
                r.status = 'posted'
                r.status_message = ''
                r.latest_posted = datetime.now()
                add_record(session, r)
            except ckanapi.NotAuthorized as e:
                print(u'Not Authorized {0}'.format(unicode(e)))
            except ckanapi.errors.ValidationError as v:
                # Must precede CKANAPIError: ValidationError subclasses it,
                # so the broader handler would otherwise make this branch
                # unreachable.
                r.status = 'error'
                r.status_message = u'Validation error {0}'.format(
                    unicode(v.error_dict))
                add_record(session, r)
                print(r.status_message)
            except ckanapi.CKANAPIError as c:
                r.status = 'error'
                r.status_message = u'CKAN API error {0}'.format(unicode(c))
                add_record(session, r)
                print(r.status_message)
        break
Exemple #21
0
def main(since='', start_index='', monitor=False):
    geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?alt=json&max-results=100'
    monitor_setting = get_setting('monitor_link')
    if monitor:
        if monitor_setting.setting_value is None:
            geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min=2015-01-01&alt=json&max-results=100'
        else:
            geog_url = monitor_setting.setting_value
    elif since != '':
        geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min={0}&alt=json&max-results=100'.format(
            since)
    elif start_index != '':
        geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst/?start-index={0}&alt=json&max-results=100'.format(
            start_index)
    print('{0}Scanning: {1}{2}'.format(Fore.GREEN, Fore.BLUE, geog_url))
    r = requests.get(geog_url)
    logging.info('HTTP Response Status {0}'.format(r.status_code))
    session = None
    try:
        session = connect_to_database()
        # Get the first page of the feed
        if r.status_code == 200:
            feed_page = r.json()

            # Save the monitor link for future use
            monitor_link = _get_link(feed_page, 'monitor')
            if monitor_link != '':

                monitor_setting.setting_value = monitor_link
                save_setting(monitor_setting)
                print "{0}Next Monitor Link: {1}{2}".format(
                    Fore.YELLOW, Fore.BLUE, monitor_setting.setting_value)
            next_link = _get_link(feed_page)

            print('{0}{1} Records Found'.format(Fore.BLUE, feed_page['count']))

            if 'products' in feed_page:
                for product in feed_page['products']:
                    try:
                        save_geogratis_record(session, product['id'])
                    except Exception, e:
                        logging.error('{0} failed to load'.format(
                            product['id']))
                        logging.error(e)

            # Keep polling until exhausted
            while next_link != '':
                geog_url = next_link
                r = requests.get(geog_url)
                feed_page = r.json()
                next_link = _get_link(feed_page)
                print '{0}Next page link: {1}{2}'.format(
                    Fore.YELLOW, Fore.BLUE, next_link)
                if 'products' in feed_page:
                    for product in feed_page['products']:

                        # Don't crash on every call - log the error and continue
                        try:
                            save_geogratis_record(session, product['id'])
                        except Exception, e:
                            logging.error('{0} failed to load'.format(
                                product['id']))
                            logging.error(e)
                save_setting(monitor_setting)