コード例 #1
0
ファイル: csw_scanner.py プロジェクト: thriuin/od-harvester
    def load_naps(self):
        """Fetch the full NAP record for each id in ``self.napids`` and store it.

        Every id is requested from ``self.csw`` with the 'gmd' output schema.
        The matching ``ECRecord`` row is looked up by UUID; it is created when
        missing, otherwise its title, state, XML and scan time are refreshed.
        The row is then passed to ``add_record``.
        """
        ns = Namespaces()
        gmd = ns.get_namespace('gmd')
        session = connect_to_database()

        try:
            for napid in self.napids:

                print('{0}Full NAP Record for {1}{2}'.format(
                    Fore.GREEN, Fore.CYAN, napid))
                self.csw.getrecordbyid(id=[napid], outputschema=gmd)
                csw_record = self.csw.records[napid]

                ec_rec = find_record_by_uuid(session, napid, query_class=ECRecord)

                if ec_rec is None:
                    ec_rec = ECRecord(
                        uuid=csw_record.identifier,
                        title=csw_record.identification.title,
                        state='active',
                        nap_record=csw_record.xml,
                        csw_scanned=datetime.now().isoformat())
                else:
                    # Plain assignments only: a trailing comma after the value
                    # would store a 1-tuple instead of the value itself.
                    ec_rec.title = csw_record.identification.title
                    ec_rec.state = 'active'
                    ec_rec.nap_record = csw_record.xml
                    ec_rec.csw_scanned = datetime.now().isoformat()

                add_record(session, ec_rec)
        finally:
            # Release the database session even when a fetch or save fails.
            session.close_all()
コード例 #2
0
ファイル: comparer.py プロジェクト: thriuin/geogratis-sync
def main():
    """Compare each stored Geogratis record with CKAN and queue the differences.

    Records are fetched page by page through ``find_all_records`` (10 per
    call, continuing after ``last_id``). For every record the ``Packages`` row
    with the same UUID is looked up or created, ``od_status`` is set on the
    record, and the package row is saved only when the record is new to CKAN
    or differs from the CKAN copy. Failures on a single record are logged and
    the scan moves on to the next record.
    """
    factory = MetadataDatasetModelGeogratisFactory()

    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's

    session = connect_to_database()
    last_id = 0
    while True:
        known_records = find_all_records(session, query_limit=10, limit_id=last_id)
        if not known_records:
            break

        for geo_rec in known_records:
            print('ID: {0} UUID: {1}'.format(geo_rec.id, geo_rec.uuid))
            try:
                # In order to avoid multiple updates, only allow for one instance of an update per uuid.
                # Previous updates are overridden with the latest update
                pkg_update = find_record_by_uuid(session, geo_rec.uuid, query_class=Packages)
                if pkg_update is None:
                    pkg_update = Packages()
                pkg_update.status = 'new'
                if geo_rec.state == 'active':
                    ckan_record = factory.create_model_ckan(geo_rec.uuid)
                    geogratis_record = factory.create_model_geogratis(geo_rec.uuid)
                    pkg_update.uuid = geo_rec.uuid

                    # Set the dataset for immediate release on the Registry
                    geogratis_record.portal_release_date = time.strftime("%Y-%m-%d")
                    geogratis_record.ready_to_publish = True

                    if ckan_record is None:
                        pkg_update.ckan_json = json.dumps(geogratis_record.as_dict())
                        geo_rec.od_status = 'New Record'
                    elif geogratis_record.equals(ckan_record):
                        geo_rec.od_status = 'Current'
                    else:
                        diffs = geogratis_record.compare(ckan_record, self_label="Geogratis", other_label="CKAN")
                        pkg_update.differences = "\n".join(diffs)
                        geo_rec.od_status = 'Needs Update'
                        pkg_update.ckan_json = json.dumps(geogratis_record.as_dict())
                        pkg_update.status = 'update'
                else:
                    geo_rec.od_status = 'Ineligible'
                pkg_update.last_comparison = datetime.now()
                add_record(session, geo_rec)
                if geo_rec.od_status in ('New Record', 'Needs Update'):
                    add_record(session, pkg_update)
            except Exception:
                # Log with traceback; one bad record must not stop the scan.
                logging.exception('Comparison failed for record %s', geo_rec.uuid)
            # Advance the paging cursor even after a failure. Otherwise a
            # record that always fails is fetched again on every page and the
            # outer loop never terminates.
            last_id = geo_rec.id
コード例 #3
0
ファイル: scanner.py プロジェクト: thriuin/geogratis-sync
def save_geogratis_record(session, uuid):
    """Fetch the English and French Geogratis records for *uuid* and store them.

    Nothing is stored when the English record cannot be retrieved. Otherwise
    the existing ``GeogratisRecord`` with the same id is refreshed, or a new
    one is created, and the result is passed to ``add_record``.

    :param session: database session handed to the lookup and save helpers
    :param uuid: identifier of the data set to retrieve
    """
    msg = 'Retrieving data set {0}'.format(uuid)
    logging.info(msg)
    print(msg)
    geo_rec_en = get_geogratis_rec(uuid)
    geo_rec_fr = get_geogratis_rec(uuid, 'fr')
    if geo_rec_en is None:
        return

    state = u'deleted'
    title_fr = ''
    if geo_rec_en['deleted'] == 'false':
        state = u'active'
    if geo_rec_fr is None:
        # A missing French record takes precedence over the other states.
        state = u'missing french'
    else:
        title_fr = geo_rec_fr['title']
    new_rec = find_record_by_uuid(session, geo_rec_en['id'])

    # Placeholder dates are kept for deleted records.
    created_date = u'2000-01-01'
    updated_date = u'2000-01-01'
    edited_date = u'2000-01-01'
    geogratis_scanned = datetime.now().isoformat()
    if state != 'deleted':
        created_date = geo_rec_en['publishedDate']
        updated_date = geo_rec_en['updatedDate']
        edited_date = geo_rec_en['editedDate']

    if new_rec is None:
        new_rec = GeogratisRecord(uuid=geo_rec_en['id'],
                                  title_en=geo_rec_en['title'],
                                  title_fr=title_fr,
                                  json_record_en=json.dumps(geo_rec_en),
                                  json_record_fr=json.dumps(geo_rec_fr),
                                  created=created_date,
                                  updated=updated_date,
                                  edited=edited_date,
                                  state=state,
                                  geogratis_scanned=geogratis_scanned)
    else:
        # Plain assignments only: a trailing comma after the value would
        # store a 1-tuple in the attribute instead of the value.
        new_rec.title_en = geo_rec_en['title']
        new_rec.title_fr = title_fr
        new_rec.json_record_en = json.dumps(geo_rec_en)
        new_rec.json_record_fr = json.dumps(geo_rec_fr)
        new_rec.created = created_date
        new_rec.updated = updated_date
        new_rec.edited = edited_date
        new_rec.state = state
        new_rec.geogratis_scanned = geogratis_scanned

    add_record(session, new_rec)
コード例 #4
0
def save_geogratis_record(session, uuid):
    """Fetch the English and French Geogratis records for *uuid* and store them.

    Nothing is stored when the English record cannot be retrieved. Otherwise
    the existing ``GeogratisRecord`` with the same id is refreshed, or a new
    one is created, and the result is passed to ``add_record``.

    :param session: database session handed to the lookup and save helpers
    :param uuid: identifier of the data set to retrieve
    """
    msg = 'Retrieving data set {0}'.format(uuid)
    logging.info(msg)
    print(msg)
    geo_rec_en = get_geogratis_rec(uuid)
    geo_rec_fr = get_geogratis_rec(uuid, 'fr')
    if geo_rec_en is None:
        return

    state = 'deleted'
    title_fr = ''
    if geo_rec_en['deleted'] == 'false':
        state = 'active'
    if geo_rec_fr is None:
        # A missing French record takes precedence over the other states.
        state = 'missing french'
    else:
        title_fr = geo_rec_fr['title']
    new_rec = find_record_by_uuid(session, geo_rec_en['id'])

    # Placeholder dates are kept for deleted records.
    created_date = '2000-01-01'
    updated_date = '2000-01-01'
    edited_date = '2000-01-01'
    geogratis_scanned = datetime.now().isoformat()
    if state != 'deleted':
        created_date = geo_rec_en['publishedDate']
        updated_date = geo_rec_en['updatedDate']
        edited_date = geo_rec_en['editedDate']

    if new_rec is None:
        new_rec = GeogratisRecord(uuid=geo_rec_en['id'],
                                  title_en=geo_rec_en['title'],
                                  title_fr=title_fr,
                                  json_record_en=json.dumps(geo_rec_en),
                                  json_record_fr=json.dumps(geo_rec_fr),
                                  created=created_date,
                                  updated=updated_date,
                                  edited=edited_date,
                                  state=state,
                                  geogratis_scanned=geogratis_scanned)
    else:
        # Plain assignments only: a trailing comma after the value would
        # store a 1-tuple in the attribute instead of the value.
        new_rec.title_en = geo_rec_en['title']
        new_rec.title_fr = title_fr
        new_rec.json_record_en = json.dumps(geo_rec_en)
        new_rec.json_record_fr = json.dumps(geo_rec_fr)
        new_rec.created = created_date
        new_rec.updated = updated_date
        new_rec.edited = edited_date
        new_rec.state = state
        new_rec.geogratis_scanned = geogratis_scanned

    add_record(session, new_rec)
コード例 #5
0
ファイル: updater.py プロジェクト: thriuin/geogratis-sync
def main():
    """Post every package flagged 'new' or 'update' to the remote CKAN portal.

    Connection settings are read from the ``[ckan]`` section of
    ``geogratis.ini``. Each pending ``Packages`` row is created on the portal
    when ``package_show`` reports it as not found, otherwise it is updated.
    The row's status is then set to 'posted', or to 'error' with a message
    when the portal rejects it. Processing sleeps 60 seconds before each row.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_url = ini_config.get('ckan', 'ckan.remote_portal')
    api_key = ini_config.get('ckan', 'ckan.api_key')
    user_agent = ini_config.get('ckan', 'ckan.user_agent')

    ckansite = ckanapi.RemoteCKAN(remote_url, apikey=api_key,
                                  user_agent=user_agent)

    session = connect_to_database()

    # A single un-paged query: the previous `while True` wrapper always broke
    # out after its first pass, so everything is handled in one sweep.
    package_stream = session.query(Packages).filter(Packages.id > 0)
    package_stream = package_stream.filter(Packages.status.in_(["new", "update"])).\
        order_by(Packages.id).all()

    for r in package_stream:
        sleep(60)
        print(u'Processing dataset {0}'.format(r.id))
        try:
            new_pkg_dict = json.loads(r.ckan_json.decode('utf-8'))
        except AttributeError as a:
            # Raised when ckan_json has no decode(), e.g. when it is None.
            print(u'AttributeError {0}'.format(unicode(a)))
            continue

        is_new = False
        try:
            ckansite.action.package_show(id=r.uuid)
        except ckanapi.NotFound:
            is_new = True

        action = 'package_create' if is_new else 'package_update'
        try:
            ckansite.call_action(action, new_pkg_dict)
            r.status = 'posted'
            r.status_message = ''
            r.latest_posted = datetime.now()
            add_record(session, r)
        except ckanapi.NotAuthorized as e:
            print(u'Not Authorized {0}'.format(unicode(e)))
        except ckanapi.errors.ValidationError as v:
            # Must come before CKANAPIError: ValidationError is a subclass,
            # so the generic handler would otherwise catch it first.
            r.status = 'error'
            r.status_message = u'Validation error {0}'.format(unicode(v.error_dict))
            add_record(session, r)
            print(r.status_message)
        except ckanapi.CKANAPIError as c:
            r.status = 'error'
            r.status_message = u'CKAN API error {0}'.format(unicode(c))
            add_record(session, r)
            print(r.status_message)
コード例 #6
0
                        try:
                            ckan_portal.action.package_show(
                                id=scan_record.uuid)
                            # If the record does not exist, then a NotFound exception will be thrown
                            pkg_update_record.existing = True
                        except NotFound, e:
                            pass

                        # Set the dataset for immediate release on the Registry
                        geo_record.portal_release_date = time.strftime(
                            "%Y-%m-%d")
                        geo_record.ready_to_publish = True

                        pkg_update_record.ckan_json = json.dumps(
                            geo_record.as_dict())

                        current_time_str = time.strftime("%Y-%m-%d %H:%M:%S")
                        if not pkg_update_record.created:
                            pkg_update_record.created = current_time_str
                        pkg_update_record.updated = current_time_str
                        add_record(session, pkg_update_record)

                except Exception, e:
                    logging.error(e.message)
                last_id = scan_record.id
    save_setting(setting)
    session.close()


# Run main() only when executed as a script, not when the module is imported.
if __name__ == '__main__':
    main()
コード例 #7
0
ファイル: converter.py プロジェクト: thriuin/geogratis-sync
                        pkg_update_record.uuid = scan_record.uuid

                        # Determine if the record is already on the OD portal
                        try:
                            ckan_portal.action.package_show(id=scan_record.uuid)
                            # If the record does not exist, then a NotFound exception will be thrown
                            pkg_update_record.existing = True
                        except NotFound, e:
                            pass

                        # Set the dataset for immediate release on the Registry
                        geo_record.portal_release_date = time.strftime("%Y-%m-%d")
                        geo_record.ready_to_publish = True

                        pkg_update_record.ckan_json = json.dumps(geo_record.as_dict())

                        current_time_str = time.strftime("%Y-%m-%d %H:%M:%S")
                        if not pkg_update_record.created:
                            pkg_update_record.created = current_time_str
                        pkg_update_record.updated = current_time_str
                        add_record(session, pkg_update_record)

                except Exception, e:
                    logging.error(e.message)
                last_id = scan_record.id
    save_setting(setting)
    session.close()


# Run main() only when executed as a script, not when the module is imported.
if __name__ == '__main__':
    main()
コード例 #8
0
ファイル: updater.py プロジェクト: thriuin/geogratis-sync
def main():
    """Post every package flagged 'new' or 'update' to the remote CKAN portal.

    Connection settings are read from the ``[ckan]`` section of
    ``geogratis.ini``. Each pending ``Packages`` row is created on the portal
    when ``package_show`` reports it as not found, otherwise it is updated.
    The row's status is then set to 'posted', or to 'error' with a message
    when the portal rejects it. Processing sleeps 60 seconds before each row.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_url = ini_config.get('ckan', 'ckan.remote_portal')
    api_key = ini_config.get('ckan', 'ckan.api_key')
    user_agent = ini_config.get('ckan', 'ckan.user_agent')

    ckansite = ckanapi.RemoteCKAN(remote_url,
                                  apikey=api_key,
                                  user_agent=user_agent)

    session = connect_to_database()

    # A single un-paged query: the previous `while True` wrapper always broke
    # out after its first pass, so everything is handled in one sweep.
    package_stream = session.query(Packages).filter(Packages.id > 0)
    package_stream = package_stream.filter(Packages.status.in_(["new", "update"])).\
        order_by(Packages.id).all()

    for r in package_stream:
        sleep(60)
        print(u'Processing dataset {0}'.format(r.id))
        try:
            new_pkg_dict = json.loads(r.ckan_json.decode('utf-8'))
        except AttributeError as a:
            # Raised when ckan_json has no decode(), e.g. when it is None.
            print(u'AttributeError {0}'.format(unicode(a)))
            continue

        is_new = False
        try:
            ckansite.action.package_show(id=r.uuid)
        except ckanapi.NotFound:
            is_new = True

        action = 'package_create' if is_new else 'package_update'
        try:
            ckansite.call_action(action, new_pkg_dict)
            r.status = 'posted'
            r.status_message = ''
            r.latest_posted = datetime.now()
            add_record(session, r)
        except ckanapi.NotAuthorized as e:
            print(u'Not Authorized {0}'.format(unicode(e)))
        except ckanapi.errors.ValidationError as v:
            # Must come before CKANAPIError: ValidationError is a subclass,
            # so the generic handler would otherwise catch it first.
            r.status = 'error'
            r.status_message = u'Validation error {0}'.format(
                unicode(v.error_dict))
            add_record(session, r)
            print(r.status_message)
        except ckanapi.CKANAPIError as c:
            r.status = 'error'
            r.status_message = u'CKAN API error {0}'.format(unicode(c))
            add_record(session, r)
            print(r.status_message)
コード例 #9
0
def main():
    """Compare each stored Geogratis record with CKAN and queue the differences.

    Records are fetched page by page through ``find_all_records`` (10 per
    call, continuing after ``last_id``). For every record the ``Packages`` row
    with the same UUID is looked up or created, ``od_status`` is set on the
    record, and the package row is saved only when the record is new to CKAN
    or differs from the CKAN copy. Failures on a single record are logged and
    the scan moves on to the next record.
    """
    factory = MetadataDatasetModelGeogratisFactory()

    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's

    session = connect_to_database()
    last_id = 0
    while True:
        known_records = find_all_records(session,
                                         query_limit=10,
                                         limit_id=last_id)
        if not known_records:
            break

        for geo_rec in known_records:
            print('ID: {0} UUID: {1}'.format(geo_rec.id, geo_rec.uuid))
            try:
                # In order to avoid multiple updates, only allow for one instance of an update per uuid.
                # Previous updates are overridden with the latest update
                pkg_update = find_record_by_uuid(session,
                                                 geo_rec.uuid,
                                                 query_class=Packages)
                if pkg_update is None:
                    pkg_update = Packages()
                pkg_update.status = 'new'
                if geo_rec.state == 'active':
                    ckan_record = factory.create_model_ckan(geo_rec.uuid)
                    geogratis_record = factory.create_model_geogratis(
                        geo_rec.uuid)
                    pkg_update.uuid = geo_rec.uuid

                    # Set the dataset for immediate release on the Registry
                    geogratis_record.portal_release_date = time.strftime(
                        "%Y-%m-%d")
                    geogratis_record.ready_to_publish = True

                    if ckan_record is None:
                        pkg_update.ckan_json = json.dumps(
                            geogratis_record.as_dict())
                        geo_rec.od_status = 'New Record'
                    elif geogratis_record.equals(ckan_record):
                        geo_rec.od_status = 'Current'
                    else:
                        diffs = geogratis_record.compare(
                            ckan_record,
                            self_label="Geogratis",
                            other_label="CKAN")
                        pkg_update.differences = "\n".join(diffs)
                        geo_rec.od_status = 'Needs Update'
                        pkg_update.ckan_json = json.dumps(
                            geogratis_record.as_dict())
                        pkg_update.status = 'update'
                else:
                    geo_rec.od_status = 'Ineligible'
                pkg_update.last_comparison = datetime.now()
                add_record(session, geo_rec)
                if geo_rec.od_status in ('New Record', 'Needs Update'):
                    add_record(session, pkg_update)
            except Exception:
                # Log with traceback; one bad record must not stop the scan.
                logging.exception('Comparison failed for record %s', geo_rec.uuid)
            # Advance the paging cursor even after a failure. Otherwise a
            # record that always fails is fetched again on every page and the
            # outer loop never terminates.
            last_id = geo_rec.id