def latest_post():
    '''Return the most recent blog post.

    Returns None if there are no blog posts.

    :rtype: ckanext.sweden.blog.model.post.Post or None
    '''
    try:
        from ckanext.sweden.blog.model.post import Post
        post = Session.query(Post).\
            filter(Post.visible == True).\
            order_by('created desc').\
            first()
    except NoResultFound:
        return None

    if post is None:
        return None

    post.content_markdown = markdown(
        unicode(truncate(post.content, length=320, indicator='...',
                         whole_word=True)))
    post.post_author = (model.User.get(post.user_id)
                        or Session.query(model.User).filter_by(
                            id=post.user_id).first())
    return post
def record_existing_unique_identifier(package_id, identifier):
    """
    Based on a provided identifier, checks datacite for an existing DOI.
    Saves to the local CKAN database.
    :param package_id: string
    :param identifier: string
    :return: DOI object if saved, None if it didn't exist in datacite
    """
    datacite_api = DOIDataCiteAPI()
    # Check this identifier doesn't exist in the table
    existing_doi = Session.query(DOI).filter(DOI.identifier == identifier).first()
    if not existing_doi:
        # And check against the datacite service
        try:
            datacite_doi = datacite_api.get(identifier)
            if datacite_doi.text:
                # Determine whether or not we need to delete a doi that
                # points to the current dataset
                doi_for_this_pkg = Session.query(DOI).filter(DOI.package_id == package_id).first()
                if doi_for_this_pkg:
                    # TODO: the original code had a bare `datacite_api`
                    # expression here, which does nothing; the stale DOI
                    # record presumably needs deleting or updating.
                    pass
                doi = DOI(package_id=package_id, identifier=identifier)
                Session.add(doi)
                Session.commit()
                return doi
        except HTTPError:
            pass
def test_create_extent(self):
    package = factories.Dataset()
    geojson = json.loads(self.geojson_examples["point"])
    shape = asShape(geojson)
    package_extent = PackageExtent(
        package_id=package["id"],
        the_geom=WKTElement(shape.wkt, self.db_srid),
    )
    package_extent.save()
    assert package_extent.package_id == package["id"]
    if legacy_geoalchemy:
        assert (Session.scalar(
            package_extent.the_geom.x) == geojson["coordinates"][0])
        assert (Session.scalar(
            package_extent.the_geom.y) == geojson["coordinates"][1])
        assert (Session.scalar(
            package_extent.the_geom.srid) == self.db_srid)
    else:
        from sqlalchemy import func
        assert (Session.query(func.ST_X(package_extent.the_geom)).first()
                [0] == geojson["coordinates"][0])
        assert (Session.query(func.ST_Y(package_extent.the_geom)).first()
                [0] == geojson["coordinates"][1])
        assert package_extent.the_geom.srid == self.db_srid
def make_package_name(self, title, exclude_existing_package):
    '''
    Creates a URL friendly name from a title.

    If the name already exists, it will add some random characters at the end.
    '''
    name = munge_title_to_name(title).replace('_', '-')
    while '--' in name:
        name = name.replace('--', '-')
    name = name[0:90]  # max length is 100

    # Is this slug already in use (and if we're updating a package, is it in
    # use by a different package?).
    pkg_obj = Session.query(Package).filter(Package.name == name).filter(Package.id != exclude_existing_package).first()
    if not pkg_obj:
        # The name is available, so use it. Note that if we're updating an
        # existing package we will be updating this package's URL, so incoming
        # links may break.
        return name

    if exclude_existing_package:
        # The name is not available, and we're updating a package. Chances
        # are the package's name already had some random string attached
        # to it last time. Prevent spurious updates to the package's URL
        # (choosing new random text) by just reusing the existing package's
        # name.
        pkg_obj = Session.query(Package).filter(Package.id == exclude_existing_package).first()
        # The package may not exist yet because we may be passed the desired
        # package GUID before a new package is instantiated.
        if pkg_obj:
            return pkg_obj.name

    # Append some random text to the URL. Hope that with five characters
    # there will be no collision.
    return name + "-" + str(uuid.uuid4())[:5]
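# A hedged, standalone sketch of the slug-normalization steps above, using a
# stand-in for CKAN's munge_title_to_name so it runs without CKAN. The real
# munger does more than this; the stand-in only illustrates the idea.
import re
import uuid

def _fake_munge_title_to_name(title):
    # stand-in: lowercase, collapse non-alphanumerics into underscores
    return re.sub(r'[^a-z0-9]+', '_', title.lower()).strip('_')

def sketch_make_package_name(title):
    name = _fake_munge_title_to_name(title).replace('_', '-')
    while '--' in name:
        name = name.replace('--', '-')
    name = name[0:90]  # leave room for the random suffix
    # if the name is taken, a 5-char suffix from a UUID is appended
    return name + '-' + str(uuid.uuid4())[:5]

# sketch_make_package_name('My  Data -- Set!') -> 'my-data-set-<5 chars>'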
def test_create_extent(self):
    package = factories.Dataset()
    geojson = json.loads(self.geojson_examples['point'])
    shape = asShape(geojson)
    package_extent = PackageExtent(package_id=package['id'],
                                   the_geom=WKTElement(shape.wkt,
                                                       self.db_srid))
    package_extent.save()
    assert_equals(package_extent.package_id, package['id'])
    if legacy_geoalchemy:
        assert_equals(Session.scalar(package_extent.the_geom.x),
                      geojson['coordinates'][0])
        assert_equals(Session.scalar(package_extent.the_geom.y),
                      geojson['coordinates'][1])
        assert_equals(Session.scalar(package_extent.the_geom.srid),
                      self.db_srid)
    else:
        from sqlalchemy import func
        assert_equals(
            Session.query(func.ST_X(package_extent.the_geom)).first()[0],
            geojson['coordinates'][0])
        assert_equals(
            Session.query(func.ST_Y(package_extent.the_geom)).first()[0],
            geojson['coordinates'][1])
        assert_equals(package_extent.the_geom.srid, self.db_srid)
def test_zfaulty_xml_unknown_errors(self):
    harv, job = self._create_harvester()
    res = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
    urllib2.urlopen = mock.Mock(return_value=StringIO(res))
    gathered = harv.gather_stage(job)
    urllib2.urlopen = mock.Mock(return_value=open("FSD2355.xml"))
    harvest_obj = HarvestObject.get(gathered[0])
    self.assert_(harv.fetch_stage(harvest_obj))
    self.assert_(harv.import_stage(harvest_obj))
    print Package.text_search(
        Session.query(Package),
        'Kansalaiskeskustelu ydinvoimasta 2006').all()
    self.assert_(len(Package.text_search(
        Session.query(Package),
        'Kansalaiskeskustelu ydinvoimasta 2006').all()) >= 1)
    res = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
    urllib2.urlopen = mock.Mock(return_value=StringIO(res))
    gathered = harv.gather_stage(job)
    urllib2.urlopen = mock.Mock(return_value=open("FSD2362.xml"))
    harvest_obj = HarvestObject.get(gathered[0])
    self.assert_(harv.fetch_stage(harvest_obj))
    self.assert_(harv.import_stage(harvest_obj))
    self.assert_(len(Package.text_search(
        Session.query(Package),
        'Energia-asennetutkimus 2004').all()) >= 1)
def test_spatial_extra_base(self, app):
    user = factories.User()
    env = {"REMOTE_USER": user["name"].encode("ascii")}
    dataset = factories.Dataset(user=user)

    offset = url_for("dataset.edit", id=dataset["id"])
    res = app.get(offset, extra_environ=env)

    data = {
        "name": dataset['name'],
        "extras__0__key": u"spatial",
        "extras__0__value": self.geojson_examples["point"]
    }
    res = app.post(offset, environ_overrides=env, data=data)
    assert "Error" not in res, res

    package_extent = (Session.query(PackageExtent).filter(
        PackageExtent.package_id == dataset["id"]).first())

    geojson = json.loads(self.geojson_examples["point"])
    assert package_extent.package_id == dataset["id"]

    from sqlalchemy import func
    assert (Session.query(func.ST_X(
        package_extent.the_geom)).first()[0] == geojson["coordinates"][0])
    assert (Session.query(func.ST_Y(
        package_extent.the_geom)).first()[0] == geojson["coordinates"][1])
    assert package_extent.the_geom.srid == self.db_srid
def _create_unique_identifier(package_doi_identifier=None):
    datacite_api = DOIDataCiteAPI()
    while True:
        if package_doi_identifier:
            identifier = os.path.join(package_doi_identifier,
                                      '{0:03}'.format(random.randint(1, 999)))
            query = Session.query(CeonResourceDOI)
            query = query.filter(CeonResourceDOI.identifier == identifier)
            exists = query.count()
        else:
            identifier = os.path.join(
                get_doi_prefix(),
                '{0:07}'.format(random.randint(1, 9999999)))
            query = Session.query(CeonPackageDOI)
            query = query.filter(CeonPackageDOI.identifier == identifier)
            exists = query.count()
        # Check this identifier doesn't exist in the table
        if not exists:
            # And check against the datacite service
            try:
                datacite_doi = datacite_api.get(identifier)
            except HTTPError:
                pass
            # TODO remove the next 2 lines (ConnectionError ignoring)
            except ConnectionError:
                pass
            else:
                if datacite_doi.text:
                    continue
            return identifier
def test_zharvester_import(self, mocked=True):
    harvest_object, harv = self._create_harvester()
    self.assert_(harv.info()['name'] == 'OAI-PMH')
    real_content = json.loads(harvest_object.content)
    self.assert_(real_content)
    self.assert_(harv.import_stage(harvest_object))
    the_package = Session.query(Package).filter(Package.title == u"homer")
    print the_package
    the_package = the_package[0]
    self.assert_(the_package)
    self.assert_(len(the_package.get_tags()) == 4)
    self.assert_(len(the_package.get_groups()) == 3)
    self.assert_(the_package.url ==
                 "http://helda.helsinki.fi/oai/request?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"
                 % the_package.id)
    # Test with empty request
    Session.remove()
    CreateTestData.delete()
    Session.query(Package).delete()
    harvest_object, harv = self._create_harvester(config=False)
    real_content = json.loads(harvest_object.content)
    self.assert_(harv.import_stage(harvest_object) == False)
    errs = Session.query(HarvestGatherError).all()
    self.assert_(len(errs) == 2)
    errs = Session.query(HarvestObjectError).all()
    self.assert_(len(errs) == 3)
def delete_tests(self):
    print 'Deleting all test DOIs'
    Session.query(DOI).filter(
        DOI.identifier.like('%' + TEST_PREFIX + '%')).delete(synchronize_session=False)
    Session.commit()
def get(cls, reference):
    """Returns an IssueCategory object referenced by its id or name."""
    if type(reference) is int:
        # if reference is an integer, get by ID
        return Session.query(cls).filter(cls.id == reference).first()
    else:
        # if not, get by name
        return Session.query(cls).filter(cls.name == reference).first()
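# A hedged, standalone sketch of the id-or-name dispatch above, backed by a
# plain list of dicts instead of SQLAlchemy. Class name and data are made up.
class SketchCategory(object):
    _rows = [{'id': 1, 'name': 'bug'}, {'id': 2, 'name': 'feature'}]

    @classmethod
    def get(cls, reference):
        # integers are treated as primary keys, everything else as a name
        key = 'id' if type(reference) is int else 'name'
        matches = [r for r in cls._rows if r[key] == reference]
        return matches[0] if matches else None

# SketchCategory.get(2) -> {'id': 2, 'name': 'feature'}
# SketchCategory.get('bug') -> {'id': 1, 'name': 'bug'}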
def delete_tests(self):
    '''Delete all test DOIs.'''
    print u'Deleting all test DOIs'
    Session.query(DOI).filter(
        DOI.identifier.like(u'%' + get_prefix() + u'%')).delete(synchronize_session=False)
    Session.commit()
def _gather_entry(self, entry, auth=None):
    # Create a harvest object for each entry
    entry_guid = entry['guid']
    log.debug('gathering %s', entry_guid)
    entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '')  # noqa: E501
    entry_restart_date = entry['restart_date']

    package_query = Session.query(Package)
    query_filtered = package_query.filter(Package.name == entry_name)
    package = query_filtered.first()

    if package:
        # Meaning we've previously harvested this,
        # but we may want to reharvest it now.
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()

        if self.update_all:
            log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
            status = 'change'
        else:
            log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
            status = 'unchanged'

        obj = HarvestObject(guid=entry_guid, job=self.job, extras=[
            HOExtra(key='status', value=status),
            HOExtra(key='restart_date', value=entry_restart_date)
        ])
        obj.content = entry['content']
        obj.package = package
        obj.save()
        return obj.id

    elif not package:
        # It's a product we haven't harvested before.
        log.debug(
            '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
            format(entry_name))
        obj = HarvestObject(
            guid=entry_guid, job=self.job,
            extras=[
                HOExtra(key='status', value='new'),
                HOExtra(key='restart_date', value=entry_restart_date)
            ])
        obj.content = entry['content']
        obj.package = None
        obj.save()
        return obj.id
def clear(cls):
    Session.query(LocalizedLicenseName).delete()
    Session.query(cls).delete()
    try:
        rev = Session.revision
    except AttributeError:
        rev = None
    Session.flush()
    Session.revision = rev
def listSets(self, cursor=None, batch_size=None):
    '''List all sets in this repository, where sets are groups.
    '''
    data = []
    if not cursor:
        groups = Session.query(Group).all()
    else:
        groups = Session.query(Group).all()[:cursor]
    for group in groups:
        data.append((group.id, group.name, group.description))
    return data
def listIdentifiers(self, metadataPrefix, set=None, cursor=None,
                    from_=None, until=None, batch_size=None):
    '''List all identifiers for this repository.
    '''
    data = []
    packages = []
    group = None
    if not set:
        if not from_ and not until:
            packages = Session.query(Package).filter(Package.type=='dataset').\
                filter(Package.private!=True).filter(Package.state=='active').all()
        else:
            if from_ and not until:
                packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                    filter(PackageRevision.revision_timestamp > from_).\
                    filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
            if until and not from_:
                packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                    filter(PackageRevision.revision_timestamp < until).\
                    filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
            if from_ and until:
                packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                    filter(between(PackageRevision.revision_timestamp, from_, until)).\
                    filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
    else:
        group = Group.get(set)
        if group:
            packages = group.packages(return_query=True).filter(Package.type=='dataset').\
                filter(Package.private!=True).filter(Package.state=='active')
            if from_ and not until:
                packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                    filter(Package.name==PackageRevision.name).filter(Package.state=='active')
            if until and not from_:
                packages = packages.filter(PackageRevision.revision_timestamp < until).\
                    filter(Package.name==PackageRevision.name).filter(Package.state=='active')
            if from_ and until:
                packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                    filter(Package.name==PackageRevision.name).filter(Package.state=='active')
            packages = packages.all()
    if cursor:
        packages = packages[cursor:]
    for package in packages:
        spec = package.name
        if group:
            spec = group.name
        else:
            if package.owner_org:
                group = Group.get(package.owner_org)
                if group and group.name:
                    spec = group.name
                group = None
        data.append(common.Header('', package.id, package.metadata_created,
                                  [spec], False))
    return data
def populate_harvest_job(self, harvest_job, set_ids, config, client):
    # Check if this source has been harvested before
    previous_job = Session.query(HarvestJob) \
        .filter(HarvestJob.source == harvest_job.source) \
        .filter(HarvestJob.gather_finished != None) \
        .filter(HarvestJob.id != harvest_job.id) \
        .order_by(HarvestJob.gather_finished.desc()) \
        .limit(1).first()

    last_time = None
    if previous_job and previous_job.finished and \
            model.Package.get(harvest_job.source.id).metadata_modified < previous_job.gather_started:
        last_time = previous_job.gather_started.isoformat()

    # Collect package ids
    package_ids = list(self.get_package_ids(set_ids, config, last_time, client))
    log.debug('Identifiers: %s', package_ids)

    if not self._recreate(harvest_job) and package_ids:
        converted_identifiers = {}
        for identifier in package_ids:
            converted_identifiers[datapid_to_name(identifier)] = identifier
            if identifier.endswith(u'm'):
                converted_identifiers[datapid_to_name(u"%ss" % identifier[0:-1])] = identifier

        for package in model.Session.query(model.Package).filter(
                model.Package.name.in_(converted_identifiers.keys())).all():
            converted_name = package.name
            if converted_identifiers[converted_name] not in package_ids:
                converted_name = "%sm" % converted_name[0:-1]
            package_ids.remove(converted_identifiers[converted_name])

    if previous_job:
        for previous_error in [error.guid for error in Session.query(HarvestObject).
                               filter(HarvestObject.harvest_job_id == previous_job.id).
                               filter(HarvestObject.state == 'ERROR').all()]:
            if previous_error not in package_ids:
                package_ids.append(previous_error)

    try:
        object_ids = []
        if len(package_ids):
            for package_id in islice(package_ids, config['limit']) if 'limit' in config else package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            log.debug('Object ids: {i}'.format(i=object_ids))
            return object_ids
        else:
            self._save_gather_error('No packages received for URL: {u}'.format(
                u=harvest_job.source.url), harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('Gather: {e}'.format(e=e), harvest_job)
        raise
def test_store_log_data(self):
    Session.query(RequestAuditModel).delete()
    req_audit = RequestAudit()
    # Log the same request three times
    for _ in range(3):
        req_audit.log({
            'remote_ip': '192.168.1.10',
            'remote_user': '******',
            'session': '123456789',
            'current_language': 'en',
            'access_time': datetime.now(),
            'request_url': '/test/url',
            'http_method': 'GET',
            'http_path': '/test/url?some=value',
            'http_query_params': 'some=value',
            'http_user_agent': 'Test Agent String',
            'client_os': 'linux',
            'client_device': 'firefox',
        })
    sleep(1)
    req_audit.shutdown()
    count, results = RequestAuditModel.get_all(offset=0, limit=10)
    assert_equals(count, 3)
def listRecords(self, metadataPrefix, set=None, cursor=None,
                from_=None, until=None, batch_size=None):
    '''Show a selection of records, basically lists all datasets.
    '''
    data = []
    packages = []
    group = None
    if not set:
        if not from_ and not until:
            packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                filter(Package.state=='active').all()
        if from_ and not until:
            packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                filter(PackageRevision.revision_timestamp > from_).filter(Package.name==PackageRevision.name).\
                filter(Package.state=='active').all()
        if until and not from_:
            packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                filter(PackageRevision.revision_timestamp < until).filter(Package.name==PackageRevision.name).\
                filter(Package.state=='active').all()
        if from_ and until:
            packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                filter(between(PackageRevision.revision_timestamp, from_, until)).\
                filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
    else:
        group = Group.get(set)
        if group:
            packages = group.packages(return_query=True)
            if from_ and not until:
                packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                    filter(Package.type=='dataset').filter(Package.private!=True).\
                    filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
            if until and not from_:
                packages = packages.filter(PackageRevision.revision_timestamp < until).\
                    filter(Package.type=='dataset').filter(Package.private!=True).\
                    filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
            if from_ and until:
                packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                    filter(Package.type=='dataset').filter(Package.private!=True).\
                    filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
    if cursor:
        packages = packages[cursor:]
    for res in packages:
        spec = res.name
        if group:
            spec = group.name
        else:
            if res.owner_org:
                group = Group.get(res.owner_org)
                if group and group.name:
                    spec = group.name
                group = None
        data.append(self._record_for_dataset(res, spec))
    return data
def update_package(cls, package_id, **kwargs):
    '''
    Update the package_id and/or published fields of a record associated
    with a given package.

    :param package_id: the id of the package
    :param kwargs: the values to be updated
    :return: the updated record object
    '''
    update_dict = {k: v for k, v in kwargs.items() if k in cls.cols}
    Session.query(DOI).filter(
        DOI.package_id == package_id).update(update_dict)
    Session.commit()
    return cls.read_package(package_id)
def clear(self):
    sq = Session.query(VocabularyTerm.id).filter(VocabularyTerm.vocabulary_id==self.id)
    Session.query(VocabularyLabel)\
        .filter(VocabularyLabel.term_id.in_(sq.subquery())).delete(synchronize_session=False)
    sq.delete()
    Session.flush()
def update_doi(cls, identifier, **kwargs):
    '''
    Update the package_id and/or published fields of a record with a given
    DOI.

    :param identifier: the DOI string
    :param kwargs: the values to be updated
    :return: the updated record object
    '''
    update_dict = {k: v for k, v in kwargs.items() if k in cls.cols}
    Session.query(DOI).filter(
        DOI.identifier == identifier).update(update_dict)
    Session.commit()
    return cls.read_doi(identifier)
def _clean_groups(package):
    """Clears the package's groups."""
    if isinstance(package, dict):
        package_id = package['id']
    else:
        package_id = package.id
    Session.query(Member).filter(Member.table_name == 'package',
                                 Member.table_id == package_id,
                                 Member.capacity != 'admin')\
        .update({'state': 'deleted'})
def _filter_packages(set, cursor, from_, until, batch_size):
    '''Get a part of datasets for "listNN" verbs.
    '''
    packages = []
    setspc = None
    if not set:
        packages = Session.query(Package).filter(Package.type=='dataset'). \
            filter(Package.state == 'active').filter(Package.private!=True)
        if from_ and not until:
            packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                filter(Package.name==PackageRevision.name)
        if until and not from_:
            packages = packages.filter(PackageRevision.revision_timestamp < until).\
                filter(Package.name==PackageRevision.name)
        if from_ and until:
            packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                filter(Package.name==PackageRevision.name)
        if batch_size:
            packages = packages.limit(batch_size)
        if cursor:
            packages = packages.offset(cursor)
        packages = packages.all()
    elif set == 'openaire_data':
        oa_tag = Session.query(Tag).filter(
            Tag.name == 'openaire_data').first()
        if oa_tag:
            packages = oa_tag.packages
        setspc = set
    else:
        group = Group.get(set)
        if group:
            # Note that group.packages never returns private datasets
            # regardless of the 'with_private' parameter.
            packages = group.packages(return_query=True, with_private=False).filter(Package.type=='dataset'). \
                filter(Package.state == 'active')
            if from_ and not until:
                packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                    filter(Package.name==PackageRevision.name)
            if until and not from_:
                packages = packages.filter(PackageRevision.revision_timestamp < until).\
                    filter(Package.name==PackageRevision.name)
            if from_ and until:
                packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                    filter(Package.name==PackageRevision.name)
            if batch_size:
                packages = packages.limit(batch_size)
            if cursor:
                packages = packages.offset(cursor)
            packages = packages.all()
    # if cursor is not None:
    #     cursor_end = cursor + batch_size if cursor + batch_size < len(packages) else len(packages)
    #     packages = packages[cursor:cursor_end]
    return packages, setspc
def get_contributions(cls, pkg_id):
    pkg = Session.query(Package).get(pkg_id)
    if pkg is None:
        pkg = Session.query(Package).filter(Package.name == pkg_id).first()
        if pkg is None:
            raise toolkit.ObjectNotFound('Package does not exist.')
        else:
            pkg_id = pkg.id
    link_records = PackageContributionActivityQuery.read_package(pkg_id)
    activities = sorted([r.contribution_activity for r in link_records],
                        key=lambda x: x.activity)
    return activities
def get(cls, vocab, name):
    if isinstance(vocab, Vocabulary):
        item = Session.query(cls).filter(cls.vocabulary_id==vocab.id,
                                         cls.name==name).first()
    else:
        item = Session.query(cls).join(Vocabulary)\
            .filter(Vocabulary.name==vocab,
                    cls.name==name).first()
    if not item:
        log.info(_("No term {} for vocabulary {}").format(name, vocab))
    return item
def test_update_extent(self):
    package = factories.Dataset()
    geojson = json.loads(self.geojson_examples["point"])
    shape = asShape(geojson)
    package_extent = PackageExtent(
        package_id=package["id"],
        the_geom=WKTElement(shape.wkt, self.db_srid),
    )
    package_extent.save()
    if legacy_geoalchemy:
        assert (
            Session.scalar(package_extent.the_geom.geometry_type)
            == "ST_Point"
        )
    else:
        from sqlalchemy import func
        assert (
            Session.query(
                func.ST_GeometryType(package_extent.the_geom)
            ).first()[0]
            == "ST_Point"
        )

    # Update the geometry (Point -> Polygon)
    geojson = json.loads(self.geojson_examples["polygon"])
    shape = asShape(geojson)
    package_extent.the_geom = WKTElement(shape.wkt, self.db_srid)
    package_extent.save()

    assert package_extent.package_id == package["id"]
    if legacy_geoalchemy:
        assert (
            Session.scalar(package_extent.the_geom.geometry_type)
            == "ST_Polygon"
        )
        assert Session.scalar(package_extent.the_geom.srid) == self.db_srid
    else:
        assert (
            Session.query(
                func.ST_GeometryType(package_extent.the_geom)
            ).first()[0]
            == "ST_Polygon"
        )
        assert package_extent.the_geom.srid == self.db_srid
def test_harvester_urlerror(self):
    harv, job = self._create_harvester()
    urllib2.urlopen = realopen
    self.assert_(harv.gather_stage(job) == None)
    errs = Session.query(HarvestGatherError).all()
    self.assert_(len(errs) == 1)
    harv_obj = HarvestObject()
    harv_obj.job = job
    harv_obj.content = json.dumps({'url': "http://foo"})
    # XML error and URL error, also the lack of url in content
    self.assert_(harv.import_stage(harv_obj) == False)
    errs = Session.query(HarvestObjectError).all()
    print errs
    self.assert_(len(errs) == 1)
def rename_term_in_extras(self, old_term, new_term):
    if not self.valid_term(old_term):
        raise ValueError(u"Old term {} is not valid".format(old_term))
    if not self.valid_term(new_term):
        raise ValueError(u"New term {} is not valid".format(new_term))
    if self.is_multivalued:
        q = Session.query(PackageExtra.package_id).join(Package, Package.id==PackageExtra.package_id)\
            .filter(PackageExtra.key==self.field_name,
                    PackageExtra.value.like('%{}%'.format(old_term)),
                    Package.type=='dataset',
                    Package.state=='active')
    else:
        q = Session.query(PackageExtra.package_id).join(Package, Package.id==PackageExtra.package_id)\
            .filter(PackageExtra.key==self.field_name,
                    PackageExtra.value==old_term,
                    Package.type=='dataset',
                    Package.state=='active')
    # import in function to avoid circular dependencies
    from ckanext.faociok.validators import _serialize_to_array, _deserialize_from_array

    ctx = {'ignore_auth': True,
           'user': _get_user()['name']}
    pshow = t.get_action('package_show')
    pupdate = t.get_action('package_update')
    counter = 0
    for pdata in q:
        pkg = pshow(ctx.copy(), {'name_or_id': pdata[0]})
        fdata = pkg.get(self.field_name)
        affected = False
        if self.is_multivalued:
            fdata = _deserialize_from_array(fdata)
            if old_term in fdata:
                fdata.remove(old_term)
                fdata.append(new_term)
                fdata = _serialize_to_array(fdata)
                affected = True
        else:
            fdata = new_term
            affected = True
        if affected:
            pkg[self.field_name] = fdata
            pkg.pop('metadata_modified', None)
            pupdate(ctx.copy(), pkg)
            counter += 1
    return counter
def test_edit(self):
    name = 'annakarenina'

    offset = url_for(controller='package', action='edit', id=name)
    res = self.app.get(offset, extra_environ=self.extra_environ)
    assert 'Edit - Datasets' in res
    fv = res.forms['dataset-edit']
    prefix = ''
    fv[prefix + 'extras__1__key'] = u'spatial'
    fv[prefix + 'extras__1__value'] = self.geojson_examples['point']
    res = fv.submit('save', extra_environ=self.extra_environ)
    assert not 'Error' in res, res

    package = Package.get(name)

    # Check that a PackageExtent object has been created
    package_extent = Session.query(PackageExtent).filter(
        PackageExtent.package_id == package.id).first()
    geojson = json.loads(self.geojson_examples['point'])

    assert package_extent
    assert package_extent.package_id == package.id
    assert Session.scalar(
        package_extent.the_geom.x) == geojson['coordinates'][0]
    assert Session.scalar(
        package_extent.the_geom.y) == geojson['coordinates'][1]
    assert Session.scalar(package_extent.the_geom.srid) == self.db_srid

    # Update the spatial extra
    offset = url_for(controller='package', action='edit', id=name)
    res = self.app.get(offset, extra_environ=self.extra_environ)
    assert 'Edit - Datasets' in res
    fv = res.forms['dataset-edit']
    prefix = ''
    fv[prefix + 'extras__1__value'] = self.geojson_examples['polygon']
    res = fv.submit('save', extra_environ=self.extra_environ)
    assert not 'Error' in res, res

    # Check that the PackageExtent object has been updated
    package_extent = Session.query(PackageExtent).filter(
        PackageExtent.package_id == package.id).first()
    assert package_extent
    assert package_extent.package_id == package.id
    assert Session.scalar(
        package_extent.the_geom.geometry_type) == 'ST_Polygon'
    assert Session.scalar(package_extent.the_geom.srid) == self.db_srid
def listIdentifiers(self, metadataPrefix, set=None, cursor=None,
                    from_=None, until=None, batch_size=None):
    '''List all identifiers for this repository.
    '''
    data = []
    packages = []
    if not set:
        if not from_ and not until:
            packages = Session.query(Package).all()
        else:
            if from_:
                packages = Session.query(Package).\
                    filter(PackageRevision.revision_timestamp > from_).\
                    all()
            if until:
                packages = Session.query(Package).\
                    filter(PackageRevision.revision_timestamp < until).\
                    all()
            if from_ and until:
                packages = Session.query(Package).\
                    filter(between(PackageRevision.revision_timestamp, from_, until)
                           ).all()
    else:
        group = Group.get(set)
        if group:
            packages = group.active_packages()
            if from_ and not until:
                packages = packages.\
                    filter(PackageRevision.revision_timestamp > from_)
            if until and not from_:
                packages = packages.\
                    filter(PackageRevision.revision_timestamp < until)
            if from_ and until:
                packages = packages.filter(
                    between(PackageRevision.revision_timestamp, from_, until))
            packages = packages.all()
    if cursor:
        packages = packages[:cursor]
    for package in packages:
        data.append(common.Header(package.id, package.metadata_created,
                                  [package.name], False))
    return data
def harvest_source_url_validator(key, data, errors, context):
    new_url = _normalize_url(data[key])
    source_id = data.get(('id',), '')
    if source_id:
        # When editing a source we need to avoid its own URL
        existing_sources = Session.query(HarvestSource.url, HarvestSource.active) \
            .filter(HarvestSource.id != source_id).all()
    else:
        existing_sources = Session.query(HarvestSource.url, HarvestSource.active).all()

    for url, active in existing_sources:
        url = _normalize_url(url)
        if url == new_url:
            raise Invalid('There already is a Harvest Source for this URL: %s'
                          % data[key])

    return data[key]
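# A hedged sketch of the duplicate-URL check above, with a hypothetical
# stand-in for _normalize_url (the real one lives elsewhere in the
# extension); here it just lowercases scheme and host and strips a trailing
# slash, so the demo runs standalone on Python 2.
from urlparse import urlparse, urlunparse

def _sketch_normalize_url(url):
    parts = urlparse(url)
    return urlunparse((parts.scheme.lower(), parts.netloc.lower(),
                       parts.path.rstrip('/'), '', '', ''))

def is_duplicate_source(new_url, existing_urls):
    # a source is a duplicate if it normalizes to the same URL as another
    new_url = _sketch_normalize_url(new_url)
    return any(_sketch_normalize_url(u) == new_url for u in existing_urls)

# is_duplicate_source('http://EXAMPLE.org/csw/', ['http://example.org/csw'])
# -> True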
def list_packages(cls):
    xmlns = "urn:nbn:se:uu:ub:epc-schema:rs-location-mapping"

    def locns(loc):
        return "{%s}%s" % (xmlns, loc)

    xsi = "http://www.w3.org/2001/XMLSchema-instance"
    schemaLocation = "urn:nbn:se:uu:ub:epc-schema:rs-location-mapping http://urn.kb.se/resolve?urn=urn:nbn:se:uu:ub:epc-schema:rs-location-mapping&godirectly"
    records = Element("{" + xmlns + "}records",
                      attrib={"{" + xsi + "}schemaLocation": schemaLocation},
                      nsmap={'xsi': xsi, None: xmlns})
    q = Session.query(Package)
    q = q.filter(Package.name.ilike('urn:nbn:fi:csc-kata%'))
    pkgs = q.all()

    prot = SubElement(records, locns('protocol-version'))
    prot.text = '3.0'
    datestmp = SubElement(records, locns('datestamp'),
                          attrib={'type': 'modified'})
    now = datetime.datetime.now().isoformat()
    datestmp.text = now
    for pkg in pkgs:
        record = SubElement(records, locns('record'))
        header = SubElement(record, locns('header'))
        datestmp = SubElement(header, locns('datestamp'),
                              attrib={'type': 'modified'})
        datestmp.text = now
        identifier = SubElement(header, locns('identifier'))
        identifier.text = pkg.name
        destinations = SubElement(header, locns('destinations'))
        destination = SubElement(destinations, locns('destination'),
                                 attrib={'status': 'activated'})
        datestamp = SubElement(destination, locns('datestamp'),
                               attrib={'type': 'activated'})
        url = SubElement(destination, locns('url'))
        url.text = "%s%s" % (config.get('ckan.site_url', ''),
                             helpers.url_for(controller='package',
                                             action='read',
                                             id=pkg.name))
    return tostring(records)
def _get_broken_resource_links(organisation_id=None):
    # NB: the original code reset organisation_id to None on entry, which
    # defeated the parameter; the filter below already handles None itself.
    query = Session.query(Package.name, Package.title, PackageExtra.value, Resource)\
        .join(PackageExtra)\
        .join(ResourceGroup, Package.id==ResourceGroup.package_id)\
        .join(Resource)\
        .join(TaskStatus, TaskStatus.entity_id==Resource.id)\
        .filter(TaskStatus.key==u'openness_score')\
        .filter(TaskStatus.value==u'0')\
        .filter(or_(
            and_(PackageExtra.key=='published_by',
                 PackageExtra.value.like('%%[%s]' % (organisation_id is None and '%' or organisation_id))),
            and_(PackageExtra.key=='published_via',
                 PackageExtra.value.like('%%[%s]' % (organisation_id is None and '%' or organisation_id))),
            ))\
        .distinct()
    context = {'model': model, 'session': model.Session}
    data = []
    for row in query:
        resource = resource_dictize(row.Resource, context)
        task_data = {'entity_id': resource['id'],
                     'task_type': 'qa',
                     'key': 'openness_score_reason'}
        status = get_action('task_status_show')(context, task_data)
        resource['openness_score'] = u'0'
        resource['openness_score_reason'] = status.get('value')
        data.append([row.name, row.title, row.value, resource])
    return _collapse(data, [_extract_publisher, _extract_dataset])
def package_update_rest_minimal(context, data_dict):
    setup()
    package = ''
    fulltext = ''
    old_fulltext = ''
    if data_dict.has_key('extras'):
        if 'full_text_search' in data_dict['extras'].keys():
            fulltext = data_dict['extras']['full_text_search']
            data_dict = _del_extra_field_from_list(data_dict, 'full_text_search')
            package = update.package_update_rest(context, data_dict)
            old_fulltext = None
            if package.has_key('id'):
                old_fulltext = Session.query(PackageFulltext) \
                    .filter(PackageFulltext.package_id==package['id']) \
                    .first()
            fulltext_dict_save(fulltext, old_fulltext, package, context)
        else:
            package = update.package_update(context, data_dict)
    else:
        package = update.package_update_rest(context, data_dict)

    if check_logged_in(context):
        fulltext = _get_fulltext(package['id'])
        if fulltext:
            package['extras']['full_text_search'] = fulltext.text
        return package

    minimal_package = _del_extra_field_from_list(package)
    minimal_package = _del_main_field_from_dict(minimal_package)
    return minimal_package
def five_stars(id=None):
    """
    Return a list of dicts: 1 for each dataset that has an openness score.

    Each dict is of the form:
        {'name': <string>, 'title': <string>, 'openness_score': <int>}
    """
    if id:
        pkg = model.Package.get(id)
        if not pkg:
            return "Not found"

    # take the maximum openness score among dataset resources to be the
    # overall dataset openness score
    query = Session.query(Package.name, Package.title,
                          func.max(TaskStatus.value).label('value'))\
        .join(ResourceGroup, Package.id==ResourceGroup.package_id)\
        .join(Resource)\
        .join(TaskStatus, TaskStatus.entity_id==Resource.id)\
        .filter(TaskStatus.key==u'openness_score')\
        .group_by(Package.name, Package.title)\
        .distinct()
    if id:
        query = query.filter(Package.id==pkg.id)

    results = []
    for row in query:
        results.append({
            'name': row.name,
            'title': row.title,
            'openness_score': row.value
        })
    return results
def create_unique_identifier(package_id):
    """
    Create a unique identifier, using the prefix and a random number,
    e.g. 10.5072/0044634.

    Checks the random number doesn't exist in the table or the datacite
    repository.

    :return: a DOI object saved to the database
    """
    datacite_api = DOIDataCiteAPI()

    while True:
        identifier = os.path.join(get_prefix(),
                                  '{0:07}'.format(random.randint(1, 100000)))

        # Check this identifier doesn't exist in the table
        if not Session.query(DOI).filter(DOI.identifier == identifier).count():
            # And check against the datacite service
            try:
                datacite_doi = datacite_api.get(identifier)
            except HTTPError:
                pass
            else:
                if datacite_doi.text:
                    continue

            doi = DOI(package_id=package_id, identifier=identifier)
            Session.add(doi)
            Session.commit()
            return doi
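# A hedged, standalone sketch of the mint-and-retry loop above: a fake
# in-memory "registry" set stands in for both the local DOI table and the
# DataCite lookup, so the collision/retry logic can run without CKAN.
import random

def sketch_mint_doi(taken, prefix='10.5072'):
    # keep drawing random suffixes until one is free in the registry
    while True:
        identifier = '%s/%07d' % (prefix, random.randint(1, 100000))
        if identifier not in taken:
            taken.add(identifier)
            return identifier

# taken = {'10.5072/0000001'}
# sketch_mint_doi(taken) -> e.g. '10.5072/0042137'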
def all_active(cls):
    query = Session.query(cls).filter(
        cls.status == 'active',
        cls.board_id.in_([b.id for b in Board.filter_active()]))
    if hasattr(cls, 'order_by') and isCallable(cls.order_by):
        query = cls.order_by(query)
    return query
def test_new(self):
    name = "test-spatial-dataset-1"

    offset = url_for(controller="package", action="new")
    res = self.app.get(offset, extra_environ=self.extra_environ)
    assert "Add - Datasets" in res
    fv = res.forms["dataset-edit"]
    prefix = ""
    fv[prefix + "name"] = name
    fv[prefix + "extras__0__key"] = u"spatial"
    fv[prefix + "extras__0__value"] = self.geojson_examples["point"]
    res = fv.submit("save", extra_environ=self.extra_environ)
    assert not "Error" in res, res

    package = Package.get(name)

    # Check that a PackageExtent object has been created
    package_extent = Session.query(PackageExtent).filter(
        PackageExtent.package_id == package.id).first()
    geojson = json.loads(self.geojson_examples["point"])

    assert package_extent
    assert package_extent.package_id == package.id
    assert Session.scalar(package_extent.the_geom.x) == geojson["coordinates"][0]
    assert Session.scalar(package_extent.the_geom.y) == geojson["coordinates"][1]
    assert Session.scalar(package_extent.the_geom.srid) == self.db_srid
def broken_resource_links_by_dataset():
    """
    Return a list of named tuples, one for each dataset that contains broken
    resource links (defined as resources with an openness score of 0).

    The named tuple is of the form:
        (name (str), title (str), resources (list of dicts))
    """
    query = Session.query(Package.name, Package.title, Resource)\
        .join(ResourceGroup, Package.id==ResourceGroup.package_id)\
        .join(Resource)\
        .join(TaskStatus, TaskStatus.entity_id==Resource.id)\
        .filter(TaskStatus.key==u'openness_score')\
        .filter(TaskStatus.value==u'0')\
        .distinct()
    context = {'model': model, 'session': model.Session}
    results = {}
    for name, title, resource in query:
        resource = resource_dictize(resource, context)
        data = {'entity_id': resource['id'],
                'task_type': 'qa',
                'key': 'openness_score_reason'}
        status = get_action('task_status_show')(context, data)
        resource['openness_score_reason'] = status.get('value')
        if name in results:
            results[name].resources.append(resource)
        else:
            DatasetTuple = namedtuple('DatasetTuple',
                                      ['name', 'title', 'resources'])
            results[name] = DatasetTuple(name, title or name, [resource])
    return results.values()
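# A hedged sketch of the grouping pattern above: rows of (name, title, item)
# are folded into one namedtuple per name, with items accumulated in a list.
# Data here is made up; only the accumulation pattern matters.
from collections import namedtuple

DatasetTuple = namedtuple('DatasetTuple', ['name', 'title', 'resources'])

def group_rows(rows):
    results = {}
    for name, title, item in rows:
        if name in results:
            # namedtuples are immutable, but the list they hold is not
            results[name].resources.append(item)
        else:
            results[name] = DatasetTuple(name, title or name, [item])
    return list(results.values())

# group_rows([('a', 'A', 1), ('a', 'A', 2), ('b', None, 3)]) gives (order
# may vary): [DatasetTuple(name='a', title='A', resources=[1, 2]),
#             DatasetTuple(name='b', title='b', resources=[3])]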
def gather_stage(self, harvest_job):
    log.debug('In OOEHarvester gather_stage (%s)' % harvest_job.source.url)
    package_ids = []

    self._set_config(harvest_job.source.config)

    # Get source URL
    base_url = harvest_job.source.url.rstrip('/')

    # Check if previous jobs exist and when they took place
    previous_job = Session.query(HarvestJob) \
        .filter(HarvestJob.source==harvest_job.source) \
        .filter(HarvestJob.gather_finished!=None) \
        .filter(HarvestJob.id!=harvest_job.id) \
        .order_by(HarvestJob.gather_finished.desc()) \
        .limit(1).first()

    if (previous_job and not previous_job.gather_errors
            and not len(previous_job.objects) == 0):
        if not self.config.get('force_all', False):
            get_all_packages = False
            # Request only the packages modified since the last harvest job
            last_time = harvest_job.gather_started.strftime("%Y-%m-%d")
            url = base_url + '?since_time=%s' % last_time
        else:
            url = base_url
    else:
        # Request all remote packages
        url = base_url + '/search'

    log.debug("url: %s" % url)

    try:
        content = self._get_content(url)
    except Exception, e:
        self._save_gather_error('Unable to get content for URL: %s: %s' %
                                (url, str(e)), harvest_job)
        return None
def get_controlled_vocabulary_values(vocabulary_id, thesaurus_id, keywords):
    log.debug(
        '::::: Collecting thesaurus data for dcatapit skos {0} from the metadata keywords :::::'
        .format(vocabulary_id))

    values = []

    # Get all the places tag names by the vocabulary id
    tag_names_list = get_vocabulary_tag_names(vocabulary_id)

    if len(tag_names_list) > 0:
        for key in keywords:
            if thesaurus_id and (thesaurus_id in key['thesaurus-identifier']
                                 or thesaurus_id in key['thesaurus-title']):
                for k in key['keyword']:
                    query = Session.query(DCATAPITTagVocabulary) \
                        .filter(DCATAPITTagVocabulary.text==k,
                                DCATAPITTagVocabulary.tag_name.in_(tag_names_list))
                    query = query.autoflush(True)
                    theme = query.first()
                    if theme and theme.tag_name:
                        values.append(theme.tag_name)
    return values
def _search_package(self, identifier):
    name = identifier.lower()
    replace_chars = [',', ':', '.', '/', '-']
    for x in replace_chars:
        name = name.replace(x, '_')
    name = name.replace('oai_ebas_oai_pmh_nilu_no_', '')
    template_name = name[0:42]

    MAX_NUMBER_APPENDED = 999999
    PACKAGE_NAME_MAX_LENGTH = 99
    APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))

    # Find out which package names have been taken. Restrict it to names
    # derived from the ideal name plus numbers added.
    like_q = u'%s%%' % \
        template_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]

    results = Session.query(Package)\
        .filter(Package.name.ilike(like_q))\
        .all()

    if results:
        for package in results:
            package_dict = self._get_package_dict(package)
            extra_identifier = self._get_package_extra(
                package_dict, 'identifier')
            if identifier == extra_identifier:
                return package
        else:
            return None
    else:
        return None
def gather_stage(self, harvest_job):
    log.debug('In SRDAHarvester gather_stage (%s)' % harvest_job.source.url)
    get_all_packages = True
    package_ids = []

    data = urllib2.urlopen(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
    doc = html.parse(data)
    for td in doc.findall("//td[@class='left_p12_title']/a"):
        link = td.get('href')
        if re.match(r"/search/fsciitem", link):
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            package_ids.append(obj.id)

    self._set_config(harvest_job.source.config)

    # Check if this source has been harvested before
    previous_job = Session.query(HarvestJob) \
        .filter(HarvestJob.source==harvest_job.source) \
        .filter(HarvestJob.gather_finished!=None) \
        .filter(HarvestJob.id!=harvest_job.id) \
        .order_by(HarvestJob.gather_finished.desc()) \
        .limit(1).first()

    return package_ids
def tags(self):
    idea_tags = Session.query(IdeaTag) \
        .join(Idea) \
        .filter(Idea.id==self.id) \
        .all()
    return idea_tags
def _get_content(id):
    from ckanext.harvest.model import HarvestObject
    obj = Session.query(HarvestObject).filter(HarvestObject.id == id).first()
    if obj:
        return obj.content
    else:
        return None
def authenticate(self, environ, identity):
    if not 'login' in identity or not 'password' in identity:
        return None

    user = User.by_name(identity.get('login'))
    if user is None:
        log.debug('Login failed - username %r not found',
                  identity.get('login'))
        return None

    seedUser = Session.query(SEEDUser).filter_by(
        name=identity.get('login')).first()
    if seedUser.login_attempts >= 10:
        log.debug('Login as %r failed - account is locked',
                  identity.get('login'))
    elif user.validate_password(identity.get('password')):
        # reset attempt count to 0
        seedUser.login_attempts = 0
        Session.commit()
        return user.name
    else:
        log.debug('Login as %r failed - password not valid',
                  identity.get('login'))
        seedUser.login_attempts += 1
        Session.commit()
    return None
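# A hedged, standalone sketch of the lockout-counter logic above, using a
# plain dict as the user store. Threshold and names are illustrative only.
LOCK_THRESHOLD = 10

def sketch_authenticate(store, login, password, check_password):
    record = store.get(login)
    if record is None:
        return None
    if record['attempts'] >= LOCK_THRESHOLD:
        return None  # locked: don't even check the password
    if check_password(record, password):
        record['attempts'] = 0  # a successful login resets the counter
        return login
    record['attempts'] += 1  # each failure moves closer to lockout
    return None

# store = {'alice': {'attempts': 0, 'pw': 's3cret'}}
# sketch_authenticate(store, 'alice', 'wrong', lambda r, p: r['pw'] == p)
# -> None, and store['alice']['attempts'] == 1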
def tags(self):
    app_tags = Session.query(ApplicationTag) \
        .join(Application) \
        .filter(Application.id==self.id) \
        .all()
    return app_tags
def data(self):
    # Get the Europe dataset
    rootdir = get_root_dir()
    data_file = os.path.join(rootdir, 'ckanext', 'offenedaten',
                             'data', 'eu.json')
    f = open(data_file, 'r')
    o = json.load(f)

    # Get the package count by country
    q = Session.query(
        distinct(PackageExtra.value),
        func.count(PackageExtra.value)
    ).\
        filter(PackageExtra.key == u'eu_country').\
        group_by(PackageExtra.value)
    values = dict(q.all())

    # Set the package count for each country
    for ft in o['features']:
        code = ft['properties']['NUTS']
        ft['properties']['packages'] = values.get(code, 0)

    response.content_type = 'application/json'
    response.pragma = None
    response.cache_control = 'public; max-age: 3600'
    response.cache_expires(seconds=3600)
    return json.dumps(o)
def get_package_ids_in_poly(coords, db_srid):
    """
    TODO: This needs to be removed as the spatial backend is changed to Solr.
    """
    # Build a WKT polygon from the (lat, lon) pairs, emitting "lon lat"
    # points and closing the ring by repeating the first point.
    points = ['%s %s' % (item[1], item[0]) for item in coords['poly']]
    points.append('%s %s' % (coords['poly'][0][1], coords['poly'][0][0]))
    wkt = 'POLYGON ((%s))' % ', '.join(points)
    # bbox_template = Template('POLYGON (($minx $miny, $minx $maxy, $maxx $maxy, $maxx $miny, $minx $miny))')

    input_geometry = WKTSpatialElement(wkt, db_srid)
    extents = Session.query(PackageExtent)\
        .filter(PackageExtent.package_id==Package.id)\
        .filter(PackageExtent.the_geom.intersects(input_geometry))\
        .filter(Package.state==u'active').all()
    ids = [extent.package_id for extent in extents]
    return ids
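# A hedged, standalone check of the WKT construction above: given (lat, lon)
# pairs, emit a closed "lon lat" POLYGON ring. No GIS libraries needed.
def sketch_poly_wkt(poly):
    pts = ['%s %s' % (lat_lon[1], lat_lon[0]) for lat_lon in poly]
    pts.append(pts[0])  # close the ring
    return 'POLYGON ((%s))' % ', '.join(pts)

# sketch_poly_wkt([(60.0, 24.0), (60.0, 25.0), (61.0, 25.0)])
# -> 'POLYGON ((24.0 60.0, 25.0 60.0, 25.0 61.0, 24.0 60.0))'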
def test_harvest_basic(self):
    # Create source
    source_fixture = {"url": u"http://127.0.0.1:8999/waf/index.html",
                      "type": u"gemini-waf"}
    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiWafHarvester()

    # We need to send an actual job, not the dict
    object_ids = harvester.gather_stage(job)
    assert len(object_ids) == 2

    # Fetch stage always returns True for Waf harvesters
    assert harvester.fetch_stage(object_ids) == True

    objects = []
    for object_id in object_ids:
        obj = HarvestObject.get(object_id)
        assert obj
        objects.append(obj)
        harvester.import_stage(obj)

    pkgs = Session.query(Package).all()
    assert len(pkgs) == 2

    pkg_ids = [pkg.id for pkg in pkgs]
    for obj in objects:
        assert obj.current == True
        assert obj.package_id in pkg_ids
def for_select(cls, lang):
    q = Session.query(cls, LocalizedLicenseName.label)\
        .join(LocalizedLicenseName)\
        .filter(LocalizedLicenseName.lang==lang,
                cls.rank_order>1)\
        .order_by(cls.path)
    return list(q)
def update_extents():
    from ckan.model import PackageExtra, Package, Session
    conn = Session.connection()
    packages = [extra.package
                for extra in
                Session.query(PackageExtra).filter(PackageExtra.key == 'spatial').all()]

    errors = []
    count = 0
    for package in packages:
        try:
            value = package.extras['spatial']
            log.debug('Received: %r' % value)
            geometry = json.loads(value)
            count += 1
        except (ValueError, TypeError) as e:
            errors.append(u'Package %s - Error decoding JSON object: %s' %
                          (package.id, six.text_type(e)))
        else:
            # only save the extent when the geometry actually decoded;
            # the original called this unconditionally, which could use an
            # undefined or stale geometry after a decoding error
            save_package_extent(package.id, geometry)

    Session.commit()

    if errors:
        msg = 'Errors were found:\n%s' % '\n'.join(errors)
        print(msg)

    msg = "Done. Extents generated for %i out of %i packages" % (count, len(packages))
    print(msg)
def package_update_minimal(context, data_dict):
    '''Update a dataset (package).

    You must be authorized to edit the dataset and the groups that it belongs
    to.

    It is recommended to call :py:func:`ckan.logic.action.get.package_show`,
    make the desired changes to the result, and then call
    ``package_update()`` with it.

    Plugins may change the parameters of this function depending on the value
    of the dataset's ``type`` attribute, see the
    :py:class:`~ckan.plugins.interfaces.IDatasetForm` plugin interface.

    For further parameters see
    :py:func:`~ckan.logic.action.create.package_create`.

    :param id: the name or id of the dataset to update
    :type id: string

    :returns: the updated dataset (if ``'return_package_dict'`` is ``True`` in
              the context, which is the default. Otherwise returns just the
              dataset id)
    :rtype: dictionary
    '''
    setup()
    package = ''
    fulltext = ''
    old_fulltext = ''
    if data_dict.has_key('extras'):
        contains = _contains_key(data_dict['extras'], 'full_text_search')
        if contains:
            fulltext = contains
            data_dict = _del_extra_field_from_dict(data_dict, 'full_text_search')
            package = update.package_update(context, data_dict)
            old_fulltext = None
            if package.has_key('id'):
                old_fulltext = Session.query(PackageFulltext) \
                    .filter(PackageFulltext.package_id==package['id']) \
                    .first()
            fulltext_dict_save(fulltext, old_fulltext, package, context)
        else:
            package = update.package_update(context, data_dict)
    else:
        package = update.package_update(context, data_dict)

    if check_logged_in(context):
        fulltext = _get_fulltext(package['id'])
        if fulltext:
            fulltext_dict = {'key': 'full_text_search',
                             'value': fulltext.text}
            package['extras'].append(fulltext_dict)
        return package

    minimal_package = _del_extra_field_from_dict(package)
    minimal_package = _del_main_field_from_dict(minimal_package)
    return minimal_package
def _ensure_name_is_unique(ideal_name, existing_name=None,
                           append_type="number-sequence"):
    """
    Returns a dataset name based on the ideal_name, only it will be
    guaranteed to be different than all the other datasets, by adding a
    number on the end if necessary.

    If generating a new name because the title of the dataset has changed,
    specify the existing name, in case the name doesn't need to change
    after all.

    The maximum dataset name length is taken account of.

    :param ideal_name: the desired name for the dataset, if it's not already
                       been taken (usually derived by munging the dataset
                       title)
    :type ideal_name: string
    :param existing_name: the current name of the dataset - only specify
                          this if the dataset exists
    :type existing_name: string
    :param append_type: the type of characters to add to make it unique -
                        either 'number-sequence' or 'random-hex'.
    :type append_type: string
    """
    ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
    if existing_name == ideal_name:
        return ideal_name
    if append_type == "number-sequence":
        MAX_NUMBER_APPENDED = 999
        APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))
    elif append_type == "random-hex":
        APPEND_MAX_CHARS = 5  # 16^5 = 1 million combinations
    else:
        raise NotImplementedError("append_type cannot be %s" % append_type)
    # Find out which package names have been taken. Restrict it to names
    # derived from the ideal name plus numbers added.
    like_q = u"%s%%" % ideal_name[:PACKAGE_NAME_MAX_LENGTH - APPEND_MAX_CHARS]
    name_results = Session.query(Package.name).filter(Package.name.ilike(like_q)).all()
    taken = set([name_result[0] for name_result in name_results])
    if existing_name and existing_name in taken:
        taken.remove(existing_name)
    if ideal_name not in taken:
        # great, the ideal name is available
        return ideal_name
    elif existing_name and existing_name.startswith(ideal_name):
        # the ideal name is not available, but it's an existing dataset with
        # a name based on the ideal one, so there's no point changing it to
        # a different number
        return existing_name
    elif append_type == "number-sequence":
        # find the next available number
        counter = 1
        while counter <= MAX_NUMBER_APPENDED:
            candidate_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH - len(str(counter))] + str(counter)
            if candidate_name not in taken:
                return candidate_name
            counter = counter + 1
        return None
    elif append_type == "random-hex":
        return ideal_name[:PACKAGE_NAME_MAX_LENGTH - APPEND_MAX_CHARS] + \
            str(uuid.uuid4())[:APPEND_MAX_CHARS]
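# A hedged, standalone exercise of the number-sequence strategy above, with
# an in-memory 'taken' set standing in for the Package.name query. The max
# length is shortened to make the truncation behaviour visible.
def sketch_ensure_unique(ideal_name, taken, max_len=10):
    ideal_name = ideal_name[:max_len]
    if ideal_name not in taken:
        return ideal_name
    counter = 1
    while counter <= 999:
        # shave off just enough characters to fit the counter suffix
        candidate = ideal_name[:max_len - len(str(counter))] + str(counter)
        if candidate not in taken:
            return candidate
        counter += 1
    return None

# sketch_ensure_unique('my-dataset', {'my-dataset', 'my-datase1'})
# -> 'my-datase2'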