def score_by_url_extension(resource, score_reasons, log):
    '''
    Looks at the URL for a resource to determine its format and score.
    It adds strings to score_reasons list about how it came to the conclusion.
    Return values:
      * It returns a tuple: (score, format_display_name)
      * If it cannot work out the format then format_display_name is None
      * If it cannot score it, then score is None
    '''
    formats_by_extension = Formats.by_extension()
    extension_variants_ = extension_variants(resource.url.strip())
    if not extension_variants_:
        score_reasons.append('Nepoznata ekstenzija datoteke.')
        return (None, None)
    for extension in extension_variants_:
        # Reuse the dict cached above; the original rebuilt it with a
        # second Formats.by_extension() call on every loop iteration.
        format_ = formats_by_extension.get(extension.lower())
        if format_:
            score = format_['openness']
            score_reasons.append(
                'URL ekstenzija "%s" je povezana s formatom "%s" i ima ocjenu: %s.'
                % (extension, format_['display_name'], score))
            return score, format_['display_name']
        score_reasons.append('URL ekstenzija "%s" je nepoznat format.' % extension)
    return (None, None)
def score_by_format_field(resource, score_reasons, log):
    '''
    Works out a resource's format and openness score from its format field
    alone, appending an explanation of each step to score_reasons.
    Return values:
      * It returns a tuple: (score, format_display_name)
      * If it cannot work out the format then format_display_name is None
      * If it cannot score it, then score is None
    '''
    format_field = resource.format or ''
    if not format_field:
        score_reasons.append('Format field is blank.')
        return (None, None)
    # Try progressively looser matches: exact display name, then file
    # extension, then the "reduced" (normalised) form of the name.
    format_ = (Formats.by_display_name().get(format_field) or
               Formats.by_extension().get(format_field.lower()) or
               Formats.by_reduced_name().get(Formats.reduce(format_field)))
    if format_ is None:
        score_reasons.append(
            'Polje formata "%s" ne odgovara ni jednom poznatom formatu.'
            % format_field)
        return (None, None)
    score = format_['openness']
    score_reasons.append('Polje formata "%s" ima ocijenu otvorenosti: %s.'
                         % (format_field, score))
    return (score, format_['display_name'])
def score_by_format_field(resource, score_reasons, log):
    '''
    Determine the format and openness score of a resource from its format
    field. Reasoning steps are appended to the score_reasons list.
    Return values:
      * It returns a tuple: (score, format_display_name)
      * If it cannot work out the format then format_display_name is None
      * If it cannot score it, then score is None
    '''
    format_field = resource.format or ''
    if not format_field:
        score_reasons.append('Format field is blank.')
        return (None, None)
    # Strictest match first: display name, then extension, then the
    # normalised ("reduced") form of the field.
    format_ = Formats.by_display_name().get(format_field)
    if not format_:
        format_ = Formats.by_extension().get(format_field.lower())
    if not format_:
        format_ = Formats.by_reduced_name().get(Formats.reduce(format_field))
    if not format_:
        score_reasons.append(
            'Polje formata "%s" ne odgovara ni jednom poznatom formatu.'
            % format_field)
        return (None, None)
    score = format_['openness']
    score_reasons.append('Polje formata "%s" ima ocijenu otvorenosti: %s.'
                         % (format_field, score))
    return (score, format_['display_name'])
def run_bsd_file(filepath, log):
    '''Run the BSD command-line tool "file" to determine file type. Returns
    a Format or None if it fails.'''
    result = check_output(['file', filepath])
    # "file" reports the creating application for OLE2 (MS Office) files.
    app_match = re.search('Name of Creating Application: ([^,]*),', result)
    if app_match:
        creator_to_extension = {
            'Microsoft Office PowerPoint': 'ppt',
            'Microsoft PowerPoint': 'ppt',
            'Microsoft Excel': 'xls',
            'Microsoft Office Word': 'doc',
            'Microsoft Word 10.0': 'doc',
            'Microsoft Macintosh Word': 'doc',
        }
        extension = creator_to_extension.get(app_match.group(1))
        if extension is not None:
            format_ = Formats.by_extension()[extension]
            log.info('"file" detected file format: %s',
                     format_['display_name'])
            return format_
    if re.search(': ESRI Shapefile', result):
        format_ = Formats.by_extension()['shp']
        log.info('"file" detected file format: %s', format_['display_name'])
        return format_
    log.info('"file" could not determine file format of "%s": %s',
             filepath, result)
def test_match(self):
    # (raw value, expected canonised display name)
    cases = [
        ('xls', 'XLS'), ('.xls', 'XLS'), ('.XLS', 'XLS'),
        ('csv', 'CSV'),
        ('.html', 'HTML'), ('html', 'HTML'),
        ('rdf/xml', 'RDF'), ('rdf', 'RDF'), ('.rdf', 'RDF'), ('.RDF', 'RDF'),
        ('pdf', 'PDF'), ('PDF ', 'PDF'),
        ('ppt', 'PPT'), ('odp', 'ODP'), ('shp', 'SHP'), ('kml', 'KML'),
        ('doc', 'DOC'), ('json', 'JSON'),
    ]
    for raw, expected_match in cases:
        matched = Formats.match(raw)
        assert matched, raw
        assert_equal(matched['display_name'], expected_match)
def test_match(self):
    # Map of raw format strings to the display name match() should return.
    expected = {
        'xls': 'XLS', '.xls': 'XLS', '.XLS': 'XLS',
        'csv': 'CSV',
        '.html': 'HTML', 'html': 'HTML',
        'rdf/xml': 'RDF', 'rdf': 'RDF', '.rdf': 'RDF', '.RDF': 'RDF',
        'pdf': 'PDF', 'PDF ': 'PDF',
        'ppt': 'PPT', 'odp': 'ODP', 'shp': 'SHP', 'kml': 'KML',
        'doc': 'DOC', 'json': 'JSON',
    }
    for raw in expected:
        matched = Formats.match(raw)
        assert matched, raw
        assert_equal(matched['display_name'], expected[raw])
def run_bsd_file(filepath, log):
    '''Run the BSD command-line tool "file" to determine file type. Returns
    a Format or None if it fails.'''
    result = check_output(['file', filepath])
    # OLE2 containers (MS Office) embed the name of the creating app.
    match = re.search('Name of Creating Application: ([^,]*),', result)
    if match:
        app_name = match.group(1)
        format_map = {
            'Microsoft Office PowerPoint': 'ppt',
            'Microsoft PowerPoint': 'ppt',
            'Microsoft Excel': 'xls',
            'Microsoft Office Word': 'doc',
            'Microsoft Word 10.0': 'doc',
            'Microsoft Macintosh Word': 'doc',
        }
        if app_name in format_map:
            format_ = Formats.by_extension()[format_map[app_name]]
            log.info('"file" detected file format: %s',
                     format_['display_name'])
            return format_
    # Shapefiles are identified directly in the description.
    if re.search(': ESRI Shapefile', result) is not None:
        format_ = Formats.by_extension()['shp']
        log.info('"file" detected file format: %s', format_['display_name'])
        return format_
    log.info('"file" could not determine file format of "%s": %s',
             filepath, result)
def get_xml_variant_without_xml_declaration(buf, log):
    '''If this buffer is in a format based on XML, without any XML
    declaration or other boilerplate, return the format type.'''
    # Tolerate up to 3 junk bytes (e.g. a BOM) before the first tag.
    match = re.match('.{0,3}\s*<([^>\s]*)', buf)
    if not match:
        log.debug('XML tags not found: %s', buf)
        return
    tag = match.groups()[-1].lower()
    # Normalise known root tags to their corresponding format extension.
    tag = tag.replace('rdf:rdf', 'rdf').replace('wms_capabilities', 'wms')
    formats_by_extension = Formats.by_extension()
    if tag in formats_by_extension:
        format_ = formats_by_extension[tag]
        log.info('XML variant detected: %s', format_['display_name'])
        return format_
    # Unknown root tag: fall back to generic XML.
    log.warning('Did not recognise XML format: %s', tag)
    return Formats.by_extension()['xml']
def is_html(buf, log):
    '''If this buffer is HTML, return that format type, else None.'''
    # Optional XML declaration and doctype may precede the <html> tag.
    html_re = '.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<html[^>]*>'
    if re.match(html_re, buf, re.IGNORECASE):
        log.info('HTML tag detected')
        return Formats.by_extension()['html']
    log.debug('Not HTML')
def is_iati(buf, log):
    '''If this buffer is IATI format, return that format type, else None.'''
    # Optional XML declaration/doctype may precede the IATI root element.
    xml_re = '.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<iati-(activities|organisations)[^>]*>'
    match = re.match(xml_re, buf, re.IGNORECASE)
    if match:
        log.info('IATI tag detected')
        return Formats.by_extension()['iati']
    # Bug fix: the original called log.debug('Not IATI', buf) - an extra
    # %-format argument with no placeholder in the message, which makes
    # the logging module report a string-formatting error.
    log.debug('Not IATI: %s', buf)
def get_xml_variant_without_xml_declaration(buf, log):
    '''If this buffer is in a format based on XML, without any XML
    declaration or other boilerplate, return the format type.'''
    match = re.match('.{0,3}\s*<([^>\s]*)', buf)
    if match:
        # The first tag name identifies the XML dialect.
        root_tag = match.groups()[-1].lower()
        root_tag = root_tag.replace('rdf:rdf', 'rdf')
        root_tag = root_tag.replace('wms_capabilities', 'wms')
        if root_tag in Formats.by_extension():
            format_ = Formats.by_extension()[root_tag]
            log.info('XML variant detected: %s', format_['display_name'])
        else:
            # Unrecognised dialect - report it and treat as plain XML.
            log.warning('Did not recognise XML format: %s', root_tag)
            format_ = Formats.by_extension()['xml']
        return format_
    log.debug('XML tags not found: %s', buf)
def _clean_format(cls, format_string):
    # Non-string values (e.g. None) pass straight through unchanged.
    if not isinstance(format_string, basestring):
        return format_string
    matched_format = Formats.match(format_string)
    if matched_format:
        # Known format: canonicalise to its display name.
        return matched_format['display_name']
    # Unknown format: drop disallowed characters and surrounding space.
    return re.sub(cls._disallowed_characters, '', format_string).strip()
def test_fugue_icons_exist(self):
    # Walk up from /ckanext/dgu/tests/lib to /ckanext/dgu, then down
    # into the fugue icon folder.
    path = os.path.dirname(__file__)
    path = os.path.dirname(path)
    path = os.path.dirname(path)
    path = os.path.join(path, 'theme', 'public', 'images', 'fugue')
    assert os.path.isdir(path)
    files = os.listdir(path)
    # The default icon must exist.
    assert 'document.png' in files, 'document.png not found in %s' % path
    # Every format that declares an icon must have a matching .png file.
    for fmt in Formats.by_display_name().values():
        if fmt['icon'] == '':
            continue
        icon_name = '%s.png' % fmt['icon']
        assert icon_name in files, '%s not found in %s' % (icon_name, path)
def score_by_url_extension(resource, score_reasons, log):
    '''
    Looks at the URL for a resource to determine its format and score.
    It adds strings to score_reasons list about how it came to the conclusion.
    Return values:
      * It returns a tuple: (score, format_display_name)
      * If it cannot work out the format then format_display_name is None
      * If it cannot score it, then score is None
    '''
    formats_by_extension = Formats.by_extension()
    extension_variants_ = extension_variants(resource.url.strip())
    if not extension_variants_:
        score_reasons.append('Nepoznata ekstenzija datoteke.')
        return (None, None)
    for extension in extension_variants_:
        # Fix: use the table fetched once above instead of calling
        # Formats.by_extension() a second time on each iteration.
        format_ = formats_by_extension.get(extension.lower())
        if format_:
            score = format_['openness']
            score_reasons.append(
                'URL ekstenzija "%s" je povezana s formatom "%s" i ima ocjenu: %s.'
                % (extension, format_['display_name'], score))
            return score, format_['display_name']
        score_reasons.append('URL ekstenzija "%s" je nepoznat format.' % extension)
    return (None, None)
def test_fugue_icons_exist(self):
    # Derive /ckanext/dgu/theme/public/images/fugue relative to this
    # test module (/ckanext/dgu/tests/lib).
    tests_lib_dir = os.path.dirname(__file__)
    dgu_dir = os.path.dirname(os.path.dirname(tests_lib_dir))
    path = os.path.join(dgu_dir, 'theme', 'public', 'images', 'fugue')
    assert os.path.isdir(path)
    files = os.listdir(path)
    # Default icon must be present.
    assert 'document.png' in files, 'document.png not found in %s' % path
    # Each format with a declared icon needs a matching png in the folder.
    for fmt in Formats.by_display_name().values():
        if fmt['icon'] == '':
            continue
        icon_name = fmt['icon'] + '.png'
        assert icon_name in files, '%s not found in %s' % (icon_name, path)
def test_by_mime_type(self):
    # The JSON mime type must map to the JSON format.
    json_format = Formats.by_mime_type()['text/x-json']
    assert_equal(json_format['display_name'], 'JSON')
def test_reduce(self):
    # reduce() should lower-case and strip punctuation/whitespace.
    reduced = Formats.reduce('.XLS ')
    assert_equal(reduced, 'xls')
def test_by_mime_type(self):
    # Look up by mime type and check the resulting display name.
    formats = Formats.by_mime_type()
    assert_equal(formats['text/x-json']['display_name'], 'JSON')
def set_sniffed_format(format_display_name):
    '''Set the module-level sniffed_format to the Format with the given
    display name, or clear it when the name is falsy.'''
    global sniffed_format
    sniffed_format = (Formats.by_display_name()[format_display_name]
                      if format_display_name else None)
def test_by_display_name(self):
    # The JSON display name must map back to the 'json' extension.
    json_format = Formats.by_display_name()['JSON']
    assert_equal(json_format['extension'], 'json')
def test_reduce(self):
    # Leading dot, upper case and trailing space are all normalised away.
    assert_equal(Formats.reduce('.XLS '), 'xls')
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.
    Returns Format dict with a key to say if it is contained in a zip or
    something.
    e.g. {'display_name': 'CSV', 'container': 'zip', ...}
    or None if it can\'t tell what it is.
    Note, log is a logger, either a Celery one or a standard Python logging
    one.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # magic needs a byte-string path under Python 2.
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    # First pass: libmagic's mime-type guess.
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Handle mime types that need further disambiguation before the
        # generic mime-type -> Format lookup below.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']
        if format_:
            return format_
        # Generic mime-type -> Format lookup.
        format_ = Formats.by_mime_type().get(mime_type)
        if not format_:
            if mime_type.startswith('text/'):
                # Unknown text type: sniff the content directly.
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
            if not format_:
                log.warning('Mimetype not recognised by CKAN as a data format: %s',
                            mime_type)
        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])
            # TXT and HTML are catch-alls: look deeper at the content.
            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']
            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']
    else:
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
    # BSD file picks up some files that Magic misses
    # e.g. some MS Word files
    if not format_:
        format_ = run_bsd_file(filepath, log)
    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
def test_by_extension(self):
    # The 'json' extension must map to the JSON format.
    json_format = Formats.by_extension()['json']
    assert_equal(json_format['display_name'], 'JSON')
try: filenames = zip.namelist() finally: zip.close() except zipfile.BadZipfile, e: log.info('Zip file open raised error %s: %s', e, e.args) return except Exception, e: log.warning('Zip file open raised exception %s: %s', e, e.args) return top_score = 0 top_scoring_extension_counts = defaultdict( int) # extension: number_of_files for filename in filenames: extension = os.path.splitext(filename)[-1][1:].lower() if extension in Formats.by_extension(): format_ = Formats.by_extension()[extension] if format_['openness'] > top_score: top_score = format_['openness'] top_scoring_extension_counts = defaultdict(int) if format_['openness'] == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return Formats.by_display_name()['Zip'] top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1])
def get_package_dict(self, harvest_object, package_dict_defaults,
                     source_config, existing_dataset):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details
    * name - a new package must have a unique name; if it had a name in the
      previous harvest, that will be in the package_dict_defaults.
    * resource.id - should be the same as the old object if updating a
      package
    * errors - call self._save_object_error() and return False
    * default values for name, owner_org, tags etc can be merged in using:
        package_dict = package_dict_defaults.merge(package_dict_harvested)
    '''
    inv_dataset = InventoryDocument.dataset_to_dict(
        InventoryDocument.parse_xml_string(harvest_object.content))
    pkg = dict(
        title=inv_dataset['title'],
        notes=inv_dataset['description'],
        state='active' if inv_dataset['active'] else 'deleted',
        resources=[],
        extras={self.IDENTIFIER_KEY: inv_dataset['identifier'],
                'harvest_source_reference': harvest_object.guid})

    # License: match the rights URL against OGL or the license register.
    rights = inv_dataset.get('rights')
    if rights:
        register = model.Package.get_license_register()
        if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
            pkg['license_id'] = 'uk-ogl'
        else:
            for l in register.values():
                if l.url == rights:
                    pkg['license_id'] = l.id
                    break
            else:
                # just save it as it is
                # Bug fix: the original stored and logged the whole license
                # *register* here instead of the unrecognised rights value.
                pkg['license_id'] = rights
                log.info('Did not recognize license %r', rights)
    else:
        pkg['license_id'] = None

    # Resources: only active ones; keep ids of existing resources so that
    # an update does not recreate them.
    inv_resources = [r for r in inv_dataset['resources'] if r['active']]
    existing_resource_urls = dict((r.url, r.id)
                                  for r in existing_dataset.resources) \
        if existing_dataset else {}
    pkg['resources'] = []
    for inv_resource in inv_resources:
        format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
        if format_:
            format_ = format_['display_name']
        else:
            # Unknown mime type: fall back to the raw mimetype string.
            format_ = inv_resource['mimetype']
        description = inv_resource['title']
        if inv_resource['availability']:
            description += ' - %s' % inv_resource['availability']
        # if it is temporal, it should be a timeseries,
        # if it is not data, it should be an additional resource
        resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
            else 'documentation'
        # Schema
        if inv_resource['conforms_to']:
            schema_url = inv_resource['conforms_to']
            schema_type = SCHEMA_TYPE_MAP.get(format_)
        else:
            schema_url = schema_type = ''
        res = {'url': inv_resource['url'],
               'format': format_,
               'description': description,
               'resource_type': resource_type,
               'schema-url': schema_url,
               'schema-type': schema_type,
               }
        if res['url'] in existing_resource_urls:
            res['id'] = existing_resource_urls[res['url']]
        pkg['resources'].append(res)

    # Local Authority Services and Functions
    if inv_dataset['services']:
        log.info('Local Authority Services: %r', inv_dataset['services'])
        # e.g. {http://id.esd.org.uk/service/190}
        pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
    else:
        pkg['extras']['la_service'] = ''
    if inv_dataset['functions']:
        log.info('Local Authority Functions %r', inv_dataset['functions'])
        pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
    else:
        pkg['extras']['la_function'] = ''

    pkg = package_dict_defaults.merge(pkg)
    if not pkg.get('name'):
        # append the publisher name to differentiate similar titles better
        # than just a numbers suffix
        publisher = model.Group.get(harvest_object.job.source.publisher_id)
        publisher_abbrev = self._get_publisher_abbreviation(publisher)
        pkg['name'] = self.check_name(self.munge_title_to_name(
            '%s %s' % (pkg['title'], publisher_abbrev)))

    # Themes based on services/functions
    if 'tags' not in pkg:
        pkg['tags'] = []
    themes = dgutheme.categorize_package(pkg)
    log.debug('%s given themes: %r', pkg['name'], themes)
    if themes:
        pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
        if len(themes) == 2:
            pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

    pkg['extras'] = self.extras_from_dict(pkg['extras'])
    return pkg
def test_by_extension(self):
    # Extension lookup should resolve to the JSON display name.
    formats = Formats.by_extension()
    assert_equal(formats['json']['display_name'], 'JSON')
def get_pkg_dict(cls, resource, ldr):
    # Build a CKAN package dict (and its extras) for a Linked Data Registry
    # resource. Returns (pkg_dict, action) where action is 'new' or 'update'.
    from ckan import model
    pkg_dict = OrderedDict()
    extras = OrderedDict()
    uri = str(resource.identifier)
    pkg_dict['title'] = unicode(resource[RDFS.label].next())
    extras['registry_uri'] = uri
    # Create or update?
    # A package with a matching 'registry_uri' extra means this registry
    # resource has been harvested before.
    pkg = model.Session.query(model.Package) \
        .filter_by(state='active') \
        .join(model.PackageExtra) \
        .filter_by(state='active') \
        .filter_by(key='registry_uri') \
        .filter_by(value=uri).first()
    if pkg:
        pkg_dict['id'] = pkg.id
        pkg_dict['name'] = pkg.name
        action = 'update'
    else:
        pkg_dict['id'] = unicode(uuid.uuid4())
        pkg_dict['name'] = cls._gen_new_name(pkg_dict['title'])
        action = 'new'
    dgu_type = cls.get_dgu_type(resource)
    extras['data_standard_type'] = dgu_type
    # Data resources: each serialization, with and without metadata.
    resources = []
    for format_display, format_extension, format_dgu in (
            ('RDF ttl', 'ttl', 'RDF'),
            ('RDF/XML', 'rdf', 'RDF'),
            ('JSON-LD', 'jsonld', 'JSON')):
        url = uri + '?_format=%s' % format_extension
        assert format_dgu in Formats().by_display_name()
        resources.append({
            'description': '%s as %s' % (dgu_type, format_display),
            'url': url,
            'format': format_dgu,
            'resource_type': 'file'})
        resources.append({
            'description': '%s and metadata as %s' % (dgu_type,
                                                      format_display),
            'url': url + METADATA_PARAM,
            'format': format_dgu,
            'resource_type': 'file'})
    pkg_dict['notes'] = unicode(resource[DCT.description].next())
    licence_url = str(resource[DCT.license].next())
    if 'open-government-licence' in licence_url:
        # NOTE(review): key is spelt 'licence_id' here but 'license_id'
        # in the harvester above - confirm which spelling the schema uses.
        pkg_dict['licence_id'] = 'uk-ogl'
    else:
        extras['licence_url'] = licence_url  # not sure how this will display as just as URL
    pkg_dict['owner_org'] = cls.get_publisher(resource).id
    # Documentation resource: the registry's human-readable page.
    resources.append({
        'description': 'Web page for this %s on a Linked Data Registry'
        % dgu_type,
        'url': uri,
        'format': 'HTML',
        'resource_type': 'documentation'})
    metadata = cls.get_resource_metadata(uri)
    status = metadata[REG.status].next()
    # Keep only the fragment, e.g. '...#statusStable' -> 'statusStable'.
    extras['status'] = str(status).split('#')[-1]
    extras['harvested_version'] = str(metadata[OWL.versionInfo].next())
    # NOTE(review): duplicate assignment - data_standard_type was already
    # set to the same value earlier in this method.
    extras['data_standard_type'] = dgu_type
    pkg_dict['type'] = 'data-standard'
    pkg_dict['extras'] = [{'key': k, 'value': v}
                          for k, v in extras.items()]
    pkg_dict['resources'] = resources
    return pkg_dict, action
def test_by_display_name(self):
    # Display-name lookup should expose the canonical extension.
    formats = Formats.by_display_name()
    assert_equal(formats['JSON']['extension'], 'json')
filenames = zip.namelist() finally: zip.close() except zipfile.BadZipfile, e: log.info('Zip file open raised error %s: %s', e, e.args) return except Exception, e: log.warning('Zip file open raised exception %s: %s', e, e.args) return top_score = 0 top_scoring_extension_counts = defaultdict(int) # extension: number_of_files for filename in filenames: extension = os.path.splitext(filename)[-1][1:].lower() if extension in Formats.by_extension(): format_ = Formats.by_extension()[extension] if format_['openness'] > top_score: top_score = format_['openness'] top_scoring_extension_counts = defaultdict(int) if format_['openness'] == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return Formats.by_display_name()['Zip'] top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1]) top_extension = top_scoring_extension_counts[-1][0]
def get_package_dict(self, harvest_object, package_dict_defaults,
                     source_config, existing_dataset):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details
    * name - a new package must have a unique name; if it had a name in the
      previous harvest, that will be in the package_dict_defaults.
    * resource.id - should be the same as the old object if updating a
      package
    * errors - call self._save_object_error() and return False
    * default values for name, owner_org, tags etc can be merged in using:
        package_dict = package_dict_defaults.merge(package_dict_harvested)
    '''
    inv_dataset = InventoryDocument.dataset_to_dict(
        InventoryDocument.parse_xml_string(harvest_object.content))
    pkg = dict(
        title=inv_dataset['title'],
        notes=inv_dataset['description'],
        state='active' if inv_dataset['active'] else 'deleted',
        resources=[],
        extras={self.IDENTIFIER_KEY: inv_dataset['identifier'],
                'harvest_source_reference': harvest_object.guid})

    # License: OGL short-cut first, then search the license register.
    rights = inv_dataset.get('rights')
    if rights:
        register = model.Package.get_license_register()
        if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
            pkg['license_id'] = 'uk-ogl'
        else:
            for l in register.values():
                if l.url == rights:
                    pkg['license_id'] = l.id
                    break
            else:
                # just save it as it is
                # Bug fix: the original assigned and logged `register` (the
                # whole license register) rather than the rights value.
                pkg['license_id'] = rights
                log.info('Did not recognize license %r', rights)
    else:
        pkg['license_id'] = None

    # Resources: active only; reuse ids of resources already stored.
    inv_resources = [r for r in inv_dataset['resources'] if r['active']]
    existing_resource_urls = dict((r.url, r.id)
                                  for r in existing_dataset.resources) \
        if existing_dataset else {}
    pkg['resources'] = []
    for inv_resource in inv_resources:
        format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
        if format_:
            format_ = format_['display_name']
        else:
            # Unknown mime type: keep the raw mimetype string.
            format_ = inv_resource['mimetype']
        description = inv_resource['title']
        if inv_resource['availability']:
            description += ' - %s' % inv_resource['availability']
        # if it is temporal, it should be a timeseries,
        # if it is not data, it should be an additional resource
        resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
            else 'documentation'
        # Schema
        if inv_resource['conforms_to']:
            schema_url = inv_resource['conforms_to']
            schema_type = SCHEMA_TYPE_MAP.get(format_)
        else:
            schema_url = schema_type = ''
        res = {'url': inv_resource['url'],
               'format': format_,
               'description': description,
               'resource_type': resource_type,
               'schema-url': schema_url,
               'schema-type': schema_type,
               }
        if res['url'] in existing_resource_urls:
            res['id'] = existing_resource_urls[res['url']]
        pkg['resources'].append(res)

    # Local Authority Services and Functions
    if inv_dataset['services']:
        log.info('Local Authority Services: %r', inv_dataset['services'])
        # e.g. {http://id.esd.org.uk/service/190}
        pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
    else:
        pkg['extras']['la_service'] = ''
    if inv_dataset['functions']:
        log.info('Local Authority Functions %r', inv_dataset['functions'])
        pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
    else:
        pkg['extras']['la_function'] = ''

    pkg = package_dict_defaults.merge(pkg)
    if not pkg.get('name'):
        # append the publisher name to differentiate similar titles better
        # than just a numbers suffix
        publisher = model.Group.get(harvest_object.job.source.publisher_id)
        publisher_abbrev = self._get_publisher_abbreviation(publisher)
        pkg['name'] = self.check_name(self.munge_title_to_name(
            '%s %s' % (pkg['title'], publisher_abbrev)))

    # Themes based on services/functions
    if 'tags' not in pkg:
        pkg['tags'] = []
    themes = dgutheme.categorize_package(pkg)
    log.debug('%s given themes: %r', pkg['name'], themes)
    if themes:
        pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
        if len(themes) == 2:
            pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

    pkg['extras'] = self.extras_from_dict(pkg['extras'])
    return pkg