Ejemplo n.º 1
0
 def test_munge_tag_multiple_pass(self):
     '''Each tag munges to its expected value, and munging again keeps it.'''
     for original, expected in self.munge_list:
         # first pass: raw tag -> expected munged form
         once = munge_tag(original)
         assert_equal(once, expected)
         # second pass: the already-munged tag must come out unchanged
         twice = munge_tag(once)
         assert_equal(twice, expected)
Ejemplo n.º 2
0
def test_munge_tag_multiple_pass(original, expected):
    """Check a tag munges to ``expected`` and stays there when munged again."""
    munged_once = munge_tag(original)
    assert munged_once == expected

    # Feeding the munged tag back in must give the same result.
    munged_twice = munge_tag(munged_once)
    assert munged_twice == expected
Ejemplo n.º 3
0
 def test_munge_tag_muliple_pass(self):
     '''Munging each listed tag once, then twice, gives the expected value.'''
     for raw_tag, expected in self.munge_list:
         once = munge_tag(raw_tag)
         nose_tools.assert_equal(once, expected)
         # a second pass over the munged tag must not change it
         twice = munge_tag(once)
         nose_tools.assert_equal(twice, expected)
Ejemplo n.º 4
0
    def _extract_tags_and_extras(self, content):
        '''Split a metadata dict into munged tags and ``(key, value)`` extras.

        Keys found among the values of ``self._get_mapping()`` are skipped.
        The ``type`` and ``subject`` values become tags (a non-list value is
        split on ``;``).  Every other key becomes an extra: a list value is
        reduced to its first element, empty values are stored as ``None``,
        and values of keys ending in ``date`` are re-serialised as ISO
        strings without timezone (the entry is dropped if it cannot be
        parsed).

        :param content: dict of metadata fields
        :returns: tuple ``(tags, extras)``; each tag is truncated to 100
            characters before being passed to ``munge_tag``
        '''
        # Look the mapping up once instead of once per key.
        mapped_keys = list(self._get_mapping().values())

        extras = []
        tags = []
        for key, value in content.items():
            if key in mapped_keys:
                continue
            if key in ('type', 'subject'):
                if isinstance(value, list):
                    tags.extend(value)
                else:
                    tags.extend(value.split(';'))
                continue
            if value and isinstance(value, list):
                value = value[0]
            if not value:
                value = None
            if key.endswith('date') and value:
                # the ckan indexer can't handle timezone-aware datetime objects
                try:
                    from dateutil.parser import parse
                    date_value = parse(value)
                    date_without_tz = date_value.replace(tzinfo=None)
                    value = date_without_tz.isoformat()
                except (ValueError, TypeError):
                    # unparseable date: leave this key out of the extras
                    continue

            extras.append((key, value))

        tags = [munge_tag(tag[:100]) for tag in tags]

        return (tags, extras)
Ejemplo n.º 5
0
    def _extract_tags_and_extras(self, content):
        '''Derive CKAN tags and extras from a metadata dict.

        Keys that occur among the values of ``self._get_mapping()`` are
        ignored here.  ``type`` and ``subject`` feed the tag list (string
        values are split on ``;``); all remaining keys are emitted as
        extras, using the first element of list values and ``None`` for
        empty values.  For keys ending in ``date`` the value is parsed and
        written back as an ISO string with the timezone removed; entries
        whose date cannot be parsed are dropped.

        :param content: dict of metadata fields
        :returns: tuple ``(tags, extras)`` with ``tags`` a list of munged
            names (input truncated to 100 characters) and ``extras`` a list
            of ``(key, value)`` tuples
        '''
        # Build the list of mapped keys once rather than on every iteration.
        mapped_keys = list(self._get_mapping().values())

        extras = []
        tags = []
        for key, value in content.items():
            if key in mapped_keys:
                continue
            if key in ('type', 'subject'):
                if isinstance(value, list):
                    tags.extend(value)
                else:
                    tags.extend(value.split(';'))
                continue
            if value and isinstance(value, list):
                value = value[0]
            if not value:
                value = None
            if key.endswith('date') and value:
                # the ckan indexer can't handle timezone-aware datetime objects
                try:
                    from dateutil.parser import parse
                    date_value = parse(value)
                    date_without_tz = date_value.replace(tzinfo=None)
                    value = date_without_tz.isoformat()
                except (ValueError, TypeError):
                    # skip extras whose date value cannot be parsed
                    continue

            extras.append((key, value))

        tags = [munge_tag(tag[:100]) for tag in tags]

        return (tags, extras)
Ejemplo n.º 6
0
    def _clean_tags(self, tags):
        '''Munge tag names and drop tags whose munged name is empty.

        Two input shapes are handled:

        * a list of tag dicts (package_show form): each dict's ``name`` is
          replaced in place by its munged form and the same dict objects
          are returned;
        * a list of strings (REST form): the munged names are returned
          with duplicates removed (order is not preserved).

        :param tags: list of tag dicts or list of tag name strings
        :returns: list of cleaned tags in the same shape as the input
        '''
        try:
            # assume it's in the package_show form
            cleaned = []
            for tag_dict in tags:
                # munge once and reuse the result for both filter and update
                name = munge_tag(tag_dict['name'])
                if name != '':
                    tag_dict['name'] = name
                    cleaned.append(tag_dict)
            return cleaned
        except TypeError:  # raised by tag_dict['name'] when the tag is a string
            # REST format: 'tags' is a list of strings
            munged = (munge_tag(tag) for tag in tags)
            return list(set(name for name in munged if name != ''))
Ejemplo n.º 7
0
    def _clean_tags(self, tags):
        '''Return ``tags`` with munged names, without empty-named entries.

        ``tags`` is first treated as a list of dicts (package_show form):
        every dict gets its ``name`` overwritten with the munged name and is
        kept if that name is non-empty.  If indexing a tag with ``'name'``
        raises ``TypeError`` the list is treated as plain strings (REST
        form) instead; the munged, non-empty names are then de-duplicated
        through a ``set``, so their order is not preserved.

        :param tags: list of tag dicts or list of tag name strings
        :returns: list of cleaned tag dicts or cleaned tag names
        '''
        try:
            # assume it's in the package_show form
            kept = []
            for tag in tags:
                munged_name = munge_tag(tag['name'])  # munge only once
                if munged_name == '':
                    continue
                tag['name'] = munged_name
                kept.append(tag)
        except TypeError:  # a TypeError is raised if `tag` above is a string
            # REST format: 'tags' is a list of strings
            names = set()
            for tag in tags:
                munged_name = munge_tag(tag)
                if munged_name != '':
                    names.add(munged_name)
            kept = list(names)

        return kept
Ejemplo n.º 8
0
def map_to_ogdch_keywords(geocat_keywords):
    """Group munged keywords by language.

    :param geocat_keywords: iterable of dicts mapping a language code to a
        keyword string
    :returns: dict with the keys ``fr``, ``de``, ``en`` and ``it``, each
        holding the list of munged keywords for that language; empty
        keywords, the keyword ``opendata.swiss`` and other languages are
        left out
    """
    languages = ('fr', 'de', 'en', 'it')
    ogdch_keywords = {lang: [] for lang in languages}

    for entry in geocat_keywords:
        for lang, text in entry.items():
            if text == 'opendata.swiss' or lang not in languages:
                continue
            if not text:
                continue
            ogdch_keywords[lang].append(munge_tag(text))

    return ogdch_keywords
Ejemplo n.º 9
0
    def _keywords(self, subject, predicate):
        """Collect the munged keywords of ``subject`` grouped by language.

        :returns: dict mapping each node's ``language`` attribute to the
            list of munged keyword strings found for it
        """
        by_language = {}
        for node in self.g.objects(subject, predicate):
            node_language = node.language
            munged = munge_tag(unicode(node))
            if node_language not in by_language:
                by_language[node_language] = []
            by_language[node_language].append(munged)
        return by_language
    def import_stage(self, harvest_object):
        '''Turn the content of ``harvest_object`` into a package.

        The content is loaded through ``DdiCkanMetadata``, enriched with
        the catalog URL, a license, munged tags and two resources, and
        handed to ``self._create_or_update_package``.

        :param harvest_object: the harvest object to import
        :returns: the result of ``_create_or_update_package``, or ``False``
            when no harvest object was given or any exception occurred
            (the error is recorded via ``_save_object_error``)
        '''
        log.debug('In NadaHarvester import_stage')

        # This guard has to run before harvest_object is dereferenced,
        # otherwise a missing object raises AttributeError instead.
        if not harvest_object:
            log.error('No harvest object received')
            self._save_object_error('No harvest object received',
                                    harvest_object)
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            base_url = harvest_object.source.url.rstrip('/')
            ckan_metadata = DdiCkanMetadata()
            pkg_dict = ckan_metadata.load(harvest_object.content)
            pkg_dict = self._convert_to_extras(pkg_dict)

            # update URL with NADA catalog link
            catalog_path = self._get_catalog_path(harvest_object.guid)
            pkg_dict['url'] = base_url + catalog_path

            # set license from harvester config or use CKAN instance default
            if 'license' in self.config:
                pkg_dict['license_id'] = self.config['license']
            else:
                pkg_dict['license_id'] = config.get(
                    'ckanext.ddi.default_license', '')

            # keep only string tags, truncated to 100 characters and munged
            pkg_dict['tags'] = [
                munge_tag(tag[:100])
                for tag in pkg_dict['tags']
                if isinstance(tag, basestring)
            ]
            pkg_dict['version'] = pkg_dict['version'][:100]

            # add resources
            resources = [
                {
                    'url': base_url + self._get_ddi_api(harvest_object.guid),
                    'name': 'DDI XML of %s' % pkg_dict['title'],
                    'format': 'xml'
                },
                {
                    'url': pkg_dict['url'],
                    'name': 'NADA catalog entry',
                    'format': 'html'
                },
            ]
            pkg_dict['resources'] = resources

            log.debug('package dict: %s', pkg_dict)
            return self._create_or_update_package(pkg_dict, harvest_object)
        except Exception as e:
            # Top-level boundary of the import: record the failure on the
            # harvest object instead of letting it propagate.
            self._save_object_error(('Exception in import stage: %r / %s' %
                                     (e, traceback.format_exc())),
                                    harvest_object)
            return False
    def _keywords(self, subject, predicate):
        """Return the munged keywords of ``subject`` grouped by language.

        Every language returned by ``dh.get_langs()`` is present in the
        result, with an empty list when no keyword was found for it.
        """
        # start with an empty list for each language from dh.get_langs()
        by_language = {lang: [] for lang in dh.get_langs()}

        for node in self.g.objects(subject, predicate):
            node_language = node.language
            munged = munge_tag(unicode(node))
            # languages outside dh.get_langs() get their own entry
            by_language.setdefault(node_language, []).append(munged)

        return by_language
Ejemplo n.º 12
0
def do_load_regions(g, vocab_name):
    """Build a concept dict for every region subject found in graph ``g``.

    :param g: graph queried through ``subjects`` and ``objects``
    :param vocab_name: not used by this function
    :returns: list of ``{'name': ..., 'labels': [...]}`` dicts, where each
        label carries the ``lang`` and ``text`` of one name object
    """
    concepts = []

    for region in g.subjects(None, URIRef(REGION_TYPE)):
        name_nodes = list(g.objects(region, URIRef(NAME_TYPE)))
        # the last path segment of the region reference is the identifier
        concept = {
            'name': munge_tag(region.split('/')[-1]),
            'labels': [
                {'lang': node.language, 'text': node.value}
                for node in name_nodes
            ],
        }
        concepts.append(concept)

    log.info(f'Loaded {len(concepts)} regions')
    return concepts
    def validator(key, data, errors, context):
        """Munge every keyword stored as JSON under ``data[key]``.

        Does nothing when ``errors[key]`` already holds errors.  Otherwise
        ``data[key]`` is rewritten as a JSON object with one list per
        language in ``schema['form_languages']``; languages missing from
        the input get an empty list.
        """
        if errors[key]:
            return

        submitted = json.loads(data[key])
        munged_by_lang = {}
        for lang in schema['form_languages']:
            if lang in submitted.keys():
                munged_by_lang[lang] = [
                    munge_tag(keyword) for keyword in submitted[lang]
                ]
            else:
                munged_by_lang[lang] = []

        data[key] = json.dumps(munged_by_lang)
Ejemplo n.º 14
0
def do_load_regions(g, vocab_name):
    """Collect region identifiers and their localized labels from ``g``.

    :param g: graph queried through ``subjects`` and ``objects``
    :param vocab_name: not used by this function
    :returns: tuple ``(pref_labels, concepts)``; ``concepts`` is the list
        of munged region identifiers and ``pref_labels`` holds one
        ``name`` / ``lang`` / ``localized_text`` dict per name object
    """
    concepts = []
    pref_labels = []
    region_type = URIRef(REGION_TYPE)

    for region in g.subjects(None, region_type):
        name_nodes = list(g.objects(region, URIRef(NAME_TYPE)))
        # the last path segment of the region reference is the identifier
        identifier = munge_tag(unicode(region).split('/')[-1])

        concepts.append(identifier)
        pref_labels.extend(
            {'name': identifier,
             'lang': node.language,
             'localized_text': node.value}
            for node in name_nodes)

    region_count = len(concepts)
    log.info('Loaded %d regions', region_count)
    print('Loaded %d regions' % region_count)
    return pref_labels, concepts
Ejemplo n.º 15
0
    def importCmd(self, path=None):
        '''Import every PDF found in the sub-directories below ``path``.

        For each ``*.pdf`` file the XML file with the same base name is
        parsed for metadata, the full text is extracted with tika, tags
        are derived from selected metadata fields, and a package is
        created or updated with the PDF and the XML attached.

        Exits with status 1 (after printing the help) when ``path`` is
        missing.  An exception while processing a directory is printed
        and the walk moves on to the next directory.

        :param path: root directory to walk
        '''
        self.ckan = self._ckan_connect()

        if path is None:
            # single-argument print() behaves the same on Python 2 and 3
            print("Argument 'path' must be set")
            self.helpCmd()
            sys.exit(1)

        for root, dirs, _files in os.walk(path):
            for dir_name in dirs:
                try:
                    dir_path = os.path.join(root, dir_name)
                    print("dir_path: %s" % dir_path)
                    for file_name in os.listdir(dir_path):
                        file_path = os.path.join(dir_path, file_name)
                        if not file_path.endswith('.pdf') or not os.path.isfile(file_path):
                            continue

                        base_name = file_name.split('.')[0]
                        meta_xml_path = os.path.join(dir_path, base_name + '.xml')

                        metadata = self._parse_metadata(meta_xml_path)

                        # read fulltext with tika
                        metadata['full_text_search'] = self.tika_parser.parse_with_tika(file_path)
                        print("FULLTEXT: %s" % metadata['full_text_search'])

                        # add tags to structure
                        tags = [
                            metadata.get('source', '').replace('#', ' ').replace('-', ' '),
                            metadata.get('contributor'),
                            metadata.get('creator'),
                            metadata.get('publisher'),
                            metadata.get('pdf_image_color_mode'),
                            metadata.get('pdf_image_color_space'),
                            metadata.get('pdf_image_format'),
                            metadata.get('pdf_image_resolution'),
                        ]
                        # a truthiness test already excludes None and ''
                        tags = [munge_tag(tag) for tag in tags if tag]
                        metadata['tags'] = [{'name': tag} for tag in set(tags)]

                        pkg = self._create_or_update_package(base_name, metadata)
                        self._attach_file(pkg['id'], file_name, file_name, file_path, metadata, 'PDF')
                        self._attach_file(pkg['id'], base_name + '.xml', 'Metadata XML', meta_xml_path, format='XML')
                except Exception:
                    # best effort: report the failure and continue with
                    # the next directory
                    traceback.print_exc()
Ejemplo n.º 16
0
    def process(self, record):
        """Create or update a ``publications`` package from ``record``.

        The package dict is assembled from the record's ``ID``, ``title``,
        ``abstract``, ``author`` and ``keywords`` entries plus the optional
        ``doi``/``isbn``/``pmid``, ``editor``, ``publisher``, ``language``
        and ``url`` entries.  ``package_update`` is used when
        ``model.Package.get`` finds the id, ``package_create`` otherwise;
        the action runs as the site user with auth checks ignored.
        """
        data_dict = {
            'id': record['ID'],
            'title': record['title'].strip('{}'),
            'name': munge_title_to_name(record['ID'] + record['title']),
            'notes': record['abstract'],
            'harvest_source': 'MENDELEY',
            'creator': record['author'].replace(',', '').split(' and '),
            'tag_string': ','.join(
                munge_tag(tag) for tag in record['keywords'].split(',')),
            'owner_org': tk.config.get(
                'ckanext.ingestor.config.mendeley_bib.owner_org', 'iaea'),
            'type': 'publications',
        }

        # identifiers are stored as '<scheme>:<value>'
        data_dict['identifier'] = [
            scheme + ':' + record[scheme]
            for scheme in ('doi', 'isbn', 'pmid')
            if scheme in record
        ]

        # optional single-valued fields, stored as one-element lists
        optional_fields = (
            ('contributor', 'editor'),
            ('publisher', 'publisher'),
            ('language', 'language'),
        )
        for target_key, record_key in optional_fields:
            if record_key in record:
                data_dict[target_key] = [record[record_key]]

        data_dict['source'] = record.get('url')

        user = tk.get_action('get_site_user')({'ignore_auth': True})
        if model.Package.get(data_dict['id']):
            action_name = 'package_update'
        else:
            action_name = 'package_create'
        action = tk.get_action(action_name)
        action({'ignore_auth': True, 'user': user['name']}, data_dict)
Ejemplo n.º 17
0
    def _extract_tags_and_extras(self, content):
        """Split a metadata dict into munged tags and ``(key, value)`` extras.

        Keys found among the values of ``self._get_mapping()`` are skipped.
        The ``type`` and ``subject`` values become tags (a non-list value is
        split on ``;``).  Every other key becomes an extra: a list value is
        reduced to its first element and empty values are stored as
        ``None``.

        :param content: dict of metadata fields
        :returns: tuple ``(tags, extras)``; each tag is truncated to 100
            characters before being passed to ``munge_tag``
        """
        # Look the mapping up once instead of once per key.
        mapped_keys = list(self._get_mapping().values())

        extras = []
        tags = []
        for key, value in content.items():
            if key in mapped_keys:
                continue
            if key in ("type", "subject"):
                if isinstance(value, list):
                    tags.extend(value)
                else:
                    tags.extend(value.split(";"))
                continue
            if value and isinstance(value, list):
                value = value[0]
            if not value:
                value = None
            extras.append((key, value))

        tags = [munge_tag(tag[:100]) for tag in tags]

        return (tags, extras)
Ejemplo n.º 18
0
    def _nonEpos_extract_tags_and_extras(self, content):
        '''Derive CKAN tags and extras from a metadata dict.

        Keys that occur among the values of ``self._get_mapping()`` are
        ignored here.  ``type`` and ``subject`` feed the tag list (string
        values are split on ``;``); all remaining keys are emitted as
        extras, using the first element of list values and ``None`` for
        empty values.

        :param content: dict of metadata fields
        :returns: tuple ``(tags, extras)`` with ``tags`` a list of munged
            names (input truncated to 100 characters) and ``extras`` a list
            of ``(key, value)`` tuples
        '''
        # Build the list of mapped keys once rather than on every iteration.
        mapped_keys = list(self._get_mapping().values())

        extras = []
        tags = []
        for key, value in content.items():
            if key in mapped_keys:
                continue
            if key in ('type', 'subject'):
                if isinstance(value, list):
                    tags.extend(value)
                else:
                    tags.extend(value.split(';'))
                continue
            if value and isinstance(value, list):
                value = value[0]
            if not value:
                value = None
            extras.append((key, value))

        tags = [munge_tag(tag[:100]) for tag in tags]

        return (tags, extras)
Ejemplo n.º 19
0
    def parse_dataset(self, dataset_dict, dataset_ref):  # noqa
        '''Fill ``dataset_dict`` from the RDF node ``dataset_ref``.

        The list-valued keys ``temporals``, ``tags``, ``extras``,
        ``resources``, ``relations`` and ``see_alsos`` are reset first.
        Basic, timestamp and multilingual fields are then copied from the
        graph, followed by tags/keywords, themes (as ``groups``),
        languages, contact points, publishers, relations, temporals,
        references, the dataset URI (as an extra) and one resource dict
        per distribution.

        :param dataset_dict: dict to populate; it is modified in place
        :param dataset_ref: graph node of the dataset
        :returns: the populated ``dataset_dict``
        '''
        log.debug("Parsing dataset '%r'", dataset_ref)

        dataset_dict['temporals'] = []
        dataset_dict['tags'] = []
        dataset_dict['extras'] = []
        dataset_dict['resources'] = []
        dataset_dict['relations'] = []
        dataset_dict['see_alsos'] = []

        # Basic fields
        for key, predicate in (
            ('identifier', DCT.identifier),
            ('accrual_periodicity', DCT.accrualPeriodicity),
            ('spatial_uri', DCT.spatial),
            ('spatial', DCT.spatial),
            ('url', DCAT.landingPage),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                dataset_dict[key] = value

        # Timestamp fields
        for key, predicate in (
            ('issued', DCT.issued),
            ('modified', DCT.modified),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                dataset_dict[key] = self._clean_datetime(value)

        # Multilingual basic fields
        for key, predicate in (
            ('title', DCT.title),
            ('description', DCT.description),
        ):
            value = self._object_value(dataset_ref, predicate, multilang=True)
            if value:
                dataset_dict[key] = value

        # Tags
        keywords = self._object_value_list(dataset_ref, DCAT.keyword) or []
        for keyword in keywords:
            dataset_dict['tags'].append({'name': munge_tag(unicode(keyword))})

        # Keywords
        dataset_dict['keywords'] = self._keywords(dataset_ref, DCAT.keyword)

        # Themes
        dcat_theme_urls = self._object_value_list(dataset_ref, DCAT.theme)
        if dcat_theme_urls:
            dataset_dict['groups'] = []
            for dcat_theme_url in dcat_theme_urls:
                search_result = slug_id_pattern.search(dcat_theme_url)
                if search_result is None:
                    # search() returns None when the URL has no slug;
                    # skip the theme instead of failing on None.group().
                    log.warning("Could not extract a theme slug from '%s'",
                                dcat_theme_url)
                    continue
                dcat_theme_slug = search_result.group()
                dataset_dict['groups'].append({'name': dcat_theme_slug})

        #  Languages
        languages = self._object_value_list(dataset_ref, DCT.language)
        if languages:
            dataset_dict['language'] = languages

        # Contact details
        dataset_dict['contact_points'] = self._contact_points(
            dataset_ref, DCAT.contactPoint)

        # Publisher
        dataset_dict['publishers'] = self._publishers(dataset_ref,
                                                      DCT.publisher)

        # Relations
        dataset_dict['relations'] = self._relations(dataset_ref, DCT.relation)

        # Temporal
        dataset_dict['temporals'] = self._temporals(dataset_ref, DCT.temporal)

        # References (tolerate an empty result like the keyword list above)
        see_alsos = self._object_value_list(dataset_ref, RDFS.seeAlso) or []
        for see_also in see_alsos:
            dataset_dict['see_alsos'].append({'dataset_identifier': see_also})

        # Dataset URI (explicitly show the missing ones)
        dataset_uri = (unicode(dataset_ref) if isinstance(
            dataset_ref, rdflib.term.URIRef) else '')
        dataset_dict['extras'].append({'key': 'uri', 'value': dataset_uri})

        # Resources
        for distribution in self._distributions(dataset_ref):

            resource_dict = {
                'media_type': '',
                'language': [],
            }

            #  Simple values
            for key, predicate in (
                ('identifier', DCT.identifier),
                ('format', DCT['format']),
                ('mimetype', DCAT.mediaType),
                ('media_type', DCAT.mediaType),
                ('download_url', DCAT.downloadURL),
                ('url', DCAT.accessURL),
                ('rights', DCT.rights),
                ('license', DCT.license),
            ):
                value = self._object_value(distribution, predicate)
                if value:
                    resource_dict[key] = value

            # if media type is not set, use format as fallback
            if (not resource_dict.get('media_type')
                    and resource_dict.get('format')):
                resource_dict['media_type'] = resource_dict['format']

            # Timestamp fields
            for key, predicate in (
                ('issued', DCT.issued),
                ('modified', DCT.modified),
            ):
                value = self._object_value(distribution, predicate)
                if value:
                    resource_dict[key] = self._clean_datetime(value)

            # Multilingual fields
            for key, predicate in (
                ('title', DCT.title),
                ('description', DCT.description),
            ):
                value = self._object_value(distribution,
                                           predicate,
                                           multilang=True)
                if value:
                    resource_dict[key] = value

            # the access URL wins over the download URL; '' if neither is set
            resource_dict['url'] = (
                self._object_value(distribution, DCAT.accessURL)
                or self._object_value(distribution, DCAT.downloadURL) or '')

            # languages
            resource_languages = self._object_value_list(
                distribution, DCT.language) or []
            for language in resource_languages:
                resource_dict['language'].append(language)

            # byteSize
            byte_size = self._object_value_int(distribution, DCAT.byteSize)
            if byte_size is not None:
                resource_dict['byte_size'] = byte_size

            # Distribution URI (explicitly show the missing ones)
            resource_dict['uri'] = (unicode(distribution) if isinstance(
                distribution, rdflib.term.URIRef) else '')

            dataset_dict['resources'].append(resource_dict)

        log.debug("Parsed dataset '%r': %s", dataset_ref, dataset_dict)

        return dataset_dict
Ejemplo n.º 20
0
    def _build_package_dict(self, context, harvest_object):
        '''
        Assemble the package dict passed to CKAN `package_create` and
        `package_update` from the JSON content of ``harvest_object``.

        The owner organization is taken from the harvest source dataset;
        tags, extras, the optional provenance and a single CSV resource
        are derived from the harvested content.
        '''

        # Organization owning the local harvest source
        package_show = toolkit.get_action('package_show')
        source_dataset = package_show(context.copy(),
                                      {'id': harvest_object.source.id})
        local_org = source_dataset.get('owner_org')

        res = json.loads(harvest_object.content)
        resource = res['resource']

        package_dict = {
            'title': resource['name'],
            'name': self._gen_new_name(resource['name']),
            'url': res.get('permalink', ''),
            'notes': resource.get('description', ''),
            'author': resource['attribution'],
            'tags': [],
            'extras': [],
            'identifier': resource['id'],
            'owner_org': local_org,
            'resources': [],
        }

        classification = res['classification']

        # Tags: plain tags followed by domain tags
        tag_names = (classification.get('tags', [])
                     + classification.get('domain_tags', []))
        package_dict['tags'] = [{'name': munge_tag(name)}
                                for name in tag_names]

        extras = package_dict['extras']

        # domain_metadata entries are passed through as extras
        extras.extend(classification.get('domain_metadata', []))

        # source timestamps
        extras.append({'key': 'source_created_at',
                       'value': resource['createdAt']})
        extras.append({'key': 'source_updated_at',
                       'value': resource['updatedAt']})

        # owner display name (None when the owner is missing)
        owner = res.get('owner', {})
        extras.append({'key': 'owner_display_name',
                       'value': owner.get('display_name')})

        # categories: plain categories followed by domain categories
        categories = (classification.get('categories', []) +
                      classification.get('domain_categories', []))
        extras.append({'key': 'categories', 'value': categories})

        # Socrata metadata.license, when present
        if res['metadata'].get('license', False):
            extras.append({'key': 'license',
                           'value': res['metadata']['license']})

        # provenance, when present
        if resource.get('provenance', False):
            package_dict['provenance'] = resource['provenance']

        # Resources: one CSV download built from the source host and id
        source_host = urlparse(harvest_object.source.url).hostname
        download_url = DOWNLOAD_ENDPOINT_TEMPLATE.format(
            domain=source_host, resource_id=resource['id'])
        package_dict['resources'] = [{'url': download_url, 'format': 'CSV'}]

        return package_dict
Ejemplo n.º 21
0
    def import_stage(self, harvest_object):
        '''Build a package dict from ``harvest_object`` and save it.

        The content is loaded through ``DdiCkanMetadata``, enriched with
        the catalog URL, a license, munged tags and two resources, and
        handed to ``self._create_or_update_package``.

        :param harvest_object: the harvest object to import
        :returns: the result of ``_create_or_update_package``, or ``False``
            when no harvest object was given or any exception occurred
            (the error is recorded via ``_save_object_error``)
        '''
        log.debug('In NadaHarvester import_stage')

        # Validate the argument before touching its attributes; doing the
        # config lookup first would raise AttributeError on a missing
        # object and this branch could never run.
        if not harvest_object:
            log.error('No harvest object received')
            self._save_object_error(
                'No harvest object received',
                harvest_object
            )
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            base_url = harvest_object.source.url.rstrip('/')

            # Get a class which maps ckan metadata to the DDI equivalent
            ckan_metadata = DdiCkanMetadata()
            # Extract the metadata from the DDI XML content into a dict
            pkg_dict = ckan_metadata.load(harvest_object.content)

            # Move unrecognised attributes of the dict into 'extras'
            pkg_dict = self._convert_to_extras(pkg_dict)

            # update URL with NADA catalog link
            catalog_path = self._get_catalog_path(harvest_object.guid)
            pkg_dict['url'] = base_url + catalog_path

            # set license from harvester config or use CKAN instance default
            if 'license' in self.config:
                pkg_dict['license_id'] = self.config['license']
            else:
                pkg_dict['license_id'] = config.get(
                    'ckanext.ddi.default_license',
                    ''
                )

            # keep only string tags, truncated to 100 characters and munged
            pkg_dict['tags'] = [
                munge_tag(tag[:100])
                for tag in pkg_dict['tags']
                if isinstance(tag, basestring)
            ]
            pkg_dict['version'] = pkg_dict['version'][:100]

            # add resources: the DDI XML and the NADA catalog page
            resources = [
                {
                    'url': base_url + self._get_ddi_api(harvest_object.guid),
                    'name': 'DDI XML of %s' % pkg_dict['title'],
                    'format': 'xml'
                },
                {
                    'url': pkg_dict['url'],
                    'name': 'NADA catalog entry',
                    'format': 'html'
                },
            ]
            pkg_dict['resources'] = resources

            log.debug('package dict: %s', pkg_dict)
            # Now create the package
            return self._create_or_update_package(pkg_dict, harvest_object)
        except Exception as e:
            # Top-level boundary of the import: record the failure on the
            # harvest object instead of letting it propagate.
            self._save_object_error(
                (
                    'Exception in import stage: %r / %s'
                    % (e, traceback.format_exc())
                ),
                harvest_object
            )
            return False
Ejemplo n.º 22
0
def munge_tags(package_dict):
    '''Replace the tags of ``package_dict`` with munged, de-duplicated ones.

    Every truthy entry of ``package_dict['tags']`` is read through its
    ``'name'`` key and passed to ``munge_tag``. Names that come back as
    ``'__'`` are dropped, duplicates are removed with
    ``remove_duplicates_in_a_list`` and the result is stored back as a list
    of ``{'name': ...}`` dicts.

    The dict is modified in place; nothing is returned.
    '''
    munged_names = []
    for tag in package_dict.get('tags', []):
        if tag:
            munged_names.append(munge_tag(tag['name']))

    # '__' means the munged tag is just padding
    meaningful = [name for name in munged_names if name != '__']
    unique_names = remove_duplicates_in_a_list(meaningful)
    package_dict['tags'] = [{'name': name} for name in unique_names]
Ejemplo n.º 23
0
    def _build_package_dict(self, context, harvest_object):
        '''
        Turn the JSON held in ``harvest_object.content`` into a package dict
        suitable for use with CKAN `package_create` and `package_update`.

        :param context: CKAN action context; a copy is handed to
                        `package_show` when looking up the harvest source.
        :param harvest_object: object whose ``content`` is a JSON document
                               with ``resource``, ``classification`` and
                               ``metadata`` sections.
        :returns: the package dict (title, name, url, notes, author, tags,
                  extras, identifier, owner_org, resources and, when
                  present in the source, provenance).
        '''
        # The harvest source dataset tells us the local owner organization
        package_show = toolkit.get_action('package_show')
        source_dataset = package_show(
            context.copy(), {'id': harvest_object.source.id})
        local_org = source_dataset.get('owner_org')

        res = json.loads(harvest_object.content)
        resource = res['resource']

        package_dict = {
            'title': resource['name'],
            'name': self._gen_new_name(resource['name']),
            'url': res.get('permalink', ''),
            'notes': resource.get('description', ''),
            'author': resource['attribution'],
            'tags': [],
            'extras': [],
            'identifier': resource['id'],
            'owner_org': local_org,
            'resources': [],
        }

        classification = res['classification']

        # Tags: plain tags followed by domain tags, each one munged
        tag_names = (classification.get('tags', [])
                     + classification.get('domain_tags', []))
        package_dict['tags'] = [
            {'name': munge_tag(tag_name)} for tag_name in tag_names
        ]

        extras = package_dict['extras']

        # Domain metadata entries go into the extras unchanged
        extras.extend(classification.get('domain_metadata', []))

        # Creation / modification timestamps reported by the source
        extras.append({
            'key': 'source_created_at',
            'value': resource['createdAt'],
        })
        extras.append({
            'key': 'source_updated_at',
            'value': resource['updatedAt'],
        })

        # Display name of the owner (None when the source has no owner)
        owner = res.get('owner', {})
        extras.append({
            'key': 'owner_display_name',
            'value': owner.get('display_name'),
        })

        # Categories: plain categories followed by domain categories
        categories = (classification.get('categories', [])
                      + classification.get('domain_categories', []))
        extras.append({
            'key': 'categories',
            'value': list(categories),
        })

        # Socrata metadata.license, only when it has a truthy value
        source_license = res['metadata'].get('license', False)
        if source_license:
            extras.append({
                'key': 'license',
                'value': source_license,
            })

        # Provenance, only when it has a truthy value
        provenance = resource.get('provenance', False)
        if provenance:
            package_dict['provenance'] = provenance

        # A single CSV resource pointing at the download endpoint
        download_url = DOWNLOAD_ENDPOINT_TEMPLATE.format(
            domain=urlparse(harvest_object.source.url).hostname,
            resource_id=resource['id'])
        package_dict['resources'] = [{
            'url': download_url,
            'format': 'CSV',
        }]

        return package_dict
Ejemplo n.º 24
0
Archivo: api.py Proyecto: jgrocha/ckan
 def munge_tag(self):
     '''Munge the tag passed in the ``tag`` request parameter (or ``name``
     when ``tag`` is missing or empty) and return it via ``_finish_ok``.'''
     raw_tag = request.params.get("tag")
     if not raw_tag:
         raw_tag = request.params.get("name")
     return self._finish_ok(munge.munge_tag(raw_tag))
Ejemplo n.º 25
0
 def test_munge_tag(self):
     '''Every tag in the munge list munges to its expected value.'''
     for original, expected in self.munge_list:
         nose_tools.assert_equal(munge_tag(original), expected)
Ejemplo n.º 26
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: REST-style package dictionary; it is modified
                             in place (``tags`` and ``name`` may be
                             rewritten).
        :param harvest_object: the HarvestObject being imported; on success
                               it is linked to the package and flagged as
                               current.
        :returns: The same as what import_stage should return. i.e. True if the
                  create or update occurred ok, 'unchanged' if it didn't need
                  updating or False if the package failed validation.
        :raises ValueError: if the configured ``api_version`` is not an
                            integer.

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts
        '''
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                # Munge every tag exactly once, then drop empty results and
                # duplicates
                munged_tags = [munge_tag(t)
                               for t in package_dict.get('tags', [])]
                package_dict['tags'] = list(
                    set(t for t in munged_tags if t != ''))

            # Check if package exists
            try:
                existing_package_dict = self._find_existing_package(package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date: update when the remote gives no date
                # or reports a newer one than the stored package
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    package_dict.setdefault('name',
                                            existing_package_dict['name'])
                    new_package = get_action('package_update_rest')(context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                    .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                    .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except NotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(package_dict['title'])

                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = get_action('package_create_rest')(context, package_dict)

            Session.commit()

            return True

        except ValidationError as e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r'
                % (harvest_object.guid, e.error_dict),
                harvest_object, 'Import')
            # Report the failure explicitly instead of falling through and
            # returning None
            return False
Ejemplo n.º 27
0
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        This implementation expects ``harvest_object.content`` to be JSON
        with a ``domain`` and, when the fetch succeeded, ``records`` and
        ``set_name``. Each record with metadata becomes (or updates) a
        Package that is added to a group named after the domain and to a
        sub-group named "<domain> - <set_name>".

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        model.repo.new_revision()
        master_data = json.loads(harvest_object.content)
        domain = master_data['domain']
        # Reuse the group for this domain, or prepare a new one
        group = Group.get(domain)
        if not group:
            group = Group(name=domain, description=domain)
        if 'records' in master_data:
            records = master_data['records']
            set_name = master_data['set_name']
            for rec in records:
                # Each record is a 3-item sequence; the third item is unused
                identifier, metadata, _ = rec
                if metadata:
                    # Use the first title, falling back to the identifier
                    name = metadata['title'][0] if len(metadata['title'])\
                                                else identifier
                    title = name
                    # Build the package name: NFKD-normalise, drop non-ASCII
                    # characters, lowercase, turn spaces into underscores and
                    # keep at most 35 characters ...
                    norm_title = unicodedata.normalize('NFKD', name)\
                                 .encode('ASCII', 'ignore')\
                                 .lower().replace(' ', '_')[:35]
                    # ... then keep only ASCII letters and underscores
                    slug = ''.join(e for e in norm_title
                                    if e in string.ascii_letters + '_')
                    name = slug
                    creator = metadata['creator'][0]\
                                if len(metadata['creator']) else ''
                    description = metadata['description'][0]\
                                if len(metadata['description']) else ''
                    # Reuse an existing package with this name if there is one
                    pkg = Package.by_name(name)
                    if not pkg:
                        pkg = Package(name=name, title=title)
                    extras = {}
                    for met in metadata.items():
                        key, value = met
                        if len(value) > 0:
                            # 'subject' and 'type' values become tags; every
                            # other non-empty field becomes an extra
                            if key == 'subject' or key == 'type':
                                for tag in value:
                                    if tag:
                                        # Only the first 100 characters of the
                                        # tag are munged
                                        tag = munge_tag(tag[:100])
                                        tag_obj = model.Tag.by_name(tag)
                                        if not tag_obj:
                                            tag_obj = model.Tag(name=tag)
                                        # NOTE(review): tag_obj looks always
                                        # truthy here, since it was just
                                        # created when missing - confirm
                                        if tag_obj:
                                            pkgtag = model.PackageTag(
                                                                  tag=tag_obj,
                                                                  package=pkg)
                                            Session.add(tag_obj)
                                            Session.add(pkgtag)
                            else:
                                # Multiple values are joined with spaces
                                extras[key] = ' '.join(value)
                    pkg.author = creator
                    # NOTE(review): the creator name is also stored as the
                    # author e-mail - confirm this is intended
                    pkg.author_email = creator
                    pkg.title = title
                    pkg.notes = description
                    pkg.extras = extras
                    # Link back to the record on the harvest source
                    pkg.url = \
                    "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"\
                                % (harvest_object.job.source.url, identifier)
                    pkg.save()
                    harvest_object.package_id = pkg.id
                    Session.add(harvest_object)
                    setup_default_user_roles(pkg)
                    # The resource URL is the last identifier that starts
                    # with 'http://' (empty when there is none)
                    url = ''
                    for ids in metadata['identifier']:
                        if ids.startswith('http://'):
                            url = ids
                    title = metadata['title'][0] if len(metadata['title'])\
                                                    else ''
                    description = metadata['description'][0]\
                                    if len(metadata['description']) else ''
                    pkg.add_resource(url, description=description, name=title)
                    group.add_package_by_name(pkg.name)
                    # Sub-group for the set this record was fetched from
                    subg_name = "%s - %s" % (domain, set_name)
                    subgroup = Group.by_name(subg_name)
                    if not subgroup:
                        subgroup = Group(name=subg_name, description=subg_name)
                    subgroup.add_package_by_name(pkg.name)
                    Session.add(group)
                    Session.add(subgroup)
                    setup_default_user_roles(group)
                    setup_default_user_roles(subgroup)
            # One commit after all records have been processed
            model.repo.commit()
        else:
            # No 'records' key in the content: record an import error
            self._save_object_error('Could not receive any objects from fetch!'
                                    , harvest_object, stage='Import')
            return False
        return True
Ejemplo n.º 28
0
    def parse_dataset(self, dataset_dict, dataset_ref):
        '''
        Fill ``dataset_dict`` with the values found for ``dataset_ref``.

        Basic fields and tags are set directly on the dict, most other
        values are appended to ``dataset_dict['extras']`` as
        ``{'key': ..., 'value': ...}`` entries, and every distribution of
        the dataset becomes one entry of ``dataset_dict['resources']``.
        Any existing ``extras`` and ``resources`` are replaced.

        :param dataset_dict: dict to populate; modified in place.
        :param dataset_ref: reference of the dataset node to read from.
        :returns: the same ``dataset_dict``.
        '''

        dataset_dict['extras'] = []
        dataset_dict['resources'] = []

        # Basic fields
        for key, predicate in (
            ('title', DCT.title),
            ('notes', DCT.description),
            ('url', DCAT.landingPage),
            ('version', OWL.versionInfo),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                dataset_dict[key] = value

        if not dataset_dict.get('version'):
            # adms:version was supported on the first version of the DCAT-AP
            value = self._object_value(dataset_ref, ADMS.version)
            if value:
                dataset_dict['version'] = value

        # Tags
        keywords = self._object_value_list(dataset_ref, DCAT.keyword) or []
        # Split keywords with commas
        keywords_with_commas = [k for k in keywords if ',' in k]
        for keyword in keywords_with_commas:
            keywords.remove(keyword)
            keywords.extend([k.strip() for k in keyword.split(',')])

        # replace munge_tag to noop if there's no need to clean tags
        do_clean = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False))
        tags_val = [munge_tag(tag) if do_clean else tag for tag in keywords]
        tags = [{'name': tag} for tag in tags_val]
        dataset_dict['tags'] = tags

        # Extras

        #  Simple values
        for key, predicate in (
            ('issued', DCT.issued),
            ('modified', DCT.modified),
            ('identifier', DCT.identifier),
            ('version_notes', ADMS.versionNotes),
            ('frequency', DCT.accrualPeriodicity),
            ('access_rights', DCT.accessRights),
            ('provenance', DCT.provenance),
            ('dcat_type', DCT.type),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                dataset_dict['extras'].append({'key': key, 'value': value})

        #  Lists (stored JSON-encoded)
        for key, predicate, in (
            ('language', DCT.language),
            ('theme', DCAT.theme),
            ('alternate_identifier', ADMS.identifier),
            ('conforms_to', DCT.conformsTo),
            ('documentation', FOAF.page),
            ('related_resource', DCT.relation),
            ('has_version', DCT.hasVersion),
            ('is_version_of', DCT.isVersionOf),
            ('source', DCT.source),
            ('sample', ADMS.sample),
        ):
            values = self._object_value_list(dataset_ref, predicate)
            if values:
                dataset_dict['extras'].append({
                    'key': key,
                    'value': json.dumps(values)
                })

        # Contact details
        contact = self._contact_details(dataset_ref, DCAT.contactPoint)
        if not contact:
            # adms:contactPoint was supported on the first version of DCAT-AP
            contact = self._contact_details(dataset_ref, ADMS.contactPoint)

        if contact:
            for key in ('uri', 'name', 'email'):
                if contact.get(key):
                    dataset_dict['extras'].append({
                        'key':
                        'contact_{0}'.format(key),
                        'value':
                        contact.get(key)
                    })

        # Publisher
        publisher = self._publisher(dataset_ref, DCT.publisher)
        for key in ('uri', 'name', 'email', 'url', 'type'):
            if publisher.get(key):
                dataset_dict['extras'].append({
                    'key':
                    'publisher_{0}'.format(key),
                    'value':
                    publisher.get(key)
                })

        # Temporal
        start, end = self._time_interval(dataset_ref, DCT.temporal)
        if start:
            dataset_dict['extras'].append({
                'key': 'temporal_start',
                'value': start
            })
        if end:
            dataset_dict['extras'].append({
                'key': 'temporal_end',
                'value': end
            })

        # Spatial ('geom' is stored under the plain 'spatial' key)
        spatial = self._spatial(dataset_ref, DCT.spatial)
        for key in ('uri', 'text', 'geom'):
            if spatial.get(key):
                dataset_dict['extras'].append({
                    'key':
                    'spatial_{0}'.format(key) if key != 'geom' else 'spatial',
                    'value':
                    spatial.get(key)
                })

        # Dataset URI (explicitly show the missing ones)
        dataset_uri = (unicode(dataset_ref) if isinstance(
            dataset_ref, rdflib.term.URIRef) else '')
        dataset_dict['extras'].append({'key': 'uri', 'value': dataset_uri})

        # License
        if 'license_id' not in dataset_dict:
            dataset_dict['license_id'] = self._license(dataset_ref)

        # Source Catalog
        if toolkit.asbool(config.get(DCAT_EXPOSE_SUBCATALOGS, False)):
            catalog_src = self._get_source_catalog(dataset_ref)
            if catalog_src is not None:
                src_data = self._extract_catalog_dict(catalog_src)
                dataset_dict['extras'].extend(src_data)

        # Resources

        # The flag is the same for every distribution, so read it once.
        # Config values may arrive as strings; asbool makes 'false' count
        # as False, like the other boolean options read in this method.
        normalize_ckan_format = toolkit.asbool(config.get(
            'ckanext.dcat.normalize_ckan_format', True))

        for distribution in self._distributions(dataset_ref):

            resource_dict = {}

            #  Simple values
            for key, predicate in (
                ('name', DCT.title),
                ('description', DCT.description),
                ('download_url', DCAT.downloadURL),
                ('issued', DCT.issued),
                ('modified', DCT.modified),
                ('status', ADMS.status),
                ('rights', DCT.rights),
                ('license', DCT.license),
            ):
                value = self._object_value(distribution, predicate)
                if value:
                    resource_dict[key] = value

            # Prefer the access URL, fall back to the download URL
            resource_dict['url'] = (
                self._object_value(distribution, DCAT.accessURL)
                or self._object_value(distribution, DCAT.downloadURL))
            #  Lists (stored JSON-encoded)
            for key, predicate in (
                ('language', DCT.language),
                ('documentation', FOAF.page),
                ('conforms_to', DCT.conformsTo),
            ):
                values = self._object_value_list(distribution, predicate)
                if values:
                    resource_dict[key] = json.dumps(values)

            # Format and media type
            imt, label = self._distribution_format(distribution,
                                                   normalize_ckan_format)

            if imt:
                resource_dict['mimetype'] = imt

            # The label wins as format; the media type is the fallback
            if label:
                resource_dict['format'] = label
            elif imt:
                resource_dict['format'] = imt

            # Size
            size = self._object_value_int(distribution, DCAT.byteSize)
            if size is not None:
                resource_dict['size'] = size

            # Checksum
            for checksum in self.g.objects(distribution, SPDX.checksum):
                algorithm = self._object_value(checksum, SPDX.algorithm)
                checksum_value = self._object_value(checksum,
                                                    SPDX.checksumValue)
                if algorithm:
                    resource_dict['hash_algorithm'] = algorithm
                if checksum_value:
                    resource_dict['hash'] = checksum_value

            # Distribution URI (explicitly show the missing ones)
            resource_dict['uri'] = (unicode(distribution) if isinstance(
                distribution, rdflib.term.URIRef) else '')

            dataset_dict['resources'].append(resource_dict)

        if self.compatibility_mode:
            # Tweak the resulting dict to make it compatible with previous
            # versions of the ckanext-dcat parsers
            for extra in dataset_dict['extras']:
                if extra['key'] in (
                        'issued',
                        'modified',
                        'publisher_name',
                        'publisher_email',
                ):

                    extra['key'] = 'dcat_' + extra['key']

                if extra['key'] == 'language':
                    extra['value'] = ','.join(
                        sorted(json.loads(extra['value'])))

        return dataset_dict
Ejemplo n.º 29
0
    def handle_fluent_harvest_dictinary(self, field, iso_values, package_dict,
                                        schema, handled_fields,
                                        harvest_config):
        """Populate a fluent (multilingual) schema field on ``package_dict``.

        Nothing happens when the field is already listed in
        ``handled_fields`` or when its preset does not start with
        ``fluent``. Otherwise:

        - ``fluent_tags`` fields are built from ``iso_values[field_name]``
          as a dict mapping each schema language to a list of keywords.
          The 'en' and 'fr' keywords are also merged into
          ``package_dict['tags']``.
        - other fluent fields are copied from the core field of the same
          name (without a trailing ``_translated``); a non-dict value is
          wrapped as ``{default_language: value}``.

        The field name is appended to ``handled_fields`` afterwards.
        ``package_dict`` and ``handled_fields`` are modified in place and
        nothing is returned.
        """
        field_name = field['field_name']
        preset = field.get('preset', '')
        if field_name in handled_fields or not preset.startswith(u'fluent'):
            return

        # first two characters of the metadata language, english by default
        default_language = iso_values.get('metadata-language', 'en')[0:2] or 'en'

        if preset == u'fluent_tags':
            raw_tags = iso_values.get(field_name, [])
            schema_languages = plugins.toolkit.h.fluent_form_languages(
                schema=schema)
            do_clean = toolkit.asbool(harvest_config.get('clean_tags', False))

            # one (initially empty) keyword list per schema language
            tags_by_language = dict((lang, []) for lang in schema_languages)

            # turn the list of per-language dictionaries into a dictionary of
            # per-language lists
            for raw in raw_tags:
                parsed = self.from_json(raw.get('keyword', raw))
                if isinstance(parsed, Number):
                    parsed = str(parsed)

                if not isinstance(parsed, dict):
                    # a bare keyword belongs to the default language
                    if do_clean:
                        parsed = munge.munge_tag(parsed)
                    tags_by_language[default_language].append(parsed)
                    continue

                for lang, keywords in parsed.items():
                    if lang not in schema_languages:
                        continue
                    if do_clean:
                        if isinstance(keywords, list):
                            keywords = [munge.munge_tag(kw) for kw in keywords]
                        else:
                            keywords = munge.munge_tag(keywords)
                    tags_by_language[lang].append(keywords)

            package_dict[field_name] = tags_by_language

            # add the 'en' and 'fr' keywords that are not core tags yet
            tag_names = [t['name'] for t in package_dict['tags']]
            candidates = (tags_by_language.get('en', [])
                          + tags_by_language.get('fr', []))
            for candidate in candidates:
                if candidate not in tag_names:
                    tag_names.append(candidate)
            package_dict['tags'] = [{'name': name} for name in tag_names]

        else:
            # Populate translated fields from core, for example
            # 'title' -> 'title_translated'
            suffix = u'_translated'
            package_fn = field_name
            if field_name.endswith(suffix):
                package_fn = field_name[:-len(suffix)]

            core_value = self.from_json(package_dict.get(package_fn, ''))

            if isinstance(core_value, dict):
                # assume bilingual values already in data
                package_dict[field_name] = core_value
            else:
                # single-language dictionary. This will likely fail
                # validation as it does not contain all the languages
                package_dict[field_name] = {default_language: core_value}

        handled_fields.append(field_name)
Ejemplo n.º 30
0
 def _clean_keywords(self, pkg_dict):
     clean_keywords = {}
     if 'keywords' in pkg_dict:
         for lang, tag_list in pkg_dict['keywords'].iteritems():
             clean_keywords[lang] = [munge_tag(tag) for tag in tag_list]
     return clean_keywords
Ejemplo n.º 31
0
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        This implementation maps the JSON in ``harvest_object.content`` to
        a CKAN data dict and hands it to ``_create_or_update_package``.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        # Check for a missing object before touching any of its attributes
        if not harvest_object:
            logger.error('No harvest object received')
            self._save_object_error('No harvest object received')
            return False
        logger.debug("in import stage: %s", harvest_object.guid)
        try:
            self._set_config(harvest_object.job.source.config)

            package_dict = json.loads(harvest_object.content)
            data_dict = {}
            data_dict['id'] = package_dict['id']
            data_dict['title'] = package_dict['title']
            data_dict['name'] = munge_title_to_name(package_dict['name'])

            data_dict['notes'] = markdown_extract(
                package_dict.get('description'))

            tags = package_dict.get('keyword', [])
            data_dict['tag_string'] = ', '.join(
                [munge_tag(tag) for tag in tags])

            data_dict['private'] = False

            # The license id is the last path segment of the license value
            license_id = package_dict.get('license',
                                          'cc-by').strip('/').split('/')[-1]

            if license_id == 'de2a56f5-a565-481a-8589-406dc40b5588':
                license_id = 'sprep-public-license'
            data_dict['license_id'] = license_id or 'notspecified'

            data_dict['created'] = _parse_drupal_date(package_dict['issued'])
            data_dict['modified'] = _parse_drupal_date(
                package_dict['modified'])

            contact_point = package_dict['contactPoint']
            c_point = contact_point['fn']
            # Keep what follows the last ':' of the hasEmail value
            c_email = contact_point['hasEmail'].split(':')[-1]
            if c_email != '*****@*****.**':
                data_dict['contact_uri'] = c_point
                data_dict['contact_email'] = c_email
            data_dict['resources'] = []
            for res in package_dict.get('distribution', []):

                res['url'] = res.get('downloadURL') or res.get('accessURL')
                # No-op assignment kept on purpose: it still raises KeyError
                # for a distribution without a 'format'
                res['format'] = res['format']
                res['name'] = res['title']
                res['description'] = markdown_extract(res.get('description'))
                data_dict['resources'].append(res)
            if 'spatial' in package_dict:
                data_dict['spatial'] = package_dict['spatial']
                # NOTE(review): if RE_SPATIAL.match() returns None the
                # AttributeError is not caught here and ends up in the outer
                # handler, failing the import - confirm that is intended
                try:
                    data_dict['spatial'] = json.dumps({
                        "type":
                        "Polygon",
                        "coordinates":
                        [[[float(c) for c in pair.split()]
                          for pair in RE_SPATIAL.match(
                              data_dict['spatial']).group(1).split(', ')]]
                    })
                except KeyError:
                    pass
            # add owner_org
            source_dataset = get_action('package_show')(
                {
                    'ignore_auth': True
                }, {
                    'id': harvest_object.source.id
                })

            owner_org = source_dataset.get('owner_org')
            data_dict['owner_org'] = owner_org
            data_dict['member_countries'] = country_mapping[None]
            if 'isPartOf' in package_dict:
                country = package_dict['isPartOf'].split('.')[0]
                data_dict['member_countries'] = country_mapping.get(
                    country, country_mapping[None])
                # Prefer the "<country>-data" organization when it exists
                org = model.Session.query(
                    model.Group).filter_by(name=country + '-data').first()
                if org:
                    data_dict['owner_org'] = org.id

            data_dict['source'] = package_dict.get('landingPage')

            data_dict['theme'] = package_dict.get('theme', [])

            data_dict['thematic_area_string'] = _map_theme_to_topic(
                data_dict['theme'])

            data_dict['harvest_source'] = 'SPREP'

            self._create_or_update_package(data_dict, harvest_object,
                                           'package_show')

            Session.commit()

            logger.debug("Finished record")
        except Exception:
            # Log with traceback and record the failure on the harvest
            # object; SystemExit and KeyboardInterrupt are left to propagate
            logger.exception('Something went wrong!')
            self._save_object_error('Exception in import stage',
                                    harvest_object)
            return False
        return True
Ejemplo n.º 32
0
def split_tags(tag):
    """Split a raw tag string into a list of munged tags.

    The input is first split on commas and every piece is split again on
    '>'. Each resulting fragment is passed through ``munge_tag``; fragments
    whose munged form is the empty string are dropped.

    :param tag: string holding one or more tags separated by ',' and/or '>'
    :returns: list of munged, non-empty tag strings, in input order
    """
    fragments = []
    for piece in tag.split(','):
        fragments.extend(piece.split('>'))
    # Munge each fragment exactly once, then filter out the empty results.
    munged = (munge_tag(fragment) for fragment in fragments)
    return [name for name in munged if name != '']
Ejemplo n.º 33
0
 def _clean_keywords(self, pkg_dict):
     """Return the per-language keywords of pkg_dict with every tag munged.

     The result maps each language found under pkg_dict['keywords'] to the
     list of its munged tags. The literal tag 'opendata.swiss' is left out.
     An empty dict is returned when pkg_dict has no 'keywords' entry.
     """
     if 'keywords' not in pkg_dict:
         return {}

     cleaned = {}
     for language, raw_tags in pkg_dict['keywords'].iteritems():
         kept = []
         for raw_tag in raw_tags:
             if raw_tag != 'opendata.swiss':
                 kept.append(munge_tag(raw_tag))
         cleaned[language] = kept
     return cleaned
Ejemplo n.º 34
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        The tags in package_dict['tags'] are munged and de-duplicated, and
        the cleaned list is written back to package_dict before saving.

        :returns: True if the package was created or updated. None if the
                  existing package did not need updating, or if validation
                  failed (the error is logged and saved against the harvest
                  object).

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')

                #TODO: use site user when available
                user_name = self.config.get('user', u'harvest')
            else:
                api_version = 2
                user_name = u'harvest'

            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
            }

            # Munge every tag and drop the duplicates
            package_dict['tags'] = list(
                set(munge_tag(t) for t in package_dict.get('tags', [])))

            # Check if package exists
            data_dict = {'id': package_dict['id']}
            try:
                existing_package_dict = get_action('package_show')(context, data_dict)
                # Check modified date: update when the remote date is unknown
                # or newer than the one stored locally
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    new_package = get_action('package_update_rest')(context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                    return

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object
                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except NotFound:
                # Package needs to be created
                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                get_action('package_create_rest')(context, package_dict)

            Session.commit()

            return True

        except ValidationError as e:
            # Validation failures are recorded against the harvest object; the
            # method then falls through and returns None.
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
Ejemplo n.º 35
0
 def import_stage(self, harvest_object):
     """Import a DDI codebook referenced by the harvest object into a Package.

     ``harvest_object.content`` is parsed as JSON and must contain a "url"
     key; the document at that URL is downloaded and parsed as XML. Title,
     name, author/maintainer, tags, notes and url of the package are filled
     from the study description (``stdyDscr``) and the document citation
     (``docDscr.citation``). For a package that did not exist before, the
     original XML and CSV exports of the variables are stored through OFS
     and attached as resources.

     Returns False (after saving an object error) when the content has no
     url, the url cannot be fetched or the XML cannot be parsed.

     NOTE(review): the visible body ends after filling ``metas``; nothing
     here stores ``metas``, saves ``pkg`` or returns a success value --
     confirm against the complete original.
     """
     try:
         xml_dict = {}
         xml_dict["source"] = harvest_object.content
         udict = json.loads(harvest_object.content)
         if "url" in udict:
             # Raw document bytes are kept in `f`; they are stored again below
             f = urllib2.urlopen(udict["url"]).read()
             ddi_xml = BeautifulSoup(f, "xml")
         else:
             self._save_object_error("No url in content!", harvest_object)
             return False
     except urllib2.URLError:
         self._save_object_error("Could not fetch from url %s!" % udict["url"], harvest_object)
         return False
     except etree.XMLSyntaxError:
         self._save_object_error("Unable to parse XML!", harvest_object)
         return False
     model.repo.new_revision()
     study_descr = ddi_xml.codeBook.stdyDscr
     document_info = ddi_xml.codeBook.docDscr.citation
     # Prefer the study title, fall back to the document title
     title = study_descr.citation.titlStmt.titl.string
     if not title:
         title = document_info.titlStmt.titl.string
     # The study IDNo is used as the package name
     name = study_descr.citation.titlStmt.IDNo.string
     # `update` records whether a package with this name already existed
     update = True
     pkg = Package.get(name)
     if not pkg:
         pkg = Package(name=name)
         update = False
     # Producer fallback chain: prodStmt.producer, rspStmt.AuthEnty, rspStmt.othId
     producer = study_descr.citation.prodStmt.producer
     if not producer:
         producer = study_descr.citation.rspStmt.AuthEnty
     if not producer:
         producer = study_descr.citation.rspStmt.othId
     pkg.author = producer.string
     pkg.maintainer = producer.string
     # A distributor contact, when present, overrides the maintainer
     if study_descr.citation.distStmt.contact:
         pkg.maintainer = study_descr.citation.distStmt.contact.string
     if document_info.titlStmt.IDNo:
         pkg.id = document_info.titlStmt.IDNo.string
     # Collect elements under <subject> whose name matches keyword/topcClas
     keywords = study_descr.stdyInfo.subject(re.compile("keyword|topcClas"))
     keywords = list(set(keywords))
     for kw in keywords:
         if kw:
             vocab = None
             kw_str = ""
             if kw.string:
                 kw_str = kw.string
             if "vocab" in kw.attrs:
                 vocab = kw.attrs.get("vocab", None)
             # Prefix the keyword with its vocabulary name when one is given
             if vocab and kw.string:
                 kw_str = vocab + " " + kw.string
             pkg.add_tag_by_name(munge_tag(kw_str))
     # Notes come from the abstract paragraphs, else from the series info
     if study_descr.stdyInfo.abstract:
         description_array = study_descr.stdyInfo.abstract("p")
     else:
         description_array = study_descr.citation.serStmt.serInfo("p")
     pkg.notes = "<br />".join([description.string for description in description_array])
     # Title is cut to its first 100 characters
     pkg.title = title[:100]
     pkg.url = udict["url"]
     if not update:
         # New package: store the original XML and attach it as a resource
         ofs = get_ofs()
         nowstr = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
         idno = study_descr.citation.titlStmt.IDNo
         agencyxml = (idno["agency"] if "agency" in idno.attrs else "") + idno.string
         label = "%s/%s.xml" % (nowstr, agencyxml)
         ofs.put_stream(BUCKET, label, f, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Original metadata record", format="xml", size=len(f))
         # Second resource points at the holdings URI (empty url when absent)
         pkg.add_resource(
             url=document_info.holdings["URI"] if "URI" in document_info.holdings else "", description=title
         )
     # Gather the text of every tag below the document citation and the
     # study description, keyed by tag name
     metas = {}
     descendants = [desc for desc in document_info.descendants] + [sdesc for sdesc in study_descr.descendants]
     for docextra in descendants:
         if isinstance(docextra, Tag):
             if docextra:
                 # <p> elements are renamed in place to their parent's name
                 if docextra.name == "p":
                     docextra.name = docextra.parent.name
                 if not docextra.name in metas and docextra.string:
                     metas[docextra.name] = docextra.string if docextra.string else self._collect_attribs(docextra)
                 else:
                     # Name already seen: append further text, space separated
                     if docextra.string:
                         metas[docextra.name] += (
                             " " + docextra.string if docextra.string else self._collect_attribs(docextra)
                         )
     # Only for new packages (`ofs` and `nowstr` are defined in that branch
     # above): export the variable descriptions as two CSV resources
     if ddi_xml.codeBook.dataDscr and not update:
         vars = ddi_xml.codeBook.dataDscr("var")
         heads = self._get_headers()
         c_heads = ["ID", "catValu", "labl", "catStat"]
         f_var = StringIO.StringIO()
         c_var = StringIO.StringIO()
         varwriter = csv.DictWriter(f_var, heads)
         codewriter = csv.DictWriter(c_var, c_heads)
         # Header rows are written by hand as name -> name mappings
         heading_row = {}
         for head in heads:
             heading_row[head] = head
         c_heading_row = {}
         for head in c_heads:
             c_heading_row[head] = head
         varwriter.writerow(heading_row)
         codewriter.writerow(c_heading_row)
         for var in vars:
             try:
                 varwriter.writerow(self._construct_csv(var, heads))
                 codewriter.writerows(self._create_code_rows(var))
             except ValueError, e:
                 raise IOError("Failed to import DDI to CSV! %s" % e)
         f_var.flush()
         label = "%s/%s_var.csv" % (nowstr, name)
         ofs.put_stream(BUCKET, label, f_var, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Variable metadata", format="csv", size=f_var.len)
         label = "%s/%s_code.csv" % (nowstr, name)
         ofs.put_stream(BUCKET, label, c_var, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Variable code values", format="csv", size=c_var.len)
         # Re-read the variable CSV to add one entry per variable ID to metas
         f_var.seek(0)
         reader = csv.DictReader(f_var)
         for var in reader:
             metas[var["ID"]] = var["labl"] if "labl" in var else var["qstnLit"]
Ejemplo n.º 36
0
 def munge_tag(self):
     """Munge the tag given in the request and return it via _finish_ok.

     The raw value is read from the 'tag' request parameter; the 'name'
     parameter is used when 'tag' is missing or empty.
     """
     raw_tag = request.params.get('tag')
     if not raw_tag:
         raw_tag = request.params.get('name')
     return self._finish_ok(munge.munge_tag(raw_tag))
Ejemplo n.º 37
0
 def test_munge_tag(self):
     '''Every tag in munge_list munges to its expected value.'''
     for original, expected in self.munge_list:
         assert_equal(munge_tag(original), expected)
Ejemplo n.º 38
0
 def munge_tag(self):
     """Munge the tag given in the request and return it via _finish_ok.

     The raw value is read from the 'tag' request parameter; the 'name'
     parameter is used when 'tag' is missing or empty.
     """
     raw_tag = request.params.get('tag')
     if not raw_tag:
         raw_tag = request.params.get('name')
     return self._finish_ok(munge.munge_tag(raw_tag))
    def import_stage(self, harvest_object):
        '''
        Build a package dict from the SDMX XML stored in
        ``harvest_object.content`` and create or update the dataset.

        The package gets a fixed thematic area and licence, the owner
        organisation of the harvest source, title/publisher/version from the
        ``Dataflow`` element, a Data Explorer link as source, a CSV resource
        (plus a metadata PDF when an ``EXT_RESOURCE`` annotation exists),
        notes, tags derived from category and concept names, and the SDG
        indicator names when the ``CL_SDG_SERIES`` codelist is present.

        A hash of the package dict is stored as the ``content_hash`` extra;
        when it equals the hash of the last finished import of the same guid
        the dataset is left untouched.

        :param harvest_object: the harvest object to import
        :returns: False if no harvest object was given or an exception was
                  raised during the import (the error is saved on the
                  object); True if the content is unchanged; otherwise the
                  result of ``_create_or_update_package``.
        '''
        log.debug('In DotStatHarvester import_stage')

        # Validate the argument before dereferencing it; reading the source
        # config from a missing object would raise before the error is saved.
        if not harvest_object:
            log.error('No harvest object received')
            self._save_object_error('No harvest object received',
                                    harvest_object)
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            # Parse the SDMX as XML with bs4
            soup = BeautifulSoup(harvest_object.content, 'xml')

            # Make a package dict
            pkg_dict = {}
            pkg_dict['id'] = harvest_object.guid

            # Added thematic string
            pkg_dict['thematic_area_string'] = ["Official Statistics"]

            # Open license for all dotStat resources
            pkg_dict['license_id'] = "other-open"

            # Get owner_org if there is one
            source_dataset = get_action('package_show')(
                {
                    'ignore_auth': True
                }, {
                    'id': harvest_object.source.id
                })
            pkg_dict['owner_org'] = source_dataset.get('owner_org')

            # Match other fields with tags in XML structure
            agency_id = self.config['agencyId']
            stats_guid = self._get_object_extra(harvest_object, 'stats_guid')

            structure = soup.find('Dataflow')
            version = structure['version']
            pkg_dict['title'] = structure.find('Name', {"xml:lang": "en"}).text
            pkg_dict['publisher_name'] = structure['agencyID']
            pkg_dict['version'] = version
            title = pkg_dict['title']

            # Need to change url to point to Data Explorer
            pkg_dict['source'] = 'https://stats.pacificdata.org/vis?locale=en&dataflow[datasourceId]=SPC2&dataflow[agencyId]={}&dataflow[dataflowId]={}&dataflow[version]={}'.format(
                agency_id,
                stats_guid,
                version
            )

            # Look for a link to the metadata data dictionary (if available).
            # A dataflow without an Annotations element simply has no link.
            annotation = structure.find('Annotations')
            annots = (annotation.find_all('Annotation')
                      if annotation is not None else [])
            metaurl = None
            for annot in annots:
                metalink = annot.find('AnnotationType')
                if metalink.text == 'EXT_RESOURCE':
                    # The annotation text is '|'-separated; the URL is the
                    # second field
                    metaurl = annot.find(
                        'AnnotationText', {'xml:lang': 'en'}).text.split('|')[1]

            # Default CSV resource, plus the metadata PDF if it exists
            resources = [{
                'url':
                'https://stats-nsi-stable.pacificdata.org/rest/data/{},{},{}/all/?format=csv'.format(
                    agency_id,
                    stats_guid,
                    version
                ),
                'format': 'CSV',
                'mimetype': 'CSV',
                'description': 'All data for {}'.format(title),
                'name': '{} Data CSV'.format(title)
            }]
            if metaurl:
                resources.append({
                    'url': metaurl,
                    'format': 'PDF',
                    'mimetype': 'PDF',
                    'description': 'Detailed metadata dictionary for {}'.format(title),
                    'name': '{} Metadata PDF'.format(title)
                })
            pkg_dict['resources'] = resources

            # Get notes/description if it exists
            try:
                desc = structure.find('Description', {"xml:lang": "en"}).text
                desc += '\nFind more Pacific data on PDH.stat : https://stats.pacificdata.org/'
                pkg_dict['notes'] = desc
            except Exception as e:
                log.error("An error occured: {}".format(e))
                pkg_dict['notes'] = 'Find more Pacific data on PDH.stat : https://stats.pacificdata.org/'

            # Add tags from CategoryScheme and ConceptScheme
            # List of uninteresting tags
            generic_schemes = ['Time', 'Frequency', 'Observation value', 'Observation Status', 'Confidentiality status', 'Unit of measure', 'Unit multiplier', 'Base period', 'Comment',
                'Decimals', 'Data source', 'Pacific Island Countries and territories', 'Indicator', 'Transformation', 'Reporting type', 'Composite breakdown']
            tag_strings = []

            # For finding Category Schemes for tags
            schemes = soup.find('CategorySchemes')
            if schemes is not None:
                for catscheme in schemes.find_all('CategoryScheme'):
                    for cat in catscheme.find_all('Category'):
                        found = cat.find('Name', {'xml:lang': 'en'}).text
                        if found not in tag_strings:
                            tag_strings.append(found)

            # For finding Concept Schemes for tags
            concepts = soup.find('Concepts')
            if concepts is not None:
                for concscheme in concepts.find_all('ConceptScheme'):
                    for concept in concscheme.find_all('Concept'):
                        found = concept.find('Name', {'xml:lang': 'en'}).text
                        if found not in tag_strings:
                            tag_strings.append(found)

            # Tag cleaning
            psp_mapping = {
                'Industry and Services': ['pacific-skills', 'industry', 'training'],
                'Education level': ['pacific-skills', 'education', 'training'],
                'Occupation': ['pacific-skills', 'occupation'],
                'Disability': ['pacific-skills', 'disability'],
                'Economic sector': ['pacific-skills', 'industry', 'training'],
                'Labour force status': ['pacific-skills', 'employment'],
                'Employment status': ['pacific-skills', 'employment'],
                'Labour and employment status': ['pacific-skills', 'employment']
            }

            if len(tag_strings) > 0:
                # Bring in PSP tags. They are collected first and appended
                # afterwards so the list is not modified while iterating it.
                psp_tags = []
                for tag in tag_strings:
                    psp_tags.extend(psp_mapping.get(tag, []))
                tag_strings.extend(psp_tags)
                # Remove duplicates
                tag_strings = list(set(tag_strings))
                # Remove tags found in generic_schemes list
                tags = [x.lower() for x in tag_strings if x not in generic_schemes]
                # Make a string of tags for CKAN
                pkg_dict['tag_string'] = ', '.join([munge_tag(tag) for tag in tags])

            # May need modifying when DF_SDG is broken into several DFs.
            # This gets the list of indicators for SDG-related dataflows and
            # stores the list of strings in the 'alternate_identifier' field.
            codelist = soup.find('Codelist', attrs={'id': 'CL_SDG_SERIES'})
            if codelist is not None:
                pkg_dict['alternate_identifier'] = []
                for indic in codelist.find_all('Name', {"xml:lang": "en"}):
                    # Skip the codelist's own heading
                    if not indic or indic.text == 'SDG Indicator or Series':
                        continue
                    pkg_dict['alternate_identifier'].append(indic.text)

            # When support for metadata endpoints arrives in .Stat, here is
            # how more metadata may be found (base_url being the harvest
            # source url):
            #
            #   # Use the metadata/flow endpoint
            #   metadata = requests.get('{}metadata/data/{}/all?detail=full'.format(base_url, harvest_object.guid))
            #
            #   # Parse with bs4
            #   parsed = BeautifulSoup(metadata.text, 'xml')
            #
            #   # Now search for tags which may be useful as metadata
            #   # example: getting the name and definition of metadata set
            #   # (may need tweaking depending on SPC's metadata setup)
            #
            #   # We can get name from the metadata structure
            #   set = parsed.find('MetadataSet')
            #   pkg_dict['name'] = set.find('Name').text
            #
            #   # Then we can go to the reported attribute structure for more details
            #   detail = set.find('ReportedAttribute', attrs={'id': 'DEF'})
            #   pkg_dict['notes'] = detail.find('StructuredText', attrs={'lang': 'en'})
            #   source_details = set.find('ReportedAttribute', attrs={'id': 'SOURCE_DEF'})
            #   pkg_dict['source'] = source_details.find('StructuredText', attrs={'lang': 'en'})

            log.debug('package dict: %s' % pkg_dict)
            content_hash = str(_hashify(pkg_dict))
            harvest_object.extras = [
                HarvestObjectExtra(key='content_hash',
                                   value=content_hash)
            ]

            harvest_object.save()

            # Most recently finished import of the same guid from this source
            prev_object = model.Session.query(HarvestObject).filter(
                HarvestObject.source == harvest_object.source,
                HarvestObject.guid == harvest_object.guid,
                ~HarvestObject.import_finished.is_(None)).order_by(
                    HarvestObject.import_finished.desc()).first()

            # On a first import there is no previous object, hence no hash to
            # compare against.
            obj_hash = None
            if prev_object is not None:
                obj_hash = self._get_object_extra(prev_object, 'content_hash')
            if obj_hash and obj_hash == content_hash:
                log.debug('Content is not changed. Skip..')
                return True

            # Create or update the package
            return self._create_or_update_package(
                pkg_dict, harvest_object, package_dict_form='package_show')
        except Exception as e:
            self._save_object_error(('Exception in import stage: %r / %s' %
                                     (e, traceback.format_exc())),
                                    harvest_object)
            return False
Ejemplo n.º 40
0
     return []
 else:
     projects = json.load(handle)['projects']
     for project in projects:
         log.debug(project['project_info'])
         # add dataset for project
         metadata = {
             'datasetID': self._get(project['project_info'], 'shortname'),
             'title': self._get(project['project_info'], 'longname'),
             'url': 'http://salsah.org/',
             'notes': 'This project is part of SALSAH.',
             # 'author': ,
             # 'maintainer': ,
             # 'maintainer-email': ,
             'license_id': self._get(project['project_info'], 'ckan_license_id'),
             'tags': [munge_tag(tag[:100]) for tag in self._get(project['project_info'], 'ckan_tags')],
             'resources': [{
                 'name': 'SALSAH API',
                 'resource_type': 'api',
                 'format': 'JSON',
                 'url': harvest_job.source.url.rstrip('/') + '?project=' + self._get(project['project_info'], 'shortname')
             }],
             'groups': [self._get(project['project_info'], 'longname')],
             'extras': [
                 ('level', 'Project')
             ]
         }
         
         pprint(metadata)
         
         obj = HarvestObject(
Ejemplo n.º 41
0
    def _create_or_update_package(self, package_dict, harvest_object,
                                  package_dict_form='rest'):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided.

        The package dictionary can be in one of two forms:

        1. 'rest' - as seen on the RESTful API:

                http://datahub.io/api/rest/dataset/1996_population_census_data_canada

           This is the legacy form. It is the default to provide backward
           compatibility.

           * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
           * 'tags' is a list of strings e.g. ['large-river', 'flood']

        2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

               http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

           * 'extras' is a list of dicts
                e.g. [{'key': 'theme', 'value': 'health'},
                        {'key': 'sub-theme', 'value': 'cancer'}]
           * 'tags' is a list of dicts
                e.g. [{'name': 'large-river'}, {'name': 'flood'}]

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        When the harvester config sets 'clean_tags', the tags are munged,
        empty results are dropped and duplicates removed before saving.

        :returns: The same as what import_stage should return. i.e. True if the
                  create or update occurred ok, 'unchanged' if it didn't need
                  updating or False if there were errors.
                  NOTE: on a validation error this implementation saves the
                  error and falls through, so the value actually returned is
                  None (falsy) rather than False.


        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''
        assert package_dict_form in ('rest', 'package_show')
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                # Munge each tag once, keep the non-empty results and drop
                # duplicates
                munged = (munge_tag(t) for t in package_dict.get('tags', []))
                package_dict['tags'] = list(set(m for m in munged if m != ''))

            # Check if package exists
            try:
                # _find_existing_package can be overridden if necessary
                existing_package_dict = self._find_existing_package(package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date: update when the remote date is unknown
                # or newer than the one stored locally
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})

                    new_package = p.toolkit.get_action(
                        'package_update' if package_dict_form == 'package_show'
                        else 'package_update_rest')(context, package_dict)

                else:
                    log.info('No changes to package with GUID %s, skipping...' % harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object
                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except p.toolkit.ObjectNotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(package_dict['title'])

                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                p.toolkit.get_action(
                    'package_create' if package_dict_form == 'package_show'
                    else 'package_create_rest')(context, package_dict)

            Session.commit()

            return True

        except p.toolkit.ValidationError as e:
            # Validation failures are recorded against the harvest object; the
            # method then falls through and returns None.
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
Ejemplo n.º 42
0
    def _create_or_update_package(self,
                                  package_dict,
                                  harvest_object,
                                  package_dict_form='rest'):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided.

        The package dictionary can be in one of two forms:

        1. 'rest' - as seen on the RESTful API:

                http://datahub.io/api/rest/dataset/1996_population_census_data_canada

           This is the legacy form. It is the default to provide backward
           compatibility.

           * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
           * 'tags' is a list of strings e.g. ['large-river', 'flood']

        2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

               http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

           * 'extras' is a list of dicts
                e.g. [{'key': 'theme', 'value': 'health'},
                        {'key': 'sub-theme', 'value': 'cancer'}]
           * 'tags' is a list of dicts
                e.g. [{'name': 'large-river'}, {'name': 'flood'}]

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :returns: The same as what import_stage should return. i.e. True if the
                  create or update occurred ok, 'unchanged' if it didn't need
                  updating or False if there were errors.


        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts
        '''
        assert package_dict_form in ('rest', 'package_show')
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                tags = package_dict.get('tags', [])
                tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
                tags = list(set(tags))
                package_dict['tags'] = tags

            # Check if package exists
            try:
                # _find_existing_package can be overridden if necessary
                existing_package_dict = self._find_existing_package(
                    package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date
                if not 'metadata_modified' in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info(
                        'Package with GUID %s exists and needs to be updated' %
                        harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    package_dict.setdefault('name',
                                            existing_package_dict['name'])

                    new_package = p.toolkit.get_action(
                        'package_update' if package_dict_form ==
                        'package_show' else 'package_update_rest')(
                            context, package_dict)

                else:
                    log.info(
                        'No changes to package with GUID %s, skipping...' %
                        harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except p.toolkit.ObjectNotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(
                        package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(
                        package_dict['title'])

                log.info(
                    'Package with GUID %s does not exist, let\'s create it' %
                    harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute(
                    'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = p.toolkit.get_action(
                    'package_create' if package_dict_form ==
                    'package_show' else 'package_create_rest')(context,
                                                               package_dict)

            Session.commit()

            return True

        except p.toolkit.ValidationError, e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
Ejemplo n.º 43
0
 metadata = {
     'datasetID':
     self._get(project['project_info'], 'shortname'),
     'title':
     self._get(project['project_info'], 'longname'),
     'url':
     'http://salsah.org/',
     'notes':
     'This project is part of SALSAH.',
     # 'author': ,
     # 'maintainer': ,
     # 'maintainer-email': ,
     'license_id':
     self._get(project['project_info'], 'ckan_license_id'),
     'tags': [
         munge_tag(tag[:100]) for tag in self._get(
             project['project_info'], 'ckan_tags')
     ],
     'resources': [{
         'name':
         'SALSAH API',
         'resource_type':
         'api',
         'format':
         'JSON',
         'url':
         harvest_job.source.url.rstrip('/') + '?project=' +
         self._get(project['project_info'], 'shortname')
     }],
     'groups': [self._get(project['project_info'], 'longname')],
     'extras': [('level', 'Project')]