Ejemplo n.º 1
0
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    """Create or update a CKAN package from a harvested OAI-DC record.

    :param data: dict produced by the fetch stage; must contain
        'identifier', 'package_name', 'package_url', 'metadata' (with an
        'oai_dc' record), 'package_resource' and 'package_xml_save'.
    :param namespaces: XML namespace map passed to the ``_handle_*`` helpers.
    :param group: Group every harvested package is added to (falsy to skip).
    :param harvest_object: HarvestObject to link to the package, or None.
    :returns: id of the created/updated package.
    """
    model.repo.new_revision()
    identifier = data['identifier']
    metadata_oai_dc = data['metadata']['oai_dc']
    titles = _handle_title(metadata_oai_dc.get('titleNode', []), namespaces)
    # Store title in pkg.title and keep all in extras as well. That way
    # UI will work some way in any case.
    title = titles.get('title_0', identifier)
    name = data['package_name']
    # '/' is not allowed in a package id, so escape it.
    esc_identifier = identifier.replace('/', '-')
    pkg = Package.get(esc_identifier)
    if not pkg:
        pkg = Package(name=name, title=title, id=esc_identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # There are old resources which are replaced by new ones if they are
        # relevant anymore so "delete" all existing resources now.
        for r in pkg.resources:
            r.state = 'deleted'
    extras = titles
    idx = 0
    for s in ('subject', 'type'):
        for tag in metadata_oai_dc.get(s, []):
            # Turn each subject or type field into its own tag.
            tagi = tag.strip()
            if tagi.startswith('http://www.yso.fi'):
                # YSO ontology URL: expand to its labels, keep source URL.
                tags = label_list_yso(tagi)
                extras['tag_source_%i' % idx] = tagi
                idx += 1
            elif tagi.startswith('http://') or tagi.startswith('https://'):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tagi in tags:
                tagi = tagi[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tagi)
                if not tag_obj:
                    tag_obj = model.Tag(name=tagi)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()  # Avoids duplicates if tags have duplicates.
    lastidx = 0
    for auth in metadata_oai_dc.get('creator', []):
        extras['organization_%d' % lastidx] = ''
        extras['author_%d' % lastidx] = auth
        lastidx += 1
    extras.update(_handle_contributor(metadata_oai_dc.get('contributorNode', []), namespaces))
    extras.update(_handle_publisher(metadata_oai_dc.get('publisherNode', []), namespaces))
    # This value belongs to elsewhere.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(metadata_oai_dc.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # There may be multiple identifiers (URL, ISBN, ...) in the metadata.
    id_idx = 0
    for ident in metadata_oai_dc.get('identifier', []):
        extras['identifier_%i' % id_idx] = ident
        id_idx += 1
    # Check that we have a language.
    lang = metadata_oai_dc.get('language', [])
    if lang and len(lang[0]) > 1:
        pkg.language = lang[0]
    if 'date' in extras:
        pkg.version = extras['date']
        del extras['date']
    pkg.url = data['package_url']

    # Metadata may have different identifiers, pick link, if exists.
    # BUG FIX: use .get() so a record without any 'identifier' field does
    # not raise KeyError; the rest of the function already treats the
    # field as optional.
    for ids in metadata_oai_dc.get('identifier', []):
        if ids.startswith('http://') or ids.startswith('https://'):
            pkg.add_resource(ids, name=pkg.title, format='html')
    # All belong to the main group even if they do not belong to any set.
    if group:
        group.add_package_by_name(pkg.name)
    # The rest of the metadata fields become extras; the description field
    # goes to pkg.notes instead.
    for mdp, metadata in data['metadata'].items():
        for key, value in metadata.items():
            if value is None or len(value) == 0 or key in ('titleNode', 'subject', 'type', 'rightsNode',
                                                           'publisherNode', 'creator', 'contributorNode',
                                                           'description', 'identifier', 'language', 'formatNode'):
                continue
            extras[key] = ' '.join(value)
        notes = ' '.join(metadata.get('description', []))
        pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    # BUG FIX: attach extras only after every source has contributed keys.
    # Assigning pkg.extras appears to snapshot the dict (association-proxy
    # style), so keys added to the local dict afterwards were dropped
    # before; the sibling _oai_dc2ckan variant also assigns extras last.
    pkg.extras = extras

    for mdp, resource in data['package_resource'].items():
        # Persist the original harvested XML alongside the resource.
        ofs = get_ofs()
        ofs.put_stream(BUCKET, data['package_xml_save'][mdp]['label'], data['package_xml_save'][mdp]['xml'], {})
        pkg.add_resource(**resource)

    if harvest_object:
        # Link the harvest object to the package and flag it as current.
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()

    model.repo.commit()
    return pkg.id
Ejemplo n.º 2
0
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    """Create or update a CKAN package from a harvested Dublin Core record.

    :param data: dict from the fetch stage with 'identifier',
        'package_name', 'package_url', 'metadata' and, optionally,
        'package_resource' / 'package_xml_save'.
    :param namespaces: XML namespace map for the ``_handle_*`` helpers.
    :param group: Group the package is added to, or None.
    :param harvest_object: HarvestObject to link to the package, or None.
    :returns: id of the created/updated package.
    """
    model.repo.new_revision()
    identifier = data['identifier']
    metadata = data['metadata']
    # Store title in pkg.title and keep all in extras as well. That way
    # UI will work some way in any case.
    # BUG FIX: metadata.get('title', identifier)[0] used to index into the
    # identifier STRING when no title was present, storing only its first
    # character as the title. Fall back to the full identifier instead
    # (this also covers a present-but-empty title list).
    title = (metadata.get('title') or [identifier])[0]
    name = data['package_name']
    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name, title=title, id=identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # There are old resources which are replaced by new ones if they are
        # relevant anymore so "delete" all existing resources now.
        for r in pkg.resources:
            r.state = 'deleted'
    extras = {}
    idx = 0
    for s in ('subject', 'type',):
        for tag in metadata.get(s, []):
            # Turn each subject or type field into its own tag.
            tagi = tag.strip()
            if tagi.startswith('http://') or tagi.startswith('https://'):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tagi in tags:
                tagi = tagi[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tagi)
                if not tag_obj:
                    tag_obj = model.Tag(name=tagi)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                        model.PackageTag.tag_id == tag_obj.id
                    ).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()  # Avoids duplicates if tags have duplicates.
    extras.update(
        _handle_contributor(metadata.get('contributorNode', []), namespaces))
    extras.update(
        _handle_publisher(metadata.get('publisherNode', []), namespaces))
    # This value belongs to elsewhere.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(metadata.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # Check that we have a language.
    lang = metadata.get('language', [])
    if lang and len(lang[0]) > 1:
        pkg.language = lang[0]
    # The remaining fields go to extras; fields already handled above are
    # skipped, and description goes to pkg.notes instead.
    for key, value in metadata.items():
        if value is None or len(value) == 0 or key in (
            'title',
            'description',
            'publisherNode',
            'contributorNode',
            'formatNode',
            'identifier',
            'source',
            'rightsNode'
        ):
            continue
        extras[key] = value[0]
    notes = ' '.join(metadata.get('description', []))
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    if 'date' in extras:
        pkg.version = extras['date']
        extras['modified'] = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']
    if 'package_resource' in data:
        try:
            # Persist the original harvested XML alongside the resource.
            ofs = get_ofs()
            ofs.put_stream(BUCKET, data['package_xml_save']['label'], data['package_xml_save']['xml'], {})
            pkg.add_resource(**(data['package_resource']))
        except KeyError:
            # Best effort: a missing 'package_xml_save' entry simply skips
            # the original-XML resource.
            pass
    if harvest_object is not None:
        # Link the harvest object to the package and flag it as current.
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()
    # Metadata may have different identifiers, pick link, if exists.

    # See: https://github.com/okfn/ckan/blob/master/ckan/public/base/images/sprite-resource-icons.png
    # "Data" format is used by CKAN to identify unknown resources.
    # You can use it if you want (default format is "html"). For example:
    # - http://my.data.com/my-generated-resource?data
    # - http://my.data.com/my-resource.data
    available_formats = ['data', 'rdf', 'pdf', 'api', 'zip', 'xls', 'csv', 'txt', 'xml', 'json', 'html']
    default_format = 'html'

    # BUG FIX: use .get() so records without an 'identifier' field do not
    # raise KeyError (the field is treated as optional elsewhere).
    for ids in metadata.get('identifier', []):
        if ids.startswith('http://') or ids.startswith('https://'):
            # The end of the URL must be the format, otherwise it will use "html" by default
            infer_format = default_format

            for ext in available_formats:
                if ids.endswith(ext):
                    infer_format = ext

            pkg.add_resource(ids, name=pkg.title, format=infer_format)
    # All belong to the main group even if they do not belong to any set.
    if group is not None:
        group.add_package_by_name(pkg.name)
    model.repo.commit()
    return pkg.id
Ejemplo n.º 3
0
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        model.repo.new_revision()
        # The fetch stage stored a JSON payload: the source 'domain' and,
        # on success, the harvested 'records' plus the OAI set they belong to.
        master_data = json.loads(harvest_object.content)
        domain = master_data['domain']
        # One top-level group per harvested domain; created on first use.
        group = Group.get(domain)
        if not group:
            group = Group(name=domain, description=domain)
        if 'records' in master_data:
            records = master_data['records']
            set_name = master_data['set_name']
            for rec in records:
                # Each record is a (identifier, metadata, about?) triple;
                # the third element is unused here.
                identifier, metadata, _ = rec
                if metadata:
                    # First title (if any) becomes the display title and the
                    # basis of the package name; otherwise the identifier.
                    # NOTE(review): direct metadata['title'] access assumes
                    # the key is always present — confirm against the
                    # fetch-stage output.
                    name = metadata['title'][0] if len(metadata['title'])\
                                                else identifier
                    title = name
                    # Build a short ASCII slug for the package name:
                    # strip accents, lowercase, spaces to underscores,
                    # cap at 35 chars, then keep only letters/underscores.
                    norm_title = unicodedata.normalize('NFKD', name)\
                                 .encode('ASCII', 'ignore')\
                                 .lower().replace(' ', '_')[:35]
                    slug = ''.join(e for e in norm_title
                                    if e in string.ascii_letters + '_')
                    name = slug
                    creator = metadata['creator'][0]\
                                if len(metadata['creator']) else ''
                    description = metadata['description'][0]\
                                if len(metadata['description']) else ''
                    pkg = Package.by_name(name)
                    if not pkg:
                        pkg = Package(name=name, title=title)
                    extras = {}
                    for met in metadata.items():
                        key, value = met
                        if len(value) > 0:
                            if key == 'subject' or key == 'type':
                                # Subjects and types become tags; everything
                                # else is joined into a single extras value.
                                for tag in value:
                                    if tag:
                                        # Munge to a valid tag name;
                                        # 100 char limit in DB.
                                        tag = munge_tag(tag[:100])
                                        tag_obj = model.Tag.by_name(tag)
                                        if not tag_obj:
                                            tag_obj = model.Tag(name=tag)
                                        if tag_obj:
                                            pkgtag = model.PackageTag(
                                                                  tag=tag_obj,
                                                                  package=pkg)
                                            Session.add(tag_obj)
                                            Session.add(pkgtag)
                            else:
                                extras[key] = ' '.join(value)
                    pkg.author = creator
                    pkg.author_email = creator
                    pkg.title = title
                    pkg.notes = description
                    pkg.extras = extras
                    # Point the package URL back at the OAI-PMH GetRecord
                    # request for this identifier.
                    pkg.url = \
                    "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"\
                                % (harvest_object.job.source.url, identifier)
                    pkg.save()
                    # Link the harvest object to the created/updated package.
                    harvest_object.package_id = pkg.id
                    Session.add(harvest_object)
                    setup_default_user_roles(pkg)
                    # Use the last http identifier (if any) as the resource
                    # URL; empty string otherwise.
                    url = ''
                    for ids in metadata['identifier']:
                        if ids.startswith('http://'):
                            url = ids
                    title = metadata['title'][0] if len(metadata['title'])\
                                                    else ''
                    description = metadata['description'][0]\
                                    if len(metadata['description']) else ''
                    pkg.add_resource(url, description=description, name=title)
                    # Add the package to the domain group and to a
                    # "<domain> - <set>" subgroup, creating it if needed.
                    group.add_package_by_name(pkg.name)
                    subg_name = "%s - %s" % (domain, set_name)
                    subgroup = Group.by_name(subg_name)
                    if not subgroup:
                        subgroup = Group(name=subg_name, description=subg_name)
                    subgroup.add_package_by_name(pkg.name)
                    Session.add(group)
                    Session.add(subgroup)
                    setup_default_user_roles(group)
                    setup_default_user_roles(subgroup)
            model.repo.commit()
        else:
            # Fetch stage produced no records: record the error and fail.
            self._save_object_error('Could not receive any objects from fetch!'
                                    , harvest_object, stage='Import')
            return False
        return True
Ejemplo n.º 4
0
 def import_stage(self, harvest_object):
     """Import the metadata received in the fetch stage to a dataset and
     create groups if ones are defined. Fill in metadata from study and
     document description.
     """
     try:
         xml_dict = {}
         xml_dict["source"] = harvest_object.content
         udict = json.loads(harvest_object.content)
         if "url" in udict:
             f = urllib2.urlopen(udict["url"]).read()
             ddi_xml = BeautifulSoup(f, "xml")
         else:
             self._save_object_error("No url in content!", harvest_object)
             return False
     except urllib2.URLError:
         self._save_object_error("Could not fetch from url %s!" % udict["url"], harvest_object)
         return False
     except etree.XMLSyntaxError:
         self._save_object_error("Unable to parse XML!", harvest_object)
         return False
     model.repo.new_revision()
     study_descr = ddi_xml.codeBook.stdyDscr
     document_info = ddi_xml.codeBook.docDscr.citation
     title = study_descr.citation.titlStmt.titl.string
     if not title:
         title = document_info.titlStmt.titl.string
     name = study_descr.citation.titlStmt.IDNo.string
     update = True
     pkg = Package.get(name)
     if not pkg:
         pkg = Package(name=name)
         update = False
     producer = study_descr.citation.prodStmt.producer
     if not producer:
         producer = study_descr.citation.rspStmt.AuthEnty
     if not producer:
         producer = study_descr.citation.rspStmt.othId
     pkg.author = producer.string
     pkg.maintainer = producer.string
     if study_descr.citation.distStmt.contact:
         pkg.maintainer = study_descr.citation.distStmt.contact.string
     if document_info.titlStmt.IDNo:
         pkg.id = document_info.titlStmt.IDNo.string
     keywords = study_descr.stdyInfo.subject(re.compile("keyword|topcClas"))
     keywords = list(set(keywords))
     for kw in keywords:
         if kw:
             vocab = None
             kw_str = ""
             if kw.string:
                 kw_str = kw.string
             if "vocab" in kw.attrs:
                 vocab = kw.attrs.get("vocab", None)
             if vocab and kw.string:
                 kw_str = vocab + " " + kw.string
             pkg.add_tag_by_name(munge_tag(kw_str))
     if study_descr.stdyInfo.abstract:
         description_array = study_descr.stdyInfo.abstract("p")
     else:
         description_array = study_descr.citation.serStmt.serInfo("p")
     pkg.notes = "<br />".join([description.string for description in description_array])
     pkg.title = title[:100]
     pkg.url = udict["url"]
     if not update:
         ofs = get_ofs()
         nowstr = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
         idno = study_descr.citation.titlStmt.IDNo
         agencyxml = (idno["agency"] if "agency" in idno.attrs else "") + idno.string
         label = "%s/%s.xml" % (nowstr, agencyxml)
         ofs.put_stream(BUCKET, label, f, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Original metadata record", format="xml", size=len(f))
         pkg.add_resource(
             url=document_info.holdings["URI"] if "URI" in document_info.holdings else "", description=title
         )
     metas = {}
     descendants = [desc for desc in document_info.descendants] + [sdesc for sdesc in study_descr.descendants]
     for docextra in descendants:
         if isinstance(docextra, Tag):
             if docextra:
                 if docextra.name == "p":
                     docextra.name = docextra.parent.name
                 if not docextra.name in metas and docextra.string:
                     metas[docextra.name] = docextra.string if docextra.string else self._collect_attribs(docextra)
                 else:
                     if docextra.string:
                         metas[docextra.name] += (
                             " " + docextra.string if docextra.string else self._collect_attribs(docextra)
                         )
     if ddi_xml.codeBook.dataDscr and not update:
         vars = ddi_xml.codeBook.dataDscr("var")
         heads = self._get_headers()
         c_heads = ["ID", "catValu", "labl", "catStat"]
         f_var = StringIO.StringIO()
         c_var = StringIO.StringIO()
         varwriter = csv.DictWriter(f_var, heads)
         codewriter = csv.DictWriter(c_var, c_heads)
         heading_row = {}
         for head in heads:
             heading_row[head] = head
         c_heading_row = {}
         for head in c_heads:
             c_heading_row[head] = head
         varwriter.writerow(heading_row)
         codewriter.writerow(c_heading_row)
         for var in vars:
             try:
                 varwriter.writerow(self._construct_csv(var, heads))
                 codewriter.writerows(self._create_code_rows(var))
             except ValueError, e:
                 raise IOError("Failed to import DDI to CSV! %s" % e)
         f_var.flush()
         label = "%s/%s_var.csv" % (nowstr, name)
         ofs.put_stream(BUCKET, label, f_var, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Variable metadata", format="csv", size=f_var.len)
         label = "%s/%s_code.csv" % (nowstr, name)
         ofs.put_stream(BUCKET, label, c_var, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Variable code values", format="csv", size=c_var.len)
         f_var.seek(0)
         reader = csv.DictReader(f_var)
         for var in reader:
             metas[var["ID"]] = var["labl"] if "labl" in var else var["qstnLit"]