Ejemplo n.º 1
0
    def import_stage(self, harvest_object):
        '''
        Import harvested OAI-PMH content into CKAN.

        Parses the JSON payload stored on ``harvest_object.content``,
        ensures a CKAN group exists for the source domain and, for every
        harvested record, creates or updates a package together with its
        tags, extras, resource and group/sub-group memberships.  The
        HarvestObject is linked to the package it produced and default
        user roles are set up.  A HarvestObjectError is stored when the
        fetch stage delivered no records.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        model.repo.new_revision()
        payload = json.loads(harvest_object.content)
        domain = payload['domain']
        # One top-level group per harvested domain.
        group = Group.get(domain) or Group(name=domain, description=domain)
        if 'records' not in payload:
            self._save_object_error('Could not receive any objects from fetch!'
                                    , harvest_object, stage='Import')
            return False
        set_name = payload['set_name']
        for identifier, metadata, _ in payload['records']:
            if not metadata:
                continue
            if len(metadata['title']):
                display_name = metadata['title'][0]
            else:
                display_name = identifier
            title = display_name
            # Build an ASCII-only, max 35 character slug for the package name.
            ascii_name = unicodedata.normalize('NFKD', display_name)\
                         .encode('ASCII', 'ignore')\
                         .lower().replace(' ', '_')[:35]
            allowed = string.ascii_letters + '_'
            name = ''.join(ch for ch in ascii_name if ch in allowed)
            creator = metadata['creator'][0] if len(metadata['creator']) else ''
            description = (metadata['description'][0]
                           if len(metadata['description']) else '')
            pkg = Package.by_name(name) or Package(name=name, title=title)
            extras = {}
            for key, value in metadata.items():
                if not len(value):
                    continue
                if key in ('subject', 'type'):
                    # Subject/type entries become CKAN tags.
                    for raw_tag in value:
                        if not raw_tag:
                            continue
                        tag_name = munge_tag(raw_tag[:100])
                        tag_obj = (model.Tag.by_name(tag_name)
                                   or model.Tag(name=tag_name))
                        if tag_obj:
                            pkg_tag = model.PackageTag(tag=tag_obj,
                                                       package=pkg)
                            Session.add(tag_obj)
                            Session.add(pkg_tag)
                else:
                    # Everything else is stored as a space-joined extra.
                    extras[key] = ' '.join(value)
            pkg.author = creator
            pkg.author_email = creator
            pkg.title = title
            pkg.notes = description
            pkg.extras = extras
            pkg.url = \
                "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"\
                % (harvest_object.job.source.url, identifier)
            pkg.save()
            # Flag the harvest object with the package it produced.
            harvest_object.package_id = pkg.id
            Session.add(harvest_object)
            setup_default_user_roles(pkg)
            # The last http identifier (if any) becomes the resource URL.
            url = ''
            for candidate in metadata['identifier']:
                if candidate.startswith('http://'):
                    url = candidate
            res_title = metadata['title'][0] if len(metadata['title']) else ''
            res_descr = (metadata['description'][0]
                         if len(metadata['description']) else '')
            pkg.add_resource(url, description=res_descr, name=res_title)
            group.add_package_by_name(pkg.name)
            # Sub-group per OAI-PMH set inside the domain group.
            subg_name = "%s - %s" % (domain, set_name)
            subgroup = (Group.by_name(subg_name)
                        or Group(name=subg_name, description=subg_name))
            subgroup.add_package_by_name(pkg.name)
            Session.add(group)
            Session.add(subgroup)
            setup_default_user_roles(group)
            setup_default_user_roles(subgroup)
        model.repo.commit()
        return True
Ejemplo n.º 2
0
 def import_stage(self, harvest_object):
     """Import the metadata received in the fetch stage to a dataset and
     create groups if ones are defined. Fill in metadata from study and
     document description.
     """
     try:
         xml_dict = {}
         xml_dict["source"] = harvest_object.content
         udict = json.loads(harvest_object.content)
         if "url" in udict:
             f = urllib2.urlopen(udict["url"]).read()
             ddi_xml = BeautifulSoup(f, "xml")
         else:
             self._save_object_error("No url in content!", harvest_object)
             return False
     except urllib2.URLError:
         self._save_object_error("Could not fetch from url %s!" % udict["url"], harvest_object)
         return False
     except etree.XMLSyntaxError:
         self._save_object_error("Unable to parse XML!", harvest_object)
         return False
     model.repo.new_revision()
     study_descr = ddi_xml.codeBook.stdyDscr
     document_info = ddi_xml.codeBook.docDscr.citation
     title = study_descr.citation.titlStmt.titl.string
     if not title:
         title = document_info.titlStmt.titl.string
     name = study_descr.citation.titlStmt.IDNo.string
     update = True
     pkg = Package.get(name)
     if not pkg:
         pkg = Package(name=name)
         update = False
     producer = study_descr.citation.prodStmt.producer
     if not producer:
         producer = study_descr.citation.rspStmt.AuthEnty
     if not producer:
         producer = study_descr.citation.rspStmt.othId
     pkg.author = producer.string
     pkg.maintainer = producer.string
     if study_descr.citation.distStmt.contact:
         pkg.maintainer = study_descr.citation.distStmt.contact.string
     if document_info.titlStmt.IDNo:
         pkg.id = document_info.titlStmt.IDNo.string
     keywords = study_descr.stdyInfo.subject(re.compile("keyword|topcClas"))
     keywords = list(set(keywords))
     for kw in keywords:
         if kw:
             vocab = None
             kw_str = ""
             if kw.string:
                 kw_str = kw.string
             if "vocab" in kw.attrs:
                 vocab = kw.attrs.get("vocab", None)
             if vocab and kw.string:
                 kw_str = vocab + " " + kw.string
             pkg.add_tag_by_name(munge_tag(kw_str))
     if study_descr.stdyInfo.abstract:
         description_array = study_descr.stdyInfo.abstract("p")
     else:
         description_array = study_descr.citation.serStmt.serInfo("p")
     pkg.notes = "<br />".join([description.string for description in description_array])
     pkg.title = title[:100]
     pkg.url = udict["url"]
     if not update:
         ofs = get_ofs()
         nowstr = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
         idno = study_descr.citation.titlStmt.IDNo
         agencyxml = (idno["agency"] if "agency" in idno.attrs else "") + idno.string
         label = "%s/%s.xml" % (nowstr, agencyxml)
         ofs.put_stream(BUCKET, label, f, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Original metadata record", format="xml", size=len(f))
         pkg.add_resource(
             url=document_info.holdings["URI"] if "URI" in document_info.holdings else "", description=title
         )
     metas = {}
     descendants = [desc for desc in document_info.descendants] + [sdesc for sdesc in study_descr.descendants]
     for docextra in descendants:
         if isinstance(docextra, Tag):
             if docextra:
                 if docextra.name == "p":
                     docextra.name = docextra.parent.name
                 if not docextra.name in metas and docextra.string:
                     metas[docextra.name] = docextra.string if docextra.string else self._collect_attribs(docextra)
                 else:
                     if docextra.string:
                         metas[docextra.name] += (
                             " " + docextra.string if docextra.string else self._collect_attribs(docextra)
                         )
     if ddi_xml.codeBook.dataDscr and not update:
         vars = ddi_xml.codeBook.dataDscr("var")
         heads = self._get_headers()
         c_heads = ["ID", "catValu", "labl", "catStat"]
         f_var = StringIO.StringIO()
         c_var = StringIO.StringIO()
         varwriter = csv.DictWriter(f_var, heads)
         codewriter = csv.DictWriter(c_var, c_heads)
         heading_row = {}
         for head in heads:
             heading_row[head] = head
         c_heading_row = {}
         for head in c_heads:
             c_heading_row[head] = head
         varwriter.writerow(heading_row)
         codewriter.writerow(c_heading_row)
         for var in vars:
             try:
                 varwriter.writerow(self._construct_csv(var, heads))
                 codewriter.writerows(self._create_code_rows(var))
             except ValueError, e:
                 raise IOError("Failed to import DDI to CSV! %s" % e)
         f_var.flush()
         label = "%s/%s_var.csv" % (nowstr, name)
         ofs.put_stream(BUCKET, label, f_var, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Variable metadata", format="csv", size=f_var.len)
         label = "%s/%s_code.csv" % (nowstr, name)
         ofs.put_stream(BUCKET, label, c_var, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Variable code values", format="csv", size=c_var.len)
         f_var.seek(0)
         reader = csv.DictReader(f_var)
         for var in reader:
             metas[var["ID"]] = var["labl"] if "labl" in var else var["qstnLit"]