Code Example #1
File: delineatewatershed.py Project: CI-WATER/portal
def _upload_file(self, file_path):
    """
    Uploads a file to the CKAN FileStore and returns file metadata
    related to its existence in CKAN.
    :param file_path: name of the file with its current location (path)
    """
    source = 'delineate.delineatewatershed._upload_file():'
    # this code has been implemented based on the code for the upload_handle()
    # method in storage.py
    bucket_id = base.config.get('ckan.storage.bucket', 'default')
    ts = datetime.now().isoformat().split(".")[0]  # '2010-07-08T19:56:47'
    file_name = os.path.basename(file_path).replace(' ', '-')  # ueb request.txt -> ueb-request.txt
    file_key = os.path.join(ts, file_name)
    label = file_key
    params = {}
    params['filename_original'] = os.path.basename(file_path)
    params['key'] = file_key
    try:
        with open(file_path, 'r') as file_obj:
            ofs = storage.get_ofs()
            resource_metadata = ofs.put_stream(bucket_id, label, file_obj, params)
            log.info(source + 'File upload was successful for file: %s' % file_path)
    except Exception as e:
        log.error(source + 'Failed to upload file: %s \nException %s' % (file_path, e))
        tk.abort(400, _('Failed to upload file: %s') % file_path)

    return resource_metadata
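
For orientation, a hedged sketch (not from the source) of how a label stored this way maps to a download URL: the /storage/f/<label> pattern and the localhost host are taken from the test examples further down, and urllib.quote escapes the colons in the ISO timestamp.

# Hedged sketch; the label value is hypothetical.
import urllib

label = '2010-07-08T19:56:47/ueb-request.txt'
url = 'http://localhost:5000/storage/f/%s' % urllib.quote(label)
# -> 'http://localhost:5000/storage/f/2010-07-08T19%3A56%3A47/ueb-request.txt'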
Code Example #2
File: helpers.py Project: ngds/ckanext-geoserver
def get_url_for_file(label):
    """
    Returns the URL for a file given its label.
    """
    bucket = config.get('ckan.storage.bucket', 'default')
    ofs = storage.get_ofs()
    return ofs.get_url(bucket, label).replace("file://", "")
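
A brief usage sketch, assuming the default pairtree backend that hands back file:// URLs (which is why the function strips that scheme): the result is a local filesystem path that can be opened directly. The label below is hypothetical.

# Hedged usage sketch; the label is hypothetical.
local_path = get_url_for_file('2013-11-05T18:10:19/1049_var.csv')
with open(local_path, 'rb') as f:
    contents = f.read()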
Code Example #3
File: test_layer.py Project: okfn/ckanext-ngds
    def setUp(self):
        try:
            self._test_package = toolkit.get_action("package_show")(
                {
                    "user": self.admin_user().name
                }, {
                    "id": self._test_package_name
                })
        except:
            # the package may not exist yet; fall through to creation below
            self._test_package = None

        if not self._test_package:
            # Add a package
            self._test_package = self.add_package(self._test_package_name)

            # "Upload" shapefile to the package
            ofs = storage.get_ofs()
            label = "%s/test_shapefile_wgs84.zip" % datetime.now().isoformat()
            anything = ofs.put_stream(
                config.get('ckan.storage.bucket', 'default'),  # bucket
                label,  # label
                open(test_shapefile_path, "r"),  # file stream
                {"key": label}  # params
            )
            # Add a resource
            self._test_package = self.add_resource(
                self._test_package["id"], {
                    "package_id": self._test_package["id"],
                    "url": "http://localhost:5000/storage/f/%s" % label
                })

        self._test_resource = self._test_package.get("resources", [None])[0]
Code Example #4
def get_url_for_file(label):
    # storage_controller = StorageController()
    resourcename_fullpath = None
    try:
        ofs = storage.get_ofs()
        BUCKET = config.get('ckan.storage.bucket', 'default')
        resourcename_fullpath = ofs.get_url(BUCKET, label)
    except:
        pass
    return resourcename_fullpath
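
Unlike the helpers.py variant above, this version swallows all errors and returns None, so a usage sketch has to guard the result (the label is hypothetical, and the file:// stripping mirrors the other variant):

# Hedged usage sketch; callers must handle the None case.
fullpath = get_url_for_file('2010-07-08T19:56:47/ueb-request.txt')
if fullpath is not None:
    local_path = fullpath.replace('file://', '')  # local path, as in the other variant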
Code Example #5
File: helpers.py Project: CI-WATER/portal
def retrieve_file_object_from_file_store(file_filestore_path):
    """
    returns a file obj (in read mode) for the provided file in the ckan file store
    which the caller then can use to read the contents of the file (file_obj.read())
    param file_filestore_path : filecreationdatetime/followed by the filename
    """
    bucket_id = base.config.get('ckan.storage.bucket', 'default')
    ofs = storage.get_ofs()
    file_obj = ofs.get_stream(bucket_id, file_filestore_path)

    return file_obj
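
A short, hedged usage sketch: read the returned stream and make sure it gets closed (the filestore path is hypothetical).

# Hedged usage sketch; the filestore path is hypothetical.
file_obj = retrieve_file_object_from_file_store('2010-07-08T19:56:47/ueb-request.txt')
try:
    contents = file_obj.read()
finally:
    file_obj.close()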
Code Example #6
def _save_ddi_variables_to_csv(self, name, pkg, harvest_object):
    # JuhoL: Handle codeBook.dataDscr parts, extract data (eg. questionnaire)
    # variables etc.
    # Saves <var>...</var> elements to a csv file accessible at:
    # <ckan_url>/storage/f/2013-11-05T18%3A10%3A19.686858/1049_var.csv
    # And separately saves <catgry> elements inside <var> to a csv as a
    # resource for the package.
    # Assumes that dataDscr has not changed. Valid?
    data_dscr = "ddi_xml.codeBook.dataDscr"
    try:
        ofs = storage.get_ofs()
    except IOError as ioe:
        log.debug('Unable to save xml variables: {io}'.format(io=ioe))
        self.errors.append('Unable to save xml variables: {io}'.format(io=ioe))
        return u''
Code Example #7
    def add_shapefile_resource(self, package_name, filepath=test_shapefile_path):
        # Add a package
        p = self.add_package(package_name)

        # "Upload" shapefile to the package
        ofs = storage.get_ofs()
        label = "%s/%s" % (datetime.now().isoformat(), shapefile_name)
        anything = ofs.put_stream(
            config.get('ckan.storage.bucket', 'default'), # bucket
            label, # label
            open(filepath, "r"), # file stream
            {"key": label} # params
        )

        # Add a resource
        package = self.add_resource(p["id"], {"package_id": p["id"], "url": "http://localhost:5000/storage/f/%s" % label})
        return package.get("resources", [None])[0]
Code Example #8
File: forms.py Project: etalab/weckan
def handle_upload(request, field, user=None):
    from ckan.controllers import storage

    if not isinstance(field.data, cgi.FieldStorage):
        return None

    filename, ext = splitext(field.data.filename)
    filename = strings.slugify(filename)
    filename = ''.join([filename, ext])
    filename = '{ts:%Y-%m-%dT%H-%M-%S}/{name}'.format(name=filename, ts=datetime.now())
    ofs = storage.get_ofs()
    ofs.put_stream(STORAGE_BUCKET, filename, field.data.file, {
        'filename-original': field.data.filename,
        'uploaded-by': user.name if user else '',
    })
    root = conf['home_url']
    if root.startswith('//'):
        root = root.replace('//', 'https://' if conf['https'] else 'http://', 1)
    path = urls.get_url(None, 'storage/f', filename)
    return ''.join([root, path])
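
Note the label format here: unlike the other examples, the timestamp part uses hyphens (%H-%M-%S) rather than colons, so the label needs no URL-escaping. A minimal sketch of the value it produces (the filename is hypothetical):

# Hedged sketch of the label format built above.
from datetime import datetime

label = '{ts:%Y-%m-%dT%H-%M-%S}/{name}'.format(name='my-data.csv', ts=datetime.now())
# e.g. '2014-03-01T12-30-05/my-data.csv' -- no colons, so no %3A escaping needed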
Code Example #9
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    model.repo.new_revision()
    identifier = data['identifier']
    metadata_oai_dc = data['metadata']['oai_dc']
    titles = _handle_title(metadata_oai_dc.get('titleNode', []), namespaces)
    # Store title in pkg.title and keep all in extras as well. That way
    # UI will work some way in any case.
    title = titles.get('title_0', identifier)
    #title = metadata['title'][0] if len(metadata['title']) else identifier
    name = data['package_name']
    esc_identifier = identifier.replace('/', '-')
    pkg = Package.get(esc_identifier)
    if not pkg:
        pkg = Package(name=name, title=title, id=esc_identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # There are old resources which are replaced by new ones if they are
        # relevant anymore so "delete" all existing resources now.
        for r in pkg.resources:
            r.state = 'deleted'
    extras = titles
    idx = 0
    for s in ('subject', 'type'):
        for tag in metadata_oai_dc.get(s, []):
            # Turn each subject or type field into its own tag.
            tagi = tag.strip()
            if tagi.startswith('http://www.yso.fi'):
                tags = label_list_yso(tagi)
                extras['tag_source_%i' % idx] = tagi
                idx += 1
            elif tagi.startswith('http://') or tagi.startswith('https://'):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tagi in tags:
                tagi = tagi[:100]  # 100 char limit in DB.
                #tagi = munge_tag(tagi[:100]) # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tagi)
                if not tag_obj:
                    tag_obj = model.Tag(name=tagi)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()  # Avoids duplicates if tags have duplicates.
    lastidx = 0
    for auth in metadata_oai_dc.get('creator', []):
        extras['organization_%d' % lastidx] = ''
        extras['author_%d' % lastidx] = auth
        lastidx += 1
    extras.update(_handle_contributor(metadata_oai_dc.get('contributorNode', []), namespaces))
    extras.update(_handle_publisher(metadata_oai_dc.get('publisherNode', []), namespaces))
    # This value belongs to elsewhere.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(metadata_oai_dc.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # Causes failure in commit for some reason.
    #for f in _handle_format(metadata.get('formatNode', []), namespaces):
    #    pprint.pprint(f)
    #    pkg.add_resource(**f)
    # There may be multiple identifiers (URL, ISBN, ...) in the metadata.
    id_idx = 0
    for ident in metadata_oai_dc.get('identifier', []):
        extras['identifier_%i' % id_idx] = ident
        id_idx += 1
    # Check that we have a language.
    lang = metadata_oai_dc.get('language', [])
    if lang and len(lang) and len(lang[0]) > 1:
        pkg.language = lang[0]
    if 'date' in extras:
        pkg.version = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']
    
    # Metadata may have different identifiers, pick link, if exists.
    for ids in metadata_oai_dc['identifier']:
        if ids.startswith('http://') or ids.startswith('https://'):
            pkg.add_resource(ids, name=pkg.title, format='html')
    # All belong to the main group even if they do not belong to any set.
    if group:
        group.add_package_by_name(pkg.name)
    # The rest.
    # description below goes to pkg.notes. I think it should not be added here.
    for mdp, metadata in data['metadata'].items():
        for key, value in metadata.items():
            if value is None or len(value) == 0 or key in ('titleNode', 'subject', 'type', 'rightsNode',
                                                           'publisherNode', 'creator', 'contributorNode',
                                                           'description', 'identifier', 'language', 'formatNode'):
                continue
            extras[key] = ' '.join(value)
        #description = metadata['description'][0] if len(metadata['description']) else ''
        notes = ' '.join(metadata.get('description', []))
        pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    
    for mdp, resource in data['package_resource'].items():
        ofs = get_ofs()
        ofs.put_stream(BUCKET, data['package_xml_save'][mdp]['label'], data['package_xml_save'][mdp]['xml'], {})
        pkg.add_resource(**(resource))
    
    if harvest_object:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()
    
    model.repo.commit()
    return pkg.id
Code Example #10
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    model.repo.new_revision()
    identifier = data['identifier']
    metadata = data['metadata']
    # Store title in pkg.title and keep all in extras as well. That way
    # UI will work some way in any case.
    title = metadata.get('title', [identifier])[0]  # wrap the default in a list so [0] yields the identifier
    #title = metadata['title'][0] if len(metadata['title']) else identifier
    name = data['package_name']
    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name, title=title, id=identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # There are old resources which are replaced by new ones if they are
        # relevant anymore so "delete" all existing resources now.
        for r in pkg.resources:
            r.state = 'deleted'
    extras = {}
    idx = 0
    for s in ('subject', 'type',):
        for tag in metadata.get(s, []):
            # Turn each subject or type field into its own tag.
            tagi = tag.strip()
            if tagi.startswith('http://') or tagi.startswith('https://'):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tagi in tags:
                tagi = tagi[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tagi)
                if not tag_obj:
                    tag_obj = model.Tag(name=tagi)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                        model.PackageTag.tag_id == tag_obj.id
                    ).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()  # Avoids duplicates if tags have duplicates.
    extras.update(
        _handle_contributor(metadata.get('contributorNode', []), namespaces))
    extras.update(
        _handle_publisher(metadata.get('publisherNode', []), namespaces))
    # This value belongs to elsewhere.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(metadata.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # Check that we have a language.
    lang = metadata.get('language', [])
    if lang is not None and len(lang) and len(lang[0]) > 1:
        pkg.language = lang[0]
    # The rest.
    # description below goes to pkg.notes. I think it should not be added here.
    for key, value in metadata.items():
        if value is None or len(value) == 0 or key in (
            'title',
            'description',
            'publisherNode',
            'contributorNode',
            'formatNode',
            'identifier',
            'source',
            'rightsNode'
        ):
            continue
        extras[key] = value[0]
    #description = metadata['description'][0] if len(metadata['description']) else ''
    notes = ' '.join(metadata.get('description', []))
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    if 'date' in extras:
        pkg.version = extras['date']
        extras['modified'] = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']
    if 'package_resource' in data:
        try:
            ofs = get_ofs()
            ofs.put_stream(BUCKET, data['package_xml_save']['label'], data['package_xml_save']['xml'], {})
            pkg.add_resource(**(data['package_resource']))
        except KeyError:
            pass
    if harvest_object is not None:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()
    # Metadata may have different identifiers, pick link, if exists.

    # See: https://github.com/okfn/ckan/blob/master/ckan/public/base/images/sprite-resource-icons.png
    # "Data" format is used by CKAN to identify unknown resources.
    # You can use it if you want (default format is "html"). For example:
    # - http://my.data.com/my-generated-resource?data
    # - http://my.data.com/my-resource.data
    available_formats = ['data', 'rdf', 'pdf', 'api', 'zip', 'xls', 'csv', 'txt', 'xml', 'json', 'html']
    default_format = 'html'

    for ids in metadata['identifier']:
        if ids.startswith('http://') or ids.startswith('https://'):
            # The end of the URL must be the format, otherwise it will use "html" by default
            infer_format = default_format

            for ext in available_formats:
                if ids.endswith(ext):
                    infer_format = ext

            pkg.add_resource(ids, name=pkg.title, format=infer_format)
    # All belong to the main group even if they do not belong to any set.
    if group is not None:
        group.add_package_by_name(pkg.name)
    model.repo.commit()
    return pkg.id
Code Example #11
 def read_data(self, id, resource_id):
     res = Resource.get(resource_id)
     pkg = Package.get(id)
     c.pkg_dict = pkg.as_dict()
     c.package = pkg
     c.resource = get_action('resource_show')({'model': model},
                                                  {'id': resource_id})
     label = res.url.split(config.get('ckan.site_url') + '/storage/f/')[-1]
     label = urllib2.unquote(label)
     ofs = get_ofs()
     try:
         furl = ofs.get_url(BUCKET, label).split('file://')[-1]
     except FileNotFoundException:
         h.flash_error(_('Cannot do data mining on remote resource!'))
         url = h.url_for(controller='package', action='resource_read',
                         id=id, resource_id=resource_id)
         return redirect(url)
     wordstats = {}
     ret = {}
     if res.format in ('TXT', 'txt'):
         wdsf, wdspath = tempfile.mkstemp()
         os.write(wdsf, "%s\nmetadata description title information" % furl)
         with os.fdopen(wdsf, 'r') as wordfile:
             preproc = orngText.Preprocess()
             table = orngText.loadFromListWithCategories(wdspath)
             data = orngText.bagOfWords(table, preprocessor=preproc)
             words = orngText.extractWordNGram(data, threshold=10.0, measure='MI')
         for i in range(len(words)):
             d = words[i]
             wordstats = d.get_metas(str)
         for k, v in wordstats.items():
             if v.value > 10.0:
                 ret[unicode(k, 'utf8')] = v.value
         from operator import itemgetter
         c.data_tags = sorted(ret.iteritems(), key=itemgetter(1), reverse=True)[:30]
         os.remove(wdspath)
         for i in range(len(data)):
             d = words[i]
             wordstats = d.get_metas(str)
         words = []
         for k, v in wordstats.items():
             words.append(k)
         model.repo.new_revision()
         if 'autoextracted_description' not in pkg.extras:
             pkg.extras['autoextracted_description'] = ' '.join(words)
         pkg.save()
         return render('datamining/read.html')
     elif res.format in ('odt', 'doc', 'xls', 'ods', 'odp', 'ppt', 'html'):
         textfd, textpath = convert_to_text(res, furl)
         if not textpath:
             h.flash_error(_('This file could not be mined for any data!'))
             os.close(textfd)
             return render('datamining/read.html')
         else:
             wdsf, wdspath = tempfile.mkstemp()
             os.write(wdsf, "%s\nmetadata description title information" % textpath)
             preproc = orngText.Preprocess()
             table = orngText.loadFromListWithCategories(wdspath)
             data = orngText.bagOfWords(table, preprocessor=preproc)
             words = orngText.extractWordNGram(data, threshold=10.0, measure='MI')
             for i in range(len(words)):
                 d = words[i]
                 wordstats = d.get_metas(str)
             for k, v in wordstats.items():
                 if v.value > 10.0:
                     ret[unicode(k, 'utf8')] = v.value
             from operator import itemgetter
             c.data_tags = sorted(ret.iteritems(), key=itemgetter(1), reverse=True)[:30]
             os.close(textfd)
             os.close(wdsf)
             os.remove(wdspath)
             os.remove(textpath)
             for i in range(len(data)):
                 d = words[i]
                 wordstats = d.get_metas(str)
             words = []
             for k, v in wordstats.items():
                 log.debug(k)
                 words.append(substitute_ascii_equivalents(k))
             model.repo.new_revision()
             if 'autoextracted_description' not in pkg.extras:
                 pkg.extras['autoextracted_description'] = ' '.join(words)
             pkg.save()
             return render('datamining/read.html')
     else:
         h.flash_error(_('This metadata document is not in proper format for data mining!'))
         url = h.url_for(controller='package', action='resource_read',
                         id=id, resource_id=resource_id)
         return redirect(url)
Code Example #12
File: harvester.py Project: ugeuder-kata/ckanext-ddi
 def import_stage(self, harvest_object):
     """Import the metadata received in the fetch stage to a dataset and
     create groups if ones are defined. Fill in metadata from study and
     document description.
     """
     try:
         xml_dict = {}
         xml_dict["source"] = harvest_object.content
         udict = json.loads(harvest_object.content)
         if "url" in udict:
             f = urllib2.urlopen(udict["url"]).read()
             ddi_xml = BeautifulSoup(f, "xml")
         else:
             self._save_object_error("No url in content!", harvest_object)
             return False
     except urllib2.URLError:
         self._save_object_error("Could not fetch from url %s!" % udict["url"], harvest_object)
         return False
     except etree.XMLSyntaxError:
         self._save_object_error("Unable to parse XML!", harvest_object)
         return False
     model.repo.new_revision()
     study_descr = ddi_xml.codeBook.stdyDscr
     document_info = ddi_xml.codeBook.docDscr.citation
     title = study_descr.citation.titlStmt.titl.string
     if not title:
         title = document_info.titlStmt.titl.string
     name = study_descr.citation.titlStmt.IDNo.string
     update = True
     pkg = Package.get(name)
     if not pkg:
         pkg = Package(name=name)
         update = False
     producer = study_descr.citation.prodStmt.producer
     if not producer:
         producer = study_descr.citation.rspStmt.AuthEnty
     if not producer:
         producer = study_descr.citation.rspStmt.othId
     pkg.author = producer.string
     pkg.maintainer = producer.string
     if study_descr.citation.distStmt.contact:
         pkg.maintainer = study_descr.citation.distStmt.contact.string
     if document_info.titlStmt.IDNo:
         pkg.id = document_info.titlStmt.IDNo.string
     keywords = study_descr.stdyInfo.subject(re.compile("keyword|topcClas"))
     keywords = list(set(keywords))
     for kw in keywords:
         if kw:
             vocab = None
             kw_str = ""
             if kw.string:
                 kw_str = kw.string
             if "vocab" in kw.attrs:
                 vocab = kw.attrs.get("vocab", None)
             if vocab and kw.string:
                 kw_str = vocab + " " + kw.string
             pkg.add_tag_by_name(munge_tag(kw_str))
     if study_descr.stdyInfo.abstract:
         description_array = study_descr.stdyInfo.abstract("p")
     else:
         description_array = study_descr.citation.serStmt.serInfo("p")
     pkg.notes = "<br />".join([description.string for description in description_array])
     pkg.title = title[:100]
     pkg.url = udict["url"]
     if not update:
         ofs = get_ofs()
         nowstr = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
         idno = study_descr.citation.titlStmt.IDNo
         agencyxml = (idno["agency"] if "agency" in idno.attrs else "") + idno.string
         label = "%s/%s.xml" % (nowstr, agencyxml)
         ofs.put_stream(BUCKET, label, f, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Original metadata record", format="xml", size=len(f))
         pkg.add_resource(
             url=document_info.holdings["URI"] if "URI" in document_info.holdings else "", description=title
         )
     metas = {}
     descendants = [desc for desc in document_info.descendants] + [sdesc for sdesc in study_descr.descendants]
     for docextra in descendants:
         if isinstance(docextra, Tag):
             if docextra:
                 if docextra.name == "p":
                     docextra.name = docextra.parent.name
                 if docextra.name not in metas and docextra.string:
                     metas[docextra.name] = docextra.string if docextra.string else self._collect_attribs(docextra)
                 else:
                     if docextra.string:
                         metas[docextra.name] += (
                             " " + docextra.string if docextra.string else self._collect_attribs(docextra)
                         )
     if ddi_xml.codeBook.dataDscr and not update:
         vars = ddi_xml.codeBook.dataDscr("var")
         heads = self._get_headers()
         c_heads = ["ID", "catValu", "labl", "catStat"]
         f_var = StringIO.StringIO()
         c_var = StringIO.StringIO()
         varwriter = csv.DictWriter(f_var, heads)
         codewriter = csv.DictWriter(c_var, c_heads)
         heading_row = {}
         for head in heads:
             heading_row[head] = head
         c_heading_row = {}
         for head in c_heads:
             c_heading_row[head] = head
         varwriter.writerow(heading_row)
         codewriter.writerow(c_heading_row)
         for var in vars:
             try:
                 varwriter.writerow(self._construct_csv(var, heads))
                 codewriter.writerows(self._create_code_rows(var))
             except ValueError as e:
                 raise IOError("Failed to import DDI to CSV! %s" % e)
         f_var.flush()
         label = "%s/%s_var.csv" % (nowstr, name)
         ofs.put_stream(BUCKET, label, f_var, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Variable metadata", format="csv", size=f_var.len)
         label = "%s/%s_code.csv" % (nowstr, name)
         ofs.put_stream(BUCKET, label, c_var, {})
         fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label)
         pkg.add_resource(url=fileurl, description="Variable code values", format="csv", size=c_var.len)
         f_var.seek(0)
         reader = csv.DictReader(f_var)
         for var in reader:
             metas[var["ID"]] = var["labl"] if "labl" in var else var["qstnLit"]
Code Example #13
File: extractor.py Project: xbian/ckanext-kata
def extract_text(resource_url, format):
    """
    Attempts to extract plain text contents from the CKAN resource with the
    given URL. Only local resources are supported at the moment.

    Non-plain text files are first converted to a plain text representation
    if possible.

    :param resource_url: URL to the resource
    :type resource_url: str
    :param format: the file format of the resource (practically file name extension)
    :type format: str
    :rtype: unicode
    :raises IOError: if the resource is remote or cannot be read
    """
    ofs = storage.get_ofs()

    label = resource_url.split(STORAGE_BASE_URL)[-1]
    label = urllib2.unquote(label)

    format = format.lower()

    log.debug("Resource label: %s" % label)

    original_path = None
    converted_path = None

    try:
        # Get file location
        original_path = ofs.get_url(BUCKET, label).split('file://')[-1]
    except storage_exceptions.FileNotFoundException:
        raise IOError(
            "Unable to extract text from {u} -- is the resource remote?".
            format(u=resource_url))

    mime_type = magic.Magic(mime=True).from_file(original_path)

    if mime_type == 'text/plain':
        tmp_file = False
        converted_path = original_path
    else:
        log.debug("Attempting to extract plain text from {p}".format(
            p=original_path))
        converted_fd, converted_path = convert_file_to_text(
            original_path, format)
        if converted_path is not None:
            tmp_file = True
        else:
            log.info(
                "Extracting plain text from {p} failed; unsupported format?".
                format(p=original_path))
            tmp_file = False

    if converted_path is not None:
        log.debug("Reading from %s", converted_path)
        try:
            with codecs.open(converted_path, mode='r',
                             encoding='utf-8') as text_file:
                text = text_file.read()
        except UnicodeDecodeError:
            log.debug(
                "Failed to open file using UTF-8 encoding. Trying to guess encoding."
            )
            try:
                encoding = magic.Magic(
                    mime_encoding=True).from_file(converted_path)
                with codecs.open(converted_path, mode='r',
                                 encoding=encoding) as text_file:
                    text = text_file.read()
            except:
                text = u""
                #raise ValidationError({'resources': [[_("Failed to detect file encoding")]]})
        log.debug("Resource plain text contents:")
        log.debug(text)
    else:
        text = u""

    if tmp_file:
        os.remove(converted_path)

    return text
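
Finally, a hedged usage sketch for the extractor: per the docstring, remote or unreadable resources raise IOError, so callers should be ready to fall back (the URL and format below are hypothetical).

# Hedged usage sketch; URL and format are hypothetical.
try:
    text = extract_text('http://localhost:5000/storage/f/2013-01-01T00%3A00%3A00/report.doc', 'doc')
except IOError:
    text = u''  # remote or unreadable resource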