Example #1
def get_results_dir(result, request, childSpecies=False):
    swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)

    # Use swift only for remote datasets; blob and multi-species datasets are stored locally.
    # Any other dataset type (including the child species of a multi-species dataset) goes to swift if possible.
    do_swift = IRemoteDataset.providedBy(result) or \
               ((childSpecies or (not IMultiSpeciesDataset.providedBy(result))) and \
                not IBlobDataset.providedBy(result) and \
                swiftsettings.storage_url)

    if do_swift:
        if swiftsettings.storage_url:
            results_dir = 'swift+{storage_url}/{container}/{path}/'.format(
                storage_url=swiftsettings.storage_url,
                container=swiftsettings.result_container,
                path=IUUID(result))
        else:
            raise Exception("Remote dataset requires swift url to be set")
    else:
        # if swift is not set up, fall back to local storage
        results_dir = 'scp://{uid}@{ip}:{port}{path}/'.format(
            uid=pwd.getpwuid(os.getuid()).pw_name,
            # FIXME: hostname from request is not good enough...
            #        need to get ip or host from plone_worker that does actual
            #        import
            #        store in registry?
            #        (is ok for testing)
            # ip=get_public_ip(),
            ip=get_hostname(request),
            port=os.environ.get('SSH_PORT', 22),
            path=tempfile.mkdtemp(prefix='result_import_'))

    return results_dir
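For reference, the two branches produce result URLs in different schemes. A minimal sketch of the expected shapes (all values below are hypothetical, for illustration only):

    # swift branch: results addressed inside the configured container
    results_dir = 'swift+{storage_url}/{container}/{path}/'.format(
        storage_url='https://swift.example.org/v1/AUTH_abc',  # hypothetical
        container='results_container',                        # hypothetical
        path='a-dataset-uuid')                                # hypothetical
    # -> 'swift+https://swift.example.org/v1/AUTH_abc/results_container/a-dataset-uuid/'

    # local branch: a fresh temp dir on this host, addressed via scp
    # -> 'scp://plone@worker.example.org:22/tmp/result_import_XXXXXX/'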
Example #2
def get_results_dir(result, request):
    swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)

    # Use swift only for remote datasets; blob and multi-species datasets are stored locally.
    # Any other dataset type goes to swift if possible.
    do_swift = IRemoteDataset.providedBy(result) or \
               (not IMultiSpeciesDataset.providedBy(result) and \
                not IBlobDataset.providedBy(result) and \
                swiftsettings.storage_url)

    if do_swift:
        if swiftsettings.storage_url:
            results_dir = 'swift+{storage_url}/{container}/{path}/'.format(
                storage_url=swiftsettings.storage_url,
                container=swiftsettings.result_container,
                path=IUUID(result)
            )
        else:
            raise Exception("Remote dataset requires swift url to be set")
    else:
        # if swift is not set up, fall back to local storage
        results_dir = 'scp://{uid}@{ip}:{port}{path}/'.format(
            uid=pwd.getpwuid(os.getuid()).pw_name,
            # FIXME: hostname from request is not good enough...
            #        need to get ip or host from plone_worker that does actual
            #        import
            #        store in registry?
            #        (is ok for testing)
            # ip=get_public_ip(),
            ip=get_hostname(request),
            port=os.environ.get('SSH_PORT', 22),
            path=tempfile.mkdtemp(prefix='result_import_')
        )

    return results_dir
Example #3
def build_ala_import_task(lsid, dataset, request):
    # creates task chain to import ala dataset
    """
    lsid .. species id
    context ... a dictionary with keys:
      - context: path to context object
      - userid: zope userid
    """
    # we need site-path, context-path and lsid for this job
    dataset_path = '/'.join(dataset.getPhysicalPath())
    member = api.user.get_current()
    context = {
        'context': dataset_path,
        'dataSource': dataset.dataSource,
        'user': {
            'id': member.getUserName(),
            'email': member.getProperty('email'),
            'fullname': member.getProperty('fullname')
        }
    }

    results_dir = get_results_dir(dataset, request)
    import_multispecies_params = {}
    if IMultiSpeciesDataset.providedBy(dataset) and dataset.dataSource in ('ala', 'gbif', 'obis'):
        container = aq_parent(aq_inner(dataset))
        import_multispecies_params = {
            'results_dir': get_results_dir(dataset, request, childSpecies=True),
            'import_context': {
                'context': '/'.join(container.getPhysicalPath()),
                'user': {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            }
        }

    if dataset.dataSource == 'gbif':
        return datamover.pull_occurrences_from_gbif.si(lsid,
                                                       results_dir, context,
                                                       import_multispecies_params)
    elif dataset.dataSource == 'aekos':
        return datamover.pull_occurrences_from_aekos.si(lsid,
                                                        results_dir, context)
    elif dataset.dataSource == 'obis':
        return datamover.pull_occurrences_from_obis.si(lsid,
                                                        results_dir, context,
                                                        import_multispecies_params)
    else:
        params = [{
            'query': 'lsid:{}'.format(lsid),
            'url': 'http://biocache.ala.org.au/ws'
        }]
        return datamover.pull_occurrences_from_ala.si(params,
                                                      results_dir, context,
                                                      import_multispecies_params)
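build_ala_import_task only builds a Celery signature (.si creates an immutable signature); nothing runs until the caller queues it. A minimal sketch of that hand-off, assuming dataset and request are at hand and mirroring the after-commit queuing used in the add() methods further down this page:

    task = build_ala_import_task(lsid, dataset, request)
    # queue once the Plone transaction commits, as the other examples here do
    after_commit_task(task)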
Example #4
def build_ala_import_task(lsid, dataset, request):
    # creates task chain to import ala dataset
    """
    lsid .. species id
    context ... a dictionary with keys:
      - context: path to context object
      - userid: zope userid
    """
    # we need site-path, context-path and lsid for this job
    dataset_path = '/'.join(dataset.getPhysicalPath())
    member = api.user.get_current()
    context = {
        'context': dataset_path,
        'dataSource': dataset.dataSource,
        'user': {
            'id': member.getUserName(),
            'email': member.getProperty('email'),
            'fullname': member.getProperty('fullname')
        }
    }

    results_dir = get_results_dir(dataset, request)
    import_multispecies_params = {}
    if IMultiSpeciesDataset.providedBy(dataset) and dataset.dataSource in (
            'ala', 'gbif', 'obis'):
        container = aq_parent(aq_inner(dataset))
        import_multispecies_params = {
            'results_dir': get_results_dir(dataset, request,
                                           childSpecies=True),
            'import_context': {
                'context': '/'.join(container.getPhysicalPath()),
                'user': {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            }
        }

    if dataset.dataSource == 'gbif':
        return datamover.pull_occurrences_from_gbif.si(
            lsid, results_dir, context, import_multispecies_params)
    elif dataset.dataSource == 'aekos':
        return datamover.pull_occurrences_from_aekos.si(
            lsid, results_dir, context)
    elif dataset.dataSource == 'obis':
        return datamover.pull_occurrences_from_obis.si(
            lsid, results_dir, context, import_multispecies_params)
    else:
        params = [{
            'query': 'lsid:{}'.format(lsid),
            'url': 'https://biocache-ws.ala.org.au/ws'
        }]
        return datamover.pull_occurrences_from_ala.si(
            params, results_dir, context, import_multispecies_params)
Example #5
def build_ala_import_qid_task(params, dataset, request):
    # creates task chain to import ala dataset
    """
    params .. [{name, qid, url}, ...]
    context ... a dictionary with keys:
      - context: path to context object
      - userid: zope userid
    """
    # we need site-path, context-path and lsid for this job
    dataset_path = '/'.join(dataset.getPhysicalPath())
    member = api.user.get_current()
    context = {
        'context': dataset_path,
        'dataSource': dataset.dataSource,
        'user': {
            'id': member.getUserName(),
            'email': member.getProperty('email'),
            'fullname': member.getProperty('fullname')
        }
    }

    import_multispecies_params = {}
    if IMultiSpeciesDataset.providedBy(dataset):
        container = aq_parent(aq_inner(dataset))
        import_multispecies_params = {
            'results_dir': get_results_dir(dataset, request,
                                           childSpecies=True),
            'import_context': {
                'context': '/'.join(container.getPhysicalPath()),
                'user': {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            }
        }

    results_dir = get_results_dir(dataset, request)
    task = datamover.pull_occurrences_from_ala.si(params, results_dir, context,
                                                  import_multispecies_params)
    return task
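Per the docstring, params is a list of {name, qid, url} mappings describing saved biocache queries. A hedged example of what a caller might pass (all values hypothetical, including the qid format):

    params = [{
        'name': 'My saved ALA search',               # hypothetical
        'qid': '1234567890',                         # hypothetical query id
        'url': 'https://biocache-ws.ala.org.au/ws'
    }]
    task = build_ala_import_qid_task(params, dataset, request)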
Example #6
def build_ala_import_qid_task(params, dataset, request):
    # creates task chain to import ala dataset
    """
    params .. [{name, qid, url}, ...]
    context ... a dictionary with keys:
      - context: path to context object
      - userid: zope userid
    """
    # we need site-path, context-path and lsid for this job
    dataset_path = '/'.join(dataset.getPhysicalPath())
    member = api.user.get_current()
    context = {
        'context': dataset_path,
        'dataSource': dataset.dataSource,
        'user': {
            'id': member.getUserName(),
            'email': member.getProperty('email'),
            'fullname': member.getProperty('fullname')
        }
    }

    import_multispecies_params = {}
    if IMultiSpeciesDataset.providedBy(dataset):
        container = aq_parent(aq_inner(dataset))
        import_multispecies_params = {
            'results_dir': get_results_dir(dataset, request, childSpecies=True),
            'import_context': {
                'context': '/'.join(container.getPhysicalPath()),
                'user': {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            }
        }

    results_dir = get_results_dir(dataset, request)
    task = datamover.pull_occurrences_from_ala.si(params,
                                                  results_dir, context,
                                                  import_multispecies_params)
    return task
Example #7
    def import_ala_data(self):
        if self.request.get('REQUEST_METHOD', 'GET').upper() != 'POST':
            self.record_error('Request must be POST', 400)
            raise BadRequest('Request must be POST')

        context = None
        # get import context
        if ISiteRoot.providedBy(self.context):
            # we have been called at site root... let's traverse to default
            # import location
            context = self.context.restrictedTraverse("/".join(
                (defaults.DATASETS_FOLDER_ID,
                 defaults.DATASETS_SPECIES_FOLDER_ID, 'ala')))
        else:
            # custom context... let's use it
            context = self.context
        # do user check first
        member = ploneapi.user.get_current()
        if member.getId():
            user = {
                'id': member.getUserName(),
                'email': member.getProperty('email'),
                'fullname': member.getProperty('fullname')
            }
        else:
            # We need at least a valid user
            raise Unauthorized("Invalid user")
        # check permission
        if not checkPermission('org.bccvl.AddDataset', context):
            raise Unauthorized("User not allowed in this context")

        params = self.request.form.get('data')

        if params is None:
            self.record_error('Bad Request', 400, 'Missing parameter data',
                              {'parameter': 'data'})
            raise BadRequest('Missing parameter data')
        if not params:
            self.record_error('Bad Request', 400, 'Empty parameter data',
                              {'parameter': 'data'})
            raise BadRequest('Empty parameter data')
        # TODO: should validate objects inside as well? (or use json schema
        # validation?)

        # all good so far
        # pull dataset from ALA
        # TODO: get better name here
        title = params[0].get('name', 'ALA import')
        # determine dataset type
        # 1. test if it is a multi species import
        species = set()
        for query in params:
            biocache_url = '{}/occurrences/search'.format(query['url'])
            query = {
                'q': query['query'],
                'pageSize': 0,
                'limit': 2,
                'facets': 'species_guid',
                'fq': 'species_guid:*'  # skip results without species guid
            }
            res = requests.get(biocache_url, params=query)
            res = res.json()
            # FIXME: do we need to treat sandbox downloads differently?
            if res.get('facetResults'):  # do we have some results at all?
                for guid in res['facetResults'][0]['fieldResult']:
                    species.add(guid['label'])
        if len(species) > 1:
            portal_type = 'org.bccvl.content.multispeciesdataset'

        else:
            portal_type = 'org.bccvl.content.dataset'
            swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
            if swiftsettings.storage_url:
                portal_type = 'org.bccvl.content.remotedataset'
        # create content
        ds = createContent(portal_type, title=title)
        ds.dataSource = 'ala'
        ds.description = u' '.join([title, u'imported from ALA'])
        ds.import_params = params
        ds = addContentToContainer(context, ds)
        md = IBCCVLMetadata(ds)
        if IMultiSpeciesDataset.providedBy(ds):
            md['genre'] = 'DataGenreSpeciesCollection'
            md['categories'] = ['multispecies']
        else:
            # species dataset
            md['genre'] = 'DataGenreSpeciesOccurrence'
            md['categories'] = ['occurrence']
        # TODO: populate this correctly as well
        md['species'] = [{'scientificName': 'qid', 'taxonID': 'qid'}]
        # FIXME: IStatusMessage should not be in API call
        from Products.statusmessages.interfaces import IStatusMessage
        IStatusMessage(self.request).add('New Dataset created', type='info')
        # start import job
        jt = IExperimentJobTracker(ds)
        status, message = jt.start_job()
        # reindex object to make sure everything is up to date
        ds.reindexObject()
        # FIXME: IStatusMessage should not be in API call
        IStatusMessage(self.request).add(message, type=status)

        # FIXME: API should not return a redirect
        #        201: new resource created ... location may point to resource
        from Products.CMFCore.utils import getToolByName
        portal = getToolByName(self.context, 'portal_url').getPortalObject()
        nexturl = portal[defaults.DATASETS_FOLDER_ID].absolute_url()
        self.request.response.setStatus(201)
        self.request.response.setHeader('Location', nexturl)
        # FIXME: should return a nice json representation of success or error
        return {
            'status': status,
            'message': message,
            'jobid': IJobTracker(ds).get_job().id
        }
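import_ala_data expects a POST whose data field is a list of query descriptors; the facet loop above reads query and url from each entry, and the title comes from the first entry's name. A hypothetical payload (the lsid is made up):

    data = [{
        'name': 'Koala occurrences',                                  # hypothetical
        'query': 'lsid:urn:lsid:biodiversity.org.au:afd.taxon:xyz',   # hypothetical lsid
        'url': 'https://biocache-ws.ala.org.au/ws'
    }]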
Example #8
    def add(self, object):
        # FIXME: this is a workaround, which is fine for small uploaded files.
        #        large uploads should go through another process anyway
        # TODO: re implementing this method is the only way to know
        #       the full path of the object. We need the path to apply
        #       the transmogrifier chain.
        # fti = getUtility(IDexterityFTI, name=self.portal_type)
        container = aq_inner(self.context)
        try:
            # traverse to subfolder if possible
            container = container.restrictedTraverse('/'.join(self.subpath))
        except Exception as e:
            LOG.warn('Could not traverse to %s/%s',
                     '/'.join(container.getPhysicalPath()),
                     '/'.join(self.subpath))
        new_object = addContentToContainer(container, object)
        # set data genre:
        if self.datagenre:
            IBCCVLMetadata(new_object)['genre'] = self.datagenre
        if self.categories:
            IBCCVLMetadata(new_object)['categories'] = self.categories

        new_object.subject = []
        if self.domain:
            new_object.subject = [self.domain]
        if self.timeperiod:
            new_object.subject += self.timeperiod

        # rdf commit should happen in transmogrifier step later on
        # if fti.immediate_view:
        #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
        # else:
        #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
        # start background import process (just a metadata update)

        # run transmogrify md extraction here
        context_path = '/'.join(new_object.getPhysicalPath())
        member = api.user.get_current()
        # species extract task
        if IMultiSpeciesDataset.providedBy(new_object):
            # kick off csv split import tasks
            import_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
                kwargs={
                    'url':
                    '{}/@@download/file/{}'.format(new_object.absolute_url(),
                                                   new_object.file.filename),
                    'results_dir':
                    get_results_dir(new_object,
                                    self.request,
                                    childSpecies=True),
                    'import_context': {
                        'context': '/'.join(container.getPhysicalPath()),
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    },
                    'context': {
                        'context': context_path,
                        'genre': self.datagenre,
                        'dataSource': new_object.dataSource,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            after_commit_task(import_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: import_multi_species_csv',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Multi species import pending')
        else:
            if hasattr(self, '_upload'):
                file = self._upload['file']
                new_object.format = file.contentType
                uid = IUUID(new_object)
                swiftsettings = getUtility(IRegistry).forInterface(
                    ISwiftSettings)
                import os.path
                swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                    storage_url=swiftsettings.storage_url,
                    container=swiftsettings.result_container,
                    path=uid,
                    name=os.path.basename(file.filename))
                new_object.remoteUrl = swift_url
            else:
                file = new_object.file
                new_object.format = file.contentType

            dlinfo = IDownloadInfo(new_object)

            # single species upload
            update_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.update_metadata",
                kwargs={
                    'url': dlinfo['url'],
                    'filename': dlinfo['filename'],
                    'contenttype': dlinfo['contenttype'],
                    'context': {
                        'context': context_path,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            # create upload task in case we upload to external store
            if hasattr(self, '_upload'):
                # FIXME: we can't use ssh here.... we don't know which container we are in... and
                #        sshing here is bad as well....
                # There is an upload ... we have to make sure the uploaded data ends up in external storage
                # 3. put temp file aside
                tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
                tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
                blobf = file.open()
                try:
                    # try rename
                    os.rename(blobf.name, tmpfile)
                except OSError:
                    # try copy
                    shutil.copy(blobf.name, tmpfile)

                # TODO: we push the uploaded file directly to swift here..
                #       this really should be a background process
                #       best solution: ...
                #           user uploads to some temporary upload service (file never ends up here)
                #           we have a remote url here, and tell the datamover to pull it from there
                #           and move it to final destination. (or something like this)
                #       other good way: ...
                #           let user upload directly to swift (what about large file uploads?)
                #           and take care of clean up if necessary

                # 4. move file to swift
                # TODO: do we have enough information to upload to swift?
                #       need a temp url?
                swiftopts = app.conf.get('bccvl', {}).get('swift', {})
                src_url = build_source('file://{}'.format(tmpfile))
                dest_url = build_destination(
                    'swift+{}'.format(new_object.remoteUrl),
                    settings={
                        'swift': {
                            'os_auth_url':
                            swiftopts.get('os_auth_url'),
                            'os_username':
                            swiftopts.get('os_username'),
                            'os_password':
                            swiftopts.get('os_password'),
                            'os_project_name':
                            swiftopts.get('os_project_name'),
                            'os_storage_url':
                            swiftopts.get('os_storage_url'),
                            'os_user_domain_name':
                            swiftopts.get('os_user_domain_name'),
                            'os_project_domain_name':
                            swiftopts.get('os_project_domain_name'),
                            'auth_version':
                            swiftopts.get('auth_version')
                        }
                    })

                try:
                    movelib.move(src_url, dest_url)
                except Exception as e:
                    # do error handling here
                    raise
                finally:
                    # clean up temp location
                    path = os.path.dirname(tmpfile)
                    shutil.rmtree(path)

            # queue job submission
            after_commit_task(update_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: update_metadata',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Metadata update pending')

        # We have to reindex after updating the object
        new_object.reindexObject()
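The upload branch above pushes the staged file to swift synchronously via movelib. Stripped of the surrounding view code, the move looks roughly like this (URLs and credentials are hypothetical; the settings keys mirror the ones read from app.conf above):

    src_url = build_source('file:///tmp/bccvl_upload/occurrences.csv')  # hypothetical path
    dest_url = build_destination(
        'swift+https://swift.example.org/v1/AUTH_abc/results/uuid/occurrences.csv',  # hypothetical
        settings={'swift': {
            'os_auth_url': 'https://keystone.example.org/v3',  # hypothetical
            'os_username': 'bccvl',                            # hypothetical
            'os_password': 'secret',                           # hypothetical
            'os_project_name': 'bccvl',                        # hypothetical
            'auth_version': '3'                                # hypothetical
        }})
    movelib.move(src_url, dest_url)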
Example #9
    def add(self, object):
        # FIXME: this is a workaround, which is fine for small uploaded files.
        #        large uploads should go through another process anyway
        # TODO: re implementing this method is the only way to know
        #       the full path of the object. We need the path to apply
        #       the transmogrifier chain.
        # fti = getUtility(IDexterityFTI, name=self.portal_type)
        container = aq_inner(self.context)
        try:
            # traverse to subfolder if possible
            container = container.restrictedTraverse('/'.join(self.subpath))
        except Exception as e:
            LOG.warn('Could not traverse to %s/%s',
                     '/'.join(container.getPhysicalPath()), '/'.join(self.subpath))
        new_object = addContentToContainer(container, object)
        # set data genre:
        if self.datagenre:
            IBCCVLMetadata(new_object)['genre'] = self.datagenre
        if self.categories:
            IBCCVLMetadata(new_object)['categories'] = self.categories
        # rdf commit should happen in transmogrifier step later on
        # if fti.immediate_view:
        #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
        # else:
        #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
        # start background import process (just a metadata update)

        # run transmogrify md extraction here
        context_path = '/'.join(new_object.getPhysicalPath())
        member = api.user.get_current()
        # species extract task
        if IMultiSpeciesDataset.providedBy(new_object):
            # kick off csv split import tasks
            import_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
                kwargs={
                    'url': '{}/@@download/file/{}'.format(new_object.absolute_url(), new_object.file.filename),
                    'results_dir': get_results_dir(container, self.request),
                    'import_context': {
                        'context': '/'.join(container.getPhysicalPath()),
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    },
                    'context': {
                        'context': context_path,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            after_commit_task(import_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            job = jt.new_job('TODO: generate id',
                             'generate taskname: import_multi_species_csv')
            job.type = new_object.portal_type
            jt.set_progress('PENDING', u'Multi species import pending')
        else:
            if hasattr(self, '_upload'):
                file = self._upload['file']
                new_object.format = file.contentType
                uid = IUUID(new_object)
                swiftsettings = getUtility(
                    IRegistry).forInterface(ISwiftSettings)
                import os.path
                swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                    storage_url=swiftsettings.storage_url,
                    container=swiftsettings.result_container,
                    path=uid,
                    name=os.path.basename(file.filename))
                new_object.remoteUrl = swift_url
            else:
                file = new_object.file
                new_object.format = file.contentType

            dlinfo = IDownloadInfo(new_object)

            # single species upload
            update_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.update_metadata",
                kwargs={
                    'url': dlinfo['url'],
                    'filename': dlinfo['filename'],
                    'contenttype': dlinfo['contenttype'],
                    'context': {
                        'context': context_path,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            # create upload task in case we upload to external store
            if hasattr(self, '_upload'):
                # There is an upload ... we have to make sure the uploaded data ends up in external storage
                # 3. put temp file aside
                tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
                tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
                blobf = file.open()
                try:
                    # try rename
                    os.rename(blobf.name, tmpfile)
                except OSError:
                    # try copy
                    shutil.copy(blobf.name, tmpfile)
                # 4. update task chain
                src_url = 'scp://{uid}@{ip}:{port}{file}'.format(
                    uid=pwd.getpwuid(os.getuid()).pw_name,
                    ip=get_hostname(self.request),
                    port=os.environ.get('SSH_PORT', 22),
                    file=tmpfile)
                dest_url = 'swift+{}'.format(new_object.remoteUrl)
                move_task = app.signature(
                    'org.bccvl.tasks.datamover.tasks.move',
                    kwargs={
                        'move_args': [(src_url, dest_url)],
                        'context': {
                            'context': context_path,
                            'user': {
                                'id': member.getUserName(),
                                'email': member.getProperty('email'),
                                'fullname': member.getProperty('fullname')
                            }
                        }
                    },
                    immutable=True)
                cleanup_task = app.signature(
                    'org.bccvl.tasks.plone.import_cleanup',
                    kwargs={
                        'path': os.path.dirname(tmpfile),
                        'context': {
                            'context': context_path,
                            'user': {
                                'id': member.getUserName(),
                                'email': member.getProperty('email'),
                                'fullname': member.getProperty('fullname')
                            }
                        }
                    },
                    immutable=True)

                update_task = move_task | update_task | cleanup_task
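                # the | operator chains the three immutable signatures:
                # move -> update_metadata -> cleanup run in that order, no
                # results are passed along, and a failure in an earlier task
                # aborts the rest of the chain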

                # need some more workflow states here to support e.g. zip file upload (multiple rasters),
                #      give user a chance to better define metadata
                # make sure update_metadata does not change user edited metadata
                #      -> layer, unit, projection, whatever

                # FIXME: clean up tmp upload directory as well

                # queue job submission
            after_commit_task(update_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            job = jt.new_job('TODO: generate id',
                             'generate taskname: update_metadata')
            job.type = new_object.portal_type
            jt.set_progress('PENDING', u'Metadata update pending')

        # We have to reindex after updating the object
        new_object.reindexObject()
Example #10
    def import_ala_data(self):
        if self.request.get("REQUEST_METHOD", "GET").upper() != "POST":
            self.record_error("Request must be POST", 400)
            raise BadRequest("Request must be POST")

        context = None
        # get import context
        if ISiteRoot.providedBy(self.context):
            # we have been called at site root... let's traverse to default
            # import location
            context = self.context.restrictedTraverse(
                "/".join((defaults.DATASETS_FOLDER_ID, defaults.DATASETS_SPECIES_FOLDER_ID, "ala"))
            )
        else:
            # custom context... let's use it
            context = self.context
        # do user check first
        member = ploneapi.user.get_current()
        if member.getId():
            user = {
                "id": member.getUserName(),
                "email": member.getProperty("email"),
                "fullname": member.getProperty("fullname"),
            }
        else:
            # We need at least a valid user
            raise Unauthorized("Invalid user")
        # check permission
        if not checkPermission("org.bccvl.AddDataset", context):
            raise Unauthorized("User not allowed in this context")

        params = self.request.form.get("data")

        if params is None:
            self.record_error("Bad Request", 400, "Missing parameter data", {"parameter": "data"})
            raise BadRequest("Missing parameter data")
        if not params:
            self.record_error("Bad Request", 400, "Empty parameter data", {"parameter": "data"})
            raise BadRequest("Empty parameter data")
        # TODO: should validate objects inside as well? (or use json schema
        # validation?)

        # all good so far
        # pull dataset from ALA
        # TODO: get better name here
        title = params[0].get("name", "ALA import")
        # determine dataset type
        # 1. test if it is a multi species import
        species = set()
        for query in params:
            biocache_url = "{}/occurrences/search".format(query["url"])
            query = {
                "q": query["query"],
                "pageSize": 0,
                "limit": 2,
                "facets": "species_guid",
                "fq": "species_guid:*",  # skip results without species guid
            }
            res = requests.get(biocache_url, params=query)
            res = res.json()
            # FIXME: do we need to treat sandbox downloads differently?
            if res["facetResults"]:  # do we have some results at all?
                for guid in res["facetResults"][0]["fieldResult"]:
                    species.add(guid["label"])
        if len(species) > 1:
            portal_type = "org.bccvl.content.multispeciesdataset"

        else:
            portal_type = "org.bccvl.content.dataset"
            swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
            if swiftsettings.storage_url:
                portal_type = "org.bccvl.content.remotedataset"
        # create content
        ds = createContentInContainer(context, portal_type, title=title)
        ds.dataSource = "ala"
        ds.description = u" ".join([title, u"imported from ALA"])
        ds.import_params = params
        md = IBCCVLMetadata(ds)
        if IMultiSpeciesDataset.providedBy(ds):
            md["genre"] = "DataGenreSpeciesCollection"
        else:
            # species dataset
            md["genre"] = "DataGenreSpeciesOccurrence"
        md["categories"] = ["occurrence"]
        # TODO: populate this correctly as well
        md["species"] = [{"scientificName": "qid", "taxonID": "qid"}]
        # FIXME: IStatusMessage should not be in API call
        from Products.statusmessages.interfaces import IStatusMessage

        IStatusMessage(self.request).add("New Dataset created", type="info")
        # start import job
        jt = IExperimentJobTracker(ds)
        status, message = jt.start_job()
        # reindex object to make sure everything is up to date
        ds.reindexObject()
        # FIXME: IStatusMessage should not be in API call
        IStatusMessage(self.request).add(message, type=status)

        # FIXME: API should not return a redirect
        #        201: new resource created ... location may point to resource
        from Products.CMFCore.utils import getToolByName

        portal = getToolByName(self.context, "portal_url").getPortalObject()
        nexturl = portal[defaults.DATASETS_FOLDER_ID].absolute_url()
        self.request.response.setStatus(201)
        self.request.response.setHeader("Location", nexturl)
        # FIXME: should return a nice json representation of success or error
        return {"status": status, "message": message, "jobid": IJobTracker(ds).get_job().id}
Example #11
    def pullOccurrenceFromALA(self, lsid, taxon, dataSrc='ala', common=None):
        # TODO: check permissions?
        # 1. create new dataset with taxon, lsid and common name set
        portal = getToolByName(self.context, 'portal_url').getPortalObject()

        if dataSrc == 'ala':
            dscontainer = portal[defaults.DATASETS_FOLDER_ID][
                defaults.DATASETS_SPECIES_FOLDER_ID]['ala']
        elif dataSrc == 'gbif':
            dscontainer = portal[defaults.DATASETS_FOLDER_ID][
                defaults.DATASETS_SPECIES_FOLDER_ID]['gbif']
        elif dataSrc == 'aekos':
            dscontainer = portal[defaults.DATASETS_FOLDER_ID][
                defaults.DATASETS_SPECIES_FOLDER_ID]['aekos']
        elif dataSrc == 'obis':
            dscontainer = portal[defaults.DATASETS_FOLDER_ID][
                defaults.DATASETS_SPECIES_FOLDER_ID]['obis']
        else:
            raise BadRequest('Invalid data source {0}'.format(dataSrc))

        title = [taxon]
        if common:
            title.append(u"({})".format(common))

        # determine dataset type
        # 1. test if it is a multi species import
        species = set()
        if dataSrc == 'ala':
            params = [{
                'query': 'lsid:{}'.format(lsid),
                'url': 'https://biocache-ws.ala.org.au/ws'
            }]
            for query in params:
                biocache_url = '{}/occurrences/search'.format(query['url'])
                query = {
                    'q': query['query'],
                    'pageSize': 0,
                    'limit': 2,
                    'facets': 'species_guid',
                    'fq': 'species_guid:*'  # skip results without species guid
                }
                res = requests.get(biocache_url, params=query)
                res = res.json()
                if res.get('facetResults'):  # do we have some results at all?
                    for guid in res['facetResults'][0]['fieldResult']:
                        species.add(guid['label'])
        elif dataSrc == 'gbif':
            genusChildren_url = 'https://api.gbif.org/v1/species/{}/children?offset=0&limit=40'.format(
                lsid)
            res = requests.get(genusChildren_url)
            res = res.json()
            if res.get('results'):
                for sp in res.get('results'):
                    if sp.get('speciesKey'):
                        species.add(sp['speciesKey'])
        elif dataSrc == 'obis':
            genusChildren_url = 'https://backend.iobis.org/children/{}'.format(
                lsid)
            res = requests.get(genusChildren_url)
            res = res.json()
            for sp in res:
                if sp.get('rank_name', '') != 'Species':
                    continue
                if sp.get('valid_id'):
                    species.add(sp['valid_id'])

        if len(species) > 1:
            portal_type = 'org.bccvl.content.multispeciesdataset'
        else:
            swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
            if swiftsettings.storage_url:
                portal_type = 'org.bccvl.content.remotedataset'
            else:
                portal_type = 'org.bccvl.content.dataset'

        # TODO: make sure we get a better content id than dataset-x
        title = u' '.join(title)
        ds = createContent(portal_type, title=title)
        ds.dataSource = dataSrc  # ala, gbif, aekos or obis
        # TODO: add number of occurrences to description
        ds.description = u' '.join(
            (title, u'imported from', unicode(dataSrc.upper())))
        ds = addContentToContainer(dscontainer, ds)
        md = IBCCVLMetadata(ds)
        # TODO: provenance ... import url?
        # FIXME: verify input parameters before adding to graph
        if IMultiSpeciesDataset.providedBy(ds):
            md['genre'] = 'DataGenreSpeciesCollection'
            md['categories'] = ['multispecies']
        else:
            md['genre'] = 'DataGenreSpeciesOccurrence'
            md['categories'] = ['occurrence']
        md['species'] = {
            'scientificName': taxon,
            'taxonID': lsid,
        }
        if common:
            md['species']['vernacularName'] = common
        IStatusMessage(self.request).add('New Dataset created', type='info')

        # 2. create and push alaimport job for dataset
        # TODO: make this named adapter
        jt = IExperimentJobTracker(ds)
        status, message = jt.start_job()
        # reindex object to make sure everything is up to date
        ds.reindexObject()
        # Job submission state notifier
        IStatusMessage(self.request).add(message, type=status)

        return (status, message)
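The multi-species test above reduces to a single biocache facet query. As a standalone sketch (runnable on its own; the lsid is hypothetical):

    import requests

    res = requests.get(
        'https://biocache-ws.ala.org.au/ws/occurrences/search',
        params={
            'q': 'lsid:urn:lsid:biodiversity.org.au:afd.taxon:xyz',  # hypothetical lsid
            'pageSize': 0,
            'limit': 2,
            'facets': 'species_guid',
            'fq': 'species_guid:*',  # skip records without a species guid
        }).json()
    species = set()
    if res.get('facetResults'):
        species = set(g['label'] for g in res['facetResults'][0]['fieldResult'])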
Example #12
    def add(self, object):
        # FIXME: this is a workaround, which is fine for small uploaded files.
        #        large uploads should go through another process anyway
        # TODO: re implementing this method is the only way to know
        #       the full path of the object. We need the path to apply
        #       the transmogrifier chain.
        # fti = getUtility(IDexterityFTI, name=self.portal_type)
        container = aq_inner(self.context)
        try:
            # traverse to subfolder if possible
            container = container.restrictedTraverse('/'.join(self.subpath))
        except Exception as e:
            LOG.warn('Could not traverse to %s/%s',
                     '/'.join(container.getPhysicalPath()), '/'.join(self.subpath))
        new_object = addContentToContainer(container, object)
        # set data genre:
        if self.datagenre:
            IBCCVLMetadata(new_object)['genre'] = self.datagenre
        if self.categories:
            IBCCVLMetadata(new_object)['categories'] = self.categories
        
        new_object.subject = []
        if self.domain:
            new_object.subject = [self.domain]
        if self.timeperiod:
            new_object.subject += self.timeperiod

        # rdf commit should happen in transmogrifier step later on
        # if fti.immediate_view:
        #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
        # else:
        #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
        # start background import process (just a metadata update)

        # run transmogrify md extraction here
        context_path = '/'.join(new_object.getPhysicalPath())
        member = api.user.get_current()
        # species extract task
        if IMultiSpeciesDataset.providedBy(new_object):
            # kick off csv split import tasks
            import_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
                kwargs={
                    'url': '{}/@@download/file/{}'.format(new_object.absolute_url(), new_object.file.filename),
                    'results_dir': get_results_dir(new_object, self.request, childSpecies=True),
                    'import_context': {
                        'context': '/'.join(container.getPhysicalPath()),
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    },
                    'context': {
                        'context': context_path,
                        'genre': self.datagenre,
                        'dataSource': new_object.dataSource,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            after_commit_task(import_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: import_multi_species_csv',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Multi species import pending')
        else:
            if hasattr(self, '_upload'):
                file = self._upload['file']
                new_object.format = file.contentType
                uid = IUUID(new_object)
                swiftsettings = getUtility(
                    IRegistry).forInterface(ISwiftSettings)
                import os.path
                swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                    storage_url=swiftsettings.storage_url,
                    container=swiftsettings.result_container,
                    path=uid,
                    name=os.path.basename(file.filename))
                new_object.remoteUrl = swift_url
            else:
                file = new_object.file
                new_object.format = file.contentType

            dlinfo = IDownloadInfo(new_object)

            # single species upload
            update_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.update_metadata",
                kwargs={
                    'url': dlinfo['url'],
                    'filename': dlinfo['filename'],
                    'contenttype': dlinfo['contenttype'],
                    'context': {
                        'context': context_path,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            # create upload task in case we upload to external store
            if hasattr(self, '_upload'):
                # FIXME: we can't use ssh here.... we don't know which container we are in... and
                #        sshing here is bad as well....
                # There is an upload ... we have to make sure the uploaded data ends up in external storage
                # 3. put temp file aside
                tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
                tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
                blobf = file.open()
                try:
                    # try rename
                    os.rename(blobf.name, tmpfile)
                except OSError:
                    # try copy
                    shutil.copy(blobf.name, tmpfile)

                # TODO: we push the uploaded file directly to swift here..
                #       this really should be a background process
                #       best solution: ...
                #           user uploads to some temporary upload service (file never ends up here)
                #           we have a remote url here, and tell the datamover to pull it from there
                #           and move it to final destination. (or something like this)
                #       other good way: ...
                #           let user upload directly to swift (what about large file uploads?)
                #           and take care of clean up if necessary

                # 4. move file to swift
                # TODO: do we have enough information to upload to swift?
                #       need a temp url?
                swiftopts = app.conf.get('bccvl', {}).get('swift', {})
                src_url = build_source('file://{}'.format(tmpfile))
                dest_url = build_destination('swift+{}'.format(new_object.remoteUrl),
                    settings={'swift': {
                        'os_auth_url': swiftopts.get('os_auth_url'),
                        'os_username': swiftopts.get('os_username'),
                        'os_password': swiftopts.get('os_password'),
                        'os_tenant_name': swiftopts.get('os_tenant_name'),
                        'os_storage_url': swiftopts.get('os_storage_url')
                    }}
                )

                try:
                    movelib.move(src_url, dest_url)
                except Exception as e:
                    # do error handling here
                    raise
                finally:
                    # clean up temp location
                    path = os.path.dirname(tmpfile)
                    shutil.rmtree(path)

            # queue job submission
            after_commit_task(update_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: update_metadata',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Metadata update pending')

        # We have to reindex after updating the object
        new_object.reindexObject()
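Both add() variants stage the uploaded blob with the same rename-then-copy fallback: os.rename is cheap and atomic but only works within one filesystem, so shutil.copy covers the cross-device case. The pattern in isolation (paths hypothetical):

    import os
    import shutil

    src = '/var/blobstorage/tmpblob'             # hypothetical blob path
    dst = '/tmp/bccvl_upload/occurrences.csv'    # hypothetical staging path
    try:
        os.rename(src, dst)    # fast path: same filesystem only
    except OSError:
        shutil.copy(src, dst)  # fallback: copy across filesystems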
Example #13
    def pullOccurrenceFromALA(self, lsid, taxon, dataSrc='ala', common=None):
        # TODO: check permissions?
        # 1. create new dataset with taxon, lsid and common name set
        portal = getToolByName(self.context, 'portal_url').getPortalObject()

        if dataSrc == 'ala':
            dscontainer = portal[defaults.DATASETS_FOLDER_ID][
                defaults.DATASETS_SPECIES_FOLDER_ID]['ala']
        elif dataSrc == 'gbif':
            dscontainer = portal[defaults.DATASETS_FOLDER_ID][
                defaults.DATASETS_SPECIES_FOLDER_ID]['gbif']
        elif dataSrc == 'aekos':
            dscontainer = portal[defaults.DATASETS_FOLDER_ID][
                defaults.DATASETS_SPECIES_FOLDER_ID]['aekos']
        elif dataSrc == 'obis':
            dscontainer = portal[defaults.DATASETS_FOLDER_ID][
                defaults.DATASETS_SPECIES_FOLDER_ID]['obis']
        else:
            raise BadRequest('Invalid data source {0}'.format(dataSrc))

        title = [taxon]
        if common:
            title.append(u"({})".format(common))

        # determine dataset type
        # 1. test if it is a multi species import
        species = set()
        if dataSrc == 'ala':
            params = [{
                'query': 'lsid:{}'.format(lsid),
                'url': 'http://biocache.ala.org.au/ws'
            }]
            for query in params:
                biocache_url = '{}/occurrences/search'.format(query['url'])
                query = {
                    'q': query['query'],
                    'pageSize': 0,
                    'limit': 2,
                    'facets': 'species_guid',
                    'fq': 'species_guid:*'    # skip results without species guid
                }
                res = requests.get(biocache_url, params=query)
                res = res.json()
                if res.get('facetResults'):  # do we have some results at all?
                    for guid in res['facetResults'][0]['fieldResult']:
                        species.add(guid['label'])
        elif dataSrc == 'gbif':
            genusChildren_url = 'https://api.gbif.org/v1/species/{}/children?offset=0&limit=40'.format(lsid)
            res = requests.get(genusChildren_url)
            res = res.json()
            if res.get('results'):
                for sp in res.get('results'):
                    if sp.get('speciesKey'):
                        species.add(sp['speciesKey'])
        elif dataSrc == 'obis':
            genusChildren_url = 'https://backend.iobis.org/children/{}'.format(lsid)
            res = requests.get(genusChildren_url)
            res = res.json()
            for sp in res:
                if sp.get('rank_name', '') != 'Species':
                    continue
                if sp.get('valid_id'):
                    species.add(sp['valid_id'])

        if len(species) > 1:
            portal_type = 'org.bccvl.content.multispeciesdataset'
        else:
            swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
            if swiftsettings.storage_url:
                portal_type = 'org.bccvl.content.remotedataset'
            else:
                portal_type = 'org.bccvl.content.dataset'

        # TODO: make sure we get a better content id than dataset-x
        title = u' '.join(title)
        ds = createContent(portal_type, title=title)
        ds.dataSource = dataSrc  # ala, gbif, aekos or obis
        # TODO: add number of occurrences to description
        ds.description = u' '.join(
            (title, u'imported from', unicode(dataSrc.upper()))
        )
        ds = addContentToContainer(dscontainer, ds)
        md = IBCCVLMetadata(ds)
        # TODO: provenance ... import url?
        # FIXME: verify input parameters before adding to graph
        if IMultiSpeciesDataset.providedBy(ds):
            md['genre'] = 'DataGenreSpeciesCollection'
            md['categories'] = ['multispecies']
        else:
            md['genre'] = 'DataGenreSpeciesOccurrence'
            md['categories'] = ['occurrence']
        md['species'] = {
            'scientificName': taxon,
            'taxonID': lsid,
        }
        if common:
            md['species']['vernacularName'] = common
        IStatusMessage(self.request).add('New Dataset created',
                                         type='info')

        # 2. create and push alaimport job for dataset
        # TODO: make this named adapter
        jt = IExperimentJobTracker(ds)
        status, message = jt.start_job()
        # reindex object to make sure everything is up to date
        ds.reindexObject()
        # Job submission state notifier
        IStatusMessage(self.request).add(message, type=status)

        return (status, message)
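For GBIF the multi-species check walks the species children endpoint instead of running a facet query. A standalone sketch of just that lookup (the genus key is hypothetical):

    import requests

    res = requests.get(
        'https://api.gbif.org/v1/species/{}/children'.format('1234567'),  # hypothetical genus key
        params={'offset': 0, 'limit': 40}).json()
    species = set(sp['speciesKey']
                  for sp in res.get('results', []) if sp.get('speciesKey'))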
Example #14
    def import_ala_data(self):
        if self.request.get('REQUEST_METHOD', 'GET').upper() != 'POST':
            self.record_error('Request must be POST', 400)
            raise BadRequest('Request must be POST')

        context = None
        # get import context
        if ISiteRoot.providedBy(self.context):
            # we have been called at site root... let's traverse to default
            # import location
            context = self.context.restrictedTraverse(
                "/".join((defaults.DATASETS_FOLDER_ID,
                          defaults.DATASETS_SPECIES_FOLDER_ID,
                          'ala')))
        else:
            # custom context... let's use it
            context = self.context
        # do user check first
        member = ploneapi.user.get_current()
        if member.getId():
            user = {
                'id': member.getUserName(),
                'email': member.getProperty('email'),
                'fullname': member.getProperty('fullname')
            }
        else:
            # We need at least a valid user
            raise Unauthorized("Invalid user")
        # check permission
        if not checkPermission('org.bccvl.AddDataset', context):
            raise Unauthorized("User not allowed in this context")

        params = self.request.form.get('data')

        if params is None:
            self.record_error('Bad Request', 400,
                              'Missing parameter data',
                              {'parameter': 'data'})
            raise BadRequest('Missing parameter data')
        if not params:
            self.record_error('Bad Request', 400,
                              'Empty parameter data',
                              {'parameter': 'data'})
            raise BadRequest('Empty parameter data')
        # TODO: should validate objects inside as well? (or use json schema
        # validation?)

        # all good so far
        # pull dataset from ALA
        # TODO: get better name here
        title = params[0].get('name', 'ALA import')
        # determine dataset type
        # 1. test if it is a multi species import
        species = set()
        for query in params:
            biocache_url = '{}/occurrences/search'.format(query['url'])
            query = {
                'q': query['query'],
                'pageSize': 0,
                'limit': 2,
                'facets': 'species_guid',
                'fq': 'species_guid:*'    # skip results without species guid
            }
            res = requests.get(biocache_url, params=query)
            res = res.json()
            # FIXME: do we need to treat sandbox downloads differently?
            if res.get('facetResults'):  # do we have some results at all?
                for guid in res['facetResults'][0]['fieldResult']:
                    species.add(guid['label'])

        # Check if it is trait data
        isTrait = any([p.get('trait', 0) for p in params])
        if not isTrait and len(species) > 1:
            portal_type = 'org.bccvl.content.multispeciesdataset'
        else:
            portal_type = 'org.bccvl.content.dataset'
            swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
            if swiftsettings.storage_url:
                portal_type = 'org.bccvl.content.remotedataset'
        # create content
        ds = createContent(portal_type, title=title)
        ds.dataSource = 'ala'
        ds.description = u' '.join([title, u'imported from ALA'])
        ds.import_params = params
        ds = addContentToContainer(context, ds)
        md = IBCCVLMetadata(ds)
        if IMultiSpeciesDataset.providedBy(ds):
            md['genre'] = 'DataGenreSpeciesCollection'
            md['categories'] = ['multispecies']
        else:
            if isTrait:
                # Trait dataset
                md['genre'] = 'DataGenreTraits'
                md['categories'] = ['traits']
            else:
                # species dataset
                md['genre'] = 'DataGenreSpeciesOccurrence'
                md['categories'] = ['occurrence']
        # TODO: populate this correctly as well
        md['species'] = [{
            'scientificName': 'qid',
            'taxonID': 'qid'}]
        # FIXME: IStatusMessage should not be in API call
        from Products.statusmessages.interfaces import IStatusMessage
        IStatusMessage(self.request).add('New Dataset created',
                                         type='info')
        # start import job
        jt = IExperimentJobTracker(ds)
        status, message = jt.start_job()
        # reindex object to make sure everything is up to date
        ds.reindexObject()
        # FIXME: IStatusMessage should not be in API call
        IStatusMessage(self.request).add(message, type=status)

        # FIXME: API should not return a redirect
        #        201: new resource created ... location may point to resource
        from Products.CMFCore.utils import getToolByName
        portal = getToolByName(self.context, 'portal_url').getPortalObject()
        nexturl = portal[defaults.DATASETS_FOLDER_ID].absolute_url()
        self.request.response.setStatus(201)
        self.request.response.setHeader('Location', nexturl)
        # FIXME: should return a nice json representation of success or error
        return {
            'status': status,
            'message': message,
            'jobid': IJobTracker(ds).get_job().id
        }
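On success the view sets a 201 status and a Location header, and the JSON body has roughly this shape (values illustrative; status and message are whatever jt.start_job() returned):

    {
        'status': 'info',             # illustrative
        'message': 'Job submitted',   # illustrative
        'jobid': 'a1b2c3d4'           # hypothetical job id
    }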