    def test_worker_mixed_source_swift_destination(self):
        service = MoveService()
        service.configure(self.settings)

        file_1 = "scp://the_source/url/to/download_1.txt"
        file_2 = "http://www.someurl.com"
        file_3 = "ala://ala/?lsid=some_lsid"
        file_4 = "swift+http://nectar/my_container/local/dataset/myfile"
        file_5 = "file:///tmp/filename"
        source = [file_1, file_2, file_3, file_4, file_5]
        destination = "swift+http://nectar/my_container2/myfile"
        move_job = MoveJob(source, destination, "userid", False)

        with mock.patch("org.bccvl.movelib.move") as mock_move:
            service.worker(move_job)
            dest = build_destination(move_job.destination, service._config)
            calls = []

            for s in move_job.source:
                src = build_source(s, None, service._config)
                calls.append(mock.call(src, dest))
            mock_move.assert_has_calls(calls)

        self.assertEqual(move_job.status, MoveJob.STATUS_COMPLETE)
        self.assertEqual(move_job.reason, None)
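
All of these examples follow the same pattern: build_source and build_destination turn URLs (plus an optional user id and settings dict) into descriptor dicts, and movelib.move performs the transfer. A minimal, hedged sketch of that pattern, using purely illustrative local paths:

import tempfile

from org.bccvl import movelib
from org.bccvl.movelib.utils import build_source, build_destination

# copy a local file into a scratch directory (the paths are illustrative only)
tmpdir = tempfile.mkdtemp(prefix='movelib_demo')
src = build_source('file:///tmp/example_input.csv')
dst = build_destination('file://{0}/example_input.csv'.format(tmpdir))
movelib.move(src, dst)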
Example #2
def pull_occurrences_from_ala(params, dest_url, context, import_multspecies_params):
    # 1. set progress
    set_progress('RUNNING', 'Download occurrence dataset from ala', None, context)
    # 2. Download all the occurrence dataset in the params list
    results = []

    try:
        item, results = download_occurrence_from_ala(params, context)

        # This is the zip file path of the occurrence dataset
        ala_csv = item.get('file').get('url').split('file://')[1]

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'ala_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            # compatibility... this should go away after we fully support 'layered'
            # occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species' ?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(ala_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(ala_csv)), app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)

        # tell importer about new dataset (import it)
        set_progress("RUNNING", u"Import dataset '{0}' from ALA".format(item['title']), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job(
            "FAILED", u"Import of dataset '{0}' from ALA failed".format(item['title']), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job("COMPLETED", u"ALA import '{}' complete".format(item['title']), None, context)

        # Split multi-species dataset
        if import_multspecies_params:
            import_multispecies_job = import_multi_species_csv_job(item.get('file').get('url'),
                                                                   import_multspecies_params['results_dir'],
                                                                   import_multspecies_params['import_context'],
                                                                   context)
            import_multispecies_job.link_error(set_progress_job(
                "FAILED", u"Split multi-species dataset '{0}' from ALA failed".format(item['title']), None, context))
            import_multispecies_job.link_error(cleanup_job)
            (import_job | import_multispecies_job | cleanup_job | finish_job).delay()
        else:
            (import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED', 'Download occurrence dataset from ALA: {}'.format(e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', params, dest_url, e, exc_info=True)
    finally:
        for tmpdir in results:
            if tmpdir and os.path.exists(tmpdir):
                shutil.rmtree(tmpdir)
Example #3
def get_files(urllist, userid, conf):
    """
    Download all files fiven in urllist to local tempfile
    return temp folder location.
    """
    dest = tempfile.mkdtemp(prefix='bccvl_export')
    for url in urllist:
        src = build_source(url, userid, conf)
        dst = build_destination('file://{0}/{1}'.format(dest, os.path.basename(url)), conf)
        movelib.move(src, dst)
    return dest
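
A hedged usage sketch for get_files; the URLs, user id and settings dict below are placeholders, and the caller is responsible for removing the returned temp folder:

import shutil

conf = {}  # placeholder settings; other examples pass app.conf.get('bccvl', {})
urls = ['file:///tmp/a.csv', 'file:///tmp/b.csv']  # hypothetical source urls
tmp_folder = get_files(urls, 'some_user_id', conf)
try:
    pass  # work with the downloaded files in tmp_folder here
finally:
    shutil.rmtree(tmp_folder)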
Example #4
def download_input(move_args):
    src, dst = move_args['args']
    try:
        # set up the source and destination
        source = build_source(
            src, move_args['userid'], app.conf.get('bccvl', {}))
        destination = build_destination(dst)
        move(source, destination)
    except Exception as e:
        LOG.info('Download from %s to %s failed: %s', src, dst, e)
        raise
    LOG.info('Download from %s to %s succeeded.', src, dst)
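
download_input expects a dict carrying the (source, destination) pair under 'args' plus a 'userid'; a hedged sketch with placeholder values:

move_args = {
    'args': ('https://example.com/dataset.zip',    # hypothetical source url
             'file:///tmp/inputs/dataset.zip'),    # hypothetical destination url
    'userid': 'some_user_id',
}
download_input(move_args)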
Example #5
    def test_worker_scp_ok(self, dummy=None):
        service = MoveService()
        service.configure(self.settings)

        source = "ala://ala?lsid=someid"
        destination = "scp://localhost/usr/local/dataset/"
        move_job = MoveJob(source, destination, "userid", False)

        with mock.patch("org.bccvl.movelib.move") as mock_move:
            service.worker(move_job)
            dest = build_destination(move_job.destination)
            src = build_source(move_job.source)
            mock_move.assert_called_with(src, dest)
        self.assertEqual(move_job.status, MoveJob.STATUS_COMPLETE)
Example #6
def move(move_args, context):
    errmsgs = []
    for src, dest in move_args:
        try:
            source = build_source(src, context['user']['id'],
                                  app.conf.get('bccvl', {}))
            destination = build_destination(dest, app.conf.get('bccvl', {}))
            movelib.move(source, destination)
        except Exception as e:
            msg = 'Download from {0} to {1} failed: {2}'.format(src, dest, str(e))
            errmsgs.append(msg)
            LOG.warning(msg)
    if errmsgs:
        raise Exception('Move data failed', errmsgs)
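
move takes a list of (source, destination) url pairs and a context carrying the user id, and raises if any individual transfer failed; a hedged sketch with placeholder values:

context = {'user': {'id': 'some_user_id'}}
move_args = [
    # hypothetical (source, destination) pairs
    ('swift+https://nectar/my_container/input_1.csv', 'file:///tmp/work/input_1.csv'),
    ('swift+https://nectar/my_container/input_2.csv', 'file:///tmp/work/input_2.csv'),
]
move(move_args, context)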
Example #7
    def test_worker_scp_failed(self):
        service = MoveService()
        service.configure(self.settings)

        source = "ala://ala?lsid=someid"
        destination = "scp://localhost/usr/local/dataset/"
        move_job = MoveJob(source, destination, "userid", False)

        with mock.patch("org.bccvl.movelib.move", side_effect=Exception("Bad source")) as mock_move:
            service.worker(move_job)
            dest = build_destination(move_job.destination)
            src = build_source(move_job.source)
            mock_move.assert_called_with(src, dest)
        reason = "Move has failed for job with id {0}. Reason: Bad source".format(move_job.id)
        self.assertEqual(move_job.status, MoveJob.STATUS_FAILED)
        self.assertEqual(move_job.reason, reason)
Example #8
def upload_outputs(args):
    src, dest, item = args

    try:
        # set up the source and destination (src is a local file)
        source = build_source(src)
        # TODO: add content_type to destination? (move_lib supports it)
        destination = build_destination(dest, app.conf.get('bccvl', {}))

        # Upload the file and then generate metadata
        move(source, destination)
        LOG.info('Upload from %s to %s succeeded.', src, dest)
        item['file']['failed'] = False
    except Exception as e:
        LOG.info('Upload from %s to %s failed: %s', src, dest, e)
        item['file']['failed'] = True
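
upload_outputs unpacks a (source, destination, item) triple and records the outcome on item['file']['failed'] instead of raising; a hedged sketch with hypothetical values:

item = {'file': {}}
args = ('file:///tmp/outputs/result.csv',                 # hypothetical local result file
        'swift+https://nectar/my_container/result.csv',   # hypothetical destination url
        item)
upload_outputs(args)
if item['file']['failed']:
    pass  # handle the failed upload here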
Example #9
def update_metadata(url, filename, contenttype, context):
    tmpdir = None
    try:
        set_progress('RUNNING', 'Download {0}'.format(url), None, context)
        tmpdir = tempfile.mkdtemp()
        tmpfile = os.path.join(tmpdir, filename)
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpfile), settings)
        movelib.move(src, dst)
        item = {
            'filemetadata': extract_metadata(tmpfile, contenttype)
        }

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if contenttype == 'text/csv':
            if ('headers' not in item['filemetadata']
                    or 'lat' not in item['filemetadata']['headers']
                    or 'lon' not in item['filemetadata']['headers']):
                raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url),
                     None, context)

        import_job = import_file_metadata_job([item], url, context)
        import_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))
        finish_job = set_progress_job(
            "COMPLETED", 'Metadata update for {} complete'.format(url),
            None, context)
        (import_job | finish_job).delay()
    except Exception as e:
        set_progress('FAILED',
                     'Metadata update for {} failed: {}'.format(url, e),
                     None, context)
        LOG.error('Metadata update for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
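
update_metadata downloads the file, extracts its metadata and queues an import job against the given context; a hedged sketch with hypothetical arguments (the context path, url and user id are placeholders):

context = {
    'context': '/plone/datasets/my_dataset',   # hypothetical path of the dataset object
    'user': {'id': 'some_user_id'},
}
update_metadata('https://example.com/occurrences.csv',  # hypothetical download url
                'occurrences.csv', 'text/csv', context)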
Example #10
    def worker(self, move_job):
        """
        Thread worker used to perform a move of data between endpoints.
        @param move_job: The move job to execute
        @type move_job: MoveJob
        """
        try:
            # Need to handle a list of sources
            self._logger.info("Starting move for job with id %s", move_job.id)
            move_job.update(status=MoveJob.STATUS_IN_PROGRESS, start_timestamp=datetime.datetime.now())

            # source can be just 1 source or a list of sources
            if isinstance(move_job.source, str):
                sourcelist = [move_job.source]
            elif isinstance(move_job.source, list):
                sourcelist = move_job.source
            else:
                raise Exception("Invalid source {1}".format(move_job.source))

            # Validate the destination url
            dest_url = urlparse(move_job.destination)
            if dest_url.scheme in ("swift+http", "swift+https") and not self._has_credential():
                raise Exception("Credential for Nectar swift service is not configured.")

            # Download all the files from the sources to the destination
            destination = build_destination(move_job.destination, self._config)

            for s in sourcelist:
                source = build_source(s, move_job.userid, self._config)
                movelib.move(source, destination)
            move_job.update(status=MoveJob.STATUS_COMPLETE, end_timestamp=datetime.datetime.now())
        except Exception as e:
            # catch any Exception here so that we can properly update the job state
            reason = "Move has failed for job with id {0}. Reason: {1}".format(move_job.id, str(e))
            self._logger.warning(reason)
            move_job.update(status=MoveJob.STATUS_FAILED, end_timestamp=datetime.datetime.now(), reason=reason)
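
worker accepts either a single source url or a list of sources for one destination, as the tests above exercise; a hedged sketch, assuming the service is configured as in those tests:

settings = {}  # stand-in for the settings dict used in the tests above
service = MoveService()
service.configure(settings)
move_job = MoveJob(['file:///tmp/a.csv', 'file:///tmp/b.csv'],  # a single url or a list of urls
                   'scp://localhost/usr/local/dataset/',        # hypothetical destination
                   'some_user_id', False)
service.worker(move_job)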
Example #11
    def export_to_ala(self):
        uuid = self.request.form.get("uuid", None)
        try:
            if uuid:
                brain = uuidToCatalogBrain(uuid)
                if brain is None:
                    raise Exception("Brain not found")

                obj = brain.getObject()
            else:
                obj = self.context

            # get username
            member = ploneapi.user.get_current()
            if member.getId():
                user = {
                    "id": member.getUserName(),
                    "email": member.getProperty("email"),
                    "fullname": member.getProperty("fullname"),
                }
            else:
                raise Exception("Invalid user")

            # verify dataset
            if obj.portal_type not in (
                "org.bccvl.content.dataset",
                "org.bccvl.content.remotedataset",
                "org.bccvl.content.multispeciesdataset",
            ):
                raise Exception("Invalid UUID (content type)")
            md = IBCCVLMetadata(obj)
            if md.get("genre") not in ("DataGenreSpeciesOccurrence", "DataGenreTraits"):
                raise Exception("Invalid UUID (data type)")
            # get download url
            dlinfo = IDownloadInfo(obj)

            # download file
            from org.bccvl import movelib
            from org.bccvl.movelib.utils import build_source, build_destination
            import tempfile

            destdir = tempfile.mkdtemp(prefix="export_to_ala")
            try:
                from org.bccvl.tasks.celery import app

                settings = app.conf.get("bccvl", {})
                dest = os.path.join(destdir, os.path.basename(dlinfo["url"]))
                movelib.move(
                    build_source(dlinfo["url"], user["id"], settings), build_destination("file://{}".format(dest))
                )

                csvfile = None

                if dlinfo["contenttype"] == "application/zip":
                    # look at 'layers' to find file within zip
                    arc = md["layers"].keys()[0]

                    import zipfile

                    zf = zipfile.ZipFile(dest, "r")
                    csvfile = zf.open(arc, "r")
                else:
                    csvfile = open(dest, "rb")

                import requests

                # "Accept:application/json" "Origin:http://example.com"
                res = requests.post(
                    settings["ala"]["sandboxurl"],
                    files={"file": csvfile},
                    headers={"apikey": settings["ala"]["apikey"], "Accept": "application/json"},
                )
                if res.status_code != 200:
                    self.record_error(res.reason, res.status_code)
                    raise Exception("Upload failed")
                retval = res.json()
                # TODO: do error checking
                #  keys: sandboxUrl, fileName, message, error: Bool, fileId
                return retval
            finally:
                import shutil

                shutil.rmtree(destdir)

        except Exception as e:
            self.record_error(str(e), 500)
            raise
Example #12
    def add(self, object):
        # FIXME: this is a workaround, which is fine for small uploaded files.
        #        large uploads should go through another process anyway
        # TODO: re implementing this method is the only way to know
        #       the full path of the object. We need the path to apply
        #       the transmogrifier chain.
        # fti = getUtility(IDexterityFTI, name=self.portal_type)
        container = aq_inner(self.context)
        try:
            # traverse to subfolder if possible
            container = container.restrictedTraverse('/'.join(self.subpath))
        except Exception as e:
            LOG.warn('Could not traverse to %s/%s',
                     '/'.join(container.getPhysicalPath()), '/'.join(self.subpath))
        new_object = addContentToContainer(container, object)
        # set data genre:
        if self.datagenre:
            IBCCVLMetadata(new_object)['genre'] = self.datagenre
        if self.categories:
            IBCCVLMetadata(new_object)['categories'] = self.categories
        
        new_object.subject = []
        if self.domain:
            new_object.subject = [self.domain]
        if self.timeperiod:
            new_object.subject += self.timeperiod

            # rdf commit should happen in transmogrifier step later on
        # if fti.immediate_view:
        #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
        # else:
        #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
        # start background import process (just a metadata update)

        # run transmogrify md extraction here
        context_path = '/'.join(new_object.getPhysicalPath())
        member = api.user.get_current()
        # species extract task
        if IMultiSpeciesDataset.providedBy(new_object):
            # kick off csv split import tasks
            import_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
                kwargs={
                    'url': '{}/@@download/file/{}'.format(new_object.absolute_url(), new_object.file.filename),
                    'results_dir': get_results_dir(new_object, self.request, childSpecies=True),
                    'import_context': {
                        'context': '/'.join(container.getPhysicalPath()),
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    },
                    'context': {
                        'context': context_path,
                        'genre': self.datagenre,
                        'dataSource': new_object.dataSource,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            after_commit_task(import_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: import_multi_species_csv',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Multi species import pending')
        else:
            if hasattr(self, '_upload'):
                file = self._upload['file']
                new_object.format = file.contentType
                uid = IUUID(new_object)
                swiftsettings = getUtility(
                    IRegistry).forInterface(ISwiftSettings)
                import os.path
                swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                    storage_url=swiftsettings.storage_url,
                    container=swiftsettings.result_container,
                    path=uid,
                    name=os.path.basename(file.filename))
                new_object.remoteUrl = swift_url
            else:
                file = new_object.file
                new_object.format = file.contentType

            dlinfo = IDownloadInfo(new_object)

            # single species upload
            update_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.update_metadata",
                kwargs={
                    'url': dlinfo['url'],
                    'filename': dlinfo['filename'],
                    'contenttype': dlinfo['contenttype'],
                    'context': {
                        'context': context_path,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            # create upload task in case we upload to external store
            if hasattr(self, '_upload'):
                # FIXME: we can't use ssh here.... we don't know which container we are in... and
                #        sshing here is bad as well....
                # There is an upload ... we have to make sure the uploaded data ends up in external storage
                # 3. put temp file aside
                tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
                tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
                blobf = file.open()
                try:
                    # try rename
                    os.rename(blobf.name, tmpfile)
                except OSError:
                    # try copy
                    shutil.copy(blobf.name, tmpfile)

                # TODO: we push the uploaded file directly to swift here..
                #       this really should be a background process
                #       best solution: ...
                #           user uploads to some temporary upload service (file never ends up here)
                #           we have a remote url here, and tell the datamover to pull it from there
                #           and move it to final destination. (or something like this)
                #       other good way: ...
                #           let user upload directly to swift (what about large file uploads?)
                #           and take care of clean up if necessary

                # 4. move file to swift
                # TODO: do we have enough information to upload to swift?
                #       need a temp url?
                swiftopts = app.conf.get('bccvl', {}).get('swift', {})
                src_url = build_source('file://{}'.format(tmpfile))
                dest_url = build_destination('swift+{}'.format(new_object.remoteUrl),
                    settings={'swift': {
                        'os_auth_url': swiftopts.get('os_auth_url'),
                        'os_username': swiftopts.get('os_username'),
                        'os_password': swiftopts.get('os_password'),
                        'os_tenant_name': swiftopts.get('os_tenant_name'),
                        'os_storage_url': swiftopts.get('os_storage_url')
                    }}
                )

                try:
                    movelib.move(src_url, dest_url)
                except Exception as e:
                    # do error handling here
                    raise
                finally:
                    # clean up temp location
                    path = os.path.dirname(tmpfile)
                    shutil.rmtree(path)

            # queue job submission
            after_commit_task(update_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: update_metadata',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Metadata update pending')

        # We have to reindex after updating the object
        new_object.reindexObject()
Example #13
def pull_occurrences_from_gbif(lsid, dest_url, context):
    # 1. set progress
    set_progress('RUNNING', 'Download {0} from gbif'.format(lsid), None, context)
    # 2. do move
    src = None
    dst = None
    tmpdir = None
    try:
        tmpdir = tempfile.mkdtemp(prefix='gbif_download_')
        src = build_source('gbif://gbif?lsid={}'.format(lsid))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)
        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata {0} from gbif'.format(lsid), None, context)
        # open gbif_dataset.json
        gbif_ds = json.load(open(os.path.join(tmpdir, 'gbif_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in gbif_ds['files']))
        # read gbif metadata from attribution file
        gbif_md = json.load(open(files['attribution']['url'], 'r'))
        gbif_csv = files['occurrence']['url']

        # build bccvl metadata:
        bccvlmd = {
            'genre': 'DataGenreSpeciesOccurrence',
            'categories': ['occurrence'],
            'species': {
                'scientificName': gbif_md.get('scientificName', None),
                'vernacularName': gbif_md.get('vernacularName', None),
                'taxonID': gbif_md.get('key', None),
                'rank': gbif_md.get('rank', None),
                'genus': gbif_md.get('genus', None),
                'genusGuid': gbif_md.get('genusKey', None),
                'family': gbif_md.get('family', None),
                'familyGuid': gbif_md.get('familyKey', None),
                'order': gbif_md.get('order', None),
                'orderGuid': gbif_md.get('orderKey', None),
                'clazz': gbif_md.get('class', None),
                'clazzGuid': gbif_md.get('classKey', None),
                'phylum': gbif_md.get('phylum', None),
                'phylumGuid': gbif_md.get('phylumKey', None),
                'kingdom': gbif_md.get('kingdom', None),
                'kingdomGuid': gbif_md.get('kingdomKey', None)
            },
        }
        # build item to import
        item = {
            'title': gbif_ds['title'],
            'description': gbif_ds['description'],
            'file': {
                'url': 'file://{}'.format(gbif_csv),  # local file url
                'contenttype': 'application/zip',
                'filename': os.path.basename(gbif_csv)
            },
            'bccvlmetadata': bccvlmd,
            'filemetadata': extract_metadata(gbif_csv, 'application/zip'),
        }

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'gbif_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            # compatibility... this should go away after we fully support 'layered'
            # occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species' ?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(gbif_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(gbif_csv)), app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)
        # tell importer about new dataset (import it)
        set_progress('RUNNING', 'Import gbif data {0}'.format(lsid), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job("FAILED", "Import of gbif data failed {0}".format(lsid), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job("COMPLETED", 'GBIF import {} complete'.format(lsid), None, context)
        (import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED', 'Download {0} from gbif: {1}'.format(lsid, e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', src, dest_url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
Example #14
def import_multi_species_csv(url, results_dir, import_context, context):
    # url .... source file
    # results_dir ... folder to place split files into
    # context ... the context with user and orig dataset
    tmpdir = None
    try:
        set_progress('RUNNING', 'Split {0}'.format(url), None, context)
        # step 1: update main dataset metadata
        tmpdir = tempfile.mkdtemp()
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpdir), settings)
        movelib.move(src, dst)

        # Get the downloaded filename
        tmpfile = glob.glob(os.path.join(tmpdir, '*'))[0]

        # Extract occurrence file from downloaded file
        mimetype, enc = mimetypes.guess_type(tmpfile)
        if mimetype == 'application/zip':
            src_occ_data = os.path.join('data', 'ala_occurrence.csv')
            with zipfile.ZipFile(tmpfile, 'r') as zipf:
                occfile = os.path.join(tmpdir, src_occ_data)
                zipf.extract(src_occ_data, tmpdir)
            item = {
                'filemetadata': extract_metadata(tmpfile, 'application/zip')
            }
            occmd = item['filemetadata'].get(src_occ_data, {}).get('metadata', {})
        else:
            # csv file
            item = {
                'filemetadata': extract_metadata(tmpfile, "text/csv")
            }
            occfile = tmpfile
            occmd = item['filemetadata']

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if ('headers' not in occmd
                or 'lat' not in occmd['headers']
                or 'lon' not in occmd['headers']):
            raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url),
                     None, context)

        import_md_job = import_file_metadata_job([item], url, context)
        import_md_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))

        # step 2: split csv file and create sub datasets
        # start reading csv file and create new datasets which will be
        #       linked up with dataset collection item
        # FIXME: large csv files should be streamed to separate files (not read
        #        into ram like here)
        f = io.open(occfile, 'r', encoding='utf-8', errors='ignore')
        csvreader = UnicodeCSVReader(f)
        headers = csvreader.next()
        if 'species' not in headers:
            raise Exception('missing species column')
        speciesidx = headers.index('species')
        # create dict with all data .... species column used as key, and rest
        # is just added
        data = {}
        for row in csvreader:
            if not row:
                continue
            species = row[speciesidx]
            if species not in data:
                # create new entry for species
                fname = u'{0}.csv'.format(species).replace(
                    u'/', u'_').encode('idna')
                # TODO: make sure fname contains only legal filename characters
                fpath = os.path.join(tmpdir, fname)
                file = io.open(fpath, 'wb')
                fwriter = UnicodeCSVWriter(file)
                fwriter.writerow(headers)
                data[species] = {
                    'file': file,
                    'writer': fwriter,
                    'path': fpath,
                    'name': fname
                }
            data[species]['writer'].writerow(row)
        # ok we have got all data and everything in separate files
        # close all files
        for species in data:
            data[species]['file'].close()
            del data[species]['file']
            del data[species]['writer']
        # extract metadata
        for species in data:
            data[species]['filemetadata'] = extract_metadata(
                data[species]['path'],
                'text/csv'
            )
        # send files to destination
        for species in data:
            src = build_source('file://{}'.format(data[species]['path']))
            dst = build_destination(os.path.join(results_dir,
                                                 data[species]['name']),
                                    app.conf.get('bccvl', {}))
            data[species]['url'] = dst['url']
            movelib.move(src, dst)
        # all files uploaded .... send import jobs
        set_progress('RUNNING', 'Create datasets for {0}'.format(
            url), None, context)
        items = []
        for species in data:
            # build item
            item = {
                'title': u'{0} occurrences'.format(species),
                'description': '',
                'file': {
                    'url': data[species]['url'],
                    'filename': data[species]['name'],
                    'contenttype': 'text/csv',
                },
                'bccvlmetadata': {
                    'genre': 'DataGenreSpeciesOccurrence',
                    'categories': ['occurrence'],
                    'species': {
                        'scientificName': species,
                    }
                },
                'filemetadata': data[species]['filemetadata'],
                '_partof': {
                    # add back reference to orig dataset
                    # TODO: shouldn't use absolute path here
                    'path': context['context']
                }
            }
            items.append(item)
        # start import process
        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)
        # What is results_dir being used for?
        import_job = import_result_job(items, results_dir, import_context)
        cleanup_job = import_cleanup_job(results_dir, context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Multi species import failed', None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            'COMPLETED', 'Task succeeded', None, context)
        (start_import | import_md_job | import_job |
         cleanup_job | finish_job).delay()
        # FIXME: missing stuff...
        #        need to set multi species collection to finished at some stage
    except Exception as e:
        set_progress('FAILED',
                     'Error while splitting Multi Species CSV {}: {}'.format(
                         url, e),
                     None, context)
        LOG.error('Multi species split for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
Example #15
    def add(self, object):
        # FIXME: this is a workaround, which is fine for small uploaded files.
        #        large uploads should go through another process anyway
        # TODO: re implementing this method is the only way to know
        #       the full path of the object. We need the path to apply
        #       the transmogrifier chain.
        # fti = getUtility(IDexterityFTI, name=self.portal_type)
        container = aq_inner(self.context)
        try:
            # traverse to subfolder if possible
            container = container.restrictedTraverse('/'.join(self.subpath))
        except Exception as e:
            LOG.warn('Could not traverse to %s/%s',
                     '/'.join(container.getPhysicalPath()),
                     '/'.join(self.subpath))
        new_object = addContentToContainer(container, object)
        # set data genre:
        if self.datagenre:
            IBCCVLMetadata(new_object)['genre'] = self.datagenre
        if self.categories:
            IBCCVLMetadata(new_object)['categories'] = self.categories

        new_object.subject = []
        if self.domain:
            new_object.subject = [self.domain]
        if self.timeperiod:
            new_object.subject += self.timeperiod

            # rdf commit should happen in transmogrifier step later on
        # if fti.immediate_view:
        #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
        # else:
        #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
        # start background import process (just a metadata update)

        # run transmogrify md extraction here
        context_path = '/'.join(new_object.getPhysicalPath())
        member = api.user.get_current()
        # species extract task
        if IMultiSpeciesDataset.providedBy(new_object):
            # kick off csv split import tasks
            import_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
                kwargs={
                    'url':
                    '{}/@@download/file/{}'.format(new_object.absolute_url(),
                                                   new_object.file.filename),
                    'results_dir':
                    get_results_dir(new_object,
                                    self.request,
                                    childSpecies=True),
                    'import_context': {
                        'context': '/'.join(container.getPhysicalPath()),
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    },
                    'context': {
                        'context': context_path,
                        'genre': self.datagenre,
                        'dataSource': new_object.dataSource,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            after_commit_task(import_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: import_multi_species_csv',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Multi species import pending')
        else:
            if hasattr(self, '_upload'):
                file = self._upload['file']
                new_object.format = file.contentType
                uid = IUUID(new_object)
                swiftsettings = getUtility(IRegistry).forInterface(
                    ISwiftSettings)
                import os.path
                swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                    storage_url=swiftsettings.storage_url,
                    container=swiftsettings.result_container,
                    path=uid,
                    name=os.path.basename(file.filename))
                new_object.remoteUrl = swift_url
            else:
                file = new_object.file
                new_object.format = file.contentType

            dlinfo = IDownloadInfo(new_object)

            # single species upload
            update_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.update_metadata",
                kwargs={
                    'url': dlinfo['url'],
                    'filename': dlinfo['filename'],
                    'contenttype': dlinfo['contenttype'],
                    'context': {
                        'context': context_path,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            # create upload task in case we upload to external store
            if hasattr(self, '_upload'):
                # FIXME: we can't use ssh here.... we don't know which container we are in... and
                #        sshing here is bad as well....
                # There is an upload ... we have to make sure the uploaded data ends up in external storage
                # 3. put temp file aside
                tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
                tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
                blobf = file.open()
                try:
                    # try rename
                    os.rename(blobf.name, tmpfile)
                except OSError:
                    # try copy
                    shutil.copy(blobf.name, tmpfile)

                # TODO: we push the uploaded file directly to swift here..
                #       this really should be a background process
                #       best solution: ...
                #           user uploads to some temporary upload service (file never ends up here)
                #           we have a remote url here, and tell the datamover to pull it from there
                #           and move it to final destination. (or something like this)
                #       other good way: ...
                #           let user upload directly to swift (what about large file uploads?)
                #           and take care of clean up if necessary

                # 4. move file to swift
                # TODO: do we have enough information to upload to swift?
                #       need a temp url?
                swiftopts = app.conf.get('bccvl', {}).get('swift', {})
                src_url = build_source('file://{}'.format(tmpfile))
                dest_url = build_destination(
                    'swift+{}'.format(new_object.remoteUrl),
                    settings={
                        'swift': {
                            'os_auth_url':
                            swiftopts.get('os_auth_url'),
                            'os_username':
                            swiftopts.get('os_username'),
                            'os_password':
                            swiftopts.get('os_password'),
                            'os_project_name':
                            swiftopts.get('os_project_name'),
                            'os_storage_url':
                            swiftopts.get('os_storage_url'),
                            'os_user_domain_name':
                            swiftopts.get('os_user_domain_name'),
                            'os_project_domain_name':
                            swiftopts.get('os_project_domain_name'),
                            'auth_version':
                            swiftopts.get('auth_version')
                        }
                    })

                try:
                    movelib.move(src_url, dest_url)
                except Exception as e:
                    # do error handling here
                    raise
                finally:
                    # clean up temp location
                    path = os.path.dirname(tmpfile)
                    shutil.rmtree(path)

            # queue job submission
            after_commit_task(update_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: update_metadata',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Metadata update pending')

        # We have to reindex after updating the object
        new_object.reindexObject()
Example #16
def download_occurrence_from_ala(params, context):
    results = []
    species = []   # a list of species metadata
    ds_names = []

    for dataset in params:
        src = None
        dst = None
        occurrence_url = dataset['url'].rstrip('/') + "/occurrences/index/download"
        query = dataset['query']    # i.e. qid:<qid> or lsid:<lsid>
        qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
        email = context.get('user', {}).get('email', '')
        ds_names.append(dataset.get('name', ''))

        # download occurrence file
        # TODO: ignore file if not successfully downloaded (exception), but continue??
        tmpdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(tmpdir)

        src = build_source('ala://ala?url={}&query={}&filter={}&email={}'.format(occurrence_url, query, qfilter, email))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)

        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata for {0} from ala'.format(dataset['query']), None, context)
        # open ala_dateset.json
        ala_ds = json.load(open(os.path.join(tmpdir, 'ala_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in ala_ds['files']))

        # occurrence data file
        ala_csv = files['occurrence']['url']  # this is actually a zip file now

        # read ala metadata from attribution file.
        # May not have metadata for user uploaded dataset into sandbox
        if files.get('attribution'):
            ala_md_list = json.load(open(files['attribution']['url'], 'r'))
            for md in ala_md_list:
                species.append({
                    'scientificName': md.get('scientificName'),
                    'vernacularName': md.get('commonNameSingle') or md.get('scientificName'),
                    'taxonID': md.get('guid'),
                    'rank': md.get('rank'),
                    'genus': md.get('genus'),
                    'family': md.get('family'),
                    'order': md.get('order'),
                    'clazz': md.get('classs'),
                    'phylum': md.get('phylum'),
                    'kingdom': md.get('kingdom')
                })

    # Should not happen
    if len(results) == 0:
        raise Exception("No occurrence dataset is downloaded from ALA")

    # Combine all the occurrence and citation files from each download into 1 dataset
    imported_date = datetime.datetime.now().strftime('%d/%m/%Y')
    if len(results) > 1:
        destdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(destdir)
        os.mkdir(os.path.join(destdir, 'data'))
        combine_csv(results[:-1], 'data/ala_occurrence.csv', destdir)
        combine_csv(results[:-1], 'data/ala_citation.csv', destdir)

        # Zip it out and point to the new zip file
        ala_csv = os.path.join(destdir, 'ala_occurrence.zip')
        zip_occurrence_data(ala_csv,
                            os.path.join(destdir, 'data'),
                            ['ala_occurrence.csv', 'ala_citation.csv'])

        # Make a title & description for multispecies dataset
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
        else:
            ds_name = ','.join([sp['scientificName'] for sp in species])
            title = "{} occurrences".format(ds_name)
        description = "Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)

    else:
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
            description = "Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)
        else:
            title = ala_ds['title']
            description = ala_ds['description']
        species = species[0]

    # build bccvl metadata:
    bccvlmd = {
        'genre': 'DataGenreSpeciesOccurrence',
        'categories': ['occurrence'],
        'species': species
    }

    # build item to import
    item = {
        'title': title,
        'description': description,
        'file': {
            'url': 'file://{}'.format(ala_csv),  # local file url
            'contenttype': 'application/zip',
            'filename': os.path.basename(ala_csv)
        },
        'bccvlmetadata': bccvlmd,
        'filemetadata': extract_metadata(ala_csv, 'application/zip'),
    }
    return (item, results)
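
download_occurrence_from_ala expects a list of dataset descriptors, each with an ALA service url, a query ('qid:<qid>' or 'lsid:<lsid>') and an optional display name, plus a context carrying the user's email; a hedged sketch with hypothetical values:

context = {'user': {'id': 'some_user_id', 'email': 'user@example.com'}}
params = [{
    'url': 'https://biocache-ws.ala.org.au/ws',  # hypothetical ALA biocache service url
    'query': 'lsid:some_lsid',
    'name': 'My species occurrences',
}]
item, tmpdirs = download_occurrence_from_ala(params, context)
# the caller cleans up the temp folders listed in tmpdirs once item has been processed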
Example #17
    def export_to_ala(self):
        uuid = self.request.form.get('uuid', None)
        try:
            if uuid:
                brain = uuidToCatalogBrain(uuid)
                if brain is None:
                    raise Exception("Brain not found")

                obj = brain.getObject()
            else:
                obj = self.context

            # get username
            member = ploneapi.user.get_current()
            if member.getId():
                user = {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            else:
                raise Exception("Invalid user")

            # verify dataset
            if obj.portal_type not in (
                    'org.bccvl.content.dataset',
                    'org.bccvl.content.remotedataset',
                    'org.bccvl.content.multispeciesdataset'):
                raise Exception("Invalid UUID (content type)")
            md = IBCCVLMetadata(obj)
            if md.get('genre') not in ('DataGenreSpeciesOccurrence',
                                       'DataGenreSpeciesCollection',
                                       'DataGenreTraits'):
                raise Exception("Invalid UUID (data type)")
            # get download url
            dlinfo = IDownloadInfo(obj)

            # download file
            from org.bccvl import movelib
            from org.bccvl.movelib.utils import build_source, build_destination
            import tempfile
            destdir = tempfile.mkdtemp(prefix='export_to_ala')
            try:
                from org.bccvl.tasks.celery import app
                settings = app.conf.get('bccvl', {})
                dest = os.path.join(destdir, os.path.basename(dlinfo['url']))
                movelib.move(build_source(dlinfo['url'], user['id'], settings),
                             build_destination('file://{}'.format(dest)))

                csvfile = None

                if dlinfo['contenttype'] == 'application/zip':
                    # look at 'layers' to find file within zip
                    arc = md['layers'].keys()[0]

                    import zipfile
                    zf = zipfile.ZipFile(dest, 'r')
                    csvfile = zf.open(arc, 'r')
                else:
                    csvfile = open(dest, 'rb')

                import requests
                # "Accept:application/json" "Origin:http://example.com"
                res = requests.post(settings['ala']['sandboxurl'],
                                    files={'file': csvfile},
                                    headers={
                                        'apikey': settings['ala']['apikey'],
                                        'Accept': 'application/json'
                                    })
                if res.status_code != 200:
                    self.record_error(res.reason, res.status_code)
                    raise Exception('Upload failed')
                retval = res.json()
                # TODO: do error checking
                #  keys: sandboxUrl, fileName, message, error: Bool, fileId
                return retval
            finally:
                import shutil
                shutil.rmtree(destdir)

        except Exception as e:
            self.record_error(str(e), 500)
            raise
Example #18
    def export_to_ala(self):
        uuid = self.request.form.get('uuid', None)
        try:
            if uuid:
                brain = uuidToCatalogBrain(uuid)
                if brain is None:
                    raise Exception("Brain not found")

                obj = brain.getObject()
            else:
                obj = self.context

            # get username
            member = ploneapi.user.get_current()
            if member.getId():
                user = {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            else:
                raise Exception("Invalid user")

            # verify dataset
            if obj.portal_type not in ('org.bccvl.content.dataset',
                                       'org.bccvl.content.remotedataset',
                                       'org.bccvl.content.multispeciesdataset'):
                raise Exception("Invalid UUID (content type)")
            md = IBCCVLMetadata(obj)
            if md.get('genre') not in ('DataGenreSpeciesOccurrence',
                                       'DataGenreSpeciesCollection',
                                       'DataGenreTraits'):
                raise Exception("Invalid UUID (data type)")
            # get download url
            dlinfo = IDownloadInfo(obj)

            # download file
            from org.bccvl import movelib
            from org.bccvl.movelib.utils import build_source, build_destination
            import tempfile
            destdir = tempfile.mkdtemp(prefix='export_to_ala')
            try:
                from org.bccvl.tasks.celery import app
                settings = app.conf.get('bccvl', {})
                dest = os.path.join(destdir, os.path.basename(dlinfo['url']))
                movelib.move(build_source(dlinfo['url'], user['id'], settings),
                             build_destination('file://{}'.format(dest)))

                csvfile = None

                if dlinfo['contenttype'] == 'application/zip':
                    # look at 'layers' to find file within zip
                    arc = md['layers'].keys()[0]

                    import zipfile
                    zf = zipfile.ZipFile(dest, 'r')
                    csvfile = zf.open(arc, 'r')
                else:
                    csvfile = open(dest, 'rb')

                import requests
                # "Accept:application/json" "Origin:http://example.com"
                res = requests.post(settings['ala']['sandboxurl'],
                                    files={'file': csvfile},
                                    headers={
                                        'apikey': settings['ala']['apikey'],
                                        'Accept': 'application/json'
                })
                if res.status_code != 200:
                    self.record_error(res.reason, res.status_code)
                    raise Exception('Upload failed')
                retval = res.json()
                # TODO: do error checking
                #  keys: sandboxUrl, fileName, message, error: Bool, fileId
                return retval
            finally:
                import shutil
                shutil.rmtree(destdir)

        except Exception as e:
            self.record_error(str(e), 500)
            raise