Code Example #1
 def mock_run_script(self, *args, **kw):
     # simulate a script run
     wrapper, params, context = args
     # 1. write file into results_dir
     tmpdir = urlsplit(params['result']['results_dir']).path
     try:
         # 2. create some result files
         for fname in ('model.RData',
                       'proj_test.tif'):
             img = Image.new('F', (10, 10))
             img.save(os.path.join(tmpdir, fname), 'TIFF')
         # 3. store results
         items = [
             {
                 'file': {
                     'url': 'file://{}/model.RData'.format(tmpdir),
                     'contenttype': 'application/x-r-data',
                     'filename': 'model.RData',
                 },
                 'title': 'Model Title',
                 'description': 'Model Description',
                 'bccvlmetadata': {
                     'genre': 'DataGenreSDMModel',
                 },
                 'filemetadata': {},
                 'layermd': {}
             },
             {
                 'file': {
                     'url': 'file://{}/proj_test.tif'.format(tmpdir),
                     'contenttype': 'image/tiff',
                     'filename': 'proj_test.tif',
                 },
                 'title': 'Test Projection',
                 'description': 'Test Projection Description',
                 'bccvlmetadata': {
                     'genre': 'DataGenreCP',
                 },
                 'filemetadata': {
                     'band': [{
                         'min': 0.0,
                         'STATISTICS_MINIMUM': 0.0,
                         'max': 1.0
                     }]
                 },
                 'layermd': {'files': {'proj_test.tif': {'layer': 'projection_probability', 'data_type': 'Continuous'}}}
             }
         ]
         # TODO: tasks called directly here; maybe call them as tasks as
         # well? (chain?)
         import_result_job(items, params['result'][
                           'results_dir'], context).delay()
         import_cleanup(params['result']['results_dir'], context)
         set_progress('COMPLETED', 'Test Task succeeded', None, context)
     except Exception as e:
         # 4. clean up if problem otherwise import task cleans up
         #    TODO: should be done by errback or whatever
         import_cleanup(params['result']['results_dir'], context)
         set_progress('FAILED', 'Test Task failed', None, context)
         raise
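
A note on usage: mock_run_script (and the similar mocks in the examples below) has the same (wrapper, params, context) signature as the real compute task, so a test can substitute it with mock.patch. The sketch below is an illustration only: it assumes the task is resolved as org.bccvl.tasks.compute.run_script and that trigger_experiment is a hypothetical helper that starts the job; adjust both to the actual test setup.

import unittest

import mock  # standalone 'mock' package (unittest.mock on Python 3)


class MockedComputeTest(unittest.TestCase):
    # mock_run_script: the method shown in the example above

    def test_run_with_mock(self):
        # The dotted path is an assumption about where the code under test
        # imports run_script from; patch it at the real import site.
        with mock.patch('org.bccvl.tasks.compute.run_script',
                        side_effect=self.mock_run_script):
            self.trigger_experiment()  # hypothetical helper that kicks off the job
            # afterwards: assert on set_progress calls / imported result items
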
Code Example #2
 def mock_run_script(self, *args, **kw):
     # simulate a script run
     wrapper, params, context = args
     # 1. write file into results_dir
     tmpdir = urlsplit(params['result']['results_dir']).path
     try:
         for fname in ('model.RData',
                       'traits.txt'):
             open(os.path.join(tmpdir, fname), 'w').write('Mock Result')
         # 3. store results
         items = [
             {
                 'file': {
                     'url': 'file://{}/model.RData'.format(tmpdir),
                     'contenttype': 'application/x-r-data',
                     'filename': 'model.RData',
                 },
                 'title': 'Model Title',
                 'description': 'Model Description',
                 'bccvlmetadata': {
                     'genre': 'DataGenreSTModel',
                 },
                 'filemetadata': {},
                 'layermd': {}
             },
             {
                 'file': {
                     'url': 'file://{}/traits.txt'.format(tmpdir),
                     'contenttype': 'text/plain',
                     'filename': 'traits.txt',
                 },
                 'title': 'Test Traits',
                 'description': 'Test Traits Description',
                 'bccvlmetadata': {
                     'genre': 'DataGenreSTResult',
                 },
                 'filemetadata': {},
                 'layermd': {},
             }
         ]
         # TODO: tasks called directly here; maybe call them as tasks as
         # well? (chain?)
         import_result_job(items, params['result'][
                           'results_dir'], context).delay()
         import_cleanup(params['result']['results_dir'], context)
         set_progress('COMPLETED', 'Test Task succeeded', None, context)
     except Exception as e:
         # 4. clean up if problem otherwise import task cleans up
         #    TODO: should be done by errback or whatever
         import_cleanup(params['result']['results_dir'], context)
         set_progress('FAILED', 'Test Task failed', None, context)
         raise
Code Example #3
 def mock_run_script(self, *args, **kw):
     # simulate a script run
     wrapper, params, context = args
     # 1. write file into results_dir
     tmpdir = urlsplit(params['result']['results_dir']).path
     try:
         for fname in ('model.RData',
                       'traits.txt'):
             open(os.path.join(tmpdir, fname), 'w').write('Mock Result')
         # 3. store results
         items = [
             {
                 'file': {
                     'url': 'file://{}/model.RData'.format(tmpdir),
                     'contenttype': 'application/x-r-data',
                     'filename': 'model.RData',
                 },
                 'title': 'Model Title',
                 'description': 'Model Description',
                 'bccvlmetadata': {
                     'genre': 'DataGenreSTModel',
                 },
                 'filemetadata': {},
                 'layermd': {}
             },
             {
                 'file': {
                     'url': 'file://{}/traits.txt'.format(tmpdir),
                     'contenttype': 'text/plain',
                     'filename': 'traits.txt',
                 },
                 'title': 'Test Traits',
                 'description': 'Test Traits Description',
                 'bccvlmetadata': {
                     'genre': 'DataGenreSTResult',
                 },
                 'filemetadata': {},
                 'layermd': {},
             }
         ]
         # TODO: tasks called directly here; maybe call them as tasks as
         # well? (chain?)
         import_result_job(items, params['result'][
                           'results_dir'], context).delay()
         import_cleanup(params['result']['results_dir'], context)
         set_progress('COMPLETED', 'Test Task succeeded', None, context)
     except Exception as e:
         # 4. clean up if problem otherwise import task cleans up
         #    TODO: should be done by errback or whatever
         import_cleanup(params['result']['results_dir'], context)
         set_progress('FAILED', 'Test Task failed', None, context)
         raise
Code Example #4
 def mock_run_script(self, *args, **kw):
     # simulate a script run
     wrapper, params, context = args
     # 1. write file into results_dir
     tmpdir = urlsplit(params['result']['results_dir']).path
     try:
         # 2. create some result files
         for fname in ('ensemble.tif',):
             img = Image.new('F', (10, 10))
             img.save(os.path.join(tmpdir, fname), 'TIFF')
         # 3. store results
         items = [
             {
                 'file': {
                     'url': 'file://{}/ensemble.tif'.format(tmpdir),
                     'contenttype': 'image/tiff',
                     'filename': 'ensemble.tif',
                 },
                 'title': 'Ensemble Output',
                 'description': 'Ensemble Output Description',
                 'bccvlmetadata': {
                     'genre': 'DataGenreEnsembleResult',
                 },
                 'filemetadata': {
                     'band': [{
                         'min': 0.0,
                         'STATISTICS_MINIMUM': 0.0,
                         'max': 1.0
                     }]
                 },
                 'layermd': {},
             }
         ]
         # TODO: tasks called directly here; maybe call them as tasks as
         # well? (chain?)
         import_result_job(items, params['result'][
                           'results_dir'], context).delay()
         import_cleanup(params['result']['results_dir'], context)
         set_progress('COMPLETED', 'Test Task succeeded', None, context)
     except Exception as e:
         # 4. clean up if problem otherwise import task cleans up
         #    TODO: should be done by errback or whatever
         import_cleanup(params['result']['results_dir'], context)
         set_progress('FAILED', 'Test Task failed', None, context)
         raise
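
The 'band' entries under 'filemetadata' in these mocks mimic the per-band raster statistics GDAL reports (min/max plus STATISTICS_* metadata keys). A rough sketch of producing such values with the GDAL Python bindings is shown below, assuming those bindings are installed; it is an illustration only, not the project's metadata extractor.

from osgeo import gdal


def band_stats(tif_path):
    ds = gdal.Open(tif_path)
    band = ds.GetRasterBand(1)
    # ComputeStatistics(False) scans the raster and returns [min, max, mean, stddev]
    minimum, maximum, _mean, _stddev = band.ComputeStatistics(False)
    ds = None  # close the dataset
    return {'min': minimum, 'max': maximum, 'STATISTICS_MINIMUM': minimum}
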
Code Example #5
File: ala.py  Project: BCCVL/org.bccvl.tasks
def pull_occurrences_from_ala(params, dest_url, context, import_multspecies_params):
    # 1. set progress
    set_progress('RUNNING', 'Download occurrence dataset from ala', None, context)
    # 2. Download all the occurrence dataset in the params list
    results = []

    try:
        item, results = download_occurrence_from_ala(params, context)

        # This is the zip file path of the occurrence dataset
        ala_csv = item.get('file').get('url').split('file://')[1]

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'ala_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            # compatibility... this should go away after we fully support 'layered'
            # occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species' ?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(ala_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(ala_csv)), app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)

        # tell importer about new dataset (import it)
        set_progress("RUNNING", u"Import dataset '{0}' from ALA".format(item['title']), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job(
            "FAILED", u"Import of dataset '{0}' from ALA failed".format(item['title']), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job("COMPLETED", u"ALA import '{}' complete".format(item['title']), None, context)

        # Split multi-species dataset
        if import_multspecies_params:
            import_multispecies_job = import_multi_species_csv_job(item.get('file').get('url'),
                                                                   import_multspecies_params['results_dir'],
                                                                   import_multspecies_params['import_context'],
                                                                   context)
            import_multispecies_job.link_error(set_progress_job(
                "FAILED", u"Split multi-species dataset '{0}' from ALA failed".format(item['title']), None, context))
            import_multispecies_job.link_error(cleanup_job)
            (import_job | import_multispecies_job | cleanup_job | finish_job).delay()
        else:
            (import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED', 'Download occurrence dataset from ALA: {}'.format(e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', params, dest_url, e, exc_info=True)
    finally:
        for tmpdir in results:
            if tmpdir and os.path.exists(tmpdir):
                shutil.rmtree(tmpdir)
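
The workflow above is assembled from Celery signatures: each *_job helper returns a signature, link_error attaches a failure callback, and the | operator chains the signatures before .delay() enqueues them. Below is a stripped-down sketch of that pattern with placeholder tasks; the task bodies and the in-memory broker are illustrative, not the project's configuration.

from celery import Celery

app = Celery('sketch', broker='memory://')  # placeholder broker


@app.task
def import_step(items):
    return len(items)


@app.task
def cleanup_step():
    pass


@app.task
def report(status):
    pass


def submit(items):
    import_job = import_step.si(items)       # .si() builds an immutable signature
    cleanup_job = cleanup_step.si()
    finish_job = report.si('COMPLETED')
    import_job.link_error(report.si('FAILED'))   # failure callback
    import_job.link_error(cleanup_job)           # always clean up on failure
    (import_job | cleanup_job | finish_job).delay()  # chain and enqueue
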
Code Example #6
File: tasks.py  Project: BCCVL/org.bccvl.tasks
def update_metadata(url, filename, contenttype, context):
    try:
        set_progress('RUNNING', 'Download {0}'.format(url), None, context)
        tmpdir = tempfile.mkdtemp()
        tmpfile = '{}/{}'.format(tmpdir, filename)
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpfile), settings)
        movelib.move(src, dst)
        item = {
            'filemetadata': extract_metadata(tmpfile, contenttype)
        }

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if contenttype == 'text/csv':
            if ('headers' not in item['filemetadata']
                    or 'lat' not in item['filemetadata']['headers']
                    or 'lon' not in item['filemetadata']['headers']):
                raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url),
                     None, context)

        import_job = import_file_metadata_job([item], url, context)
        import_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))
        finish_job = set_progress_job(
            "COMPLETED", 'Metadata update for {} complete'.format(url),
            None, context)
        (import_job | finish_job).delay()
    except Exception as e:
        set_progress('FAILED',
                     'Metadata update for {} failed: {}'.format(url, e),
                     None, context)
        LOG.error('Metadata update for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
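
update_metadata creates its temporary directory inside the try block and removes it in finally. A common defensive variant initialises the name before the try so the finally clause can never hit an unbound variable if an early step raises. A minimal sketch of that pattern follows; do_work is a placeholder for the download/extract/import steps.

import os
import shutil
import tempfile


def with_scratch_dir(do_work):
    tmpdir = None  # defined up front so the finally clause is always safe
    try:
        tmpdir = tempfile.mkdtemp()
        return do_work(tmpdir)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
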
Code Example #7
    def mock_run_script(self, *args, **kw):
        # simulate a script run
        wrapper, params, context = args
        # 1. write file into results_dir
        tmpdir = urlsplit(params['result']['results_dir']).path
        try:
            # 2. create some result files
            for fname in ('model.RData',
                          'proj_test.tif'):
                img = Image.new('F', (10, 10))
                img.save(os.path.join(tmpdir, fname), 'TIFF')
            # 3. store results
            items = [
                {
                    'file': {
                        'url': 'file://{}/model.RData'.format(tmpdir),
                        'contenttype': 'application/x-r-data',
                        'filename': 'model.RData',
                    },
                    'title': 'Model Title',
                    'description': 'Model Description',
                    'bccvlmetadata': {
                        'genre': 'DataGenreSDMModel',
                    },
                    'filemetadata': {},
                    'layermd': {}
                },
                {
                    'file': {
                        'url': 'file://{}/proj_test.tif'.format(tmpdir),
                        'contenttype': 'image/tiff',
                        'filename': 'proj_test.tif',
                    },
                    'title': 'Test Projection',
                    'description': 'Test Projection Description',
                    'bccvlmetadata': {
                        'genre': 'DataGenreCP',
                    },
                    'filemetadata': {
                        'band': [{
                            'min': 0.0,
                            'STATISTICS_MINIMUM': 0.0,
                            'max': 1.0
                        }]
                    },
                    'layermd': {'files': {'proj_test.tif': {'layer': 'projection_probability', 'data_type': 'Continuous'}}}
                },
                {
                    'file': {
                        'url': 'file://{}/proj_test.tif'.format(tmpdir),
                        'contenttype': 'image/tiff',
                        'filename': 'proj_test.tif',
                    },
                    'title': 'Test Envelop Projection',
                    'description': 'Test Envelop Projection Description',
                    'bccvlmetadata': {
                        'genre': 'DataGenreCP_ENVLOP',
                    },
                    'filemetadata': {
                        'band': [{
                            'min': 0.0,
                            'STATISTICS_MINIMUM': 0.0,
                            'max': 1.0
                        }]
                    },
                    'layermd': {'files': {'proj_test.tif': {'layer': 'projection_probability', 'data_type': 'Continuous'}}}
                }

            ]
            # TODO: tasks called directly here; maybe call them as tasks as
            # well? (chain?)
            import_result_job(items, params['result'][
                              'results_dir'], context).delay()
            import_cleanup(params['result']['results_dir'], context)
            set_progress('COMPLETED', 'Test Task succeeded', None, context)
        except Exception as e:
            # 4. clean up if problem otherwise import task cleans up
            #    TODO: should be done by errback or whatever
            import_cleanup(params['result']['results_dir'], context)
            set_progress('FAILED', 'Test Task failed', None, context)
            raise
Code Example #8
File: ala.py  Project: BCCVL/org.bccvl.tasks
def download_occurrence_from_ala(params, context):
    results = []
    species = []   # a list of species metadata
    ds_names = []

    for dataset in params:
        src = None
        dst = None
        occurrence_url = dataset['url'].rstrip('/') + "/occurrences/index/download"
        query = dataset['query']    # i.e. qid:<qid> or lsid:<lsid>
        qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
        email = context.get('user', {}).get('email', '')
        ds_names.append(dataset.get('name', ''))

        # download occurrence file
        # TODO: ignore the file if the download fails (exception), but continue??
        tmpdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(tmpdir)

        src = build_source('ala://ala?url={}&query={}&filter={}&email={}'.format(occurrence_url, query, qfilter, email))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)

        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata for {0} from ala'.format(dataset['query']), None, context)
        # open ala_dataset.json
        ala_ds = json.load(open(os.path.join(tmpdir, 'ala_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in ala_ds['files']))

        # occurrence data file
        ala_csv = files['occurrence']['url']  # this is actually a zip file now

        # read ala metadata from attribution file.
        # May not have metadata for a dataset uploaded by a user into the sandbox
        if files.get('attribution'):
            ala_md_list = json.load(open(files['attribution']['url'], 'r'))
            for md in ala_md_list:
                species.append({
                    'scientificName': md.get('scientificName'),
                    'vernacularName': md.get('commonNameSingle') or md.get('scientificName'),
                    'taxonID': md.get('guid'),
                    'rank': md.get('rank'),
                    'genus': md.get('genus'),
                    'family': md.get('family'),
                    'order': md.get('order'),
                    'clazz': md.get('classs'),
                    'phylum': md.get('phylum'),
                    'kingdom': md.get('kingdom')
                })

    # Should not happen
    if len(results) == 0:
        raise Exception("No occurrence dataset is downloaded from ALA")

    # Combine all the occurrence and citation files from each download into 1 dataset
    imported_date = datetime.datetime.now().strftime('%d/%m/%Y')
    if len(results) > 1:
        destdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(destdir)
        os.mkdir(os.path.join(destdir, 'data'))
        combine_csv(results[:-1], 'data/ala_occurrence.csv', destdir)
        combine_csv(results[:-1], 'data/ala_citation.csv', destdir)

        # Zip it out and point to the new zip file
        ala_csv = os.path.join(destdir, 'ala_occurrence.zip')
        zip_occurrence_data(ala_csv,
                            os.path.join(destdir, 'data'),
                            ['ala_occurrence.csv', 'ala_citation.csv'])

        # Make a title & description for multispecies dataset
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
        else:
            ds_name = ','.join([sp['scientificName'] for sp in species])
            title = "{} occurrences".format(ds_name)
        description = "Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)

    else:
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
            description = "Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)
        else:
            title = ala_ds['title']
            description = ala_ds['description']
        species = species[0]

    # build bccvl metadata:
    bccvlmd = {
        'genre': 'DataGenreSpeciesOccurrence',
        'categories': ['occurrence'],
        'species': species
    }

    # build item to import
    item = {
        'title': title,
        'description': description,
        'file': {
            'url': 'file://{}'.format(ala_csv),  # local file url
            'contenttype': 'application/zip',
            'filename': os.path.basename(ala_csv)
        },
        'bccvlmetadata': bccvlmd,
        'filemetadata': extract_metadata(ala_csv, 'application/zip'),
    }
    return (item, results)
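
combine_csv and zip_occurrence_data are project helpers whose implementations are not shown on this page. As a rough stdlib-only illustration (not the project's code), the zipping step could look like the sketch below, writing each CSV under a data/ prefix to match the data/ala_occurrence.csv paths referenced above.

import os
import zipfile


def zip_occurrences(zip_path, data_dir, filenames):
    # Write each file under a 'data/' prefix inside the archive, matching the
    # 'data/ala_occurrence.csv' layout used in the metadata above.
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for name in filenames:
            zf.write(os.path.join(data_dir, name),
                     arcname=os.path.join('data', name))
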
Code Example #9
File: compute.py  Project: BCCVL/org.bccvl.tasks
def run_script_SDM(wrapper, params, context):
    # TODO: there are many little things that can fail here, and we
    #       need to communicate it properly back to the user.
    # TODO: however, we can't really do anything in case sending
    #       messages doesn't work.
    try:

        errmsg = 'Fail to transfer/import data'
        set_progress('RUNNING', 'Transferring data', None, context)

        # create initial folder structure
        create_workenv(params)
        # FIXME: remove me
        write_status_to_nectar(params, context, u'FETCHING')

        # transfer input files
        transfer_inputs(params, context)

        # create script
        scriptname = create_scripts(params, context)

        # run the script
        errmsg = 'Fail to run experiment'
        set_progress('RUNNING', 'Executing job', None, context)
        # FIXME: remove me
        write_status_to_nectar(params, context, u'RUNNING')

        scriptout = os.path.join(params['env']['outputdir'],
                                 params['worker']['script']['name'] + 'out')
        outfile = open(scriptout, 'w')
        wrapsh = os.path.join(params['env']['scriptdir'], 'wrap.sh')
        open(wrapsh, 'w').write(wrapper)
        # zip up workenv if requested
        if params['worker'].get('zipworkenv', False):
            # make sure tmp is big enough
            # TODO: add toolkit name to zip name ... workenv_bioclim.zip
            zip_folder(os.path.join(params['env']['outputdir'], 'workenv.zip'),
                       params['env']['workdir'])
        cmd = ["/bin/bash", "-l", "wrap.sh", scriptname]
        LOG.info("Executing: %s", ' '.join(cmd))

        run_date = datetime.datetime.now().strftime('%d/%m/%Y')

        proc = subprocess.Popen(cmd, cwd=params['env']['scriptdir'],
                                close_fds=True,
                                stdout=outfile, stderr=subprocess.STDOUT)
        rpid, ret, rusage = os.wait4(proc.pid, 0)
        usage = get_rusage(rusage)
        # TODO: check whether ret and proc.returncode are the same

        # Reproject using Web Mercator projection
        proj_files = reproject_to_webmercator(params, context)

        # move results back
        errmsg = 'Fail to transfer results back'
        set_progress('RUNNING', 'Transferring outputs', usage, context)
        # FIXME: remove me
        write_status_to_nectar(params, context, u'TRANSFERRING')

        # Push the projection to nectar, for the wordpress site to fetch
        transfer_projections(params, context, proj_files)

        # push the projection metadata file
        push_projection_info(params, context, run_date)

        set_progress('COMPLETED', 'Task succeeded', None, context)
        # FIXME: remove me
        write_status_to_nectar(params, context, u'COMPLETE')

    except Exception as e:
        # TODO: capture stacktrace
        # need to start import to get import cleaned up

        # Log error message with stacktrace.
        #:( exposes internals, ugly hash, complicated with admin only access
        #-> certainly need to get rid of exception in message.
        # test exceptions:
        #  ... upload file, replace with something else (unzip error)
        #  ... delete file and rerun experiment (download error)
        #  ... create file/folder error? (can't write log)
        #  ... how to simulate fault? (download error)

        # log error message with exception and traceback
        LOG.error(errmsg, exc_info=True)

        set_progress('FAILED', errmsg, None, context)
        # FIXME: remove me
        write_status_to_nectar(params, context, u'FAILED')

        raise
    finally:
        # TODO:  check if dir exists
        path = params['env'].get('workdir', None)
        if path and os.path.exists(path):
            shutil.rmtree(path)
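
run_script_SDM launches the wrapper with subprocess.Popen and reaps it with os.wait4 so that resource usage is available alongside the exit status. A minimal standalone sketch of that pattern is shown below (POSIX-only; the example command at the end is just an illustration).

import os
import subprocess


def run_and_measure(cmd, workdir, logpath):
    with open(logpath, 'w') as outfile:
        proc = subprocess.Popen(cmd, cwd=workdir, close_fds=True,
                                stdout=outfile, stderr=subprocess.STDOUT)
        # os.wait4 returns (pid, wait_status, struct_rusage); the wait status
        # is encoded, so decode it before comparing against 0.
        _pid, status, rusage = os.wait4(proc.pid, 0)
    exit_code = os.WEXITSTATUS(status) if os.WIFEXITED(status) else -1
    return exit_code, rusage.ru_utime, rusage.ru_maxrss


# e.g.: run_and_measure(['/bin/echo', 'hello'], '/tmp', '/tmp/echo.log')
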
Code Example #10
File: compute.py  Project: BCCVL/org.bccvl.tasks
def run_script(wrapper, params, context):
    # TODO: there are many little things that can fail here, and we
    #       need to communicate it properly back to the user.
    # TODO: however, we can't really do anything in case sending
    #       messages doesn't work.
    items = []
    try:
        errmsg = 'Fail to transfer/import data'
        set_progress('RUNNING', 'Transferring data', None, context)

        # create initial folder structure
        create_workenv(params)

        # transfer input files
        transfer_inputs(params, context)
        # create script
        scriptname = create_scripts(params, context)

        # run the script
        errmsg = 'Fail to run experiment'
        set_progress('RUNNING', 'Executing job', None, context)

        scriptout = os.path.join(params['env']['outputdir'],
                                 params['worker']['script']['name'] + 'out')
        outfile = open(scriptout, 'w')
        wrapsh = os.path.join(params['env']['scriptdir'], 'wrap.sh')
        open(wrapsh, 'w').write(wrapper)
        # zip up workenv if requested
        if params['worker'].get('zipworkenv', False):
            # make sure tmp is big enough
            # TODO: add toolkit name to zip name ... workenv_bioclim.zip
            zip_folder(os.path.join(params['env']['outputdir'], 'workenv.zip'),
                       params['env']['workdir'])
        cmd = ["/bin/bash", "-l", "wrap.sh", scriptname]
        LOG.info("Executing: %s", ' '.join(cmd))
        proc = subprocess.Popen(cmd, cwd=params['env']['scriptdir'],
                                close_fds=True,
                                stdout=outfile, stderr=subprocess.STDOUT)
        rpid, ret, rusage = os.wait4(proc.pid, 0)
        # TODO: should we write this as json file and send as result back
        #       or just send rusage with finished message?
        usage = get_rusage(rusage)
        # TODO: check whether ret and proc.returncode are the same

        # move results back
        errmsg = 'Fail to transfer results back'
        set_progress('RUNNING', 'Transferring outputs', usage, context)
        # TODO: maybe redesign this?
        #       transfer only uploads to destination and stores new url somewhere
        #       and we do metadata extraction and item creation afterwards (here)?
        items = transfer_outputs(params, context)

        # we are done here, hand over to result importer
        # build a chain of the remaining tasks
        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)

        cleanup_job = import_cleanup_job(
            params['result']['results_dir'], context)
        import_job = import_result_job(items, params['result'][
                                       'results_dir'], context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Result import failed', None, context))
        import_job.link_error(cleanup_job)

        if ret != 0:
            errmsg = 'Script execution failed with exit code {0}'.format(ret)
            finish_job = set_progress_job('FAILED', errmsg, None, context)
        else:
            finish_job = set_progress_job(
                'COMPLETED', 'Task succeeded', None, context)

        (start_import | import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        # TODO: capture stacktrace
        # need to start import to get import cleaned up

        # Log error message with stacktrace.
        #:( exposes internals, ugly hash, complicated with admin only access
        #-> certainly need to get rid of exception in message.
        # test exceptions:
        #  ... upload file, replace with something else (unzip error)
        #  ... delete file and rerun experiment (download error)
        #  ... create file/folder error? (can't write log)
        #  ... how to simulate fault? (download error)

        # log error message with exception and traceback
        LOG.error(errmsg, exc_info=True)

        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)

        import_job = import_result_job(items, params['result'][
                                       'results_dir'], context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Result import failed', None, context))

        finish_job = set_progress_job('FAILED', errmsg, None, context)

        (start_import | import_job | finish_job).delay()
        raise
    finally:
        # TODO:  check if dir exists
        path = params['env'].get('workdir', None)
        if path and os.path.exists(path):
            shutil.rmtree(path)
Code Example #11
File: gbif.py  Project: BCCVL/org.bccvl.tasks
def pull_occurrences_from_gbif(lsid, dest_url, context):
    # 1. set progress
    set_progress('RUNNING', 'Download {0} from gbif'.format(lsid), None, context)
    # 2. do move
    src = None
    dst = None
    try:
        tmpdir = tempfile.mkdtemp(prefix='gbif_download_')
        src = build_source('gbif://gbif?lsid={}'.format(lsid))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)
        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata {0} from gbif'.format(lsid), None, context)
        # open gbif_dataset.json
        gbif_ds = json.load(open(os.path.join(tmpdir, 'gbif_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in gbif_ds['files']))
        # read gbif metadata from attribution file
        gbif_md = json.load(open(files['attribution']['url'], 'r'))
        gbif_csv = files['occurrence']['url']

        # build bccvl metadata:
        bccvlmd = {
            'genre': 'DataGenreSpeciesOccurrence',
            'categories': ['occurrence'],
            'species': {
                'scientificName': gbif_md.get('scientificName', None),
                'vernacularName': gbif_md.get('vernacularName', None),
                'taxonID': gbif_md.get('key', None),
                'rank': gbif_md.get('rank', None),
                'genus': gbif_md.get('genus', None),
                'genusGuid': gbif_md.get('genusKey', None),
                'family': gbif_md.get('family', None),
                'familyGuid': gbif_md.get('familyKey', None),
                'order': gbif_md.get('order', None),
                'orderGuid': gbif_md.get('orderKey', None),
                'clazz': gbif_md.get('class', None),
                'clazzGuid': gbif_md.get('classKey', None),
                'phylum': gbif_md.get('phylum', None),
                'phylumGuid': gbif_md.get('phylumKey', None),
                'kingdom': gbif_md.get('kingdom', None),
                'kingdomGuid': gbif_md.get('kingdomKey', None)
            },
        }
        # build item to import
        item = {
            'title': gbif_ds['title'],
            'description': gbif_ds['description'],
            'file': {
                'url': 'file://{}'.format(gbif_csv),  # local file url
                'contenttype': 'application/zip',
                'filename': os.path.basename(gbif_csv)
            },
            'bccvlmetadata': bccvlmd,
            'filemetadata': extract_metadata(gbif_csv, 'application/zip'),
        }

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'gbif_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            # compatibility... this should go away after we fully support 'layered'
            # occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species' ?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(gbif_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(gbif_csv)), app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)
        # tell importer about new dataset (import it)
        set_progress('RUNNING', 'Import gbif data {0}'.format(lsid), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job("FAILED", "Import of gbif data failed {0}".format(lsid), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job("COMPLETED", 'GBIF import {} complete'.format(lsid), None, context)
        (import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED', 'Download {0} from gbif: {1}'.format(lsid, e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', src, dest_url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
Code Example #12
File: tasks.py  Project: BCCVL/org.bccvl.tasks
def import_multi_species_csv(url, results_dir, import_context, context):
    # url .... source file
    # results_dir ... folder to place split files into
    # context ... the context with user and orig dataset
    try:
        set_progress('RUNNING', 'Split {0}'.format(url), None, context)
        # step 1: update main dataset metadata
        tmpdir = tempfile.mkdtemp()
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpdir), settings)
        movelib.move(src, dst)

        # Get the downloaded filename
        tmpfile = glob.glob(os.path.join(tmpdir, '*'))[0]

        # Extract occurrence file from downloaded file
        mimetype, enc = mimetypes.guess_type(tmpfile)
        if mimetype == 'application/zip':
            src_occ_data = os.path.join('data', 'ala_occurrence.csv')
            with zipfile.ZipFile(tmpfile, 'r') as zipf:
                occfile = os.path.join(tmpdir, src_occ_data)
                zipf.extract(src_occ_data, tmpdir)
            item = {
                'filemetadata': extract_metadata(tmpfile, 'application/zip')
            }
            occmd = item['filemetadata'].get(src_occ_data, {}).get('metadata', {})
        else:
            # csv file
            item = {
                'filemetadata': extract_metadata(tmpfile, "text/csv")
            }
            occfile = tmpfile
            occmd = item['filemetadata']

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if ('headers' not in occmd
                or 'lat' not in occmd['headers']
                or 'lon' not in occmd['headers']):
            raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url),
                     None, context)

        import_md_job = import_file_metadata_job([item], url, context)
        import_md_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))

        # step 2: split csv file and create sub datasets
        # start reading csv file and create new datasets which will be
        #       linked up with dataset collection item
        # FIXME: large csv files should be streamed to separate files (not read
        #        into RAM like here)
        f = io.open(occfile, 'r', encoding='utf-8', errors='ignore')
        csvreader = UnicodeCSVReader(f)
        headers = csvreader.next()
        if 'species' not in headers:
            raise Exception('missing species column')
        speciesidx = headers.index('species')
        # create dict with all data .... species column used as key, and rest
        # is just added
        data = {}
        for row in csvreader:
            if not row:
                continue
            species = row[speciesidx]
            if species not in data:
                # create new entry for species
                fname = u'{0}.csv'.format(species).replace(
                    u'/', u'_').encode('idna')
                # TODO: make sure fname contains only legal filename characters
                fpath = os.path.join(tmpdir, fname)
                file = io.open(fpath, 'wb')
                fwriter = UnicodeCSVWriter(file)
                fwriter.writerow(headers)
                data[species] = {
                    'file': file,
                    'writer': fwriter,
                    'path': fpath,
                    'name': fname
                }
            data[species]['writer'].writerow(row)
        # ok we have got all data and everything in separate files
        # close all files
        for species in data:
            data[species]['file'].close()
            del data[species]['file']
            del data[species]['writer']
        # extract metadata
        for species in data:
            data[species]['filemetadata'] = extract_metadata(
                data[species]['path'],
                'text/csv'
            )
        # send files to destination
        for species in data:
            src = build_source('file://{}'.format(data[species]['path']))
            dst = build_destination(os.path.join(results_dir,
                                                 data[species]['name']),
                                    app.conf.get('bccvl', {}))
            data[species]['url'] = dst['url']
            movelib.move(src, dst)
        # all files uploaded .... send import jobs
        set_progress('RUNNING', 'Create datasets for {0}'.format(
            url), None, context)
        items = []
        for species in data:
            # build item
            item = {
                'title': u'{0} occurrences'.format(species),
                'description': '',
                'file': {
                    'url': data[species]['url'],
                    'filename': data[species]['name'],
                    'contenttype': 'text/csv',
                },
                'bccvlmetadata': {
                    'genre': 'DataGenreSpeciesOccurrence',
                    'categories': ['occurrence'],
                    'species': {
                        'scientificName': species,
                    }
                },
                'filemetadata': data[species]['filemetadata'],
                '_partof': {
                    # add back reference to orig dataset
                    # TODO: shouldn't use absolute path here
                    'path': context['context']
                }
            }
            items.append(item)
        # start import process
        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)
        # What is results_dir being used for?
        import_job = import_result_job(items, results_dir, import_context)
        cleanup_job = import_cleanup_job(results_dir, context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Multi species import failed', None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            'COMPLETED', 'Task succeeded', None, context)
        (start_import | import_md_job | import_job |
         cleanup_job | finish_job).delay()
        # FIXME: missing stuff...
        #        need to set multi species collection to finished at some stage
    except Exception as e:
        set_progress('FAILED',
                     'Error while splitting Multi Species CSV {}: {}'.format(
                         url, e),
                     None, context)
        LOG.error('Multi species split for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
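
The splitting loop above routes each row into a per-species CSV keyed on the 'species' column. Below is a simplified Python 3 sketch of the same idea using only the csv module; it is an illustration, not the project's code, and skips the filename sanitising and metadata-extraction steps.

import csv
import os


def split_by_species(occ_csv, out_dir):
    """Write one CSV per value of the 'species' column; return {species: path}."""
    outputs = {}
    with open(occ_csv, newline='', encoding='utf-8', errors='ignore') as src:
        reader = csv.reader(src)
        headers = next(reader)
        if 'species' not in headers:
            raise ValueError('missing species column')
        idx = headers.index('species')
        for row in reader:
            if not row:
                continue
            species = row[idx]
            if species not in outputs:
                path = os.path.join(out_dir, species.replace('/', '_') + '.csv')
                out = open(path, 'w', newline='', encoding='utf-8')
                writer = csv.writer(out)
                writer.writerow(headers)
                outputs[species] = (out, writer, path)
            outputs[species][1].writerow(row)
    for out, _writer, _path in outputs.values():
        out.close()
    return {species: path for species, (_out, _w, path) in outputs.items()}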