def mock_run_script(self, *args, **kw):
    # simulate a script run
    wrapper, params, context = args
    # 1. write file into results_dir
    tmpdir = urlsplit(params['result']['results_dir']).path
    try:
        # 2. create some result files
        for fname in ('model.RData', 'proj_test.tif'):
            img = Image.new('F', (10, 10))
            img.save(os.path.join(tmpdir, fname), 'TIFF')
        # 3. store results
        items = [
            {
                'file': {
                    'url': 'file://{}/model.RData'.format(tmpdir),
                    'contenttype': 'application/x-r-data',
                    'filename': 'model.RData',
                },
                'title': 'Model Title',
                'description': 'Model Description',
                'bccvlmetadata': {
                    'genre': 'DataGenreSDMModel',
                },
                'filemetadata': {},
                'layermd': {}
            },
            {
                'file': {
                    'url': 'file://{}/proj_test.tif'.format(tmpdir),
                    'contenttype': 'image/tiff',
                    'filename': 'proj_test.tif',
                },
                'title': 'Test Projection',
                'description': 'Test Projection Description',
                'bccvlmetadata': {
                    'genre': 'DataGenreCP',
                },
                'filemetadata': {
                    'band': [{
                        'min': 0.0,
                        'STATISTICS_MINIMUM': 0.0,
                        'max': 1.0
                    }]
                },
                'layermd': {'files': {'proj_test.tif': {'layer': 'projection_probability',
                                                        'data_type': 'Continuous'}}}
            }
        ]
        # TODO: tasks called directly here; maybe call them as tasks as
        #       well? (chain?)
        import_result_job(items, params['result']['results_dir'],
                          context).delay()
        import_cleanup(params['result']['results_dir'], context)
        set_progress('COMPLETED', 'Test Task succeeded', None, context)
    except Exception as e:
        # 4. clean up if there was a problem, otherwise the import task cleans up
        # TODO: should be done by errback or whatever
        import_cleanup(params['result']['results_dir'], context)
        set_progress('FAILED', 'Test Task failed', None, context)
        raise

def mock_run_script(self, *args, **kw):
    # simulate a script run
    wrapper, params, context = args
    # 1. write file into results_dir
    tmpdir = urlsplit(params['result']['results_dir']).path
    try:
        # 2. create some result files
        for fname in ('model.RData', 'traits.txt'):
            open(os.path.join(tmpdir, fname), 'w').write('Mock Result')
        # 3. store results
        items = [
            {
                'file': {
                    'url': 'file://{}/model.RData'.format(tmpdir),
                    'contenttype': 'application/x-r-data',
                    'filename': 'model.RData',
                },
                'title': 'Model Title',
                'description': 'Model Description',
                'bccvlmetadata': {
                    'genre': 'DataGenreSTModel',
                },
                'filemetadata': {},
                'layermd': {}
            },
            {
                'file': {
                    'url': 'file://{}/traits.txt'.format(tmpdir),
                    'contenttype': 'text/plain',
                    'filename': 'traits.txt',
                },
                'title': 'Test Traits',
                'description': 'Test Traits Description',
                'bccvlmetadata': {
                    'genre': 'DataGenreSTResult',
                },
                'filemetadata': {},
                'layermd': {},
            }
        ]
        # TODO: tasks called directly here; maybe call them as tasks as
        #       well? (chain?)
        import_result_job(items, params['result']['results_dir'],
                          context).delay()
        import_cleanup(params['result']['results_dir'], context)
        set_progress('COMPLETED', 'Test Task succeeded', None, context)
    except Exception as e:
        # 4. clean up if there was a problem, otherwise the import task cleans up
        # TODO: should be done by errback or whatever
        import_cleanup(params['result']['results_dir'], context)
        set_progress('FAILED', 'Test Task failed', None, context)
        raise

def mock_run_script(self, *args, **kw):
    # simulate a script run
    wrapper, params, context = args
    # 1. write file into results_dir
    tmpdir = urlsplit(params['result']['results_dir']).path
    try:
        # 2. create some result files
        for fname in ('ensemble.tif',):
            img = Image.new('F', (10, 10))
            img.save(os.path.join(tmpdir, fname), 'TIFF')
        # 3. store results
        items = [
            {
                'file': {
                    'url': 'file://{}/ensemble.tif'.format(tmpdir),
                    'contenttype': 'image/tiff',
                    'filename': 'ensemble.tif',
                },
                'title': 'Ensemble Output',
                'description': 'Ensemble Output Description',
                'bccvlmetadata': {
                    'genre': 'DataGenreEnsembleResult',
                },
                'filemetadata': {
                    'band': [{
                        'min': 0.0,
                        'STATISTICS_MINIMUM': 0.0,
                        'max': 1.0
                    }]
                },
                'layermd': {},
            }
        ]
        # TODO: tasks called directly here; maybe call them as tasks as
        #       well? (chain?)
        import_result_job(items, params['result']['results_dir'],
                          context).delay()
        import_cleanup(params['result']['results_dir'], context)
        set_progress('COMPLETED', 'Test Task succeeded', None, context)
    except Exception as e:
        # 4. clean up if there was a problem, otherwise the import task cleans up
        # TODO: should be done by errback or whatever
        import_cleanup(params['result']['results_dir'], context)
        set_progress('FAILED', 'Test Task failed', None, context)
        raise

def mock_run_script(self, *args, **kw):
    # simulate a script run
    wrapper, params, context = args
    # 1. write file into results_dir
    tmpdir = urlsplit(params['result']['results_dir']).path
    try:
        # 2. create some result files
        for fname in ('model.RData', 'proj_test.tif'):
            img = Image.new('F', (10, 10))
            img.save(os.path.join(tmpdir, fname), 'TIFF')
        # 3. store results
        items = [
            {
                'file': {
                    'url': 'file://{}/model.RData'.format(tmpdir),
                    'contenttype': 'application/x-r-data',
                    'filename': 'model.RData',
                },
                'title': 'Model Title',
                'description': 'Model Description',
                'bccvlmetadata': {
                    'genre': 'DataGenreSDMModel',
                },
                'filemetadata': {},
                'layermd': {}
            },
            {
                'file': {
                    'url': 'file://{}/proj_test.tif'.format(tmpdir),
                    'contenttype': 'image/tiff',
                    'filename': 'proj_test.tif',
                },
                'title': 'Test Projection',
                'description': 'Test Projection Description',
                'bccvlmetadata': {
                    'genre': 'DataGenreCP',
                },
                'filemetadata': {
                    'band': [{
                        'min': 0.0,
                        'STATISTICS_MINIMUM': 0.0,
                        'max': 1.0
                    }]
                },
                'layermd': {'files': {'proj_test.tif': {'layer': 'projection_probability',
                                                        'data_type': 'Continuous'}}}
            },
            {
                'file': {
                    'url': 'file://{}/proj_test.tif'.format(tmpdir),
                    'contenttype': 'image/tiff',
                    'filename': 'proj_test.tif',
                },
                'title': 'Test Envelop Projection',
                'description': 'Test Envelop Projection Description',
                'bccvlmetadata': {
                    'genre': 'DataGenreCP_ENVLOP',
                },
                'filemetadata': {
                    'band': [{
                        'min': 0.0,
                        'STATISTICS_MINIMUM': 0.0,
                        'max': 1.0
                    }]
                },
                'layermd': {'files': {'proj_test.tif': {'layer': 'projection_probability',
                                                        'data_type': 'Continuous'}}}
            }
        ]
        # TODO: tasks called directly here; maybe call them as tasks as
        #       well? (chain?)
        import_result_job(items, params['result']['results_dir'],
                          context).delay()
        import_cleanup(params['result']['results_dir'], context)
        set_progress('COMPLETED', 'Test Task succeeded', None, context)
    except Exception as e:
        # 4. clean up if there was a problem, otherwise the import task cleans up
        # TODO: should be done by errback or whatever
        import_cleanup(params['result']['results_dir'], context)
        set_progress('FAILED', 'Test Task failed', None, context)
        raise

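# Note: the mock_run_script variants above are meant to replace the real
# compute task in unit tests, so that an experiment run produces predictable
# result artefacts and import jobs without executing any R scripts. Below is
# a minimal, hypothetical sketch of how such a mock could be wired in with
# mock.patch; the patch target string and the test class are assumptions for
# illustration only, not the actual test setup of this package.
import unittest

import mock


class ExampleExperimentTest(unittest.TestCase):

    def test_sdm_run(self):
        # patch the (assumed) task entry point so that calling it produces
        # the mocked result files and triggers the import/cleanup jobs
        with mock.patch('org.bccvl.tasks.compute.run_script',
                        side_effect=mock_run_script):
            pass  # ... submit the experiment and assert on imported datasets
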
def run_script(wrapper, params, context):
    # TODO: there are many little things that can fail here, and we
    #       need to communicate it properly back to the user.
    # TODO: however, we can't really do anything in case sending
    #       messages doesn't work.
    items = []
    try:
        errmsg = 'Fail to transfer/import data'
        set_progress('RUNNING', 'Transferring data', None, context)
        # create initial folder structure
        create_workenv(params)
        # transfer input files
        transfer_inputs(params, context)
        # create script
        scriptname = create_scripts(params, context)

        # run the script
        errmsg = 'Fail to run experiment'
        set_progress('RUNNING', 'Executing job', None, context)
        scriptout = os.path.join(params['env']['outputdir'],
                                 params['worker']['script']['name'] + 'out')
        outfile = open(scriptout, 'w')
        wrapsh = os.path.join(params['env']['scriptdir'], 'wrap.sh')
        open(wrapsh, 'w').write(wrapper)
        # zip up workenv if requested
        if params['worker'].get('zipworkenv', False):
            # make sure tmp is big enough
            # TODO: add toolkit name to zip name ... workenv_bioclim.zip
            zip_folder(os.path.join(params['env']['outputdir'], 'workenv.zip'),
                       params['env']['workdir'])
        cmd = ["/bin/bash", "-l", "wrap.sh", scriptname]
        LOG.info("Executing: %s", ' '.join(cmd))
        proc = subprocess.Popen(cmd, cwd=params['env']['scriptdir'],
                                close_fds=True, stdout=outfile,
                                stderr=subprocess.STDOUT)
        rpid, ret, rusage = os.wait4(proc.pid, 0)
        # TODO: should we write this as json file and send as result back
        #       or just send rusage with finished message?
        usage = get_rusage(rusage)
        # TODO: check whether ret and proc.returncode are the same

        # move results back
        errmsg = 'Fail to transfer results back'
        set_progress('RUNNING', 'Transferring outputs', usage, context)
        # TODO: maybe redesign this?
        #       transfer only uploads to destination and stores new url somewhere
        #       and we do metadata extraction and item creation afterwards (here)?
        items = transfer_outputs(params, context)

        # we are done here, hand over to result importer
        # build a chain of the remaining tasks
        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)

        cleanup_job = import_cleanup_job(
            params['result']['results_dir'], context)
        import_job = import_result_job(items, params['result']['results_dir'],
                                       context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Result import failed', None, context))
        import_job.link_error(cleanup_job)

        if ret != 0:
            errmsg = 'Script execution failed with exit code {0}'.format(ret)
            finish_job = set_progress_job('FAILED', errmsg, None, context)
        else:
            finish_job = set_progress_job(
                'COMPLETED', 'Task succeeded', None, context)

        (start_import | import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        # TODO: capture stacktrace
        # need to start import to get import cleaned up

        # Log error message with stacktrace.
        # :( exposes internals, ugly hash, complicated with admin only access
        # -> certainly need to get rid of exception in message.
        # test exceptions:
        #   ... upload file, replace with something else (unzip error)
        #   ... delete file and rerun experiment (download error)
        #   ... create file/folder error? (can't write log)
        #   ... how to simulate fault? (download error)

        # log error message with exception and traceback
        LOG.error(errmsg, exc_info=True)

        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)
        import_job = import_result_job(items, params['result']['results_dir'],
                                       context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Result import failed', None, context))
        finish_job = set_progress_job('FAILED', errmsg, None, context)
        (start_import | import_job | finish_job).delay()
        raise
    finally:
        # TODO: check if dir exists
        path = params['env'].get('workdir', None)
        if path and os.path.exists(path):
            shutil.rmtree(path)

def import_multi_species_csv(url, results_dir, import_context, context):
    # url .... source file
    # results_dir ... folder to place split files into
    # context ... the context with user and orig dataset
    tmpdir = None
    try:
        set_progress('RUNNING', 'Split {0}'.format(url), None, context)
        # step 1: update main dataset metadata
        tmpdir = tempfile.mkdtemp()
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpdir), settings)
        movelib.move(src, dst)

        # Get the downloaded filename
        tmpfile = glob.glob(os.path.join(tmpdir, '*'))[0]

        # Extract occurrence file from downloaded file
        mimetype, enc = mimetypes.guess_type(tmpfile)
        if mimetype == 'application/zip':
            src_occ_data = os.path.join('data', 'ala_occurrence.csv')
            with zipfile.ZipFile(tmpfile, 'r') as zipf:
                occfile = os.path.join(tmpdir, src_occ_data)
                zipf.extract(src_occ_data, tmpdir)
            item = {
                'filemetadata': extract_metadata(tmpfile, 'application/zip')
            }
            occmd = item['filemetadata'].get(src_occ_data, {}).get('metadata', {})
        else:
            # csv file
            item = {
                'filemetadata': extract_metadata(tmpfile, "text/csv")
            }
            occfile = tmpfile
            occmd = item['filemetadata']

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if ('headers' not in occmd
                or 'lat' not in occmd['headers']
                or 'lon' not in occmd['headers']):
            raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url), None, context)
        import_md_job = import_file_metadata_job([item], url, context)
        import_md_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))

        # step 2: split csv file and create sub datasets
        # start reading csv file and create new datasets which will be
        # linked up with dataset collection item
        # FIXME: large csv files should be streamed to separate files
        #        (not read into ram like here)
        f = io.open(occfile, 'r', encoding='utf-8', errors='ignore')
        csvreader = UnicodeCSVReader(f)
        headers = csvreader.next()
        if 'species' not in headers:
            raise Exception('missing species column')
        speciesidx = headers.index('species')
        # create dict with all data .... species column used as key, and rest
        # is just added
        data = {}
        for row in csvreader:
            if not row:
                continue
            species = row[speciesidx]
            if species not in data:
                # create new entry for species
                fname = u'{0}.csv'.format(species).replace(
                    u'/', u'_').encode('idna')
                # TODO: make sure fname contains only legal filename characters
                fpath = os.path.join(tmpdir, fname)
                file = io.open(fpath, 'wb')
                fwriter = UnicodeCSVWriter(file)
                fwriter.writerow(headers)
                data[species] = {
                    'file': file,
                    'writer': fwriter,
                    'path': fpath,
                    'name': fname
                }
            data[species]['writer'].writerow(row)
        # ok we have got all data and everything in separate files
        # close all files
        for species in data:
            data[species]['file'].close()
            del data[species]['file']
            del data[species]['writer']
        # extract metadata
        for species in data:
            data[species]['filemetadata'] = extract_metadata(
                data[species]['path'],
                'text/csv'
            )
        # send files to destination
        for species in data:
            src = build_source('file://{}'.format(data[species]['path']))
            dst = build_destination(
                os.path.join(results_dir, data[species]['name']),
                app.conf.get('bccvl', {}))
            data[species]['url'] = dst['url']
            movelib.move(src, dst)
        # all files uploaded .... send import jobs
        set_progress('RUNNING',
                     'Create datasets for {0}'.format(url), None, context)
        items = []
        for species in data:
            # build item
            item = {
                'title': u'{0} occurrences'.format(species),
                'description': '',
                'file': {
                    'url': data[species]['url'],
                    'filename': data[species]['name'],
                    'contenttype': 'text/csv',
                },
                'bccvlmetadata': {
                    'genre': 'DataGenreSpeciesOccurrence',
                    'categories': ['occurrence'],
                    'species': {
                        'scientificName': species,
                    }
                },
                'filemetadata': data[species]['filemetadata'],
                '_partof': {
                    # add back reference to orig dataset
                    # TODO: shouldn't use absolute path here
                    'path': context['context']
                }
            }
            items.append(item)
        # start import process
        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)
        # What is results_dir being used for?
        import_job = import_result_job(items, results_dir, import_context)
        cleanup_job = import_cleanup_job(results_dir, context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Multi species import failed', None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            'COMPLETED', 'Task succeeded', None, context)
        (start_import | import_md_job | import_job |
         cleanup_job | finish_job).delay()
        # FIXME: missing stuff ...
        #        need to set multi species collection to finished at some stage
    except Exception as e:
        set_progress('FAILED',
                     'Error while splitting Multi Species CSV {}: {}'.format(
                         url, e),
                     None, context)
        LOG.error('Multi species split for %s failed: %s', url, e,
                  exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)

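# Note: the TODO inside import_multi_species_csv points out that the
# per-species filename is not guaranteed to contain only legal filename
# characters (only '/' is replaced). A small, hypothetical helper along the
# lines below could be used instead of the inline replace(); the character
# whitelist is an assumption for illustration, not part of the original code.
import re


def sanitize_species_filename(species):
    # keep letters, digits, dot, dash and underscore; collapse any other
    # run of characters (spaces, slashes, quotes, ...) into one underscore
    safe = re.sub(u'[^A-Za-z0-9._-]+', u'_', species).strip(u'_')
    return u'{0}.csv'.format(safe or u'species')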