Code example #1
File: tasks.py  Project: BCCVL/org.bccvl.tasks
def update_metadata(url, filename, contenttype, context):
    tmpdir = None  # ensure the finally block can always reference tmpdir
    try:
        set_progress('RUNNING', 'Download {0}'.format(url), None, context)
        tmpdir = tempfile.mkdtemp()
        tmpfile = '{}/{}'.format(tmpdir, filename)
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpfile), settings)
        movelib.move(src, dst)
        item = {
            'filemetadata': extract_metadata(tmpfile, contenttype)
        }

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if contenttype == 'text/csv':
            if ('headers' not in item['filemetadata']
                    or 'lat' not in item['filemetadata']['headers']
                    or 'lon' not in item['filemetadata']['headers']):
                raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url),
                     None, context)

        import_job = import_file_metadata_job([item], url, context)
        import_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))
        finish_job = set_progress_job(
            "COMPLETED", 'Metadata update for {} complete'.format(url),
            None, context)
        (import_job | finish_job).delay()
    except Exception as e:
        set_progress('FAILED',
                     'Metadata update for {} failed: {}'.format(url, e),
                     None, context)
        LOG.error('Metadata update for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
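
A minimal sketch of how this task might be called directly. The URL, filename, and the keys inside the context dict are assumptions for illustration, not values taken from the project:

# Hypothetical invocation; in the project this is normally dispatched as a background task.
context = {
    'user': {'id': 'demo-user'},                  # assumed shape of the task context
    'context': '/datasets/my-occurrence-upload',  # assumed path of the original dataset item
}
update_metadata('https://example.org/files/occurrences.csv',
                'occurrences.csv', 'text/csv', context)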
Code example #2
File: ala.py  Project: BCCVL/org.bccvl.tasks
def download_occurrence_from_ala(params, context):
    results = []
    species = []   # a list of species metadata
    ds_names = []

    for dataset in params:
        src = None
        dst = None
        occurrence_url = dataset['url'].rstrip('/') + "/occurrences/index/download"
        query = dataset['query']    # i.e. qid:<qid> or lsid:<lsid>
        qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
        email = context.get('user', {}).get('email', '')
        ds_names.append(dataset.get('name', ''))

        # download occurrence file
        # TODO: ignore file if not successfully downloaded (exception), but continue??
        tmpdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(tmpdir)

        src = build_source('ala://ala?url={}&query={}&filter={}&email={}'.format(occurrence_url, query, qfilter, email))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)

        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata for {0} from ala'.format(dataset['query']), None, context)
        # open ala_dataset.json
        ala_ds = json.load(open(os.path.join(tmpdir, 'ala_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in ala_ds['files']))

        # occurrence data file
        ala_csv = files['occurrence']['url']  # this is actually a zip file now

        # read ala metadata from attribution file.
        # Metadata may be missing for a dataset uploaded by the user into the sandbox
        if files.get('attribution'):
            ala_md_list = json.load(open(files['attribution']['url'], 'r'))
            for md in ala_md_list:
                species.append({
                    'scientificName': md.get('scientificName'),
                    'vernacularName': md.get('commonNameSingle') or md.get('scientificName'),
                    'taxonID': md.get('guid'),
                    'rank': md.get('rank'),
                    'genus': md.get('genus'),
                    'family': md.get('family'),
                    'order': md.get('order'),
                    'clazz': md.get('classs'),
                    'phylum': md.get('phylum'),
                    'kingdom': md.get('kingdom')
                })

    # Should never happen
    if len(results) == 0:
        raise Exception("No occurrence dataset is downloaded from ALA")

    # Combine all the occurrence and citation files from each download into 1 dataset
    imported_date = datetime.datetime.now().strftime('%d/%m/%Y')
    if len(results) > 1:
        destdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(destdir)
        os.mkdir(os.path.join(destdir, 'data'))
        combine_csv(results[:-1], 'data/ala_occurrence.csv', destdir)
        combine_csv(results[:-1], 'data/ala_citation.csv', destdir)

        # Zip it out and point to the new zip file
        ala_csv = os.path.join(destdir, 'ala_occurrence.zip')
        zip_occurrence_data(ala_csv,
                            os.path.join(destdir, 'data'),
                            ['ala_occurrence.csv', 'ala_citation.csv'])

        # Make a title & description for multispecies dataset
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
        else:
            ds_name = ','.join([sp['scientificName'] for sp in species])
            title = "{} occurrences".format(ds_name)
        description = "Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)

    else:
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
            description = "Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)
        else:
            title = ala_ds['title']
            description = ala_ds['description']
        species = species[0]

    # build bccvl metadata:
    bccvlmd = {
        'genre': 'DataGenreSpeciesOccurrence',
        'categories': ['occurrence'],
        'species': species
    }

    # build item to import
    item = {
        'title': title,
        'description': description,
        'file': {
            'url': 'file://{}'.format(ala_csv),  # local file url
            'contenttype': 'application/zip',
            'filename': os.path.basename(ala_csv)
        },
        'bccvlmetadata': bccvlmd,
        'filemetadata': extract_metadata(ala_csv, 'application/zip'),
    }
    return (item, results)
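
A sketch of one way the helper above might be used. The ALA service URL, query, and e-mail are placeholder assumptions, and the caller stays responsible for removing the temporary directories returned in results:

import shutil

# Placeholder dataset definition; the query format follows the qid:<qid> / lsid:<lsid>
# convention noted in the comments above.
params = [{
    'url': 'https://biocache-ws.ala.org.au/ws',   # assumed ALA biocache endpoint
    'query': 'lsid:example-taxon-lsid',           # placeholder query
    'name': 'Example species occurrences',
}]
context = {'user': {'email': 'researcher@example.org'}}

item, tmpdirs = download_occurrence_from_ala(params, context)
try:
    pass  # hand item over to the import machinery here
finally:
    for d in tmpdirs:
        shutil.rmtree(d)   # the function leaves cleanup of its temp dirs to the caller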
Code example #3
File: compute.py  Project: BCCVL/org.bccvl.tasks
def createItem(fname, info, params):
    # fname: full path to file
    # info: ... from outputmap
    name = os.path.basename(fname)
    # layermd ... metadata about raster layer
    layermd = {}
    # bccvlmd ... bccvl specific metadata
    bccvlmd = {}
    genre = info.get('genre', None)
    if genre:
        bccvlmd['genre'] = genre
        if genre in ('DataGenreSDMModel', 'DataGenreCP', 'DataGenreCP_ENVLOP', 'DataGenreClampingMask'):
            if genre == 'DataGenreClampingMask':
                layermd = {
                    'files': {name: {'layer': 'clamping_mask', 'data_type': 'Discrete'}}}
            elif genre in ('DataGenreCP', 'DataGenreCP_ENVLOP'):
                if params['function'] in ('circles', 'convhull', 'voronoihull'):
                    layermd = {
                        'files': {name: {'layer': 'projection_binary', 'data_type': 'Continuous'}}}
                elif params['function'] in ('maxent',):
                    layermd = {
                        'files': {name: {'layer': 'projection_suitablity', 'data_type': 'Continuous'}}}
                else:
                    layermd = {'files': {
                        name: {'layer': 'projection_probability', 'data_type': 'Continuous'}}}
            # FIXME: find a cleaner way to attach metadata
            for key in ('year', 'month', 'emsc', 'gcm'):
                if key in params:
                    bccvlmd[key] = params[key]
        elif genre == 'DataGenreSDMEval' and info.get('mimetype') == 'text/csv':
            # Only pick up the threshold values from the output of Sama's evaluation script
            # FIXME: should not depend on file name (has already changed once
            # and caused disappearance of threshold values in biodiverse)
            if fname.endswith('Loss function intervals table.csv'):
                thresholds = extractThresholdValues(fname)
                # FIXME: merge thresholds?
                bccvlmd['thresholds'] = thresholds
        elif genre == 'DataGenreBiodiverseOutput':
            # Add in the srs and cellsize for Biodiverse
            bccvlmd['srs'] = 'epsg:3577'
            bccvlmd['cellsize'] = params['cluster_size']
    # make sure we have a mimetype
    mimetype = info.get('mimetype', None)
    if mimetype is None:
        mimetype = guess_mimetype(fname)
    # extract file metadata
    filemd = extract_metadata(fname, mimetype)

    # FIXME: check keys to make sense
    #        -> merge layermd and filemetadata?
    #        -> merge bccvlmd and filemetadata?
    return {
        'file': {
            'url': 'file://{}'.format(fname),  # local file url
            'contenttype': mimetype,
            'filename': name
        },
        'title': name,
        'description': info.get('title', u''),
        'bccvlmetadata': bccvlmd,
        'filemetadata': filemd,
        'layermd': layermd,
        'order': info.get('order', 999999)
    }
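
A small usage sketch for createItem, assuming an outputmap entry roughly shaped like the info dict below; the file path, genre, and parameter values are illustrative only:

# Illustrative values; the real info dict comes from the experiment's outputmap.
info = {
    'genre': 'DataGenreCP',
    'mimetype': 'image/geotiff',
    'title': 'Projection to current climate',
    'order': 1,
}
params = {'function': 'maxent', 'year': 2015, 'gcm': 'ACCESS1-0'}
item = createItem('/tmp/output/proj_current.tif', info, params)
# for the maxent branch above, item['layermd'] labels the raster as the suitability projection layer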
Code example #4
File: gbif.py  Project: BCCVL/org.bccvl.tasks
def pull_occurrences_from_gbif(lsid, dest_url, context):
    # 1. set progress
    set_progress('RUNNING', 'Download {0} from gbif'.format(lsid), None, context)
    # 2. do move
    src = None
    dst = None
    tmpdir = None  # ensure the finally block can always reference tmpdir
    try:
        tmpdir = tempfile.mkdtemp(prefix='gbif_download_')
        src = build_source('gbif://gbif?lsid={}'.format(lsid))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)
        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata {0} from gbif'.format(lsid), None, context)
        # open gbif_dataset.json
        gbif_ds = json.load(open(os.path.join(tmpdir, 'gbif_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in gbif_ds['files']))
        # read gbif metadata from attribution file
        gbif_md = json.load(open(files['attribution']['url'], 'r'))
        gbif_csv = files['occurrence']['url']

        # build bccvl metadata:
        bccvlmd = {
            'genre': 'DataGenreSpeciesOccurrence',
            'categories': ['occurrence'],
            'species': {
                'scientificName': gbif_md.get('scientificName', None),
                'vernacularName': gbif_md.get('vernacularName', None),
                'taxonID': gbif_md.get('key', None),
                'rank': gbif_md.get('rank', None),
                'genus': gbif_md.get('genus', None),
                'genusGuid': gbif_md.get('genusKey', None),
                'family': gbif_md.get('family', None),
                'familyGuid': gbif_md.get('familyKey', None),
                'order': gbif_md.get('order', None),
                'orderGuid': gbif_md.get('orderKey', None),
                'clazz': gbif_md.get('class', None),
                'clazzGuid': gbif_md.get('classKey', None),
                'phylum': gbif_md.get('phylum', None),
                'phylumGuid': gbif_md.get('phylumKey', None),
                'kingdom': gbif_md.get('kingdom', None),
                'kingdomGuid': gbif_md.get('kingdomKey', None)
            },
        }
        # build item to import
        item = {
            'title': gbif_ds['title'],
            'description': gbif_ds['description'],
            'file': {
                'url': 'file://{}'.format(gbif_csv),  # local file url
                'contenttype': 'application/zip',
                'filename': os.path.basename(gbif_csv)
            },
            'bccvlmetadata': bccvlmd,
            'filemetadata': extract_metadata(gbif_csv, 'application/zip'),
        }

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'gbif_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            # compatibility... this should go away after we fully support 'layered'
            # occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species' ?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(gbif_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(gbif_csv)), app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)
        # tell importer about new dataset (import it)
        set_progress('RUNNING', 'Import gbif data {0}'.format(lsid), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job("FAILED", "Import of gbif data failed {0}".format(lsid), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job("COMPLETED", 'GBIF import {} complete'.format(lsid), None, context)
        (import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED', 'Download {0} from gbif: {1}'.format(lsid, e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', src, dest_url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
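
A hedged sketch of a direct call; the taxon id, destination URL, and context keys are assumptions for illustration:

# Hypothetical values; in the project this normally runs as a background task.
context = {'user': {'id': 'demo-user'}, 'context': '/datasets/gbif-import'}
dest_url = 'file:///tmp/gbif_results'   # assumed: any destination URL understood by movelib
pull_occurrences_from_gbif('example-gbif-taxon-key', dest_url, context)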
Code example #5
File: tasks.py  Project: BCCVL/org.bccvl.tasks
def import_multi_species_csv(url, results_dir, import_context, context):
    # url .... source file
    # results_dir ... folder to place split files into
    # import_context ... context passed to the result import job
    # context ... the context with user and orig dataset
    tmpdir = None  # ensure the finally block can always reference tmpdir
    try:
        set_progress('RUNNING', 'Split {0}'.format(url), None, context)
        # step 1: update main dataset metadata
        tmpdir = tempfile.mkdtemp()
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpdir), settings)
        movelib.move(src, dst)

        # Get the downloaded filename
        tmpfile = glob.glob(os.path.join(tmpdir, '*'))[0]

        # Extract occurrence file from downloaded file
        mimetype, enc = mimetypes.guess_type(tmpfile)
        if mimetype == 'application/zip':
            src_occ_data = os.path.join('data', 'ala_occurrence.csv')
            with zipfile.ZipFile(tmpfile, 'r') as zipf:
                occfile = os.path.join(tmpdir, src_occ_data)
                zipf.extract(src_occ_data, tmpdir)
            item = {
                'filemetadata': extract_metadata(tmpfile, 'application/zip')
            }
            occmd = item['filemetadata'].get(src_occ_data, {}).get('metadata', {})
        else:
            # csv file
            item = {
                'filemetadata': extract_metadata(tmpfile, "text/csv")
            }
            occfile = tmpfile
            occmd = item['filemetadata']

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if ('headers' not in occmd
                or 'lat' not in occmd['headers']
                or 'lon' not in occmd['headers']):
            raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url),
                     None, context)

        import_md_job = import_file_metadata_job([item], url, context)
        import_md_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))

        # step 2: split csv file and create sub datasets
        # start reading csv file and create new datasets which will be
        #       linked up with dataset collection item
        # FIXME: large csv files should be streamed to separate files (not read
        #        into ram like here)
        f = io.open(occfile, 'r', encoding='utf-8', errors='ignore')
        csvreader = UnicodeCSVReader(f)
        headers = csvreader.next()
        if 'species' not in headers:
            raise Exception('missing species column')
        speciesidx = headers.index('species')
        # create dict with all data .... species column used as key, and rest
        # is just added
        data = {}
        for row in csvreader:
            if not row:
                continue
            species = row[speciesidx]
            if species not in data:
                # create new entry for species
                fname = u'{0}.csv'.format(species).replace(
                    u'/', u'_').encode('idna')
                # TODO: make sure fname contains only legal filename characters
                fpath = os.path.join(tmpdir, fname)
                file = io.open(fpath, 'wb')
                fwriter = UnicodeCSVWriter(file)
                fwriter.writerow(headers)
                data[species] = {
                    'file': file,
                    'writer': fwriter,
                    'path': fpath,
                    'name': fname
                }
            data[species]['writer'].writerow(row)
        # ok we have got all data and everything in separate files
        # close all files
        for species in data:
            data[species]['file'].close()
            del data[species]['file']
            del data[species]['writer']
        # extract metadata
        for species in data:
            data[species]['filemetadata'] = extract_metadata(
                data[species]['path'],
                'text/csv'
            )
        # send files to destination
        for species in data:
            src = build_source('file://{}'.format(data[species]['path']))
            dst = build_destination(os.path.join(results_dir,
                                                 data[species]['name']),
                                    app.conf.get('bccvl', {}))
            data[species]['url'] = dst['url']
            movelib.move(src, dst)
        # all files uploaded .... send import jobs
        set_progress('RUNNING', 'Create datasets for {0}'.format(
            url), None, context)
        items = []
        for species in data:
            # build item
            item = {
                'title': u'{0} occurrences'.format(species),
                'description': '',
                'file': {
                    'url': data[species]['url'],
                    'filename': data[species]['name'],
                    'contenttype': 'text/csv',
                },
                'bccvlmetadata': {
                    'genre': 'DataGenreSpeciesOccurrence',
                    'categories': ['occurrence'],
                    'species': {
                        'scientificName': species,
                    }
                },
                'filemetadata': data[species]['filemetadata'],
                '_partof': {
                    # add back reference to orig dataset
                    # TODO: shouldn't use absolute path here
                    'path': context['context']
                }
            }
            items.append(item)
        # start import process
        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)
        # What is results_dir being used for?
        import_job = import_result_job(items, results_dir, import_context)
        cleanup_job = import_cleanup_job(results_dir, context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Multi species import failed', None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            'COMPLETED', 'Task succeeded', None, context)
        (start_import | import_md_job | import_job |
         cleanup_job | finish_job).delay()
        # FIXME: missing stuff...
        #        need to set multi species collection to finished at some stage
    except Exception as e:
        set_progress('FAILED',
                     'Error while splitting Multi Species CSV {}: {}'.format(
                         url, e),
                     None, context)
        LOG.error('Multi species split for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
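
Finally, a minimal sketch of how the multi-species split might be triggered; the URLs and context keys below are placeholders:

# Placeholder arguments; in the project this runs after a multi-species CSV upload.
context = {
    'user': {'id': 'demo-user'},
    'context': '/datasets/multispecies-upload',   # original dataset the split items refer back to
}
import_context = dict(context)   # assumed: same shape, used for the result import job
import_multi_species_csv('https://example.org/files/multi_species.csv',
                         'file:///tmp/split_results', import_context, context)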