Example #1
import logging
import os
import urllib
import zipfile

LOG = logging.getLogger(__name__)

# _get_species_guid_from_csv and zip_occurrence_data are helpers defined
# elsewhere in this module.

def _download_occurrence(occurrence_url, dest):
    """
    Downloads Species Occurrence data from ALA (Atlas of Living Australia)
    @param download_url: the url to download species occurrence data
    @type download_url: str
    @param dest: the destination directory that the ALA files are going to end up inside of on the remote machine. Used to form the metadata .json file.
    @type dest: str
    @return True if the dataset was obtained. False otherwise
    """
    # TODO: validate dest is a dir?

    # Get occurrence data
    temp_file = None
    lsid_list = []
    try:
        temp_file, _ = urllib.urlretrieve(occurrence_url)
        # extract data.csv file into dest
        with zipfile.ZipFile(temp_file) as z:
            data_dest = os.path.join(dest, 'data')
            os.mkdir(data_dest)

            # rename to ala_occurrence.csv
            z.extract('data.csv', dest)
            os.rename(os.path.join(dest, 'data.csv'),
                      os.path.join(data_dest, 'ala_occurrence.csv'))

            # citation file is optional
            try:
                z.extract('citation.csv', dest)
                os.rename(os.path.join(dest, 'citation.csv'),
                          os.path.join(data_dest, 'ala_citation.csv'))
            except KeyError:
                # citation.csv is not in the archive; continue without it
                pass
        lsid_list = _get_species_guid_from_csv(os.path.join(data_dest, 'ala_occurrence.csv'))

        # Zip up the extracted files (the citation file may be absent)
        zip_occurrence_data(os.path.join(dest, 'ala_occurrence.zip'),
                            os.path.join(dest, 'data'),
                            ['ala_occurrence.csv', 'ala_citation.csv'])

    except KeyError:
        LOG.error("Cannot find file %s in downloaded zip file", 'data.csv', exc_info=True)
        raise
    except Exception:
        # TODO: does this have to raise?
        LOG.error("The file downloaded from %s is not a valid zip file",
                  occurrence_url, exc_info=True)
        raise
    finally:
        if temp_file:
            os.remove(temp_file)

    return {'url': os.path.join(dest, 'ala_occurrence.zip'),
            'name': 'ala_occurrence.zip',
            'content_type': 'application/zip',
            'lsids': lsid_list}
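
A hedged usage sketch for Example #1, assuming the function above is in
scope. The URL below is illustrative only; real ALA download URLs carry
query, filter and email parameters:

import tempfile

dest = tempfile.mkdtemp(prefix='ala_example_')
# illustrative values only
occurrence_url = ('https://biocache.ala.org.au/ws'
                  '/occurrences/index/download?q=lsid:<some-lsid>')
result = _download_occurrence(occurrence_url, dest)
print(result['url'])
print(result['lsids'])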
Example #2
import datetime
import json
import os

# _normalize_occurrence and zip_occurrence_data are helpers defined
# elsewhere in this module.

def _ala_postprocess(csvzipfile, mdfile, occurrence_url, dest):
    """
    Clean up the occurrence csv file and generate dataset metadata.

    An occurrence dataset may contain multiple species, e.g. user-uploaded data.
    """
    taxon_names = {}
    common_names = []

    if mdfile:
        # Generate dataset .json
        # 1. read mdfile and find interesting bits:
        with open(mdfile) as f:
            sp_metadata = json.load(f)

        for md in sp_metadata:
            # TODO: is this the correct bit? (see plone dataset import )
            guid = md.get('guid')
            if guid:
                taxon_names[guid] = md.get('scientificName') or \
                    md.get('name') or \
                    md.get('nameComplete')
                common_names.append(md.get('commonNameSingle') or md.get('scientificName'))

    # 2. clean up occurrence csv file and count occurrence points
    csvfile = os.path.join(dest, 'data/ala_occurrence.csv')
    num_occurrences = _normalize_occurrence(csvfile, taxon_names)

    # Rebuild the zip archive file with updated occurrence csv file.
    os.remove(csvzipfile)
    zip_occurrence_data(csvzipfile,
                        os.path.join(os.path.dirname(csvzipfile), 'data'),
                        ['ala_occurrence.csv', 'ala_citation.csv'])

    # 3. generate ala_dataset.json
    imported_date = datetime.datetime.now().strftime('%d/%m/%Y')
    common = u', '.join(common_names)
    taxon = u', '.join(taxon_names.values())
    if common_names:
        title = u"%s (%s) occurrences" % (common, taxon)
        description = u"Observed occurrences for %s (%s), imported from ALA on %s" % (common, taxon, imported_date)
    elif taxon:
        title = u"%s occurrences" % (taxon)
        description = u"Observed occurrences for %s, imported from ALA on %s" % (taxon, imported_date)
    else:
        # The user dataset does not match any species in ALA
        # TODO: use the user-supplied name
        title = u"Occurrences for user-defined dataset"
        description = u"User-defined occurrence dataset, imported on %s" % (imported_date)

    files = [{
        'url': csvzipfile,
        'dataset_type': 'occurrence',
        'size': os.path.getsize(csvzipfile)
    }]
    if mdfile:
        files.append({
            'url': mdfile,
            'dataset_type': 'attribution',
            'size': os.path.getsize(mdfile)
        })

    ala_dataset = {
        'title': title,
        'description': description,
        'num_occurrences': num_occurrences,
        'files': files,
        'provenance': {
            'source': 'ALA',
            'url': occurrence_url,
            'source_date': imported_date
        }
    }

    # Write the dataset to a file
    dataset_path = os.path.join(dest, 'ala_dataset.json')
    with open(dataset_path, mode='w') as f:
        json.dump(ala_dataset, f, indent=2)
    dsfile = {'url': dataset_path,
              'name': 'ala_dataset.json',
              'content_type': 'application/json'}
    return dsfile
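
Examples #1 and #2 both call a zip_occurrence_data helper that is not shown
here. A minimal sketch consistent with the call sites follows; skipping
missing files is implied by Example #1 (the citation file is optional),
while the 'data/' archive prefix is an assumption:

import os
import zipfile

def zip_occurrence_data(zip_path, data_dir, filenames):
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for name in filenames:
            path = os.path.join(data_dir, name)
            # the citation file may be absent; skip missing files silently
            if os.path.isfile(path):
                # assumption: keep a 'data/' folder structure inside the archive
                zf.write(path, os.path.join('data', name))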
Example #3
import datetime
import json
import os
import tempfile

# build_source, build_destination, combine_csv, extract_metadata, movelib,
# set_progress and zip_occurrence_data are provided by the surrounding package.

def download_occurrence_from_ala(params, context):
    """
    Download one or more occurrence datasets from ALA and combine them into
    a single dataset ready for import.

    @return a tuple (item, results): item describes the combined dataset,
            results lists the temporary directories created
    """
    results = []
    species = []   # a list of species metadata
    ds_names = []

    for dataset in params:
        occurrence_url = dataset['url'].rstrip('/') + "/occurrences/index/download"
        query = dataset['query']    # i.e. qid:<qid> or lsid:<lsid>
        qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
        email = context.get('user', {}).get('email', '')
        ds_names.append(dataset.get('name', ''))

        # download occurrence file
        # TODO: ignore file if it fails to download (exception), but continue??
        tmpdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(tmpdir)

        src = build_source('ala://ala?url={}&query={}&filter={}&email={}'.format(occurrence_url, query, qfilter, email))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)

        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata for {0} from ala'.format(dataset['query']), None, context)
        # open ala_dataset.json
        with open(os.path.join(tmpdir, 'ala_dataset.json')) as f:
            ala_ds = json.load(f)
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in ala_ds['files']))

        # occurrence data file
        ala_csv = files['occurrence']['url']  # this is actually a zip file now

        # read ala metadata from attribution file.
        # May not have metadata for user uploaded dataset into sandbox
        if files.get('attribution'):
            with open(files['attribution']['url']) as f:
                ala_md_list = json.load(f)
            for md in ala_md_list:
                species.append({
                    'scientificName': md.get('scientificName'),
                    'vernacularName': md.get('commonNameSingle') or md.get('scientificName'),
                    'taxonID': md.get('guid'),
                    'rank': md.get('rank'),
                    'genus': md.get('genus'),
                    'family': md.get('family'),
                    'order': md.get('order'),
                    'clazz': md.get('classs'),  # 'classs' (sic) is the key used by ALA
                    'phylum': md.get('phylum'),
                    'kingdom': md.get('kingdom')
                })

    # This should never happen
    if len(results) == 0:
        raise Exception("No occurrence dataset was downloaded from ALA")

    # Combine the occurrence and citation files from each download into one dataset
    imported_date = datetime.datetime.now().strftime('%d/%m/%Y')
    if len(results) > 1:
        destdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(destdir)
        os.mkdir(os.path.join(destdir, 'data'))
        combine_csv(results[:-1], 'data/ala_occurrence.csv', destdir)
        combine_csv(results[:-1], 'data/ala_citation.csv', destdir)

        # Zip it up and point to the new zip file
        ala_csv = os.path.join(destdir, 'ala_occurrence.zip')
        zip_occurrence_data(ala_csv,
                            os.path.join(destdir, 'data'),
                            ['ala_occurrence.csv', 'ala_citation.csv'])

        # Make a title & description for multispecies dataset
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
        else:
            ds_name = ','.join([sp['scientificName'] for sp in species])
            title = "{} occurrences".format(ds_name)
        description = "Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)

    else:
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
            description = "Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)
        else:
            title = ala_ds['title']
            description = ala_ds['description']
        # A single dataset may have no species metadata (e.g. user-uploaded data)
        species = species[0] if species else None

    # build bccvl metadata:
    bccvlmd = {
        'genre': 'DataGenreSpeciesOccurrence',
        'categories': ['occurrence'],
        'species': species
    }

    # build item to import
    item = {
        'title': title,
        'description': description,
        'file': {
            'url': 'file://{}'.format(ala_csv),  # local file url
            'contenttype': 'application/zip',
            'filename': os.path.basename(ala_csv)
        },
        'bccvlmetadata': bccvlmd,
        'filemetadata': extract_metadata(ala_csv, 'application/zip'),
    }
    return (item, results)
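
A hedged usage sketch for Example #3. The base URL, query and email values
are placeholders, and the caller is assumed to be responsible for removing
the temporary directories returned in results:

import shutil

# illustrative values only
params = [{
    'url': 'https://biocache.ala.org.au/ws',
    'query': 'lsid:<some-lsid>',
    'name': 'Example species occurrences',
}]
context = {'user': {'email': 'user@example.com'}}

item, tmpdirs = download_occurrence_from_ala(params, context)
print(item['title'])
print(item['file']['filename'])

# clean up the temp directories created by the download
for d in tmpdirs:
    shutil.rmtree(d, ignore_errors=True)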