Example #1
    def test_gbif_to_file(self, mock_urlretrieve=None, mock_urlopen=None):
        mock_urlretrieve.side_effect = self._urlretrieve
        mock_urlopen.side_effect = self._urlopen
        # mock urllib.urlretrieve ....
        #        return zip file with data.csv and citation.csv
        # mock urllib.urlretrieve ...
        #        return gbif_metadata.json

        file_dest = {
            'url': 'file://{}'.format(self.tmpdir)
        }
        move(self.gbif_source, file_dest)

        # Check files are created
        self.assertTrue(os.path.exists(
            os.path.join(self.tmpdir, 'gbif_dataset.json')))
        self.assertTrue(os.path.exists(
            os.path.join(self.tmpdir, 'gbif_occurrence.zip')))
        self.assertTrue(os.path.exists(
            os.path.join(self.tmpdir, 'gbif_metadata.json')))

        # Check file contents
        zf = zipfile.ZipFile(os.path.join(self.tmpdir, 'gbif_occurrence.zip'))
        zf.extractall(self.tmpdir)
        self.assertTrue(filecmp.cmp(os.path.join(self.tmpdir, 'gbif_metadata.json'),
                                    pkg_resources.resource_filename(__name__, 'data/gbif_metadata.json')))
        self.assertTrue(filecmp.cmp(os.path.join(self.tmpdir, 'data', 'gbif_occurrence.csv'),
                                    pkg_resources.resource_filename(__name__, 'data/gbif_occurrence.csv')))
        self.assertTrue(filecmp.cmp(os.path.join(self.tmpdir, 'data', 'gbif_citation.txt'),
                                    pkg_resources.resource_filename(__name__, 'data/gbif_citation.txt')))
Example #2
    def test_aekos_occurrence_to_file(self, mock_download_as_file=None):
        mock_download_as_file.side_effect = self._download_as_file

        file_dest = {
            'url': 'file://{0}'.format(self.tmpdir)
        }
        move(self.occurrence_source, file_dest)

        # Check these files are created
        self.assertTrue(os.path.exists(
            os.path.join(self.tmpdir, 'aekos_metadata.json')))
        self.assertTrue(os.path.exists(
            os.path.join(self.tmpdir, 'aekos_dataset.json')))
        self.assertTrue(os.path.exists(os.path.join(
            self.tmpdir, 'aekos_occurrence.zip')))
        self.assertTrue(os.path.exists(os.path.join(
            self.tmpdir, 'data', 'aekos_occurrence.csv')))
        self.assertTrue(os.path.exists(os.path.join(
            self.tmpdir, 'data', 'aekos_citation.txt')))

        # Check file content
        self.assertTrue(filecmp.cmp(os.path.join(self.tmpdir, 'aekos_metadata.json'),
                                    resource_filename(__name__, 'data/aekos_metadata.json')))
        self.assertTrue(filecmp.cmp(os.path.join(self.tmpdir, 'data', 'aekos_occurrence.csv'),
                                    resource_filename(__name__, 'data/aekos_occurrence.csv')))
        self.assertTrue(filecmp.cmp(os.path.join(self.tmpdir, 'data', 'aekos_citation.txt'),
                                    resource_filename(__name__, 'data/aekos_citation.txt')))
Example #3
    def test_http_to_file(self, mock_SessionClass=None):
        mock_session = mock_SessionClass.return_value  # get mock session
        mock_response = mock_session.get.return_value
        mock_response.iter_content.return_value = ['test content']
        mock_headers = mock_response.headers
        mock_headers.get.return_value = 'text/csv'

        ticket = AuthTkt('ibycgtpw', 'admin')
        cookies = {
            'name': '__ac',
            'value': ticket.ticket(),
            'domain': '',
            'path': '/',
            'secure': True
        }
        http_source = {
            'url': 'http://www.bccvl.org.au/datasets/test.csv',
            'cookies': cookies
        }
        dest_file = os.path.join(self.tmpdir, 'test.csv')
        file_dest = {
            'url': 'file://{}'.format(dest_file)
        }

        move(http_source, file_dest)
        
        # verify destination file
        self.assertTrue(os.path.exists(dest_file))
        self.assertEqual(open(dest_file).read(), 'test content')
Example #4
    def test_aekos_traits_to_file_multispecies(self, mock_download_as_file=None):
        mock_download_as_file.side_effect = self._download_multispecies

        traits_source = {
            'url': 'aekos://traits?speciesName=Abutilon%20fraseri,Abutilon%20halophilum&traitName=height%2ClifeForm&envVarName=aspect%2CelectricalConductivity'
        }

        file_dest = {
            'url': 'file://{}'.format(self.tmpdir)
        }
        move(traits_source, file_dest)

        # Check these files are created
        self.assertTrue(os.path.exists(
            os.path.join(self.tmpdir, 'aekos_dataset.json')))
        self.assertTrue(os.path.exists(os.path.join(
            self.tmpdir, 'aekos_traits_env.zip')))
        self.assertTrue(os.path.exists(os.path.join(
            self.tmpdir, 'data', 'aekos_traits_env.csv')))
        self.assertTrue(os.path.exists(os.path.join(
            self.tmpdir, 'data', 'aekos_citation.csv')))

        # Check file content
        self.assertTrue(filecmp.cmp(os.path.join(self.tmpdir, 'data', 'aekos_traits_env.csv'),
                                    resource_filename(__name__, 'data/aekos_traits_env_multispecies.csv')))
        self.assertTrue(filecmp.cmp(os.path.join(self.tmpdir, 'data', 'aekos_citation.csv'),
                                    resource_filename(__name__, 'data/aekos_citation_multispecies.csv')))
Example #5
def pull_occurrences_from_ala(params, dest_url, context, import_multspecies_params):
    # 1. set progress
    set_progress('RUNNING', 'Download occurrence dataset from ala', None, context)
    # 2. Download all the occurrence dataset in the params list
    results = []

    try:
        item, results = download_occurrence_from_ala(params, context)

        # This is the zip file path of the occurrence dataset
        ala_csv = item.get('file').get('url').split('file://')[1]

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'ala_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            # compatibility... this should go away after we fully support 'layered'
            # occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species' ?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(ala_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(ala_csv)), app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)

        # tell importer about new dataset (import it)
        set_progress("RUNNING", u"Import dataset '{0}' from ALA".format(item['title']), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job(
            "FAILED", u"Import of dataset '{0}' from ALA failed".format(item['title']), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job("COMPLETED", u"ALA import '{}' complete".format(item['title']), None, context)

        # Split multi-species dataset
        if import_multspecies_params:
            import_multispecies_job = import_multi_species_csv_job(item.get('file').get('url'),
                                                                   import_multspecies_params['results_dir'],
                                                                   import_multspecies_params['import_context'],
                                                                   context)
            import_multispecies_job.link_error(set_progress_job(
                "FAILED", u"Split multi-species dataset '{0}' from ALA failed".format(item['title']), None, context))
            import_multispecies_job.link_error(cleanup_job)
            (import_job | import_multispecies_job | cleanup_job | finish_job).delay()
        else:
            (import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED', 'Download occurrence dataset from ALA: {}'.format(e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', params, dest_url, e, exc_info=True)
    finally:
        for tmpdir in results:
            if tmpdir and os.path.exists(tmpdir):
                shutil.rmtree(tmpdir)
Example #6
    def test_file_to_file(self, mock_copy=None):
        file_dest = {
            'url': 'file://{}'.format(self.tmpdir)
        }
        move(self.file_source, file_dest)

        # verify destination file
        dest_file = os.path.join(self.tmpdir, 'test.csv')
        self.assertTrue(os.path.exists(dest_file))
        self.assertEqual(open(dest_file).read(), pkg_resources.resource_string(__name__, 'data/test.csv'))
Example #7
def get_files(urllist, userid, conf):
    """
    Download all files given in urllist into a local temp folder and
    return the temp folder location.
    """
    dest = tempfile.mkdtemp(prefix='bccvl_export')
    for url in urllist:
        src = build_source(url, userid, conf)
        dst = build_destination('file://{0}/{1}'.format(dest, os.path.basename(url)), conf)
        movelib.move(src, dst)
    return dest
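
Example #7 composes build_source, build_destination and movelib.move into a small download helper. Below is a minimal, hedged usage sketch: the URL, user id and empty conf dict are placeholders rather than values from the example, and the caller is expected to remove the returned temp folder, much as the other examples do in their finally blocks.

import shutil

# Hypothetical call; URL, user id and conf are illustrative only.
urllist = ['https://example.com/datasets/occurrence.zip']
tmp_folder = get_files(urllist, 'some-user', conf={})
try:
    pass  # work with the downloaded files in tmp_folder
finally:
    shutil.rmtree(tmp_folder)  # get_files does not clean up after itself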
Example #8
def download_input(move_args):
    src, dst = move_args['args']
    try:
        # set up the source and destination
        source = build_source(
            src, move_args['userid'], app.conf.get('bccvl', {}))
        destination = build_destination(dst)
        move(source, destination)
    except Exception as e:
        LOG.info('Download from %s to %s failed: %s', src, dst, e)
        raise
    LOG.info('Download from %s to %s succeeded.', src, dst)
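
download_input unpacks its only argument at the top of the function, so the expected shape of move_args can be read off directly. A hedged sketch of a call follows; the URLs and user id are placeholders, not values taken from the example.

# Hypothetical argument; 'args' is a (src, dst) pair and 'userid' feeds build_source.
move_args = {
    'args': (
        'https://example.com/datasets/input.csv',  # src
        'file:///tmp/work/input.csv',              # dst
    ),
    'userid': 'some-user',
}
download_input(move_args)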
Example #9
def move(move_args, context):
    errmsgs = []
    for src, dest in move_args:
        try:
            source = build_source(src, context['user']['id'],
                                  app.conf.get('bccvl', {}))
            destination = build_destination(dest, app.conf.get('bccvl', {}))
            movelib.move(source, destination)
        except Exception as e:
            msg = 'Download from %s to %s failed: %s' % (src, dest, str(e))
            errmsgs.append(msg)
            LOG.warn(msg)
    if errmsgs:
        raise Exception('Move data failed', errmsgs)
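
Unlike download_input in Example #8, this task takes a list of (src, dest) pairs plus a task context and collects failures instead of stopping at the first one. A short, hedged sketch of the expected shapes; all values are placeholders.

# Hypothetical shapes, inferred from the loop and the context['user']['id'] lookup above.
move_args = [
    ('https://example.com/data/a.csv', 'file:///tmp/job/a.csv'),
    ('https://example.com/data/b.csv', 'file:///tmp/job/b.csv'),
]
context = {'user': {'id': 'some-user'}}
move(move_args, context)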
Example #10
def upload_outputs(args):
    src, dest, item = args

    try:
        # set up the source and destination (src is a local file)
        source = build_source(src)
        # TODO: add content_type to destination? (move_lib supports it)
        destination = build_destination(dest, app.conf.get('bccvl', {}))

        # Upload the file and then generate metadata
        move(source, destination)
        LOG.info('Upload from %s to %s succeeded.', src, dest)
        item['file']['failed'] = False
    except Exception:
        LOG.info('Upload from %s to %s failed', src, dest)
        item['file']['failed'] = True
Example #11
    def test_swift_to_swift(self, mock_SwiftService=None):
        mock_swiftservice = mock_SwiftService.return_value
        mock_swiftservice.upload.return_value = [{'success': True}]  # simulate successful upload
        mock_swiftservice.download.side_effect = self._swift_download

        move(self.swift_source, self.swift_dest)

        mock_SwiftService.assert_has_calls([
            # init SwiftService
            mock.call(mock.ANY),
            mock.call().download('container2', ['test/test2.txt'], {'out_file': mock.ANY}),
            # init SwiftService
            mock.call(mock.ANY),
            # TODO: mock.ANY here is a SwiftUploadObject, can we verify that in more detail? like object name etc...
            mock.call().upload('container2', [mock.ANY]),
        ])
Example #12
    def test_swift_to_file(self, mock_SwiftService=None):
        mock_swiftservice = mock_SwiftService.return_value
        mock_swiftservice.download.side_effect = self._swift_download

        file_dest = {
            'url': 'file://{}'.format(self.tmpdir)
        }
        move(self.swift_source, file_dest)

        dest_file = os.path.join(self.tmpdir, 'test2.txt')
        mock_SwiftService.assert_has_calls([
            # init SwiftService
            mock.call(mock.ANY),
            mock.call().download('container2', ['test/test2.txt'], {'out_file': dest_file}),
        ])
        # assert dest file?
        self.assertTrue(os.path.exists(dest_file))
        self.assertEqual(open(dest_file).read(), 'test content')
Example #13
def update_metadata(url, filename, contenttype, context):
    tmpdir = None
    try:
        set_progress('RUNNING', 'Download {0}'.format(url), None, context)
        tmpdir = tempfile.mkdtemp()
        tmpfile = '{}/{}'.format(tmpdir, filename)
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpfile), settings)
        movelib.move(src, dst)
        item = {
            'filemetadata': extract_metadata(tmpfile, contenttype)
        }

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if contenttype == 'text/csv':
            if ('headers' not in item['filemetadata']
                    or 'lat' not in item['filemetadata']['headers']
                    or 'lon' not in item['filemetadata']['headers']):
                raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url),
                     None, context)

        import_job = import_file_metadata_job([item], url, context)
        import_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))
        finish_job = set_progress_job(
            "COMPLETED", 'Metadata update for {} complete'.format(url),
            None, context)
        (import_job | finish_job).delay()
    except Exception as e:
        set_progress('FAILED',
                     'Metadata update for {} failed: {}'.format(url, e),
                     None, context)
        LOG.error('Metadata update for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
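
The CSV validation in update_metadata only relies on extract_metadata returning a dict with a 'headers' list containing 'lat' and 'lon'. A minimal sketch of the shape that check assumes; the column names besides 'lat'/'lon' and the 'rows' field are illustrative (though 'rows' also appears as a key in the occurrence examples above).

# Assumed shape of item['filemetadata'] for a text/csv upload.
filemetadata = {
    'headers': ['species', 'lon', 'lat'],  # must include 'lat' and 'lon'
    'rows': 123,                           # illustrative
}
assert 'lat' in filemetadata['headers'] and 'lon' in filemetadata['headers']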
Example #14
    def test_ala_qid_to_file(self, mock_urlretrieve=None):
        mock_urlretrieve.side_effect = self._urlretrieve
        # mock urllib.urlretrieve ....
        #        return zip file with data.csv and citation.csv
        # mock urllib.urlretrieve ...
        #        return ala_metadata.json
        occurrence_url = "http://biocache.ala.org.au/ws/occurrences/index/download"
        query = "qid:urn:lsid:biodiversity.org.au:afd.taxon:31a9b8b8-4e8f-4343-a15f-2ed24e0bf1ae"
        qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
        email = "*****@*****.**"
        src_url = 'ala://ala?url={}&query={}&filter={}&email={}'.format(occurrence_url, query, qfilter, email)

        file_dest = {
            'url': 'file://{}'.format(self.tmpdir)
        }
        move({'url': src_url}, file_dest)

        # verify ala calls?
        self.assertTrue(os.path.exists(os.path.join(self.tmpdir, 'ala_dataset.json')))
        self.assertTrue(os.path.exists(os.path.join(self.tmpdir, 'ala_occurrence.zip')))
Example #15
    def test_ala_utf8_move(self, mock_occur, mock_md):
        def fetch_occur_data(download_url, dest):
            occur_file = os.path.join(dest, 'ala_occurrence.zip')
            shutil.copyfile(resource_filename(__name__, 'data.zip'),
                            occur_file)
            # FIXME: ala.py exploits a side effect: the zip is created in _download_metadata_for_lsid, but other methods in the module rely on the unpacked zip being available
            # FIXME: ala.py also re-zips inside _ala_postprocess
            with zipfile.ZipFile(occur_file) as z:
                z.extractall(dest)
            return { 'url' : occur_file,
                     'name': 'ala_occurrence.zip',
                     'content_type': 'application/zip'}

        def fetch_meta_data(lsid_list, dest):
            metadata_file = os.path.join(dest, 'ala_metadata.json')
            shutil.copyfile(resource_filename(__name__, 'data.json'),
                            metadata_file)
            return { 'url' : metadata_file,
                     'name': 'ala_metadata.json',
                     'content_type': 'application/json'}

        mock_occur.side_effect = fetch_occur_data
        mock_md.side_effect = fetch_meta_data

        tmpdir = tempfile.mkdtemp()
        try:
            occurrence_url = "http://biocache.ala.org.au/ws/occurrences/index/download"
            query = "lsid:urn:lsid:biodiversity.org.au:apni.taxon:262359"
            qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
            email = "*****@*****.**"
            src_url = 'ala://ala?url={}&query={}&filter={}&email={}'.format(occurrence_url, query, qfilter, email)
            movelib.move({'url': src_url},
                         {'url': 'file://{}'.format(tmpdir)})
            self.assertEqual(mock_occur.call_count, 1)
            self.assertEqual(mock_md.call_count, 1)
            dl_list = os.listdir(tmpdir)
            # FIXME: data should not be there
            self.assertEqual(set(dl_list),
                             set(['ala_occurrence.zip', 'ala_dataset.json', 'ala_metadata.json', 'data']))
        finally:
            shutil.rmtree(tmpdir)
Example #16
    def worker(self, move_job):
        """
        Thread worker used to perform a move of data between endpoints.
        @param move_job: The move job to execute
        @type move_job: MoveJob
        """
        try:
            # Need to handle a list of sources
            self._logger.info("Starting move for job with id %s", move_job.id)
            move_job.update(status=MoveJob.STATUS_IN_PROGRESS, start_timestamp=datetime.datetime.now())

            # source can be just 1 source or a list of sources
            if isinstance(move_job.source, str):
                sourcelist = [move_job.source]
            elif isinstance(move_job.source, list):
                sourcelist = move_job.source
            else:
                raise Exception("Invalid source {1}".format(move_job.source))

            # Validate the destination url
            dest_url = urlparse(move_job.destination)
            if dest_url.scheme in ("swift+http", "swift+https") and not self._has_credential():
                raise Exception("Credential for Nectar swift service is not configured.")

            # Download all the files from the sources to the destination
            destination = build_destination(move_job.destination, self._config)

            for s in sourcelist:
                source = build_source(s, move_job.userid, self._config)
                movelib.move(source, destination)
            move_job.update(status=MoveJob.STATUS_COMPLETE, end_timestamp=datetime.datetime.now())
        except Exception as e:
            # catch any Exception here so that we can properly update the job state
            reason = "Move has failed for job with id {0}. Reason: {1}".format(move_job.id, str(e))
            self._logger.warning(reason)
            move_job.update(status=MoveJob.STATUS_FAILED, end_timestamp=datetime.datetime.now(), reason=reason)
Example #17
def import_multi_species_csv(url, results_dir, import_context, context):
    # url .... source file
    # results_dir ... folder to place split files into
    # context ... the context with user and orig dataset
    tmpdir = None
    try:
        set_progress('RUNNING', 'Split {0}'.format(url), None, context)
        # step 1: update main dataset metadata
        tmpdir = tempfile.mkdtemp()
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpdir), settings)
        movelib.move(src, dst)

        # Get the downloaded filename
        tmpfile = glob.glob(os.path.join(tmpdir, '*'))[0]

        # Extract occurrence file from downloaded file
        mimetype, enc = mimetypes.guess_type(tmpfile)
        if mimetype == 'application/zip':
            src_occ_data = os.path.join('data', 'ala_occurrence.csv')
            with zipfile.ZipFile(tmpfile, 'r') as zipf:
                occfile = os.path.join(tmpdir, src_occ_data)
                zipf.extract(src_occ_data, tmpdir)
            item = {
                'filemetadata': extract_metadata(tmpfile, 'application/zip')
            }
            occmd = item['filemetadata'].get(src_occ_data, {}).get('metadata', {})
        else:
            # csv file
            item = {
                'filemetadata': extract_metadata(tmpfile, "text/csv")
            }
            occfile = tmpfile
            occmd = item['filemetadata']

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if ('headers' not in occmd
                or 'lat' not in occmd['headers']
                or 'lon' not in occmd['headers']):
            raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url),
                     None, context)

        import_md_job = import_file_metadata_job([item], url, context)
        import_md_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))

        # step 2: split csv file and create sub datasets
        # start reading csv file and create new datasets which will be
        #       linked up with dataset collection item
        # FIXME: large csv files should be streamed to separate files (not read
        #        into RAM like here)
        f = io.open(occfile, 'r', encoding='utf-8', errors='ignore')
        csvreader = UnicodeCSVReader(f)
        headers = csvreader.next()
        if 'species' not in headers:
            raise Exception('missing species column')
        speciesidx = headers.index('species')
        # create dict with all data .... species column used as key, and rest
        # is just added
        data = {}
        for row in csvreader:
            if not row:
                continue
            species = row[speciesidx]
            if species not in data:
                # create new entry for species
                fname = u'{0}.csv'.format(species).replace(
                    u'/', u'_').encode('idna')
                # TODO: make sure fname contains only legal filename characters
                fpath = os.path.join(tmpdir, fname)
                file = io.open(fpath, 'wb')
                fwriter = UnicodeCSVWriter(file)
                fwriter.writerow(headers)
                data[species] = {
                    'file': file,
                    'writer': fwriter,
                    'path': fpath,
                    'name': fname
                }
            data[species]['writer'].writerow(row)
        # ok we have got all data and everything in separate files
        # close all files
        for species in data:
            data[species]['file'].close()
            del data[species]['file']
            del data[species]['writer']
        # extract metadata
        for species in data:
            data[species]['filemetadata'] = extract_metadata(
                data[species]['path'],
                'text/csv'
            )
        # send files to destination
        for species in data:
            src = build_source('file://{}'.format(data[species]['path']))
            dst = build_destination(os.path.join(results_dir,
                                                 data[species]['name']),
                                    app.conf.get('bccvl', {}))
            data[species]['url'] = dst['url']
            movelib.move(src, dst)
        # all files uploaded .... send import jobs
        set_progress('RUNNING', 'Create datasets for {0}'.format(
            url), None, context)
        items = []
        for species in data:
            # build item
            item = {
                'title': u'{0} occurrences'.format(species),
                'description': '',
                'file': {
                    'url': data[species]['url'],
                    'filename': data[species]['name'],
                    'contenttype': 'text/csv',
                },
                'bccvlmetadata': {
                    'genre': 'DataGenreSpeciesOccurrence',
                    'categories': ['occurrence'],
                    'species': {
                        'scientificName': species,
                    }
                },
                'filemetadata': data[species]['filemetadata'],
                '_partof': {
                    # add back reference to orig dataset
                    # TODO: shouldn't use absolute path here
                    'path': context['context']
                }
            }
            items.append(item)
        # start import process
        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)
        # What is results_dir being used for?
        import_job = import_result_job(items, results_dir, import_context)
        cleanup_job = import_cleanup_job(results_dir, context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Multi species import failed', None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            'COMPLETED', 'Task succeeded', None, context)
        (start_import | import_md_job | import_job |
         cleanup_job | finish_job).delay()
        # FIXME: missing stuff...
        #        need to set multi species collection to finished at some stage
    except Exception as e:
        set_progress('FAILED',
                     'Error while splitting Multi Species CSV {}: {}'.format(
                         url, e),
                     None, context)
        LOG.error('Multi species split for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
Example #18
def pull_occurrences_from_gbif(lsid, dest_url, context):
    # 1. set progress
    set_progress('RUNNING', 'Download {0} from gbif'.format(lsid), None, context)
    # 2. do move
    src = None
    dst = None
    tmpdir = None
    try:
        tmpdir = tempfile.mkdtemp(prefix='gbif_download_')
        src = build_source('gbif://gbif?lsid={}'.format(lsid))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)
        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata {0} from gbif'.format(lsid), None, context)
        # open gbif_dataset.json
        gbif_ds = json.load(open(os.path.join(tmpdir, 'gbif_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in gbif_ds['files']))
        # read gbif metadata from attribution file
        gbif_md = json.load(open(files['attribution']['url'], 'r'))
        gbif_csv = files['occurrence']['url']

        # build bccvl metadata:
        bccvlmd = {
            'genre': 'DataGenreSpeciesOccurrence',
            'categories': ['occurrence'],
            'species': {
                'scientificName': gbif_md.get('scientificName', None),
                'vernacularName': gbif_md.get('vernacularName', None),
                'taxonID': gbif_md.get('key', None),
                'rank': gbif_md.get('rank', None),
                'genus': gbif_md.get('genus', None),
                'genusGuid': gbif_md.get('genusKey', None),
                'family': gbif_md.get('family', None),
                'familyGuid': gbif_md.get('familyKey', None),
                'order': gbif_md.get('order', None),
                'orderGuid': gbif_md.get('orderKey', None),
                'clazz': gbif_md.get('class', None),
                'clazzGuid': gbif_md.get('classKey', None),
                'phylum': gbif_md.get('phylum', None),
                'phylumGuid': gbif_md.get('phylumKey', None),
                'kingdom': gbif_md.get('kingdom', None),
                'kingdomGuid': gbif_md.get('kingdomKey', None)
            },
        }
        # build item to import
        item = {
            'title': gbif_ds['title'],
            'description': gbif_ds['description'],
            'file': {
                'url': 'file://{}'.format(gbif_csv),  # local file url
                'contenttype': 'application/zip',
                'filename': os.path.basename(gbif_csv)
            },
            'bccvlmetadata': bccvlmd,
            'filemetadata': extract_metadata(gbif_csv, 'application/zip'),
        }

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'gbif_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            # compatibility... this should go away after we fully support 'layered'
            # occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species' ?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(gbif_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(gbif_csv)), app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)
        # tell importer about new dataset (import it)
        set_progress('RUNNING', 'Import gbif data {0}'.format(lsid), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job("FAILED", "Import of gbif data failed {0}".format(lsid), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job("COMPLETED", 'GBIF import {} complete'.format(lsid), None, context)
        (import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED', 'Download {0} from gbif: {1}'.format(lsid, e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', src, dest_url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
Example #19
    def export_to_ala(self):
        uuid = self.request.form.get('uuid', None)
        try:
            if uuid:
                brain = uuidToCatalogBrain(uuid)
                if brain is None:
                    raise Exception("Brain not found")

                obj = brain.getObject()
            else:
                obj = self.context

            # get username
            member = ploneapi.user.get_current()
            if member.getId():
                user = {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            else:
                raise Exception("Invalid user")

            # verify dataset
            if obj.portal_type not in (
                    'org.bccvl.content.dataset',
                    'org.bccvl.content.remotedataset',
                    'org.bccvl.content.multispeciesdataset'):
                raise Exception("Invalid UUID (content type)")
            md = IBCCVLMetadata(obj)
            if md.get('genre') not in ('DataGenreSpeciesOccurrence',
                                       'DataGenreSpeciesCollection',
                                       'DataGenreTraits'):
                raise Exception("Invalid UUID (data type)")
            # get download url
            dlinfo = IDownloadInfo(obj)

            # download file
            from org.bccvl import movelib
            from org.bccvl.movelib.utils import build_source, build_destination
            import tempfile
            destdir = tempfile.mkdtemp(prefix='export_to_ala')
            try:
                from org.bccvl.tasks.celery import app
                settings = app.conf.get('bccvl', {})
                dest = os.path.join(destdir, os.path.basename(dlinfo['url']))
                movelib.move(build_source(dlinfo['url'], user['id'], settings),
                             build_destination('file://{}'.format(dest)))

                csvfile = None

                if dlinfo['contenttype'] == 'application/zip':
                    # look at 'layers' to find file within zip
                    arc = md['layers'].keys()[0]

                    import zipfile
                    zf = zipfile.ZipFile(dest, 'r')
                    csvfile = zf.open(arc, 'r')
                else:
                    csvfile = open(dest, 'rb')

                import requests
                # "Accept:application/json" "Origin:http://example.com"
                res = requests.post(settings['ala']['sandboxurl'],
                                    files={'file': csvfile},
                                    headers={
                                        'apikey': settings['ala']['apikey'],
                                        'Accept': 'application/json'
                                    })
                if res.status_code != 200:
                    self.record_error(res.reason, res.status_code)
                    raise Exception('Upload failed')
                retval = res.json()
                # TODO: do error checking
                #  keys: sandboxUrl, fileName, message, error: Bool, fileId
                return retval
            finally:
                import shutil
                shutil.rmtree(destdir)

        except Exception as e:
            self.record_error(str(e), 500)
            raise
Example #20
def download_occurrence_from_ala(params, context):
    results = []
    species = []   # a list of species metadata
    ds_names = []

    for dataset in params:
        src = None
        dst = None
        occurrence_url = dataset['url'].rstrip('/') + "/occurrences/index/download"
        query = dataset['query']    # i.e. qid:<qid> or lsid:<lsid>
        qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
        email = context.get('user', {}).get('email', '')
        ds_names.append(dataset.get('name', ''))

        # download occurrence file
        # TODO: ignore file if not successfully downloaded (exception), but continue?
        tmpdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(tmpdir)

        src = build_source('ala://ala?url={}&query={}&filter={}&email={}'.format(occurrence_url, query, qfilter, email))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)

        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata for {0} from ala'.format(dataset['query']), None, context)
        # open ala_dataset.json
        ala_ds = json.load(open(os.path.join(tmpdir, 'ala_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in ala_ds['files']))

        # occurrence data file
        ala_csv = files['occurrence']['url']  # this is actually a zip file now

        # read ala metadata from attribution file.
        # May not have metadata for user uploaded dataset into sandbox
        if files.get('attribution'):
            ala_md_list = json.load(open(files['attribution']['url'], 'r'))
            for md in ala_md_list:
                species.append({
                    'scientificName': md.get('scientificName'),
                    'vernacularName': md.get('commonNameSingle') or md.get('scientificName'),
                    'taxonID': md.get('guid'),
                    'rank': md.get('rank'),
                    'genus': md.get('genus'),
                    'family': md.get('family'),
                    'order': md.get('order'),
                    'clazz': md.get('classs'),
                    'phylum': md.get('phylum'),
                    'kingdom': md.get('kingdom')
                })

    # This should not happen
    if len(results) == 0:
        raise Exception("No occurrence dataset is downloaded from ALA")

    # Combine all the occurrence and citation files from each download into 1 dataset
    imported_date = datetime.datetime.now().strftime('%d/%m/%Y')
    if len(results) > 1:
        destdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(destdir)
        os.mkdir(os.path.join(destdir, 'data'))
        combine_csv(results[:-1], 'data/ala_occurrence.csv', destdir)
        combine_csv(results[:-1], 'data/ala_citation.csv', destdir)

        # Zip it out and point to the new zip file
        ala_csv = os.path.join(destdir, 'ala_occurrence.zip')
        zip_occurrence_data(ala_csv,
                            os.path.join(destdir, 'data'),
                            ['ala_occurrence.csv', 'ala_citation.csv'])

        # Make a title & description for multispecies dataset
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
        else:
            ds_name = ','.join([sp['scientificName'] for sp in species])
            title = "{} occurrences".format(ds_name)
        description = "Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)

    else:
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
            description = "Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)
        else:
            title = ala_ds['title']
            description = ala_ds['description']
        species = species[0]

    # build bccvl metadata:
    bccvlmd = {
        'genre': 'DataGenreSpeciesOccurrence',
        'categories': ['occurrence'],
        'species': species
    }

    # build item to import
    item = {
        'title': title,
        'description': description,
        'file': {
            'url': 'file://{}'.format(ala_csv),  # local file url
            'contenttype': 'application/zip',
            'filename': os.path.basename(ala_csv)
        },
        'bccvlmetadata': bccvlmd,
        'filemetadata': extract_metadata(ala_csv, 'application/zip'),
    }
    return (item, results)
Example #21
    def add(self, object):
        # FIXME: this is a workaround, which is fine for small uploaded files.
        #        large uploads should go through another process anyway
        # TODO: re-implementing this method is the only way to know
        #       the full path of the object. We need the path to apply
        #       the transmogrifier chain.
        # fti = getUtility(IDexterityFTI, name=self.portal_type)
        container = aq_inner(self.context)
        try:
            # traverse to subfolder if possible
            container = container.restrictedTraverse('/'.join(self.subpath))
        except Exception as e:
            LOG.warn('Could not traverse to %s/%s',
                     '/'.join(container.getPhysicalPath()),
                     '/'.join(self.subpath))
        new_object = addContentToContainer(container, object)
        # set data genre:
        if self.datagenre:
            IBCCVLMetadata(new_object)['genre'] = self.datagenre
        if self.categories:
            IBCCVLMetadata(new_object)['categories'] = self.categories

        new_object.subject = []
        if self.domain:
            new_object.subject = [self.domain]
        if self.timeperiod:
            new_object.subject += self.timeperiod

            # rdf commit should happen in transmogrifier step later on
        # if fti.immediate_view:
        #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
        # else:
        #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
        # start background import process (just a metadata update)

        # run transmogrify md extraction here
        context_path = '/'.join(new_object.getPhysicalPath())
        member = api.user.get_current()
        # species extract task
        if IMultiSpeciesDataset.providedBy(new_object):
            # kick off csv split import tasks
            import_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
                kwargs={
                    'url':
                    '{}/@@download/file/{}'.format(new_object.absolute_url(),
                                                   new_object.file.filename),
                    'results_dir':
                    get_results_dir(new_object,
                                    self.request,
                                    childSpecies=True),
                    'import_context': {
                        'context': '/'.join(container.getPhysicalPath()),
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    },
                    'context': {
                        'context': context_path,
                        'genre': self.datagenre,
                        'dataSource': new_object.dataSource,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            after_commit_task(import_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: import_multi_species_csv',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Multi species import pending')
        else:
            if hasattr(self, '_upload'):
                file = self._upload['file']
                new_object.format = file.contentType
                uid = IUUID(new_object)
                swiftsettings = getUtility(IRegistry).forInterface(
                    ISwiftSettings)
                import os.path
                swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                    storage_url=swiftsettings.storage_url,
                    container=swiftsettings.result_container,
                    path=uid,
                    name=os.path.basename(file.filename))
                new_object.remoteUrl = swift_url
            else:
                file = new_object.file
                new_object.format = file.contentType

            dlinfo = IDownloadInfo(new_object)

            # single species upload
            update_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.update_metadata",
                kwargs={
                    'url': dlinfo['url'],
                    'filename': dlinfo['filename'],
                    'contenttype': dlinfo['contenttype'],
                    'context': {
                        'context': context_path,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            # create upload task in case we upload to external store
            if hasattr(self, '_upload'):
                # FIXME: we can't use ssh here.... we don't know which container we are in... and
                #        sshing here is bad as well....
                # There is an upload ... we have to make sure the uploaded data ends up in external storage
                # 3. put temp file aside
                tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
                tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
                blobf = file.open()
                try:
                    # try rename
                    os.rename(blobf.name, tmpfile)
                except OSError:
                    # try copy
                    shutil.copy(blobf.name, tmpfile)

                # TODO: we push the uploaded file directly to swift here..
                #       this really should be a background process
                #       best solution: ...
                #           user uploads to some temporary upload service (file never ends up here)
                #           we have a remote url here, and tell the datamover to pull it from there
                #           and move it to final destination. (or something like this)
                #       other good way: ...
                #           let user upload directly to swift (what about large file uploads?)
                #           and take care of clean up if necessary

                # 4. move file to swift
                # TODO: do we have enough information to upload to swift?
                #       need a temp url?
                swiftopts = app.conf.get('bccvl', {}).get('swift', {})
                src_url = build_source('file://{}'.format(tmpfile))
                dest_url = build_destination(
                    'swift+{}'.format(new_object.remoteUrl),
                    settings={
                        'swift': {
                            'os_auth_url':
                            swiftopts.get('os_auth_url'),
                            'os_username':
                            swiftopts.get('os_username'),
                            'os_password':
                            swiftopts.get('os_password'),
                            'os_project_name':
                            swiftopts.get('os_project_name'),
                            'os_storage_url':
                            swiftopts.get('os_storage_url'),
                            'os_user_domain_name':
                            swiftopts.get('os_user_domain_name'),
                            'os_project_domain_name':
                            swiftopts.get('os_project_domain_name'),
                            'auth_version':
                            swiftopts.get('auth_version')
                        }
                    })

                try:
                    movelib.move(src_url, dest_url)
                except Exception as e:
                    # do error handling here
                    raise
                finally:
                    # clean up temp location
                    path = os.path.dirname(tmpfile)
                    shutil.rmtree(path)

            # queue job submission
            after_commit_task(update_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: update_metadata',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Metadata update pending')

        # We have to reindex after updating the object
        new_object.reindexObject()
Example #22
def fetch_file(request, url):
    """Dowload the file from url and place it on the local file system.
    If file is a zip file it will be extracted to the local file system.

    The method returns the filename of the requested file on the
    local file system.
    """
    # TODO: optimize data files for mapserver?
    # reproject/warp source to avoid mapserver doing warp on the fly
    # other options:
    #   convert to tiled raster (makes access to tiles faster)
    #     gdal_translate -co TILED=YES original.tif tiled.tif
    #   use Erdas Imagine (HFA) format ... always tiled and supports>4GB files
    #     gdal_translate -of HFA original.tif tiled.img
    #   add overview image to raster (after possible translate)
    #     gdaladdo [-r average] tiled.tif 2 4 8 16 32 64 128
    # for rs point data maybe convert to shapefile?
    if not (url.startswith('http://') or url.startswith('https://')):
        # TODO: probably allow more than just http and https
        #       and use better exception
        raise Exception('unsupported url scheme: %s', url)

    # Check if a local data file already exists
    datadir = data_dir(request, url)
    url, fragment = urlparse.urldefrag(url)
    # FIXME: have to import here due to circular import
    from pyramid.settings import asbool
    with LockFile(datadir + '.lock'):
        if not os.path.exists(datadir):
            # the folder doesn't exist so we'll have to fetch the file
            # TODO: make sure there is no '..' in datadir
            os.makedirs(datadir)
            # not available yet so fetch it
            try:
                settings = request.registry.settings
                destfile = os.path.join(datadir, os.path.basename(url))
                try:
                    src = {
                        'url': url,
                        'verify': asbool(settings.get('bccvl.ssl.verify', True))
                    }
                    # do we have an __ac cookie?
                    cookie = request.cookies.get('__ac')
                    # get my tokens
                    tokens = ','.join([
                        token.strip()
                        for token in settings.get(
                            'authtkt.tokens', '').split('\n') if token.strip()
                    ])
                    if cookie:
                        src['cookies'] = {
                            'name': '__ac',
                            'value': update_auth_cookie(cookie, tokens, request),
                            'secure': True,
                            'domain': request.host,
                            'path': '/'
                        }
                    dst = {'url': u'file://{0}'.format(destfile)}
                    movelib.move(src, dst)
                except Exception as e:
                    # direct download failed what now?
                    LOG.exception('Failed to download data %s: %s', url, e)
                    raise
                # if it is a zip we should unpack it
                # FIXME: do some more robust zip detection
                if 'application/zip' in mimetypes.guess_type(destfile):
                    with zipfile.ZipFile(destfile, 'r') as zipf:
                        zipf.extractall(datadir)
                    # remove zipfile
                    os.remove(destfile)

                # search all tifs and try to generate overviews
                for root, dirnames, filenames in os.walk(datadir):
                    for filename in fnmatch.filter(filenames, '*.tif'):
                        rasterfile = os.path.join(root, filename)
                        ds = gdal.Open(rasterfile)
                        if ds:
                            maxlevel = min(ds.RasterXSize, ds.RasterYSize) / 512
                            ovrclear = ['gdaladdo', '-clean', rasterfile]
                            ovradd = ['gdaladdo', '-ro',
                                      #'--config', 'COMPRESS_OVERVIEW', 'LZW',
                                      rasterfile,
                            ]
                            level = 2
                            while level < maxlevel:
                                ovradd.append(str(level))
                                level = level * 2
                            if maxlevel > 2:
                                subprocess.check_call(ovrclear)
                                subprocess.check_call(ovradd)

            except Exception as e:
                LOG.error('Could not download %s to %s : %s', url, datadir, e)
                shutil.rmtree(datadir)
                raise e
    # we have the data now construct the filepath
    filename = fragment if fragment else os.path.basename(url)
    # FIXME: make sure path.join works correctly (trailing/leading slash?)
    filename = os.path.join(datadir, filename)
    # make sure filename is within datadir
    filename = os.path.normpath(filename)
    if not os.path.normpath(filename).startswith(datadir):
        # FIXME: should probably check if filename exists and is supported
        #        and use better exception here
        raise Exception("Data file path not valid: '%s'", filename)
    return filename
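
The overview block near the end of fetch_file adds levels 2, 4, 8, ... while the level stays below min(RasterXSize, RasterYSize) / 512, and only shells out to gdaladdo when that bound exceeds 2. A small standalone sketch of the same calculation; the 10000 x 8000 raster size is just an illustration.

def overview_levels(xsize, ysize, tile=512):
    # mirrors the loop in fetch_file: keep doubling while below min(x, y) / tile
    maxlevel = min(xsize, ysize) / tile
    levels, level = [], 2
    while level < maxlevel:
        levels.append(level)
        level *= 2
    return levels

# e.g. a 10000 x 8000 raster: maxlevel is roughly 15, so the levels are [2, 4, 8]
print(overview_levels(10000, 8000))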
Example #23
    def export_to_ala(self):
        uuid = self.request.form.get("uuid", None)
        try:
            if uuid:
                brain = uuidToCatalogBrain(uuid)
                if brain is None:
                    raise Exception("Brain not found")

                obj = brain.getObject()
            else:
                obj = self.context

            # get username
            member = ploneapi.user.get_current()
            if member.getId():
                user = {
                    "id": member.getUserName(),
                    "email": member.getProperty("email"),
                    "fullname": member.getProperty("fullname"),
                }
            else:
                raise Exception("Invalid user")

            # verify dataset
            if obj.portal_type not in (
                "org.bccvl.content.dataset",
                "org.bccvl.content.remotedataset",
                "org.bccvl.content.multispeciesdataset",
            ):
                raise Exception("Invalid UUID (content type)")
            md = IBCCVLMetadata(obj)
            if md.get("genre") not in ("DataGenreSpeciesOccurrence", "DataGenreTraits"):
                raise Exception("Invalid UUID (data type)")
            # get download url
            dlinfo = IDownloadInfo(obj)

            # download file
            from org.bccvl import movelib
            from org.bccvl.movelib.utils import build_source, build_destination
            import tempfile

            destdir = tempfile.mkdtemp(prefix="export_to_ala")
            try:
                from org.bccvl.tasks.celery import app

                settings = app.conf.get("bccvl", {})
                dest = os.path.join(destdir, os.path.basename(dlinfo["url"]))
                movelib.move(
                    build_source(dlinfo["url"], user["id"], settings), build_destination("file://{}".format(dest))
                )

                csvfile = None

                if dlinfo["contenttype"] == "application/zip":
                    # look at 'layers' to find file within zip
                    arc = md["layers"].keys()[0]

                    import zipfile

                    zf = zipfile.ZipFile(dest, "r")
                    csvfile = zf.open(arc, "r")
                else:
                    csvfile = open(dest, "rb")

                import requests

                # "Accept:application/json" "Origin:http://example.com"
                res = requests.post(
                    settings["ala"]["sandboxurl"],
                    files={"file": csvfile},
                    headers={"apikey": settings["ala"]["apikey"], "Accept": "application/json"},
                )
                if res.status_code != 200:
                    self.record_error(res.reason, res.status_code)
                    raise Exception("Upload failed")
                retval = res.json()
                # TODO: do error checking
                #  keys: sandboxUrl, fileName, message, error: Bool, fileId
                return retval
            finally:
                import shutil

                shutil.rmtree(destdir)

        except Exception as e:
            self.record_error(str(e), 500)
            raise
    def add(self, object):
        # FIXME: this is a workaround, which is fine for small uploaded files.
        #        large uploads should go through another process anyway
        # TODO: re implementing this method is the only way to know
        #       the full path of the object. We need the path to apply
        #       the transmogrifier chain.
        # fti = getUtility(IDexterityFTI, name=self.portal_type)
        container = aq_inner(self.context)
        try:
            # traverse to subfolder if possible
            container = container.restrictedTraverse('/'.join(self.subpath))
        except Exception as e:
            LOG.warn('Could not traverse to %s/%s: %s',
                     '/'.join(container.getPhysicalPath()), '/'.join(self.subpath), e)
        new_object = addContentToContainer(container, object)
        # set data genre:
        if self.datagenre:
            IBCCVLMetadata(new_object)['genre'] = self.datagenre
        if self.categories:
            IBCCVLMetadata(new_object)['categories'] = self.categories
        
        new_object.subject = []
        if self.domain:
            new_object.subject = [self.domain]
        if self.timeperiod:
            new_object.subject += self.timeperiod

        # rdf commit should happen in transmogrifier step later on
        # if fti.immediate_view:
        #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
        # else:
        #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
        # start background import process (just a metadata update)

        # run transmogrify md extraction here
        context_path = '/'.join(new_object.getPhysicalPath())
        member = api.user.get_current()
        # species extract task
        if IMultiSpeciesDataset.providedBy(new_object):
            # kick off csv split import tasks
            import_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
                kwargs={
                    'url': '{}/@@download/file/{}'.format(new_object.absolute_url(), new_object.file.filename),
                    'results_dir': get_results_dir(new_object, self.request, childSpecies=True),
                    'import_context': {
                        'context': '/'.join(container.getPhysicalPath()),
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    },
                    'context': {
                        'context': context_path,
                        'genre': self.datagenre,
                        'dataSource': new_object.dataSource,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            after_commit_task(import_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: import_multi_species_csv',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Multi species import pending')
        else:
            if hasattr(self, '_upload'):
                file = self._upload['file']
                new_object.format = file.contentType
                uid = IUUID(new_object)
                swiftsettings = getUtility(
                    IRegistry).forInterface(ISwiftSettings)
                import os.path
                swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                    storage_url=swiftsettings.storage_url,
                    container=swiftsettings.result_container,
                    path=uid,
                    name=os.path.basename(file.filename))
                new_object.remoteUrl = swift_url
            else:
                file = new_object.file
                new_object.format = file.contentType

            dlinfo = IDownloadInfo(new_object)

            # single species upload
            update_task = app.signature(
                "org.bccvl.tasks.datamover.tasks.update_metadata",
                kwargs={
                    'url': dlinfo['url'],
                    'filename': dlinfo['filename'],
                    'contenttype': dlinfo['contenttype'],
                    'context': {
                        'context': context_path,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            # create upload task in case we upload to external store
            if hasattr(self, '_upload'):
                # FIXME: we can't use ssh here.... we don't know which container we are in... and
                #        sshing here is bad as well....
                # There is an upload ... we have to make sure the uploaded data ends up in external storage
                # 3. put temp file aside
                tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
                tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
                blobf = file.open()
                try:
                    # try rename
                    os.rename(blobf.name, tmpfile)
                except OSError:
                    # try copy
                    shutil.copy(blobf.name, tmpfile)

                # TODO: we push the uploaded file directly to swift here..
                #       this really should be a background process
                #       best solution: ...
                #           user uploads to some temporary upload service (file never ends up here)
                #           we have a remote url here, and tell the datamover to pull it from there
                #           and move it to final destination. (or something like this)
                #       other good way: ...
                #           let user upload directly to swift (what about large file uploads?)
                #           and take care of clean up if necessary

                # 4. move file to swift
                # TODO: do we have enough information to upload to swift?
                #       need a temp url?
                swiftopts = app.conf.get('bccvl', {}).get('swift', {})
                src_url = build_source('file://{}'.format(tmpfile))
                dest_url = build_destination('swift+{}'.format(new_object.remoteUrl),
                    settings={'swift': {
                        'os_auth_url': swiftopts.get('os_auth_url'),
                        'os_username': swiftopts.get('os_username'),
                        'os_password': swiftopts.get('os_password'),
                        'os_tenant_name': swiftopts.get('os_tenant_name'),
                        'os_storage_url': swiftopts.get('os_storage_url')
                    }}
                )

                try:
                    movelib.move(src_url, dest_url)
                except Exception as e:
                    # do error handling here
                    raise
                finally:
                    # clean up temp location
                    path = os.path.dirname(tmpfile)
                    shutil.rmtree(path)

            # queue job submission
            after_commit_task(update_task)
            # create job tracking object
            jt = IJobTracker(new_object)
            jt.new_job('TODO: generate id',
                       'generate taskname: update_metadata',
                       function=new_object.dataSource,
                       type=new_object.portal_type)
            jt.set_progress('PENDING', u'Metadata update pending')

        # We have to reindex after updating the object
        new_object.reindexObject()
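
The TODO comments in the upload branch above note that pushing the uploaded file to swift inline in the request is not ideal and should really be a background process. A minimal sketch of that alternative, reusing the app.signature / after_commit_task pattern this method already uses for metadata updates, could look like the following; the task name 'org.bccvl.tasks.datamover.tasks.move' and its kwargs are assumptions for illustration, not a confirmed API.

# Hedged sketch only: queue the swift upload as a datamover task instead of
# calling movelib.move inline. Task name and kwargs are assumed, not confirmed.
from org.bccvl.tasks.celery import app


def queue_swift_upload(tmpfile, remote_url, schedule=None):
    # 'schedule' stands in for the after_commit_task helper used in the
    # method above; pass it in so this sketch stays self-contained
    swiftopts = app.conf.get('bccvl', {}).get('swift', {})
    move_task = app.signature(
        'org.bccvl.tasks.datamover.tasks.move',   # hypothetical task name
        kwargs={
            'move_args': [(
                'file://{}'.format(tmpfile),
                'swift+{}'.format(remote_url),
            )],
            # credentials the worker needs to talk to swift
            'swift': {
                'os_auth_url': swiftopts.get('os_auth_url'),
                'os_username': swiftopts.get('os_username'),
                'os_password': swiftopts.get('os_password'),
                'os_tenant_name': swiftopts.get('os_tenant_name'),
                'os_storage_url': swiftopts.get('os_storage_url'),
            },
        },
        immutable=True)
    if schedule is not None:
        schedule(move_task)
    return move_task

As the comments in the method point out, the worker would still need access to the temporary file (or a URL it can pull from), which is exactly the open design question raised there.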
Example #25
    def export_to_ala(self):
        uuid = self.request.form.get('uuid', None)
        try:
            if uuid:
                brain = uuidToCatalogBrain(uuid)
                if brain is None:
                    raise Exception("Brain not found")

                obj = brain.getObject()
            else:
                obj = self.context

            # get username
            member = ploneapi.user.get_current()
            if member.getId():
                user = {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            else:
                raise Exception("Invalid user")

            # verify dataset
            if obj.portal_type not in ('org.bccvl.content.dataset',
                                       'org.bccvl.content.remotedataset',
                                       'org.bccvl.content.multispeciesdataset'):
                raise Exception("Invalid UUID (content type)")
            md = IBCCVLMetadata(obj)
            if md.get('genre') not in ('DataGenreSpeciesOccurrence',
                                       'DataGenreSpeciesCollection',
                                       'DataGenreTraits'):
                raise Exception("Invalid UUID (data type)")
            # get download url
            dlinfo = IDownloadInfo(obj)

            # download file
            from org.bccvl import movelib
            from org.bccvl.movelib.utils import build_source, build_destination
            import tempfile
            destdir = tempfile.mkdtemp(prefix='export_to_ala')
            try:
                from org.bccvl.tasks.celery import app
                settings = app.conf.get('bccvl', {})
                dest = os.path.join(destdir, os.path.basename(dlinfo['url']))
                movelib.move(build_source(dlinfo['url'], user['id'], settings),
                             build_destination('file://{}'.format(dest)))

                csvfile = None

                if dlinfo['contenttype'] == 'application/zip':
                    # look at 'layers' to find the file within the zip
                    arc = md['layers'].keys()[0]

                    import zipfile
                    zf = zipfile.ZipFile(dest, 'r')
                    csvfile = zf.open(arc, 'r')
                else:
                    csvfile = open(dest, 'rb')

                import requests
                # "Accept:application/json" "Origin:http://example.com"
                res = requests.post(settings['ala']['sandboxurl'],
                                    files={'file': csvfile},
                                    headers={
                                        'apikey': settings['ala']['apikey'],
                                        'Accept': 'application/json'
                                    })
                if res.status_code != 200:
                    self.record_error(res.reason, res.status_code)
                    raise Exception('Upload failed')
                retval = res.json()
                # TODO: do error checking
                #  keys: sandboxUrl, fileName, message, error: Bool, fileId
                return retval
            finally:
                import shutil
                shutil.rmtree(destdir)

        except Exception as e:
            self.record_error(str(e), 500)
            raise
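
The "TODO: do error checking" above lists the keys the ALA sandbox is expected to return (sandboxUrl, fileName, message, error, fileId). A small hedged helper along these lines could replace the bare return retval; the exact semantics of those keys are assumed here, not confirmed against the ALA API.

# Hedged sketch: basic checks on the sandbox JSON, based only on the keys
# listed in the TODO comment above. Key semantics are assumptions.
def check_sandbox_response(retval):
    if retval.get('error'):
        # 'message' is assumed to carry a human readable reason
        raise Exception('ALA sandbox upload failed: {}'.format(
            retval.get('message', 'unknown error')))
    for key in ('sandboxUrl', 'fileId'):
        if not retval.get(key):
            raise Exception("ALA sandbox response missing '{}'".format(key))
    return retval

Usage would be return check_sandbox_response(res.json()) in place of the current return retval.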
Example #26
def fetch_file(request, url):
    """Dowload the file from url and place it on the local file system.
    If file is a zip file it will be extracted to the local file system.

    The method returns the filename of the requested file on the
    local file system.
    """
    # TODO: optimize data files for mapserver?
    # reproject/warp source? to avoid mapserver doing warp on the fly
    # other options:
    #   convert to tiled raster (makes access to tiles faster)
    #     gdal_translate -co TILED=YES original.tif tiled.tif
    #   use Erdas Imagine (HFA) format ... always tiled and supports>4GB files
    #     gdal_translate -of HFA original.tif tiled.img
    #   add overview image to raster (after possible translate)
    #     gdaladdo [-r average] tiled.tif 2 4 8 16 32 64 128
    # for rs point data maybe convert to shapefile?
    if not (url.startswith('http://') or url.startswith('https://')):
        # TODO: probably allow more than just http and https
        #       and use better exception
        raise Exception('unsupported url scheme: %s' % url)

    # Check if a local data file already exists
    datadir = data_dir(request, url)
    url, fragment = urlparse.urldefrag(url)
    # FIXME: have to import here due to circular import
    from pyramid.settings import asbool
    with LockFile(datadir + '.lock'):
        if not os.path.exists(datadir):
            # the folder doesn't exist so we'll have to fetch the file
            # TODO: make sure there is no '..' in datadir
            os.makedirs(datadir)
            # not available yet so fetch it
            try:
                settings = request.registry.settings
                destfile = os.path.join(datadir, os.path.basename(url))
                try:
                    src = {
                        'url': url,
                        'verify': asbool(settings.get('bccvl.ssl.verify',
                                                      True))
                    }
                    # do we have an __ac cookie?
                    cookie = request.cookies.get('__ac')
                    # get my tokens
                    tokens = ','.join([
                        token.strip() for token in settings.get(
                            'authtkt.tokens', '').split('\n') if token.strip()
                    ])
                    if cookie:
                        src['cookies'] = {
                            'name': '__ac',
                            'value':
                            update_auth_cookie(cookie, tokens, request),
                            'secure': True,
                            'domain': request.host,
                            'path': '/'
                        }
                    dst = {'url': u'file://{0}'.format(destfile)}
                    movelib.move(src, dst)
                except Exception as e:
                    # direct download failed what now?
                    LOG.exception('Failed to download data %s: %s', url, e)
                    raise
                # if it is a zip we should unpack it
                # FIXME: do some more robust zip detection
                if 'application/zip' in mimetypes.guess_type(destfile):
                    with zipfile.ZipFile(destfile, 'r') as zipf:
                        zipf.extractall(datadir)
                    # remove zipfile
                    os.remove(destfile)

                # search all tifs and try to generate overviews
                for root, dirnames, filenames in os.walk(datadir):
                    for filename in fnmatch.filter(filenames, '*.tif'):
                        rasterfile = os.path.join(root, filename)
                        ds = gdal.Open(rasterfile)
                        if ds:
                            maxlevel = min(ds.RasterXSize,
                                           ds.RasterYSize) / 512
                            ovrclear = ['gdaladdo', '-clean', rasterfile]
                            ovradd = [
                                'gdaladdo',
                                '-ro',
                                #'--config', 'COMPRESS_OVERVIEW', 'LZW',
                                rasterfile,
                            ]
                            level = 2
                            while level < maxlevel:
                                ovradd.append(str(level))
                                level = level * 2
                            if maxlevel > 2:
                                subprocess.check_call(ovrclear)
                                subprocess.check_call(ovradd)

            except Exception as e:
                LOG.error('Could not download %s to %s: %s', url, datadir, e)
                shutil.rmtree(datadir)
                raise
    # we have the data now construct the filepath
    filename = fragment if fragment else os.path.basename(url)
    # FIXME: make sure path.join works correctly (trailing/leading slash?)
    filename = os.path.join(datadir, filename)
    # make sure filename is within datadir
    filename = os.path.normpath(filename)
    if not os.path.normpath(filename).startswith(datadir):
        # FIXME: should probably check if filename exists and is supported
        #        and use better exception here
        raise Exception("Data file path not valid: '%s'", filename)
    return filename
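
The TODO block at the top of fetch_file already spells out the mapserver-side optimizations (tiled GeoTIFF or HFA conversion plus overviews) that the function only partially applies with gdaladdo. A hedged sketch of those commands, with illustrative paths and overview levels, might look like this:

# Hedged sketch of the raster optimizations named in the TODO comments above.
# Source/destination paths and the overview levels are illustrative only.
import subprocess


def optimize_raster(src, dest, use_hfa=False):
    if use_hfa:
        # Erdas Imagine (HFA): always tiled and supports >4GB files
        subprocess.check_call(['gdal_translate', '-of', 'HFA', src, dest])
    else:
        # tiled GeoTIFF makes per-tile access faster for mapserver
        subprocess.check_call(['gdal_translate', '-co', 'TILED=YES', src, dest])
    # add averaged overview levels to the (possibly translated) raster
    subprocess.check_call(['gdaladdo', '-r', 'average', dest,
                           '2', '4', '8', '16', '32', '64', '128'])
    return dest

Doing this once at fetch time avoids mapserver warping or scanning the full-resolution raster on every request.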