Example #1
    def setup_class(cls):
        try:
            from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        cls.content1 = '<xml>Content 1</xml>'
        ho1 = HarvestObject(
            guid='test-ho-1',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=cls.content1)

        cls.content2 = '<xml>Content 2</xml>'
        cls.original_content2 = '<xml>Original Content 2</xml>'
        ho2 = HarvestObject(
            guid='test-ho-2',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=cls.content2)

        hoe = HarvestObjectExtra(key='original_document',
                                 value=cls.original_content2,
                                 object=ho2)

        Session.add(ho1)
        Session.add(ho2)
        Session.add(hoe)
        Session.commit()

        cls.object_id_1 = ho1.id
        cls.object_id_2 = ho2.id
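A minimal sketch of how a test might read these fixtures back, reusing the same Session and model imports as setup_class; the test name and the round-trip check are illustrative, not part of the original suite:

    def test_content_round_trip(self):
        # Hypothetical check: reload the first object by id and compare content
        from ckanext.harvest.model import HarvestObject
        ho = Session.query(HarvestObject).get(self.object_id_1)
        assert ho.content == self.content1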
Example #2
    def gather_stage(self, harvest_job):

        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('z3950Harvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # get current objects out of db
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id) \
            .filter(HarvestObject.current == True) \
            .filter(HarvestObject.harvest_source_id == harvest_job.source.id)

        guid_to_package_id = dict((res[0], res[1]) for res in query)
        current_guids = set(guid_to_package_id.keys())
        current_guids_in_harvest = set()

        # Get contents
        try:
            conn = zoom.Connection(source_url,
                                   int(self.source_config.get('port', 210)))
            conn.databaseName = self.source_config.get('database', '')
            conn.preferredRecordSyntax = 'XML'
            conn.elementSetName = 'T'
            query = zoom.Query('CCL', 'metadata')
            res = conn.search(query)
            ids = []
            for num, result in enumerate(res):
                hash = hashlib.md5(result.data).hexdigest()
                if hash in current_guids:
                    current_guids_in_harvest.add(hash)
                else:
                    obj = HarvestObject(
                        job=harvest_job,
                        guid=hash,
                        extras=[
                            HOExtra(key='status', value='new'),
                            HOExtra(key='original_document',
                                    value=result.data.decode('latin-1')),
                            HOExtra(key='original_format', value='fgdc')
                        ])
                    obj.save()
                    ids.append(obj.id)
            for guid in (current_guids - current_guids_in_harvest):
                obj = HarvestObject(
                    job=harvest_job,
                    guid=guid,
                    package_id=guid_to_package_id[guid],
                    extras=[HOExtra(key='status', value='delete')])
                obj.save()
                ids.append(obj.id)
            return ids
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (source_url, e), harvest_job)
            return None
Example #3
    def _gather_entry(self, entry, auth=None):
        # Create a harvest object for each entry
        entry_guid = entry['guid']
        log.debug('gathering %s', entry_guid)
        entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '')  # noqa: E501
        entry_restart_date = entry['restart_date']

        package_query = Session.query(Package)
        query_filtered = package_query.filter(Package.name == entry_name)
        package = query_filtered.first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'

            obj = HarvestObject(guid=entry_guid,
                                job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date', value=entry_restart_date)
                                ])

            obj.content = entry['content']
            obj.package = package
            obj.save()
            return obj.id

        elif not package:
            # It's a product we haven't harvested before.
            log.debug(
                '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
                format(entry_name))  # noqa: E501
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value='new'),
                    HOExtra(key='restart_date', value=entry_restart_date)
                ])
            obj.content = entry['content']
            obj.package = None
            obj.save()
            return obj.id
Example #4
    def gather_stage(self, harvest_job):

        if harvest_job.source.url.startswith('basic_test'):
            obj = HarvestObject(guid='test1', job=harvest_job)
            obj.extras.append(HarvestObjectExtra(key='key', value='value'))
            obj2 = HarvestObject(guid='test2', job=harvest_job)
            obj3 = HarvestObject(guid='test_to_delete', job=harvest_job)
            obj.add()
            obj2.add()
            obj3.save()  # save() commits, flushing the two add()ed objects as well
            return [obj.id, obj2.id, obj3.id]

        return []
Example #5
    def gather_stage(self, harvest_job):
        log.debug('In SRDAHarvester gather_stage (%s)' %
                  harvest_job.source.url)

        get_all_packages = True
        package_ids = []

        data = urllib2.urlopen(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
        doc = html.parse(data)
        for td in doc.findall("//td[@class='left_p12_title']/a"):
            link = td.get('href')
            if re.match(r"/search/fsciitem", link):
                id = sha1(link).hexdigest()
                obj = HarvestObject(guid=id, job=harvest_job, content=link)
                obj.save()
                package_ids.append(obj.id)

        self._set_config(harvest_job.source.config)

        # Check if this source has been harvested before
        previous_job = Session.query(HarvestJob) \
                        .filter(HarvestJob.source==harvest_job.source) \
                        .filter(HarvestJob.gather_finished!=None) \
                        .filter(HarvestJob.id!=harvest_job.id) \
                        .order_by(HarvestJob.gather_finished.desc()) \
                        .limit(1).first()

        return package_ids
Example #6
    def gather_stage(self, harvest_job):
        """Retrieve datasets"""

        log.debug('In KoelnCKANHarvester gather_stage (%s)' %
                  harvest_job.source.url)
        package_ids = []
        self._set_config(None)

        base_url = harvest_job.source.url.rstrip('/')
        package_list_url = base_url + '/3/action/package_list'
        content = self._get_content(package_list_url)

        content_json = json.loads(content)
        package_ids = content_json['result']

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                return object_ids

            else:
                self._save_gather_error(
                    'No packages received for URL: %s' % package_list_url, harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('%r' % e, harvest_job)
Example #7
    def gather_stage(self, harvest_job):
        log.debug('In NTPCHarvester gather_stage (%s)' % harvest_job.source.url)

        url = self.PREFIX_URL + self.CATALOGUE_INDEX_URL
        get_all_packages = True
        try:
            package_ids = []
            dataset_count = self._get_ntpc_dataset_count(url)
            msg_count = 0
            for x in range(dataset_count // 10 + 1):
                page_url = url + '?currentPage=%s' % (x + 1)
                data = urllib2.urlopen(page_url)
                doc = html.parse(data)
                for div in doc.findall("//a[@href]"):
                    if '/NTPC/od/query;' in div.attrib['href']:
                        link = div.attrib['href']
                        id = sha1(link).hexdigest()
                        obj = HarvestObject(guid=id, job=harvest_job, content=link)
                        obj.save()
                        package_ids.append(obj.id)
                        msg_count = msg_count + 1

            if msg_count == 0:
                self._save_gather_error('No packages received for URL: %s' % url,
                        harvest_job)
                return None

            return package_ids

        except Exception as e:
            self._save_gather_error('%r' % e, harvest_job)
            return None
Example #8
    def gather_stage(self, harvest_job):
        log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)
        get_all_packages = True

        dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
        log.debug(dcatUrl)

        adaptorInstance = socrataAdaptor()
        package_ids = adaptorInstance.listDatasetIds(dcatUrl)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % dcatUrl,
                    harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('%r' % e, harvest_job)
Example #9
def harvest_object_create(context, data_dict):
    """ Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string 
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(data_dict, harvest_object_create_schema(),
                             context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(guid=data.get('guid'),
                        content=data.get('content'),
                        job=data['job_id'],
                        harvest_source_id=data.get('source_id'),
                        package_id=data.get('package_id'),
                        extras=[
                            HarvestObjectExtra(key=k, value=v)
                            for k, v in data.get('extras', {}).items()
                        ])

    obj.save()
    return harvest_object_dictize(obj, context)
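A hedged usage sketch for this action through CKAN's action interface; the user name, guid, content, and job id are placeholders, and 'job_id' must reference an existing HarvestJob:

    import ckan.plugins.toolkit as toolkit

    result = toolkit.get_action('harvest_object_create')(
        {'user': 'harvest'},  # hypothetical context
        {
            'guid': 'example-guid',
            'content': '<xml>example</xml>',
            'job_id': 'existing-job-id',  # placeholder
            'extras': {'status': 'new'},
        })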
Example #10
    def gather_stage(self, harvest_job):
        log.debug('In ZhstatHarvester gather_stage')

        ids = []
        parser = etree.XMLParser(encoding='utf-8')

        for dataset in etree.fromstring(self._fetch_metadata(), parser=parser):

            # Get the german data if one is available,
            # otherwise get the first one
            base_datas = dataset.xpath("data[@xml:lang='de']")
            if len(base_datas) != 0:
                base_data = base_datas[0]
            else:
                base_data = dataset.find('data')

            metadata = self._generate_metadata(base_data, dataset)

            if metadata:
                obj = HarvestObject(guid=dataset.get('id'),
                                    job=harvest_job,
                                    content=json.dumps(metadata))
                obj.save()
                log.debug('adding ' + dataset.get('id') + ' to the queue')
                ids.append(obj.id)
            else:
                log.debug(
                    'Skipping %s since no resources or groups are available' %
                    dataset.get('id'))

        return ids
Example #11
    def _gather_object(self, job, url, size, start_date, forecast_date):
        filename = parse_filename(url)
        # Strip the version/suffix tokens to get a stable identifier
        filename_id = filename
        for token in ('-v02.0-fv02.0', '-fv02.0', '-sv01.00', '-sv05.00',
                      '-v02', '-sv10.00', '-sv09.00', '-sv07.00'):
            filename_id = filename_id.replace(token, '')

        status, package = self._was_harvested(filename_id, self.update_all)

        extras = [HOExtra(key='status', value=status)]
        assert start_date
        content = json.dumps(
            {
                'identifier': filename_id,
                'ftp_link': url,
                'size': size,
                'start_date': start_date,
                'forecast_date': forecast_date,
                'restart_date': start_date
            },
            default=str)
        obj = HarvestObject(job=job, guid=url, extras=extras, content=content)
        obj.package = package
        obj.save()
        return obj.id
Example #12
    def _run_import(self, xml, job):
        if not model.User.get('harvest'):
            model.User(name='harvest', sysadmin=True).save()
        if not model.Group.get('test'):
            get_action('organization_create')({
                'user': '******'
            }, {
                'name': 'test'
            })

        record = _get_record(xml)

        metadata = CmdiReader()(record)
        metadata['unified']['owner_org'] = "test"

        harvest_object = HarvestObject()
        harvest_object.content = json.dumps(metadata.getMap())
        harvest_object.id = xml
        harvest_object.guid = xml
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.save()

        self.harvester.import_stage(harvest_object)
        return harvest_object
Example #13
    def _create_harvest_object(self, package_name, ref):
        package = model.Package.by_name(unicode(package_name))
        model.Session.add(
            HarvestObject(guid='not important',
                          current=True, source=self.source, job=self.job,
                          harvest_source_reference=ref_prefix + ref,
                          package=package)
        )
Example #14
def doi_update(context, data_dict):
    model = context['model']
    new_package = data_dict
    source_hash = hashlib.sha1(json.dumps(data_dict,
                                          sort_keys=True)).hexdigest()
    old_package = p.toolkit.get_action('package_show')({
        'model': model,
        'ignore_auth': True
    }, {
        "id": new_package['id']
    })
    for extra in old_package['extras']:
        if extra['key'] == 'source_hash':
            old_source_hash = extra['value']
            break
    else:
        old_source_hash = None

    if source_hash == old_source_hash and old_package.get('state') == 'active':
        print(str(datetime.datetime.now()) +
              ' No change for doi id ' + new_package['id'])
        return

    new_package["extras"].append({"key": "source_hash", "value": source_hash})
    new_package["extras"].append({"key": "metadata-source", "value": "doi"})
    new_package["extras"].append({
        "key": "source_doi_import_identifier",
        "value": True
    })
    new_package.pop("name", None)
    owner_org = model.Group.get(
        ORG_MAPPING.get(new_package['organization']['name']))
    if not owner_org:
        print(str(datetime.datetime.now()) + ' Failed to update doi id ' +
              new_package['id'] + '. Organization ' +
              new_package['organization']['name'] + ' does not exist.')
        return
    new_package['owner_org'] = owner_org.name
    group_name = new_package.pop('owner_name', None)

    resources = []
    for resource in new_package['resources']:
        resource.pop('resource_group_id', None)
        resource.pop('revision_id', None)
        resource.pop('id', None)
        resources.append(resource)
    new_package['resources'] = resources

    obj = HarvestObject(guid=uuid.uuid4().hex,
                        job=context['harvest_job'],
                        content=context['harvestobj'])
    obj.save()
    new_package["extras"].append({"key": "harvest_object_id", "value": obj.id})

    context['return_id_only'] = True
    p.toolkit.get_action('package_update')(context, new_package)
    print(str(datetime.datetime.now()) + ' Updated doi id ' + new_package['id'])
Example #15
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('GeminiWafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        # Get contents
        try:
            content = self._get_content(url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (url, e), harvest_job)
            return None
        ids = []
        try:
            for url in self._extract_urls(content, url):
                try:
                    content = self._get_content(url)
                except Exception as e:
                    msg = 'Couldn\'t harvest WAF link: %s: %s' % (url, e)
                    self._save_gather_error(msg, harvest_job)
                    continue
                else:
                    # We need to extract the guid to pass it to the next stage
                    try:
                        gemini_string, gemini_guid = self.get_gemini_string_and_guid(
                            content, url)
                        if gemini_guid:
                            log.debug('Got GUID %s' % gemini_guid)
                            # Create a new HarvestObject for this identifier
                            # Generally the content will be set in the fetch stage, but as we already
                            # have it, we might as well save a request
                            obj = HarvestObject(guid=gemini_guid,
                                                job=harvest_job,
                                                content=gemini_string)
                            obj.save()

                            ids.append(obj.id)

                    except Exception as e:
                        msg = 'Could not get GUID for source %s: %r' % (url, e)
                        self._save_gather_error(msg, harvest_job)
                        continue
        except Exception as e:
            msg = 'Error extracting URLs from %s' % url
            self._save_gather_error(msg, harvest_job)
            return None

        if len(ids) > 0:
            return ids
        else:
            self._save_gather_error(
                'Couldn\'t find any links to metadata files', harvest_job)
            return None
Example #16
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.CSW.gather')
        log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job)
        # Get source URL
        url = harvest_job.source.url

        try:
            self._setup_csw_client(url)
        except Exception as e:
            self._save_gather_error('Error contacting the CSW server: %s' % e,
                                    harvest_job)
            return None

        log.debug('Starting gathering for %s' % url)
        used_identifiers = []
        ids = []
        try:
            for identifier in self.csw.getidentifiers(page=10):
                try:
                    log.info('Got identifier %s from the CSW', identifier)
                    if identifier in used_identifiers:
                        log.error(
                            'CSW identifier %r already used, skipping...' %
                            identifier)
                        continue
                    if identifier is None:
                        log.error('CSW returned identifier %r, skipping...' %
                                  identifier)
                        ## log an error here? happens with the dutch data
                        continue

                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=identifier, job=harvest_job)
                    obj.save()

                    ids.append(obj.id)
                    used_identifiers.append(identifier)
                except Exception as e:
                    self._save_gather_error(
                        'Error for the identifier %s [%r]' % (identifier, e),
                        harvest_job)
                    continue

        except Exception as e:
            log.error('Exception: %s' % text_traceback())
            self._save_gather_error(
                'Error gathering the identifiers from the CSW server [%s]' %
                six.text_type(e), harvest_job)
            return None

        if len(ids) == 0:
            self._save_gather_error('No records received from the CSW server',
                                    harvest_job)
            return None

        return ids
Example #17
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.ITagEnricher.gather')
        log.debug('ITagEnricher gather_stage for job: %r', harvest_job)

        # Save a reference
        self.job = harvest_job

        self._set_source_config(self.job.source.config)

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }

        org_id = model.Package.get(harvest_job.source.id).owner_org
        organization = logic.get_action('organization_show')(context, {
            'id': org_id
        })  # noqa: E501

        # Exclude Sentinel-3 because it seems like iTag can't handle the curved
        # footprints.
        filter_query = '+organization:{} -itag:tagged -FamilyName:Sentinel-3'.format(
            organization['name'])  # noqa: E501

        ids = []

        # We'll limit this to 10 datasets per job so that results appear
        # faster
        start = 0
        rows = self.source_config.get('datasets_per_job', 10)
        untagged = logic.get_action('package_search')(context, {
            'fq': filter_query,
            'rows': rows,
            'start': start
        })
        results = untagged['results']
        for result in results:
            spatial = None
            for i in result['extras']:
                if i['key'] == 'spatial':
                    spatial = i['value']
            if spatial:
                obj = HarvestObject(
                    guid=result['id'],
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value='change'),  # noqa: E501
                        HOExtra(key='spatial', value=spatial),  # noqa: E501
                        HOExtra(key='package', value=json.dumps(result))
                    ])  # noqa: E501
                obj.save()
                ids.append(obj.id)

        return ids
Example #18
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        log.info("in gather stage: %s" % harvest_job.source.url)

        try:
            harvest_obj_ids = []
            registry = self._create_metadata_registry()
            self._set_config(harvest_job.source.config)
            client = oaipmh.client.Client(harvest_job.source.url,
                                          registry,
                                          self.credentials,
                                          force_http_get=self.force_http_get)
            # Start looking from here
            client.identify()  # check if identify works
            for header in self._identifier_generator(client):
                harvest_obj = HarvestObject(guid=header.identifier(),
                                            job=harvest_job)
                harvest_obj.save()
                harvest_obj_ids.append(harvest_obj.id)
                log.info("Harvest obj %s created" % harvest_obj.id)
                # return harvest_obj_ids # This is to get only one record
        except urllib.error.HTTPError as e:
            log.exception(
                'Gather stage failed on %s (%s): %s, %s' %
                (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
            self._save_gather_error(
                'Could not gather anything from %s' % harvest_job.source.url,
                harvest_job)
            return None
        except Exception as e:
            log.exception('Gather stage failed on %s: %s' % (
                harvest_job.source.url,
                str(e),
            ))
            self._save_gather_error(
                'Could not gather anything from %s: %s / %s' %
                (harvest_job.source.url, str(e), traceback.format_exc()),
                harvest_job)
            return None
        log.info("Gather stage successfully finished with %s harvest objects" %
                 len(harvest_obj_ids))
        return harvest_obj_ids
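The docstring above spells out the general gather-stage contract. A minimal skeleton that satisfies it, with self._list_remote_identifiers() standing in as an assumed helper for whatever enumerates the remote records, might look like:

    def gather_stage(self, harvest_job):
        # Sketch only: one HarvestObject per remote identifier; a gather
        # error and None on failure; the list of object ids on success.
        try:
            ids = []
            for identifier in self._list_remote_identifiers(harvest_job.source.url):
                obj = HarvestObject(guid=identifier, job=harvest_job)
                obj.save()
                ids.append(obj.id)
            return ids
        except Exception as e:
            self._save_gather_error(
                'Could not gather from %s: %s' % (harvest_job.source.url, e),
                harvest_job)
            return None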
Example #19
    def gather_stage(self, harvest_job):
        log.debug('In DataWienGvAt gather_stage')

        doc = etree.parse(self.CATALOGUE_FEED_URL)
        ids = []
        for link in doc.findall("//item/link"):
            link = link.text
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            ids.append(obj.id)
        return ids
Example #20
    def delete_geocat_ids(self, harvest_job, harvest_obj_ids,
                          packages_to_delete):
        delete_harvest_obj_ids = []
        for package_info in packages_to_delete:
            obj = HarvestObject(guid=package_info[1].name,
                                job=harvest_job,
                                extras=[
                                    HarvestObjectExtra(key='import_action',
                                                       value='delete')
                                ])
            obj.save()
            delete_harvest_obj_ids.append(obj.id)
        return delete_harvest_obj_ids
Example #21
    def gather_stage(self, harvest_job):
        log.debug('In OpenDataCatHarvester gather_stage')
        # Get feed contents
        doc = etree.parse(self.INDEX_URL)
        ids = []
        for link_element in doc.findall('//item/link'):
            link = link_element.text.strip()
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()

            ids.append(obj.id)
        return ids
Example #22
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        logger.debug("in gather stage: %s" % harvest_job.source.url)
        try:
            harvest_obj_ids = []
            self._set_config(harvest_job.source.config)

            skip_licenses = {
                'c12c3333-1ad7-4a3a-a629-ed51fcb636ac',
                'a270745d-07d5-4e93-94fc-ba6e0afc97fb',
            }

            # TODO: switch
            # for record in json.loads(open('/tmp/data.json').read())['dataset']:
            for record in requests.get(
                    urlparse.urljoin(harvest_job.source.url,
                                     'data.json')).json()['dataset']:
                license_id = record.get('license',
                                        'cc-by').strip('/').split('/')[-1]
                if license_id in skip_licenses:
                    continue
                if 'hub.pacificdata' == record.get('isPartOf'):
                    continue
                if 'Info' in record.get('theme', []):
                    continue
                harvest_obj = HarvestObject(guid=record['identifier'],
                                            content=json.dumps(record),
                                            job=harvest_job)
                harvest_obj.save()
                harvest_obj_ids.append(harvest_obj.id)
        except urllib2.HTTPError as e:
            logger.exception(
                'Gather stage failed on %s (%s): %s, %s' %
                (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
            self._save_gather_error(
                'Could not gather anything from %s' % harvest_job.source.url,
                harvest_job)
            return None
        return harvest_obj_ids
Example #23
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        logger.debug("in gather stage: %s" % harvest_job.source.url)
        try:
            harvest_obj_ids = []
            self._set_config(harvest_job.source.config)
            url = urljoin(harvest_job.source.url, '/v1/dataset/search')

            for record in self._fetch_record_outline(url):

                # if record['key'] != 'a38c7d49-5a5d-4aa6-a64e-421178bd06d7':
                # continue
                harvest_obj = HarvestObject(guid=record['key'],
                                            content=record['country'],
                                            job=harvest_job)
                harvest_obj.save()
                harvest_obj_ids.append(harvest_obj.id)

                # TODO: remove
                # break
        except (HTTPError) as e:
            logger.exception(
                'Gather stage failed on %s (%s): %s, %s' %
                (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
            self._save_gather_error(
                'Could not gather anything from %s' % harvest_job.source.url,
                harvest_job)
            return None
        except (Exception) as e:
            logger.exception('Gather stage failed on %s: %s' % (
                harvest_job.source.url,
                str(e),
            ))
            self._save_gather_error(
                'Could not gather anything from %s' % harvest_job.source.url,
                harvest_job)
            return None
        return harvest_obj_ids
Example #24
class JSONZipBaseHarvester(JSONDumpBaseCKANHarvester):
    def info(self):
        return {
            'name':
            'zipbase',
            'title':
            'Base Zip Harvester',
            'description':
            'A Harvester for Portals, which return JSON files in a zip file.'
        }

    def gather_stage(self, harvest_job):
        self._set_config(harvest_job.source.config)
        # Request all remote packages
        try:
            content = self._get_content(harvest_job.source.url)
        except Exception as e:
            self._save_gather_error(
                'Unable to get content for URL: %s: %s' %
                (harvest_job.source.url, str(e)), harvest_job)
            return None

        object_ids = []
        packages = []
        import zipfile
        import io
        # Assumes _get_content returned the raw zip bytes
        file_content = io.BytesIO(content)
        archive = zipfile.ZipFile(file_content, "r")
        for name in archive.namelist():
            print(name)
            if name.endswith(".json"):
                package = json.loads(archive.read(name))
                packages.append(package)
                obj = HarvestObject(guid=package['name'], job=harvest_job)
                obj.content = json.dumps(package)
                obj.save()
                object_ids.append(obj.id)
        '''
        context = self.build_context()
        remote_dataset_names = map(lambda d: d['name'], packages)
        self.delete_deprecated_datasets(context, remote_dataset_names)
        '''

        if object_ids:
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % harvest_job.source.url,
                harvest_job)
            return None
Example #25
    def _save_harvest_object(self, metadata, harvest_job):
        '''
        Save the harvest object with the given metadata dict and harvest_job
        '''

        obj = HarvestObject(
            guid=metadata['datasetID'],
            job=harvest_job,
            content=json.dumps(metadata)
        )
        obj.save()
        log.debug('adding ' + metadata['datasetID'] + ' to the queue')

        return obj.id
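For context, a sketch of how such a helper is typically driven from gather_stage; self._fetch_all_metadata() is an assumed placeholder for whatever returns the metadata dicts:

    def gather_stage(self, harvest_job):
        # Sketch only: queue one harvest object per metadata record
        return [self._save_harvest_object(metadata, harvest_job)
                for metadata in self._fetch_all_metadata()]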
Example #26
    def gather_stage(self, harvest_job):
        log.debug('In OpendataParisFr gather_stage')

        doc = html.parse(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
        ids = []
        for link in doc.findall(
                "//div[@class='animate download-portlet-element']/a"):
            link = link.get('href')
            if not "#comments" in link:
                id = sha1(link).hexdigest()
                obj = HarvestObject(guid=id, job=harvest_job, content=link)
                obj.save()
                ids.append(obj.id)
        return ids
Example #27
    def gather_stage(self, harvest_object):
        log.debug('In OdgovltHarvester gather_stage')

        sync = IvpkIrsSync(sa.create_engine(harvest_object.source.url))
        sync.sync_groups()

        ids = []
        for ivpk_dataset in sync.get_ivpk_datasets():
            content = json.dumps(dict(ivpk_dataset), cls=DatetimeEncoder)
            obj = HarvestObject(guid=ivpk_dataset.ID,
                                job=harvest_object,
                                content=content)
            obj.save()
            ids.append(obj.id)
        return ids
Example #28
    def _make_harvest_objs(datasets):
        '''Create HarvestObject with Socrata dataset content.'''
        obj_ids = []
        guids = []
        for d in datasets:
            log.debug('Creating HarvestObject for {} {}'.format(
                d['resource']['name'], d['resource']['id']))
            obj = HarvestObject(
                guid=d['resource']['id'],
                job=harvest_job,
                content=json.dumps(d),
                extras=[HarvestObjectExtra(key='status', value='hi!')])
            obj.save()
            obj_ids.append(obj.id)
            guids.append(d['resource']['id'])
        return obj_ids, guids
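A possible call site, assuming datasets is the list of records returned by the Socrata discovery API:

        obj_ids, guids = _make_harvest_objs(datasets)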
Example #29
    def _mark_datasets_for_deletion(self, guids_in_source, harvest_job):
        '''
        Given a list of guids in the remote source, checks which in the DB
        need to be deleted

        To do so it queries all guids in the DB for this source and calculates
        the difference.

        For each of these creates a HarvestObject with the dataset id, marked
        for deletion.

        Returns a list with the ids of the Harvest Objects to delete.
        '''

        object_ids = []

        # Get all previous current guids and dataset ids for this source
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id)\
            .filter(
            HarvestObject.current == True  # noqa
        ).filter(HarvestObject.harvest_source_id == harvest_job.source.id)

        guid_to_package_id = {}
        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = list(guid_to_package_id.keys())

        # Get objects/datasets to delete (ie in the DB but not in the source)
        guids_to_delete = set(guids_in_db) - set(guids_in_source)

        # Create a harvest object for each of them, flagged for deletion
        for guid in guids_to_delete:
            obj = HarvestObject(
                guid=guid,
                job=harvest_job,
                package_id=guid_to_package_id[guid],
                extras=[HarvestObjectExtra(key='status', value='delete')])

            # Mark the rest of objects for this guid as not current
            model.Session.query(HarvestObject) \
                         .filter_by(guid=guid) \
                         .update({'current': False}, False)
            obj.save()
            object_ids.append(obj.id)

        return object_ids
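A hedged sketch of how this helper slots into the end of a gather stage; object_ids and guids_in_source are assumed to have been collected earlier in the same method:

        # Sketch only: append the deletion objects to the ids gathered so far
        object_ids.extend(
            self._mark_datasets_for_deletion(guids_in_source, harvest_job))
        return object_ids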
Example #30
    def _create_object(self, ebv_type, dataset_info):

        extras = [HOExtra(key='status', value='new')]

        if ebv_type == 'tree_species':
            collectionID = 'TREE_SPECIES_DISTRIBUTION_HABITAT_SUITABILITY'
            collection_name = 'Tree Species Distribution Habitat Suitability'
            collection_description = 'European Distribution of the tree species for the years 2000 (Habitat Suitability baseline), 2020, 2050 and 2080 (Habitat Suitability future), based on different models such as ENS, CCCMA, CSIRO, HADCM3.'  # noqa: E501
        elif ebv_type == 'flood_hazards':
            collectionID = 'FLOOD_HAZARD_EU_GL'
            collection_name = 'Flood Hazard Europe/Global'
            collection_description = 'The maps depict flood prone areas at global/european scale for flood events. Resolution is 30 arcseconds (approx. 1km). Cell values indicate water depth (in m). The map can be used to assess flood exposure and risk of population and assets. NOTE: this dataset is based on JRC elaborations and is not an official flood hazard map.'  # noqa: E501
        (title, description, start_date, end_date, spatial,
         filename, identifier, download_url, tags) = dataset_info

        content = json.dumps(
            {
                'collectionID': collectionID,
                'title': title,
                'description': description,
                'start_date': start_date,
                'end_date': end_date,  # noqa: E501
                'identifier': identifier,
                'downloadURL': download_url,  # noqa: E501
                'spatial': spatial,
                'filename': filename,
                'collection_name': collection_name,
                'collection_description': collection_description,
                'tags': tags
            },
            default=str)

        obj = HarvestObject(job=self.job,
                            guid=unicode(uuid.uuid4()),
                            extras=extras,
                            content=content)

        obj.save()

        return obj.id