Example #1
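The snippets below come from CKAN harvester extensions. They assume roughly the following imports; this is a sketch of the common header, not the exact preamble of any one file:

import hashlib
import json
import logging
import uuid

import requests

from ckan import logic, model
from ckan.model import Package, Session
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.model import HarvestObjectExtra as HOExtra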
    def gather_stage(self, harvest_job):

        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('z3950Harvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # get current objects out of db
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id) \
            .filter(HarvestObject.current == True) \
            .filter(HarvestObject.harvest_source_id == harvest_job.source.id)

        guid_to_package_id = dict((res[0], res[1]) for res in query)
        current_guids = set(guid_to_package_id.keys())
        current_guids_in_harvest = set()

        # Get contents
        try:
            conn = zoom.Connection(source_url,
                                   int(self.source_config.get('port', 210)))
            conn.databaseName = self.source_config.get('database', '')
            conn.preferredRecordSyntax = 'XML'
            conn.elementSetName = 'T'
            query = zoom.Query('CCL', 'metadata')
            res = conn.search(query)
            ids = []
            for num, result in enumerate(res):
                hash = hashlib.md5(result.data).hexdigest()
                if hash in current_guids:
                    current_guids_in_harvest.add(hash)
                else:
                    obj = HarvestObject(
                        job=harvest_job,
                        guid=hash,
                        extras=[
                            HOExtra(key='status', value='new'),
                            HOExtra(key='original_document',
                                    value=result.data.decode('latin-1')),
                            HOExtra(key='original_format', value='fgdc')
                        ])
                    obj.save()
                    ids.append(obj.id)
            for guid in (current_guids - current_guids_in_harvest):
                obj = HarvestObject(
                    job=harvest_job,
                    guid=guid,
                    package_id=guid_to_package_id[guid],
                    extras=[HOExtra(key='status', value='delete')])
                obj.save()
                ids.append(obj.id)
            return ids
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (source_url, e), harvest_job)
            return None
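The pattern above recurs throughout these examples: diff the GUIDs found at the remote source against the GUIDs already in the database to decide what is new, unchanged, or deleted. A standalone sketch of that set arithmetic (names illustrative):

def diff_guids(harvested_guids, current_guids):
    # Split GUIDs into the three buckets the gather stages act on.
    harvested, current = set(harvested_guids), set(current_guids)
    return {
        'new': harvested - current,        # create objects with status 'new'
        'unchanged': harvested & current,  # may become 'change' on re-harvest
        'delete': current - harvested,     # mark for deletion
    }

print(diff_guids(['a', 'b', 'c'], ['b', 'c', 'd']))
# {'new': {'a'}, 'unchanged': {'b', 'c'}, 'delete': {'d'}}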
    def _gather_entry(self, entry, auth=None):
        # Create a harvest object for each entry
        entry_guid = entry['guid']
        log.debug('gathering %s', entry_guid)
        entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '')  # noqa: E501
        entry_restart_date = entry['restart_date']

        package_query = Session.query(Package)
        query_filtered = package_query.filter(Package.name == entry_name)
        package = query_filtered.first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'

            obj = HarvestObject(guid=entry_guid,
                                job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date', value=entry_restart_date)
                                ])

            obj.content = entry['content']
            obj.package = package
            obj.save()
            return obj.id

        elif not package:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. '
                      'Creating a new harvest object.'.format(entry_name))
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value='new'),
                    HOExtra(key='restart_date', value=entry_restart_date)
                ])
            obj.content = entry['content']
            obj.package = None
            obj.save()
            return obj.id
        def create_extras(url, status):
            '''Build the doc_location/status extras for a harvest object.

            :param url: location of the remote document
            :param status: harvest status, e.g. u'new' or u'change'
            '''
            return [HOExtra(key=u'doc_location', value=url),
                    HOExtra(key=u'status', value=status)]
Example #4
    def create_extras(url, date, status):
        extras = [HOExtra(key='waf_modified_date', value=date),
                  HOExtra(key='waf_location', value=url),
                  HOExtra(key='status', value=status)]
        if collection_package_id:
            extras.append(
                HOExtra(key='collection_package_id',
                        value=collection_package_id)
            )
        return extras
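Note that collection_package_id is not a parameter here: create_extras is defined inside gather_stage and closes over it. A minimal standalone illustration of the same closure pattern, using plain tuples instead of HOExtra:

def make_create_extras(collection_package_id):
    # create_extras reads collection_package_id from the enclosing scope.
    def create_extras(url, date, status):
        extras = [('waf_modified_date', date),
                  ('waf_location', url),
                  ('status', status)]
        if collection_package_id:
            extras.append(('collection_package_id', collection_package_id))
        return extras
    return create_extras

create_extras = make_create_extras('parent-collection-id')
print(create_extras('http://example.com/doc.xml', '2015-01-01', 'new'))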
Example #5
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.ITagEnricher.gather')
        log.debug('ITagEnricher gather_stage for job: %r', harvest_job)

        # Save a reference
        self.job = harvest_job

        self._set_source_config(self.job.source.config)

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }

        org_id = model.Package.get(harvest_job.source.id).owner_org
        organization = logic.get_action('organization_show')(context, {
            'id': org_id
        })  # noqa: E501

        # Exclude Sentinel-3 because it seems like iTag can't handle the curved
        # footprints.
        filter_query = '+organization:{} -itag:tagged -FamilyName:Sentinel-3'.format(
            organization['name'])  # noqa: E501

        ids = []

        # We'll limit this to 10 datasets per job so that results appear
        # faster
        start = 0
        rows = self.source_config.get('datasets_per_job', 10)
        untagged = logic.get_action('package_search')(context, {
            'fq': filter_query,
            'rows': rows,
            'start': start
        })
        results = untagged['results']
        for result in results:
            spatial = None
            for i in result['extras']:
                if i['key'] == 'spatial':
                    spatial = i['value']
            if spatial:
                obj = HarvestObject(
                    guid=result['id'],
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value='change'),  # noqa: E501
                        HOExtra(key='spatial', value=spatial),  # noqa: E501
                        HOExtra(key='package', value=json.dumps(result))
                    ])  # noqa: E501
                obj.save()
                ids.append(obj.id)

        return ids
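The job deliberately stops after one page of datasets_per_job results so that tags appear quickly. To process everything in one pass you would page through package_search instead; a sketch, assuming the same context and filter_query as above (note that CKAN caps rows per request, and that tagging between pages shifts the -itag:tagged result window):

def iter_untagged(context, filter_query, page_size=10):
    # Page through package_search until a page comes back empty.
    start = 0
    while True:
        response = logic.get_action('package_search')(context, {
            'fq': filter_query,
            'rows': page_size,
            'start': start,
        })
        if not response['results']:
            return
        for result in response['results']:
            yield result
        start += page_size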
Example #6
    def _gather_object(self, job, url, size, start_date, forecast_date):
        filename = parse_filename(url)
        # Strip version/suffix markers to get a stable identifier.
        filename_id = filename
        for suffix in ('-v02.0-fv02.0', '-fv02.0', '-sv01.00', '-sv05.00',
                       '-v02', '-sv10.00', '-sv09.00', '-sv07.00'):
            filename_id = filename_id.replace(suffix, '')

        status, package = self._was_harvested(filename_id, self.update_all)

        extras = [HOExtra(key='status', value=status)]
        assert start_date
        content = json.dumps(
            {
                'identifier': filename_id,
                'ftp_link': url,
                'size': size,
                'start_date': start_date,
                'forecast_date': forecast_date,
                'restart_date': start_date
            },
            default=str)
        obj = HarvestObject(job=job, guid=url, extras=extras, content=content)
        obj.package = package
        obj.save()
        return obj.id
Example #7
    def fetch_stage(self, harvest_object):

        # Check harvest object status
        status = self._get_object_extra(harvest_object, 'status')

        if status == 'delete':
            # No need to fetch anything, just pass to the import stage
            return True

        # We need to fetch the remote document

        # Get location
        url = self._get_object_extra(harvest_object, 'waf_location')
        if not url:
            self._save_object_error(
                    'No location defined for object {0}'.format(harvest_object.id),
                    harvest_object)
            return False

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            msg = 'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(
                    object=harvest_object,
                    key='original_document',
                    value=content)
            extra.save()

            extra = HOExtra(
                    object=harvest_object,
                    key='original_format',
                    value=document_format)
            extra.save()

        return True
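fetch_stage relies on _get_object_extra to read values back out of a harvest object's extras. In ckanext-spatial-style harvesters it is essentially the following (a minimal sketch):

def _get_object_extra(self, harvest_object, key):
    # Return the value of the first extra whose key matches, or None.
    for extra in harvest_object.extras:
        if extra.key == key:
            return extra.value
    return None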
    def _create_harvest_object(self, content_dict):

        extras = [
            HOExtra(key='status', value='new'),
            HOExtra(key='restart_date', value=content_dict['date_string'])
        ]

        # The NextGEOSS harvester flow requires content in the import stage.
        content = json.dumps(content_dict)

        obj = HarvestObject(job=self.job,
                            guid=content_dict['identifier'],
                            extras=extras,
                            content=content)

        obj.save()

        return obj.id
    def _create_object(self, ebv_type, dataset_info):

        extras = [HOExtra(key='status', value='new')]

        if ebv_type == 'tree_species':
            collectionID = 'TREE_SPECIES_DISTRIBUTION_HABITAT_SUITABILITY'
            collection_name = 'Tree Species Distribution Habitat Suitability'
            collection_description = 'European Distribution of the tree species for the years 2000 (Habitat Suitability baseline), 2020, 2050 and 2080 (Habitat Suitability future), based on different models such as ENS, CCCMA, CSIRO, HADCM3.'  # noqa: E501
        elif ebv_type == 'flood_hazards':
            collectionID = 'FLOOD_HAZARD_EU_GL'
            collection_name = 'Flood Hazard Europe/Global'
            collection_description = 'The maps depict flood prone areas at global/european scale for flood events. Resolution is 30 arcseconds (approx. 1km). Cell values indicate water depth (in m). The map can be used to assess flood exposure and risk of population and assets. NOTE: this dataset is based on JRC elaborations and is not an official flood hazard map.'  # noqa: E501
        (title, description, start_date, end_date, spatial, filename,
         identifier, download_url, tags) = dataset_info

        content = json.dumps(
            {
                'collectionID': collectionID,
                'title': title,
                'description': description,
                'start_date': start_date,
                'end_date': end_date,  # noqa: E501
                'identifier': identifier,
                'downloadURL': download_url,  # noqa: E501
                'spatial': spatial,
                'filename': filename,
                'collection_name': collection_name,
                'collection_description': collection_description,
                'tags': tags
            },
            default=str)

        obj = HarvestObject(job=self.job,
                            guid=unicode(uuid.uuid4()),
                            extras=extras,
                            content=content)

        obj.save()

        return obj.id
    def _create_object(self, sensor, dataset_info):

        extras = [HOExtra(key='status', value='new')]

        if sensor == 'avhrr':
            collectionID = 'LAI_1KM_AVHRR_8DAYS_GL'
        elif sensor == 'modis':
            collectionID = 'LAI_1KM_MODIS_8DAYS_GL'
        (title, description, start_date, end_date, spatial, filename,
         identifier, downloadURL, thumbnailURL, metadataURL,
         tags) = dataset_info

        content = json.dumps(
            {
                'collectionID': collectionID,
                'title': title,
                'description': description,
                'start_date': start_date,
                'end_date': end_date,  # noqa: E501
                'identifier': identifier,
                'downloadURL': downloadURL,
                'thumbnailURL': thumbnailURL,
                'metadataURL': metadataURL,  # noqa: E501
                'spatial': spatial,
                'filename': filename,
                'tags': tags
            },
            default=str)

        obj = HarvestObject(job=self.job,
                            guid=unicode(uuid.uuid4()),
                            extras=extras,
                            content=content)

        obj.save()

        return obj.id
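Both _create_object variants unpack dataset_info by position, which is easy to get wrong as fields are added. A sketch of a named alternative (not from the original code):

from typing import List, NamedTuple

class DatasetInfo(NamedTuple):
    title: str
    description: str
    start_date: str
    end_date: str
    spatial: str
    filename: str
    identifier: str
    download_url: str
    thumbnail_url: str
    metadata_url: str
    tags: List[str]

info = DatasetInfo('LAI 1km 8-daily', 'Leaf area index', '2020-01-01',
                   '2020-01-09', '{"type": "Polygon"}', 'lai.nc',
                   'LAI_20200101', 'http://example.com/lai.nc',
                   'http://example.com/thumb.png',
                   'http://example.com/meta.xml', ['lai', 'modis'])
print(info.identifier)  # fields by name instead of dataset_info[6]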
Example #11
    def _gather_object(self, job, url, start_date):
        filename = parse_filename(url)
        filename_id = filename

        status, package = self._was_harvested(filename_id, self.update_all)

        extras = [HOExtra(key='status', value=status)]
        assert start_date
        content = json.dumps({
            'identifier': filename_id,
            'http_link': url,
            'start_date': start_date,
            'restart_date': start_date
        }, default=str
        )
        obj = HarvestObject(job=job,
                            guid=url,
                            extras=extras,
                            content=content)
        obj.package = package
        obj.save()
        return obj.id
Example #12
    def _gather_object(self, job, product, resources, manifest_content,
                       last_harvest_date):
        name = parse_filename(product).lower()

        status, package = self._was_harvested(name, self.update_all)

        extras = [HOExtra(key='status', value=status)]

        content = json.dumps(
            {
                'name': name,
                'restart_date': last_harvest_date.strftime('%Y-%m-%d'),
                'manifest_content': manifest_content,
                'resources': resources
            },
            default=str)

        obj = HarvestObject(job=job,
                            guid=unicode(uuid.uuid4()),
                            extras=extras,
                            content=content)
        obj.package = package
        obj.save()
        return obj.id
Example #13
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.AKANA.gather')
        log.info('Akana gather_stage for job: %r', harvest_job)

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }
        # Get the guids and package ids of the current harvest objects
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id). \
            filter(HarvestObject.current == True). \
            filter(HarvestObject.harvest_source_id == harvest_job.source.id)
        guid_to_package_id = {}

        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = guid_to_package_id.keys()

        # Get the Akana APIs' contents: request the objects from Akana
        # based on a tag search
        url = harvest_job.source.url
        pa = PingAuth(environment=pingi_env)

        resp = pa.get(url)

        if resp.status_code == 200:
            resp_dict = json.loads(resp.content)
            try:
                ids = []
                obid = []
                for api in resp_dict:
                    uuid = api['api-id'] + api['swagger']['info'][
                        'version'] + harvest_job.source_id
                    ids.append(uuid)
                    json_api = json.dumps(api)

                    if uuid in guids_in_db:
                        log.info(
                            "This package is already in ckan and is going to be updated: %r",
                            uuid)
                        status = "update"
                    else:
                        log.info("This package is being created: %r", uuid)
                        status = "new"

                    obj = HarvestObject(
                        guid=uuid,
                        job=harvest_job,
                        extras=[HOExtra(key='status', value=status)],
                        content=json_api)
                    obj.save()
                    obid.append(obj.id)

                obj_del = list(set(guids_in_db) - set(ids))

                if obj_del:
                    for uuid in obj_del:
                        log.info("This package is being deleted: %r", uuid)
                        obj = HarvestObject(
                            guid=uuid,
                            job=harvest_job,
                            extras=[HOExtra(key='status', value="delete")],
                            content='')
                        model.Session.query(HarvestObject). \
                            filter_by(guid=uuid). \
                            update({'current': False}, False)
                        obj.save()
                        obid.append(obj.id)

                # need to return the list of ID's here that are created above
                return obid
            except Exception as e:
                log.error('Exception: %s' % e)
                self._save_gather_error(
                    'Error gathering the identifiers from the AKANA server [%s]'
                    % str(e), harvest_job)
                return None
        else:
            self._save_gather_error(
                'AKANA server returned status code %s for %s' %
                (resp.status_code, url), harvest_job)
            return None
Example #14
class ArcGISHarvester(SpatialHarvester, SingletonPlugin):

    implements(IHarvester)

    extent_template = Template('''
       {"type": "Polygon", "coordinates": [[[$minx, $miny], [$minx, $maxy], [$maxx, $maxy], [$maxx, $miny], [$minx, $miny]]]}
    ''')

    def info(self):
        '''
        Harvesting implementations must provide this method, which will return a
        dictionary containing different descriptors of the harvester. The
        returned dictionary should contain:

        * name: machine-readable name. This will be the value stored in the
          database, and the one used by ckanext-harvest to call the appropriate
          harvester.
        * title: human-readable name. This will appear in the form's select box
          in the WUI.
        * description: a small description of what the harvester does. This will
          appear on the form as a guidance to the user.

        A complete example may be::

            {
                'name': 'csw',
                'title': 'CSW Server',
                'description': "A server that implements OGC's Catalog Service
                                for the Web (CSW) standard"
            }

        returns: A dictionary with the harvester descriptors
        '''
        return {
            'name': 'arcgis',
            'title': 'ArcGIS REST API',
            'description': 'An ArcGIS REST API endpoint'
        }

    def extra_schema(self):
        return {
            'private_datasets': [ignore_empty, boolean_validator],
            'extra_search_criteria': [ignore_empty, unicode],
        }

    def gather_stage(self, harvest_job):

        self.harvest_job = harvest_job
        source_url = harvest_job.source.url
        source_config = json.loads(harvest_job.source.config or '{}')
        extra_search_criteria = source_config.get('extra_search_criteria')

        num = 100

        modified_from = 0
        modified_to = 999999999999999999

        query_template = 'modified:[{modified_from}+TO+{modified_to}]'

        if extra_search_criteria:
            query_template = query_template + ' AND (%s)' % extra_search_criteria

        #accountid:0123456789ABCDEF

        query = query_template.format(
            modified_from=str(modified_from).rjust(18, '0'),
            modified_to=str(modified_to).rjust(18, '0'),
        )

        start = 0

        new_metadata = {}

        while start != -1:
            search_path = 'sharing/search?f=pjson&q={query}&num={num}&start={start}'.format(
                query=query,
                num=num,
                start=start,
            )
            url = urlparse.urljoin(source_url, search_path)

            try:
                r = requests.get(url)
                r.raise_for_status()
            except requests.exceptions.RequestException as e:
                self._save_gather_error('Unable to get content for URL: %s: %r' %
                                        (url, e), harvest_job)
                return None

            results = r.json()

            for result in results['results']:
                if result['type'] not in TYPES:
                    continue
                new_metadata[result['id']] = result
            start = results['nextStart']

        existing_guids = dict()
        query = model.Session.query(HarvestObject.guid, HOExtra.value) \
            .filter(HarvestObject.current == True) \
            .join(HOExtra, HarvestObject.extras) \
            .filter(HOExtra.key == 'arcgis_modified_date') \
            .filter(HarvestObject.harvest_source_id == harvest_job.source.id)

        for (guid, value) in query:
            existing_guids[guid] = value

        new = set(new_metadata) - set(existing_guids)

        harvest_objects = []

        for guid in new:
            date = str(new_metadata[guid]['modified'])
            obj = HarvestObject(job=harvest_job,
                                content=json.dumps(new_metadata[guid]),
                                extras=[
                                    HOExtra(key='arcgis_modified_date',
                                            value=date),
                                    HOExtra(key='format', value='arcgis_json'),
                                    HOExtra(key='status', value='new')
                                ],
                                guid=guid)
            obj.save()
            harvest_objects.append(obj.id)

        deleted = set(existing_guids) - set(new_metadata)

        for guid in deleted:
            obj = HarvestObject(job=harvest_job,
                                extras=[HOExtra(key='status', value='delete')],
                                guid=guid)
            obj.save()
            harvest_objects.append(obj.id)

        changed = set(existing_guids) & set(new_metadata)

        for guid in changed:
            date = str(new_metadata[guid]['modified'])
            if date == existing_guids[guid]:
                continue
            obj = HarvestObject(job=harvest_job,
                                content=json.dumps(new_metadata[guid]),
                                extras=[
                                    HOExtra(key='arcgis_modified_date',
                                            value=date),
                                    HOExtra(key='format', value='arcgis_json'),
                                    HOExtra(key='status', value='changed')
                                ],
                                guid=guid)
            obj.save()
            harvest_objects.append(obj.id)

        return harvest_objects
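A note on the 18-digit padding above: the portal's 'modified' field appears to be an epoch-milliseconds value, and both bounds are zero-padded to a fixed width, presumably so the range query still orders correctly if the bounds are compared as text:

modified_from = 0
modified_to = 999999999999999999

print(str(modified_from).rjust(18, '0'))  # 000000000000000000
print(str(modified_to).rjust(18, '0'))    # 999999999999999999
# Fixed-width strings preserve numeric order under lexicographic comparison.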
Example #15
                        existing_object.metadata_modified_date.date() < dataset_last_modified:
                    status = 'changed'
                else:
                    log.debug('Dataset unchanged: %s this="%s" previous="%s"',
                              dataset['title'], dataset_last_modified,
                              existing_object.metadata_modified_date)
                    continue
            else:
                status = 'new'
            obj = HarvestObject(
                guid=guid,
                job=harvest_job,
                content=doc.serialize_node(dataset_node),
                harvest_source_reference=guid,
                metadata_modified_date=dataset_last_modified,
                extras=[HOExtra(key='status', value=status)],
            )
            obj.save()
            ids.append(obj.id)

        return ids

    def fetch_stage(self, harvest_object):
        '''
        Check that we have content from the gather stage and just return
        success
        :returns: True if everything went right, False if errors were found
        '''
        # There is no fetching because all the content for the objects were got
        # in one request during the gather stage.
        return bool(harvest_object.content)
Example #16
    def gather_stage(self, harvest_job):
        self.log = logging.getLogger(__file__)
        self.log.debug('SatcenBetter Harvester gather_stage for job: %r',
                       harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        self.update_all = self.source_config.get('update_all', False)
        interface = INTERFACE(self.source_config, COLLECTION)

        last_product_index = (self._get_last_harvesting_index(
            harvest_job.source_id, interface))
        interface.update_index(last_product_index)
        interface.build_url()

        log.debug('URL: {}'.format(interface.current_url))  # noqa: E501

        ids = []
        try:
            results = interface.get_results()
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)  # noqa: E501
            return ids
        if not isinstance(results, list):
            self._save_gather_error('{} error: {}'.format(
                results['status_code'], results['message']), self.job)
            return ids

        for entry in results:
            name_path = interface.get_name_path()

            name_url = get_field(entry,
                                 name_path['relative_location'].split(","),
                                 name_path['fixed_attributes'])
            entry_name = parse_name(name_url).lower()
            entry_guid = unicode(uuid.uuid4())
            package_query = Session.query(Package)
            query_filtered = package_query.filter(Package.name == entry_name)
            package = query_filtered.first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'

                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

            elif not package:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. '
                          'Creating a new harvest object.'.format(entry_name))
                status = 'new'
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key=interface.get_pagination_mechanism(),
                            value=interface.get_index())
                ])
            obj.content = json.dumps(entry)
            obj.package = None if status == 'new' else package
            obj.save()
            interface.increment_index()
            ids.append(obj.id)
        return ids
Example #17
class GeoDataGovGeoportalHarvester(CSWHarvester, GeoDataGovHarvester):
    '''
    A Harvester for CSW servers, with customizations for geo.data.gov
    '''
    def info(self):
        return {
            'name': 'geoportal',
            'title': 'Geoportal Server',
            'description': 'A Geoportal Server CSW endpoint',
        }

    def output_schema(self):
        return 'csw'

    def fetch_stage(self, harvest_object):

        log = logging.getLogger(__name__ + '.geoportal.fetch')
        log.debug('CswHarvester fetch_stage for object: %s', harvest_object.id)

        url = harvest_object.source.url

        identifier = harvest_object.guid

        parts = urlparse.urlparse(url)
        url = urlparse.urlunparse((parts.scheme, parts.netloc, '/'.join(
            parts.path.rstrip('/').split('/')[:-2]), None, None, None))
        url = url.rstrip('/') + '/rest/document?id=%s' % identifier
        try:
            response = requests.get(url)
            content = response.content
        except Exception as e:
            self._save_object_error(
                'Error getting the record with GUID %s from %s: %r' %
                (identifier, url, e), harvest_object)
            return False

        try:
            # Save the fetch contents in the HarvestObject
            # Contents come from csw_client already declared and encoded as utf-8
            # Remove original XML declaration
            content = re.sub(r'<\?xml(.*)\?>', '', content)

            document_format = guess_standard(content)
            if document_format == 'iso':
                harvest_object.content = content
                harvest_object.save()
            elif document_format == 'fgdc':
                extra = HOExtra(object=harvest_object,
                                key='original_document',
                                value=content)
                extra.save()

                extra = HOExtra(object=harvest_object,
                                key='original_format',
                                value=document_format)
                extra.save()
            else:
                harvest_object.report_status = 'ignored'
                harvest_object.save()
                return False
        except Exception as e:
            self._save_object_error('Error saving the harvest object for GUID %s [%r]' %
                                    (identifier, e), harvest_object)
            return False

        return True
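A worked example of the URL rewriting in this fetch_stage: the last two segments of the CSW endpoint path are dropped, then '/rest/document?id=...' is appended (hypothetical endpoint and identifier):

import urlparse  # Python 2; on Python 3 use: from urllib import parse as urlparse

url = 'http://example.com/geoportal/csw/discovery'
identifier = 'abc123'

parts = urlparse.urlparse(url)
base_path = '/'.join(parts.path.rstrip('/').split('/')[:-2])
url = urlparse.urlunparse((parts.scheme, parts.netloc, base_path,
                           None, None, None))
url = url.rstrip('/') + '/rest/document?id=%s' % identifier
print(url)  # http://example.com/geoportal/rest/document?id=abc123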
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.gather')
        log.debug('%s gather_stage for job: %r', self.harvester_name(),
                  harvest_job)
        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        try:
            index = self.create_index(url)
            log.debug(f'Index created for {self.harvester_name()}')
        except Exception as e:
            self._save_gather_error(
                'Error harvesting %s: %s' % (self.harvester_name(), e),
                harvest_job)
            log.warning(f"Error while creating index: {e}")
            return None


        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id) \
            .filter(HarvestObject.current == True) \
            .filter(HarvestObject.harvest_source_id == harvest_job.source.id)
        guid_to_package_id = {}

        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = set(guid_to_package_id.keys())

        #log.debug('Starting gathering for %s' % url)
        guids_in_harvest = set(index.keys())

        new = guids_in_harvest - guids_in_db
        delete = guids_in_db - guids_in_harvest
        change = guids_in_db & guids_in_harvest

        ids = []
        for guid in new:
            doc = index.get_as_string(guid)
            obj = HarvestObject(guid=guid,
                                job=harvest_job,
                                content=doc,
                                extras=[HOExtra(key='status', value='new')])
            obj.save()
            ids.append(obj.id)

        for guid in change:
            doc = index.get_as_string(guid)
            obj = HarvestObject(guid=guid,
                                job=harvest_job,
                                content=doc,
                                package_id=guid_to_package_id[guid],
                                extras=[HOExtra(key='status', value='change')])
            obj.save()
            ids.append(obj.id)

        for guid in delete:
            obj = HarvestObject(guid=guid,
                                job=harvest_job,
                                package_id=guid_to_package_id[guid],
                                extras=[HOExtra(key='status', value='delete')])
            ids.append(obj.id)
            model.Session.query(HarvestObject).\
                  filter_by(guid=guid).\
                  update({'current': False}, False)
            obj.save()

        if len(ids) == 0:
            self._save_gather_error(
                'No records received from the %s service' %
                self.harvester_name(), harvest_job)
            return None

        return ids
Example #19
    def gather_stage(self, harvest_job):
        requests_cache.install_cache()
        requests_cache.clear()

        session = requests_cache.CachedSession()

        self.log = logging.getLogger(__file__)
        self.log.debug('OSCAR Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        base_url = self.source_config.get('oai_pmh_url')
        metadata_prefix = self.source_config.get('metadata_prefix')
        start_date = self.source_config.get('start_date', None)
        self.update_all = self.source_config.get('update_all', False)

        last_token = self._get_last_harvesting_index(self.job.source_id,
                                                     'last_token')
        next_token = self._get_last_harvesting_index(self.job.source_id,
                                                     'next_token')
        next_station = self._get_last_harvesting_index(self.job.source_id,
                                                       'next_station')
        restart_date = self._get_last_harvesting_index(self.job.source_id,
                                                       'restart_date')
        restart_date = restart_date if last_token else None

        ids = []
        first_query = True
        while (ids == [] and next_token) or first_query:
            first_query = False

            current_token = last_token if next_station else next_token
            if current_token:
                query_url = "{}?verb=ListIdentifiers&resumptionToken={}".format(
                    base_url, current_token)
            elif restart_date:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                    base_url, metadata_prefix, restart_date)
            elif start_date:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                    base_url, metadata_prefix, start_date)
            else:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}".format(
                    base_url, metadata_prefix)

            self.log.debug('Querying: {}.'.format(query_url))
            raw_list_ids = self.get_list_identifiers(session, query_url)

            list_stations, largest_datastamp = self.get_station_ids(
                raw_list_ids)

            next_token = self.get_resumption_token(raw_list_ids)
            last_token = current_token
            restart_date = restart_date if restart_date else ''
            restart_date = largest_datastamp if largest_datastamp > restart_date else restart_date

            if list_stations == []:
                next_station = None
            else:
                valid_deployment = None
                station_index = 0
                while not valid_deployment and station_index <= len(
                        list_stations) - 1:
                    station = list_stations[station_index]
                    next_station = None if (next_station
                                            == station) else next_station
                    if not next_station:
                        station_query = '{}?verb=GetRecord&metadataPrefix={}&identifier={}'.format(
                            base_url, metadata_prefix, station)
                        self.log.debug('Querying station: {}.'.format(station))
                        record = self.get_record(session, station_query)
                        if record:
                            station_info = StationInfo(record)
                            if station_info.isValid():
                                station_info.id = station
                                observation_list = station_info.get_observations(
                                )
                                station_dict = station_info.get_dict()
                                station_info = None
                                for observation in observation_list:
                                    observation_info = ObservationInfo(
                                        session, observation)
                                    deployments_list = observation_info.get_deployments(
                                    )
                                    observation_dict = observation_info.get_dict(
                                    )
                                    observation_info = None
                                    for deployment in deployments_list:
                                        deployment_info = DeploymentInfo(
                                            session, deployment)
                                        if deployment_info.isValid():
                                            deployment_dict = deployment_info.get_dict(
                                            )
                                            deployment_info = None
                                            valid_deployment = True
                                            if station_index + 1 <= len(
                                                    list_stations) - 1:
                                                next_station = list_stations[
                                                    station_index + 1]
                                            else:
                                                next_station = None
                                            entry_guid = unicode(uuid.uuid4())
                                            entry_id = '{}_{}'.format(
                                                station_dict['id'],
                                                deployment_dict['id'])
                                            entry_name = clean_snakecase(
                                                entry_id)
                                            self.log.debug(
                                                'Gathering %s', entry_name)

                                            content = {}
                                            content['station'] = station_dict
                                            content[
                                                'observation'] = observation_dict
                                            content[
                                                'deployment'] = deployment_dict

                                            package_query = Session.query(
                                                Package)
                                            query_filtered = package_query.filter(
                                                Package.name == entry_name)
                                            package = query_filtered.first()

                                            if package:
                                                # Meaning we've previously harvested this,
                                                # but we may want to reharvest it now.
                                                previous_obj = Session.query(HarvestObject) \
                                                    .filter(HarvestObject.guid == entry_guid) \
                                                    .filter(HarvestObject.current == True) \
                                                    .first()  # noqa: E712
                                                if previous_obj:
                                                    previous_obj.current = False
                                                    previous_obj.save()

                                                if self.update_all:
                                                    self.log.debug(
                                                        '{} already exists and will be updated.'
                                                        .format(entry_name)
                                                    )  # noqa: E501
                                                    status = 'change'

                                                else:
                                                    self.log.debug(
                                                        '{} will not be updated.'
                                                        .format(entry_name)
                                                    )  # noqa: E501
                                                    status = 'unchanged'

                                            elif not package:
                                                # It's a product we haven't harvested before.
                                                self.log.debug(
                                                    '{} has not been harvested before. Creating a new harvest object.'
                                                    .  # noqa: E501
                                                    format(entry_name
                                                           ))  # noqa: E501
                                                status = 'new'
                                            obj = HarvestObject(
                                                guid=entry_guid,
                                                job=self.job,
                                                extras=[
                                                    HOExtra(key='status',
                                                            value=status),
                                                    HOExtra(key='last_token',
                                                            value=last_token),
                                                    HOExtra(key='next_token',
                                                            value=next_token),
                                                    HOExtra(
                                                        key='next_station',
                                                        value=next_station),
                                                    HOExtra(key='restart_date',
                                                            value=restart_date)
                                                ])
                                            obj.content = json.dumps(content)
                                            obj.package = None if status == 'new' else package
                                            obj.save()
                                            ids.append(obj.id)

                                if not valid_deployment:
                                    self.log.debug(
                                        'Station {} does not have valid deployments.'
                                        .format(station))
                            else:
                                self.log.debug(
                                    'Station {} is not valid.'.format(station))
                    station_index += 1
        return ids
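The token juggling above implements OAI-PMH ListIdentifiers paging: each response may carry a resumptionToken that fetches the next page. Stripped of the station/deployment handling, the core loop looks roughly like this (a sketch; token extraction via regex rather than a real XML parser):

import re
import requests

def list_identifier_pages(base_url, metadata_prefix):
    # Yield each ListIdentifiers response, following resumption tokens.
    token = None
    while True:
        if token:
            url = '{}?verb=ListIdentifiers&resumptionToken={}'.format(
                base_url, token)
        else:
            url = '{}?verb=ListIdentifiers&metadataPrefix={}'.format(
                base_url, metadata_prefix)
        text = requests.get(url, timeout=60).text
        yield text
        match = re.search(
            r'<resumptionToken[^>]*>([^<]+)</resumptionToken>', text)
        if not match:
            break  # no token: this was the last page
        token = match.group(1)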
Example #20
    def create_extras(url, status):
        return [
            HOExtra(key='doc_location', value=url),
            HOExtra(key='status', value=status)
        ]
class DocHarvester(SpatialHarvester, SingletonPlugin):
    '''A Harvester for individual spatial metadata documents
    TODO: Move to new logic


    '''

    implements(IHarvester)

    def info(self):
        '''Return the descriptor dictionary for this harvester.'''
        return {
            u'name': u'single-doc',
            u'title': u'Single spatial metadata document',
            u'description': u'A single spatial metadata document'
            }

    def get_original_url(self, harvest_object_id):
        '''Return the source URL for the given harvest object.

        :param harvest_object_id: id of the harvest object
        '''
        obj = model.Session.query(HarvestObject).filter(
            HarvestObject.id == harvest_object_id).first()
        if not obj:
            return None

        return obj.source.url

    def gather_stage(self, harvest_job):
        '''Create a single harvest object for the source document.

        :param harvest_job: the current harvest job
        '''
        log = logging.getLogger(__name__ + u'.individual.gather')
        log.debug(u'DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            self._save_gather_error(u'Unable to get content for URL: %s: %r' % (url, e),
                                    harvest_job)
            return None

        existing_object = model.Session.query(HarvestObject.guid,
                                              HarvestObject.package_id).filter(
            HarvestObject.current == True).filter(  # noqa: E712
            HarvestObject.harvest_source_id == harvest_job.source.id).first()

        def create_extras(url, status):
            '''Build the doc_location/status extras for the harvest object.

            :param url: location of the remote document
            :param status: harvest status, u'new' or u'change'
            '''
            return [HOExtra(key=u'doc_location', value=url),
                    HOExtra(key=u'status', value=status)]

        if not existing_object:
            guid = hashlib.md5(url.encode(u'utf8', u'ignore')).hexdigest()
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url,
                                                                u'new'),
                                           guid=guid
                                           )
        else:
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url,
                                                                u'change'),
                                           guid=existing_object.guid,
                                           package_id=existing_object.package_id
                                           )

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == u'iso':
            harvest_object.content = content
        else:
            extra = HOExtra(
                object=harvest_object,
                key=u'original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key=u'original_format',
                value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]
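The single-doc harvester derives a stable guid from the URL itself, so re-harvesting the same source finds the earlier object instead of creating a duplicate. The derivation in isolation:

import hashlib

url = u'http://example.com/metadata/dataset.xml'
guid = hashlib.md5(url.encode(u'utf8', u'ignore')).hexdigest()
print(guid)  # the same URL always maps to the same guid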
    def gather_stage(self, harvest_job):
        self.log = logging.getLogger(__file__)
        self.log.debug('SCENT Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        max_dataset = self.source_config.get('max_dataset', 100)
        wfs_url = self.source_config.get('wfs_url')
        wfs_version = self.source_config.get('wfs_version')
        collection = self.source_config.get('collection')
        typename = COLLECTION[collection].get('collection_typename')
        tag_typename = COLLECTION[collection].get('tag_typename', None)
        self.update_all = self.source_config.get('update_all', False)

        last_product_index = (
            self._get_last_harvesting_index(harvest_job.source_id)
        )

        if last_product_index is not None:
            last_product_index += 1
        else:
            last_product_index = 0

        wfs = WFS(url=wfs_url, version=wfs_version)

        wfs.set_collection(typename)
        sortby = ['When']

        result = wfs.make_request(max_dataset, sortby, last_product_index)
        entries = result['features']
        name = '{}_{}'.format(collection.lower(), '{}')
        ids = []
        for entry in entries:
            entry_guid = unicode(uuid.uuid4())
            entry_name = name.format(convert_to_clean_snakecase(entry['id']))
            log.debug('gathering %s', entry_name)

            content = {}
            content['collection_content'] = entry
            if tag_typename:
                wfs.set_collection(tag_typename)
                filterxml = wfs.set_filter_equal_to('image_id', entry['id'])
                result = wfs.get_request(constraint=filterxml)
                content['tag_url'] = result

            package_query = Session.query(Package)
            query_filtered = package_query.filter(Package.name == entry_name)
            package = query_filtered.first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'

                else:
                    log.debug(
                        '{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

            elif not package:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. '
                          'Creating a new harvest object.'.format(entry_name))
                status = 'new'
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key='index', value=last_product_index)
                ])
            obj.content = json.dumps(content)
            obj.package = None if status == 'new' else package
            obj.save()
            last_product_index += 1
            ids.append(obj.id)
        return ids
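The SCENT harvester resumes WFS paging from an index stored on the previous job's harvest objects. A stored index of 0 is a real, resumable position, which is why the check above compares against None rather than relying on truthiness; the same logic in isolation:

def next_start_index(last_product_index):
    # None means nothing was harvested yet; 0 is a valid stored index.
    if last_product_index is None:
        return 0
    return last_product_index + 1

assert next_start_index(None) == 0
assert next_start_index(0) == 1
assert next_start_index(41) == 42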
Example #23
class WAFCollectionHarvester(GeoDataGovWAFHarvester):
    def info(self):
        return {
            'name':
            'waf-collection',
            'title':
            'Web Accessible Folder (WAF) Homogeneous Collection',
            'description':
            'A Web Accessible Folder (WAF) displaying a list of spatial metadata documents with a collection record'
        }

    def extra_schema(self):
        extra_schema = super(WAFCollectionHarvester, self).extra_schema()
        extra_schema['collection_metadata_url'] = [not_empty, unicode]
        return extra_schema

    def get_package_dict(self, iso_values, harvest_object):

        package_dict = super(WAFCollectionHarvester,
                             self).get_package_dict(iso_values, harvest_object)
        if not package_dict:
            return None

        collection_package_id = self._get_object_extra(
            harvest_object, 'collection_package_id')
        if collection_package_id:
            package_dict['extras'].append(
                dict(key='collection_package_id', value=collection_package_id))

        collection_metadata = self._get_object_extra(harvest_object,
                                                     'collection_metadata')
        if collection_metadata:
            package_dict['extras'].append(
                dict(key='collection_metadata', value=collection_metadata))
            status = self._get_object_extra(harvest_object, 'status')
            if status == 'change':
                self.force_import = True
            else:
                self.force_import = False

        return package_dict

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('WafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        collection_metadata_url = self.source_config.get(
            'collection_metadata_url')

        if not collection_metadata_url:
            self._save_gather_error(
                'collection_metadata_url is not set in the source config',
                harvest_job)
            return None

        try:
            response = requests.get(source_url, timeout=60)
            content = response.content
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (source_url, e), harvest_job)
            return None

        guid = hashlib.md5(collection_metadata_url.encode(
            'utf8', 'ignore')).hexdigest()

        existing_harvest_object = model.Session.\
            query(HarvestObject.guid, HarvestObject.package_id, HOExtra.value).\
            join(HOExtra, HarvestObject.extras).\
            filter(HOExtra.key == 'collection_metadata').\
            filter(HOExtra.value == 'true').\
            filter(HarvestObject.current == True).\
            filter(HarvestObject.harvest_source_id == harvest_job.source.id).first()

        if existing_harvest_object:
            status = 'change'
            guid = existing_harvest_object.guid
            package_id = existing_harvest_object.package_id
        else:
            status, package_id = 'new', None

        obj = HarvestObject(job=harvest_job,
                            extras=[
                                HOExtra(key='collection_metadata',
                                        value='true'),
                                HOExtra(key='waf_location',
                                        value=collection_metadata_url),
                                HOExtra(key='status', value=status)
                            ],
                            guid=guid,
                            status=status,
                            package_id=package_id)
        queue.fetch_and_import_stages(self, obj)
        if obj.state == 'ERROR':
            self._save_gather_error(
                'Collection object failed to harvest, not harvesting',
                harvest_job)
            return None

        return GeoDataGovWAFHarvester.gather_stage(
            self, harvest_job, collection_package_id=obj.package_id)
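
# The collection GUID used above is just the md5 of the configured URL, so
# re-running the harvester reproduces the same GUID and the
# `existing_harvest_object` query can find the earlier record. A standalone
# sketch of that derivation (`collection_guid` is a hypothetical name, not
# from the code above):
import hashlib

def collection_guid(collection_metadata_url):
    # Deterministic: the same URL always hashes to the same hex digest.
    return hashlib.md5(
        collection_metadata_url.encode('utf8', 'ignore')).hexdigest()
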
    def _gather_entry(self, entry, path, row, update_all=False):
        # Create a harvest object for each entry
        entry_guid = unicode(uuid.uuid4())
        entry_name = entry.lower()
        log.debug('gathering %s', entry)

        package_query = Session.query(Package)
        query_filtered = package_query.filter(Package.name == entry_name)
        package = query_filtered.first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'change'
                obj = HarvestObject(
                    guid=entry_guid,
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value=status),
                        HOExtra(key='path', value=path),
                        HOExtra(key='row', value=row)
                    ])
                obj.content = entry
                obj.package = package
                obj.save()
                return obj.id

            else:
                log.debug(
                    '{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'
                obj = HarvestObject(
                    guid=entry_guid,
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value=status),
                        HOExtra(key='path', value=path),
                        HOExtra(key='row', value=row)
                    ])
                obj.content = entry
                obj.package = package
                obj.save()
                return obj.id

        elif not package:
            # It's a product we haven't harvested before.
            log.debug(
                '{} has not been harvested before. Creating a new harvest object.'  # noqa: E501
                .format(entry_name))
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value='new'),
                    HOExtra(key='path', value=path),
                    HOExtra(key='row', value=row)
                ])
            obj.content = entry
            obj.package = None
            obj.save()
            return obj.id
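
    # `_gather_entry` returns one harvest-object id per WAF entry; a calling
    # gather stage would typically collect those ids in a loop. A
    # hypothetical sketch (`gather_ids` and the `entries` iterable of
    # (entry, path, row) tuples are assumptions, not from the code above):
    def gather_ids(self, entries, update_all=False):
        ids = []
        for entry, path, row in entries:
            obj_id = self._gather_entry(entry, path, row,
                                        update_all=update_all)
            if obj_id:
                ids.append(obj_id)
        return ids
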
    def gather_stage(self, harvest_job):
        logger.debug('CswHarvester gather_stage for job: %r', harvest_job)
        # Get source URL
        url = harvest_job.source.url
        self._set_source_config(harvest_job.source.config)

        parts = urlparse.urlparse(url)

        params = {'keywords__slug__in': self.keywords, 'limit': 10000}

        url = urlparse.urlunparse((
            parts.scheme, parts.netloc, '/api/layers', None,
            urllib.urlencode(params, True), None
        ))

        query = model.Session.query(
            HarvestObject.guid, HarvestObject.package_id
        ).filter(HarvestObject.current == True).filter(
            HarvestObject.harvest_source_id == harvest_job.source.id
        )
        guid_to_package_id = {}

        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = set(guid_to_package_id.keys())

        logger.debug('Starting gathering for %s' % url)
        guids_in_harvest = set()
        try:
            for obj in requests.get(url).json()['objects']:
                try:
                    uuid = obj['uuid']
                    logger.info('Got identifier %s from the PacGeo', uuid)
                    guids_in_harvest.add(uuid)
                except Exception as e:
                    self._save_gather_error(
                        'Error for the identifier from <%r>: %s' % (obj, e),
                        harvest_job
                    )
                    continue

        except Exception as e:
            logger.error('Exception: %s', e)
            self._save_gather_error(
                'Error gathering the identifiers from the PacGeo server [%s]' %
                str(e), harvest_job
            )
            return None

        new = guids_in_harvest - guids_in_db
        delete = guids_in_db - guids_in_harvest
        change = guids_in_db & guids_in_harvest

        ids = []
        for guid in new:
            obj = HarvestObject(
                guid=guid,
                job=harvest_job,
                extras=[HOExtra(key='status', value='new')]
            )
            obj.save()
            ids.append(obj.id)
        for guid in change:
            obj = HarvestObject(
                guid=guid,
                job=harvest_job,
                package_id=guid_to_package_id[guid],
                extras=[HOExtra(key='status', value='change')]
            )
            obj.save()
            ids.append(obj.id)
        for guid in delete:
            obj = HarvestObject(
                guid=guid,
                job=harvest_job,
                package_id=guid_to_package_id[guid],
                extras=[HOExtra(key='status', value='delete')]
            )
            model.Session.query(HarvestObject).filter_by(guid=guid).update({
                'current': False
            }, False)
            obj.save()
            ids.append(obj.id)

        if len(ids) == 0:
            self._save_gather_error(
                'No records received from the PacGeo server', harvest_job
            )
            return None

        return ids
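
# The new/change/delete bookkeeping above is plain set arithmetic over GUIDs.
# A self-contained illustration with toy data:
guids_in_db = {'a', 'b', 'c'}        # GUIDs already harvested
guids_in_harvest = {'b', 'c', 'd'}   # GUIDs the remote server reports now

new = guids_in_harvest - guids_in_db      # {'d'} -> status 'new'
delete = guids_in_db - guids_in_harvest   # {'a'} -> status 'delete'
change = guids_in_db & guids_in_harvest   # {'b', 'c'} -> status 'change'
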
Example #26
0
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (url, e), harvest_job)
            return None

        existing_object = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
                                    filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id).\
                                    first()

        def create_extras(url, status):
            return [
                HOExtra(key='doc_location', value=url),
                HOExtra(key='status', value=status)
            ]

        if not existing_object:
            guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url, 'new'),
                                           guid=guid)
        else:
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'change'),
                guid=existing_object.guid,
                package_id=existing_object.package_id)

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
        else:
            extra = HOExtra(object=harvest_object,
                            key='original_document',
                            value=content)
            extra.save()

            extra = HOExtra(object=harvest_object,
                            key='original_format',
                            value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]
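
# `guess_standard` sniffs the harvested document to decide whether it is
# already ISO XML or needs conversion from another standard. A simplified
# sketch of the idea (the real function in ckanext-spatial matches more
# carefully):
def guess_standard(content):
    lowered = content.lower()
    if 'md_metadata' in lowered or 'mi_metadata' in lowered:
        return 'iso'   # ISO 19139 / 19115-2 root elements
    if '</metadata>' in lowered:
        return 'fgdc'  # FGDC CSDGM uses a bare <metadata> root
    return 'unknown'
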
    def _crawl_results(self,
                       harvest_url,
                       timeout=5,
                       limit=100,
                       provider=None):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0
        first_query = True
        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url, verify=False, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e),
                                        self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(
                    r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           r.status_code,
                                           elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(
                        self.provider, timestamp, r.status_code,
                        r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')
            json_content = json.loads(soup.text)

            # Get the URL for the next loop, or None to break the loop
            log.debug(harvest_url)
            harvest_url = self._get_next_url(harvest_url, json_content)

            # Get the entries from the results
            entry_list = self._get_entries_from_results(json_content)

            if first_query:
                entries = entry_list
            else:
                entries = entry_list[1:]

            first_query = False

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']
                entry_restart_date = entry['restart_date']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug(
                            '{} already exists and will be updated.'.format(
                                entry_name))  # noqa: E501
                        status = 'change'
                    else:
                        log.debug('{} will not be updated.'.format(
                            entry_name))  # noqa: E501
                        status = 'unchanged'

                    obj = HarvestObject(guid=entry_guid,
                                        job=self.job,
                                        extras=[
                                            HOExtra(key='status',
                                                    value=status),
                                            HOExtra(key='restart_date',
                                                    value=entry_restart_date)
                                        ])
                    obj.content = json.dumps(entry['content'])
                    obj.package = package
                    obj.save()
                    ids.append(obj.id)

                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug(
                        '{} has not been harvested before. Creating a new harvest object.'
                        .format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid,
                                        job=self.job,
                                        extras=[
                                            HOExtra(key='status', value='new'),
                                            HOExtra(key='restart_date',
                                                    value=entry_restart_date)
                                        ])
                    new_counter += 1
                    obj.content = json.dumps(entry['content'])
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids
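
    # Pagination in `_crawl_results` is driven by `_get_next_url`, which
    # returns the next page's URL or None to end the while-loop. A
    # hypothetical sketch for an API whose JSON carries a 'next' link (the
    # 'links'/'next' keys are assumptions, not from the code above):
    def _get_next_url(self, harvest_url, json_content):
        # Follow the server-supplied link if present; None stops the crawl.
        return json_content.get('links', {}).get('next') or None
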
    def _parse_products(self, products):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0

        # Create a harvest object for each entry
        for entry in products:

            base = entry['imgtif'].split('/')[1].lower()
            entry_guid = base + "_" + entry['type'] + "_" + str(entry['intid'])
            entry_name = entry_guid
            entry_restart_date = entry['master']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = json.dumps(entry)
                obj.package = package
                obj.save()
                ids.append(obj.id)

            elif not package:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value='new'),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                new_counter += 1
                obj.content = json.dumps(entry)
                obj.package = None
                obj.save()
                ids.append(obj.id)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids
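
# The GUID/name scheme in `_parse_products` concatenates pieces of the
# product record; both guid and name use the same string. For a toy entry
# (values illustrative, not real products) it resolves like this:
entry = {'imgtif': 'products/S1_IW_SLC.tif', 'type': 'insar', 'intid': 42}
base = entry['imgtif'].split('/')[1].lower()            # 's1_iw_slc.tif'
guid = base + '_' + entry['type'] + '_' + str(entry['intid'])
assert guid == 's1_iw_slc.tif_insar_42'
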
        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            msg = u'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == u'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(
                object=harvest_object,
                key=u'original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key=u'original_format',
                value=document_format)
            extra.save()

        return True


apache = parse.SkipTo(parse.CaselessLiteral(u'<a href='),
                      include=True).suppress() + parse.quotedString.setParseAction(
    parse.removeQuotes).setResultsName(u'url') + parse.SkipTo(u'</a>',
                                                              include=True).suppress()
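
# The `apache` grammar above extracts the href target from one row of an
# Apache-style directory listing (these examples assume pyparsing is
# imported as `parse`). A quick usage sketch:
row = '<td><a href="metadata-001.xml">metadata-001.xml</a></td>'
result = apache.parseString(row)
print(result.url)  # -> metadata-001.xml
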
Example #30
0
    def _crawl_results(self, harvest_url, limit=100, timeout=5, username=None, password=None, provider=None):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0
        update_counter = 0
        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url,
                                 auth=HTTPBasicAuth(username, password),
                                 verify=False, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(log_message.format(self.provider,
                        timestamp, status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(log_message.format(self.provider,
                        timestamp, r.status_code, elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(self.provider,
                    timestamp, r.status_code, r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')

            # Get the URL for the next loop, or None to break the loop
            harvest_url = self._get_next_url(soup)

            # Get the entries from the results
            entries = self._get_entries_from_results(soup)

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']
                entry_restart_date = entry['restart_date']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    # We need package_show to ensure that all the conversions
                    # are carried out.
                    context = {"user": "******", "ignore_auth": True,
                               "model": model, "session": Session}
                    pkg_dict = logic.get_action('package_show')(context, {"id": package.name})  # noqa: E501
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    # E.g., a Sentinel dataset exists,
                    # but doesn't have a NOA resource yet.
                    elif self.flagged_extra and not get_pkg_dict_extra(pkg_dict, self.flagged_extra):  # noqa: E501
                        log.debug('{} already exists and will be extended.'.format(entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    else:
                        log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                        status = 'unchanged'

                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value=status),
                                                HOExtra(key='restart_date',
                                                value=entry_restart_date)])
                    obj.content = entry['content']
                    obj.package = package
                    obj.save()
                    ids.append(obj.id)
                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value='new'),
                                                HOExtra(key='restart_date',
                                                value=entry_restart_date)])
                    new_counter += 1
                    obj.content = entry['content']
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(harvester_msg.format(self.provider,
                                       timestamp, self.job.id, new_counter, update_counter))  # noqa: E128, E501
        return ids
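
# The `flagged_extra` branch above relies on CKAN's `get_pkg_dict_extra`
# helper to look up a key in a package dict's extras. A minimal sketch of
# what that helper does (see ckan.lib.helpers for the canonical version):
def get_pkg_dict_extra(pkg_dict, key, default=None):
    # pkg_dict['extras'] is a list of {'key': ..., 'value': ...} dicts.
    for extra in pkg_dict.get('extras', []):
        if extra['key'] == key:
            return extra['value']
    return default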