예제 #1
0
    def gather_stage(self, harvest_job):
        '''
        Collect the dataset identifiers published by the harvest source and
        create one HarvestObject per identifier.

        The identifiers are read from the source's ``/api/dcat.rdf`` endpoint
        through ``socrataAdaptor.listDatasetIds``.

        :param harvest_job: the job being run; ``harvest_job.source.url`` is
            used as the base URL of the remote catalogue.
        :returns: a list with the ids of the HarvestObjects that were created,
            or None if no identifiers were received or an error occurred
            while creating the objects (a gather error is saved in both
            cases).
        '''
        source_url = harvest_job.source.url
        log.debug('In SocrataHarvester 2 gather_stage (%s)', source_url)

        dcat_url = '%s/api/dcat.rdf' % source_url.rstrip('/')
        log.debug(dcat_url)

        package_ids = socrataAdaptor().listDatasetIds(dcat_url)
        # %r is safe even if the adaptor returned None, unlike len().
        log.debug('Received package ids: %r', package_ids)

        try:
            if not package_ids:
                self._save_gather_error(
                    'No packages received for URL: %s' % dcat_url,
                    harvest_job)
                return None

            object_ids = []
            for package_id in package_ids:
                # Identifiers containing "http" are skipped.
                # NOTE(review): presumably these are full URLs rather than
                # plain dataset ids -- confirm against the adaptor.
                if 'http' in package_id:
                    continue
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        except Exception as e:
            # ``message`` does not exist on Python 3 exceptions and may be
            # empty on Python 2; fall back to the string form.
            self._save_gather_error(
                '%r' % (getattr(e, 'message', None) or str(e)), harvest_job)
            return None
    def gather_stage(self, harvest_job):
        '''
        Collect the dataset identifiers for an HTML catalogue and create one
        HarvestObject per identifier.

        The job configuration is the ``html_jobs`` document whose ``cat_url``
        equals the harvest source URL. When that document carries a non-empty
        ``btn_identifier`` the identifiers come from
        ``javascript_case.ParseJavascriptPages``; otherwise they come from
        ``harvester_final.read_data``.

        :param harvest_job: the job being run; ``harvest_job.source.url`` is
            the catalogue URL used to look up the configuration.
        :returns: a list with the ids of the HarvestObjects that were created,
            or None if no configuration or no identifiers were found, or an
            error occurred while creating the objects (a gather error is
            saved in each of these cases).
        '''
        print('Html Harvest Gather Stage')
        source_url = harvest_job.source.url
        # Second argument of read_data.
        # NOTE(review): meaning not visible from here -- confirm.
        backupi = 0

        # Job configuration stored in MongoDB for this catalogue URL.
        document = client.odm.html_jobs.find_one({"cat_url": source_url})
        if document is None:
            # find_one returns None when nothing matches; report it instead
            # of failing on the subscript below.
            self._save_gather_error(
                'No html job configuration found for URL: %s' % source_url,
                harvest_job)
            return None

        job_id = document['_id']
        btn_identifier = document.get('btn_identifier')
        if btn_identifier not in (None, ''):
            package_ids = javascript_case.ParseJavascriptPages(
                document['cat_url'],
                document['identifier'],
                btn_identifier,
                document['action_type'],
                # Default of 3 when the document has no sleep_time.
                document.get('sleep_time', 3))
        else:
            package_ids = harvester_final.read_data(job_id, backupi)
        print(package_ids)

        try:
            if not package_ids:
                self._save_gather_error(
                    'No packages received for URL: %s' % source_url,
                    harvest_job)
                return None

            object_ids = []
            for package_id in package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        except Exception as e:
            # ``message`` does not exist on Python 3 exceptions and may be
            # empty on Python 2; fall back to the string form.
            self._save_gather_error(
                '%r' % (getattr(e, 'message', None) or str(e)), harvest_job)
            return None
예제 #3
0
    def _create_harvest_objects(self, remote_ids, harvest_job):
        '''
        Given a list of remote ids and a Harvest Job, create as many Harvest Objects and
        return a list of their ids to be passed to the fetch stage.

        TODO: Not sure it is worth keeping this function
        '''
        try:
            object_ids = []
            if len(remote_ids):
                for remote_id in remote_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid = remote_id, job = harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                return object_ids
            else:
               self._save_gather_error('No remote datasets could be identified', harvest_job)
        except Exception, e:
            self._save_gather_error('%r' % e.message, harvest_job)