Example #1
0
 def __init__(self, name):
     # Register command-line options and initialise per-run state.
     super(ResourceRemapper, self).__init__(name)
     self.parser.add_option(
         "-n",
         "--nopull",
         dest="nopull",
         action="store_true",
         # BUGFIX: help text read "the the"
         help=
         "Specifies that the code should not pull the latest version if the repo exists on disk"
     )
     self.parser.add_option(
         "-p",
         "--pretend",
         dest="pretend",
         action="store_true",
         help="Pretends to update the database, but doesn't really.")
     # URL -> [resource ids]; populated later from the local database.
     self.local_resource_map = collections.defaultdict(list)
     self.remap_stats = StatsCount()
 def __init__(self, name):
     # Register command-line options and initialise per-run state.
     super(ResourceRemapper, self).__init__(name)
     # BUGFIX: help text read "the the"
     self.parser.add_option("-n", "--nopull",
               dest="nopull", action="store_true",
               help="Specifies that the code should not pull the latest version if the repo exists on disk")
     self.parser.add_option("-p", "--pretend",
               dest="pretend", action="store_true",
               help="Pretends to update the database, but doesn't really.")
     # URL -> [resource ids]; populated later from the local database.
     self.local_resource_map = collections.defaultdict(list)
     self.remap_stats = StatsCount()
    def __init__(self, name):
        """Wire up command-line options and per-run bookkeeping."""
        super(GovUkResourceChecker, self).__init__(name)

        self.parser.add_option(
            "-p", "--pretend",
            dest="pretend",
            action="store_true",
            help="Pretends to update the database, but doesn't really.")
        self.parser.add_option(
            "-s", "--single",
            dest="single",
            default="",
            help="Specifies a single dataset to work with")

        # Transaction log of every change we make (or would make).
        self.translog = csv.writer(open("derived.log", "wb"))
        self.translog.writerow(["PackageName", "ResourceID", "URL", "Action"])

        # dataset -> [resources] map plus counters for the final report.
        self.local_resource_map = collections.defaultdict(list)
        self.remap_stats = StatsCount()
class ResourceRemapper(CkanCommand):
    """
    Iterates through resources to checks if it was remapped by gov.uk.  

    If the status is 301, then we will modify the URL of the resource, keeping track
    of the # of changes we made.  If a 410 we'll delete the resource, and if it was the
    only resource, we'll delete the dataset as well.
    """
    summary = __doc__.strip().split('\n')[0]
    usage = '\n' + __doc__
    max_args = 0
    min_args = 0

    def __init__(self, name):
        super(ResourceRemapper, self).__init__(name)
        self.parser.add_option("-n", "--nopull",
                  dest="nopull", action="store_true",
                  help="Specifies the the code should not pull the latest version if the repo exists on disk")
        self.parser.add_option("-p", "--pretend",
                  dest="pretend", action="store_true",
                  help="Pretends to update the database, but doesn't really.")        
        self.local_resource_map = collections.defaultdict(list)
        self.remap_stats = StatsCount()

    def record_transaction(self, package, resource, action):
        """ Write a record to the log file """
        row = [package.name, package.state, resource.id, resource.state, action]
        translog.writerow(row)

    def _rss(self):
        """ Return a string containing how much memory we're currently using """
        rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024                
        locale.setlocale(locale.LC_ALL, 'en_GB')
        return locale.format('%d', rss, grouping=True) + "Mb"
                

    def command(self):
        self._load_config()

        import ckan.model as model
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)
        model.repo.new_revision()
        log.info("Database access initialised")
        log.debug("MEM: {0}".format(self._rss()))

        # Clone/pull the info from the git repo
        data_folder = self._get_remapping_data()
        self._build_local_resource_map()
        log.debug("MEM: {0}".format(self._rss()))

        log.debug("Looking for CSV files in {0}".format(data_folder))

        # Iterate through all of the CSV files in the repository
        iterator = glob.iglob(os.path.join(data_folder, "*.csv"))
        for csv_file in iterator:
            self.remap_stats.increment('CSV files read')    

            with open(csv_file, "rU") as f:
                rdr = csv.reader(f)
                rdr.next() # Skip the header

                for row in rdr:
                    if row[0] in self.local_resource_map:
                        self.remap_stats.increment('URLs matched')
                
                        # Each URL in our local map might appear more than once, so 
                        # the list of resource IDs it iterated over
                        for resource_id in self.local_resource_map[row[0]]:
                            resource = model.Session.query(model.Resource).\
                                filter(model.Resource.id==resource_id).first()

                            if resource == None:
                                log.error("Resource {0} is not findable".format(resource_id))

                            # Depending on the HTTP code registered for the remap we should
                            # either treat it as gone, or as a simple move.
                            code = int(row[2])
                            if code == 410:
                                self.handle_gone(row, resource)
                            elif code in [301, 418]:
                                self.handle_moved(row, resource)

                    self.remap_stats.increment('Rows read')

        print self.remap_stats.report(order_by_title=True)

    def handle_gone(self, row, resource):
        """ 
            Marks the resource as deleted, and then checks if there are no more 
            resources in the package. it will delete the dataset too if there are no
            other resources 
        """
        import ckan.model as model

        resource.state = 'deleted' 
        if not self.options.pretend:
            model.Session.add(resource)
            model.Session.commit()

        pkg = resource.resource_group.package
        if pkg.state == 'deleted':
            self.remap_stats.increment('URL has GONE within already deleted package')
            self.record_transaction(pkg, resource, TRANSLOG_PACKAGE_ALREADY_DELETED)
            return 

        if self._should_delete(pkg, resource):
            if not self.options.pretend:
                pkg.state == 'deleted'
                model.Session.add(pkg)
                model.Session.commit()
            self.remap_stats.increment('Packages deleted due to 0 resources')
            self.record_transaction(pkg, resource, TRANSLOG_PACKAGE_DELETED)
        else:
            self.record_transaction(pkg, resource, TRANSLOG_RESOURCE_DELETED)            
        
        self.remap_stats.increment('410 GONE')

    def handle_moved(self, row, resource):
        """ 
            Changes the url in the resource to the new one
        """
        import ckan.model as model

        # Alays assign the URL, regardless of the state of the package just so that
        # it is clean (should it be un-deleted)
        resource.url = row[1]       
        if not self.options.pretend:
            model.Session.add(resource)
            model.Session.commit()        

        # Record whether we have updated an active resource within a deleted package
        pkg = resource.resource_group.package
        if pkg.state == 'deleted':
            self.remap_stats.increment('URL has MOVED within already deleted package')
            self.record_transaction(pkg, resource, TRANSLOG_PACKAGE_ALREADY_DELETED)
            return

        self.record_transaction(pkg, resource, TRANSLOG_CHANGED)            
        self.remap_stats.increment('301 MOVED')

    def _should_delete(self, pkg, resource):
        # Should we delete the specified package when there is one less active resource?
        any_left = any([r.id for r in pkg.resources if r.state == 'active' and r.id != resource.id])
        return not any_left

    def _build_local_resource_map(self):
        """ 
        Builds a map of the resources we know about locally that we will store with the URL
        as the key, and the value as a list of resource ids that have this URL """
        import ckan.model as model

        log.debug("Building local resource map")
        q = model.Session.query(model.Resource)\
            .filter(model.Resource.state=='active')
        for resource in q.all():
            self.local_resource_map[resource.url].append(resource.id)
        log.debug("Local resource map contains {0} elements".format(len(self.local_resource_map)))


    def _run_or_exit(self, cmd, error_message):
        """ Runs the specified command, and exits with an error if it fails """
        err = subprocess.call(cmd, shell=True)
        if err != 0:
            log.error(error_message) 
            sys.exit(1)


    def _get_remapping_data(self):
        """
        Fetches the git repo containing the remapping data and 
        pulls it into a temp directory.  If it already exists, we 
        just do a pull instead to make sure it is up-to-date.
        """
        root = "/tmp/resource_remapping/"
        if not os.path.exists(root):
            os.makedirs(root)

        repo_path = os.path.join(root, "redirector")

        if not os.path.exists(repo_path):
            self._run_or_exit("cd {dir}; git clone {repo}".format(dir=root,repo=GIT_REPO),
                "Failed to pull the remote repository at {0}".format(GIT_REPO))
        elif not self.options.nopull:
            log.debug("Pulling latest code")
            self._run_or_exit("cd {dir}; git pull origin master".format(dir=repo_path), 
                "Failed to pull the remote repository at {0}".format(GIT_REPO)) 
        else:
            log.debug("Code exists and nopull specified")

        return os.path.join(repo_path, "data/mappings")
class GovUkResourceChecker(CkanCommand):
    """
    Iterates through gov.uk resources to find duplicates and attached data.

    A lot of the gov.uk resources point to a HTML file, which itself contains the
    link to the data.  In a similar manner to ons_scraper we want to make those HTML
    resources 'documentation' resources and if possible point directly to the data
    file itself.
    """
    summary = __doc__.strip().split('\n')[0]
    usage = '\n' + __doc__
    max_args = 0
    min_args = 0

    def __init__(self, name):
        super(GovUkResourceChecker, self).__init__(name)
        self.parser.add_option("-p", "--pretend",
                  dest="pretend", action="store_true",
                  help="Pretends to update the database, but doesn't really.")
        self.parser.add_option("-s", "--single",
                  dest="single",
                  default="",
                  help="Specifies a single dataset to work with")

        self.local_resource_map = collections.defaultdict(list)
        self.remap_stats = StatsCount()

        self.translog = csv.writer(open("derived.log", "wb"))
        self.translog.writerow(["PackageName", "ResourceID", "URL", "Action"])

    def record_transaction(self, package, resource, action):
        """ Write a record to the log file """
        row = [package.name, resource.id, action]
        self.translog.writerow(row)


    def command(self):
        self._load_config()

        import ckan.model as model
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)
        model.repo.new_revision()
        log.info("Database access initialised")

        self._build_resource_map()
        for dataset, resources in self.local_resource_map.iteritems():
            self.process(dataset, resources)

        log.info(self.remap_stats.report(order_by_title=True))

    def process(self, dataset, resources):
        # We want distinct URLs in the resources, and don't really want
        # duplicates.  We should ignore (and eventually delete) dupes
        # UNLESS they have a hub-id in which case we should definitely
        # NOT delete them.
        import ckan.model as model

        dupes = []
        seen = []

        for r in resources[:]:
            if r.url not in seen:
                seen.append(r.url)
            else:
                print "Found a duplicate"
                dupes.append(r)
                resources.remove(r)

        log.info("Dataset '{0}' has {1} duplicates, in {2} resources".
            format(dataset.name, len(dupes), len(dupes) + len(resources)))

        # Handle the valid resources
        for resource in resources:
            if resource.resource_type == "documentation":
                log.info(" - Ignoring documentation resource")
                self.remap_stats.increment('Ignored documentation resource')
                continue

            log.debug("- Fetching attachments for {0}".format(resource.id))
            data = self._get_attachment(resource)
            if not data:
                log.info(" - No attachment for {0}".format(resource.url))
                continue

            attachments = []
            for att in data.get('attachments',[]):
                content_type = att.get('content_type')
                if content_type in ACCEPTED_FORMATS:
                    self.remap_stats.increment('Attachments found')
                    log.info(" - Found {0}".format(att.get("url")))
                    attachments.append(att)
                    break
                else:
                    log.info(" - Skipping attachment as it's {0}".format(content_type))

            if not attachments:
                continue

            for attachment in attachments:
                fmt = "CSV"
                u = attachment.get('url', '').lower()
                if u.endswith('.xls') or u.endswith('xlsx'):
                    fmt = "XLS"
                    self.remap_stats.increment('XLS')
                elif attachment.get('url', '').lower().endswith('.rdf'):
                    fmt = "RDF"
                    self.remap_stats.increment('RDF')
                else:
                    self.remap_stats.increment('CSV')

                # Add the new resource, and then mark the old resource as documentation
                log.info(" - Adding a new resource to {0}".format(dataset.name))
                self.remap_stats.increment('Attachments added')
                self.record_transaction(dataset, resource, "Created new from resource info")
                if not self.options.pretend:
                    # This should be the same type as the original to make sure we correctly
                    # handle time-series resources.
                    dataset.add_resource(url="http://www.gov.uk" + attachment.get('url'),
                             format=fmt,
                             resource_type=resource.resource_type,
                             description=attachment.get('title',''))
                    model.Session.add(dataset)
                    model.Session.commit()

                resource.resource_type = "documentation"
                resource.format = "HTML"
                log.info(" - Changing old resource to documentation")
                self.remap_stats.increment('Resources moved to documentation')
                self.record_transaction(dataset, resource, "Moved to documentation")
                if not self.options.pretend:
                    model.Session.add(resource)
                    model.Session.commit()

        # Handle the dupes, ignore them if they have a hub-id, potentially delete
        # them if they don't.
        log.info("Processing {} duplicates".format(len(dupes)))
        for resource in dupes:
            if 'hub-id' in resource.extras:
                # Don't delete ONS imported dataset
                log.info("Resource {} is an ONS resource, not deleting".format(resource.id))
                self.remap_stats.increment('ONS resources *not* deleted')
                continue

            log.info(" - Deleting duplicate {0} -> {1}".format(resource.url, resource.id))
            resource.state = 'deleted'
            self.remap_stats.increment('Deleted resource')
            self.record_transaction(dataset, resource, "Deleted dupe")
            if not self.options.pretend:
                model.Session.add(resource)
                model.Session.commit()
                log.info(" -- deleted {}".format(resource.id))
        model.Session.flush()

    def _get_attachment(self, resource):
        json_url = "".join([resource.url, ".json"])
        r = requests.head(json_url)
        if not r.status_code == 200:
            log.info("No JSON file at {0}".format(json_url))
            return None

        r = requests.get(json_url)
        if not r.status_code == 200:
            log.error("Failed to retrieve {0} after successful HEAD request".format(json_url))
            return None

        return json.loads(r.content)


    def _build_resource_map(self):
        import ckan.model as model

        # Find all non .csv/.xls links for gov.uk
        resources = model.Session.query(model.Resource).\
            filter(model.Resource.url.like("%/www.gov.uk/%")).\
            filter(not_(model.Resource.url.ilike("%.csv"))).\
            filter(not_(model.Resource.url.ilike("%.xls"))).\
            filter(not_(model.Resource.url.ilike("%.xlsx"))).\
            filter(not_(model.Resource.url.ilike("%.pdf"))).\
            filter(not_(model.Resource.url.ilike("%.rdf"))).\
            filter(not_(model.Resource.url.ilike("%.json"))).\
            filter(not_(model.Resource.url.ilike("%.doc"))).\
            filter(not_(model.Resource.resource_type=='documentation')).\
            filter(not_(model.Resource.resource_type=='timeseries'))
            #filter(model.Resource.state=='active')

        log.info("Found %d resources for www.gov.uk links" % resources.count())
        for r in resources:
            pkg = r.resource_group.package

            # If we only want one, then skip the others
            if self.options.single and not pkg.name == self.options.single:
                continue

            if pkg.state == 'active':
                self.local_resource_map[pkg].append(r)
Example #6
0
class ResourceRemapper(CkanCommand):
    """
    Iterates through resources to checks if it was remapped by gov.uk.  

    If the status is 301, then we will modify the URL of the resource, keeping track
    of the # of changes we made.  If a 410 we'll delete the resource, and if it was the
    only resource, we'll delete the dataset as well.
    """
    summary = __doc__.strip().split('\n')[0]
    usage = '\n' + __doc__
    max_args = 0
    min_args = 0

    def __init__(self, name):
        super(ResourceRemapper, self).__init__(name)
        self.parser.add_option(
            "-n",
            "--nopull",
            dest="nopull",
            action="store_true",
            help=
            "Specifies the the code should not pull the latest version if the repo exists on disk"
        )
        self.parser.add_option(
            "-p",
            "--pretend",
            dest="pretend",
            action="store_true",
            help="Pretends to update the database, but doesn't really.")
        self.local_resource_map = collections.defaultdict(list)
        self.remap_stats = StatsCount()

    def record_transaction(self, package, resource, action):
        """ Write a record to the log file """
        row = [
            package.name, package.state, resource.id, resource.state, action
        ]
        translog.writerow(row)

    def _rss(self):
        """ Return a string containing how much memory we're currently using """
        rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
        locale.setlocale(locale.LC_ALL, 'en_GB')
        return locale.format('%d', rss, grouping=True) + "Mb"

    def command(self):
        self._load_config()

        import ckan.model as model
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)
        model.repo.new_revision()
        log.info("Database access initialised")
        log.debug("MEM: {0}".format(self._rss()))

        # Clone/pull the info from the git repo
        data_folder = self._get_remapping_data()
        self._build_local_resource_map()
        log.debug("MEM: {0}".format(self._rss()))

        log.debug("Looking for CSV files in {0}".format(data_folder))

        # Iterate through all of the CSV files in the repository
        iterator = glob.iglob(os.path.join(data_folder, "*.csv"))
        for csv_file in iterator:
            self.remap_stats.increment('CSV files read')

            with open(csv_file, "rU") as f:
                rdr = csv.reader(f)
                rdr.next()  # Skip the header

                for row in rdr:
                    if row[0] in self.local_resource_map:
                        self.remap_stats.increment('URLs matched')

                        # Each URL in our local map might appear more than once, so
                        # the list of resource IDs it iterated over
                        for resource_id in self.local_resource_map[row[0]]:
                            resource = model.Session.query(model.Resource).\
                                filter(model.Resource.id==resource_id).first()

                            if resource == None:
                                log.error(
                                    "Resource {0} is not findable".format(
                                        resource_id))

                            # Depending on the HTTP code registered for the remap we should
                            # either treat it as gone, or as a simple move.
                            code = int(row[2])
                            if code == 410:
                                self.handle_gone(row, resource)
                            elif code in [301, 418]:
                                self.handle_moved(row, resource)

                    self.remap_stats.increment('Rows read')

        print self.remap_stats.report(order_by_title=True)

    def handle_gone(self, row, resource):
        """ 
            Marks the resource as deleted, and then checks if there are no more 
            resources in the package. it will delete the dataset too if there are no
            other resources 
        """
        import ckan.model as model

        resource.state = 'deleted'
        if not self.options.pretend:
            model.Session.add(resource)
            model.Session.commit()

        pkg = resource.resource_group.package
        if pkg.state == 'deleted':
            self.remap_stats.increment(
                'URL has GONE within already deleted package')
            self.record_transaction(pkg, resource,
                                    TRANSLOG_PACKAGE_ALREADY_DELETED)
            return

        if self._should_delete(pkg, resource):
            if not self.options.pretend:
                pkg.state == 'deleted'
                model.Session.add(pkg)
                model.Session.commit()
            self.remap_stats.increment('Packages deleted due to 0 resources')
            self.record_transaction(pkg, resource, TRANSLOG_PACKAGE_DELETED)
        else:
            self.record_transaction(pkg, resource, TRANSLOG_RESOURCE_DELETED)

        self.remap_stats.increment('410 GONE')

    def handle_moved(self, row, resource):
        """ 
            Changes the url in the resource to the new one
        """
        import ckan.model as model

        # Alays assign the URL, regardless of the state of the package just so that
        # it is clean (should it be un-deleted)
        resource.url = row[1]
        if not self.options.pretend:
            model.Session.add(resource)
            model.Session.commit()

        # Record whether we have updated an active resource within a deleted package
        pkg = resource.resource_group.package
        if pkg.state == 'deleted':
            self.remap_stats.increment(
                'URL has MOVED within already deleted package')
            self.record_transaction(pkg, resource,
                                    TRANSLOG_PACKAGE_ALREADY_DELETED)
            return

        self.record_transaction(pkg, resource, TRANSLOG_CHANGED)
        self.remap_stats.increment('301 MOVED')

    def _should_delete(self, pkg, resource):
        # Should we delete the specified package when there is one less active resource?
        any_left = any([
            r.id for r in pkg.resources
            if r.state == 'active' and r.id != resource.id
        ])
        return not any_left

    def _build_local_resource_map(self):
        """ 
        Builds a map of the resources we know about locally that we will store with the URL
        as the key, and the value as a list of resource ids that have this URL """
        import ckan.model as model

        log.debug("Building local resource map")
        q = model.Session.query(model.Resource)\
            .filter(model.Resource.state=='active')
        for resource in q.all():
            self.local_resource_map[resource.url].append(resource.id)
        log.debug("Local resource map contains {0} elements".format(
            len(self.local_resource_map)))

    def _run_or_exit(self, cmd, error_message):
        """ Runs the specified command, and exits with an error if it fails """
        err = subprocess.call(cmd, shell=True)
        if err != 0:
            log.error(error_message)
            sys.exit(1)

    def _get_remapping_data(self):
        """
        Fetches the git repo containing the remapping data and 
        pulls it into a temp directory.  If it already exists, we 
        just do a pull instead to make sure it is up-to-date.
        """
        root = "/tmp/resource_remapping/"
        if not os.path.exists(root):
            os.makedirs(root)

        repo_path = os.path.join(root, "redirector")

        if not os.path.exists(repo_path):
            self._run_or_exit(
                "cd {dir}; git clone {repo}".format(dir=root, repo=GIT_REPO),
                "Failed to pull the remote repository at {0}".format(GIT_REPO))
        elif not self.options.nopull:
            log.debug("Pulling latest code")
            self._run_or_exit(
                "cd {dir}; git pull origin master".format(dir=repo_path),
                "Failed to pull the remote repository at {0}".format(GIT_REPO))
        else:
            log.debug("Code exists and nopull specified")

        return os.path.join(repo_path, "data/mappings")