def __init__(self, name):
    """Set up the command-line options and per-run state for the remapper.

    :param name: the paster command name, forwarded to CkanCommand.
    """
    super(ResourceRemapper, self).__init__(name)
    # BUGFIX: help text read "Specifies the the code" (duplicated word).
    self.parser.add_option(
        "-n", "--nopull",
        dest="nopull",
        action="store_true",
        help="Specifies that the code should not pull the latest version "
             "if the repo exists on disk")
    self.parser.add_option(
        "-p", "--pretend",
        dest="pretend",
        action="store_true",
        help="Pretends to update the database, but doesn't really.")
    # Maps resource URL -> [resource ids]; populated later by
    # _build_local_resource_map() before the CSVs are processed.
    self.local_resource_map = collections.defaultdict(list)
    self.remap_stats = StatsCount()
def __init__(self, name):
    """Initialise parser options and working state for the remap command."""
    super(ResourceRemapper, self).__init__(name)

    parser = self.parser
    parser.add_option(
        "-n", "--nopull", dest="nopull", action="store_true",
        help="Specifies the the code should not pull the latest version if the repo exists on disk")
    parser.add_option(
        "-p", "--pretend", dest="pretend", action="store_true",
        help="Pretends to update the database, but doesn't really.")

    # Accumulators used while the remapping CSV files are processed.
    self.local_resource_map = collections.defaultdict(list)
    self.remap_stats = StatsCount()
def __init__(self, name):
    """Configure options and open the transaction log for the checker."""
    super(GovUkResourceChecker, self).__init__(name)

    add_option = self.parser.add_option
    add_option(
        "-p", "--pretend", dest="pretend", action="store_true",
        help="Pretends to update the database, but doesn't really.")
    add_option(
        "-s", "--single", dest="single", default="",
        help="Specifies a single dataset to work with")

    # Per-run working state.
    self.local_resource_map = collections.defaultdict(list)
    self.remap_stats = StatsCount()

    # CSV log of every change made during this run.
    self.translog = csv.writer(open("derived.log", "wb"))
    self.translog.writerow(["PackageName", "ResourceID", "URL", "Action"])
class ResourceRemapper(CkanCommand): """ Iterates through resources to checks if it was remapped by gov.uk. If the status is 301, then we will modify the URL of the resource, keeping track of the # of changes we made. If a 410 we'll delete the resource, and if it was the only resource, we'll delete the dataset as well. """ summary = __doc__.strip().split('\n')[0] usage = '\n' + __doc__ max_args = 0 min_args = 0 def __init__(self, name): super(ResourceRemapper, self).__init__(name) self.parser.add_option("-n", "--nopull", dest="nopull", action="store_true", help="Specifies the the code should not pull the latest version if the repo exists on disk") self.parser.add_option("-p", "--pretend", dest="pretend", action="store_true", help="Pretends to update the database, but doesn't really.") self.local_resource_map = collections.defaultdict(list) self.remap_stats = StatsCount() def record_transaction(self, package, resource, action): """ Write a record to the log file """ row = [package.name, package.state, resource.id, resource.state, action] translog.writerow(row) def _rss(self): """ Return a string containing how much memory we're currently using """ rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 locale.setlocale(locale.LC_ALL, 'en_GB') return locale.format('%d', rss, grouping=True) + "Mb" def command(self): self._load_config() import ckan.model as model model.Session.remove() model.Session.configure(bind=model.meta.engine) model.repo.new_revision() log.info("Database access initialised") log.debug("MEM: {0}".format(self._rss())) # Clone/pull the info from the git repo data_folder = self._get_remapping_data() self._build_local_resource_map() log.debug("MEM: {0}".format(self._rss())) log.debug("Looking for CSV files in {0}".format(data_folder)) # Iterate through all of the CSV files in the repository iterator = glob.iglob(os.path.join(data_folder, "*.csv")) for csv_file in iterator: self.remap_stats.increment('CSV files read') with open(csv_file, "rU") 
as f: rdr = csv.reader(f) rdr.next() # Skip the header for row in rdr: if row[0] in self.local_resource_map: self.remap_stats.increment('URLs matched') # Each URL in our local map might appear more than once, so # the list of resource IDs it iterated over for resource_id in self.local_resource_map[row[0]]: resource = model.Session.query(model.Resource).\ filter(model.Resource.id==resource_id).first() if resource == None: log.error("Resource {0} is not findable".format(resource_id)) # Depending on the HTTP code registered for the remap we should # either treat it as gone, or as a simple move. code = int(row[2]) if code == 410: self.handle_gone(row, resource) elif code in [301, 418]: self.handle_moved(row, resource) self.remap_stats.increment('Rows read') print self.remap_stats.report(order_by_title=True) def handle_gone(self, row, resource): """ Marks the resource as deleted, and then checks if there are no more resources in the package. it will delete the dataset too if there are no other resources """ import ckan.model as model resource.state = 'deleted' if not self.options.pretend: model.Session.add(resource) model.Session.commit() pkg = resource.resource_group.package if pkg.state == 'deleted': self.remap_stats.increment('URL has GONE within already deleted package') self.record_transaction(pkg, resource, TRANSLOG_PACKAGE_ALREADY_DELETED) return if self._should_delete(pkg, resource): if not self.options.pretend: pkg.state == 'deleted' model.Session.add(pkg) model.Session.commit() self.remap_stats.increment('Packages deleted due to 0 resources') self.record_transaction(pkg, resource, TRANSLOG_PACKAGE_DELETED) else: self.record_transaction(pkg, resource, TRANSLOG_RESOURCE_DELETED) self.remap_stats.increment('410 GONE') def handle_moved(self, row, resource): """ Changes the url in the resource to the new one """ import ckan.model as model # Alays assign the URL, regardless of the state of the package just so that # it is clean (should it be un-deleted) resource.url 
= row[1] if not self.options.pretend: model.Session.add(resource) model.Session.commit() # Record whether we have updated an active resource within a deleted package pkg = resource.resource_group.package if pkg.state == 'deleted': self.remap_stats.increment('URL has MOVED within already deleted package') self.record_transaction(pkg, resource, TRANSLOG_PACKAGE_ALREADY_DELETED) return self.record_transaction(pkg, resource, TRANSLOG_CHANGED) self.remap_stats.increment('301 MOVED') def _should_delete(self, pkg, resource): # Should we delete the specified package when there is one less active resource? any_left = any([r.id for r in pkg.resources if r.state == 'active' and r.id != resource.id]) return not any_left def _build_local_resource_map(self): """ Builds a map of the resources we know about locally that we will store with the URL as the key, and the value as a list of resource ids that have this URL """ import ckan.model as model log.debug("Building local resource map") q = model.Session.query(model.Resource)\ .filter(model.Resource.state=='active') for resource in q.all(): self.local_resource_map[resource.url].append(resource.id) log.debug("Local resource map contains {0} elements".format(len(self.local_resource_map))) def _run_or_exit(self, cmd, error_message): """ Runs the specified command, and exits with an error if it fails """ err = subprocess.call(cmd, shell=True) if err != 0: log.error(error_message) sys.exit(1) def _get_remapping_data(self): """ Fetches the git repo containing the remapping data and pulls it into a temp directory. If it already exists, we just do a pull instead to make sure it is up-to-date. 
""" root = "/tmp/resource_remapping/" if not os.path.exists(root): os.makedirs(root) repo_path = os.path.join(root, "redirector") if not os.path.exists(repo_path): self._run_or_exit("cd {dir}; git clone {repo}".format(dir=root,repo=GIT_REPO), "Failed to pull the remote repository at {0}".format(GIT_REPO)) elif not self.options.nopull: log.debug("Pulling latest code") self._run_or_exit("cd {dir}; git pull origin master".format(dir=repo_path), "Failed to pull the remote repository at {0}".format(GIT_REPO)) else: log.debug("Code exists and nopull specified") return os.path.join(repo_path, "data/mappings")
class GovUkResourceChecker(CkanCommand): """ Iterates through gov.uk resources to find duplicates and attached data. A lot of the gov.uk resources point to a HTML file, which itself contains the link to the data. In a similar manner to ons_scraper we want to make those HTML resources 'documentation' resources and if possible point directly to the data file itself. """ summary = __doc__.strip().split('\n')[0] usage = '\n' + __doc__ max_args = 0 min_args = 0 def __init__(self, name): super(GovUkResourceChecker, self).__init__(name) self.parser.add_option("-p", "--pretend", dest="pretend", action="store_true", help="Pretends to update the database, but doesn't really.") self.parser.add_option("-s", "--single", dest="single", default="", help="Specifies a single dataset to work with") self.local_resource_map = collections.defaultdict(list) self.remap_stats = StatsCount() self.translog = csv.writer(open("derived.log", "wb")) self.translog.writerow(["PackageName", "ResourceID", "URL", "Action"]) def record_transaction(self, package, resource, action): """ Write a record to the log file """ row = [package.name, resource.id, action] self.translog.writerow(row) def command(self): self._load_config() import ckan.model as model model.Session.remove() model.Session.configure(bind=model.meta.engine) model.repo.new_revision() log.info("Database access initialised") self._build_resource_map() for dataset, resources in self.local_resource_map.iteritems(): self.process(dataset, resources) log.info(self.remap_stats.report(order_by_title=True)) def process(self, dataset, resources): # We want distinct URLs in the resources, and don't really want # duplicates. We should ignore (and eventually delete) dupes # UNLESS they have a hub-id in which case we should definitely # NOT delete them. 
import ckan.model as model dupes = [] seen = [] for r in resources[:]: if r.url not in seen: seen.append(r.url) else: print "Found a duplicate" dupes.append(r) resources.remove(r) log.info("Dataset '{0}' has {1} duplicates, in {2} resources". format(dataset.name, len(dupes), len(dupes) + len(resources))) # Handle the valid resources for resource in resources: if resource.resource_type == "documentation": log.info(" - Ignoring documentation resource") self.remap_stats.increment('Ignored documentation resource') continue log.debug("- Fetching attachments for {0}".format(resource.id)) data = self._get_attachment(resource) if not data: log.info(" - No attachment for {0}".format(resource.url)) continue attachments = [] for att in data.get('attachments',[]): content_type = att.get('content_type') if content_type in ACCEPTED_FORMATS: self.remap_stats.increment('Attachments found') log.info(" - Found {0}".format(att.get("url"))) attachments.append(att) break else: log.info(" - Skipping attachment as it's {0}".format(content_type)) if not attachments: continue for attachment in attachments: fmt = "CSV" u = attachment.get('url', '').lower() if u.endswith('.xls') or u.endswith('xlsx'): fmt = "XLS" self.remap_stats.increment('XLS') elif attachment.get('url', '').lower().endswith('.rdf'): fmt = "RDF" self.remap_stats.increment('RDF') else: self.remap_stats.increment('CSV') # Add the new resource, and then mark the old resource as documentation log.info(" - Adding a new resource to {0}".format(dataset.name)) self.remap_stats.increment('Attachments added') self.record_transaction(dataset, resource, "Created new from resource info") if not self.options.pretend: # This should be the same type as the original to make sure we correctly # handle time-series resources. 
dataset.add_resource(url="http://www.gov.uk" + attachment.get('url'), format=fmt, resource_type=resource.resource_type, description=attachment.get('title','')) model.Session.add(dataset) model.Session.commit() resource.resource_type = "documentation" resource.format = "HTML" log.info(" - Changing old resource to documentation") self.remap_stats.increment('Resources moved to documentation') self.record_transaction(dataset, resource, "Moved to documentation") if not self.options.pretend: model.Session.add(resource) model.Session.commit() # Handle the dupes, ignore them if they have a hub-id, potentially delete # them if they don't. log.info("Processing {} duplicates".format(len(dupes))) for resource in dupes: if 'hub-id' in resource.extras: # Don't delete ONS imported dataset log.info("Resource {} is an ONS resource, not deleting".format(resource.id)) self.remap_stats.increment('ONS resources *not* deleted') continue log.info(" - Deleting duplicate {0} -> {1}".format(resource.url, resource.id)) resource.state = 'deleted' self.remap_stats.increment('Deleted resource') self.record_transaction(dataset, resource, "Deleted dupe") if not self.options.pretend: model.Session.add(resource) model.Session.commit() log.info(" -- deleted {}".format(resource.id)) model.Session.flush() def _get_attachment(self, resource): json_url = "".join([resource.url, ".json"]) r = requests.head(json_url) if not r.status_code == 200: log.info("No JSON file at {0}".format(json_url)) return None r = requests.get(json_url) if not r.status_code == 200: log.error("Failed to retrieve {0} after successful HEAD request".format(json_url)) return None return json.loads(r.content) def _build_resource_map(self): import ckan.model as model # Find all non .csv/.xls links for gov.uk resources = model.Session.query(model.Resource).\ filter(model.Resource.url.like("%/www.gov.uk/%")).\ filter(not_(model.Resource.url.ilike("%.csv"))).\ filter(not_(model.Resource.url.ilike("%.xls"))).\ 
filter(not_(model.Resource.url.ilike("%.xlsx"))).\ filter(not_(model.Resource.url.ilike("%.pdf"))).\ filter(not_(model.Resource.url.ilike("%.rdf"))).\ filter(not_(model.Resource.url.ilike("%.json"))).\ filter(not_(model.Resource.url.ilike("%.doc"))).\ filter(not_(model.Resource.resource_type=='documentation')).\ filter(not_(model.Resource.resource_type=='timeseries')) #filter(model.Resource.state=='active') log.info("Found %d resources for www.gov.uk links" % resources.count()) for r in resources: pkg = r.resource_group.package # If we only want one, then skip the others if self.options.single and not pkg.name == self.options.single: continue if pkg.state == 'active': self.local_resource_map[pkg].append(r)
class ResourceRemapper(CkanCommand): """ Iterates through resources to checks if it was remapped by gov.uk. If the status is 301, then we will modify the URL of the resource, keeping track of the # of changes we made. If a 410 we'll delete the resource, and if it was the only resource, we'll delete the dataset as well. """ summary = __doc__.strip().split('\n')[0] usage = '\n' + __doc__ max_args = 0 min_args = 0 def __init__(self, name): super(ResourceRemapper, self).__init__(name) self.parser.add_option( "-n", "--nopull", dest="nopull", action="store_true", help= "Specifies the the code should not pull the latest version if the repo exists on disk" ) self.parser.add_option( "-p", "--pretend", dest="pretend", action="store_true", help="Pretends to update the database, but doesn't really.") self.local_resource_map = collections.defaultdict(list) self.remap_stats = StatsCount() def record_transaction(self, package, resource, action): """ Write a record to the log file """ row = [ package.name, package.state, resource.id, resource.state, action ] translog.writerow(row) def _rss(self): """ Return a string containing how much memory we're currently using """ rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 locale.setlocale(locale.LC_ALL, 'en_GB') return locale.format('%d', rss, grouping=True) + "Mb" def command(self): self._load_config() import ckan.model as model model.Session.remove() model.Session.configure(bind=model.meta.engine) model.repo.new_revision() log.info("Database access initialised") log.debug("MEM: {0}".format(self._rss())) # Clone/pull the info from the git repo data_folder = self._get_remapping_data() self._build_local_resource_map() log.debug("MEM: {0}".format(self._rss())) log.debug("Looking for CSV files in {0}".format(data_folder)) # Iterate through all of the CSV files in the repository iterator = glob.iglob(os.path.join(data_folder, "*.csv")) for csv_file in iterator: self.remap_stats.increment('CSV files read') with open(csv_file, 
"rU") as f: rdr = csv.reader(f) rdr.next() # Skip the header for row in rdr: if row[0] in self.local_resource_map: self.remap_stats.increment('URLs matched') # Each URL in our local map might appear more than once, so # the list of resource IDs it iterated over for resource_id in self.local_resource_map[row[0]]: resource = model.Session.query(model.Resource).\ filter(model.Resource.id==resource_id).first() if resource == None: log.error( "Resource {0} is not findable".format( resource_id)) # Depending on the HTTP code registered for the remap we should # either treat it as gone, or as a simple move. code = int(row[2]) if code == 410: self.handle_gone(row, resource) elif code in [301, 418]: self.handle_moved(row, resource) self.remap_stats.increment('Rows read') print self.remap_stats.report(order_by_title=True) def handle_gone(self, row, resource): """ Marks the resource as deleted, and then checks if there are no more resources in the package. it will delete the dataset too if there are no other resources """ import ckan.model as model resource.state = 'deleted' if not self.options.pretend: model.Session.add(resource) model.Session.commit() pkg = resource.resource_group.package if pkg.state == 'deleted': self.remap_stats.increment( 'URL has GONE within already deleted package') self.record_transaction(pkg, resource, TRANSLOG_PACKAGE_ALREADY_DELETED) return if self._should_delete(pkg, resource): if not self.options.pretend: pkg.state == 'deleted' model.Session.add(pkg) model.Session.commit() self.remap_stats.increment('Packages deleted due to 0 resources') self.record_transaction(pkg, resource, TRANSLOG_PACKAGE_DELETED) else: self.record_transaction(pkg, resource, TRANSLOG_RESOURCE_DELETED) self.remap_stats.increment('410 GONE') def handle_moved(self, row, resource): """ Changes the url in the resource to the new one """ import ckan.model as model # Alays assign the URL, regardless of the state of the package just so that # it is clean (should it be un-deleted) 
resource.url = row[1] if not self.options.pretend: model.Session.add(resource) model.Session.commit() # Record whether we have updated an active resource within a deleted package pkg = resource.resource_group.package if pkg.state == 'deleted': self.remap_stats.increment( 'URL has MOVED within already deleted package') self.record_transaction(pkg, resource, TRANSLOG_PACKAGE_ALREADY_DELETED) return self.record_transaction(pkg, resource, TRANSLOG_CHANGED) self.remap_stats.increment('301 MOVED') def _should_delete(self, pkg, resource): # Should we delete the specified package when there is one less active resource? any_left = any([ r.id for r in pkg.resources if r.state == 'active' and r.id != resource.id ]) return not any_left def _build_local_resource_map(self): """ Builds a map of the resources we know about locally that we will store with the URL as the key, and the value as a list of resource ids that have this URL """ import ckan.model as model log.debug("Building local resource map") q = model.Session.query(model.Resource)\ .filter(model.Resource.state=='active') for resource in q.all(): self.local_resource_map[resource.url].append(resource.id) log.debug("Local resource map contains {0} elements".format( len(self.local_resource_map))) def _run_or_exit(self, cmd, error_message): """ Runs the specified command, and exits with an error if it fails """ err = subprocess.call(cmd, shell=True) if err != 0: log.error(error_message) sys.exit(1) def _get_remapping_data(self): """ Fetches the git repo containing the remapping data and pulls it into a temp directory. If it already exists, we just do a pull instead to make sure it is up-to-date. 
""" root = "/tmp/resource_remapping/" if not os.path.exists(root): os.makedirs(root) repo_path = os.path.join(root, "redirector") if not os.path.exists(repo_path): self._run_or_exit( "cd {dir}; git clone {repo}".format(dir=root, repo=GIT_REPO), "Failed to pull the remote repository at {0}".format(GIT_REPO)) elif not self.options.nopull: log.debug("Pulling latest code") self._run_or_exit( "cd {dir}; git pull origin master".format(dir=repo_path), "Failed to pull the remote repository at {0}".format(GIT_REPO)) else: log.debug("Code exists and nopull specified") return os.path.join(repo_path, "data/mappings")