def _get_url(self, resource, query):
    '''
    Given a resource, return the URL for the data.

    This allows a local cache to be used in preference to the
    resource.url. If we are going to use an external URL, then we can
    do a HEAD request to check it works and record the mimetype &
    length in the query dict.

    :param resource: resource object
    :param query: dict describing the properties of the data; this
        method fills in query['mimetype'] and (on success)
        query['length'] as side effects
    :returns: the usable URL/filepath, or None if nothing works
    '''
    url = None
    query['mimetype'] = None

    # Look for a local cache of the data file
    # e.g. "cache_filepath": "/mnt/shared/ckan_resource_cache/63/63b159d7-90c5-443b-846d-f700f74ea062/bian-anal-mca-2005-dols-eng-1011-0312-tab2.csv"
    cache_filepath = resource.extras.get('cache_filepath')
    if cache_filepath:
        if os.path.exists(cache_filepath.encode('utf8')):
            log.debug('Previewing local cached data: %s', cache_filepath)
            url = cache_filepath
        else:
            log.debug('Local cached data file missing: %s', cache_filepath)

    # Otherwise try the cache_url
    # This works well when running on a database copied from another
    # machine - all the cached files are missing locally, but it can use
    # them from the original machine using the cache_url.
    if not url and hasattr(resource, 'cache_url') and resource.cache_url:
        u = fix_url(resource.cache_url)
        # e.g. resource.cache_url = "http://data.gov.uk/data/resource_cache/07/0791d492-8ab9-4aae-b7e6-7ecae561faa3/bian-anal-mca-2005-dols-eng-1011-0312-qual.pdf"
        try:
            # HEAD request - we only need headers, not the body
            req = urllib2.Request(u)
            req.get_method = lambda: 'HEAD'
            r = urllib2.urlopen(req)
            if r.getcode() == 200:
                url = u
                # Use .get() so a missing Content-Length header does not
                # raise KeyError (matches the direct-URL branch below)
                query['length'] = r.info().get("content-length", 0)
                query['mimetype'] = r.info().get('content-type', None)
                log.debug('Previewing cache URL: %s', url)
        except Exception as e:
            # Best-effort: a broken cache_url just means we fall through
            # to the resource URL itself
            log.error(u"Request {0} with cache url {1}, {2}".format(
                identify_resource(resource), u, e))

    # Otherwise use the URL itself
    if not url:
        u = fix_url(resource.url)
        try:
            req = urllib2.Request(u)
            req.get_method = lambda: 'HEAD'
            r = urllib2.urlopen(req)
            if r.getcode() == 200:
                url = u
                query['length'] = r.info().get("content-length", 0)
                query['mimetype'] = r.info().get('content-type', None)
                log.debug('Previewing direct from URL: %s', url)
            elif r.getcode() > 400:
                # Hard failure on the only remaining source - no preview
                return None
        except Exception as e:
            log.error(u"Request {0} with url {1}, {2}".format(
                identify_resource(resource), u, e))
    return url
def _get_url(self, resource, query):
    '''
    Given a resource, return the URL for the data and a flag denoting
    whether the URL is to a local file (and therefore can ignore size
    limit checks.)

    This allows a local cache to be used in preference to the
    resource.url. If we are going to use an external URL, then we can
    do a HEAD request to check it works and record the mimetype &
    length in the query dict.

    :param resource: resource object
    :param query: dict describing the properties of the data; this
        method fills in query['mimetype'] and (on success)
        query['length'] as side effects
    :returns: (url_or_None, is_local_archived_file)
    '''
    # Local import keeps module import-time free of the requests
    # dependency until this code path is actually exercised
    from requests.exceptions import InvalidURL
    url = None
    archived = False
    query['mimetype'] = None
    archival = Archival.get_for_resource(resource.id)
    if archival:
        # Look for a local cache of the data file
        # e.g. "cache_filepath": "/mnt/shared/ckan_resource_cache/63/63b159d7-90c5-443b-846d-f700f74ea062/bian-anal-mca-2005-dols-eng-1011-0312-tab2.csv"
        if archival.cache_filepath:
            if os.path.exists(archival.cache_filepath.encode('utf8')):
                log.debug('Previewing local cached data: %s',
                          archival.cache_filepath)
                url = archival.cache_filepath
                archived = True
            else:
                log.debug('Local cached data file missing: %s',
                          archival.cache_filepath)
        else:
            log.debug('No cache_filepath for resource %s',
                      identify_resource(resource))
        # Otherwise try the cache_url
        # This works well when running on a database copied from another
        # machine - all the cached files are missing locally, but it can
        # use them from the original machine using the cache_url.
        if not url:
            if archival.cache_url:
                try:
                    u = fix_url(archival.cache_url)
                except InvalidURL:
                    log.error("Unable to fix the URL for resource: %s" %
                              identify_resource(resource))
                    return None, False
                # e.g. resource.cache_url = "http://data.gov.uk/data/resource_cache/07/0791d492-8ab9-4aae-b7e6-7ecae561faa3/bian-anal-mca-2005-dols-eng-1011-0312-qual.pdf"
                try:
                    # HEAD request - headers only, no body download
                    req = urllib2.Request(u)
                    req.get_method = lambda: 'HEAD'
                    r = urllib2.urlopen(req)
                    if r.getcode() == 200:
                        url = u
                        query['length'] = r.info().get("content-length", 0)
                        query['mimetype'] = r.info().get('content-type',
                                                         None)
                        log.debug('Previewing cache URL: %s', url)
                except Exception as e:
                    # Best-effort: fall through to the resource URL itself
                    log.error(
                        u"Request {0} with cache url {1}, {2}".format(
                            identify_resource(resource), u, e))
            else:
                log.debug('No cache_url for resource %s',
                          identify_resource(resource))
    else:
        log.debug('Resource is not archived: %s',
                  identify_resource(resource))

    # Otherwise use the URL itself
    if not url:
        try:
            u = fix_url(resource.url)
        except InvalidURL:
            log.error("Unable to fix the URL for resource: %s" %
                      identify_resource(resource))
            return None, False
        try:
            req = urllib2.Request(u)
            req.get_method = lambda: 'HEAD'
            r = urllib2.urlopen(req)
            if r.getcode() == 200:
                url = u
                query['length'] = r.info().get("content-length", 0)
                query['mimetype'] = r.info().get('content-type', None)
                log.debug('Previewing direct from URL: %s', url)
            elif r.getcode() > 400:
                # NOTE(review): tail of the original was truncated;
                # mirroring the legacy version's hard-failure guard,
                # adapted to this version's tuple return - confirm
                return None, False
        except Exception as e:
            log.error(u"Request {0} with url {1}, {2}".format(
                identify_resource(resource), u, e))
    # NOTE(review): final return reconstructed from the docstring's
    # contract (URL plus local-file flag) - the original text was cut off
    return url, archived