Ejemplo n.º 1
0
    def _get_url(self, resource, query):
        '''
        Given a resource, work out the URL to read its data from.

        This allows a local cache to be used in preference to the
        resource.url: a local cached file (``cache_filepath`` in the resource
        extras) is tried first, then the resource's ``cache_url``.

        When an external URL is used, a HEAD request checks that it works and
        the mimetype & length it reports are recorded in the query dict.

        NOTE(review): only the part of this method up to the cache_url probe
        is visible here; the fall-back to resource.url and the return
        statement are presumably further down - confirm against the full
        file.

        :param resource: resource object
        :param query: dict describing the properties of the data. Its
            'mimetype' entry is reset to None on entry; 'length' and
            'mimetype' are filled in from the response headers when a HEAD
            request returns 200.
        '''
        url = None
        query['mimetype'] = None

        # Look for a local cache of the data file
        # e.g. "cache_filepath": "/mnt/shared/ckan_resource_cache/63/63b159d7-90c5-443b-846d-f700f74ea062/bian-anal-mca-2005-dols-eng-1011-0312-tab2.csv"
        cache_filepath = resource.extras.get('cache_filepath')
        if cache_filepath:
            if os.path.exists(cache_filepath.encode('utf8')):
                log.debug('Previewing local cached data: %s', cache_filepath)
                url = cache_filepath
            else:
                log.debug('Local cached data file missing: %s', cache_filepath)

        # Otherwise try the cache_url
        # This works well when running on a database copied from another
        # machine - all the cached files are missing locally, but it can use
        # them from the original machine using the cache_url.
        if not url and hasattr(resource, 'cache_url') and resource.cache_url:
            u = fix_url(resource.cache_url)

            # e.g. resource.cache_url = "http://data.gov.uk/data/resource_cache/07/0791d492-8ab9-4aae-b7e6-7ecae561faa3/bian-anal-mca-2005-dols-eng-1011-0312-qual.pdf"
            try:
                req = urllib2.Request(u)
                req.get_method = lambda: 'HEAD'

                r = urllib2.urlopen(req)
                try:
                    if r.getcode() == 200:
                        headers = r.info()
                        url = u
                        # Servers may omit Content-Length (e.g. chunked
                        # responses); indexing the header directly raised
                        # KeyError, which skipped the mimetype and logged a
                        # spurious error for a URL that actually works.
                        query['length'] = headers.get('content-length', 0)
                        query['mimetype'] = headers.get('content-type', None)
                        log.debug('Previewing cache URL: %s', url)
                finally:
                    # Only the headers are needed; release the connection.
                    r.close()
            except Exception as e:
                # Best effort: a failed probe is logged and url is left unset
                # (or as already set) rather than propagating the error.
                log.error(u"Request {0} with cache url {1}, {2}".format(identify_resource(resource), u, e))
Ejemplo n.º 2
0
            try:
                req = urllib2.Request(u)
                req.get_method = lambda: 'HEAD'

                r = urllib2.urlopen(req)
                if r.getcode() == 200:
                    url = u
                    query['length'] = r.info()["content-length"]
                    query['mimetype'] = r.info().get('content-type', None)
                    log.debug('Previewing cache URL: %s', url)
            except Exception, e:
                log.error(u"Request {0} with cache url {1}, {2}".format(identify_resource(resource), u, e))

        # Otherwise use the URL itself
        if not url:
            u = fix_url(resource.url)
            try:
                req = urllib2.Request(u)
                req.get_method = lambda: 'HEAD'

                r = urllib2.urlopen(req)
                if r.getcode() == 200:
                    url = u
                    query['length'] = r.info()["content-length"]
                    query['mimetype'] = r.info().get('content-type', None)
                    log.debug('Previewing direct from URL: %s', url)
                elif r.getcode() > 400:
                    return None

            except Exception, e:
                log.error(u"Request {0} with url {1}, {2}".format(identify_resource(resource), u, e))
    def _get_url(self, resource, query):
        '''
        Given a resource, work out the URL for the data and a flag denoting
        whether the URL is to a local file (and therefore can ignore size
        limit checks.)

        This allows a local cache to be used in preference to the
        resource.url: the Archival record's ``cache_filepath`` is tried
        first, then its ``cache_url``.

        When an external URL is used, a HEAD request checks that it works and
        the mimetype & length it reports are recorded in the query dict.

        NOTE(review): only the part of this method up to the cache_url
        branch is visible here; the fall-back to resource.url and the final
        return are presumably further down - confirm against the full file.

        :param resource: resource object
        :param query: dict describing the properties of the data. Its
            'mimetype' entry is reset to None on entry; 'length' and
            'mimetype' are filled in from the response headers when a HEAD
            request returns 200.
        :returns: ``(None, False)`` if the cache URL cannot be fixed up
            (fix_url raises InvalidURL).
        '''
        from requests.exceptions import InvalidURL

        url = None
        # Set to True only when the local cached file is used.
        archived = False
        query['mimetype'] = None
        archival = Archival.get_for_resource(resource.id)

        if archival:
            # Look for a local cache of the data file
            # e.g. "cache_filepath": "/mnt/shared/ckan_resource_cache/63/63b159d7-90c5-443b-846d-f700f74ea062/bian-anal-mca-2005-dols-eng-1011-0312-tab2.csv"
            if archival.cache_filepath:
                if os.path.exists(archival.cache_filepath.encode('utf8')):
                    log.debug('Previewing local cached data: %s',
                              archival.cache_filepath)
                    url = archival.cache_filepath
                    archived = True
                else:
                    log.debug('Local cached data file missing: %s',
                              archival.cache_filepath)
            else:
                log.debug('No cache_filepath for resource %s',
                          identify_resource(resource))

            # Otherwise try the cache_url
            # This works well when running on a database copied from another
            # machine - all the cached files are missing locally, but it can use
            # them from the original machine using the cache_url.
            if not url:
                if archival.cache_url:
                    try:
                        u = fix_url(archival.cache_url)
                    except InvalidURL:
                        log.error("Unable to fix the URL for resource: %s",
                                  identify_resource(resource))
                        return None, False

                    # e.g. resource.cache_url = "http://data.gov.uk/data/resource_cache/07/0791d492-8ab9-4aae-b7e6-7ecae561faa3/bian-anal-mca-2005-dols-eng-1011-0312-qual.pdf"
                    try:
                        req = urllib2.Request(u)
                        req.get_method = lambda: 'HEAD'

                        r = urllib2.urlopen(req)
                        try:
                            if r.getcode() == 200:
                                headers = r.info()
                                url = u
                                query['length'] = headers.get(
                                    "content-length", 0)
                                query['mimetype'] = headers.get(
                                    'content-type', None)
                                log.debug('Previewing cache URL: %s', url)
                        finally:
                            # Only the headers are needed; release the
                            # connection instead of leaving it to the GC.
                            r.close()
                    except Exception as e:
                        # Best effort: a failed probe is logged and the
                        # method carries on without a cache URL.
                        log.error(
                            u"Request {0} with cache url {1}, {2}".format(
                                identify_resource(resource), u, e))
                else:
                    log.debug('No cache_url for resource %s',
                              identify_resource(resource))
Ejemplo n.º 4
0
    def _get_url(self, resource, query):
        '''
        Given a resource, work out the URL for the data and a flag denoting
        whether the URL is to a local file (and therefore can ignore size
        limit checks.)

        This allows a local cache to be used in preference to the
        resource.url: the Archival record's ``cache_filepath`` is tried
        first, then its ``cache_url``.

        When an external URL is used, a HEAD request checks that it works and
        the mimetype & length it reports are recorded in the query dict.

        NOTE(review): the text following this part of the method looks
        spliced/truncated in this copy, so the fall-back to resource.url and
        the final return cannot be confirmed from here - check the full file.

        :param resource: resource object
        :param query: dict describing the properties of the data. Its
            'mimetype' entry is reset to None on entry; 'length' and
            'mimetype' are filled in from the response headers when a HEAD
            request returns 200.
        :returns: ``(None, False)`` if the cache URL cannot be fixed up
            (fix_url raises InvalidURL).
        '''
        from requests.exceptions import InvalidURL

        url = None
        # Set to True only when the local cached file is used.
        archived = False
        query['mimetype'] = None
        archival = Archival.get_for_resource(resource.id)

        if archival:
            # Look for a local cache of the data file
            # e.g. "cache_filepath": "/mnt/shared/ckan_resource_cache/63/63b159d7-90c5-443b-846d-f700f74ea062/bian-anal-mca-2005-dols-eng-1011-0312-tab2.csv"
            if archival.cache_filepath:
                if os.path.exists(archival.cache_filepath.encode('utf8')):
                    log.debug('Previewing local cached data: %s', archival.cache_filepath)
                    url = archival.cache_filepath
                    archived = True
                else:
                    log.debug('Local cached data file missing: %s', archival.cache_filepath)
            else:
                log.debug('No cache_filepath for resource %s', identify_resource(resource))

            # Otherwise try the cache_url
            # This works well when running on a database copied from another
            # machine - all the cached files are missing locally, but it can use
            # them from the original machine using the cache_url.
            if not url:
                if archival.cache_url:
                    try:
                        u = fix_url(archival.cache_url)
                    except InvalidURL:
                        log.error("Unable to fix the URL for resource: %s", identify_resource(resource))
                        return None, False

                    # e.g. resource.cache_url = "http://data.gov.uk/data/resource_cache/07/0791d492-8ab9-4aae-b7e6-7ecae561faa3/bian-anal-mca-2005-dols-eng-1011-0312-qual.pdf"
                    try:
                        req = urllib2.Request(u)
                        req.get_method = lambda: 'HEAD'

                        r = urllib2.urlopen(req)
                        try:
                            if r.getcode() == 200:
                                headers = r.info()
                                url = u
                                query['length'] = headers.get("content-length", 0)
                                query['mimetype'] = headers.get('content-type', None)
                                log.debug('Previewing cache URL: %s', url)
                        finally:
                            # Only the headers are needed; release the
                            # connection instead of leaving it to the GC.
                            r.close()
                    except Exception as e:
                        # Best effort: a failed probe is logged and the
                        # method carries on without a cache URL.
                        log.error(u"Request {0} with cache url {1}, {2}".format(identify_resource(resource), u, e))
                else:
                    log.debug('No cache_url for resource %s', identify_resource(resource))
                            log.debug('Previewing cache URL: %s', url)
                    except Exception, e:
                        log.error(
                            u"Request {0} with cache url {1}, {2}".format(
                                identify_resource(resource), u, e))
                else:
                    log.debug('No cache_url for resource %s',
                              identify_resource(resource))
        else:
            log.debug('Resource is not archived: %s',
                      identify_resource(resource))

        # Otherwise use the URL itself
        if not url:
            try:
                u = fix_url(resource.url)
            except InvalidURL:
                log.error("Unable to fix the URL for resource: %s" %
                          identify_resource(resource))
                return None, False

            try:
                req = urllib2.Request(u)
                req.get_method = lambda: 'HEAD'

                r = urllib2.urlopen(req)
                if r.getcode() == 200:
                    url = u
                    query['length'] = r.info().get("content-length", 0)
                    query['mimetype'] = r.info().get('content-type', None)
                    log.debug('Previewing direct from URL: %s', url)