Exemple #1
0
    def crawl_archives(self):
        """Bring the search index in line with the archive storage.

        Collects the URL of every stored archive (``view_url`` when set,
        otherwise ``url``), passes each through ``aws.swap_url`` using the
        ``castle.aws_s3_base_url`` registry value, and compares the result
        with the ids ``get_all_from_es`` returns for the ``archives``
        sitemap.  URLs missing from the index are crawled; indexed ids that
        no longer correspond to an archive are deleted from the index.

        A failure while crawling a single URL is logged and does not stop
        the remaining URLs from being processed.
        """
        registry = getUtility(IRegistry)
        base_url = registry.get('castle.aws_s3_base_url', None)

        storage = archival.Storage(self.site)
        # Only the archive records are needed here, not their storage keys.
        archive_urls = {
            aws.swap_url(
                archive_data.get('view_url', None) or archive_data['url'],
                base_url=base_url)
            for archive_data in storage.archives.values()
        }

        query = {"bool": {"filter": {"term": {"sitemap": "archives"}}}}
        indexed_urls = set(self.get_all_from_es(query))

        # archives do not need to be re-indexed ever, so only the pages
        # that have not yet been crawled are visited
        for _id in archive_urls - indexed_urls:
            try:
                self.crawl_archive_url(_id)
            except Exception:
                # %-style arguments: building the message cannot itself
                # raise inside the handler when the id is not a string
                logger.error('Error indexing archive url: %s', _id,
                             exc_info=True)

        # pages that have been removed from the archive
        for _id in indexed_urls - archive_urls:
            self.delete_from_index(_id)
    def test_swap_url(self):
        """swap_url maps an S3 archive URL onto the given base URL."""
        base = 'http://foo.com/'
        s3_location = 'https://s3-us-gov-west-1.amazonaws.com/bucketname/archives/path/to/resource'
        expected = 'http://foo.com/archives/path/to/resource'

        self.assertEqual(expected, aws.swap_url(s3_location, base_url=base))
Exemple #3
0
    def __call__(self):
        """Answer a not-found request: redirect to an archive or render 404.

        The original context is kept on ``self.notfound`` and the view is
        re-rooted on the portal.  URLs containing ``++`` get an immediate
        404.  Otherwise the requested path (relative to the site URL, with a
        trailing ``/view`` stripped) is looked up in the archive storage,
        either by uid for ``/resolveuid`` paths or by path.  When an archive
        URL is found, ``Redirect`` is raised with the swapped URL and the
        original query string appended.  If nothing matches,
        ``attempt_redirect`` is tried and finally the 404 template is
        rendered with a 404 status.

        Raises:
            Redirect: when an archived copy of the requested URL exists.
        """
        shield.protect(self.request, recheck=True)
        self.notfound = self.context
        self.context = api.portal.get()
        if '++' in self.request.URL:
            self.request.response.setStatus(404)
            try:
                return self.index()
            except Exception:
                # logger.warn is a deprecated alias; keep the traceback so
                # the template failure can actually be diagnosed
                logger.warning(
                    "Failed to render 404 template, had to return simple response",
                    exc_info=True
                )
                return "not found"

        archive_storage = archival.Storage(self.context)
        site_url = self.context.absolute_url()
        path = self.request.ACTUAL_URL[len(site_url):].rstrip('/')

        wants_view = False
        if path.endswith('/view'):
            wants_view = True
            path = path.rsplit('/view', 1)[0]

        new_url = None
        try:
            if path.startswith('/resolveuid'):
                uid = path.replace('/resolveuid/', '')
                new_url = archive_storage.get_archive_url_by_uid(uid)
            else:
                new_url = archive_storage.get_archive_url_by_path(
                    path, wants_view)
        except Exception:
            # The archive lookup is best-effort: a failure must still end in
            # a normal 404, but it is recorded instead of silently dropped.
            logger.debug('Archive lookup failed for %s', path, exc_info=True)

        if new_url:
            # XXX need to force redirect this way since normal redirect
            # gets overridden with 404
            if self.request.environ.get('QUERY_STRING'):
                new_url += '?' + self.request.environ['QUERY_STRING']
            raise Redirect(aws.swap_url(new_url))

        self.attempt_redirect()

        self.request.response.setStatus(404)
        return self.index()
Exemple #4
0
def fix_urls(storage, dom):
    """Rewrite every S3 URL found in ``dom`` through ``aws.swap_url``.

    Each class in ``storage.Movers`` is instantiated on ``dom`` and asked
    for its elements.  Elements without a URL, or whose URL is not
    recognised by ``is_s3_url`` for the storage's S3 endpoint, are left
    alone.  For the rest the URL is replaced in place via ``mover.modify``;
    a non-empty ``original-url`` attribute already present on the element
    is written back afterwards.
    """
    endpoint = urlparse(storage.s3_conn.meta.client.meta.endpoint_url)
    for mover_factory in storage.Movers:
        mover = mover_factory(dom)
        for element in mover.get_elements():
            link = mover.get_url(element)
            # skip elements without a url or with a url that is not on s3
            if link is None or not is_s3_url(link, endpoint):
                continue

            # need to maintain the original original: remember it before
            # modifying the element and restore it afterwards
            preserved = element.attrib.get('original-url')
            mover.modify(element, aws.swap_url(link))
            if preserved:
                element.attrib['original-url'] = preserved
def fix_urls(storage, dom):
    """Rewrite URLs in ``dom`` that live on the storage's S3 server.

    Each class in ``storage.Movers`` is instantiated on ``dom`` and asked
    for its elements.  An element is rewritten only when it has a URL whose
    network location equals ``storage.s3_conn.server_name()``; the new value
    comes from ``aws.swap_url``.  A non-empty ``original-url`` attribute
    already present on the element is written back after the modification.
    """
    for mover_cls in storage.Movers:
        mover = mover_cls(dom)
        for node in mover.get_elements():
            href = mover.get_url(node)
            if href is None:
                continue
            # check that the url is an s3 url
            if urlparse(href).netloc != storage.s3_conn.server_name():
                continue

            # need to maintain the original original: remember it before
            # modifying the element and restore it afterwards
            saved_original = node.attrib.get('original-url')
            mover.modify(node, aws.swap_url(href))
            if saved_original:
                node.attrib['original-url'] = saved_original
Exemple #6
0
    def __call__(self):
        """Answer a not-found request: redirect to an archive or render 404.

        The original context is kept on ``self.notfound`` and the view is
        re-rooted on the portal.  The requested path (relative to the site
        URL, with a trailing ``/view`` stripped) is looked up in the archive
        storage, either by uid for ``/resolveuid`` paths or by path.  When an
        archive URL is found, ``Redirect`` is raised with the swapped URL and
        the original query string appended.  Otherwise the
        ``plone_redirector_view`` adapter, if available, gets a chance to
        redirect before the template is rendered.

        Raises:
            Redirect: when an archived copy of the requested URL exists.
        """
        shield.protect(self.request)

        self.notfound = self.context
        self.context = api.portal.get()
        archive_storage = archival.Storage(self.context)
        site_url = self.context.absolute_url()
        path = self.request.ACTUAL_URL[len(site_url):].rstrip('/')

        wants_view = False
        if path.endswith('/view'):
            wants_view = True
            path = path.rsplit('/view', 1)[0]

        new_url = None
        if path.startswith('/resolveuid'):
            uid = path.replace('/resolveuid/', '')
            try:
                new_url = archive_storage.get_archive_url_by_uid(uid)
            except Exception:
                # Best-effort lookup; a bare ``except`` would also swallow
                # SystemExit and KeyboardInterrupt.
                pass
        else:
            try:
                new_url = archive_storage.get_archive_url_by_path(path, wants_view)
            except Exception:
                # Best-effort lookup, see above.
                pass
        if new_url:
            # XXX need to force redirect this way since normal redirect
            # gets overridden with 404
            if self.request.environ.get('QUERY_STRING'):
                new_url += '?' + self.request.environ['QUERY_STRING']
            raise Redirect(aws.swap_url(new_url))

        # seems this overrides plone.app.redirector handler
        redirector = queryMultiAdapter((self.context, self.request),
                                       name=u'plone_redirector_view')
        if redirector:
            redirector.attempt_redirect()

        return self.index()
Exemple #7
0
    def transform_content(self, content, from_url):
        """Move same-site resources referenced by ``content`` and rewrite links.

        ``content`` is parsed with ``fromstring`` and every class in
        ``self.Movers`` is run over the resulting dom.  Each URL a mover
        reports is made absolute against ``from_url``; URLs on another domain
        are skipped.  A URL not yet present in ``self.resources`` is passed
        to ``move_resource`` and, when that returns a value, cached there.
        Elements whose URL is in ``self.resources`` are modified to point at
        the swapped moved URL.  Finally every registered
        ``IArchiveContentTransformer`` utility is applied to the serialized
        dom; a failing utility is logged and skipped.

        Args:
            content: markup to transform.
            from_url: URL the content was fetched from; used to resolve
                relative links and to decide which links are on-site.

        Returns:
            The transformed, serialized content.
        """
        parsed_url = urlparse(from_url)
        domain = parsed_url.netloc
        dom = fromstring(content)
        for Mover in self.Movers:
            mover = Mover(dom)
            for el in mover.get_elements():
                url = mover.get_url(el)
                if not url:
                    # None or empty (e.g. src=""): nothing to move, and
                    # indexing url[0] below would raise IndexError on ''
                    continue

                if url.startswith('//'):
                    # protocol-relative url: it already carries its own
                    # host, so only the scheme is inherited from the page
                    url = '{}:{}'.format(parsed_url.scheme, url)
                elif url[0] == '/':
                    url = '{}://{}{}'.format(parsed_url.scheme, domain, url)
                elif 'https://' not in url and 'http://' not in url:
                    url = urljoin(from_url, url)

                # check that the url is on the site...
                rdomain = urlparse(url).netloc
                if rdomain and domain != rdomain:
                    continue
                if url not in self.resources:
                    # need to move resource
                    resource_url = url
                    if not url.startswith('http'):
                        resource_url = urljoin(from_url, url)
                    moved_url = self.move_resource(resource_url,
                                                   mover.keep_ext)
                    if moved_url:
                        self.resources[url] = moved_url
                if url in self.resources:
                    mover.modify(el, aws.swap_url(self.resources[url]))
        content = tostring(dom)
        for Util in getAllUtilitiesRegisteredFor(IArchiveContentTransformer):
            try:
                util = Util(self)
                content = util(content)
            except Exception:
                # one broken transformer must not abort the whole archive
                logger.info('Error with archive utility', exc_info=True)
        return content