Ejemplo n.º 1
0
    def __call__(self, params):
        """Run the index query described by ``params``.

        Keys read from ``params``: ``mode`` (defaults to ``'index'``),
        ``output`` (defaults to ``self.DEF_OUTPUT``) and ``fields`` (a list,
        or a comma-separated string which is split into a list).

        :param dict params: The query params
        :return: A 3-tuple ``(out_headers, result, errs)``:

            * ``mode == 'list_sources'``: ``({}, source list, {})``
            * any other non-``'index'`` mode: ``({}, supported modes, {})``
            * unsupported ``output`` or no cdx iterator: ``(None, None, errs)``
            * otherwise: a ``Content-Type`` header dict, a generator of
              result lines (text lines are encoded as utf-8), and ``errs``
              from the index source
        """
        mode = params.get('mode', 'index')
        if mode == 'list_sources':
            return {}, self.index_source.get_source_list(params), {}

        if mode != 'index':
            return {}, self.get_supported_modes(), {}

        output = params.get('output', self.DEF_OUTPUT)
        fields = params.get('fields')

        if fields and isinstance(fields, str):
            fields = fields.split(',')

        # Look up the handler without a default: an unknown output must
        # produce None so the error branch below is taken. Passing ``fields``
        # as the default let an unsupported output slip through with a
        # non-callable "handler" whenever fields were supplied.
        handler = self.OUTPUTS.get(output)
        if not handler:
            errs = dict(last_exc=BadRequestException(
                'output={0} not supported'.format(output)))
            return None, None, errs

        cdx_iter, errs = self._load_index_source(params)
        if not cdx_iter:
            return None, None, errs

        content_type, res = handler(cdx_iter, fields, params)
        out_headers = {'Content-Type': content_type}

        def check_str(lines):
            # Lazily encode text lines as utf-8; other lines pass through
            for line in lines:
                if isinstance(line, six.text_type):
                    line = line.encode('utf-8')
                yield line

        return out_headers, check_str(res), errs
Ejemplo n.º 2
0
    def load_cdx(self, query):
        """Fetch cdx lines for ``query`` from the remote cdx server.

        When ``self.remote_processing`` is set the query is forwarded as-is;
        otherwise only its url and match type are sent.

        :param query: the cdx query to run remotely
        :return: an iterator over the remote response, or an empty list
            when the remote server answers 404
        :raises AccessException: on a 403 response
        :raises BadRequestException: on a 400 response
        :raises WbException: on any other HTTP error response
        """
        # Only send url and matchType to remote unless it does the processing
        remote_query = (query if self.remote_processing
                        else CDXQuery(url=query.url,
                                      match_type=query.match_type))

        full_url = self.remote_url + '?' + remote_query.urlencode()

        try:
            request = urllib2.Request(full_url)
            if self.cookie:
                request.add_header('Cookie', self.cookie)

            response = urllib2.urlopen(request)

        except urllib2.HTTPError as e:
            status = e.code

            if status == 404:
                # return empty list for consistency with other cdx sources
                # will be converted to 404 if no other retry
                return []

            if status == 403:
                raise AccessException('Access Denied')

            if status == 400:
                raise BadRequestException()

            raise WbException('Invalid response from remote cdx server')

        return iter(response)
Ejemplo n.º 3
0
    def load_index(self, params):
        """Loads the xml query index based on the supplied params

        Keys read from ``params``: ``url`` (defaults to ``''``),
        ``matchType`` (defaults to ``'exact'``), ``limit`` and ``closest``.

        :param dict[str, str] params: The query params
        :return: A list or generator of cdx objects
        :raises NotFoundException: If the query url is not found
        or the results of the query returns no cdx entries
        :raises BadRequestException: If the match type is not exact or prefix
        """
        closest = params.get('closest')

        url = params.get('url', '')

        matchType = params.get('matchType', 'exact')

        if matchType == 'exact':
            query = self.EXACT_QUERY
        elif matchType == 'prefix':
            query = self.PREFIX_QUERY
        else:
            # The placeholder is positional, so the value must be passed
            # positionally; a keyword argument made format() itself raise
            # IndexError before the intended exception could be built.
            raise BadRequestException('matchType={0} is not supported'.format(matchType))

        try:
            limit = params.get('limit')
            if limit:
                query = 'limit:{0} '.format(limit) + query

            # OpenSearch API requires double-escaping
            # TODO: add option to not double escape if needed
            query_url = self.query_api_url + '?q=' + quote_plus(query + quote_plus(url))
            self.logger.debug("Running query: %s", query_url)
            response = self.session.get(query_url)
            response.raise_for_status()

            results = etree.fromstring(response.content)

            items = results.find('results')

        except Exception:
            # Any request/parse failure is reported to the caller as "not
            # found"; the traceback is only printed when debugging.
            if self.logger.getEffectiveLevel() == logging.DEBUG:
                import traceback
                traceback.print_exc()

            raise NotFoundException('url {0} not found'.format(url))

        # find() returns None when there is no <results> element. Test for
        # that and for "no children" explicitly instead of relying on the
        # deprecated truth value of an Element.
        if items is None or len(items) == 0:
            raise NotFoundException('url {0} not found'.format(url))

        items = items.findall('result')

        if matchType == 'exact':
            cdx_iter = [self.convert_to_cdx(item) for item in items]
            if closest:
                cdx_iter = cdx_sort_closest(closest, cdx_iter, limit=10000)

        else:
            cdx_iter = self.prefix_query_iter(items)

        return cdx_iter
Ejemplo n.º 4
0
    def load_cdx(self, **params):
        """Query the warcbase endpoint and return cdx results for a url.

        Keys read from ``params``: ``output``, ``filter``, ``listColls``
        and ``url`` (required unless the list-all path is taken).

        :return: the newline-joined collection list when ``listColls`` is
            set and ``output`` is ``'text'``; otherwise the iterator built
            by ``self.iter_cdx`` (wrapped by ``self.iter_text`` for text
            output)
        :raises WbException: if the HTTP request itself fails
        :raises BadRequestException: on a status other than 200 or 500
        :raises NotFoundException: if the response body is empty
        """
        want_text = params.get('output') == 'text'

        # lookup collection prefix; the last 'prefix:' filter wins
        prefix = ''
        for flt in params.get('filter') or ():
            if flt.startswith('prefix:'):
                prefix = flt[7:]

        # special path for list all
        if want_text and params.get('listColls'):
            return '\n'.join(self._load_colls())

        url = params['url']

        # force http prefix
        if url.startswith(self.HTTPS_PREFIX):
            url = self.HTTP_PREFIX + url[len(self.HTTPS_PREFIX):]
        elif not url.startswith(self.HTTP_PREFIX):
            url = self.HTTP_PREFIX + url

        request_uri = self.warcbase_path + prefix + '*/' + url

        try:
            response = requests.get(request_uri)
        except Exception:
            raise WbException('Error reading from: ' + request_uri)

        status = response.status_code
        if status == 500:
            # presumably raises for an unknown collection -- confirm
            self._invalid_collection(prefix)
        elif status != 200:
            raise BadRequestException(
                'Invalid status code: {0}'.format(status))

        if len(response.content) == 0:
            not_found_msg = (
                'No captures found for <b>{0}</b> in collection <i>{1}</i>'
                .format(url, prefix.strip('/')))
            raise NotFoundException(not_found_msg, url=url)

        lines = response.content.rstrip().split('\n')

        # the first line is expected to hold exactly three tab-separated fields
        first_fields = lines[0].split('\t')
        if len(first_fields) != 3:
            self._invalid_collection(prefix)

        cdx_lines = self.iter_cdx(lines, url)
        return self.iter_text(cdx_lines) if want_text else cdx_lines
Ejemplo n.º 5
0
    def __call__(self, params):
        """Run the index query described by ``params``.

        Keys read from ``params``: ``mode`` (defaults to ``'index'``),
        ``output`` (defaults to ``self.DEF_OUTPUT``) and ``fields``, with
        ``fl`` used when ``fields`` is empty. A comma-separated string of
        fields is split into a list.

        :param dict params: The query params
        :return: A 3-tuple ``(out_headers, result, errs)``. For the
            ``list_sources`` mode and for unsupported modes the first and
            last items are empty dicts. On failure (unsupported output, no
            cdx iterator, or an error producing the first line) the result
            is ``(None, None, errs)`` with the exception under
            ``errs['last_exc']``. Otherwise ``result`` is a generator of
            lines in which text lines are encoded as utf-8.
        """
        mode = params.get('mode', 'index')
        if mode == 'list_sources':
            return {}, self.index_source.get_source_list(params), {}
        if mode != 'index':
            return {}, self.get_supported_modes(), {}

        output = params.get('output', self.DEF_OUTPUT)

        # 'fl' is the fallback when 'fields' is missing or empty
        fields = params.get('fields') or params.get('fl')
        if fields and isinstance(fields, str):
            fields = fields.split(',')

        handler = self.OUTPUTS.get(output)
        if not handler:
            unsupported = BadRequestException(
                'output={0} not supported'.format(output))
            return None, None, {'last_exc': unsupported}

        cdx_iter = None
        try:
            cdx_iter, errs = self._load_index_source(params)
        except BadRequestException as e:
            errs = {'last_exc': e}
        if not cdx_iter:
            return None, None, errs

        content_type, res = handler(cdx_iter, fields, params)
        out_headers = {'Content-Type': content_type}

        # raise exceptions early so that they can be handled properly
        try:
            first_line = next(res)
        except StopIteration:
            first_line = None
        except CDXException as e:
            return None, None, {'last_exc': e}

        def check_str(head, remaining):
            def as_bytes(item):
                if isinstance(item, six.text_type):
                    return item.encode('utf-8')
                return item

            # re-emit the line consumed above before the rest of the stream
            if head is not None:
                yield as_bytes(head)
            for item in remaining:
                yield as_bytes(item)

        return out_headers, check_str(first_line, res), errs
Ejemplo n.º 6
0
    def _load_index_source(self, params):
        """Run the fuzzy index lookup for the url in ``params``.

        When ``params`` carries an ``_input_req``, ``params['alt_url']`` is
        set from its ``include_method_query(url)`` before the lookup.

        :param dict params: The query params; ``url`` is required
        :return: the result of ``self.fuzzy(self.index_source, params)``,
            or ``(None, errs)`` with a BadRequestException under
            ``errs['last_exc']`` when ``url`` is missing or empty
        """
        url = params.get('url')
        if url:
            input_req = params.get('_input_req')
            if input_req:
                params['alt_url'] = input_req.include_method_query(url)

            return self.fuzzy(self.index_source, params)

        missing_url = BadRequestException('The "url" param is required')
        return None, {'last_exc': missing_url}
Ejemplo n.º 7
0
    def _load_index_source(self, params):
        """Run the fuzzy index lookup and apply the access checker, if any.

        When ``params`` carries an ``_input_req``, ``params['alt_url']`` is
        set from its ``include_method_query(url)`` before the lookup.

        :param dict params: The query params; ``url`` is required
        :return: the result of ``self.fuzzy(self.index_source, params)``,
            passed through ``self.access_checker`` when one is configured,
            or ``(None, errs)`` with a BadRequestException under
            ``errs['last_exc']`` when ``url`` is missing or empty
        """
        url = params.get('url')
        if not url:
            errs = dict(last_exc=BadRequestException('The "url" param is required'))
            return None, errs

        input_req = params.get('_input_req')
        if input_req:
            params['alt_url'] = input_req.include_method_query(url)

        cdx_iter = self.fuzzy(self.index_source, params)

        if self.access_checker:
            # '_input_req' is optional (see the check above), so do not index
            # params directly: without a request there is no ACL user.
            acl_user = (input_req.env.get("HTTP_X_PYWB_ACL_USER")
                        if input_req else None)
            cdx_iter = self.access_checker(cdx_iter, acl_user)

        return cdx_iter
Ejemplo n.º 8
0
    def _parse_extra(self):
        """Flag latest-replay requests as timegate and apply Accept-Datetime.

        Does nothing unless ``self.wb_url`` is set and its type is
        ``LATEST_REPLAY``. Otherwise ``self.options['is_timegate']`` is set
        and, if the ``HTTP_ACCEPT_DATETIME`` env entry is present, its value
        is converted to a timestamp and set as the replay timestamp.

        :raises BadRequestException: if the Accept-Datetime value cannot be
            converted
        """
        wb_url = self.wb_url
        if not wb_url or wb_url.type != wb_url.LATEST_REPLAY:
            return

        self.options['is_timegate'] = True

        accept_datetime = self.env.get('HTTP_ACCEPT_DATETIME')
        if accept_datetime:
            try:
                timestamp = http_date_to_timestamp(accept_datetime)
            except Exception:
                raise BadRequestException('Invalid Accept-Datetime: ' +
                                          accept_datetime)

            wb_url.set_replay_timestamp(timestamp)
Ejemplo n.º 9
0
    def handle_connect(self, env):
        """Handle an HTTPS proxy CONNECT by reading the tunnelled request.

        Acknowledges the CONNECT on the raw request socket, wraps the socket
        in server-side SSL using a certificate for the requested host, then
        reads the inner request line and headers and rewrites ``env`` so it
        describes that inner ``https://`` request.

        :param dict env: the request environ; updated in place
        :return: a 405 text ``WbResponse`` when no request socket is
            available, otherwise ``None``
        :raises BadRequestException: if the SSL handshake or the read of the
            request line fails, or the request line has fewer than 3 parts
        """
        sock = self.get_request_socket(env)
        if not sock:
            return WbResponse.text_response('HTTPS Proxy Not Supported',
                                            '405 HTTPS Proxy Not Supported')

        sock.send(b'HTTP/1.0 200 Connection Established\r\n')
        sock.send(b'Proxy-Connection: close\r\n')
        sock.send(b'Server: pywb proxy\r\n')
        sock.send(b'\r\n')

        # The CONNECT target is expected in host:port form
        hostname, port = env['REL_REQUEST_URI'].split(':')

        if not self.use_wildcard:
            certfile = self.ca.cert_for_host(hostname)
        else:
            certfile = self.ca.get_wildcard_cert(hostname)

        try:
            # NOTE(review): ssl.wrap_socket is deprecated in newer Python
            # releases in favour of SSLContext.wrap_socket -- confirm the
            # interpreter versions this needs to support.
            ssl_sock = ssl.wrap_socket(
                sock,
                server_side=True,
                certfile=certfile,
                #ciphers="ALL",
                suppress_ragged_eofs=False,
                ssl_version=ssl.PROTOCOL_SSLv23)
            env['pywb.proxy_ssl_sock'] = ssl_sock

            buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)

            statusline = to_native_str(buffreader.readline().rstrip())

        except Exception as se:
            # Exceptions have no ``.message`` attribute on Python 3; reading
            # it raised AttributeError and hid the real failure.
            raise BadRequestException(str(se))

        statusparts = statusline.split(' ')

        if len(statusparts) < 3:
            raise BadRequestException('Invalid Proxy Request: ' + statusline)

        env['REQUEST_METHOD'] = statusparts[0]
        # Rebuild the absolute url, dropping the default https port
        env['REL_REQUEST_URI'] = ('https://' +
                                  env['REL_REQUEST_URI'].replace(':443', '') +
                                  statusparts[1])

        env['SERVER_PROTOCOL'] = statusparts[2].strip()

        env['pywb.proxy_scheme'] = 'https'

        env['pywb.proxy_host'] = hostname
        env['pywb.proxy_port'] = port
        env['pywb.proxy_req_uri'] = statusparts[1]

        queryparts = env['REL_REQUEST_URI'].split('?', 1)
        env['PATH_INFO'] = queryparts[0]
        env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
        env['pywb.proxy_query'] = env['QUERY_STRING']

        # Read header lines until the first empty line (or end of stream)
        while True:
            line = to_native_str(buffreader.readline())
            if line:
                line = line.rstrip()

            if not line:
                break

            parts = line.split(':', 1)
            if len(parts) < 2:
                # not a "name: value" line; skip it
                continue

            name = parts[0].strip()
            value = parts[1].strip()

            # Store under the WSGI-style key: upper-case, dashes as
            # underscores, HTTP_ prefix except for the two content headers
            name = name.replace('-', '_').upper()

            if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = 'HTTP_' + name

            env[name] = value

        # Whatever follows the headers is left to be read as the request body
        env['wsgi.input'] = buffreader