Ejemplo n.º 1
0
    def aiohttp_session_cookies(self):
        """Merge the relevant session cookies into one SimpleCookie for aiohttp.

        Every entry of ``self.session_cookies`` is converted with
        ``self.cookiejar_cookie_to_simplecookie`` and folded into a single
        ``SimpleCookie``, which is returned.
        """
        merged = SimpleCookie()
        # Fold the converted cookies in iteration order; a later entry with
        # the same name replaces an earlier one.
        for jar_cookie in self.session_cookies:
            merged.update(self.cookiejar_cookie_to_simplecookie(jar_cookie))
        return merged
Ejemplo n.º 2
0
    async def open(
        self,
        path: str,
        *,
        method: str = "GET",
        headers: Optional[Union[dict, CIMultiDict]] = None,
        data: Any = None,
        form: Optional[dict] = None,
        query_string: Optional[dict] = None,
        json: Any = sentinel,
        scheme: str = "http",
        cookies: Optional[dict] = None,
        stream: bool = False,
        allow_redirects: bool = True,
    ):
        """Open a request to the app associated with this client.

        Arguments:
            path
                The path to request. If the query_string argument is not
                defined this argument will be partitioned on a '?' with the
                following part being considered the query_string.

            method
                The method to make the request with, defaults to 'GET'.

            headers
                Headers to include in the request.

            data
                Raw data (str or bytes) to send in the request body, or an
                async generator whose items are sent as body chunks.

            form
                Data to send form encoded in the request body.

            query_string
                To send as a dictionary, alternatively the query_string can be
                determined from the path.

            json
                Data to send json encoded in the request body.

            scheme
                The scheme to use in the request, default http.

            cookies
                Cookies to send in the request instead of cookies in
                TestClient.cookie_jar

            stream
                Return the response in streaming instead of buffering

            allow_redirects
                If set to True follows redirects

        Returns:
            The response from the app handling the request.

        Raises:
            ValueError: if more than one of 'json', 'form' and 'data' is given.
        """
        input_queue: asyncio.Queue[dict] = asyncio.Queue()
        output_queue: asyncio.Queue[dict] = asyncio.Queue()

        headers, path, query_string_bytes = make_test_headers_path_and_query_string(
            self.application, path, headers, query_string)

        if [json is not sentinel, form is not None, data is not None
            ].count(True) > 1:
            raise ValueError(
                "Test args 'json', 'form', and 'data' are mutually exclusive")

        # Body used for the single-message (non async-generator) request.
        request_data = b""

        if isinstance(data, str):
            request_data = data.encode("utf-8")
        elif isinstance(data, bytes):
            request_data = data

        if json is not sentinel:
            request_data = dumps(json).encode("utf-8")
            headers["Content-Type"] = "application/json"

        if form is not None:
            request_data = urlencode(form).encode("utf-8")
            headers["Content-Type"] = "application/x-www-form-urlencoded"

        if cookies is None:  # use TestClient.cookie_jar
            cookie_jar = self.cookie_jar
        else:
            cookie_jar = SimpleCookie(cookies)

        # Join the cookies with "; " so that several cookies still form one
        # single-line Cookie header value. The default separator of
        # SimpleCookie.output() is CRLF, which would embed line breaks in the
        # header as soon as the jar holds more than one cookie.
        cookie_header = (cookie_jar.output(header="", sep="; ")
                         if cookie_jar else "")
        if cookie_header:
            headers.add("Cookie", cookie_header)

        # Headers are handed to the app as (lower-cased name, value) byte pairs.
        flat_headers: List[Tuple] = [(bytes(k.lower(),
                                            "utf8"), bytes(v, "utf8"))
                                     for k, v in headers.items()]

        scope = {
            "type": "http",
            "http_version": "1.1",
            "asgi": {
                "version": "3.0"
            },
            "method": method,
            "scheme": scheme,
            "path": path,
            "query_string": query_string_bytes,
            "root_path": "",
            "headers": flat_headers,
        }

        # The app reads its events from input_queue and writes them to
        # output_queue.
        create_monitored_task(
            self.application(scope, input_queue.get, output_queue.put),
            output_queue.put_nowait,
        )

        send = input_queue.put_nowait
        receive_or_fail = partial(receive, output_queue, timeout=self.timeout)

        # Send request
        if inspect.isasyncgen(data):
            # Stream the generator as a sequence of body chunks, flagging the
            # last one with more_body=False.
            async for is_last, body in is_last_one(data):
                send({
                    "type": "http.request",
                    "body": body,
                    "more_body": not is_last
                })
        else:
            send({"type": "http.request", "body": request_data})

        response = Response(stream, receive_or_fail, send)

        # Receive response start
        message = await self.wait_response(receive_or_fail,
                                           "http.response.start")
        response.status_code = message["status"]
        response.headers = CIMultiDict([(k.decode("utf8"), v.decode("utf8"))
                                        for k, v in message["headers"]])

        # Receive initial response body
        message = await self.wait_response(receive_or_fail,
                                           "http.response.body")
        response.raw.write(message["body"])
        response._more_body = message.get("more_body", False)

        # Consume the remaining response if not in stream
        if not stream:
            bytes_io = BytesRW()
            bytes_io.write(response.raw.read())
            async for chunk in response:
                bytes_io.write(chunk)
            response.raw = bytes_io
            response._content = bytes_io.read()
            response._content_consumed = True

        if cookie_jar is not None:
            # Expose the Set-Cookie values on the response and remember them
            # in the jar that was used for this request.
            response_cookies = SimpleCookie()
            for c in response.headers.getall("Set-Cookie", ""):
                response_cookies.load(c)
            response.cookies = requests.cookies.RequestsCookieJar()
            response.cookies.update(response_cookies)
            cookie_jar.update(response_cookies)

        if allow_redirects and response.is_redirect:
            # NOTE(review): the redirect is re-issued through self.get(path)
            # only; none of this call's other arguments are forwarded.
            path = response.headers["location"]
            return await self.get(path)
        else:
            return response
Ejemplo n.º 3
0
class Client(object):
    """
    A class that can act as a client for testing purposes.

    It allows the user to compose GET and POST requests, and
    obtain the response that the server gave to those requests.
    The server Response objects are annotated with the details
    of the contexts and templates that were rendered during the
    process of serving the request.

    Client objects are stateful - they will retain cookie (and
    thus session) details for the lifetime of the Client instance.

    This is not intended as a replacement for Twill/Selenium or
    the like - it is here to allow testing against the
    contexts and templates produced by a view, rather than the
    HTML rendered to the end-user.
    """
    def __init__(self, cookies=None, handler_class=LocalHandler, **defaults):
        """
        Sets up the handler, the default environment entries and the cookie
        jar. ``cookies`` seeds the jar; ``defaults`` are merged over the
        built-in default environment on every request.
        """
        self.handler = handler_class()
        self.defaults = {'SERVER_NAME': 'localserver'}
        self.defaults.update(defaults)
        self.cookies = SimpleCookie(cookies or {})
        self.exc_info = None
        self.errors = StringIO()

    def _session(self):
        """
        Obtains the current session variables.

        Returns a SessionStore built from the session cookie when the
        sessions app is installed and the cookie is present, otherwise an
        empty dict.
        """
        if 'django.contrib.sessions' in settings.INSTALLED_APPS:
            engine = import_module(settings.SESSION_ENGINE)
            cookie = self.cookies.get(settings.SESSION_COOKIE_NAME, None)
            if cookie:
                return engine.SessionStore(cookie.value)
        return {}

    session = property(_session)

    def request(self, **request):
        """
        The master request method. Composes the environment dictionary
        and passes to the handler, returning the result of the handler.
        Assumes defaults for the query environment, which can be overridden
        using the arguments to the request.

        If the handler fails because the '500.html' template is missing, no
        response exists; the stored underlying exception is raised when one
        was recorded in ``self.exc_info``, otherwise the template error is
        re-raised.
        """
        environ = {
            'HTTP_COOKIE': self.cookies.output(header='', sep='; '),
            'PATH_INFO': '/',
            'QUERY_STRING': '',
            'REMOTE_ADDR': '127.0.0.1',
            'REQUEST_METHOD': 'GET',
            'SCRIPT_NAME': '',
            'SERVER_NAME': 'testserver',
            'SERVER_PORT': '80',
            'SERVER_PROTOCOL': 'HTTP/1.1',
            'wsgi.version': (1, 0),
            'wsgi.url_scheme': 'http',
            'wsgi.errors': self.errors,
            'wsgi.multiprocess': True,
            'wsgi.multithread': False,
            'wsgi.run_once': False,
        }
        # Precedence, lowest to highest: built-in values, instance defaults,
        # per-request arguments.
        environ.update(self.defaults)
        environ.update(request)

        try:
            response = self.handler(environ)
        except TemplateDoesNotExist as e:
            # If the view raises an exception, Django will attempt to show
            # the 500.html template. If that template is not available,
            # we should ignore the error in favor of re-raising the
            # underlying exception that caused the 500 error. Any other
            # template found to be missing during view error handling
            # should be reported as-is.
            if e.args != ('500.html', ):
                raise
            # The handler produced no response, so falling through would fail
            # with an UnboundLocalError on ``response``. Raise the recorded
            # underlying exception when there is one, else the template error.
            if self.exc_info:
                exc_info, self.exc_info = self.exc_info, None
                # NOTE(review): assumes exc_info is a sys.exc_info()-style
                # (type, value, traceback) tuple - confirm against whatever
                # sets it; nothing in this class does.
                raise exc_info[1]
            raise

        # Update persistent cookie data.
        if response.cookies:
            self.cookies.update(response.cookies)

        return response

    def get(self, path, data={}, follow=False, **extra):
        """
        Requests a response from the server using GET.

        ``data`` is encoded into the query string; when it is empty the query
        string found in ``path`` is used instead. With ``follow`` set,
        redirects are followed.
        """
        parsed = urlparse(path)
        r = {
            'CONTENT_TYPE': 'text/html; charset=utf-8',
            # NOTE(review): this method calls bare ``unquote`` while the other
            # verbs call ``urllib.unquote`` - confirm both names are in scope.
            'PATH_INFO': unquote(parsed[2]),
            'QUERY_STRING': urlencode(data, doseq=True) or parsed[4],
            'REQUEST_METHOD': 'GET',
            'wsgi.input': FakePayload('')
        }
        r.update(extra)

        response = self.request(**r)
        if follow:
            response = self._handle_redirects(response, **extra)
        return response

    def post(self,
             path,
             data={},
             content_type=MULTIPART_CONTENT,
             follow=False,
             **extra):
        """
        Requests a response from the server using POST.

        With the default content type ``data`` is multipart-encoded;
        otherwise it is converted to a byte string using the charset named in
        ``content_type`` (or the default charset from settings).
        """
        if content_type is MULTIPART_CONTENT:
            post_data = encode_multipart(BOUNDARY, data)
        else:
            # Encode the content so that the byte representation is correct.
            match = CONTENT_TYPE_RE.match(content_type)
            if match:
                charset = match.group(1)
            else:
                charset = settings.DEFAULT_CHARSET
            post_data = smart_str(data, encoding=charset)

        parsed = urlparse(path)
        r = {
            'CONTENT_LENGTH': len(post_data),
            'CONTENT_TYPE': content_type,
            'PATH_INFO': urllib.unquote(parsed[2]),
            'QUERY_STRING': parsed[4],
            'REQUEST_METHOD': 'POST',
            'wsgi.input': FakePayload(post_data),
        }
        r.update(extra)

        response = self.request(**r)
        if follow:
            response = self._handle_redirects(response, **extra)
        return response

    def head(self, path, data={}, follow=False, **extra):
        """
        Request a response from the server using HEAD.

        ``data`` is encoded into the query string; when it is empty the query
        string found in ``path`` is used instead.
        """
        parsed = urlparse(path)
        r = {
            'CONTENT_TYPE': 'text/html; charset=utf-8',
            'PATH_INFO': urllib.unquote(parsed[2]),
            'QUERY_STRING': urlencode(data, doseq=True) or parsed[4],
            'REQUEST_METHOD': 'HEAD',
            'wsgi.input': FakePayload('')
        }
        r.update(extra)

        response = self.request(**r)
        if follow:
            response = self._handle_redirects(response, **extra)
        return response

    def options(self, path, data={}, follow=False, **extra):
        """
        Request a response from the server using OPTIONS.

        ``data`` is encoded into the query string; when it is empty the query
        string found in ``path`` is used instead.
        """
        parsed = urlparse(path)
        r = {
            'PATH_INFO': urllib.unquote(parsed[2]),
            'QUERY_STRING': urlencode(data, doseq=True) or parsed[4],
            'REQUEST_METHOD': 'OPTIONS',
            'wsgi.input': FakePayload('')
        }
        r.update(extra)

        response = self.request(**r)
        if follow:
            response = self._handle_redirects(response, **extra)
        return response

    def put(self,
            path,
            data={},
            content_type=MULTIPART_CONTENT,
            follow=False,
            **extra):
        """
        Send a resource to the server using PUT.

        With the default content type ``data`` is multipart-encoded;
        otherwise it is sent as given.
        """
        if content_type is MULTIPART_CONTENT:
            post_data = encode_multipart(BOUNDARY, data)
        else:
            post_data = data

        # Make `data` into a querystring only if it's not already a string. If
        # it is a string, we'll assume that the caller has already encoded it.
        query_string = None
        if not isinstance(data, basestring):
            query_string = urlencode(data, doseq=True)

        parsed = urlparse(path)
        r = {
            'CONTENT_LENGTH': len(post_data),
            'CONTENT_TYPE': content_type,
            'PATH_INFO': urllib.unquote(parsed[2]),
            'QUERY_STRING': query_string or parsed[4],
            'REQUEST_METHOD': 'PUT',
            'wsgi.input': FakePayload(post_data),
        }
        r.update(extra)

        response = self.request(**r)
        if follow:
            response = self._handle_redirects(response, **extra)
        return response

    def delete(self, path, data={}, follow=False, **extra):
        """
        Send a DELETE request to the server.

        ``data`` is encoded into the query string; when it is empty the query
        string found in ``path`` is used instead.
        """
        parsed = urlparse(path)
        r = {
            'PATH_INFO': urllib.unquote(parsed[2]),
            'QUERY_STRING': urlencode(data, doseq=True) or parsed[4],
            'REQUEST_METHOD': 'DELETE',
            'wsgi.input': FakePayload('')
        }
        r.update(extra)

        response = self.request(**r)
        if follow:
            response = self._handle_redirects(response, **extra)
        return response

    def _handle_redirects(self, response, **extra):
        """
        Follows any redirects by requesting responses from the server using GET.

        The final response carries ``redirect_chain``, a list of
        (url, status_code) pairs for every redirect that was followed.
        """

        response.redirect_chain = []
        while response.status_code in (301, 302, 303, 307):
            url = response['Location']
            scheme, netloc, path, query, fragment = urlsplit(url)

            redirect_chain = response.redirect_chain
            redirect_chain.append((url, response.status_code))

            if scheme:
                extra['wsgi.url_scheme'] = scheme

            # The test client doesn't handle external links,
            # but since the situation is simulated in test_client,
            # we fake things here by ignoring the netloc portion of the
            # redirected URL.
            response = self.get(path, QueryDict(query), follow=False, **extra)
            response.redirect_chain = redirect_chain

            # Prevent loops
            if response.redirect_chain[-1] in response.redirect_chain[0:-1]:
                break
        return response
Ejemplo n.º 4
0
class WebTest(tornado.testing.AsyncHTTPTestCase):
    """HTTP test case that recreates the database and keeps cookies between
    requests made through ``fetch``/``post``."""

    def __init__(self, *rest):
        # The cookie jar must exist before the base initializer runs.
        self.cookies = SimpleCookie()
        tornado.testing.AsyncHTTPTestCase.__init__(self, *rest)

    def setUp(self):
        """Drop and recreate the configured database, then load the schema."""
        super(WebTest, self).setUp()

        # create database
        conn = functions.DB.conn(config.DB["username"], config.DB["password"], "")
        x = conn.cursor()
        # Both statements are sent in a single execute() call.
        x.execute(
            "DROP DATABASE IF EXISTS `{db}`; CREATE DATABASE `{db}`;".format(
                db=config.DB["db"]
            )
        )
        x.close()
        conn.close()

        # create tables
        conn = functions.DB.conn(
            config.DB["username"], config.DB["password"], config.DB["db"]
        )
        functions.DB.execute_sql_in_file(conn, config.ROOT + "/sql/schema.sql")
        conn.close()

    def get_app(self):
        """Build the application under test with debug and XSRF cookies off."""
        client.server_settings["debug"] = False
        client.server_settings["xsrf_cookies"] = False
        return tornado.web.Application(
            client.web_urls.www_urls, **client.server_settings
        )

    def _update_cookies(self, headers):
        """Store every cookie set by a response in ``self.cookies``.

        ``headers`` is the response's header collection. Each Set-Cookie
        value is read separately via ``get_list``, so a response without the
        header is a no-op and commas inside a cookie (for example in an
        ``expires`` date) do not split it.
        """
        for raw_cookie in headers.get_list("Set-Cookie"):
            self.cookies.update(SimpleCookie(escape.native_str(raw_cookie)))

    def fetch(self, url, *r, **kw):
        """Fetch ``url`` with the stored cookies and record any new ones.

        Redirects are not followed unless ``follow_redirects`` is passed.
        Returns the response from the base class ``fetch``.
        """
        kw.setdefault("follow_redirects", False)

        # One "name=value" pair per stored cookie, joined into a single
        # Cookie header value.
        cookie_header = "; ".join(
            "{}={}".format(name, morsel.value)
            for name, morsel in self.cookies.items()
        )

        resp = tornado.testing.AsyncHTTPTestCase.fetch(
            self, url, *r, headers={"Cookie": cookie_header}, **kw
        )
        self._update_cookies(resp.headers)
        return resp

    def post(self, url, data, *r, **kw):
        """POST ``data`` form-encoded to ``url`` through ``fetch``."""
        body = urllib.parse.urlencode(data)
        return self.fetch(url, body=body, method="POST", *r, **kw)

    def get_cookie(self, name):
        """Return the decoded value of the signed cookie ``name``.

        Returns None when ``decode_signed_value`` yields nothing for the
        stored value. Raises KeyError when no cookie called ``name`` is
        stored.
        """
        cookie = decode_signed_value(
            config.COOKIE_SECRET, name, self.cookies[name].value
        )
        if cookie:
            return cookie.decode()
        return None
Ejemplo n.º 5
0
class Chinaso(SearchEngine):
    """Search engine adapter for chinaso.com: builds the result-page requests,
    parses result pages and keeps a periodically refreshed cookie jar."""

    name = 'Chinaso'
    fake_url = True
    source_importance = 2

    page_size = 10

    def __init__(self):
        self.cookies = SimpleCookie()
        # Start the background cookie refresh loop.
        asyncio.ensure_future(self.update_cookies())

    def search_url(self, query):
        """Return the search URL for ``query``."""
        return 'http://www.chinaso.com/search/pagesearch.htm?q={}'.format(
            quote(query))

    def page_requests(self, query, **kwargs):
        """Yield one HttpRequest per result page.

        ``data_source_results`` (keyword) is the number of records wanted;
        it defaults to one page. Pages are numbered from 1.
        """
        max_records = kwargs.get('data_source_results')
        if max_records is None:
            max_records = self.page_size
        for num in range(0, max_records, self.page_size):
            url = 'http://www.chinaso.com/search/pagesearch.htm?q={}&page={}&wd={}'.format(
                quote(query), num // self.page_size + 1, quote(query))
            yield HttpRequest(url)

    def before_request(self, request):
        """Attach the stored cookies to an outgoing request."""
        self.set_cookie_header(request, self.cookies)

    def after_request(self, response):
        """Merge the cookies carried by a response into the stored cookies."""
        self.cookies.update(self.get_cookies_in_response(response))

    def extract_results(self, response):
        """Yield ``{'title', 'text', 'url'}`` dicts parsed from a result page.

        Items without a title link are skipped; items without a summary
        text are not yielded.
        """
        selector = Selector(response.text)
        for item in selector.css('li.reItem'):
            links = item.css('h2>a')
            if len(links) <= 0:
                continue
            link = links[0]
            title = link.text.strip()
            text = None
            div = item.css('div.reNewsWrapper')
            if len(div) > 0:
                # Only the first line of the wrapper text is kept.
                text = div[0].text.strip().split('\n')[0]
            url = urljoin('http://www.chinaso.com/search/',
                          link.attr('href').strip())
            if text is not None:
                yield {'title': title, 'text': text, 'url': url}

    async def update_cookies(self):
        """Refresh the stored cookies every five minutes, forever.

        A search page is requested without following redirects; when the
        fetch raises HttpError, the response attached to the error is used
        instead. Other failures are logged and the loop continues.
        Cancellation propagates immediately.
        """
        url = 'http://www.chinaso.com/search/pagesearch.htm?q={}'.format(
            quote('中国搜索'))
        while True:
            try:
                try:
                    req = HttpRequest(url, allow_redirects=False)
                    await self.extension.handle_request(req)
                    resp = await self.downloader.fetch(req)
                except HttpError as e:
                    resp = e.response
                self.cookies.update(self.get_cookies_in_response(resp))
            except asyncio.CancelledError:
                # Never swallow cancellation (it is an Exception subclass
                # before Python 3.8).
                raise
            except Exception as e:
                log.warning('Failed to update cookies: %s', e)
            # Sleep outside the try block: in a ``finally`` clause it would
            # delay a pending cancellation by the full interval.
            await asyncio.sleep(5 * 60)
Ejemplo n.º 6
0
class Baidu(SearchEngine):
    """Search engine adapter for baidu.com: builds the result-page requests,
    parses result pages and keeps a periodically refreshed cookie jar."""

    name = 'Baidu'
    fake_url = True
    source_importance = 2

    page_size = 10

    def __init__(self):
        self.cookies = SimpleCookie()
        # Start the background cookie refresh loop.
        asyncio.ensure_future(self.update_cookies())

    def search_url(self, query):
        """Return the search URL for ``query``."""
        return 'https://www.baidu.com/s?wd={}'.format(quote(query))

    def page_requests(self, query, **kwargs):
        """Yield one HttpRequest per result page.

        Keyword options:
            data_source_results: number of records wanted (default one page).
            recent_days: restrict results to the last 1, 7 or 30 days.
            site: restrict results to a site (appended as ``site:``).

        Raises:
            ValueError: if ``recent_days`` is set but is not 1, 7 or 30.
        """
        max_records = kwargs.get('data_source_results')
        recent_days = kwargs.get('recent_days')
        site = kwargs.get('site')
        if max_records is None:
            max_records = self.page_size

        if site:
            query = query + " site:" + site

        if recent_days:
            if recent_days not in (1, 7, 30):
                raise ValueError('recent_days: {}'.format(recent_days))
            today = datetime.now()
            start = today - timedelta(days=recent_days)
            # The time filter takes the window bounds as integer timestamps.
            start, end = int(time.mktime(start.timetuple())), int(
                time.mktime(today.timetuple()))
            raw_url = 'http://www.baidu.com/s?wd={}&gpc=stf%3D{}%2C{}|stftype%3D1'.format(
                quote(query), start, end)
        else:
            raw_url = 'http://www.baidu.com/s?wd={}'.format(quote(query))

        # ``pn`` is the record offset of the page.
        for num in range(0, max_records, self.page_size):
            url = '{}&pn={}'.format(raw_url, num)
            yield HttpRequest(url)

    def extract_results(self, response):
        """Yield ``{'title', 'text', 'url'}`` dicts parsed from a result page.

        Items without a title link are skipped; items without an abstract
        are not yielded.
        """
        selector = Selector(response.text)
        for item in selector.css('div.result'):
            links = item.css('h3>a')
            if len(links) <= 0:
                # No title link: skip the item instead of failing the page.
                continue
            link = links[0]
            title = link.text.strip()
            text = None
            abstract = item.css('div.c-abstract')
            if len(abstract) > 0:
                text = abstract[0].text.strip()
            url = link.attr('href').strip()
            if text is not None:
                yield {'title': title, 'text': text, 'url': url}

    def before_request(self, request):
        """Attach the stored cookies to an outgoing request."""
        self.set_cookie_header(request, self.cookies)

    def after_request(self, response):
        """Merge the cookies carried by a response into the stored cookies."""
        self.cookies.update(self.get_cookies_in_response(response))

    async def update_cookies(self):
        """Refresh the cookies from the home page every five minutes to avoid
        being banned.

        Failures are logged and the loop continues. Cancellation propagates
        immediately.
        """
        while True:
            try:
                req = HttpRequest('http://www.baidu.com/')
                await self.extension.handle_request(req)
                resp = await self.downloader.fetch(req)
                self.cookies.update(self.get_cookies_in_response(resp))
            except asyncio.CancelledError:
                # Never swallow cancellation (it is an Exception subclass
                # before Python 3.8).
                raise
            except Exception as e:
                log.warning('Failed to update cookies: %s', e)
            # Sleep outside the try block: in a ``finally`` clause it would
            # delay a pending cancellation by the full interval.
            await asyncio.sleep(5 * 60)
Ejemplo n.º 7
0
Archivo: so.py Proyecto: jadbin/metase
class So(SearchEngine):
    """Search engine adapter for so.com: builds the result-page requests,
    parses result pages and keeps a periodically refreshed cookie jar."""

    name = 'So'
    fake_url = False
    source_importance = 1

    page_size = 10

    def __init__(self):
        self.cookies = SimpleCookie()
        # Start the background cookie refresh loop.
        asyncio.ensure_future(self.update_cookies())

    def search_url(self, query):
        """Return the search URL for ``query``."""
        return 'https://www.so.com/s?q={}'.format(quote(query))

    def page_requests(self, query, **kwargs):
        """Yield one HttpRequest per result page.

        Keyword options:
            data_source_results: number of records wanted (default one page).
            recent_days: restrict results to the last 1, 7 or 30 days.
            site: restrict results to a site (appended as ``site:``).

        Raises:
            ValueError: if ``recent_days`` is set but is not 1, 7 or 30.
        """
        max_records = kwargs.get('data_source_results')
        recent_days = kwargs.get('recent_days')
        site = kwargs.get('site')

        if site:
            query = query + " site:" + site

        if recent_days:
            # Map the supported windows onto the ``adv_t`` URL parameter.
            if recent_days == 1:
                adv_t = 'd'
            elif recent_days == 7:
                adv_t = 'w'
            elif recent_days == 30:
                adv_t = 'm'
            else:
                raise ValueError('recent_days: {}'.format(recent_days))
            raw_url = 'https://www.so.com/s?q={}&adv_t={}'.format(
                quote(query), adv_t)
        else:
            raw_url = 'https://www.so.com/s?q={}'.format(quote(query))

        if max_records is None:
            max_records = self.page_size
        # ``pn`` is the 1-based page number.
        for num in range(0, max_records, self.page_size):
            url = '{}&pn={}'.format(raw_url, num // self.page_size + 1)
            yield HttpRequest(url)

    def before_request(self, request):
        """Attach the stored cookies to an outgoing request."""
        self.set_cookie_header(request, self.cookies)

    def after_request(self, response):
        """Merge the cookies carried by a response into the stored cookies."""
        self.cookies.update(self.get_cookies_in_response(response))

    def extract_results(self, response):
        """Yield ``{'title', 'text', 'url'}`` dicts parsed from a result page.

        Items without a title link are skipped; items without a description
        are not yielded. The link's ``data-url`` attribute is preferred over
        ``href`` when it is set.
        """
        selector = Selector(response.text)
        for item in selector.css('li.res-list'):
            links = item.css('h3>a')
            if len(links) <= 0:
                # No title link: skip the item instead of failing the page.
                continue
            h3_a = links[0]
            title = h3_a.text.strip()
            text = None
            res_desc = item.css('p.res-desc')
            if len(res_desc) > 0:
                text = res_desc[0].text.strip()
            else:
                res_rich = item.css('div.res-rich')
                if len(res_rich) > 0:
                    text = res_rich[0].text.strip()
            url = h3_a.attr('data-url')
            if not url:
                url = h3_a.attr('href').strip()
            if text is not None:
                yield {'title': title, 'text': text, 'url': url}

    async def update_cookies(self):
        """Refresh the cookies from the home page every five minutes to avoid
        being banned.

        Failures are logged and the loop continues. Cancellation propagates
        immediately.
        """
        while True:
            try:
                req = HttpRequest('https://www.so.com/')
                await self.extension.handle_request(req)
                resp = await self.downloader.fetch(req)
                self.cookies.update(self.get_cookies_in_response(resp))
            except asyncio.CancelledError:
                # Never swallow cancellation (it is an Exception subclass
                # before Python 3.8).
                raise
            except Exception as e:
                log.warning('Failed to update cookies: %s', e)
            # Sleep outside the try block: in a ``finally`` clause it would
            # delay a pending cancellation by the full interval.
            await asyncio.sleep(5 * 60)
Ejemplo n.º 8
0
class Sogou(SearchEngine):
    """Search engine adapter for sogou.com: builds the result-page requests,
    parses result pages and keeps a periodically refreshed cookie jar."""

    name = 'Sogou'
    fake_url = True
    source_importance = 2

    page_size = 20

    def __init__(self):
        self.cookies = SimpleCookie()
        # Seed the jar with a cookie carrying the page size.
        self.cookies['com_sohu_websearch_ITEM_PER_PAGE'] = str(self.page_size)
        # Start the background cookie refresh loop.
        asyncio.ensure_future(self.update_cookies())

    def search_url(self, query):
        """Return the search URL for ``query``."""
        return 'https://www.sogou.com/web?query={}'.format(quote(query))

    def page_requests(self, query, **kwargs):
        """Yield one HttpRequest per result page.

        Keyword options:
            data_source_results: number of records wanted (default one page).
            recent_days: restrict results to the last 1, 7 or 30 days.
            site: restrict results to a site (appended as ``site:``).

        Time filter parameters added to the URL:
            tsn=1&sourceid=inttime_day
            tsn=2&sourceid=inttime_week
            tsn=3&sourceid=inttime_month

        Raises:
            ValueError: if ``recent_days`` is set but is not 1, 7 or 30.
        """
        max_records = kwargs.get('data_source_results')
        recent_days = kwargs.get('recent_days')
        site = kwargs.get('site')

        if site:
            query = query + " site:" + site

        if recent_days:
            if recent_days == 1:
                tsn, sourceid = 1, "inttime_day"
            elif recent_days == 7:
                tsn, sourceid = 2, "inttime_week"
            elif recent_days == 30:
                tsn, sourceid = 3, "inttime_month"
            else:
                raise ValueError('recent_days: {}'.format(recent_days))
            raw_url = 'https://www.sogou.com/web?query={}&tsn={}&sourceid={}'.format(quote(query), tsn, sourceid)
        else:
            raw_url = 'https://www.sogou.com/web?query={}'.format(quote(query))

        if max_records is None:
            max_records = self.page_size
        # ``page`` is the 1-based page number.
        for num in range(0, max_records, self.page_size):
            url = '{}&page={}&ie=utf8'.format(raw_url, num // self.page_size + 1)
            yield HttpRequest(url)

    def before_request(self, request):
        """Attach the stored cookies to an outgoing request."""
        self.set_cookie_header(request, self.cookies)

    def after_request(self, response):
        """Merge the cookies carried by a response into the stored cookies."""
        self.cookies.update(self.get_cookies_in_response(response))

    def extract_results(self, response):
        """Yield ``{'title', 'text', 'url'}`` dicts parsed from a result page.

        Items without a title link are skipped; items without a summary
        text are not yielded.
        """
        selector = Selector(response.text)
        for item in selector.css('div.vrwrap,div.rb'):
            links = item.css('h3>a')
            if len(links) <= 0:
                continue
            link = links[0]
            title = link.text.strip()
            text = None
            div_ft = item.css('div.ft')
            if len(div_ft) > 0:
                text = div_ft[0].text.strip()
            else:
                p_str = item.css('p.str_info')
                if len(p_str) > 0:
                    text = p_str[0].text.strip()
            url = urljoin('https://www.sogou.com/', link.attr('href').strip())
            if text is not None:
                yield {'title': title, 'text': text, 'url': url}

    async def update_cookies(self):
        """Refresh the cookies from the home page every five minutes to avoid
        being banned.

        Failures are logged and the loop continues. Cancellation propagates
        immediately.
        """
        while True:
            try:
                req = HttpRequest('https://www.sogou.com/')
                await self.extension.handle_request(req)
                resp = await self.downloader.fetch(req)
                self.cookies.update(self.get_cookies_in_response(resp))
            except asyncio.CancelledError:
                # Never swallow cancellation (it is an Exception subclass
                # before Python 3.8).
                raise
            except Exception as e:
                log.warning('Failed to update cookies: %s', e)
            # Sleep outside the try block: in a ``finally`` clause it would
            # delay a pending cancellation by the full interval.
            await asyncio.sleep(5 * 60)