Esempio n. 1
0
def session_request(self,
                    method,
                    url,
                    params=None,
                    data=None,
                    headers=None,
                    cookies=None,
                    files=None,
                    auth=None,
                    timeout=conf.timeout if 'timeout' in conf else None,
                    allow_redirects=True,
                    proxies=None,
                    hooks=None,
                    stream=None,
                    verify=False,
                    cert=None,
                    json=None):
    """Build a request from the arguments, send it and return the response.

    Values from the global ``conf`` act as fallbacks: ``conf.cookie`` is used
    when no ``cookies`` are passed, ``conf.proxies`` when no ``proxies`` are
    passed, and ``conf.http_headers`` is merged with ``headers``.

    When the response reports ``ISO-8859-1`` as its encoding, the encoding is
    replaced by the first charset declared in the body, or by
    ``resp.apparent_encoding`` if the body declares none.

    NOTE(review): the ``timeout`` default is evaluated once, when the function
    is defined, so later changes to ``conf.timeout`` do not affect it.
    """
    # Create the Request.
    # The fallback is parenthesised on purpose: ``a or b if c else d`` parses
    # as ``(a or b) if c else d``, which would throw away the caller's cookies
    # whenever conf has no 'cookie' entry.
    request_cookies = cookies or (conf.cookie if 'cookie' in conf else None)
    merged_cookies = merge_cookies(
        merge_cookies(RequestsCookieJar(), self.cookies), request_cookies)

    req = Request(
        method=method.upper(),
        url=url,
        headers=merge_setting(
            headers, conf.http_headers if 'http_headers' in conf else {}),
        files=files,
        data=data or {},
        json=json,
        params=params or {},
        auth=auth,
        cookies=merged_cookies,
        hooks=hooks,
    )
    prep = self.prepare_request(req)

    # Same precedence concern as for the cookies above.
    proxies = proxies or (conf.proxies if 'proxies' in conf else {})

    settings = self.merge_environment_settings(prep.url, proxies, stream,
                                               verify, cert)

    # Send the request.
    send_kwargs = {
        'timeout': timeout,
        'allow_redirects': allow_redirects,
    }
    send_kwargs.update(settings)
    resp = self.send(prep, **send_kwargs)

    if resp.encoding == 'ISO-8859-1':
        # Prefer a charset declared in the body over the ISO-8859-1 value.
        encodings = get_encodings_from_content(resp.text)
        resp.encoding = encodings[0] if encodings else resp.apparent_encoding

    return resp
Esempio n. 2
0
 def test_precedence(self):
     """Charsets come back as HTML5 meta, HTML4 pragma, XML declaration."""
     expected = ['HTML5', 'HTML4', 'XML']
     # The document lists the XML declaration first, so the expected order
     # above is not simply document order.
     content = '''
     <?xml version="1.0" encoding="XML"?>
     <meta charset="HTML5">
     <meta http-equiv="Content-type" content="text/html;charset=HTML4" />
     '''.strip()
     found = get_encodings_from_content(content)
     assert found == expected
Esempio n. 3
0
def guess_response_encoding(resp):
    '''
    Guess the content encoding of a requests response.

    Candidates are tried in this order and the first one that can actually
    decode ``resp.content`` is returned:

    1. every charset declared inside the content,
    2. the charset from the response headers,
    3. whatever ``detect`` reports for the raw content.

    Note: there's a performance issue due to chardet.
    '''
    # First try the encodings declared inside the content itself.
    encs = get_encodings_from_content(resp.content) or []
    for enc in encs:
        try:
            resp.content.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the page declared a charset name Python does not
            # know; treat it like any other unusable candidate.
            LOG.debug('Encoding from response content doesn\'t work.')
        else:
            LOG.info('Detected encoding %s from response content.', enc)
            return enc

    # Then the encoding announced by the response headers.
    enc = get_encoding_from_headers(resp.headers)
    if enc:
        try:
            resp.content.decode(enc)
        except (UnicodeDecodeError, LookupError):
            LOG.debug('Encoding from response header doesn\'t work.')
        else:
            LOG.info('Detected encoding %s from response header.', enc)
            return enc

    # neither encoding works, we have to go the hard way.
    start = clock()
    g = detect(resp.content)
    LOG.info('Detected encoding %s with cofidence of %g in %gs.',
             g['encoding'], g['confidence'], clock() - start)
    return g['encoding']
Esempio n. 4
0
 def test_precedence(self):
     """Charsets come back as HTML5 meta, HTML4 pragma, XML declaration."""
     expected = ['HTML5', 'HTML4', 'XML']
     # The document lists the XML declaration first, so the expected order
     # above is not simply document order.
     content = '''
     <?xml version="1.0" encoding="XML"?>
     <meta charset="HTML5">
     <meta http-equiv="Content-type" content="text/html;charset=HTML4" />
     '''.strip()
     found = get_encodings_from_content(content)
     assert found == expected
Esempio n. 5
0
def encoding(rsp):
    """
    Work out which encoding to use for ``rsp.content``.

    Sources are consulted in order until one yields a value: the response
    headers, a charset declared in the content, then chardet (if available).
    ``'utf-8'`` is returned when nothing is found.
    """
    if isinstance(rsp.content, six.text_type):
        # Content is already text, so there is nothing to detect.
        return 'unicode'

    # 1. Charset from the headers. An ISO-8859-1 answer is discarded so that
    #    the content-based checks below still run.
    detected = get_encoding_from_headers(rsp.headers)
    if detected == 'ISO-8859-1':
        detected = None

    # 2. Charset declared in the content (skipped if the helper is falsy).
    if not detected and get_encodings_from_content:
        declared = get_encodings_from_content(rsp.content)
        if declared and declared[0]:
            detected = declared[0]
        else:
            detected = None

    # 3. Statistical detection as the last resort.
    if not detected and chardet is not None:
        detected = chardet.detect(rsp.content)['encoding']

    # gb2312 is always reported as gb18030.
    if detected and detected.lower() == 'gb2312':
        detected = 'gb18030'

    return detected or 'utf-8'
Esempio n. 6
0
    def encoding(self):
        """Return the encoding to use for ``self.content``.

        The result is computed once and cached on ``self._encoding``. Sources
        are tried in order: response headers, a charset declared in the
        content, chardet (when available), and finally ``"utf-8"``.
        """
        if hasattr(self, "_encoding"):
            return self._encoding

        if isinstance(self.content, unicode):
            # Already-decoded text; this answer is not cached.
            return "unicode"

        # Header charset; an ISO-8859-1 answer is discarded so the
        # content-based checks below still run.
        guess = get_encoding_from_headers(self.headers)
        if guess == "ISO-8859-1":
            guess = None

        # Charset declared inside the content.
        if not guess:
            declared = get_encodings_from_content(self.content)
            if declared and declared[0]:
                guess = declared[0]
            else:
                guess = None

        # Statistical detection as the last resort.
        if not guess and chardet is not None:
            guess = chardet.detect(self.content)["encoding"]

        # gb2312 is always reported as gb18030.
        if guess and guess.lower() == "gb2312":
            guess = "gb18030"

        self._encoding = guess or "utf-8"
        return self._encoding
Esempio n. 7
0
def find_encoding(content, headers=None):
    """Guess the encoding of ``content``.

    ``str`` input is reported as ``'utf-8'`` straight away. Otherwise the
    sources are tried in order: the charset from ``headers`` (an ISO-8859-1
    answer is ignored), chardet when available, then a charset declared
    inside the content. ``gb2312`` is reported as ``gb18030``.

    Returns ``'latin_1'`` when nothing could be determined, or ``'utf-8'``
    when sniffing a ``bytes`` payload fails.
    """
    # content is already text
    if isinstance(content, str):
        return 'utf-8'

    encoding = None

    # Try charset from content-type
    if headers:
        encoding = get_encoding_from_headers(headers)
        if encoding == 'ISO-8859-1':
            encoding = None

    # Fallback to auto-detected encoding.
    if not encoding and chardet is not None:
        encoding = chardet.detect(content)['encoding']

    # Try charset from content
    if not encoding:
        try:
            declared = get_encodings_from_content(content)
        except Exception:
            # Best effort: sniffing can fail on the payload type. Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            if isinstance(content, bytes):
                return 'utf-8'
        else:
            encoding = declared[0] if declared else None

    if encoding and encoding.lower() == 'gb2312':
        encoding = 'gb18030'

    return encoding or 'latin_1'
Esempio n. 8
0
    def encoding(self):
        """Return the encoding to use for ``self.content``.

        The result is computed once and cached on ``self._encoding``. Sources
        are tried in order: response headers, a charset declared in the
        content, chardet (when available), and finally ``'utf-8'``.
        """
        if hasattr(self, '_encoding'):
            return self._encoding

        if isinstance(self.content, unicode):
            # Already-decoded text; this answer is not cached.
            return 'unicode'

        # Header charset; an ISO-8859-1 answer is discarded so the
        # content-based checks below still run.
        guess = get_encoding_from_headers(self.headers)
        if guess == 'ISO-8859-1':
            guess = None

        # Charset declared inside the content.
        if not guess:
            declared = get_encodings_from_content(self.content)
            if declared and declared[0]:
                guess = declared[0]
            else:
                guess = None

        # Statistical detection as the last resort.
        if not guess and chardet is not None:
            guess = chardet.detect(self.content)['encoding']

        # gb2312 is always reported as gb18030.
        if guess and guess.lower() == 'gb2312':
            guess = 'gb18030'

        self._encoding = guess or 'utf-8'
        return self._encoding
Esempio n. 9
0
    def encoding(self):
        """Return the encoding to use for ``self.content``.

        The result is computed once and cached on ``self._encoding``. Sources
        are tried in order: response headers, a charset declared in the
        content, chardet (when available), and finally ``'utf-8'``.
        """
        if hasattr(self, '_encoding'):
            return self._encoding

        if isinstance(self.content, unicode):
            # Already-decoded text; this answer is not cached.
            return 'unicode'

        # Header charset; an ISO-8859-1 answer is discarded so the
        # content-based checks below still run.
        guess = get_encoding_from_headers(self.headers)
        if guess == 'ISO-8859-1':
            guess = None

        # Charset declared inside the content.
        if not guess:
            declared = get_encodings_from_content(self.content)
            if declared and declared[0]:
                guess = declared[0]
            else:
                guess = None

        # Statistical detection as the last resort.
        if not guess and chardet is not None:
            guess = chardet.detect(self.content)['encoding']

        # gb2312 is always reported as gb18030.
        if guess and guess.lower() == 'gb2312':
            guess = 'gb18030'

        self._encoding = guess or 'utf-8'
        return self._encoding
Esempio n. 10
0
def find_encoding(content, headers=None):
    """Guess the encoding of ``content``.

    ``unicode`` input is reported as ``'unicode'``. Otherwise the sources are
    tried in order: the charset from ``headers`` (an ISO-8859-1 answer is
    ignored), a charset declared inside the content, then chardet when
    available. ``gb2312`` is reported as ``gb18030`` and ``'latin_1'`` is the
    final fallback.
    """
    if isinstance(content, unicode):
        # Already-decoded text: nothing to detect.
        return 'unicode'

    guess = None

    # Charset announced by the headers, if any were supplied.
    if headers:
        guess = get_encoding_from_headers(headers)
        if guess == 'ISO-8859-1':
            guess = None

    # Charset declared inside the content.
    if not guess:
        declared = get_encodings_from_content(content)
        if declared and declared[0]:
            guess = declared[0]
        else:
            guess = None

    # Statistical detection as the last resort.
    if not guess and chardet is not None:
        guess = chardet.detect(content)['encoding']

    if guess and guess.lower() == 'gb2312':
        guess = 'gb18030'

    return guess or 'latin_1'
Esempio n. 11
0
def session_request(self,
                    method,
                    url,
                    params=None,
                    data=None,
                    headers=None,
                    cookies=None,
                    files=None,
                    auth=None,
                    timeout=None,
                    allow_redirects=True,
                    proxies=None,
                    hooks=None,
                    stream=None,
                    verify=False,
                    cert=None,
                    json=None):
    """Build a request from the arguments, send it and return the response.

    The ``"requests"`` section of ``CONF`` supplies fallbacks for ``timeout``,
    ``cookies`` (key ``"cookie"``), ``proxies`` and default ``headers``.

    When the response reports ``ISO-8859-1`` as its encoding, the encoding is
    replaced by the first charset declared in the body, or by
    ``resp.apparent_encoding`` if the body declares none.
    """
    conf = CONF.get("requests", {})
    if timeout is None and "timeout" in conf:
        timeout = conf["timeout"]

    # conf is indexed like a mapping everywhere else in this function, so the
    # cookie fallback uses item access too (attribute access fails on a dict).
    default_cookies = conf["cookie"] if "cookie" in conf else None
    merged_cookies = merge_cookies(
        merge_cookies(RequestsCookieJar(), self.cookies),
        cookies or default_cookies)

    req = Request(
        method=method.upper(),
        url=url,
        headers=merge_setting(headers,
                              conf["headers"] if "headers" in conf else {}),
        files=files,
        data=data or {},
        json=json,
        params=params or {},
        auth=auth,
        cookies=merged_cookies,
        hooks=hooks,
    )
    prep = self.prepare_request(req)
    proxies = proxies or (conf["proxies"] if "proxies" in conf else {})

    # The Session method is ``merge_environment_settings``.
    settings = self.merge_environment_settings(prep.url, proxies, stream,
                                               verify, cert)

    send_kwargs = {
        "timeout": timeout,
        "allow_redirects": allow_redirects,
    }
    send_kwargs.update(settings)
    resp = self.send(prep, **send_kwargs)

    if resp.encoding == "ISO-8859-1":
        # Prefer a charset declared in the body over the ISO-8859-1 value.
        encodings = get_encodings_from_content(resp.text)
        resp.encoding = encodings[0] if encodings else resp.apparent_encoding
    return resp
Esempio n. 12
0
    def _fetchContent(self):
        """Download ``self.url`` and return its body decoded to text.

        The encoding is taken from the first charset declared inside the
        document; if there is none, from the ``charset`` parameter of the
        Content-Type header of a second request; if that is missing too,
        UTF-8 is assumed. The chosen value is stored on ``self.encoding``.
        """
        r = requests.get(self.url)

        # The sniffing helper runs regexes over text. latin-1 maps every byte
        # to a code point, so this decode cannot fail and leaves the ASCII
        # markup that carries the charset declaration intact.
        declared = get_encodings_from_content(r.content.decode("latin-1"))
        if declared:
            self.encoding = declared[0]
        else:
            from contextlib import closing
            from urllib.request import urlopen
            with closing(urlopen(self.url)) as f:
                # Python 3 spelling of the old ``getparam("charset")``.
                self.encoding = f.info().get_content_charset()

        if not self.encoding:
            # Neither the document nor the headers named a charset.
            self.encoding = "utf-8"

        return r.content.decode(self.encoding)
Esempio n. 13
0
    def _fetchContent(self):
        """Download ``self.url`` and return its body decoded to text.

        The encoding is taken from the first charset declared inside the
        document; if there is none, from the ``charset`` parameter of the
        Content-Type header of a second request; if that is missing too,
        UTF-8 is assumed. The chosen value is stored on ``self.encoding``.

        NOTE(review): this also changes the interpreter-wide default encoding
        via ``sys.setdefaultencoding``, which affects all other code in the
        process. Confirm that this side effect is still wanted.
        """
        r = requests.get(self.url)

        # Parse the content once and reuse the result.
        declared = get_encodings_from_content(r.content)
        if declared:
            self.encoding = declared[0]
        else:
            from contextlib import closing
            from urllib2 import urlopen
            with closing(urlopen(self.url)) as f:
                self.encoding = f.info().getparam("charset")

        if not self.encoding:
            # getparam returns None when the header carries no charset; the
            # calls below cannot cope with None.
            self.encoding = "utf-8"

        # Set the system default encoding.
        reload(sys)
        sys.setdefaultencoding(self.encoding)

        content = r.content.decode(self.encoding)

        return content
Esempio n. 14
0
    def encoding(self):
        """
        Encoding to use for ``self.content``, cached on ``self._encoding``.

        Sources are tried in order: response headers, a charset declared in
        the content, chardet (if available), and finally ``'utf-8'``.
        """
        if hasattr(self, '_encoding'):
            return self._encoding

        if isinstance(self.content, six.text_type):
            # Already-decoded text; this answer is not cached.
            return 'unicode'

        # Header charset; an ISO-8859-1 answer is discarded so the
        # content-based checks below still run.
        guess = get_encoding_from_headers(self.headers)
        if guess == 'ISO-8859-1':
            guess = None

        # Charset declared in the content (skipped if the helper is falsy).
        if not guess and get_encodings_from_content:
            if six.PY3:
                # On Python 3 only the first 100 bytes are inspected, after
                # conversion through utils.pretty_unicode.
                sample = utils.pretty_unicode(self.content[:100])
            else:
                sample = self.content
            declared = get_encodings_from_content(sample)
            if declared and declared[0]:
                guess = declared[0]
            else:
                guess = None

        # Statistical detection as the last resort.
        if not guess and chardet is not None:
            guess = chardet.detect(self.content)['encoding']

        # gb2312 is always reported as gb18030.
        if guess and guess.lower() == 'gb2312':
            guess = 'gb18030'

        self._encoding = guess or 'utf-8'
        return self._encoding
Esempio n. 15
0
def html_encoding(response_inst):
    """Set ``response_inst.response.encoding`` for an HTML response.

    Priority: an explicit ``response_inst.encoding``; the first charset
    declared in the page text; the charset from the response headers;
    ``Public_Constant.default_html_encoding``.

    If ``response_inst.encoding`` is exactly ``False`` nothing is changed and
    ``None`` is returned.
    """
    encoding = getattr(response_inst, 'encoding', None)
    if encoding is False:  # encoding disabled; possibly a non-text page
        return

    if not encoding:
        html_text = response_inst.response.text
        # Strip a byte-order mark only when one is actually there; slicing
        # unconditionally would eat the first real character of the page.
        if html_text.startswith(u'\ufeff'):
            html_text = html_text[1:]
        encodings = get_encodings_from_content(html_text)
        if encodings:
            encoding = encodings[0]

    if not encoding:
        encoding = get_encoding_from_headers(response_inst.response.headers)

    response_inst.response.encoding = encoding or Public_Constant.default_html_encoding
Esempio n. 16
0
    def encoding(self):
        """
        Encoding to use for ``self.content``, cached on ``self._encoding``.

        Sources are tried in order: response headers, a charset declared in
        the content, chardet (if available), and finally ``'utf-8'``.
        """
        if hasattr(self, '_encoding'):
            return self._encoding

        if isinstance(self.content, six.text_type):
            # Already-decoded text; this answer is not cached.
            return 'unicode'

        # Header charset; an ISO-8859-1 answer is discarded so the
        # content-based checks below still run.
        guess = get_encoding_from_headers(self.headers)
        if guess == 'ISO-8859-1':
            guess = None

        # Charset declared in the content (skipped if the helper is falsy).
        if not guess and get_encodings_from_content:
            if six.PY3:
                # On Python 3 only the first 100 bytes are inspected, after
                # conversion through utils.pretty_unicode.
                sample = utils.pretty_unicode(self.content[:100])
            else:
                sample = self.content
            declared = get_encodings_from_content(sample)
            if declared and declared[0]:
                guess = declared[0]
            else:
                guess = None

        # Statistical detection on the first 600 bytes as the last resort.
        if not guess and chardet is not None:
            guess = chardet.detect(self.content[:600])['encoding']

        # gb2312 is always reported as gb18030.
        if guess and guess.lower() == 'gb2312':
            guess = 'gb18030'

        self._encoding = guess or 'utf-8'
        return self._encoding
Esempio n. 17
0
def session_request(self, method, url,
                    params=None, data=None, headers=None, cookies=None, files=None, auth=None,
                    timeout=None,
                    allow_redirects=True, proxies=None, hooks=None, stream=None, verify=False, cert=None, json=None):
    """Build a request from the arguments, send it and return the response.

    A default ``User-Agent`` header is merged with ``headers``.

    ``timeout`` falls back to the module-level ``TIMEOUT`` when it is omitted
    or ``None``; an explicit value is honoured.

    When the response reports ``ISO-8859-1`` as its encoding, the encoding is
    replaced by the first charset declared in the body, or by
    ``resp.apparent_encoding`` if the body declares none.
    """
    # Create the Request.
    merged_cookies = merge_cookies(merge_cookies(RequestsCookieJar(), self.cookies),
                                   cookies)
    default_header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    req = Request(
        method=method.upper(),
        url=url,
        headers=merge_setting(headers, default_header),
        files=files,
        data=data or {},
        json=json,
        params=params or {},
        auth=auth,
        cookies=merged_cookies,
        hooks=hooks,
    )
    prep = self.prepare_request(req)

    proxies = proxies or {}

    settings = self.merge_environment_settings(
        prep.url, proxies, stream, verify, cert
    )

    # Send the request.
    send_kwargs = {
        # Callers that omit the timeout keep getting TIMEOUT, while a
        # caller-supplied value is no longer silently ignored.
        'timeout': TIMEOUT if timeout is None else timeout,
        'allow_redirects': allow_redirects,
    }
    send_kwargs.update(settings)
    resp = self.send(prep, **send_kwargs)

    if resp.encoding == 'ISO-8859-1':
        # Prefer a charset declared in the body over the ISO-8859-1 value.
        encodings = get_encodings_from_content(resp.text)
        resp.encoding = encodings[0] if encodings else resp.apparent_encoding

    return resp
Esempio n. 18
0
def guess_response_encoding(headers, content):
    '''
    Guess the content encoding of a requests response content and header.

    Returns the first charset declared inside ``content`` that can actually
    decode it. As written here, ``headers`` is not consulted and ``None`` is
    returned implicitly when no declared charset works.

    Note: there's a performance issue due to chardet.
    '''
    # Try every encoding declared inside the content.
    encs = get_encodings_from_content(content) or []
    for enc in encs:
        try:
            content.decode(enc)
            LOG.info('Detected encoding %s from response content.', enc)
            return enc
        except UnicodeDecodeError:
            LOG.debug('Encoding from response content doesn\'t work.')
        except Exception as e:
            # ``as`` form is valid on both Python 2.6+ and Python 3.
            LOG.error('Decode failed, %s', e)
Esempio n. 19
0
def procdata_getencoding(seed,headers,content):
	"""Return the encoding for a fetched page, or a falsy value if unknown.

	The header charset is used only if it is gbk/gb2312 (both reported as
	'gbk') or utf-8; any other header value is ignored. If the header gives
	no usable answer, the first charset declared in ``content`` is returned,
	again with gbk/gb2312 reported as 'gbk'.

	``seed`` is only used in the debug output.
	"""
	code = utils.get_encoding_from_headers(headers)
	if code:
		lowered = code.lower()
		if lowered in ('gbk', 'gb2312'):
			code = 'gbk'
		elif lowered == 'utf-8':
			code = 'utf-8'
		else:
			code = None

	if code is None:
		code = utils.get_encodings_from_content(content)
		# Single-argument print(): same output on Python 2 and Python 3.
		print('content %s %s' % (seed, code))
		if code:
			code = code[0]
			if code.lower() in ('gbk', 'gb2312'):
				code = 'gbk'

	return code
Esempio n. 20
0
def procdata_getencoding(seed, headers, content):
    """Return the encoding for a fetched page, or a falsy value if unknown.

    The header charset is used only if it is gbk/gb2312 (both reported as
    'gbk') or utf-8; any other header value is ignored. If the header gives
    no usable answer, the first charset declared in ``content`` is returned,
    again with gbk/gb2312 reported as 'gbk'.

    ``seed`` is only used in the debug output.
    """
    code = utils.get_encoding_from_headers(headers)
    if code:
        lowered = code.lower()
        if lowered in ('gbk', 'gb2312'):
            code = 'gbk'
        elif lowered == 'utf-8':
            code = 'utf-8'
        else:
            code = None

    if code is None:
        code = utils.get_encodings_from_content(content)
        # Single-argument print(): same output on Python 2 and Python 3.
        print('content %s %s' % (seed, code))
        if code:
            code = code[0]
            if code.lower() in ('gbk', 'gb2312'):
                code = 'gbk'

    return code
Esempio n. 21
0
def guess_content_encoding(content):
    '''
    Guess the encoding for plain content.

    Every charset declared inside ``content`` is tried first; the first one
    that can actually decode the content is returned. If none works, the
    result of ``detect`` is returned.

    Note: there's a performance issue due to chardet.
    '''
    # first try the encoding supplied by content
    encs = get_encodings_from_content(content) or []
    for enc in encs:
        try:
            content.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the content declared a charset name Python does
            # not know; treat it like any other unusable candidate.
            LOG.debug('Encoding from content doesn\'t work.')
        else:
            LOG.info('Detected encoding %s from content.', enc)
            return enc

    # neither encoding works, we have to go the hard way.
    start = clock()
    g = detect(content)
    LOG.info('Detected encoding %s with cofidence of %g in %gs.',
             g['encoding'], g['confidence'], clock() - start)
    return g['encoding']
Esempio n. 22
0
def guess_content_encoding(content):
    '''
    Guess the encoding for plain content.

    Every charset declared inside ``content`` is tried first; the first one
    that can actually decode the content is returned. If none works, the
    result of ``detect`` is returned.

    Note: there's a performance issue due to chardet.
    '''
    # first try the encoding supplied by content
    encs = get_encodings_from_content(content) or []
    for enc in encs:
        try:
            content.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the content declared a charset name Python does
            # not know; treat it like any other unusable candidate.
            LOG.debug('Encoding from content doesn\'t work.')
        else:
            LOG.info('Detected encoding %s from content.', enc)
            return enc

    # neither encoding works, we have to go the hard way.
    start = clock()
    g = detect(content)
    LOG.info('Detected encoding %s with cofidence of %g in %gs.',
             g['encoding'], g['confidence'], clock() - start)
    return g['encoding']
Esempio n. 23
0
    def filter_encoding(self,seed, headers,content):
        """Return True if the page looks Chinese, False otherwise.

        A gbk/gb2312 charset in the headers or declared in ``content`` is
        accepted immediately. A utf8 page is accepted only if its decoded
        text contains a character in the range U+4E00..U+9FA5. Everything
        else is rejected.

        ``seed`` is only used in the error output; presumably it identifies
        the page being processed (verify against the caller).
        """
        code = utils.get_encoding_from_headers(headers)
        if code:
            lowered = code.lower()
            if lowered in ('gbk', 'gb2312'):
                return True
            elif lowered in ('utf-8', 'utf8'):
                # as for utf8, we should check the content
                code = 'utf8'
            else:  # 'ISO-8859-1' and so on
                code = None

        # chinese website may also miss the content-encoding header, so
        # detect the content
        if code is None:
            for candidate in utils.get_encodings_from_content(content) or []:
                lowered = candidate.lower()
                if lowered in ('gbk', 'gb2312'):
                    return True
                if lowered in ('utf8', 'utf-8'):
                    code = 'utf8'
                    break

        if code != 'utf8':
            return False

        # here handle utf8: look for any character in the CJK range
        try:
            ucon = content.decode('utf8')
        except Exception as e:
            # Best effort: report the failure and reject the page. The
            # handler used to print an undefined name, which itself raised.
            print('%s %s' % (seed, e))
            return False

        for uchar in ucon:
            if 0x4e00 <= ord(uchar) <= 0x9fa5:
                return True
        # Explicit False instead of falling off the end with None.
        return False
Esempio n. 24
0
    def on_incoming(self, msg):
        """Post a one-line description of every URL found in a channel message.

        For ``text/html`` resources the text of the first ``<title>`` or
        ``<h1>`` is reported; for anything else the file name, content type
        and size. Messages whose type is not ``msg.CHANNEL`` are ignored.
        Any exception is printed and ends processing of the message.
        """
        if not msg.type == msg.CHANNEL:
            return

        # Catching all exceptions without alerting, as there is just so much crap that can go wrong with web stuff. Also, I'm lazy.
        try:
            urls = self.url_re.findall(msg.body)
            for url in urls:
                # Catch edge case where url is in brackets
                while url.startswith('(') and url.endswith(')'):
                    url = url[1:-1]

                # Same timeout as the GET below, so a stalled server cannot
                # block the handler indefinitely.
                head = requests.head(url, allow_redirects=True, timeout=5)
                # work on the URL we were redirected to, if any
                url = head.url

                message = ""
                content_type = head.headers['content-type']

                # HTML websites
                if 'text/html' in content_type:
                    # Set up any required request headers
                    req_headers = {}
                    # TODO: Accept-Language header from config

                    req = requests.get(url, headers=req_headers, timeout=5)

                    if 'charset' not in content_type:
                        # requests only looks at headers to detect the encoding, we must find the charset ourselves
                        # we can't use req.content because regex doesn't work on bytestrings apparently
                        encodings = get_encodings_from_content(req.text)
                        if encodings:
                            req.encoding = encodings[0]

                    soup = BeautifulSoup(req.text)

                    # Look for the <title> tag or an <h1>, whichever is first
                    title = soup.find(['title', 'h1'])
                    if title is None:
                        # Nothing to report for this URL; keep going so the
                        # remaining URLs in the message are still handled.
                        continue
                    title = self.utils.tag_to_string(title)
                    title = ' '.join(title.split())
                    message = "Title: " + title

                # Other resources
                else:
                    content_length = head.headers.get('content-length', '')
                    if content_length.isdigit():
                        size = self.sizeof_fmt(int(content_length))
                    else:
                        size = "Unknown size"

                    # Searches for the last segment of the URL (the filename);
                    # fall back to the whole URL if the pattern does not match.
                    match = re.search(r'/([^/]+)/?$', url)
                    filename = match.group(1) if match else url

                    message = "{}: {} ({})".format(filename, content_type,
                                                   size)

                self.bot.privmsg(msg.channel, message)

        except Exception as exception:
            print("Link Info Exception!")
            print(type(exception), exception)
0
 def test_none(self):
     """Empty content yields no encodings."""
     found = get_encodings_from_content('')
     assert len(found) == 0
Esempio n. 26
0
File: get.py Progetto: ye40/MiniUrl
def geturl(url,
           m='get',
           headers=None,
           ip=None,
           retries=0,
           encoding=None,
           data=None,
           *args,
           **kwargs):
    """Thin wrapper around requests that returns the response object.

        1. Sets a User-Agent automatically.
        2. Detects the HTML encoding automatically (only for text/html
           responses).

        Args:
            url: address to request.
            m: name of the requests function to call, 'get' by default.
            headers: request headers; a User-Agent is added if missing. The
                caller's dict is not modified.
            ip: proxy as a string in the form [ip:port].
            retries: number of retries after a failed request.
            encoding: encoding to force on the response; detected
                automatically when None.
            data: payload to send to the server; pick a matching ``m``.
            *args, **kwargs: passed through to the requests function.
        Returns:
            The requests response object, or the error information when all
            retries failed.
    """
    # Work on a private copy: a shared mutable default (or the caller's own
    # dict) must not be changed by adding the User-Agent.
    headers = dict(headers) if headers else {}
    if headers.get('User-Agent') is None:
        headers['User-Agent'] = user_agent()

    # Proxy handling
    proxies = None
    if ip:
        proxies = {'http': ip, 'https': ip, 'socks5': ip}

    # Forward the payload only when one was given, so positional pass-through
    # arguments keep working as before.
    request_kwargs = dict(kwargs)
    if data is not None:
        request_kwargs['data'] = data

    # Main request function
    #    retry re-runs it when the request fails
    @retry(retries=retries)
    def _response():
        request_method = getattr(requests, m)  # resolve the function by name
        rm = request_method(url,
                            headers=headers,
                            proxies=proxies,
                            *args,
                            **request_kwargs)
        return rm

    r = _response()

    # When all retries failed, return the error information. A real Response
    # is never a failure marker, and testing ``False in`` on it would iterate
    # over (and, for streamed responses, consume) its body.
    if not isinstance(r, requests.Response) and False in r:
        return r[1]

    # Prefer the encoding argument; detect automatically otherwise
    if encoding is not None:
        r.encoding = encoding
    else:
        # Downloads are usually large and sniffing them is slow, so only
        # text/html responses are analysed.
        if "text/html" in r.headers.get('Content-Type', ''):
            encoding = get_encodings_from_content(r.text)
            if encoding:
                r.encoding = encoding[0]

    return r
Esempio n. 27
0
def session_request(self,
                    method,
                    url,
                    params=None,
                    data=None,
                    headers=None,
                    cookies=None,
                    files=None,
                    auth=None,
                    timeout=None,
                    allow_redirects=True,
                    proxies=None,
                    hooks=None,
                    stream=None,
                    verify=False,
                    cert=None,
                    json=None):
    """Build a request from the arguments, send it and return the response.

    A default ``User-Agent`` is merged with ``headers``; ``PROXY_CONFIG`` is
    used when ``PROXY_CONFIG_BOOL`` is set and no proxies are passed;
    ``TIMEOUT`` is used when ``timeout`` is falsy.

    On return, ``resp.raw`` holds a text dump of the prepared request
    (request line, headers and, if present, body).
    """
    # Create the Request.
    session_jar = merge_cookies(RequestsCookieJar(), self.cookies)
    merged_cookies = merge_cookies(session_jar, cookies)
    default_header = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    req = Request(
        method=method.upper(),
        url=url,
        headers=merge_setting(headers, default_header),
        files=files,
        data=data or {},
        json=json,
        params=params or {},
        auth=auth,
        cookies=merged_cookies,
        hooks=hooks,
    )
    prep = self.prepare_request(req)

    # Text dump of the prepared request: request line plus headers, followed
    # by a blank line and the body when there is one.
    request_line = prep.method + ' ' + prep.url
    header_lines = '\n'.join(
        '{}: {}'.format(name, value) for name, value in prep.headers.items())
    raw = "{}\n{}".format(request_line, header_lines)
    if prep.body:
        raw = "{}\n\n{}".format(raw, prep.body)

    if not proxies:
        proxies = {}
    if PROXY_CONFIG_BOOL and not proxies:
        proxies = PROXY_CONFIG

    settings = self.merge_environment_settings(prep.url, proxies, stream,
                                               verify, cert)

    # Send the request.
    send_kwargs = {
        'timeout': timeout or TIMEOUT,
        'allow_redirects': allow_redirects,
    }
    send_kwargs.update(settings)
    resp = self.send(prep, **send_kwargs)

    if resp.encoding == 'ISO-8859-1':
        # Prefer a charset declared in the body over the ISO-8859-1 value.
        encodings = get_encodings_from_content(resp.text)
        resp.encoding = encodings[0] if encodings else resp.apparent_encoding

    # Replace the raw attribute with the request dump built above.
    resp.raw = raw

    return resp
Esempio n. 28
0
 def test_none(self):
     """Empty content yields no encodings."""
     found = get_encodings_from_content('')
     assert len(found) == 0
Esempio n. 29
0
 def test_pragmas(self, content):
     """The given content declares exactly one encoding, 'UTF-8'."""
     found = get_encodings_from_content(content)
     assert len(found) == 1 and found[0] == 'UTF-8'
Esempio n. 30
0
def session_request(self,
                    method,
                    url,
                    params=None,
                    data=None,
                    headers=None,
                    cookies=None,
                    files=None,
                    auth=None,
                    timeout=None,
                    allow_redirects=True,
                    proxies=None,
                    hooks=None,
                    stream=None,
                    verify=False,
                    cert=None,
                    json=None):
    """Build a request from the arguments, send it and return the response.

    The global ``conf`` supplies fallbacks for cookies, headers, proxies and
    the timeout. When ``conf.random_agent`` is set, a User-Agent picked from
    ``conf.agents`` is written into ``conf.http_headers`` before the headers
    are merged.

    When the response reports ``ISO-8859-1`` as its encoding, the encoding is
    replaced by the first charset declared in the body, or by
    ``resp.apparent_encoding`` if the body declares none.
    """

    def _merge_retain_none(request_setting,
                           session_setting,
                           dict_class=OrderedDict):
        # Merge two settings, request values winning. Keys whose value is
        # None are kept in the result rather than dropped.
        if session_setting is None:
            return request_setting
        if request_setting is None:
            return session_setting

        # Bypass if not a dictionary (e.g. verify)
        both_mappings = (isinstance(session_setting, Mapping)
                         and isinstance(request_setting, Mapping))
        if not both_mappings:
            return request_setting

        combined = dict_class(to_key_val_list(session_setting))
        combined.update(to_key_val_list(request_setting))
        return combined

    # Create the Request.
    session_jar = merge_cookies(RequestsCookieJar(), self.cookies)
    merged_cookies = merge_cookies(
        session_jar, cookies or (conf.cookie if 'cookie' in conf else None))
    if conf.random_agent:
        conf.http_headers[HTTP_HEADER.USER_AGENT] = choice(conf.agents)

    if 'http_headers' in conf:
        default_headers = conf.http_headers
    else:
        default_headers = {}

    req = Request(
        method=method.upper(),
        url=url,
        headers=_merge_retain_none(headers, default_headers),
        files=files,
        data=data or {},
        json=json,
        params=params or {},
        auth=auth,
        cookies=merged_cookies,
        hooks=hooks,
    )
    prep = self.prepare_request(req)

    # Only an omitted (None) proxies argument falls back to conf; an explicit
    # empty mapping is kept as given.
    if proxies is None:
        if 'proxies' in conf:
            proxies = conf.proxies
        else:
            proxies = {}

    settings = self.merge_environment_settings(prep.url, proxies, stream,
                                               verify, cert)

    timeout = timeout or conf.get("timeout", None)
    if timeout:
        timeout = float(timeout)

    # Send the request.
    send_kwargs = {
        'timeout': timeout,
        'allow_redirects': allow_redirects,
    }
    send_kwargs.update(settings)
    resp = self.send(prep, **send_kwargs)

    if resp.encoding == 'ISO-8859-1':
        # Prefer a charset declared in the body over the ISO-8859-1 value.
        encodings = get_encodings_from_content(resp.text)
        resp.encoding = encodings[0] if encodings else resp.apparent_encoding

    return resp
Esempio n. 31
0
 def test_pragmas(self, content):
     """The given content declares exactly one encoding, 'UTF-8'."""
     found = get_encodings_from_content(content)
     assert len(found) == 1 and found[0] == 'UTF-8'
Esempio n. 32
0
def session_request(self, method, url,
                    params=None, data=None, headers=None, cookies=None, files=None, auth=None,
                    timeout=None,
                    allow_redirects=True, proxies=None, hooks=None, stream=None, verify=False, cert=None, json=None):
    """Build, send and post-process a single HTTP request on this session.

    The parameters carry the same names as those of
    ``requests.Session.request``; presumably this function is installed as its
    replacement (TODO confirm against the patching code).

    Project defaults applied on top of the caller's arguments:

    * headers are merged with a default ``User-Agent`` (``conf.agent``) and
      ``Connection: close``;
    * when ``conf["proxy_config_bool"]`` is set and no proxy was passed,
      ``conf["proxy"]`` is used; a ``socks4``/``socks5`` entry is expanded
      into ``http``/``https`` proxy URLs;
    * ``conf["timeout"]`` is used when ``timeout`` is falsy.

    Returns:
        The response returned by ``self.send``. A plain-text rendering of the
        prepared request is attached as ``resp.reqinfo``. If the response
        encoding is ``ISO-8859-1`` it is replaced by the first encoding
        declared in the content, or by ``resp.apparent_encoding``.
    """
    # Create the Request.
    merged_cookies = merge_cookies(merge_cookies(RequestsCookieJar(), self.cookies),
                                   cookies)
    default_header = {
        "User-Agent": conf.agent,
        "Connection": "close"
    }
    req = Request(
        method=method.upper(),
        url=url,
        headers=merge_setting(headers, default_header),
        files=files,
        data=data or {},
        json=json,
        params=params or {},
        auth=auth,
        cookies=merged_cookies,
        hooks=hooks,
    )
    prep = self.prepare_request(req)

    # Render the prepared request as text. Work on a copy of the headers so
    # that adding "Host" for display does not alter what is actually sent.
    _headers = copy.deepcopy(prep.headers)
    if "Host" not in _headers:
        _headers["Host"] = urlparse(url).netloc

    # A bytes body would otherwise be formatted as its Python repr
    # ("b'...'"); decode it so the dump shows the payload itself.
    body = prep.body
    if isinstance(body, bytes):
        body = body.decode('utf-8', errors='replace')

    raw = "{}\n{}\n\n".format(
        prep.method + ' ' + prep.url + ' HTTP/1.1',
        '\n'.join('{}: {}'.format(k, v) for k, v in _headers.items()))
    if body:
        raw += "{}\n\n".format(body)

    proxies = proxies or {}
    if conf["proxy_config_bool"] and not proxies:
        proxies = conf["proxy"]
        # A socks entry is applied to both http and https traffic; socks4
        # takes precedence when both keys are present.
        for scheme in ("socks4", "socks5"):
            if scheme in proxies:
                _tmp_str = "{}://{}".format(scheme, proxies[scheme])
                proxies = {
                    "http": _tmp_str,
                    "https": _tmp_str
                }
                break

    # prep.url = prep.url.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')
    # fix https://github.com/boy-hack/w13scan/issues/64

    settings = self.merge_environment_settings(
        prep.url, proxies, stream, verify, cert
    )

    # Send the request.
    send_kwargs = {
        'timeout': timeout or conf["timeout"],
        'allow_redirects': allow_redirects,
    }
    send_kwargs.update(settings)

    resp = self.send(prep, **send_kwargs)

    # ISO-8859-1 is re-examined: prefer an encoding declared inside the
    # content, otherwise fall back to the detected one.
    if resp.encoding == 'ISO-8859-1':
        encodings = get_encodings_from_content(resp.text)
        resp.encoding = encodings[0] if encodings else resp.apparent_encoding

    resp.reqinfo = raw
    return resp
Esempio n. 33
0
def session_request(self,
                    method,
                    url,
                    params=None,
                    data=None,
                    headers=None,
                    cookies=None,
                    files=None,
                    auth=None,
                    timeout=None,
                    allow_redirects=True,
                    proxies=None,
                    hooks=None,
                    stream=None,
                    verify=False,
                    cert=None,
                    json=None):
    """Build, send and post-process a single HTTP request on this session.

    The parameters carry the same names as those of
    ``requests.Session.request``; presumably this function is installed as its
    replacement (TODO confirm against the patching code).

    Project defaults applied on top of the caller's arguments:

    * headers are merged with a fixed browser ``User-Agent`` and
      ``Connection: close``;
    * ``conf["proxy"]`` is used when ``conf["proxy_config_bool"]`` is set and
      no proxy was passed;
    * ``conf["timeout"]`` is used when ``timeout`` is falsy.

    Returns:
        The response returned by ``self.send``. ``resp.raw`` is replaced by a
        plain-text rendering of the prepared request; a truthy object found
        there beforehand is kept as ``resp.raw2``. If the response encoding
        is ``ISO-8859-1`` it is replaced by the first encoding declared in
        the content, or by ``resp.apparent_encoding``.
    """
    # Create the Request.
    merged_cookies = merge_cookies(
        merge_cookies(RequestsCookieJar(), self.cookies), cookies)
    default_header = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "Connection": "close"
    }
    req = Request(
        method=method.upper(),
        url=url,
        headers=merge_setting(headers, default_header),
        files=files,
        data=data or {},
        json=json,
        params=params or {},
        auth=auth,
        cookies=merged_cookies,
        hooks=hooks,
    )
    prep = self.prepare_request(req)

    # A bytes body would otherwise be formatted as its Python repr
    # ("b'...'"); decode it so the dump shows the payload itself.
    body = prep.body
    if isinstance(body, bytes):
        body = body.decode('utf-8', errors='replace')

    # Text rendering of the request: request line, headers, optional body.
    raw = "{}\n{}".format(
        prep.method + ' ' + prep.url,
        '\n'.join('{}: {}'.format(k, v) for k, v in prep.headers.items()))
    if body:
        raw += "\n\n{}".format(body)

    proxies = proxies or {}
    if conf["proxy_config_bool"] and not proxies:
        proxies = conf["proxy"]

    # prep.url = prep.url.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')
    # fix https://github.com/boy-hack/w13scan/issues/64

    settings = self.merge_environment_settings(prep.url, proxies, stream,
                                               verify, cert)

    # Send the request.
    send_kwargs = {
        'timeout': timeout or conf["timeout"],
        'allow_redirects': allow_redirects,
    }
    send_kwargs.update(settings)

    resp = self.send(prep, **send_kwargs)

    # ISO-8859-1 is re-examined: prefer an encoding declared inside the
    # content, otherwise fall back to the detected one.
    if resp.encoding == 'ISO-8859-1':
        encodings = get_encodings_from_content(resp.text)
        resp.encoding = encodings[0] if encodings else resp.apparent_encoding

    # NOTE: existing contract - ``resp.raw`` carries the request dump from
    # here on; whatever truthy object was stored there moves to ``resp.raw2``.
    previous_raw = getattr(resp, 'raw', None)
    resp.raw = raw
    if previous_raw:
        resp.raw2 = previous_raw
    return resp
Esempio n. 34
0
	def on_incoming(self, msg):
		"""Post information about every URL found in a channel message.

		Each match of ``self.url_re`` in ``msg.body`` is probed with a HEAD
		request (redirects followed). Pages whose content type contains
		``text/html`` are fetched and their ``<title>`` (or ``<h1>``, whichever
		comes first) is reported; any other resource is reported as
		``filename: content-type (size)``. Results are sent to ``msg.channel``
		through ``self.bot.privmsg``.

		Messages whose ``msg.type`` is not ``msg.CHANNEL`` are ignored.
		Failures are printed and never propagate to the caller; a failing or
		title-less URL does not prevent later URLs from being handled.
		"""
		if msg.type != msg.CHANNEL:
			return

		# Catching all exceptions without alerting, as there is just so much
		# that can go wrong with web stuff. This is a deliberate best-effort.
		try:
			urls = self.url_re.findall(msg.body)
		except Exception as exception:
			print("Link Info Exception!")
			print(type(exception), exception)
			return

		for url in urls:
			# One try per URL so that a single broken link does not suppress
			# the information for the remaining links of the message.
			try:
				# Catch edge case where url is in brackets
				while url.startswith('(') and url.endswith(')'):
					url = url[1:-1]

				# Bounded like the GET below; without a timeout an unresponsive
				# server would block this handler indefinitely.
				head = requests.head(url, allow_redirects=True, timeout=5)
				# work on the URL we were redirected to, if any
				url = head.url

				content_type = head.headers['content-type']

				# HTML websites
				if 'text/html' in content_type:
					# Set up any required request headers
					req_headers = {}
					# TODO: Accept-Language header from config

					req = requests.get(url, headers=req_headers, timeout=5)

					if 'charset' not in content_type:
						# requests only looks at headers to detect the encoding, we must find the charset ourselves
						# we can't use req.content because regex doesn't work on bytestrings apparently
						encodings = get_encodings_from_content(req.text)
						if encodings:
							req.encoding = encodings[0]

					soup = BeautifulSoup(req.text)

					# Look for the <title> tag or an <h1>, whichever is first
					title = soup.find(['title', 'h1'])
					if title is None:
						# Nothing to report for this URL; move on to the next.
						continue
					title = self.utils.tag_to_string(title)
					# Collapse runs of whitespace (including newlines).
					title = ' '.join(title.split())
					message = "Title: " + title

				# Other resources
				else:
					content_length = head.headers.get('content-length', '')
					if content_length.isdigit():
						size = self.sizeof_fmt(int(content_length))
					else:
						size = "Unknown size"

					# Searches for the last segment of the URL (the filename);
					# fall back to the whole URL if no segment can be isolated.
					match = re.search(r'/([^/]+)/?$', url)
					filename = match.group(1) if match else url

					message = "{}: {} ({})".format(filename, content_type, size)

				self.bot.privmsg(msg.channel, message)

			except Exception as exception:
				print("Link Info Exception!")
				print(type(exception), exception)