Example 1
import requests
from pyquery import PyQuery as pq


# `headers` and `MysqlHelper` are defined elsewhere in the source project.
def main():
    mh = MysqlHelper('47.110.88.64', 'root', 'admin963', 'zhizhudashi', 'utf8')

    url_list = [1, 342, 1679, 344, 11]
    for i in url_list:
        url = 'http://top.baidu.com/buzz?b=' + str(i) + '&c=513&fr=topbuzz_b42_c513'
        try:
            response = requests.get(url, headers=headers)
            response.encoding = 'gb2312'
            html = response.text
            doc = pq(html)
            new_names = doc(".list-title")
            for name in new_names.items():
                string = name.text()
                select_sql = 'select * from keywords where title = "%s"' % (string)
                id = mh.find_id(select_sql)
                if id:
                    # The keyword is already stored; skip it.
                    continue
                insert_sql = "insert into keywords (title,object_id) values ('%s',%d)" % (string, 1)
                mh.cud(insert_sql)
            # Disabled alternative: append the hot words to a local text file.
            # f = open('百度热词.txt', 'a')
            # for name in new_names.items():
            #     string = name.text()
            #     f.write('\n' + str(string))
            # f.close()
        except Exception as e:
            print(e)
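
A caveat on Example 1: find_id and cud are fed SQL built by string interpolation, which breaks on titles containing quotes and is open to injection. Below is a minimal parameterized sketch of the same check-then-insert step using pymysql directly; the internals of MysqlHelper are not shown in the source, so the choice of driver is an assumption.

import pymysql

conn = pymysql.connect(host='47.110.88.64', user='root', password='admin963',
                       database='zhizhudashi', charset='utf8')
string = 'example hot word'  # stands in for a scraped title
with conn.cursor() as cur:
    # The driver escapes the parameters, so quotes in a title cannot break the SQL.
    cur.execute('select id from keywords where title = %s', (string,))
    if cur.fetchone() is None:
        cur.execute('insert into keywords (title, object_id) values (%s, %s)',
                    (string, 1))
        conn.commit()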
Example 2
    def returnSohuURLs(self, index, number, dataList):
        '''Batch-extract Sohu news URLs.
           Process: simulate the backend request to get JSON data, parse each
           article's id out of it, then join the detail-page prefix with the id
           to get the detail-page URL.'''
        rang = number.split('-')
        start = int(rang[0])
        end = int(rang[1])

        pageSize = end - start
        if pageSize <= 0:
            print('Invalid count configured for the url with suffix ' + index +
                  ' ==> ' + rang[0] + ':' + rang[1])
            return []
        parameter = '&sceneId=' + index
        page = int(end / pageSize)
        parameter = parameter + '&page=' + str(page) + '&size=' + str(pageSize)

        # Attach request headers and a cookie jar to mimic a browser request.
        cookies = requests.cookies.RequestsCookieJar()
        response = requests.get(dataList[0] + parameter,
                                headers=getHeader(),
                                cookies=cookies)
        content = response.text
        # Parse the response text as JSON for easier handling.
        data = json.loads(content)
        urls = []
        for temp in data:
            url = dataList[1] + str(temp['id']) + '_' + str(temp['authorId'])
            urls.append(url)
        return urls
Example 3
def _read_from_url(url: str, config: Config = None) -> IO:
    """Reads data from *url* with an HTTP *GET*.

    This function supports fetching from resources which use basic HTTP auth as
    laid out by RFC1738 § 3.1. See § 5 for grammar definitions for URLs.

    .. seealso::

       https://www.ietf.org/rfc/rfc1738.txt

    :param url: URL of an HTTP resource
    :type url: ``str``

    :return: data read from resource described by *url*
    :rtype: ``file``-like object
    """
    r = requests.get(url,
                     stream=True,
                     config=config,
                     timeout=config.intersphinx_timeout)
    r.raise_for_status()
    r.raw.url = r.url
    # decode content-body based on the header.
    # ref: https://github.com/kennethreitz/requests/issues/2155
    r.raw.read = functools.partial(r.raw.read, decode_content=True)
    return r.raw
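
The `config` keyword means `requests` here is Sphinx's `sphinx.util.requests` wrapper; plain `requests.get` would reject it. The streaming-and-decoding pattern itself is library-agnostic. A minimal sketch with plain `requests`, the inventory URL serving only as an example:

import functools

import requests


def read_from_url(url: str, timeout: float = 10.0):
    """Plain-requests sketch of the pattern above: a decoded, file-like handle."""
    r = requests.get(url, stream=True, timeout=timeout)
    r.raise_for_status()
    # Decode the body according to the Content-Encoding header, as above.
    r.raw.read = functools.partial(r.raw.read, decode_content=True)
    return r.raw


handle = read_from_url('https://docs.python.org/3/objects.inv')
header = handle.read(32)  # first bytes of the intersphinx inventory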
Example 4
    def handle(self, node: nodes.image) -> None:
        try:
            basename = os.path.basename(node['uri'])
            if '?' in basename:
                basename = basename.split('?')[0]
            if basename == '' or len(basename) > MAX_FILENAME_LEN:
                filename, ext = os.path.splitext(node['uri'])
                basename = sha1(filename.encode()).hexdigest() + ext
            basename = re.sub(CRITICAL_PATH_CHAR_RE, "_", basename)

            dirname = node['uri'].replace('://', '/').translate({
                ord("?"): "/",
                ord("&"): "/"
            })
            if len(dirname) > MAX_FILENAME_LEN:
                dirname = sha1(dirname.encode()).hexdigest()
            ensuredir(os.path.join(self.imagedir, dirname))
            path = os.path.join(self.imagedir, dirname, basename)

            headers = {}
            if os.path.exists(path):
                timestamp = ceil(os.stat(path).st_mtime)  # type: float
                headers['If-Modified-Since'] = epoch_to_rfc1123(timestamp)

            r = requests.get(node['uri'], headers=headers)
            if r.status_code >= 400:
                logger.warning(
                    __('Could not fetch remote image: %s [%d]') %
                    (node['uri'], r.status_code))
            else:
                self.app.env.original_image_uri[path] = node['uri']

                if r.status_code == 200:
                    with open(path, 'wb') as f:
                        f.write(r.content)

                last_modified = r.headers.get('last-modified')
                if last_modified:
                    timestamp = rfc1123_to_epoch(last_modified)
                    os.utime(path, (timestamp, timestamp))

                mimetype = guess_mimetype(path, default='*')
                if mimetype != '*' and os.path.splitext(basename)[1] == '':
                    # append a suffix if URI does not contain suffix
                    ext = get_image_extension(mimetype)
                    newpath = os.path.join(self.imagedir, dirname,
                                           basename + ext)
                    os.replace(path, newpath)
                    self.app.env.original_image_uri.pop(path)
                    self.app.env.original_image_uri[newpath] = node['uri']
                    path = newpath
                node['candidates'].pop('?')
                node['candidates'][mimetype] = path
                node['uri'] = path
                self.app.env.images.add_file(self.env.docname, path)
        except Exception as exc:
            logger.warning(
                __('Could not fetch remote image: %s [%s]') %
                (node['uri'], exc))
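
The If-Modified-Since header and the 200-versus-304 handling above implement the standard HTTP conditional GET. A standalone sketch of just that caching step with plain requests; the file path and URL are placeholders:

import os
from email.utils import formatdate

import requests

CACHE_PATH = 'cached-image.png'              # placeholder local cache file
IMAGE_URL = 'https://example.com/image.png'  # placeholder remote image

headers = {}
if os.path.exists(CACHE_PATH):
    # Ask the server to skip the body if our cached copy is still current.
    headers['If-Modified-Since'] = formatdate(os.stat(CACHE_PATH).st_mtime,
                                              usegmt=True)

r = requests.get(IMAGE_URL, headers=headers)
if r.status_code == 200:      # fresh copy: overwrite the cache
    with open(CACHE_PATH, 'wb') as f:
        f.write(r.content)
elif r.status_code == 304:    # not modified: keep the cached copy
    pass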
Example 5
    def getgooddetail(self, response):

        # Build a goods item object for this product detail page.
        goods = goodsItem()
        goods['goodsName'] = response.xpath(
            '//div[@class="sku-name"]/text()')[0].root
        goods['goodsUrl'] = response.url
        # goods['goodsUrl'] = response.xpath('//div[@class="sku-name"]/text()')
        goods['goodsId'] = response.url.split('/')[-1].split('.')[0]
        # Category passed in via the request meta (currently disabled).
        # goods['goodsClassify'] = response.meta['classify']

        p_Url = ('https://c0.3.cn/stock?skuId=%s&cat=1320,1585,10975'
                 '&venderId=1000008814&area=1_72_4137_0&buyNum=1&choseSuitSkuIds='
                 '&extraParam={"originid":"1"}&ch=1&fqsp=0&pduid=775011473'
                 % goods['goodsId'])
        # p_Url = self.price_url.format(goods['goodsId'])
        resp = requests.get(p_Url)
        resp = resp.json()

        goods['goodsPrice'] = resp['stock']['jdPrice']['p']

        p_Url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + goods[
            'goodsId']
        com_resp = requests.get(p_Url)
        com_resp = com_resp.json()
        goods['goodsCommentCount'] = com_resp['CommentsCount'][0][
            'CommentCountStr']
        goods['hp'] = com_resp['CommentsCount'][0]['GoodCountStr']     # positive
        goods['zp'] = com_resp['CommentsCount'][0]['GeneralCountStr']  # neutral
        goods['cp'] = com_resp['CommentsCount'][0]['PoorCountStr']     # negative
        yield goods
Example 6
    def handle(self, node):
        # type: (nodes.Node) -> None
        try:
            basename = os.path.basename(node['uri'])
            if '?' in basename:
                basename = basename.split('?')[0]
            if basename == '' or len(basename) > MAX_FILENAME_LEN:
                filename, ext = os.path.splitext(node['uri'])
                basename = sha1(filename.encode("utf-8")).hexdigest() + ext

            dirname = node['uri'].replace('://', '/').translate({ord("?"): u"/",
                                                                 ord("&"): u"/"})
            if len(dirname) > MAX_FILENAME_LEN:
                dirname = sha1(dirname.encode('utf-8')).hexdigest()
            ensuredir(os.path.join(self.imagedir, dirname))
            path = os.path.join(self.imagedir, dirname, basename)

            headers = {}
            if os.path.exists(path):
                timestamp = ceil(os.stat(path).st_mtime)  # type: float
                headers['If-Modified-Since'] = epoch_to_rfc1123(timestamp)

            r = requests.get(node['uri'], headers=headers)
            if r.status_code >= 400:
                logger.warning(__('Could not fetch remote image: %s [%d]') %
                               (node['uri'], r.status_code))
            else:
                self.app.env.original_image_uri[path] = node['uri']

                if r.status_code == 200:
                    with open(path, 'wb') as f:
                        f.write(r.content)

                last_modified = r.headers.get('last-modified')
                if last_modified:
                    timestamp = rfc1123_to_epoch(last_modified)
                    os.utime(path, (timestamp, timestamp))

                mimetype = guess_mimetype(path, default='*')
                if mimetype != '*' and os.path.splitext(basename)[1] == '':
                    # append a suffix if URI does not contain suffix
                    ext = get_image_extension(mimetype)
                    newpath = os.path.join(self.imagedir, dirname, basename + ext)
                    movefile(path, newpath)
                    self.app.env.original_image_uri.pop(path)
                    self.app.env.original_image_uri[newpath] = node['uri']
                    path = newpath
                node['candidates'].pop('?')
                node['candidates'][mimetype] = path
                node['uri'] = path
                self.app.env.images.add_file(self.env.docname, path)
        except Exception as exc:
            logger.warning(__('Could not fetch remote image: %s [%s]') %
                           (node['uri'], text_type(exc)))
Example 7
    def handle(self, node):
        # type: (nodes.Node) -> None
        basename = os.path.basename(node['uri'])
        if '?' in basename:
            basename = basename.split('?')[0]
        dirname = node['uri'].replace('://', '/').translate({
            ord("?"): u"/",
            ord("&"): u"/"
        })
        ensuredir(os.path.join(self.imagedir, dirname))
        path = os.path.join(self.imagedir, dirname, basename)
        try:
            headers = {}
            if os.path.exists(path):
                timestamp = ceil(os.stat(path).st_mtime)
                headers['If-Modified-Since'] = epoch_to_rfc1123(timestamp)

            r = requests.get(node['uri'], headers=headers)
            if r.status_code >= 400:
                logger.warning('Could not fetch remote image: %s [%d]' %
                               (node['uri'], r.status_code))
            else:
                self.app.env.original_image_uri[path] = node['uri']

                if r.status_code == 200:
                    with open(path, 'wb') as f:
                        f.write(r.content)

                last_modified = r.headers.get('last-modified')
                if last_modified:
                    timestamp = rfc1123_to_epoch(last_modified)
                    os.utime(path, (timestamp, timestamp))

                mimetype = guess_mimetype(path, default='*')
                node['candidates'].pop('?')
                node['candidates'][mimetype] = path
                node['uri'] = path
                self.app.env.images.add_file(self.env.docname, path)
        except Exception as exc:
            logger.warning('Could not fetch remote image: %s [%s]' %
                           (node['uri'], text_type(exc)))
Example 8
def _read_from_url(url, config=None):
    # type: (unicode, Config) -> IO
    """Reads data from *url* with an HTTP *GET*.

    This function supports fetching from resources which use basic HTTP auth as
    laid out by RFC1738 § 3.1. See § 5 for grammar definitions for URLs.

    .. seealso::

       https://www.ietf.org/rfc/rfc1738.txt

    :param url: URL of an HTTP resource
    :type url: ``str``

    :return: data read from resource described by *url*
    :rtype: ``file``-like object
    """
    r = requests.get(url, stream=True, config=config, timeout=config.intersphinx_timeout)
    r.raise_for_status()
    r.raw.url = r.url
    return r.raw
Example 9
    def returnQQURLs(self, number, req):
        '''Batch-extract Tencent (QQ) news URLs; plain news only, topic pages are skipped.
           Process: each backend request carries the ids returned by the previous
           request as the expIds parameter.'''
        # The QQ news endpoint returns 10 items per request.
        pageSize = 10
        rang = number.split('-')
        start = int(rang[0])
        end = int(rang[1])
        pageCount = int((end - start) / pageSize)
        urlList = []
        expIdsList = []

        # Attach a cookie jar so cookies persist across page requests.
        cookies = requests.cookies.RequestsCookieJar()
        for i in range(pageCount):
            if i == 0:
                url2 = req + '&page=' + str(i) + '&expIds='
            else:
                url2 = req + '&page=' + str(i) + '&expIds=' + '|'.join(
                    str(id) for id in expIdsList)
            expIdsList.clear()
            # Send with request headers and cookies.
            response = requests.get(url2, headers=getHeader(), cookies=cookies)
            # Parse the response as JSON.
            content = json.loads(response.text)
            dataList = content.get('data')
            for temp in dataList:
                # article_type == 11 marks a topic page rather than a news article.
                if temp['article_type'] != 11:
                    url = temp['vurl']
                    id = temp['id']
                    urlList.append(url)
                    expIdsList.append(id)
                # else: it is a topic page; skip it.
        # print('QQ news url count: %d' % len(urlList))
        return urlList
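
This example and the Toutiao one below share the same cursor-pagination shape: each response carries the token (expIds here, max_behot_time there) that parameterizes the next request. A generic sketch against a hypothetical endpoint that returns {"items": [...], "next": "<cursor>"}:

import requests


def fetch_all(base_url, pages=3):
    """Cursor-pagination sketch: feed each response's cursor into the next call."""
    results, cursor = [], ''
    session = requests.Session()  # persists cookies across pages, like the jar above
    for _ in range(pages):
        data = session.get(base_url, params={'cursor': cursor}).json()
        results.extend(data['items'])
        cursor = data['next']      # token carried into the next request
        if not cursor:
            break
    return results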
Example 10
    def handle(self, node):
        # type: (nodes.Node) -> None
        basename = os.path.basename(node['uri'])
        if '?' in basename:
            basename = basename.split('?')[0]
        dirname = node['uri'].replace('://', '/').translate({ord("?"): u"/",
                                                             ord("&"): u"/"})
        ensuredir(os.path.join(self.imagedir, dirname))
        path = os.path.join(self.imagedir, dirname, basename)
        try:
            headers = {}
            if os.path.exists(path):
                timestamp = ceil(os.stat(path).st_mtime)
                headers['If-Modified-Since'] = epoch_to_rfc1123(timestamp)

            r = requests.get(node['uri'], headers=headers)
            if r.status_code >= 400:
                logger.warning('Could not fetch remote image: %s [%d]' %
                               (node['uri'], r.status_code))
            else:
                self.app.env.original_image_uri[path] = node['uri']

                if r.status_code == 200:
                    with open(path, 'wb') as f:
                        f.write(r.content)

                last_modified = r.headers.get('last-modified')
                if last_modified:
                    timestamp = rfc1123_to_epoch(last_modified)
                    os.utime(path, (timestamp, timestamp))

                mimetype = guess_mimetype(path, default='*')
                node['candidates'].pop('?')
                node['candidates'][mimetype] = path
                node['uri'] = path
                self.app.env.images.add_file(self.env.docname, path)
        except Exception as exc:
            logger.warning('Could not fetch remote image: %s [%s]' %
                           (node['uri'], text_type(exc)))
Example 11
    def returnTouTiaoURLs(self, index, number, req):
        '''Batch-extract Toutiao news URLs.
           Process: each backend request carries the max_behot_time value from
           the previous response as a parameter.'''
        max_behot_time = max_behot_time_tmp = 0
        category = index
        rang = number.split('-')
        start = int(rang[0])
        end = int(rang[1])
        pageSize = 10
        pageCount = int((end - start) / pageSize)
        urlList = []
        # Attach a cookie jar so cookies persist across page requests.
        cookies = requests.cookies.RequestsCookieJar()
        for i in range(pageCount):
            reqUrl = req + '&category=%s&max_behot_time=%d&max_behot_time_tmp=%d' % (
                category, max_behot_time, max_behot_time_tmp)

            print(reqUrl)
            # Send with request headers and cookies; fold response cookies back in.
            response = requests.get(reqUrl,
                                    headers=getHeader(),
                                    cookies=cookies)
            cookies.update(response.cookies)
            content = json.loads(response.text)
            dataList = content['data']
            for data in dataList:
                if data['article_genre'] == 'article':
                    itemId = data['item_id']
                    # The JSON lacks the full detail-page URL; join the prefix
                    # with the item id.
                    newsUrl = 'https://www.toutiao.com/a%s' % itemId
                    urlList.append(newsUrl)
            nextDic = content['next']
            nextId = nextDic['max_behot_time']
            max_behot_time = max_behot_time_tmp = nextId
        return urlList
Example 12
def _read_from_url(url, config=None):
    # type: (unicode, Config) -> IO
    """Reads data from *url* with an HTTP *GET*.

    This function supports fetching from resources which use basic HTTP auth as
    laid out by RFC1738 § 3.1. See § 5 for grammar definitions for URLs.

    .. seealso::

       https://www.ietf.org/rfc/rfc1738.txt

    :param url: URL of an HTTP resource
    :type url: ``str``

    :return: data read from resource described by *url*
    :rtype: ``file``-like object
    """
    r = requests.get(url, stream=True, config=config, timeout=config.intersphinx_timeout)
    r.raise_for_status()
    r.raw.url = r.url
    # decode content-body based on the header.
    # ref: https://github.com/kennethreitz/requests/issues/2155
    r.raw.read = functools.partial(r.raw.read, decode_content=True)
    return r.raw
Example 13
        def check_uri() -> Tuple[str, str, int]:
            # split off anchor
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
                for rex in self.anchors_ignore:
                    if rex.match(anchor):
                        anchor = None
                        break
            else:
                req_url = uri
                anchor = None

            # handle non-ASCII URIs
            try:
                req_url.encode('ascii')
            except UnicodeError:
                req_url = encode_uri(req_url)

            # Get auth info, if any
            for pattern, auth_info in self.auth:
                if pattern.match(uri):
                    break
            else:
                auth_info = None

            # update request headers for the URL
            kwargs['headers'] = get_request_headers()

            try:
                if anchor and self.config.linkcheck_anchors:
                    # Read the whole document and see if #anchor exists
                    response = requests.get(req_url,
                                            stream=True,
                                            config=self.config,
                                            auth=auth_info,
                                            **kwargs)
                    response.raise_for_status()
                    found = check_anchor(response, unquote(anchor))

                    if not found:
                        raise Exception(__("Anchor '%s' not found") % anchor)
                else:
                    try:
                        # try a HEAD request first, which should be easier on
                        # the server and the network
                        response = requests.head(req_url,
                                                 allow_redirects=True,
                                                 config=self.config,
                                                 auth=auth_info,
                                                 **kwargs)
                        response.raise_for_status()
                    # Servers drop the connection on HEAD requests, causing
                    # ConnectionError.
                    except (ConnectionError, HTTPError,
                            TooManyRedirects) as err:
                        if isinstance(
                                err,
                                HTTPError) and err.response.status_code == 429:
                            raise
                        # retry with GET request if that fails, some servers
                        # don't like HEAD requests.
                        response = requests.get(req_url,
                                                stream=True,
                                                config=self.config,
                                                auth=auth_info,
                                                **kwargs)
                        response.raise_for_status()
            except HTTPError as err:
                if err.response.status_code == 401:
                    # We'll take "Unauthorized" as working.
                    return 'working', ' - unauthorized', 0
                elif err.response.status_code == 429:
                    next_check = self.limit_rate(err.response)
                    if next_check is not None:
                        self.wqueue.put(CheckRequest(next_check, hyperlink),
                                        False)
                        return 'rate-limited', '', 0
                    return 'broken', str(err), 0
                elif err.response.status_code == 503:
                    # We'll take "Service Unavailable" as ignored.
                    return 'ignored', str(err), 0
                else:
                    return 'broken', str(err), 0
            except Exception as err:
                return 'broken', str(err), 0
            else:
                netloc = urlparse(req_url).netloc
                try:
                    del self.rate_limits[netloc]
                except KeyError:
                    pass
            if response.url.rstrip('/') == req_url.rstrip('/'):
                return 'working', '', 0
            else:
                new_url = response.url
                if anchor:
                    new_url += '#' + anchor

                if allowed_redirect(req_url, new_url):
                    return 'working', '', 0
                elif response.history:
                    # history contains any redirects, get last
                    code = response.history[-1].status_code
                    return 'redirected', new_url, code
                else:
                    return 'redirected', new_url, 0
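
The core retry idea in the linkcheck examples (try a cheap HEAD first, fall back to GET when the server mishandles it) can be isolated from the Sphinx config and auth plumbing. A minimal sketch with plain requests:

import requests
from requests.exceptions import ConnectionError, HTTPError, TooManyRedirects


def probe(url, timeout=10):
    """HEAD-then-GET sketch: prefer the cheap request, retry when servers balk."""
    try:
        r = requests.head(url, allow_redirects=True, timeout=timeout)
        r.raise_for_status()
    except (ConnectionError, HTTPError, TooManyRedirects):
        # Some servers drop or reject HEAD; retry with a streamed GET.
        r = requests.get(url, stream=True, timeout=timeout)
        r.raise_for_status()
    return r.status_code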
Example 14
        def check_uri():
            # type: () -> Tuple[unicode, unicode, int]
            # split off anchor
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
                for rex in self.anchors_ignore:
                    if rex.match(anchor):
                        anchor = None
                        break
            else:
                req_url = uri
                anchor = None

            # handle non-ASCII URIs
            try:
                req_url.encode('ascii')
            except UnicodeError:
                req_url = encode_uri(req_url)

            try:
                if anchor and self.app.config.linkcheck_anchors:
                    # Read the whole document and see if #anchor exists
                    response = requests.get(req_url,
                                            stream=True,
                                            config=self.app.config,
                                            **kwargs)
                    found = check_anchor(response, unquote(anchor))

                    if not found:
                        raise Exception("Anchor '%s' not found" % anchor)
                else:
                    try:
                        # try a HEAD request first, which should be easier on
                        # the server and the network
                        response = requests.head(req_url,
                                                 config=self.app.config,
                                                 **kwargs)
                        response.raise_for_status()
                    except HTTPError as err:
                        # retry with GET request if that fails, some servers
                        # don't like HEAD requests.
                        response = requests.get(req_url,
                                                stream=True,
                                                config=self.app.config,
                                                **kwargs)
                        response.raise_for_status()
            except HTTPError as err:
                if err.response.status_code == 401:
                    # We'll take "Unauthorized" as working.
                    return 'working', ' - unauthorized', 0
                else:
                    return 'broken', str(err), 0
            except Exception as err:
                if is_ssl_error(err):
                    return 'ignored', str(err), 0
                else:
                    return 'broken', str(err), 0
            if response.url.rstrip('/') == req_url.rstrip('/'):
                return 'working', '', 0
            else:
                new_url = response.url
                if anchor:
                    new_url += '#' + anchor
                # history contains any redirects, get last
                if response.history:
                    code = response.history[-1].status_code
                    return 'redirected', new_url, code
                else:
                    return 'redirected', new_url, 0
Example 15
        def check_uri():
            # type: () -> Tuple[unicode, unicode, int]
            # split off anchor
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
                for rex in self.anchors_ignore:
                    if rex.match(anchor):
                        anchor = None
                        break
            else:
                req_url = uri
                anchor = None

            # handle non-ASCII URIs
            try:
                req_url.encode('ascii')
            except UnicodeError:
                req_url = encode_uri(req_url)

            try:
                if anchor and self.app.config.linkcheck_anchors:
                    # Read the whole document and see if #anchor exists
                    response = requests.get(req_url, stream=True, config=self.app.config,
                                            **kwargs)
                    found = check_anchor(response, unquote(anchor))

                    if not found:
                        raise Exception("Anchor '%s' not found" % anchor)
                else:
                    try:
                        # try a HEAD request first, which should be easier on
                        # the server and the network
                        response = requests.head(req_url, config=self.app.config, **kwargs)
                        response.raise_for_status()
                    except HTTPError as err:
                        # retry with GET request if that fails, some servers
                        # don't like HEAD requests.
                        response = requests.get(req_url, stream=True, config=self.app.config,
                                                **kwargs)
                        response.raise_for_status()
            except HTTPError as err:
                if err.response.status_code == 401:
                    # We'll take "Unauthorized" as working.
                    return 'working', ' - unauthorized', 0
                else:
                    return 'broken', str(err), 0
            except Exception as err:
                if is_ssl_error(err):
                    return 'ignored', str(err), 0
                else:
                    return 'broken', str(err), 0
            if response.url.rstrip('/') == req_url.rstrip('/'):
                return 'working', '', 0
            else:
                new_url = response.url
                if anchor:
                    new_url += '#' + anchor
                # history contains any redirects, get last
                if response.history:
                    code = response.history[-1].status_code
                    return 'redirected', new_url, code
                else:
                    return 'redirected', new_url, 0
Example 16
        def check_uri() -> Tuple[str, str, int]:
            # split off anchor
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
                for rex in self.anchors_ignore:
                    if rex.match(anchor):
                        anchor = None
                        break
            else:
                req_url = uri
                anchor = None

            # handle non-ASCII URIs
            try:
                req_url.encode('ascii')
            except UnicodeError:
                req_url = encode_uri(req_url)

            # Get auth info, if any
            for pattern, auth_info in self.auth:
                if pattern.match(uri):
                    break
            else:
                auth_info = None

            # update request headers for the URL
            kwargs['headers'] = get_request_headers()

            try:
                if anchor and self.config.linkcheck_anchors:
                    # Read the whole document and see if #anchor exists
                    response = requests.get(req_url,
                                            stream=True,
                                            config=self.config,
                                            auth=auth_info,
                                            **kwargs)
                    response.raise_for_status()
                    anchor_str = unquote(anchor)
                    # Hack (?): https://github.com/container-storage-interface/spec/blob/master/spec.md#getplugininfo
                    # is a valid anchor, but the actual id of the anchor is user-content-getplugininfo, which causes
                    # the anchor check to fail:
                    # <a id="user-content-getplugininfo" class="anchor" aria-hidden="true" href="#getplugininfo">
                    #
                    # Might have to be fixed in AnchorCheckParser instead?
                    if req_url.startswith('https://github.com/'):
                        anchor_str = "user-content-" + anchor_str
                    found = check_anchor(response, anchor_str)

                    if not found:
                        raise Exception(__("Anchor '%s' not found") % anchor)
                else:
                    try:
                        # try a HEAD request first, which should be easier on
                        # the server and the network
                        response = requests.head(req_url,
                                                 allow_redirects=True,
                                                 config=self.config,
                                                 auth=auth_info,
                                                 **kwargs)
                        response.raise_for_status()
                    except (HTTPError, TooManyRedirects) as err:
                        if isinstance(
                                err,
                                HTTPError) and err.response.status_code == 429:
                            raise
                        # retry with GET request if that fails, some servers
                        # don't like HEAD requests.
                        response = requests.get(req_url,
                                                stream=True,
                                                config=self.config,
                                                auth=auth_info,
                                                **kwargs)
                        response.raise_for_status()
            except HTTPError as err:
                if err.response.status_code == 401:
                    # We'll take "Unauthorized" as working.
                    return 'working', ' - unauthorized', 0
                elif err.response.status_code == 429:
                    next_check = self.limit_rate(err.response)
                    if next_check is not None:
                        self.wqueue.put(CheckRequest(next_check, hyperlink),
                                        False)
                        return 'rate-limited', '', 0
                    return 'broken', str(err), 0
                elif err.response.status_code == 503:
                    # We'll take "Service Unavailable" as ignored.
                    return 'ignored', str(err), 0
                else:
                    return 'broken', str(err), 0
            except Exception as err:
                return 'broken', str(err), 0
            else:
                netloc = urlparse(req_url).netloc
                try:
                    del self.rate_limits[netloc]
                except KeyError:
                    pass
            if response.url.rstrip('/') == req_url.rstrip('/'):
                return 'working', '', 0
            else:
                new_url = response.url
                if anchor:
                    new_url += '#' + anchor
                # history contains any redirects, get last
                if response.history:
                    code = response.history[-1].status_code
                    return 'redirected', new_url, code
                else:
                    return 'redirected', new_url, 0
Example 17
        def check_uri() -> Tuple[str, str, int]:
            # split off anchor
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
                for rex in self.anchors_ignore:
                    if rex.match(anchor):
                        anchor = None
                        break
            else:
                req_url = uri
                anchor = None

            # handle non-ASCII URIs
            try:
                req_url.encode('ascii')
            except UnicodeError:
                req_url = encode_uri(req_url)

            # Get auth info, if any
            for pattern, auth_info in self.auth:
                if pattern.match(uri):
                    break
            else:
                auth_info = None

            # update request headers for the URL
            kwargs['headers'] = get_request_headers()

            try:
                if anchor and self.app.config.linkcheck_anchors:
                    # Read the whole document and see if #anchor exists
                    response = requests.get(req_url,
                                            stream=True,
                                            config=self.app.config,
                                            auth=auth_info,
                                            **kwargs)
                    response.raise_for_status()
                    found = check_anchor(response, unquote(anchor))

                    if not found:
                        raise Exception(__("Anchor '%s' not found") % anchor)
                else:
                    try:
                        # try a HEAD request first, which should be easier on
                        # the server and the network
                        response = requests.head(req_url,
                                                 allow_redirects=True,
                                                 config=self.app.config,
                                                 auth=auth_info,
                                                 **kwargs)
                        response.raise_for_status()
                    except HTTPError:
                        # retry with GET request if that fails, some servers
                        # don't like HEAD requests.
                        response = requests.get(req_url,
                                                stream=True,
                                                config=self.app.config,
                                                auth=auth_info,
                                                **kwargs)
                        response.raise_for_status()
            except HTTPError as err:
                if err.response.status_code == 401:
                    # We'll take "Unauthorized" as working.
                    return 'working', ' - unauthorized', 0
                elif err.response.status_code == 503:
                    # We'll take "Service Unavailable" as ignored.
                    return 'ignored', str(err), 0
                else:
                    return 'broken', str(err), 0
            except Exception as err:
                return 'broken', str(err), 0
            if response.url.rstrip('/') == req_url.rstrip('/'):
                return 'working', '', 0
            else:
                new_url = response.url
                if anchor:
                    new_url += '#' + anchor
                # history contains any redirects, get last
                if response.history:
                    code = response.history[-1].status_code
                    return 'redirected', new_url, code
                else:
                    return 'redirected', new_url, 0
Example 18
    def getImgByUrl(self, url):
        response = requests.get(url, stream=True)
        # Accessing .content reads the whole body, so stream=True defers
        # nothing here; the bytes are wrapped for file-like consumers.
        return io.BytesIO(response.content)
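
The returned BytesIO can feed any consumer that expects a file object. For instance, assuming Pillow is installed (downloader stands for an instance of the class above, and the URL is a placeholder):

from PIL import Image

buf = downloader.getImgByUrl('https://example.com/picture.png')
img = Image.open(buf)   # Pillow reads straight from the in-memory buffer
print(img.size)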