def main():
    mh = MysqlHelper('47.110.88.64', 'root', 'admin963', 'zhizhudashi', 'utf8')
    url_list = [1, 342, 1679, 344, 11]  # Baidu top-buzz board ids (the 'b' query parameter)
    for i in url_list:
        url = 'http://top.baidu.com/buzz?b=' + str(i) + '&c=513&fr=topbuzz_b42_c513'
        try:
            response = requests.get(url, headers=headers)
            response.encoding = 'gb2312'
            html = response.text
            doc = pq(html)
            new_names = doc(".list-title")
            for name in new_names.items():
                string = name.text()
                select_sql = 'select * from keywords where title = "%s"' % string
                id = mh.find_id(select_sql)
                if id:  # already stored, skip
                    continue
                insert_sql = "insert into keywords (title,object_id) values ('%s',%d)" % (string, 1)
                mh.cud(insert_sql)
            # f = open('百度热词.txt', 'a')
            # for name in new_names.items():
            #     string = name.text()
            #     f.write('\n' + str(string))
            # f.close()
        except Exception as e:
            print(e)
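# Hedged aside on main() above: the %-formatted SQL is injection-prone when a
# hot word contains quotes. MysqlHelper's internals are not shown here, so as
# an assumption this sketch uses a plain DB-API driver (pymysql) to express
# the same check-then-insert with parameter binding instead:
import pymysql

def store_keyword(word):
    conn = pymysql.connect(host='47.110.88.64', user='root', password='admin963',
                           database='zhizhudashi', charset='utf8')
    try:
        with conn.cursor() as cur:
            cur.execute('select id from keywords where title = %s', (word,))
            if cur.fetchone() is None:  # not stored yet
                cur.execute('insert into keywords (title, object_id) values (%s, %s)',
                            (word, 1))
                conn.commit()
    finally:
        conn.close()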
def returnSohuURLs(self, index, number, dataList):
    '''Batch-extract Sohu news URLs.

    Process: simulate the backend request to get JSON data, parse out each
    article's id, then join the article page prefix with the article id to
    get the detail-page address.'''
    extends = '&sceneId=1460&page=1&size=20'
    rang = number.split('-')
    start = int(rang[0])
    end = int(rang[1])
    pageSize = end - start
    if pageSize <= 0:
        print('The count configured for url suffix ' + index + ' is invalid ==> ' + rang[0] + ':' + rang[1])
        return []
    parameter = '&sceneId=' + index
    page = int(end / pageSize)
    parameter = parameter + '&page=' + str(page) + '&size=' + str(pageSize)
    # response = requests.get(dataList[0] + parameter)
    # Cookie jar to attach cookie information to the request
    cookies = requests.cookies.RequestsCookieJar()
    # Send the request with headers and cookies attached
    response = requests.get(dataList[0] + parameter, headers=getHeader(), cookies=cookies)
    content = response.text
    # Parse the response text into JSON for easier handling
    data = json.loads(content)
    urls = []
    for temp in data:
        url = dataList[1] + str(temp['id']) + '_' + str(temp['authorId'])
        urls.append(url)
    # print(urls)
    return urls
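# A minimal sketch of the id-to-URL join returnSohuURLs performs; the item
# below is a placeholder shaped like one entry of the Sohu feed JSON, and the
# prefix stands in for dataList[1]:
item = {'id': 405553412, 'authorId': 324942}   # hypothetical feed entry
prefix = 'http://www.sohu.com/a/'              # hypothetical detail-page prefix
detail_url = prefix + str(item['id']) + '_' + str(item['authorId'])
# detail_url == 'http://www.sohu.com/a/405553412_324942'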
def _read_from_url(url: str, config: Config = None) -> IO:
    """Reads data from *url* with an HTTP *GET*.

    This function supports fetching from resources which use basic HTTP auth as
    laid out by RFC1738 § 3.1. See § 5 for grammar definitions for URLs.

    .. seealso:: https://www.ietf.org/rfc/rfc1738.txt

    :param url: URL of an HTTP resource
    :type url: ``str``

    :return: data read from resource described by *url*
    :rtype: ``file``-like object
    """
    r = requests.get(url, stream=True, config=config, timeout=config.intersphinx_timeout)
    r.raise_for_status()
    r.raw.url = r.url
    # decode content-body based on the header.
    # ref: https://github.com/kennethreitz/requests/issues/2155
    r.raw.read = functools.partial(r.raw.read, decode_content=True)
    return r.raw
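# The RFC1738 § 3.1 form the docstring refers to, shown with plain `requests`
# (the function above goes through sphinx's requests wrapper, which adds the
# config/timeout handling); host, path and credentials are placeholders:
import requests

r = requests.get('http://user:secret@example.com/objects.inv', stream=True, timeout=5)
r.raise_for_status()
body = r.raw  # the same file-like object _read_from_url returns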
def handle(self, node: nodes.image) -> None:
    try:
        basename = os.path.basename(node['uri'])
        if '?' in basename:
            basename = basename.split('?')[0]
        if basename == '' or len(basename) > MAX_FILENAME_LEN:
            filename, ext = os.path.splitext(node['uri'])
            basename = sha1(filename.encode()).hexdigest() + ext
        basename = re.sub(CRITICAL_PATH_CHAR_RE, "_", basename)

        dirname = node['uri'].replace('://', '/').translate({ord("?"): "/",
                                                             ord("&"): "/"})
        if len(dirname) > MAX_FILENAME_LEN:
            dirname = sha1(dirname.encode()).hexdigest()
        ensuredir(os.path.join(self.imagedir, dirname))
        path = os.path.join(self.imagedir, dirname, basename)

        headers = {}
        if os.path.exists(path):
            timestamp = ceil(os.stat(path).st_mtime)  # type: float
            headers['If-Modified-Since'] = epoch_to_rfc1123(timestamp)

        r = requests.get(node['uri'], headers=headers)
        if r.status_code >= 400:
            logger.warning(__('Could not fetch remote image: %s [%d]') %
                           (node['uri'], r.status_code))
        else:
            self.app.env.original_image_uri[path] = node['uri']

            if r.status_code == 200:
                with open(path, 'wb') as f:
                    f.write(r.content)

            last_modified = r.headers.get('last-modified')
            if last_modified:
                timestamp = rfc1123_to_epoch(last_modified)
                os.utime(path, (timestamp, timestamp))

            mimetype = guess_mimetype(path, default='*')
            if mimetype != '*' and os.path.splitext(basename)[1] == '':
                # append a suffix if URI does not contain suffix
                ext = get_image_extension(mimetype)
                newpath = os.path.join(self.imagedir, dirname, basename + ext)
                os.replace(path, newpath)
                self.app.env.original_image_uri.pop(path)
                self.app.env.original_image_uri[newpath] = node['uri']
                path = newpath
            node['candidates'].pop('?')
            node['candidates'][mimetype] = path
            node['uri'] = path
            self.app.env.images.add_file(self.env.docname, path)
    except Exception as exc:
        logger.warning(__('Could not fetch remote image: %s [%s]') % (node['uri'], exc))
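# The caching idea handle() relies on, in isolation: send If-Modified-Since
# built from the cached file's mtime and skip the write when the server
# answers 304. This sketch uses email.utils instead of sphinx's
# epoch_to_rfc1123/rfc1123_to_epoch helpers:
import os
import requests
from email.utils import formatdate

def fetch_if_newer(url, path):
    headers = {}
    if os.path.exists(path):
        # RFC 1123 date built from the cached copy's modification time
        headers['If-Modified-Since'] = formatdate(os.stat(path).st_mtime, usegmt=True)
    r = requests.get(url, headers=headers)
    if r.status_code == 200:  # changed (or no cache yet): rewrite the file
        with open(path, 'wb') as f:
            f.write(r.content)
    return path               # on 304 the cached file is kept as-is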
def getgooddetail(self, response):
    # Build a goods item for this detail page
    goods = goodsItem()
    goods['goodsName'] = response.xpath('//div[@class="sku-name"]/text()')[0].root
    goods['goodsUrl'] = response.url
    goods['goodsId'] = response.url.split('/')[-1].split('.')[0]
    # Category passed along from the list page
    # goods['goodsClassify'] = response.meta['classify']
    p_Url = ('https://c0.3.cn/stock?skuId=%s&cat=1320,1585,10975&venderId=100000'
             '8814&area=1_72_4137_0&buyNum=1&choseSuitSkuIds=&extraParam={"originid":"1"}&ch=1&f'
             'qsp=0&pduid=775011473') % goods['goodsId']
    # p_Url = self.price_url.format(goods['goodsId'])
    resp = requests.get(p_Url)
    resp = resp.json()
    goods['goodsPrice'] = resp['stock']['jdPrice']['p']
    p_Url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + goods['goodsId']
    com_resp = requests.get(p_Url)
    com_resp = com_resp.json()
    goods['goodsCommentCount'] = com_resp['CommentsCount'][0]['CommentCountStr']
    goods['hp'] = com_resp['CommentsCount'][0]['GoodCountStr']     # positive reviews
    goods['zp'] = com_resp['CommentsCount'][0]['GeneralCountStr']  # neutral reviews
    goods['cp'] = com_resp['CommentsCount'][0]['PoorCountStr']     # negative reviews
    yield goods
def handle(self, node):
    # type: (nodes.Node) -> None
    try:
        basename = os.path.basename(node['uri'])
        if '?' in basename:
            basename = basename.split('?')[0]
        if basename == '' or len(basename) > MAX_FILENAME_LEN:
            filename, ext = os.path.splitext(node['uri'])
            basename = sha1(filename.encode("utf-8")).hexdigest() + ext

        dirname = node['uri'].replace('://', '/').translate({ord("?"): u"/",
                                                             ord("&"): u"/"})
        if len(dirname) > MAX_FILENAME_LEN:
            dirname = sha1(dirname.encode('utf-8')).hexdigest()
        ensuredir(os.path.join(self.imagedir, dirname))
        path = os.path.join(self.imagedir, dirname, basename)

        headers = {}
        if os.path.exists(path):
            timestamp = ceil(os.stat(path).st_mtime)  # type: float
            headers['If-Modified-Since'] = epoch_to_rfc1123(timestamp)

        r = requests.get(node['uri'], headers=headers)
        if r.status_code >= 400:
            logger.warning(__('Could not fetch remote image: %s [%d]') %
                           (node['uri'], r.status_code))
        else:
            self.app.env.original_image_uri[path] = node['uri']

            if r.status_code == 200:
                with open(path, 'wb') as f:
                    f.write(r.content)

            last_modified = r.headers.get('last-modified')
            if last_modified:
                timestamp = rfc1123_to_epoch(last_modified)
                os.utime(path, (timestamp, timestamp))

            mimetype = guess_mimetype(path, default='*')
            if mimetype != '*' and os.path.splitext(basename)[1] == '':
                # append a suffix if URI does not contain suffix
                ext = get_image_extension(mimetype)
                newpath = os.path.join(self.imagedir, dirname, basename + ext)
                movefile(path, newpath)
                self.app.env.original_image_uri.pop(path)
                self.app.env.original_image_uri[newpath] = node['uri']
                path = newpath
            node['candidates'].pop('?')
            node['candidates'][mimetype] = path
            node['uri'] = path
            self.app.env.images.add_file(self.env.docname, path)
    except Exception as exc:
        logger.warning(__('Could not fetch remote image: %s [%s]') %
                       (node['uri'], text_type(exc)))
def handle(self, node):
    # type: (nodes.Node) -> None
    basename = os.path.basename(node['uri'])
    if '?' in basename:
        basename = basename.split('?')[0]
    dirname = node['uri'].replace('://', '/').translate({ord("?"): u"/",
                                                         ord("&"): u"/"})
    ensuredir(os.path.join(self.imagedir, dirname))
    path = os.path.join(self.imagedir, dirname, basename)

    try:
        headers = {}
        if os.path.exists(path):
            timestamp = ceil(os.stat(path).st_mtime)
            headers['If-Modified-Since'] = epoch_to_rfc1123(timestamp)

        r = requests.get(node['uri'], headers=headers)
        if r.status_code >= 400:
            logger.warning('Could not fetch remote image: %s [%d]' %
                           (node['uri'], r.status_code))
        else:
            self.app.env.original_image_uri[path] = node['uri']

            if r.status_code == 200:
                with open(path, 'wb') as f:
                    f.write(r.content)

            last_modified = r.headers.get('last-modified')
            if last_modified:
                timestamp = rfc1123_to_epoch(last_modified)
                os.utime(path, (timestamp, timestamp))

            mimetype = guess_mimetype(path, default='*')
            node['candidates'].pop('?')
            node['candidates'][mimetype] = path
            node['uri'] = path
            self.app.env.images.add_file(self.env.docname, path)
    except Exception as exc:
        logger.warning('Could not fetch remote image: %s [%s]' %
                       (node['uri'], text_type(exc)))
def _read_from_url(url, config=None):
    # type: (unicode, Config) -> IO
    """Reads data from *url* with an HTTP *GET*.

    This function supports fetching from resources which use basic HTTP auth as
    laid out by RFC1738 § 3.1. See § 5 for grammar definitions for URLs.

    .. seealso:: https://www.ietf.org/rfc/rfc1738.txt

    :param url: URL of an HTTP resource
    :type url: ``str``

    :return: data read from resource described by *url*
    :rtype: ``file``-like object
    """
    r = requests.get(url, stream=True, config=config, timeout=config.intersphinx_timeout)
    r.raise_for_status()
    r.raw.url = r.url
    return r.raw
def returnQQURLs(self, number, req):
    '''Batch-extract Tencent (QQ) news URLs. Only plain news items are
    extracted; special-topic pages are skipped.

    Process: each backend request carries the ids obtained from the
    previous request as a parameter.'''
    # The QQ backend returns 10 news items per request
    pageSize = 10
    rang = number.split('-')
    start = int(rang[0])
    end = int(rang[1])
    pageCount = int((end - start) / pageSize)
    urlList = []
    expIdsList = []
    # Cookie jar to attach cookie information to the request
    cookies = requests.cookies.RequestsCookieJar()
    for i in range(pageCount):
        if i == 0:
            url2 = req + '&page=' + str(i) + '&expIds='
        else:
            url2 = req + '&page=' + str(i) + '&expIds=' + '|'.join(str(id) for id in expIdsList)
        expIdsList.clear()
        # response = requests.get(url2)
        # Send the request with headers and cookies attached
        response = requests.get(url2, headers=getHeader(), cookies=cookies)
        # Parse the response into JSON
        conten = json.loads(response.text)
        dataList = conten.get('data')
        for temp in dataList:
            # article_type == 11 marks a special-topic entry, not a news article
            if temp['article_type'] != 11:
                url = temp['vurl']
                id = temp['id']
                urlList.append(url)
                expIdsList.append(id)
            # else: print('topic link: ' + temp['vurl'])
    # print('QQ news url count: %d' % len(urlList))
    # print(urlList)
    return urlList
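# Minimal sketch of the expIds carry-over returnQQURLs implements: the ids
# collected from one page are joined with '|' and sent along with the next
# request; the endpoint and ids below are placeholders:
prev_ids = [20180101001, 20180101002]              # hypothetical article ids
req = 'https://pacaio.match.qq.com/irs/rcd?cid=4'  # hypothetical endpoint
next_url = req + '&page=1&expIds=' + '|'.join(str(i) for i in prev_ids)
# next_url == 'https://pacaio.match.qq.com/irs/rcd?cid=4&page=1&expIds=20180101001|20180101002'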
def returnTouTiaoURLs(self, index, number, req):
    '''Batch-extract Toutiao (今日头条) news URLs.

    Process: each backend request carries the max_behot_time value from
    the previous response as a paging parameter.'''
    max_behot_time = max_behot_time_tmp = 0
    category = index
    rang = number.split('-')
    start = int(rang[0])
    end = int(rang[1])
    pageSize = 10
    pageCount = int((end - start) / pageSize)
    urlList = []
    # Cookie jar to attach cookie information to the request
    cookies = requests.cookies.RequestsCookieJar()
    for i in range(pageCount):
        reqUrl = req + '&category=%s&max_behot_time=%d&max_behot_time_tmp=%d' % (
            category, max_behot_time, max_behot_time_tmp)
        print(reqUrl)
        # Send the request with headers and cookies attached
        response = requests.get(reqUrl, headers=getHeader(), cookies=cookies)
        cookies.update(response.cookies)
        content = json.loads(response.text)
        dataList = content['data']
        for data in dataList:
            if data['article_genre'] == 'article':
                itemId = data['item_id']
                # The returned JSON does not contain the full detail-page
                # address; join the item id onto the detail-page prefix.
                newsUrl = 'https://www.toutiao.com/a%s' % itemId
                urlList.append(newsUrl)
        # print('urlList length: %d' % len(urlList))
        nextDic = content['next']
        nextId = nextDic['max_behot_time']
        max_behot_time = max_behot_time_tmp = nextId
    # print(urlList)
    return urlList
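# The cursor-style paging returnTouTiaoURLs implements, reduced to its core:
# each response carries next.max_behot_time, which seeds the following
# request. base_url and headers are placeholders:
import requests

def walk_feed(base_url, headers, pages=3):
    cursor = 0
    for _ in range(pages):
        resp = requests.get('%s&max_behot_time=%d' % (base_url, cursor), headers=headers)
        payload = resp.json()
        yield payload.get('data', [])
        # feed the server-provided cursor into the next request
        cursor = payload.get('next', {}).get('max_behot_time', 0)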
def _read_from_url(url, config=None):
    # type: (unicode, Config) -> IO
    """Reads data from *url* with an HTTP *GET*.

    This function supports fetching from resources which use basic HTTP auth as
    laid out by RFC1738 § 3.1. See § 5 for grammar definitions for URLs.

    .. seealso:: https://www.ietf.org/rfc/rfc1738.txt

    :param url: URL of an HTTP resource
    :type url: ``str``

    :return: data read from resource described by *url*
    :rtype: ``file``-like object
    """
    r = requests.get(url, stream=True, config=config, timeout=config.intersphinx_timeout)
    r.raise_for_status()
    r.raw.url = r.url
    # decode content-body based on the header.
    # ref: https://github.com/kennethreitz/requests/issues/2155
    r.raw.read = functools.partial(r.raw.read, decode_content=True)
    return r.raw
def check_uri() -> Tuple[str, str, int]:
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
        for rex in self.anchors_ignore:
            if rex.match(anchor):
                anchor = None
                break
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    # Get auth info, if any
    for pattern, auth_info in self.auth:
        if pattern.match(uri):
            break
    else:
        auth_info = None

    # update request headers for the URL
    kwargs['headers'] = get_request_headers()

    try:
        if anchor and self.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            response = requests.get(req_url, stream=True, config=self.config,
                                    auth=auth_info, **kwargs)
            response.raise_for_status()
            found = check_anchor(response, unquote(anchor))

            if not found:
                raise Exception(__("Anchor '%s' not found") % anchor)
        else:
            try:
                # try a HEAD request first, which should be easier on
                # the server and the network
                response = requests.head(req_url, allow_redirects=True,
                                         config=self.config, auth=auth_info,
                                         **kwargs)
                response.raise_for_status()
            # Servers drop the connection on HEAD requests, causing
            # ConnectionError.
            except (ConnectionError, HTTPError, TooManyRedirects) as err:
                if isinstance(err, HTTPError) and err.response.status_code == 429:
                    raise
                # retry with GET request if that fails, some servers
                # don't like HEAD requests.
                response = requests.get(req_url, stream=True, config=self.config,
                                        auth=auth_info, **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        if err.response.status_code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        elif err.response.status_code == 429:
            next_check = self.limit_rate(err.response)
            if next_check is not None:
                self.wqueue.put(CheckRequest(next_check, hyperlink), False)
                return 'rate-limited', '', 0
            return 'broken', str(err), 0
        elif err.response.status_code == 503:
            # We'll take "Service Unavailable" as ignored.
            return 'ignored', str(err), 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0
    else:
        netloc = urlparse(req_url).netloc
        try:
            del self.rate_limits[netloc]
        except KeyError:
            pass
    if response.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0
    else:
        new_url = response.url
        if anchor:
            new_url += '#' + anchor

        if allowed_redirect(req_url, new_url):
            return 'working', '', 0
        elif response.history:
            # history contains any redirects, get last
            code = response.history[-1].status_code
            return 'redirected', new_url, code
        else:
            return 'redirected', new_url, 0
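# The HEAD-then-GET fallback at the heart of check_uri(), reduced to plain
# `requests` (sphinx's wrapper layers config, auth and rate limiting on top);
# simplified in that every HEAD failure triggers the GET retry:
import requests
from requests.exceptions import ConnectionError, HTTPError, TooManyRedirects

def probe(url, **kwargs):
    try:
        # HEAD first: cheaper for the server and the network
        r = requests.head(url, allow_redirects=True, **kwargs)
        r.raise_for_status()
    except (ConnectionError, HTTPError, TooManyRedirects):
        # some servers reject or drop HEAD; retry with a streamed GET
        r = requests.get(url, stream=True, **kwargs)
        r.raise_for_status()
    return r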
def check_uri():
    # type: () -> Tuple[unicode, unicode, int]
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
        for rex in self.anchors_ignore:
            if rex.match(anchor):
                anchor = None
                break
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    try:
        if anchor and self.app.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            response = requests.get(req_url, stream=True, config=self.app.config,
                                    **kwargs)
            found = check_anchor(response, unquote(anchor))

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request first, which should be easier on
                # the server and the network
                response = requests.head(req_url, config=self.app.config, **kwargs)
                response.raise_for_status()
            except HTTPError:
                # retry with GET request if that fails, some servers
                # don't like HEAD requests.
                response = requests.get(req_url, stream=True, config=self.app.config,
                                        **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        if err.response.status_code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        if is_ssl_error(err):
            return 'ignored', str(err), 0
        else:
            return 'broken', str(err), 0
    if response.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0
    else:
        new_url = response.url
        if anchor:
            new_url += '#' + anchor
        # history contains any redirects, get last
        if response.history:
            code = response.history[-1].status_code
            return 'redirected', new_url, code
        else:
            return 'redirected', new_url, 0
def check_uri() -> Tuple[str, str, int]:
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
        for rex in self.anchors_ignore:
            if rex.match(anchor):
                anchor = None
                break
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    # Get auth info, if any
    for pattern, auth_info in self.auth:
        if pattern.match(uri):
            break
    else:
        auth_info = None

    # update request headers for the URL
    kwargs['headers'] = get_request_headers()

    try:
        if anchor and self.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            response = requests.get(req_url, stream=True, config=self.config,
                                    auth=auth_info, **kwargs)
            response.raise_for_status()

            anchor_str = unquote(anchor)
            # Hack (?): https://github.com/container-storage-interface/spec/blob/master/spec.md#getplugininfo
            # is a valid anchor, but the actual id of the anchor is
            # user-content-getplugininfo, which causes the anchor check to fail:
            #   <a id="user-content-getplugininfo" class="anchor" aria-hidden="true" href="#getplugininfo">
            #
            # Might have to be fixed in AnchorCheckParser instead?
            if req_url.startswith('https://github.com/'):
                anchor_str = "user-content-" + anchor_str

            found = check_anchor(response, anchor_str)
            if not found:
                raise Exception(__("Anchor '%s' not found") % anchor)
        else:
            try:
                # try a HEAD request first, which should be easier on
                # the server and the network
                response = requests.head(req_url, allow_redirects=True,
                                         config=self.config, auth=auth_info,
                                         **kwargs)
                response.raise_for_status()
            except (HTTPError, TooManyRedirects) as err:
                if isinstance(err, HTTPError) and err.response.status_code == 429:
                    raise
                # retry with GET request if that fails, some servers
                # don't like HEAD requests.
                response = requests.get(req_url, stream=True, config=self.config,
                                        auth=auth_info, **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        if err.response.status_code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        elif err.response.status_code == 429:
            next_check = self.limit_rate(err.response)
            if next_check is not None:
                self.wqueue.put(CheckRequest(next_check, hyperlink), False)
                return 'rate-limited', '', 0
            return 'broken', str(err), 0
        elif err.response.status_code == 503:
            # We'll take "Service Unavailable" as ignored.
            return 'ignored', str(err), 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0
    else:
        netloc = urlparse(req_url).netloc
        try:
            del self.rate_limits[netloc]
        except KeyError:
            pass
    if response.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0
    else:
        new_url = response.url
        if anchor:
            new_url += '#' + anchor
        # history contains any redirects, get last
        if response.history:
            code = response.history[-1].status_code
            return 'redirected', new_url, code
        else:
            return 'redirected', new_url, 0
def check_uri() -> Tuple[str, str, int]:
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
        for rex in self.anchors_ignore:
            if rex.match(anchor):
                anchor = None
                break
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    # Get auth info, if any
    for pattern, auth_info in self.auth:
        if pattern.match(uri):
            break
    else:
        auth_info = None

    # update request headers for the URL
    kwargs['headers'] = get_request_headers()

    try:
        if anchor and self.app.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            response = requests.get(req_url, stream=True, config=self.app.config,
                                    auth=auth_info, **kwargs)
            response.raise_for_status()
            found = check_anchor(response, unquote(anchor))

            if not found:
                raise Exception(__("Anchor '%s' not found") % anchor)
        else:
            try:
                # try a HEAD request first, which should be easier on
                # the server and the network
                response = requests.head(req_url, allow_redirects=True,
                                         config=self.app.config, auth=auth_info,
                                         **kwargs)
                response.raise_for_status()
            except HTTPError:
                # retry with GET request if that fails, some servers
                # don't like HEAD requests.
                response = requests.get(req_url, stream=True,
                                        config=self.app.config, auth=auth_info,
                                        **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        if err.response.status_code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        elif err.response.status_code == 503:
            # We'll take "Service Unavailable" as ignored.
            return 'ignored', str(err), 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0
    if response.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0
    else:
        new_url = response.url
        if anchor:
            new_url += '#' + anchor
        # history contains any redirects, get last
        if response.history:
            code = response.history[-1].status_code
            return 'redirected', new_url, code
        else:
            return 'redirected', new_url, 0
def getImgByUrl(self, url):
    # Download the image and wrap the raw bytes in an in-memory buffer
    response = requests.get(url, stream=True)
    return io.BytesIO(response.content)
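# Typical consumption of getImgByUrl(): the returned BytesIO is file-like, so
# it can be handed straight to a reader such as Pillow (Pillow here is an
# assumption; any consumer of file-like objects works):
from PIL import Image

buf = spider.getImgByUrl('https://example.com/pic.jpg')  # `spider` is a hypothetical instance
img = Image.open(buf)
print(img.size)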