def __init__(self, status, headers=None, content=None, message=None, request=None):
    self.status = status
    self.message = message
    self.redirects = list()
    # Avoid a shared mutable default: build an empty Headers object per instance
    if headers is None:
        headers = Headers()
    elif not isinstance(headers, Headers):
        headers = Headers(headers)
    self._headers = headers
    # Expose selected Headers accessors directly on the class
    methods_from_headers = [
        'last_modified', 'date', 'expires', 'content_length',
        'content_is_text', 'content_is_xml', 'content_is_xhtml'
    ]
    for m in methods_from_headers:
        setattr(self.__class__, m, getattr(headers, m))
    self._content = content
    self._request = request
def __init__(self, status, headers=None, content=None, message=None, request=None):
    """
    Construct a new ``Response`` object.

    :param status: HTTP status code for the response
    :type status: integer
    :param headers: HTTP headers
    :type headers: a list of tuples or a :class:`Headers` object
    :param content: content
    :param message: HTTP message for the response
    :param request: originating Request object used
    :type request: :class:`Request`

    .. attribute:: redirects

        List of redirections
    """
    self._status = status
    self.message = message
    self.redirects = list()
    # Avoid a shared mutable default: build an empty Headers object per instance
    if headers is None:
        headers = Headers()
    elif not isinstance(headers, Headers):
        headers = Headers(headers)
    self._headers = headers
    self._content = content
    self._request = request
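# A minimal usage sketch for the Response constructor above, assuming the
# surrounding Response class and a Headers implementation that accepts a list
# of (name, value) tuples; the status code, header names, and body below are
# illustrative values, not taken from the original project.
example_response = Response(
    status=200,
    headers=[("Content-Type", "text/plain"), ("Content-Length", "2")],
    content=b"ok",
    message="OK",
)
assert example_response.message == "OK"
assert example_response.redirects == []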
def __init__(self, method, url, headers=None, content=None):
    """
    Construct a new ``Request`` object.

    :param method: HTTP method
    :type method: string
    :param url: URL of the request
    :type url: string or :class:`Url`
    :param headers: Headers for the request
    :type headers: list of tuples or :class:`Headers`
    :param content: body
    """
    self.method = method
    self.content = content

    if not isinstance(url, Url):
        url = Url(url)
    self.url = url

    if headers is None:
        headers = Headers()
    elif not isinstance(headers, Headers):
        headers = Headers(headers)
    self._headers = headers
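# A minimal usage sketch for the Request constructor above, assuming the
# surrounding Request class plus the project's Url and Headers helpers; the
# method, URL, and header values are illustrative only.
example_request = Request(
    method="GET",
    url="http://example.com/index.html",
    headers=[("Accept", "text/html"), ("User-Agent", "example-client")],
)
assert example_request.method == "GET"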
def from_socket(cls, sock: socket.socket) -> "Request":
    """Read and parse the request from a socket object.

    Raises:
      ValueError: When the request cannot be parsed.
    """
    lines = iter_lines(sock)

    try:
        request_line = next(lines).decode("ascii")
    except StopIteration:
        raise ValueError("Request line missing")

    try:
        method, path, __ = request_line.split(" ")
    except ValueError:
        raise ValueError(f"Malformed request line {request_line!r}.")

    headers = Headers()
    while True:
        try:
            line = next(lines)
        except StopIteration as e:
            # The generator's return value is the leftover buffered data.
            buff = e.value
            break

        try:
            name, __, value = line.decode("ascii").partition(":")
            headers.add(name.lower(), value.lstrip())
        except ValueError:
            raise ValueError(f"Malformed header line {line!r}.")

    body = BodyReader(sock, buff=buff)
    return cls(method=method.upper(), path=path, headers=headers, body=body)
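# A hedged usage sketch for from_socket above: it assumes the surrounding
# Request class together with the project's iter_lines, Headers, and
# BodyReader helpers, and binds to an illustrative local address.
import socket

listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
listener.bind(("127.0.0.1", 8080))
listener.listen(1)
client_sock, _addr = listener.accept()
try:
    parsed = Request.from_socket(client_sock)
    print(parsed.method, parsed.path)
except ValueError as exc:
    # Malformed request line or header line.
    print("Bad request:", exc)
finally:
    client_sock.close()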
def Topics_Data(type, key):
    # Build the topics request URL
    topic_base_url = 'https://www.zhihu.com/topic'
    topics_url = topic_base_url + 's'
    # Request the topics page
    topics_data = requests.get(topics_url, headers=Headers('zhihu'))
    # Extract the parent-topic page text
    topics_text = topics_data.text
    # Extract parent-topic names and IDs with regular expressions
    re_topic_id = r'data-id="(.*?)"'
    re_topic_name = r'<li.*?">.*?">(.*?)<'
    topics_id = re.findall(re_topic_id, topics_text)
    topics_name = re.findall(re_topic_name, topics_text)
    # Return parent-topic names, IDs, or an ID-to-name dict, as requested
    if type == 'name':
        return topics_name
    elif type == 'id':
        return topics_id
    elif type == dict:
        topicsDict = dict(zip(topics_id, topics_name))
        return topicsDict
def __init__(self):
    """Instantiate the Coordinator object.
    Automatically creates & links the required modules."""
    if CONFIG['settings']['debug']:
        print("coordinator.__init__: starting instantiation")

    self.overall_config = CONFIG
    self.settings = copy.deepcopy(CONFIG['settings'])
    self.contents = copy.deepcopy(CONFIG['contents'])

    self.callbacks = {}
    self.register_callbacks()

    self.active_guis = {}
    self.metrics = {
        'sending-rate': None,
        'sending-time': None,
        'remaining': None,
        'sent': None,
        'etc': None,
        'etr': None,
        'no-active-connections': None,
    }

    self.email = Email(self, None)
    self.sender = EmailSendHandler(self)
    self.gui = EmailGUI(self)
    self.headers = Headers(self, self.email)
    self.email.headers = self.headers

    self.last_exc = None
    self.ready_to_send = True

    if self.settings['debug']:
        print("coordinator.__init__: instantiation complete")
def __init__(self, agent=None, timeout=10, keep_alive=1, default_headers=None, max_redirect=7):
    # Honor the timeout argument instead of hard-coding a value
    self.timeout = timeout
    self.max_redirect = max_redirect
    self._handlers = Handlers()

    if agent is None:
        self.agent = 'python-fluffyhttp'
    else:
        self.agent = agent

    # Avoid a mutable default argument; fall back to keep-alive connections
    if not default_headers:
        default_headers = {
            'Connection': 'keep-alive',
        }
    if 'User-Agent' not in default_headers:
        default_headers['User-Agent'] = self.agent

    self._default_headers = Headers(default_headers)
    self._poolmanager = PoolManager(maxsize=keep_alive)
def __init__(self, method, url, headers=None, content=None):
    # XXX no content on GET / DELETE ?
    self.method = method
    self.content = content

    if not isinstance(url, Url):
        url = Url(url)
    self.url = url

    # Avoid a shared mutable default: build an empty Headers object per instance
    if headers is None:
        headers = Headers()
    elif not isinstance(headers, Headers):
        headers = Headers(headers)
    self._headers = headers

    # Expose selected Headers accessors directly on the class
    methods_from_headers = ['if_modified_since', 'if_unmodified_since']
    for m in methods_from_headers:
        setattr(self.__class__, m, getattr(headers, m))
def Questions_ID(word, topicName):
    topic_base_url = 'https://www.zhihu.com/topic'
    topic_url = []
    question_id = []
    for i in range(1):
        # I_want() supplies the child-topic IDs
        topic_url.append(topic_base_url + '/' + I_want(word, topicName)[i])
        get_questiontitle_data = requests.get(topic_url[i], headers=Headers('zhihu'))
        questiontitle_data_text = get_questiontitle_data.text
        re_question_id = r'content="https://www.zhihu.com/question/(.*?)"'
        # re_question_name = r'content="(.*?)"'
        question_id_list = re.findall(re_question_id, questiontitle_data_text)
        # question_name_list = re.findall(re_question_name, questiontitle_data_text)
        # print(question_name_list)
    # De-duplicate: skip IDs that actually point at answer links
    for i in range(len(question_id_list)):
        if 'answer' in question_id_list[i]:
            pass
        else:
            question_id.append(question_id_list[i])
    return question_id
def __init__(self, request):
    self.request = request
    self.code = 200
    self.code_text = self.CODES[self.code]
    self.headers = Headers()
    if self.has_body():
        self.body = ""
def getDetails(self, details):
    ret = []
    if len(details) > 0:
        detail = Config.url_main + details[0]
        headers = Headers().getHeaders()
        req = requests.get(detail, headers=headers)
        selector = etree.HTML(req.content)
        js = selector.xpath('//script/text()')
        if len(js) > 0:
            urls = js[0].split('"')
            if len(urls) > 0:
                ret.append(urls[1])
    return ret
def run(self):
    while True:
        print(self.getName())
        if self._queue.empty():
            break
        ret_json = self._queue.get()
        ret = simplejson.loads(ret_json)
        id = ret[0]
        category_id = ret[1]
        content_url = ret[2]
        headers = Headers().getHeaders()
        print(content_url)
        img = ''  # ensure img exists even if the request below fails
        try:
            req = requests.get(content_url, headers=headers, timeout=60)
            if req.status_code == 200:
                html = req.content
                selector = etree.HTML(html)
                titles = selector.xpath('//*[@class="fl"]/a[1]/text()')
                nicks = selector.xpath('//*[@class="fl"]/span[1]/a[1]/text()')
                play_urls = selector.xpath('//*[@class="redBtn"]/a[1]/@href')
                contents = []
                contents_li = selector.xpath('//*[@class="movStaff line_BSld"]/li')
                for c_li in contents_li:
                    c_temp = c_li.xpath('descendant::text()')
                    if len(c_temp) > 0:
                        contents.append(c_temp)
                imgs = selector.xpath('//*[@class="imgBAyy db"]/descendant::img[1]/@src')
                title = play_url = content = img = nick = ''
                if len(titles) > 0:
                    title = titles[0]
                if len(nicks) > 0:
                    nick = nicks[0]
                    title = title + ',' + nick
                if len(play_urls) > 0:
                    play_url = play_urls[0]
                if len(imgs) > 0:
                    img = imgs[0]
                content = simplejson.dumps(contents)
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                if img != '':
                    # Save the scraped record into the content table
                    sql = 'insert ignore into m1905_content (`category_id`,`title`,`content`,`play_url`,`img`,`url`,`created_at`) VALUES (%s,%s,%s,%s,%s,%s,%s)'
                    values = (category_id, title, content, play_url, img, content_url, created_at)
                    print(title)
                    self._mysqlDao.executeValues(sql, values)
        except:
            self._mysqlDao = MysqlDao()
        if img != '':
            # Mark the URL row as processed (status = 1)
            sql = 'update m1905_url set `status`=1 where `id`=' + str(id)
            self._mysqlDao.execute(sql)
def __init__(self, status: str, headers: typing.Optional[Headers] = None,
             body: typing.Optional[typing.IO] = None,
             content: typing.Optional[str] = None,
             encoding: str = "utf-8") -> None:
    self.status = status.encode()
    self.headers = headers or Headers()

    if content is not None:
        self.body = io.BytesIO(content.encode(encoding))
    elif body is None:
        self.body = io.BytesIO()
    else:
        self.body = body
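# A minimal usage sketch for the typed Response constructor above, assuming
# the surrounding Response class and a Headers implementation; the status
# string and body text are illustrative.
ok_response = Response(status="200 OK", content="hello")
assert ok_response.status == b"200 OK"
assert ok_response.body.read() == b"hello"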
def getHeaders(self, info):
    """Extract the headers from an IRequestInfo or IResponseInfo object.
    Returns a Headers object with the headers.

    Args:

    * info (IRequestInfo or IResponseInfo): Request info. Use the output
      from getInfo or getInfoFromBytes.
    """
    from headers import Headers
    hdr = Headers()
    # this is IRequestInfo.getHeaders() or IResponseInfo.getHeaders() from Burp
    rawHdr = info.getHeaders()
    hdr.importRaw(rawHdr)
    return hdr
def clear(self):
    self.done = False
    self.close = False

    if self.request.server:
        server_version = self.request.server.version
    else:
        server_version = SERVER_VERSION

    self.headers = Headers([
        ("Server", server_version),
        ("Date", strftime("%a, %d %b %Y %H:%M:%S %Z")),
        ("X-Powered-By", server_version)])

    self.cookie = self.request.cookie

    self.stream = False
    self.body = None
    self.time = time()
    self.status = "200 OK"
def Topics_Child_Data(Topic_ID, index, type):
    post_url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    data = {
        "method": "next",
        "params": '{"topic_id": ' + str(Topic_ID) + ', "offset": ' + str(index) + ', "hash_id": "40b294bd32442bd6a654497741c02b17"}'
    }
    # Request the child-topic listing
    topics_data_child = requests.post(url=post_url, headers=Headers('zhihu'), data=data)
    # Extract the child-topic response text
    topics_text_child = topics_data_child.text
    # Extract child-topic names and IDs with regular expressions
    re_topic_id_child = r'href=.*?".*?topic\\\/(.*?)\\"'
    re_topic_name_child = r'<strong>(.*?)<.*?/strong>'
    topics_id_child = re.findall(re_topic_id_child, topics_text_child)
    topics_name_child = re.findall(re_topic_name_child, topics_text_child)
    # Decode the Unicode escape sequences in the names
    topics_name_child_eval = []
    for i in range(len(topics_name_child)):
        xxx = eval("u" + "'" + topics_name_child[i] + "'")
        topics_name_child_eval.append(xxx)
    if type == 'name':
        return topics_name_child_eval
    elif type == 'id':
        return topics_id_child
    elif type == dict:
        # Build the child-topic ID-to-name dict
        topicDict_child = dict(zip(topics_id_child, topics_name_child_eval))
        return topicDict_child
def _build_response(self, r, request):
    status = r.status
    headers = Headers(r.headers)
    content = r.data

    resp = Response(
        status=status,
        headers=headers,
        content=content,
        message=r.reason,
        request=request)

    new_resp = self._handlers.dispatch('response_done', resp)
    if new_resp is not None:
        resp = new_resp

    req = self._handlers.dispatch('response_redirect', resp)
    if req is not None and isinstance(req, Request):
        return self.request(req)

    return resp
def getRequestResponse(self, environ):
    env = environ.get
    headers = Headers(list(self.translateHeaders(environ)))
    protocol = tuple(map(int, env("SERVER_PROTOCOL")[5:].split(".")))
    request = wrappers.Request(None,
                               env("REQUEST_METHOD"),
                               env("wsgi.url_scheme"),
                               env("PATH_INFO"),
                               protocol,
                               env("QUERY_STRING"))
    request.remote = wrappers.Host(env("REMOTE_ADDR"), env("REMOTE_PORT"))
    request.headers = headers
    request.script_name = env("SCRIPT_NAME")
    request.wsgi_environ = environ
    request.body = env("wsgi.input")
    response = wrappers.Response(None, request)
    response.gzip = "gzip" in request.headers.get("Accept-Encoding", "")
    return request, response
def start_response(self, status, headers, exc_info=None):
    """'start_response()' callable as specified by PEP 333"""
    if exc_info:
        try:
            if self.headers_sent:
                # Re-raise original exception if headers sent
                raise exc_info[0](exc_info[1]).with_traceback(exc_info[2])
        finally:
            exc_info = None  # avoid dangling circular ref
    elif self.headers is not None:
        raise AssertionError("Headers already set!")

    assert type(status) is str, "Status must be a string"
    assert len(status) >= 4, "Status must be at least 4 characters"
    assert int(status[:3]), "Status message must begin w/3-digit code"
    assert status[3] == " ", "Status message must have a space after code"

    self.status = int(status[:3])
    self.headers = Headers(headers)
    return self.write
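# A hedged sketch of how a WSGI application would call the start_response
# callable above (per PEP 333); the application body and header values are
# illustrative, not taken from the original handler.
def demo_app(environ, start_response):
    body = b"hello"
    start_response("200 OK", [
        ("Content-Type", "text/plain"),
        ("Content-Length", str(len(body))),
    ])
    return [body]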
def Questions_ID(word, topicName):
    topic_base_url = 'https://www.zhihu.com/topic'
    topic_url = []
    question_id_list = []
    for i in range(1):
        # word = ['时尚', '生活方式', '咖啡', '凯恩斯主义', '', '']
        # topicName = '生活时尚'
        # I_want() supplies the child-topic IDs
        topic_url.append(topic_base_url + '/' + I_want(word, topicName)[i])
        # print(I_want()[i])
        get_questiontitle_data = requests.get(topic_url[i], headers=Headers('zhihu'))
        questiontitle_data_text = get_questiontitle_data.text
        re_questiontitle_id = r'content="https://www.zhihu.com/question/(.*?)"'
        questiontitle_id_list = re.findall(re_questiontitle_id, questiontitle_data_text)
        for i in range(len(questiontitle_id_list)):
            question_id_list.append(questiontitle_id_list[i])
    return question_id_list
def clear(self):
    self.done = False
    self.close = False

    if self.request.server:
        server_version = self.request.server.version
    else:
        server_version = SERVER_VERSION

    self.headers = Headers([
        ("Date", strftime("%a, %d %b %Y %H:%M:%S %Z")),
        ("X-Powered-By", server_version)])

    if self.request.server is not None:
        self.headers.add_header("Server", server_version)

    self.cookie = self.request.cookie

    self.stream = False
    self._body = []
    self.time = time()
    self.status = "200 OK"
    self.protocol = "HTTP/%d.%d" % self.request.server_protocol
def Topics_DICT():
    # Build the topics request URL
    topic_base_url = 'https://www.zhihu.com/topic'
    topics_url = topic_base_url + 's'
    # Request the topics page
    topics_data = requests.get(topics_url, headers=Headers('zhihu'))
    # Extract the parent-topic page text
    topics_text = topics_data.text
    # Extract parent-topic names and IDs with regular expressions
    re_topic_id = r'data-id="(.*?)"'
    re_topic_name = r'<li.*?">.*?">(.*?)<'
    topics_id = re.findall(re_topic_id, topics_text)
    topics_name = re.findall(re_topic_name, topics_text)
    # Build the parent-topic ID-to-name dict
    topicsDict = dict(zip(topics_id, topics_name))
    return topicsDict
def __init__(self, max_workers=10):
    self.headers = Headers()
    self.timeout = None
    self.proxies = None
    self.__thread_pool_executor = futures.ThreadPoolExecutor(
        max_workers=max_workers)
class Spider(object):
    header = Headers()
    headers = header.headers()  # get a random request header at initialization
    normalize = Normalize()  # URL normalizer
    items_fans = {}  # dict for storing the fans list
    items_self = {}  # dict for storing personal info
    redis = Redis()
    mongo = Mongo()
    s_time = 0  # start time
    e_time = 0  # end time of the run
    flag = 0  # flag for switching request headers
    default_time = 20

    def start_url(self):
        # Seed links
        start_urls = [
            'https://weibo.com/p/1004061537790411?is_hot=1',
        ]
        for start_url in start_urls:
            yield start_url

    def downloader(self, url_item, referer, retries_num=4):
        """
        Return the page source.
        """
        print("Start downloading")
        self.e_time = time.time()  # current time
        time_dif = self.e_time - self.s_time
        if self.flag == 1:
            time_dif = 400
            flag = 0
        if time_dif > 300:
            self.headers = self.header.headers()  # get a new random request header
            self.s_time = self.e_time
        time.sleep(random.random() * 5 + random.random() * 5)  # + random.randint(1,5))
        if referer:  # check whether an anti-hotlinking Referer is needed
            self.headers['Referer'] = referer  # add the Referer
        url = url_item[0]
        print("To crawl:", url)
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            # print(self.headers)
            print("Status code:", response.status_code)
            # print(response.text)
            if response.status_code == 200:
                if len(response.text) > 50000:
                    return response.text
                else:
                    return None
            else:
                self.flag = 0  # switch request headers
                if retries_num > 0:
                    print("Download attempt", 4 - retries_num)
                    self.downloader(url_item, referer, retries_num - 1)
                else:
                    self.redis.push(url_item)  # re-queue the URL after a failed download
                    return None
        except requests.exceptions.ConnectionError as e:
            print("downloader error", url)
            print("Error:", str(e))
        else:
            response = requests.get(url, headers=self.headers)
            return response.text

    def parse_follow_page(self, html, referer):
        """
        Extract the page_id from a profile page (used to build the links of
        followed accounts) and the following/fan counts.
        """
        print("Parse function 1")
        p1 = r'<title>(.*?[\u4e00-\u9fa5]{0,})的微博_微博</title>'  # matches whose Weibo page this is
        p3 = r"\$CONFIG\['page_id'\]='(\d.*?)';"  # matches the page_id
        p4 = r"(\d{6})"  # extracts the pid from the page_id
        p5 = r'<strong\sclass=\\"W_f12\\">(\d*?)<\\/strong><span\sclass=\\"S_txt2\\">关注<\\/span>'  # following count
        p6 = r'<strong\sclass=\\"W_f12\\">(\d*?)<\\/strong><span\sclass=\\"S_txt2\\">粉丝<\\/span>'  # fan count
        self.items_self = {}
        self.items_self['collection'] = re.search(p1, html).group(1)  # whose homepage; used as the collection name
        self.items_self['page_id'] = re.search(p3, html).group(1)  # get the page_id
        self.items_self['pid'] = re.search(p4, self.items_self['page_id']).group(1)  # get the pid
        try:
            self.items_self['idol'] = int(re.search(p5, html).group(1))
        except:
            self.items_self['idol'] = '__'  # following count hidden, so the idol list cannot be built
            print("Following count not accessible")
        try:
            self.items_self['fans'] = int(re.search(p6, html).group(1))
        except:
            self.items_self['fans'] = 0
            print("Fan count not accessible")
        if self.items_self['fans'] > 50000:  # damping factor
            self.items_self['damp'] = 1
        else:
            self.items_self['damp'] = 0.5
        print(self.items_self)
        # self.mongo.save(self.items_self)  # store
        yield self.items_self  # yield the result for storage
        if isinstance(self.items_self['idol'], int):
            for url in self.normalize.nor_follow(self.items_self['page_id']):  # following pages
                url_item = [url, self.parse_detail, referer]
                yield url_item  # only the following-page links need to be yielded; the rest is stored directly
        else:
            yield None

    def parse_detail(self, html, referer):
        """
        Extract each followed account's following page and homepage links.
        """
        print("Parse function 2")
        self.items_fans = {}
        p1 = r'<title>(.*?[\u4e00-\u9fa5]{0,})的微博_微博</title>'
        p2 = r'<a\starget=\\"_blank\\"\stitle=\\"(.*?[\u4e00-\u9fa5]{0,})\\"\shref=\\"(.*?)\\"\s>'  # matches the fans list
        try:
            results = re.findall(p2, html)
            for result in results:
                if result:
                    collection = re.search(p1, html).group(1)  # control table
                    idol_name = result[0]  # name of the followed account
                    link = self.normalize.nor_home(result[1].replace('\\', ''))  # homepage link of the followed account
                    if re.search(r'\?', link):  # store in the database only if a '?' is found
                        self.items_fans = {
                            'collection': collection,
                            'idol_name': idol_name,
                            'link': link,
                        }
                        print(self.items_fans)
                        # self.mongo.save(self.items_fans)  # store into the database
                        yield self.items_fans  # yield the result for storage
                        url_item = [
                            self.items_fans['link'],
                            self.parse_follow_page,
                            referer
                        ]
                        yield url_item  # yield the url item
                    else:
                        print("Link does not meet the requirement:", link)
                        yield None
        except:
            print("Fans list not accessible")

    def scheduler(self):
        # Initialization
        # self.redis.delete()  # controls whether crawling resumes after the spider is closed
        if self.redis.llen() == 0:
            for url in self.start_url():
                callback = self.parse_follow_page
                referer = "https://weibo.com"
                url_item = [url, callback, referer]
                self.redis.push(url_item)
        while True:
            print("Start running")
            if self.redis.llen():
                url_item = self.redis.pop()
                url = url_item[0]
                callback = url_item[1]
                referer = url_item[2]
                html = self.downloader(url_item, referer=referer)
                if html is not None:
                    print("HTML length:", len(html))
                    for items in callback(html, url):
                        if isinstance(items, list):
                            print("Result is a list")
                            self.redis.push(items)
                        if isinstance(items, dict):
                            print("Result is a dict")
                            self.mongo.save(items)
                        if items is None:
                            pass  # skip profiles whose fans list is not visible
                else:
                    print("HTML value:", html)
            else:
                break

    def run(self):
        self.scheduler()
def _merge_headers(self, headers):
    final_headers = Headers(self.default_headers.items() + headers.items())
    return final_headers
def __init__(self, *args, **kwargs):
    self.protocol = kwargs.get('protocol')
    self.path = kwargs.get('path')
    self.method = kwargs.get('method')
    self.body = kwargs.get('body')
    self.headers = Headers(**kwargs.get('headers'))
def run(self):
    while True:
        print(self.getName())
        ret_json = self._redisDao.lpop('kansogou')
        if ret_json is None:
            break
        ret = simplejson.loads(ret_json)
        id = ret[0]
        category_id = ret[1]
        content_url = ret[2]
        img = ret[3]
        headers = Headers().getHeaders()
        print(content_url)
        try:
            req = requests.get(content_url, headers=headers, timeout=30)
            if req.status_code == 200:
                html = req.content
                selector = etree.HTML(html)
                # Movies
                if category_id == 1:
                    titles = selector.xpath('//*[@class="title txt-overflow"]/a[1]/text()')
                    play_urls = selector.xpath('//*[@class="title txt-overflow"]/a[1]/@href')
                    contents = selector.xpath('//*[@class="video-info"]/descendant::text()')
                # TV series
                if category_id == 2:
                    titles = selector.xpath('//*[@class="tt-mnc"]/text()')
                    play_urls = selector.xpath('//*[@class="tt-mnc"]/@href')
                    contents = selector.xpath('//*[@class="lines"]/descendant::text()')
                # Variety shows
                if category_id == 3:
                    titles = selector.xpath('//*[@class="info"]/h1[1]/a[1]/text()')
                    play_urls = selector.xpath('//*[@class="info"]/h1[1]/a[1]/@href')
                    contents = selector.xpath('//*[@class="info"]/descendant::span/descendant::text()')
                # Animation
                if category_id == 4:
                    titles = selector.xpath('//*[@class="title"]/a[1]/text()')
                    play_urls = selector.xpath('//*[@class="title"]/a[1]/@href')
                    contents = selector.xpath('//*[@class="video-info"]/descendant::text()')
                title = play_url = content = ''
                if len(titles) > 0:
                    title = titles[0]
                if len(play_urls) > 0:
                    play_url = Config.url_main + play_urls[0]
                content = simplejson.dumps(contents)
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                # Save the scraped record into the content table
                sql = 'insert ignore into kansogou_content (`category_id`,`title`,`content`,`play_url`,`img`,`url`,`created_at`) VALUES (%s,%s,%s,%s,%s,%s,%s)'
                values = (category_id, title, content, play_url, img, content_url, created_at)
                print(title)
                self._mysqlDao.executeValues(sql, values)
        except:
            self._mysqlDao = MysqlDao()
        # Mark the URL row as processed (status = 1)
        sql = 'update kansogou_url set `status`=1 where `id`=' + str(id)
        self._mysqlDao.execute(sql)
# Build the answer-page API URL
api_url = 'https://www.zhihu.com/api/v4/questions/'
req_url = '/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics%3Bdata%5B%2A%5D.settings.table_of_content.enabled&limit='
limit = 15
question_id = 34243513
word = ['时尚', '', '', '', '', '']
topicName = '生活方式'
print(I_want(word, topicName)[0])
# list = ['咖啡', '时尚']
url_ = api_url + str(Questions_ID(word, topicName)[0]) + req_url + str(limit)
print(Questions_ID(word, topicName)[0])

# Request the data
get_pic_data = requests.get(url_, headers=Headers('zhihu'))
pic_data_text = get_pic_data.text

# Regular expressions for image URLs and answer content
re_pic_src = r'src=."(https:.*?)"'
re_content = r'content":"(.*?)\\u003'

# Find the images
pic_src_list = re.findall(re_pic_src, pic_data_text)
# print(len(pic_src_list))
for i in range(0, 15):
    print('(', i + 1, ')', pic_src_list[i], '\n')

# Save the images; default location C:\Users\lfeng
# img_src = pic_src_list[i]
# r = requests.get(img_src,stream=True)
'''
Dakun (大鲲) crawler
'''
import requests
from headers import Headers
from bs4 import BeautifulSoup
import time
import re
import numpy as np
from storage import Storage

headers = Headers()
storage = Storage()


class DakunSpider(object):

    # Fetch one page of the project list
    def fetch_list(self, _type, page):
        url = 'https://pro.lagou.com/project/%s/%d' % (_type, page)
        res = requests.get(url, headers=headers.getHeader())
        _result = []
        soup = BeautifulSoup(res.text, 'html5lib').find(id='project_list').find_all('li')
        for item in soup:
            link = item.find('a')
# 'scheme': 'https',
# 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'accept-encoding': 'gzip, deflate, br',
# 'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
# 'cache-control': 'max-age=0',
# 'cookie': 'session-id=142-0742346-9197407; ubid-main=133-7610726-3999729; x-main="4kcZZToPabr9kasOAP@fUF?gnBEFaM5G"; at-main=Atza|IwEBIHakK7hcNKZ8SnOHIESyAbGkJm881U1DzVVSDUsQtW4m9IBGj3eRSHyCTielSYV5LFEZgM52WDU-iPqlVlfoyFH50lsV1lcHWTw-Lx0TKAQtyZ6lNtbPPeR61oLvW9lrKdJhmGKMyb_q4CM05NPDtIWwNwNlfDHJlWJiyMvDVj_ei_0q8rIGvCsv0iSo5UecLdunGbhQcvp1cCOQl9JecTQY; sess-at-main="QG3NzTZK9Oq/AnB9D2p9/kHvrdx2o/40aMhb9ncESI0="; sst-main=Sst1|PQFB0bUIS1fXsMsV2S59DyOUCW5nwqhMy78gX87n0biokDXU_fAKXLMZr9qeysTcdSa7CZu1fReugmLsmJSOEcuOW6rifXNp3uuO-9JwzfegBoGOwKCMPvM2DVFb9vKeL8qsOvwdE1CF1nsDPO7C67FLmrV3ReHONyDcqc58rITDQebzhrYOXYIRj-Lx4memd-sVbkKPZGLrw7pE7Flqpoodv66BW8DlE1lWRyZv5DvGAIivV_7JHGD3hJCJM_WYWjmiQJfLy4fx_c_iVECH2s1IHTppiitYq8kwq54gZUOfmO4; lc-main=en_US; session-id-time=2082787201l; csd-key=eyJ3YXNtVGVzdGVkIjp0cnVlLCJ3YXNtQ29tcGF0aWJsZSI6dHJ1ZSwid2ViQ3J5cHRvVGVzdGVkIjpmYWxzZSwidiI6MSwia2lkIjoiYjVjNzZlIiwia2V5IjoiVDk4S251VHI2aWMvdE44OGVYcEhsVXBOdjM1dEdsdjR2UEJZOVpNL052SjVmRzNsMEV4RjlzdStQcU9LZ1dTQ1owSTF4bWQ3amJXY1RUSE55OWJleDNaMGRYSG93eW5kRk1aaDF2c2Zpc0ZZTmdFeXllZ2lybktCNUd4Y0REdUF3eEhiUit5RytoU1QvSnZvTDh5SnRiT1JJVDBqanJtK2N3blFGbGpXTHZRYnhjSkZiNWJ6MTdJL0ZZUmRockZ3ejI3Z20wVGVwRFVmNHRGc1A1K3RZbFVSTHp4ckNQWHBCMm8xQ0RqTVIrK0l0dVNhelR6Q2FyVEl0UmJUdE0yZ2xVUGtqYU44UDRoUEl2NzFQT25QVWVBQWpMSWRkMURrSXlJb0FXZlBKalByaE9pOXV4Q0RVSjVYVUdrVzJtY0J6cFJHMEtydTg0R1QyZEs4R2JuQzNBPT0ifQ==; session-token="iYYndurGfOfXDEnCsH9uwyeX7b0S+ZABM97JVwXRTvNCEaqpBjoPjB5SmgoBsH8RZRpnTyH3FR3j0vVnX2Jlm96lE8f/iksXfejUBpTe5aVQQ6iCjhz/i/rSjgLeZ2aG8khFYc6fXz/7BaqN2qGUsVjZ+NDN0VmQXQHey4LkWKp58hQbDH6JW9jFkhDwgPC7hfZhQDMsUMH4ZaycRAq2qYLsqy5iTaJW5pUsXmBnZgaCx2DW3EkX9TEmOtVijRRhEt6ML6qD0/+9mVgz1Y26ig=="; csm-hit=adb:adblk_no&t:1608966225254&tb:FCA9V8C9Y22B2CENQZCH+s-QNQR62MA46RMT9NTY65E|1608966225254',
# }
# amazon_headers

# Build the search URL
start_url = 'https://s.taobao.com/search?q='
goods = 'lego 21309'
url = start_url + goods + '&s=' + str(0)

# Fetch the page content from the URL
get_contents = requests.get(url, headers=Headers('taobao'))
get_text = get_contents.text
# print(get_text)

# Regular expressions for the fields of interest
raw_ = r'"title":"([^"]+)"'
title_co = r'"raw_title":"([^"]+)"'
price_co = r'"view_price":"([^"]+)"'
nick_co = r'"nick":"([^"]+)"'
innerText_co = r'"innerText":"([^"]+)"'
view_sales_co = r'"view_sales":"([^"]+)"'
comment_count_co = r'"comment_count":"([^"]+)"'
item_loc_co = r'"item_loc":"([^"]+)"'
tb_sku_title_ = re.findall(title_co, get_text)
tb_sku_prcie_ = re.findall(price_co, get_text)