# Module-level imports these methods rely on (the enclosing class definitions and the
# project helpers get_one_proxy, dbredis, keywords_type, MongodbPipeline and
# MysqlPipeline are defined elsewhere in this repository).
import json
import logging
import random
import re
import time
from datetime import datetime

import chardet
import requests
from lxml import etree


def testD(self):
    # Probe Taobao's search AJAX endpoint directly.
    url = 'https://s.taobao.com/search'
    # Build the timestamp once so '_ksTS' and the jsonp callback suffix stay consistent.
    ts = "{}".format(time.time() * 1000).replace('.', '_')
    data = {
        'data-key': 's',
        'data-value': '264',
        'ajax': 'true',
        '_ksTS': ts,
        'callback': 'jsonp{}'.format(ts.split('_')[1]),
        'q': '显示器',  # search query: "monitor"
        'imgfile': '',
        'js': '1',
        'stats_click': 'search_radio_all:1',
        'initiative_id': 'staobaoz_20181207',
        'ie': 'utf8',
        'cps': 'yes',
        'ppath': '20000:26683',
        'bcoffset': '3',
        'ntoffset': '3',
        'p4ppushleft': '1,48',
        's': '44',
    }
    headers = {'User-Agent': self.get_headers()}
    temp_proxy = get_one_proxy()
    proxies = {'http': temp_proxy.strip(), 'https': temp_proxy.strip()}
    headers['cookie'] = 'cna=urphFA2+QgcCAT3czp0RFlrD; t=cfcbdf6c5e7e70ec61e509b3e893cacc; _cc_=URm48syIZQ%3D%3D; tg=0; l=AsvLG1l6sk/iMXekLpNLMpmk22S10t/i; enc=ddv2AfS73bAhE5TaTHRE7lY%2FsnloFft7cUqDn%2B6k1ekOfrN4duFionnjGCyhEpz%2FnRl9%2FHuS6ZucHRurQ8drjA%3D%3D; _uab_collina=154407940266705370937115; thw=tw; hng=TW%7Czh-TW%7CTWD%7C158; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _m_h5_tk=12c6025833330a8057cc7fbdbdb899c7_1544151605085; _m_h5_tk_enc=6e833b2b8360d85dea0e2cbd677b44a6; cookie2=142aa50b21624f6dc0cde7974b608e1b; _tb_token_=7ee603780db1b; _fbp=fb.1.1544142611079.761093547; alitrackid=world.taobao.com; lastalitrackid=world.taobao.com; swfstore=245366; v=0; uc1=cookie14=UoTYMh9%2BrwN37g%3D%3D; mt=ci=-1_0; x5sec=7b227365617263686170703b32223a223738376530653737386530633432306535646261613233306164353835633933434f697a702b4146454b2f6673726a71355a50346a674561437a597a4d5463314d6a51314f547379227d; JSESSIONID=4E59B8904FEDB78C25510C048E50018B; isg=BDc3yuOiJwcPVKOCvsxtL0TLxiuL1c8mgTKMuYnhYoSGOER6k86erp5eHtDD0OPW'
    # headers['referer'] = 'https://s.taobao.com/search?q=%E6%98%BE%E7%A4%BA%E5%99%A8&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20181207&ie=utf8&cps=yes&ppath=20000%3A26683&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44'
    resp = requests.get(url, params=data, headers=headers, proxies=proxies)
    # print(resp.url)
    result = resp.text
    print(result)
    return
def download_dell(self, url, keywordslist, callback=None):
    if callback is None:
        callback = self.dell_store
    proxy = get_one_proxy()  # NOTE: fetched and printed only; the request below still uses self.proxy
    print(proxy)
    resp = requests.get(url, headers=self.headers, proxies=self.proxy)
    return callback(resp.text, keywordslist=keywordslist)
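# The methods in this module all obtain proxies from an external pool via
# get_one_proxy(), which is not part of this excerpt. A minimal stand-in sketch,
# assuming it returns a single "host:port" string; the environment-variable
# fallback is purely illustrative, not the project's real implementation.
import os

def get_one_proxy():
    # Assumed contract: one proxy per call, as a plain "host:port" string,
    # which callers wrap into {'http': proxy, 'https': proxy}.
    return os.environ.get('CRAWLER_PROXY', '127.0.0.1:8888')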
def __init__(self, goodsname='Dell', keyword='P2317H', client='Rosa', tag=None, debug=False):
    self.debug = debug
    self.tag = tag
    # self.dbmysql = MysqlPipeline()
    self.client = client
    self.goodsname = goodsname
    self.keyword = self.goodsname + ' ' + keyword
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    }
    self.baseurl = 'https://search.jd.com'
    self.baseurl2 = 'http://item.jd.com'
    self.searchurl = 'http://search.jd.com/Search'
    self.prefix1 = 'Summary_'
    self.prefix2 = 'Main_'  # keys of the first-level JSON
    proxy = get_one_proxy()
    self.proxy = {'http': proxy, 'https': proxy}
    self.keyword2 = keyword  # used for title matching
    self.pattern = r'\/(\d+?)\.html'  # extract the goodid from an item URL
    self.urlmoduel = 'https://item.jd.com/{}.html'
    # Shops (goodids) that have already been crawled.
    self.setfilter = set()
    data = {
        'keyword': self.keyword,
        'enc': 'utf-8',
        'wq': self.keyword,
        'pvid': '7e5dd963f7084c468d817cf06a3351dc'
    }
    # print(data)
    self.lock = True
    try:
        resp = requests.get(self.searchurl, params=data, headers=self.headers, proxies=self.proxy)
        if '汪~没有找到' in resp.content.decode():  # JD's "nothing found" message
            self.lock = False
        # print(resp.status_code)
    except Exception as e:
        print('Fatal error')
        logging.error('Fatal error: ' + self.searchurl + ' downloaded fail')
        self.refer = self.searchurl
    else:
        self.refer = resp.url
    self.switch = True  # set to False once pagination is exhausted
    self.comment_switch = {}  # per-goodid comment pagination switch
    # print(self.refer)
    # For testing.
    self.test = []
    # Update mode.
    self.maxpage = 5  # in update mode, look at no more than 5 pages of comments
    self.update_status = False  # update mode is off by default
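# A hedged usage sketch of the constructor above, assuming the enclosing class is
# named JDSpider (the real class name and crawl loop are not shown in this excerpt).
def _example_build_spider():
    spider = JDSpider(goodsname='Dell', keyword='P2317H', client='Rosa', debug=True)
    if spider.lock:  # False when JD answered with its "nothing found" page
        print('search reachable, referer:', spider.refer)
    return spider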
def comment_detail(self, goodid, page=0, callback=None, meta=None, keyword=None, goodsname=None, client=None):
    # Parse a JD product's comment feed and store the extracted reviews.
    if self.debug:
        return print(goodid, page, keyword, goodsname, client)
    # url = self.urlmoduel.format(goodid)
    # url = 'http://sclub.jd.com/comment/productPageComments.action'
    url = 'http://club.jd.com/comment/skuProductPageComments.action'
    data = {
        'callback': 'fetchJSON_comment98vv229',
        'productId': goodid,
        'score': '0',
        'sortType': '6',  # sort by time
        'page': page,
        'pageSize': '10',  # at most ten comments per page
        'isShadowSku': '0',
        'fold': '1'
    }
    # The parameter set below supersedes the one above.
    data = {
        'callback': 'fetchJSON_comment98vv762',
        'productId': goodid,
        'score': '0',
        'sortType': '6',
        'page': page,
        'pageSize': '10',
        'isShadowSku': '0',
        'rid': '0',
        'fold': '1'
    }
    try:
        proxy = get_one_proxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get(url, params=data, headers=self.headers, proxies=proxies)
    except Exception as e:
        print('{}'.format(e))
        logging.error('Fatal error: ' + url + ' downloaded fail')
        return
    cod = resp.encoding
    result = resp.content.decode(cod)
    reT = r'\w+?\((.*?)\);$'  # strip the JSONP callback wrapper
    res = re.search(reT, result, re.S)
    print(res)  # debug
    if res:
        res = res.group(1)
        res = json.loads(res)
        # print(res)
        try:
            comments = res.get("comments")
        except Exception as e:
            logging.error('comment_detail error: {}'.format(e))
            return
        if len(comments) == 0:
            self.comment_switch[goodid] = False
            return
        myresult = []  # final record structure
        for i in comments:
            # print(i)
            temp = {}
            temp['crawltime'] = datetime.utcnow().strftime('%Y-%m-%d')
            temp['size'] = i.get('productSize', None)
            temp['comment_time'] = i.get('creationTime', None)  # e.g. 2015-09-09 11:35:27
            temp['content'] = i.get('showOrderComment', {}).get("content", i.get('content'))  # preferred text, used for charting
            temp['img'] = re.findall(
                r'http\:\/\/img30\.360buyimg\.com\/shaidan\/jfs\/[\w\/]+?\.jpg',
                temp.get('content', ''))
            if len(temp['img']) == 0:
                temp['img'] = None
            temp['content'] = i.get('content')
            # print(temp['website_url'])
            temp['website'] = 'JD'
            temp['website_url'] = 'http://www.jd.com'
            # e.g. http://img30.360buyimg.com/shaidan/jfs/t23899/69/1404782488/83204/3b210e9c/5b5ef8f1N3d24d6b6.jpg
            temp['type'] = self.keyword if keyword is None else keyword
            temp['client'] = self.client if client is None else client
            temp['score'] = i.get('score', None)
            replies = i.get('replies', None)
            temp['replytime'] = None
            if replies is not None:
                try:
                    temp['replytime'] = replies[0].get('creationTime', None)
                except IndexError:
                    temp['replytime'] = None
            else:
                temp['replytime'] = None  # reply time
            temp['md5_id'] = self.md5('{}{}'.format(goodid, i.get('id', '')))
            temp['goodsname'] = self.goodsname if goodsname is None else goodsname
            norepeat = self.md5('{}{}{}'.format(
                goodid, i.get('id', ''),
                temp['replytime'] if temp['replytime'] else 'null'))
            if not dbredis.sadd("Reviews_norepeat6", norepeat):
                self.comment_switch[goodid] = False
                # break
            myresult.append(temp)
        pipelines = MongodbPipeline()
        try:
            # print(myresult)
            pipelines.insert(myresult)
        except Exception as e:
            logging.error('insert error {} {} page {}'.format(self.keyword, e, page))
    else:
        self.comment_switch[goodid] = False
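# The JD comment endpoint answers with JSONP, i.e. the JSON body wrapped in a
# callback such as fetchJSON_comment98vv762(...). A minimal sketch of the
# unwrapping step used above, with a hypothetical payload:
import json
import re

def unwrap_jsonp(text):
    """Strip a JSONP callback wrapper and parse the JSON body."""
    match = re.search(r'\w+?\((.*?)\);?$', text, re.S)
    return json.loads(match.group(1)) if match else None

# Example: unwrap_jsonp('fetchJSON_comment98vv762({"comments": []});')
# -> {'comments': []}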
def __search(self, page=1, callback=None, meta=None):
    '''page: index of the half-page to fetch (JD loads results 30 at a time).'''
    # Parse the first half-page; the first and second halves could be merged.
    # Handle pagination.
    if page >= 200:
        return
    refer = self.refer
    url2 = 'https://search.jd.com/s_new.php'
    headers = {}
    headers['Referer'] = refer
    headers.update(self.headers)
    data2 = {
        'keyword': self.keyword,
        'enc': 'utf-8',
        'qrst': '1',
        'rt': '1',
        'stop': '1',
        'vt': '2',
        'wq': self.keyword,
        'page': page,
        's': (page - 1) * 30 + 1,
        'scrolling': 'y',
        'log_id': time.time(),
        'tpl': '1_M',
    }
    # print('test')  # used when testing
    try:
        proxy = get_one_proxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get(url2, params=data2, headers=headers, proxies=proxies)
    except Exception as e:
        logging.error('Fatal error: ' + url2 + ' downloaded fail')
        return
    # code = resp.encoding
    logging.info('status code : {}'.format(resp.status_code))
    # print(resp.status_code)
    result = resp.text
    # print(result)
    html = etree.HTML(result)
    items = html.xpath(r'//li[@class = "gl-item"]')
    length = len(items)
    if length == 0:
        self.switch = False
    for item in items:
        temp_url = item.xpath(r'.//div[@class="p-img"]/a/@href')
        # print(temp_url)
        if len(temp_url) > 0:
            _ = re.findall(self.pattern, temp_url[0])
            if len(_) > 0:
                url = self.urlmoduel.format(_[0])
                goodid = _[0]
                # print(url)
            else:
                continue
        else:
            continue
        # For data completeness this part may need revisiting.
        res = etree.tostring(item)
        cod = chardet.detect(res).get("encoding")
        res = res.decode(cod)
        # kw = self.keyword.split(' ')
        reT = self.keyword2 + '[a-zA-Z]'  # keyword followed by an extra letter means a different model
        # print(reT)
        res = re.sub(r'<font.+?>', '', res)
        res = re.sub(r'</font>', '', res)
        tres = etree.HTML(res)
        tres = tres.xpath(r'//a/em/text()')  # get the title
        if len(tres):
            res = tres[0]
        else:
            print('empty title')
            continue
        print(res)
        if re.search(reT, res, re.S):
            logging.info('Invalid Match ')
            # print(goodid, 'x')
            continue
        if self.keyword2 not in res:
            continue
        if '显示器' not in res:  # '显示器' = "monitor"
            continue
        else:
            logging.info('{}'.format(goodid))
            print(res)
            # print(reT)
            # print(goodid, 'okay')
            # continue  # used when testing
        if goodid in self.setfilter:  # skip items that were already crawled
            continue
        else:
            self.setfilter.add(goodid)
        print(goodid)  # testing
        callback(goodid=goodid, callback=self.comment_detail)
        # break  # must be removed; only used when debugging
def download_dell_TMall(self, url=None, callback=None):
    if callback is None:
        callback = self.parse_comment
    if url is None and self.page == 0:
        url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.23b52fc5dpna5r&brand=26683&q=%CF%D4%CA%BE%C6%F7&sort=s&style=g&from=sn_1_brand-qp&type=pc#J_Filter'
    if url is None and self.page != 0:
        url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.23b52fc5dpna5r&brand=26683&s=60&q=%CF%D4%CA%BE%C6%F7&sort=s&style=g&from=sn_1_brand-qp&type=pc#J_Filter'
    url = re.sub(r'&s=(\d+?)&', '', url)
    url += '&s={}'.format(self.page * 60)  # Tmall shows 60 items per page
    rawurl = url
    if self.count >= 10:
        time.sleep(300)
        self.count = 0
    headers = {'User-Agent': self.get_headers()}
    headers['referer'] = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.79102fc5TTtYDp&brand=26683&s=60&q=%CF%D4%CA%BE%C6%F7&sort=s&style=g&from=sn_1_brand-qp&type=pc'
    headers['cookie'] = 'hng=TW%7Czh-TW%7CTWD%7C158; cna=urphFA2+QgcCAT3czp0RFlrD; _med=dw:1366&dh:768&pw:1366&ph:768&ist:0; uss=""; enc=l0HOjOaHNaXY5Gm6OVhD8Imt8PfZqiyMNQ93Iyhipnx%2F8ShXgyOJ%2FfgsHJFyhTTJ6cRZSQftnIoeE2SRRr%2B02w%3D%3D; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; uc1=cookie14=UoTYM8Z0sKwwJg%3D%3D; t=cfcbdf6c5e7e70ec61e509b3e893cacc; uc3=vt3=F8dByR1Rm78DSHZkeDE%3D&id2=UoLfdCnbIqFUtP%2BN&nk2=CdzyrrmtfW3pTfw%3D&lg2=W5iHLLyFOGW7aA%3D%3D; tracknick=jonny_test1; lid=jonny_test1; lgc=jonny_test1; _tb_token_=563859e331696; cookie2=1f65d98536154088eb538260b03f63cf; cq=ccp%3D1; swfstore=63758; _uab_collina=154502570212165294427025; x5sec=7b22746d616c6c7365617263683b32223a22393065623664393730343964356366323331356631353934643866316232663743496e79334f414645506d73315936596771722b6851453d227d; res=scroll%3A1331*5350-client%3A1331*581-offset%3A1331*5350-screen%3A1366*768; pnm_cku822=098%23E1hvzQvUvbpvjQCkvvvvvjiPR25vAjl8P2MUtjljPmPyAj1HP2FwAjlPn2SOgj6PvpvhvvvvvbyCvm3vpvvvvvCvpZCvHChvvhUjphvZ7pvvp6nvpCBXvvCmeyCvHHhvvhb%2FuphvmhCvCElaO%2BGRkphvCyEmmvofVvyCvh12%2FZvvIsD9h5lNAf6T%2Bm1XbZBnsfpBfC3Y2qFnR2Oic738k8p7rz6Ef3Ax0fUtWlXIpfmD5daKCmYVK4vBh7DHb3RxffoKfCYYhML6vphvCyCCvvvvv2yCvvBvpvvviQhvChCvCCp%3D; isg=BO_vtkNt73HxiutQ91qjnwohfgPzvQbguQpEwQF8i95lUA9SCGTTBu0K1ghLKBsu'
    temp_proxy = get_one_proxy()
    # proxies = {'http': '122.116.144.41:55160', 'https': '122.116.144.41:55160'}
    proxies = {'http': temp_proxy.strip(), 'https': temp_proxy.strip()}
    try:
        resp = self.session2.get(url, headers=headers, proxies=proxies, timeout=self.timeout)
    except Exception as e:
        print(e)
        return self.download_dell_TMall(url=url, callback=callback)
    # print(resp.url)
    result = resp.text
    with open('rawtmall.txt', 'w') as f:
        f.write(result)
    print(result)
    html = etree.HTML(result)
    reslist = html.xpath(r'//div[@class="product-iWrap"]')
    # r'//div[@class="product-iWrap"]//p[@class="productTitle"]/a//@title'
    print(len(reslist))
    for res in reslist:
        title = self.clean(res.xpath(r'.//p[@class="productTitle"]/a//@title'))
        if '差价' in title:  # skip "price difference" listings
            continue
        if '英寸' not in title:  # keep only titles that mention a screen size in inches
            continue
        type = None
        for i in keywords_type.keys():
            if i in title:
                type = i
                break
        # print(type)
        if type is None:
            continue
        print(title, type)
    self.page += 1
    print('Page {}'.format(self.page))
    # nexturl = self.clean(html.xpath('//a[@class="ui-page-next"]/@href'))
    # nexturl = 'https://list.tmall.com/search_product.htm' + self.clean(html.xpath('//a[@class="ui-page-next"]/@href')) if nexturl else None
    # print(nexturl)
    # if nexturl is None:
    #     return
    # self.download_dell_TMall()
def comment_detail(self, goodid, url=None, page=1, callback=None, meta=None,
                   keyword=None, goodsname=None, client=None, default=None):
    self.session = random.choice(self.sessionlist)
    print('Page {}'.format(page))
    # Parse the item's review feed and collect the extracted records.
    if callback is None:
        callback = self.comment_detail
    if page == 0:
        url = self.get_cookies(goodid)
        print(url)
    else:
        if url is None:
            url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId={}&currentPageNum=1&pageSize=20&rateType=1&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0&_ksTS={}&callback=jsonp_tbcrate_reviews_list'.format(
                goodid, "{}".format(time.time() * 1000).replace('.', '_'))
        url = re.sub(r'&currentPageNum=(\d+?)&', '&currentPageNum={}&'.format(page), url)
        # Refresh the _ksTS timestamp for each request.
        url = re.sub(r'&_ksTS=[\d_]+?&',
                     '&_ksTS={}&'.format("{}".format(time.time() * 1000).replace('.', '_')),
                     url)
        print(url)
    try:
        headers = {'User-Agent': self.get_headers()}
        temp_proxy = get_one_proxy()
        # proxies = {'http': '122.116.144.41:55160', 'https': '122.116.144.41:55160'}
        proxies = {'http': temp_proxy.strip(), 'https': temp_proxy.strip()}
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=self.timeout)
    except Exception as e:
        print(e)
        # Retry the same page on a network error.
        callback(goodid, url=url, page=page, callback=callback, meta=meta,
                 keyword=keyword, goodsname=goodsname, client=client, default=default)
        # logging.error('Fatal error:' + url + 'downloaded fail')
        return
    cod = resp.encoding
    result = resp.content.decode(cod)
    # print(result)
    reT = r'\w+?\((.*?)\)$'  # strip the JSONP callback wrapper
    res = re.search(reT, result, re.S)
    if res:
        res = res.group(1)
        res = json.loads(res)
        try:
            comments = res.get("comments", [])
        except Exception as e:
            # logging.error('comment_detail error: {}'.format(e))
            return
        print(self.count, len(comments))
        if len(comments) == 0:
            if self.count >= 5:
                self.comment_switch[goodid] = False
                return
            if callback is None:
                callback = self.comment_detail
            self.count += 1
            print(self.count)
            callback(goodid, url=url, page=page, callback=callback, meta=meta,
                     keyword=keyword, goodsname=goodsname, client=client, default=default)
            return
            # self.comment_switch[goodid] = False
        myresult = []  # final record structure
        self.count = 0
        temp2 = {}
        for i in comments:
            # print(i)
            # print('#' * 50)
            temp = {}
            temp['crawltime'] = datetime.utcnow().strftime('%Y-%m-%d')
            temp['size'] = i.get('productSize', None)
            temp['comment_time'] = i.get('date', None)  # e.g. 2018年11月22日 15:23
            temp['content'] = i.get('content', '')  # used for charting
            temp['img'] = i.get('photos')
            temp['img'] = json.dumps(
                list(
                    map(
                        lambda x: re.sub(r'^(https:)?//', 'https://',
                                         x.get('url', '')).replace('_400x400.jpg', ''),
                        temp.get('img', []))))
            if len(temp['img']) == 0:
                temp['img'] = None
            temp['content'] = i.get('content')
            # print(temp['website_url'])
            temp['website'] = 'JD'  # NOTE: carried over from the JD parser although this method scrapes Taobao
            temp['website_url'] = 'http://www.jd.com'
            # e.g. http://img30.360buyimg.com/shaidan/jfs/t23899/69/1404782488/83204/3b210e9c/5b5ef8f1N3d24d6b6.jpg
            temp['type'] = self.keyword if keyword is None else keyword
            temp['client'] = self.client if client is None else client
            temp['score'] = i.get('score', default)
            replies = i.get('replies', None)
            temp['replytime'] = i.get('shareInfo', {}).get('lastReplyTime', '')
            if temp['replytime'] == '':
                temp['replytime'] = None
            temp['md5_id'] = self.md5('{}{}'.format(goodid, i.get('rateId', '')))
            temp['goodsname'] = self.goodsname if goodsname is None else goodsname
            norepeat = self.md5('{}{}{}'.format(
                goodid, i.get('rateId', ''),
                temp['replytime'] if temp['replytime'] else 'null'))
            print(temp)
            self.results.append(temp)
            self.record_num += 1
            if self.record_num >= self.max_record_per_file:
                self.save_to_file()
            # md5_temp = self.md5(temp)
            # if md5_temp == self.last_md5 and len(temp) > 0:
            #     return
            # else:
            #     self.last_md5 = md5_temp
        page = page + 1
        if callback is None:
            callback = self.comment_detail
        callback(goodid, url=url, page=page, callback=callback, meta=meta,
                 keyword=keyword, goodsname=goodsname, client=client, default=default)
        return
def download_dell_Taobao(self, url=None, callback=None):
    self.session.cookies = random.choice(self.cookies_dict)
    if callback is None:
        callback = self.parse_comment
    # NOTE: both branches below currently use the same search URL.
    if url is None and self.page == 0:
        url = 'https://s.taobao.com/search?spm=a230r.1.1998181369.d4919860.2e9f1fcfzMnVGu&q=%E6%98%BE%E7%A4%BA%E5%99%A8&imgfile=&js=1&initiative_id=staobaoz_20181128&ie=utf8&bcoffset=-9&ntoffset=-9&p4ppushleft=%2C44&tab=mall&cps=yes&ppath=20000%3A26683'
    if url is None and self.page != 0:
        url = 'https://s.taobao.com/search?spm=a230r.1.1998181369.d4919860.2e9f1fcfzMnVGu&q=%E6%98%BE%E7%A4%BA%E5%99%A8&imgfile=&js=1&initiative_id=staobaoz_20181128&ie=utf8&bcoffset=-9&ntoffset=-9&p4ppushleft=%2C44&tab=mall&cps=yes&ppath=20000%3A26683'
    url = re.sub(r'&s=(\d+?)$', '', url)
    url += '&s={}'.format(self.page * 44)  # Taobao shows 44 items per page
    rawurl = url
    if self.count >= 10:
        time.sleep(300)
        self.count = 0
    headers = {'User-Agent': self.get_headers()}
    headers['cookie'] = 'cna=urphFA2+QgcCAT3czp0RFlrD; t=cfcbdf6c5e7e70ec61e509b3e893cacc; _cc_=URm48syIZQ%3D%3D; tg=0; l=AsvLG1l6sk/iMXekLpNLMpmk22S10t/i; enc=ddv2AfS73bAhE5TaTHRE7lY%2FsnloFft7cUqDn%2B6k1ekOfrN4duFionnjGCyhEpz%2FnRl9%2FHuS6ZucHRurQ8drjA%3D%3D; _uab_collina=154407940266705370937115; thw=tw; hng=TW%7Czh-TW%7CTWD%7C158; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _m_h5_tk=12c6025833330a8057cc7fbdbdb899c7_1544151605085; _m_h5_tk_enc=6e833b2b8360d85dea0e2cbd677b44a6; cookie2=142aa50b21624f6dc0cde7974b608e1b; _tb_token_=7ee603780db1b; _fbp=fb.1.1544142611079.761093547; alitrackid=world.taobao.com; lastalitrackid=world.taobao.com; swfstore=245366; v=0; uc1=cookie14=UoTYMh9%2BrwN37g%3D%3D; mt=ci=-1_0; x5sec=7b227365617263686170703b32223a223738376530653737386530633432306535646261613233306164353835633933434f697a702b4146454b2f6673726a71355a50346a674561437a597a4d5463314d6a51314f547379227d; JSESSIONID=3EDBD25CB01DC1E8AF7FF2B28F4B38E8; isg=BOnp0iHokRFsr62sPFb7mWah-JVJnxmQI0gCF4vYjlOSUh5k0Qd4uLcEEL5BSnUg'
    temp_proxy = get_one_proxy()
    # proxies = {'http': '122.116.144.41:55160', 'https': '122.116.144.41:55160'}
    proxies = {'http': temp_proxy.strip(), 'https': temp_proxy.strip()}
    try:
        resp = self.session2.get(url, headers=headers, proxies=proxies, timeout=self.timeout)
    except Exception as e:
        print(e)
        return self.download_dell_Taobao(url=url, callback=callback)
    # print(resp.url)
    result = resp.text
    # print(result)
    # The search page embeds its result list as a JS assignment: g_page_config = {...};
    result = re.search(r'g_page_config = ({.*?});', result, re.S)
    # print(self.session.cookies)
    if result:
        self.count = 0
        result = json.loads(result.group(1))
        items = result.get('mods', {}).get('itemlist', {}).get('data', {}).get('auctions', [])
        print(len(items))
        for item in items:
            shopname = item.get('nick')
            price = item.get('view_price')
            url = item.get('detail_url')
            title = item.get('raw_title')
            goodid = re.findall(r'\?id=(\d+?)&', url)
            if goodid:
                goodid = goodid[0]
            else:
                continue
            if '差价' in title:  # skip "price difference" listings
                continue
            if '英寸' not in title:  # keep only titles that mention a screen size in inches
                continue
            type = None
            for i in keywords_type.keys():
                if i in title:
                    type = i
                    break
            # print(type)
            if type is None:
                continue
            websites = item.get('icon', [])
            for website in websites:
                if '尚天猫,就购了' in website.get('title', ''):  # Tmall slogan badge
                    print(website.get('title', ''))
                    continue
            # dbredis.sadd('TMall_goodid', (goodid, type))
            print(shopname, price, goodid, title, type, sep='\n')
            self.goodid_list.append((shopname, price, goodid, title, type))
        # Handle pagination.
        if self.page < self.startpage + 5 and len(items) == 44:
            self.page += 1
            url = rawurl
            url = re.sub(r'&s=(\d+?)$', '', url)
            url += '&s={}'.format(self.page * 44)
            print('url:{}'.format(url))
            if url.startswith('//'):
                url = 'https:' + url
            print(url)
            return self.download_dell_Taobao(url)  # fetch the next page
    else:
        print('Switching user...')
        time.sleep(30)
        self.count += 1
        self.cookie_id = self.cookie_id + 1 if self.cookie_id + 1 < len(self.cookies_dict) else 0
        self.session.cookies = self.cookies_dict[self.cookie_id]
        return self.download_dell_Taobao(url=url, callback=callback)
    print('Taobao import finished, {} pages fetched'.format(self.page))
    with open('goodid{}.txt'.format(self.startpage), 'w') as f:
        f.write(json.dumps(self.goodid_list))
    print('Data written to file...')
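# The Taobao search page embeds its result list as a JavaScript assignment
# g_page_config = {...}; inside the HTML, which the method above extracts with a
# regex and parses as JSON. A minimal sketch with a hypothetical page fragment:
import json
import re

page_text = 'var x = 1; g_page_config = {"mods": {"itemlist": {"data": {"auctions": []}}}};'
match = re.search(r'g_page_config = ({.*?});', page_text, re.S)
if match:
    config = json.loads(match.group(1))
    auctions = config.get('mods', {}).get('itemlist', {}).get('data', {}).get('auctions', [])
    print(len(auctions))  # -> 0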