Example #1
	def testD(self):
		url = 'https://s.taobao.com/search'
		data = {
			'data-key':'s',
			'data-value':'264',
			'ajax':'true',
			'_ksTS': "{}".format(time.time()*1000).replace('.','_'),
			'callback':'jsonp{}'.format("{}".format(time.time()*1000).replace('.','_').split('_')[1]),
			'q':'显示器',  # search query: "monitor"
			'imgfile':'', 
			'js':'1',
			'stats_click':'search_radio_all:1',
			'initiative_id':'staobaoz_20181207',
			'ie':'utf8',
			'cps':'yes',
			'ppath':'20000:26683',
			'bcoffset':'3',
			'ntoffset':'3',
			'p4ppushleft':'1,48',
			's':'44',
		}
		headers = {'User-Agent':self.get_headers()}
		temp_proxy = get_one_proxy()
		proxies = {'http':temp_proxy.strip(),'https':temp_proxy.strip()}
		headers['cookie'] = 'cna=urphFA2+QgcCAT3czp0RFlrD; t=cfcbdf6c5e7e70ec61e509b3e893cacc; _cc_=URm48syIZQ%3D%3D; tg=0; l=AsvLG1l6sk/iMXekLpNLMpmk22S10t/i; enc=ddv2AfS73bAhE5TaTHRE7lY%2FsnloFft7cUqDn%2B6k1ekOfrN4duFionnjGCyhEpz%2FnRl9%2FHuS6ZucHRurQ8drjA%3D%3D; _uab_collina=154407940266705370937115; thw=tw; hng=TW%7Czh-TW%7CTWD%7C158; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _m_h5_tk=12c6025833330a8057cc7fbdbdb899c7_1544151605085; _m_h5_tk_enc=6e833b2b8360d85dea0e2cbd677b44a6; cookie2=142aa50b21624f6dc0cde7974b608e1b; _tb_token_=7ee603780db1b; _fbp=fb.1.1544142611079.761093547; alitrackid=world.taobao.com; lastalitrackid=world.taobao.com; swfstore=245366; v=0; uc1=cookie14=UoTYMh9%2BrwN37g%3D%3D; mt=ci=-1_0; x5sec=7b227365617263686170703b32223a223738376530653737386530633432306535646261613233306164353835633933434f697a702b4146454b2f6673726a71355a50346a674561437a597a4d5463314d6a51314f547379227d; JSESSIONID=4E59B8904FEDB78C25510C048E50018B; isg=BDc3yuOiJwcPVKOCvsxtL0TLxiuL1c8mgTKMuYnhYoSGOER6k86erp5eHtDD0OPW'
		# headers['referer']='https://s.taobao.com/search?q=%E6%98%BE%E7%A4%BA%E5%99%A8&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20181207&ie=utf8&cps=yes&ppath=20000%3A26683&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44'
		resp = requests.get(url,params=data,headers=headers,proxies=proxies)
		# print(resp.url)
		result = resp.text
		print(result)
		return 
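In this example the `_ksTS` parameter is the current epoch time in milliseconds with the decimal point replaced by an underscore, and the `callback` name reuses its fractional part, so the endpoint answers with a JSONP-wrapped payload. A minimal sketch of building those parameters and unwrapping such a response; the wrapper-stripping regex is an assumption about the response shape, not something taken from the example above:

import json
import re
import time

def make_jsonp_params():
    # epoch milliseconds with '.' replaced by '_', e.g. '1544142611079_123'
    ks_ts = '{}'.format(time.time() * 1000).replace('.', '_')
    callback = 'jsonp{}'.format(ks_ts.split('_')[1])
    return ks_ts, callback

def unwrap_jsonp(text):
    # strip 'jsonp123({...});' down to the JSON payload (assumed wrapper shape)
    match = re.search(r'\w+?\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(match.group(1)) if match else None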
Example #2
    def download_dell(self, url, keywordslist, callback=None):
        if callback is None:
            callback = self.dell_store

        proxy = get_one_proxy()
        print(proxy)
        proxies = {'http': proxy, 'https': proxy}  # use the freshly fetched proxy
        resp = requests.get(url, headers=self.headers, proxies=proxies)
        return callback(resp.text, keywordslist=keywordslist)
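Both snippets turn the single address returned by `get_one_proxy()` into the `proxies` mapping that `requests` expects. A minimal sketch of that pattern with a fallback to a direct connection when no proxy comes back; the empty-result handling is an assumption, only `get_one_proxy()` itself comes from the examples:

import requests

def build_proxies():
    # get_one_proxy() is assumed to return a 'host:port' string, as in the examples above
    proxy = get_one_proxy()
    if not proxy:
        return None  # requests then falls back to a direct connection
    proxy = proxy.strip()
    return {'http': proxy, 'https': proxy}

# usage:
# resp = requests.get(url, headers=headers, proxies=build_proxies(), timeout=10)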
Example #3
    def __init__(self,
                 goodsname='Dell',
                 keyword='P2317H',
                 client='Rosa',
                 tag=None,
                 debug=False):
        self.debug = debug
        self.tag = tag
        #self.dbmysql = MysqlPipeline()
        self.client = client
        self.goodsname = goodsname
        self.keyword = self.goodsname + ' ' + keyword
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
        }
        self.baseurl = 'https://search.jd.com'
        self.baseurl2 = 'http://item.jd.com'
        self.searchurl = 'http://search.jd.com/Search'
        self.prefix1 = 'Summary_'
        self.prefix2 = 'Main_'  # value at the first JSON level

        proxy = get_one_proxy()
        self.proxy = {'http': proxy, 'https': proxy}
        self.keyword2 = keyword  # used for matching
        self.pattern = r'\/(\d+?)\.html'  # extract the goodid from the URL
        self.urlmoduel = 'https://item.jd.com/{}.html'

        # used to skip shops that have already been crawled
        self.setfilter = set()

        data = {
            'keyword': self.keyword,
            'enc': 'utf-8',
            'wq': self.keyword,
            'pvid': '7e5dd963f7084c468d817cf06a3351dc'
        }
        # print(data)
        self.lock = True
        try:
            resp = requests.get(self.searchurl,
                                params=data,
                                headers=self.headers,
                                proxies=self.proxy)
            if '汪~没有找到' in resp.content.decode():  # JD's "nothing found" message
                self.lock = False
            # print(resp.status_code)
        except Exception as e:
            print('Fatal error')
            logging.error('Fatal error: ' + self.searchurl + ' download failed')
            self.refer = self.searchurl
        else:
            self.refer = resp.url

        self.switch = True  # set to False once pagination is finished
        self.comment_switch = {}  # per-good comment pagination switches, keyed by goodid
        # print(self.refer)

        # testing
        self.test = []

        # update mode
        self.maxpage = 5  # in update mode, read at most 5 pages of comments
        self.update_status = False  # update mode is off by default
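The constructor above probes the JD search endpoint once, remembers the final URL as a referer for later requests, and sets `self.lock` to False when JD reports no results. A minimal usage sketch, assuming the class is called `JDSpider` (the name is hypothetical; the constructor arguments are the ones shown above):

# JDSpider is a hypothetical name for the class whose __init__ is shown above
spider = JDSpider(goodsname='Dell', keyword='P2317H', client='Rosa', debug=True)
if spider.lock:           # False when the probe request found no results
    print(spider.refer)   # referer URL recorded from the probe request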
Example #4
    def comment_detail(self,
                       goodid,
                       page=0,
                       callback=None,
                       meta=None,
                       keyword=None,
                       goodsname=None,
                       client=None):
        if self.debug:
            return print(goodid, page, keyword, goodsname, client)
        # url = self.urlmoduel.format(goodid)
        #url = 'http://sclub.jd.com/comment/productPageComments.action'
        url = 'http://club.jd.com/comment/skuProductPageComments.action'
        # parse the detail page and collect the comment data
        data = {
            'callback': 'fetchJSON_comment98vv229',
            'productId': goodid,
            'score': '0',
            'sortType': '6',  # sort by time
            'page': page,
            'pageSize': '10',  # at most ten comments per page
            'isShadowSku': '0',
            'fold': '1'
        }
        data = {  # this second parameter set overrides the one above and adds 'rid'
            'callback': 'fetchJSON_comment98vv762',
            'productId': goodid,
            'score': '0',
            'sortType': '6',
            'page': page,
            'pageSize': '10',
            'isShadowSku': '0',
            'rid': '0',
            'fold': '1'
        }
        try:
            proxy = get_one_proxy()
            proxies = {'http': proxy, 'https': proxy}
            resp = requests.get(url,
                                params=data,
                                headers=self.headers,
                                proxies=proxies)
        except Exception as e:
            print('{}'.format(e))
            logging.error('Fatal error: ' + url + ' download failed')
            return
        cod = resp.encoding
        result = resp.content.decode(cod)
        reT = r'\w+?\((.*?)\);$'
        res = re.search(reT, result, re.S)
        print(res)  # debug
        if res:
            res = res.group(1)
            res = json.loads(res)
            # print(res)
            try:
                comments = res.get("comments", [])
            except Exception as e:
                logging.error('comment_detail error: {}'.format(e))
                return

            if len(comments) == 0:
                self.comment_switch[goodid] = False
                return

            myresult = []  # the final list of records to collect
            for i in comments:
                # print(i)
                temp = {}
                temp['crawltime'] = datetime.utcnow().strftime('%Y-%m-%d')
                temp['size'] = i.get('productSize', None)
                temp['comment_time'] = i.get('creationTime',
                                             None)  # 2015-09-09 11:35:27
                temp['content'] = i.get('showOrderComment', {}).get(
                    "content", i.get('content'))  # used later for plotting
                temp['img'] = re.findall(
                    r'http\:\/\/img30\.360buyimg\.com\/shaidan\/jfs\/[\w\/]+?\.jpg',
                    temp.get('content', ''))
                if len(temp['img']) == 0:
                    temp['img'] = None
                temp['content'] = i.get('content')
                # print(temp['website_url'])
                temp['website'] = 'JD'
                temp['website_url'] = 'http://www.jd.com'
                #http://img30.360buyimg.com/shaidan/jfs/t23899/69/1404782488/83204/3b210e9c/5b5ef8f1N3d24d6b6.jpg
                temp['type'] = self.keyword if keyword is None else keyword
                temp['client'] = self.client if client is None else client
                temp['score'] = i.get('score', None)
                replies = i.get('replies', None)
                temp['replytime'] = None
                if replies is not None:
                    try:
                        temp['replytime'] = replies[0].get(
                            'creationTime', None)
                    except IndexError:
                        temp['replytime'] = None
                else:
                    temp['replytime'] = None  # reply time
                temp['md5_id'] = self.md5('{}{}'.format(
                    goodid, i.get('id', '')))
                temp[
                    'goodsname'] = self.goodsname if goodsname is None else goodsname
                norepeat = self.md5('{}{}{}'.format(
                    goodid, i.get('id', ''),
                    temp['replytime'] if temp['replytime'] else 'null'))
                if not dbredis.sadd("Reviews_norepeat6", norepeat):
                    self.comment_switch[goodid] = False
                    #break

                myresult.append(temp)
            pipelines = MongodbPipeline()
            try:
                # print(myresult)
                pipelines.insert(myresult)
            except Exception as e:
                logging.error('insert error: {} {} page {}'.format(
                    self.keyword, e, page))
        else:
            self.comment_switch[goodid] = False
        pass
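De-duplication in this example relies on Redis set semantics: `SADD` returns 1 for a new member and 0 for one that already exists, so hashing the good id, comment id, and reply time gives a cheap seen-before check. A minimal sketch of that pattern; the `md5` helper and the Redis connection details are assumptions, only the key layout and the set name come from the example:

import hashlib

import redis

dbredis = redis.StrictRedis(host='localhost', port=6379, db=0)  # assumed connection settings

def md5(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def seen_before(goodid, comment_id, replytime):
    # SADD returns 1 when the member is new, 0 when it is already in the set
    member = md5('{}{}{}'.format(goodid, comment_id, replytime if replytime else 'null'))
    return dbredis.sadd('Reviews_norepeat6', member) == 0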
Example #5
    def __search(self, page=1, callback=None, meta=None):
        '''
        page: which half page to fetch (each request returns half of a result page)
        '''
        # parse part one (parts one and two could be merged)
        # parse part two
        # handle pagination
        if page >= 200:
            return
        refer = self.refer
        url2 = 'https://search.jd.com/s_new.php'
        headers = {}
        headers['Referer'] = refer
        headers.update(self.headers)
        data2 = {
            'keyword': self.keyword,
            'enc': 'utf-8',
            'qrst': '1',
            'rt': '1',
            'stop': '1',
            'vt': '2',
            'wq': self.keyword,
            'page': page,
            's': (page - 1) * 30 + 1,
            'scrolling': 'y',
            'log_id': time.time(),
            'tpl': '1_M',
        }
        # print('test')  # used while testing
        try:
            proxy = get_one_proxy()
            proxies = {'http': proxy, 'https': proxy}
            resp = requests.get(url2,
                                params=data2,
                                headers=headers,
                                proxies=proxies)
        except Exception as e:
            logging.error('Fatal error: ' + url2 + ' download failed')
            return
        # code = resp.encoding
        logging.info('status code : {}'.format(resp.status_code))
        # print(resp.status_code)
        result = resp.text
        # print(result)
        html = etree.HTML(result)
        items = html.xpath(r'//li[@class = "gl-item"]')
        length = len(items)
        if length == 0:
            self.switch = False
        for item in items:
            temp_url = item.xpath(r'.//div[@class="p-img"]/a/@href')
            # print(temp_url)
            if len(temp_url) > 0:
                _ = re.findall(self.pattern, temp_url[0])
                if len(_) > 0:
                    url = self.urlmoduel.format(_[0])
                    goodid = _[0]
                    # print(url)
                else:
                    continue
            else:
                continue

            # needs changes here for data completeness

            res = etree.tostring(item)
            cod = chardet.detect(res).get("encoding")
            res = res.decode(cod)
            # kw = self.keyword.split(' ')
            reT = self.keyword2 + '[a-zA-Z]'
            # print(reT)

            res = re.sub(r'<font.+?>', '', res)
            res = re.sub(r'</font>', '', res)
            tres = etree.HTML(res)
            tres = tres.xpath(r'//a/em/text()')  # get the title
            if len(tres):
                res = tres[0]
            else:
                print('empty title')
                continue

            print(res)
            if re.search(reT, res, re.S):
                logging.info('Invalid Match ')
                # print(goodid,'x')
                continue
            if self.keyword2 not in res:
                continue
            if '显示器' not in res:  # require "monitor" in the title
                continue
            else:
                logging.info('{}'.format(goodid))
                print(res)
                # print(reT)
                # print(goodid,'okay')
                # continue  # used while testing
                if goodid in self.setfilter:  # skip goods already crawled
                    continue
                else:
                    self.setfilter.add(goodid)

                print(goodid)  # debug
                callback(goodid=goodid, callback=self.comment_detail)
                # break  # must be removed; only used while debugging
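JD's s_new.php endpoint returns half of a visible result page per request, and the `s` offset above is derived from the half-page index. A minimal sketch of that mapping; the constants follow the request parameters used in this example:

def jd_halfpage_params(page):
    # each request returns one half page of 30 items; 's' is the 1-based item offset
    return {
        'page': page,
        's': (page - 1) * 30 + 1,
        'scrolling': 'y',
    }

# jd_halfpage_params(1) -> {'page': 1, 's': 1, ...}
# jd_halfpage_params(3) -> {'page': 3, 's': 61, ...}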
Example #6
 def download_dell_TMall(self, url=None, callback=None):
     if callback is None:
         callback = self.parse_comment
     if url is None and self.page == 0:
         url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.23b52fc5dpna5r&brand=26683&q=%CF%D4%CA%BE%C6%F7&sort=s&style=g&from=sn_1_brand-qp&type=pc#J_Filter'
     if url is None and self.page != 0:
         url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.23b52fc5dpna5r&brand=26683&s=60&q=%CF%D4%CA%BE%C6%F7&sort=s&style=g&from=sn_1_brand-qp&type=pc#J_Filter'
         url = re.sub(r'&s=(\d+?)&', '', url)
         url += '&s={}'.format(self.page * 60)
     rawurl = url
     if self.count >= 10:
         time.sleep(300)
         self.count = 0
     headers = {'User-Agent': self.get_headers()}
     headers[
         'referer'] = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.79102fc5TTtYDp&brand=26683&s=60&q=%CF%D4%CA%BE%C6%F7&sort=s&style=g&from=sn_1_brand-qp&type=pc'
     headers[
         'cookie'] = 'hng=TW%7Czh-TW%7CTWD%7C158; cna=urphFA2+QgcCAT3czp0RFlrD; _med=dw:1366&dh:768&pw:1366&ph:768&ist:0; uss=""; enc=l0HOjOaHNaXY5Gm6OVhD8Imt8PfZqiyMNQ93Iyhipnx%2F8ShXgyOJ%2FfgsHJFyhTTJ6cRZSQftnIoeE2SRRr%2B02w%3D%3D; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; uc1=cookie14=UoTYM8Z0sKwwJg%3D%3D; t=cfcbdf6c5e7e70ec61e509b3e893cacc; uc3=vt3=F8dByR1Rm78DSHZkeDE%3D&id2=UoLfdCnbIqFUtP%2BN&nk2=CdzyrrmtfW3pTfw%3D&lg2=W5iHLLyFOGW7aA%3D%3D; tracknick=jonny_test1; lid=jonny_test1; lgc=jonny_test1; _tb_token_=563859e331696; cookie2=1f65d98536154088eb538260b03f63cf; cq=ccp%3D1; swfstore=63758; _uab_collina=154502570212165294427025; x5sec=7b22746d616c6c7365617263683b32223a22393065623664393730343964356366323331356631353934643866316232663743496e79334f414645506d73315936596771722b6851453d227d; res=scroll%3A1331*5350-client%3A1331*581-offset%3A1331*5350-screen%3A1366*768; pnm_cku822=098%23E1hvzQvUvbpvjQCkvvvvvjiPR25vAjl8P2MUtjljPmPyAj1HP2FwAjlPn2SOgj6PvpvhvvvvvbyCvm3vpvvvvvCvpZCvHChvvhUjphvZ7pvvp6nvpCBXvvCmeyCvHHhvvhb%2FuphvmhCvCElaO%2BGRkphvCyEmmvofVvyCvh12%2FZvvIsD9h5lNAf6T%2Bm1XbZBnsfpBfC3Y2qFnR2Oic738k8p7rz6Ef3Ax0fUtWlXIpfmD5daKCmYVK4vBh7DHb3RxffoKfCYYhML6vphvCyCCvvvvv2yCvvBvpvvviQhvChCvCCp%3D; isg=BO_vtkNt73HxiutQ91qjnwohfgPzvQbguQpEwQF8i95lUA9SCGTTBu0K1ghLKBsu'
     temp_proxy = get_one_proxy()
     #proxies = {'http':'122.116.144.41:55160','https':'122.116.144.41:55160'}
     proxies = {'http': temp_proxy.strip(), 'https': temp_proxy.strip()}
     try:
         resp = self.session2.get(url,
                                  headers=headers,
                                  proxies=proxies,
                                  timeout=self.timeout)
     except Exception as e:
         print(e)
         return self.download_dell_TMall(url=url, callback=callback)
     # print(resp.url)
     result = resp.text
     with open('rawtmall.txt', 'w') as f:
         f.write(result)
     print(result)
     html = etree.HTML(result)
     reslist = html.xpath(r'//div[@class="product-iWrap"]')
     # r'//div[@class="product-iWrap"]//p[@class="productTitle"]/a//@title'
     print(len(reslist))
     for res in reslist:
         title = self.clean(
             res.xpath(r'.//p[@class="productTitle"]/a//@title'))
         if '差价' in title:  # skip "price difference" filler listings
             continue
         if '英寸' not in title:  # require an "inch" size in the title
             continue
         type = None
         for i in keywords_type.keys():
             if i in title:
                 type = i
                 break
         # print(type)
         if type is None:
             continue
         print(title, type)
     self.page += 1
     print('Page {}'.format(self.page))
     # nexturl = self.clean(html.xpath('//a[@class="ui-page-next"]/@href'))
     # nexturl = 'https://list.tmall.com/search_product.htm' +self.clean(html.xpath('//a[@class="ui-page-next"]/@href')) if nexturl else None
     # print(nexturl)
     # if nexturl is None:
     # return
     self.download_dell_TMall()
Example #7
    def comment_detail(self,
                       goodid,
                       url=None,
                       page=1,
                       callback=None,
                       meta=None,
                       keyword=None,
                       goodsname=None,
                       client=None,
                       default=None):
        self.session = random.choice(self.sessionlist)
        print('Page {}'.format(page))
        # parse the detail page and collect the comment data
        if callback is None:
            callback = self.comment_detail
        if page == 0:
            url = self.get_cookies(goodid)
            print(url)
        else:
            if url is None:
                url = '''https://rate.taobao.com/feedRateList.htm?auctionNumId={}&currentPageNum=1&pageSize=20&rateType=1&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0&_ksTS={}&callback=jsonp_tbcrate_reviews_list'''.format(
                    goodid, "{}".format(time.time() * 1000).replace('.', '_'))
            url = re.sub(r'&currentPageNum=(\d+?)&',
                         '&currentPageNum={}&'.format(page), url)

            url = re.sub(
                r'&_ksTS=[\d_]+?&', '&_ksTS={}&'.format("{}".format(
                    time.time() * 1000).replace('.', '_')), url)
        print(url)
        try:
            headers = {'User-Agent': self.get_headers()}
            temp_proxy = get_one_proxy()
            #proxies = {'http':'122.116.144.41:55160','https':'122.116.144.41:55160'}
            proxies = {'http': temp_proxy.strip(), 'https': temp_proxy.strip()}
            resp = requests.get(url,
                                headers=headers,
                                proxies=proxies,
                                timeout=self.timeout)
        except Exception as e:
            print(e)
            callback(goodid,
                     url=url,
                     page=page,
                     callback=callback,
                     meta=meta,
                     keyword=keyword,
                     goodsname=goodsname,
                     client=client,
                     default=default)
            # logging.error('Fatal error:'+url+'downloaded fail')
            return
        cod = resp.encoding
        result = resp.content.decode(cod)
        # print(result)
        reT = r'\w+?\((.*?)\)$'
        res = re.search(reT, result, re.S)
        if res:
            res = res.group(1)
            res = json.loads(res)
            try:
                comments = res.get("comments", [])
            except Exception as e:
                # logging.error('comment_detail error:' + e)
                return

            print(self.count, len(comments))
            if len(comments) == 0:
                if self.count >= 5:
                    self.comment_switch[goodid] = False
                    return
                if callback is None:
                    callback = self.comment_detail
                self.count += 1
                print(self.count)
                callback(goodid,
                         url=url,
                         page=page,
                         callback=callback,
                         meta=meta,
                         keyword=keyword,
                         goodsname=goodsname,
                         client=client,
                         default=default)
                return
                #self.comment_switch[goodid] = False

            myresult = []  # the final list of records to collect
            self.count = 0
            temp2 = {}
            for i in comments:
                # print(i)
                # print('#'*50)
                temp = {}
                temp['crawltime'] = datetime.utcnow().strftime('%Y-%m-%d')
                temp['size'] = i.get('productSize', None)
                temp['comment_time'] = i.get('date', None)  # 2018年11月22日 15:23
                temp['content'] = i.get('content', '')  # used later for plotting
                photos = i.get('photos') or []
                imgs = [
                    re.sub(r'^(https:)?//', 'https://',
                           x.get('url', '')).replace('_400x400.jpg', '')
                    for x in photos
                ]
                temp['img'] = json.dumps(imgs) if imgs else None
                temp['content'] = i.get('content')
                # print(temp['website_url'])
                temp['website'] = 'Taobao'
                temp['website_url'] = 'https://www.taobao.com'
                #http://img30.360buyimg.com/shaidan/jfs/t23899/69/1404782488/83204/3b210e9c/5b5ef8f1N3d24d6b6.jpg
                temp['type'] = self.keyword if keyword is None else keyword
                temp['client'] = self.client if client is None else client
                temp['score'] = i.get('score', default)
                replies = i.get('replies', None)
                temp['replytime'] = i.get('shareInfo',
                                          {}).get('lastReplyTime', '')
                if temp['replytime'] == '':
                    temp['replytime'] = None

                temp['md5_id'] = self.md5('{}{}'.format(
                    goodid, i.get('rateId', '')))
                temp[
                    'goodsname'] = self.goodsname if goodsname is None else goodsname
                norepeat = self.md5('{}{}{}'.format(
                    goodid, i.get('rateId', ''),
                    temp['replytime'] if temp['replytime'] else 'null'))
                print(temp)
                self.results.append(temp)
                self.record_num += 1
                if self.record_num >= self.max_record_per_file:
                    self.save_to_file()

            # md5_temp = self.md5(temp)
            # if md5_temp == self.last_md5 and len(temp)>0:
            # return
            # else:
            # self.last_md5 = md5_temp

            page = page + 1
            if callback is None:
                callback = self.comment_detail
            callback(goodid,
                     url=url,
                     page=page,
                     callback=callback,
                     meta=meta,
                     keyword=keyword,
                     goodsname=goodsname,
                     client=client,
                     default=default)
            return
Example #8
	def download_dell_Taobao(self,url=None,callback=None):
		self.session.cookies = random.choice(self.cookies_dict)
		if callback is None:
			callback = self.parse_comment
		if url is None and self.page==0:
			url = 'https://s.taobao.com/search?spm=a230r.1.1998181369.d4919860.2e9f1fcfzMnVGu&q=%E6%98%BE%E7%A4%BA%E5%99%A8&imgfile=&js=1&initiative_id=staobaoz_20181128&ie=utf8&bcoffset=-9&ntoffset=-9&p4ppushleft=%2C44&tab=mall&cps=yes&ppath=20000%3A26683'
		if url is None and self.page!=0:
			url = 'https://s.taobao.com/search?spm=a230r.1.1998181369.d4919860.2e9f1fcfzMnVGu&q=%E6%98%BE%E7%A4%BA%E5%99%A8&imgfile=&js=1&initiative_id=staobaoz_20181128&ie=utf8&bcoffset=-9&ntoffset=-9&p4ppushleft=%2C44&tab=mall&cps=yes&ppath=20000%3A26683'
			url = re.sub(r'&s=(\d+?)$','',url)
			url += '&s={}'.format(self.page*44)
		rawurl = url
		if self.count >=10:
			time.sleep(300)
			self.count = 0
		headers = {'User-Agent':self.get_headers()}
		headers['cookie'] = 'cna=urphFA2+QgcCAT3czp0RFlrD; t=cfcbdf6c5e7e70ec61e509b3e893cacc; _cc_=URm48syIZQ%3D%3D; tg=0; l=AsvLG1l6sk/iMXekLpNLMpmk22S10t/i; enc=ddv2AfS73bAhE5TaTHRE7lY%2FsnloFft7cUqDn%2B6k1ekOfrN4duFionnjGCyhEpz%2FnRl9%2FHuS6ZucHRurQ8drjA%3D%3D; _uab_collina=154407940266705370937115; thw=tw; hng=TW%7Czh-TW%7CTWD%7C158; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _m_h5_tk=12c6025833330a8057cc7fbdbdb899c7_1544151605085; _m_h5_tk_enc=6e833b2b8360d85dea0e2cbd677b44a6; cookie2=142aa50b21624f6dc0cde7974b608e1b; _tb_token_=7ee603780db1b; _fbp=fb.1.1544142611079.761093547; alitrackid=world.taobao.com; lastalitrackid=world.taobao.com; swfstore=245366; v=0; uc1=cookie14=UoTYMh9%2BrwN37g%3D%3D; mt=ci=-1_0; x5sec=7b227365617263686170703b32223a223738376530653737386530633432306535646261613233306164353835633933434f697a702b4146454b2f6673726a71355a50346a674561437a597a4d5463314d6a51314f547379227d; JSESSIONID=3EDBD25CB01DC1E8AF7FF2B28F4B38E8; isg=BOnp0iHokRFsr62sPFb7mWah-JVJnxmQI0gCF4vYjlOSUh5k0Qd4uLcEEL5BSnUg'
		temp_proxy = get_one_proxy()
		#proxies = {'http':'122.116.144.41:55160','https':'122.116.144.41:55160'}
		proxies = {'http':temp_proxy.strip(),'https':temp_proxy.strip()}
		try:
			resp = self.session2.get(url,headers=headers,proxies=proxies,timeout = self.timeout)
		except Exception as e:
			print(e)
			return self.download_dell_Taobao(url=url,callback=callback)
		# print(resp.url)
		result = resp.text
		# print(result)
		result = re.search(r'g_page_config = ({.*?});',result,re.S)
		# print(self.session.cookies)
		if result:
			self.count = 0
			result = json.loads(result.group(1))
			items = result.get('mods',{}).get('itemlist',{}).get('data',{}).get('auctions',[])
			print(len(items))
			for item in items:
				shopname = item.get('nick')
				price = item.get('view_price')
				url = item.get('detail_url')
				title = item.get('raw_title')
				goodid = re.findall(r'\?id=(\d+?)&',url)
				if goodid:
					goodid = goodid[0]
				else:
					continue
				if '差价' in title:  # skip "price difference" filler listings
					continue
				if '英寸' not in title:  # require an "inch" size in the title
					continue
				type = None
				for i in keywords_type.keys():
					if i in title:
						type = i
						break
				# print(type)
				if type is None:
					continue
				websites = item.get('icon',[])
				for website in websites:
					if '尚天猫,就购了' in website.get('title',''):
						print(website.get('title',''))
						continue
				# dbredis.sadd('TMall_goodid',(goodid,type))
				print(shopname,price,goodid,title,type,sep='\n')
				self.goodid_list.append((shopname,price,goodid,title,type))
			# handle pagination
			if self.page < self.startpage+5 and len(items) ==44:
				self.page += 1
				url = rawurl
				url = re.sub(r'&s=(\d+?)$','',url)
				url += '&s={}'.format(self.page*44)
				print('url:{}'.format(url))
				if url.startswith('//'):
					url = 'https:'+url
					print(url)
				return self.download_dell_Taobao(url)  # download the next page
		else:
			print('Switching user...')
			time.sleep(30)
			self.count += 1
			self.cookie_id = self.cookie_id + 1 if self.cookie_id + 1 < len(self.cookies_dict) else 0
			self.session.cookies = self.cookies_dict[self.cookie_id]
			return self.download_dell_Taobao(url=url,callback=callback)
		print('Taobao import finished after {} pages'.format(self.page))
		with open('goodid{}.txt'.format(self.startpage),'w') as f:
			f.write(json.dumps(self.goodid_list))
		print('Data written to file...')
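The Taobao search page embeds its result data as a JavaScript assignment, and this example recovers it with the `g_page_config` regex before drilling into `mods.itemlist.data.auctions`. A minimal standalone sketch of that extraction; the function name is an assumption, while the regex and key path come from the example above:

import json
import re

def extract_auctions(html_text):
    # pull the 'g_page_config = {...};' assignment out of the page source
    match = re.search(r'g_page_config = ({.*?});', html_text, re.S)
    if not match:
        return []
    config = json.loads(match.group(1))
    # same key path as the example: mods -> itemlist -> data -> auctions
    return config.get('mods', {}).get('itemlist', {}).get('data', {}).get('auctions', [])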