def testD(self):
    """Send one hard-coded search request through a proxy and print the reply.

    Debugging aid: the query parameters and the cookie are fixed values, the
    response body is printed, and nothing is returned.
    """
    url = ''

    def _millis_token():
        # Current time in milliseconds with the decimal point replaced by "_".
        return "{}".format(time.time() * 1000).replace('.', '_')

    data = {
        'data-key': 's',
        'data-value': '264',
        'ajax': 'true',
        '_ksTS': _millis_token(),
        # Only the part after the "_" (the fractional digits) names the callback.
        'callback': 'jsonp{}'.format(_millis_token().split('_')[1]),
        'q': '显示器',
        'imgfile': '',
        'js': '1',
        'stats_click': 'search_radio_all:1',
        'initiative_id': 'staobaoz_20181207',
        'ie': 'utf8',
        'cps': 'yes',
        'ppath': '20000:26683',
        'bcoffset': '3',
        'ntoffset': '3',
        'p4ppushleft': '1,48',
        's': '44',
    }

    headers = {'User-Agent': self.get_headers()}
    temp_proxy = get_one_proxy()
    proxy_address = temp_proxy.strip()
    proxies = {scheme: proxy_address for scheme in ('http', 'https')}
    headers['cookie'] = 'cna=urphFA2+QgcCAT3czp0RFlrD; t=cfcbdf6c5e7e70ec61e509b3e893cacc; _cc_=URm48syIZQ%3D%3D; tg=0; l=AsvLG1l6sk/iMXekLpNLMpmk22S10t/i; enc=ddv2AfS73bAhE5TaTHRE7lY%2FsnloFft7cUqDn%2B6k1ekOfrN4duFionnjGCyhEpz%2FnRl9%2FHuS6ZucHRurQ8drjA%3D%3D; _uab_collina=154407940266705370937115; thw=tw; hng=TW%7Czh-TW%7CTWD%7C158; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _m_h5_tk=12c6025833330a8057cc7fbdbdb899c7_1544151605085; _m_h5_tk_enc=6e833b2b8360d85dea0e2cbd677b44a6; cookie2=142aa50b21624f6dc0cde7974b608e1b; _tb_token_=7ee603780db1b; _fbp=fb.1.1544142611079.761093547;;; swfstore=245366; v=0; uc1=cookie14=UoTYMh9%2BrwN37g%3D%3D; mt=ci=-1_0; x5sec=7b227365617263686170703b32223a223738376530653737386530633432306535646261613233306164353835633933434f697a702b4146454b2f6673726a71355a50346a674561437a597a4d5463314d6a51314f547379227d; JSESSIONID=4E59B8904FEDB78C25510C048E50018B; isg=BDc3yuOiJwcPVKOCvsxtL0TLxiuL1c8mgTKMuYnhYoSGOER6k86erp5eHtDD0OPW'

    resp = requests.get(url, params=data, headers=headers, proxies=proxies)
    print(resp.text)
    return
def download_dell(self, url, keywordslist, callback=None):
    """Download ``url`` through a freshly drawn proxy and hand the body to ``callback``.

    Args:
        url: Address to fetch.
        keywordslist: Passed through unchanged as the callback's
            ``keywordslist`` keyword argument.
        callback: Callable invoked as ``callback(text, keywordslist=...)``.
            Defaults to ``self.dell_store``.

    Returns:
        Whatever the callback returns.
    """
    if callback is None:
        callback = self.dell_store
    proxy = get_one_proxy()
    print(proxy)
    # Route the request through the proxy that was just drawn and printed.
    # Previously it was fetched but the request still used ``self.proxy``.
    proxies = {'http': proxy, 'https': proxy}
    resp = requests.get(url, headers=self.headers, proxies=proxies)
    return callback(resp.text, keywordslist=keywordslist)
def __init__(self, goodsname='Dell', keyword='P2317H', client='Rosa', tag=None, debug=False):
    """Record the crawl settings and send one probing search request.

    Args:
        goodsname: Stored on ``self.goodsname``; also the first word of the
            search phrase.
        keyword: Appended to ``goodsname`` (space separated) to form
            ``self.keyword``; kept on its own as ``self.keyword2``.
        client: Stored on ``self.client``.
        tag: Stored on ``self.tag``.
        debug: Stored on ``self.debug``.
    """
    # Caller-supplied settings.
    self.debug = debug
    self.tag = tag
    self.client = client
    self.goodsname = goodsname
    self.keyword = self.goodsname + ' ' + keyword

    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    }

    # Endpoints and key prefixes.
    self.baseurl = ''
    self.baseurl2 = ''
    self.searchurl = ''
    self.prefix1 = 'Summary_'
    self.prefix2 = 'Main_'  # value of the first JSON level

    # One proxy is drawn here and used for both schemes.
    proxy = get_one_proxy()
    self.proxy = {'http': proxy, 'https': proxy}

    self.keyword2 = keyword  # used for matching
    self.pattern = r'\/(\d+?)\.html'  # extracts the goodid from a URL
    self.urlmoduel = '{}.html'
    # Used to skip shops that have already been crawled.
    self.setfilter = set()

    query = {
        'keyword': self.keyword,
        'enc': 'utf-8',
        'wq': self.keyword,
        'pvid': '7e5dd963f7084c468d817cf06a3351dc'
    }

    # Cleared when the probe response contains the "nothing found" marker.
    self.lock = True
    try:
        response = requests.get(self.searchurl, params=query,
                                headers=self.headers, proxies=self.proxy)
        if '汪~没有找到' in response.content.decode():
            self.lock = False
    except Exception:
        print('Fatal error')
        logging.error('Fatal error:' + self.searchurl + 'downloaded fail')
        # Fall back to the plain search URL as the referer.
        self.refer = self.searchurl
    else:
        self.refer = response.url

    self.switch = True  # set to False once paging has finished
    self.comment_switch = {}  # per-product review paging switch, keyed by goodid
    # Testing.
    self.test = []
    # Update mode.
    self.maxpage = 5  # in update mode, read at most 5 pages of reviews
    self.update_status = False  # update mode is off by default
def comment_detail(self, goodid, page=0, callback=None, meta=None,
                   keyword=None, goodsname=None, client=None):
    """Download one page of reviews for ``goodid`` and store them.

    Does nothing when ``self.debug`` is set. Otherwise the JSONP reply is
    unwrapped, each review is turned into a flat record, and the records of
    the page are inserted through ``MongodbPipeline``.
    ``self.comment_switch[goodid]`` is set to ``False`` when the reply cannot
    be unwrapped, when the page holds no reviews, or when a review's
    fingerprint is already in the ``Reviews_norepeat6`` Redis set.

    Args:
        goodid: Product id whose reviews are requested.
        page: Review page index sent to the server.
        callback: Unused.
        meta: Unused.
        keyword: Only printed; not otherwise used.
        goodsname: Overrides ``self.goodsname`` in the stored records.
        client: Overrides ``self.client`` in the stored records.

    Returns:
        None.
    """
    if self.debug:
        return
    print(goodid, page, keyword, goodsname, client)
    url = ''
    # An earlier variant of this dict (other callback name, no 'rid') was
    # built and immediately overwritten; only the effective one is kept.
    data = {
        'callback': 'fetchJSON_comment98vv762',
        'productId': goodid,
        'score': '0',
        'sortType': '6',  # sort by time
        'page': page,
        'pageSize': '10',  # at most ten reviews per page
        'isShadowSku': '0',
        'rid': '0',
        'fold': '1',
    }
    try:
        proxy = get_one_proxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get(url, params=data, headers=self.headers,
                            proxies=proxies)
    except Exception as e:
        print('{}'.format(e))
        logging.error('Fatal error:' + url + 'downloaded fail')
        return

    # resp.encoding is None when no charset was announced; decode(None)
    # would raise TypeError, so fall back to UTF-8.
    result = resp.content.decode(resp.encoding or 'utf-8')

    # The payload arrives as JSONP: name( ...json... );
    match = re.search(r'\w+?\((.*?)\);$', result, re.S)
    print(match)  # debugging
    if not match:
        self.comment_switch[goodid] = False
        return

    try:
        comments = json.loads(match.group(1)).get("comments")
    except (ValueError, AttributeError) as e:
        # str + Exception raised TypeError here before; format instead.
        logging.error('comment_detail error:{}'.format(e))
        return
    if not comments:  # covers both a missing/null field and an empty page
        self.comment_switch[goodid] = False
        return

    image_pattern = (r'http\:\/\/img30\.360buyimg\.com\/shaidan\/jfs\/'
                     r'[\w\/]+?\.jpg')
    myresult = []  # records of this page, in the final storage layout
    for i in comments:
        temp = {}
        temp['crawltime'] = datetime.utcnow().strftime('%Y-%m-%d')
        temp['size'] = i.get('productSize', None)
        temp['comment_time'] = i.get('creationTime', None)  # e.g. 2015-09-09 11:35:27

        # Image links are searched in the show-order text when present,
        # otherwise in the plain review text; either may be null.
        show_order = i.get('showOrderComment') or {}
        image_source = show_order.get("content", i.get('content')) or ''
        temp['img'] = re.findall(image_pattern, image_source) or None
        # NOTE(review): the stored text is always the plain review content;
        # the collapsed original left it unclear whether this reset was
        # meant to happen only when no image was found.
        temp['content'] = i.get('content')

        temp['website'] = 'JD'
        temp['website_url'] = ''
        temp['client'] = self.client if client is None else client
        temp['score'] = i.get('score', None)

        # Time of the first reply, if there is one.
        replies = i.get('replies', None)
        temp['replytime'] = (replies[0].get('creationTime', None)
                             if replies else None)

        temp['md5_id'] = self.md5('{}{}'.format(goodid, i.get('id', '')))
        temp['goodsname'] = self.goodsname if goodsname is None else goodsname

        norepeat = self.md5('{}{}{}'.format(
            goodid, i.get('id', ''),
            temp['replytime'] if temp['replytime'] else 'null'))
        if not dbredis.sadd("Reviews_norepeat6", norepeat):
            # Already seen: stop paging for this product, but the record is
            # still stored (the original `break` was commented out).
            self.comment_switch[goodid] = False
        myresult.append(temp)

    pipelines = MongodbPipeline()
    try:
        pipelines.insert(myresult)
    except Exception as e:
        # str + Exception raised TypeError here before; format instead.
        logging.error('insert error{}{}{}'.format(self.keyword, e, page))
def __search(self, page=1, callback=None, meta=None):
    """Load one half-page of search results and dispatch each new matching product.

    A product is dispatched when its title contains ``self.keyword2`` not
    directly followed by an ASCII letter, contains '显示器', and its goodid
    has not been seen in ``self.setfilter`` yet. ``self.switch`` is set to
    ``False`` when the page holds no product entries.

    Args:
        page: Index of the half-page to load; nothing is done for
            ``page >= 200``.
        callback: Called as ``callback(goodid=..., callback=self.comment_detail)``
            for every accepted product. When ``None`` nothing is dispatched
            (calling it used to raise ``TypeError``).
        meta: Unused.

    Returns:
        None.
    """
    if page >= 200:
        return

    url2 = ''
    headers = {'Referer': self.refer}
    headers.update(self.headers)
    data2 = {
        'keyword': self.keyword,
        'enc': 'utf-8',
        'qrst': '1',
        'rt': '1',
        'stop': '1',
        'vt': '2',
        'wq': self.keyword,
        'page': page,
        's': (page - 1) * 30 + 1,
        'scrolling': 'y',
        'log_id': time.time(),
        'tpl': '1_M',
    }
    try:
        proxy = get_one_proxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get(url2, params=data2, headers=headers,
                            proxies=proxies)
    except Exception:
        logging.error('Fatal error:' + url2 + 'downloaded fail')
        return
    logging.info('status code : {}'.format(resp.status_code))

    html = etree.HTML(resp.text)
    # etree.HTML can yield None for an empty document; treat that as "no items".
    items = html.xpath(r'//li[@class = "gl-item"]') if html is not None else []
    if len(items) == 0:
        self.switch = False

    # A title where the model string is directly followed by a letter names
    # a different model (longer suffix). The keyword is also checked as a
    # literal substring below, so it is escaped rather than read as a regex.
    longer_model = re.compile(re.escape(self.keyword2) + '[a-zA-Z]', re.S)

    for item in items:
        temp_url = item.xpath(r'.//div[@class="p-img"]/a/@href')
        if len(temp_url) == 0:
            continue
        ids = re.findall(self.pattern, temp_url[0])
        if len(ids) == 0:
            continue
        goodid = ids[0]

        raw = etree.tostring(item)
        # chardet may report no encoding at all; fall back to UTF-8.
        cod = chardet.detect(raw).get("encoding") or 'utf-8'
        res = raw.decode(cod)
        # Strip the highlighting <font> tags so the title is one text node.
        res = re.sub(r'<font.+?>', '', res)
        res = re.sub(r'</font>', '', res)
        tres = etree.HTML(res).xpath(r'//a/em/text()')  # the title
        if len(tres):
            res = tres[0]
        else:
            print('空')
            continue
        print(res)

        if longer_model.search(res):
            logging.info('Invalid Match ')
            continue
        if self.keyword2 not in res:
            continue
        if '显示器' not in res:
            continue
        logging.info('{}'.format(goodid))
        print(res)

        if goodid in self.setfilter:  # skip pages that were already crawled
            continue
        self.setfilter.add(goodid)
        print(goodid)  # debugging
        if callback is not None:
            callback(goodid=goodid, callback=self.comment_detail)
        # A debugging-only `break` was disabled at this point in the original.
def download_dell_TMall(self, url=None, callback=None):
    """Fetch one result page and print every product title that matches a known type.

    The raw page is also written to ``rawtmall.txt``. A failed request is
    retried by calling this method again with the same arguments.

    Args:
        url: Listing URL. When ``None`` a built-in URL is chosen depending
            on whether ``self.page`` is 0. Any ``s=`` offset already in the
            URL is replaced by ``self.page * 60``.
        callback: Defaults to ``self.parse_comment``; not called in this method.

    Returns:
        None on the normal path; the retry's result after a failed request.
    """
    if callback is None:
        callback = self.parse_comment
    if url is None and self.page == 0:
        url = ''
    if url is None and self.page != 0:
        url = ''

    # Remove an earlier offset wherever it sits. The old pattern required a
    # trailing '&' and deleted it as well, which glued the next parameter
    # onto the previous one and never matched the offset appended below,
    # so every retry stacked one more '&s=' on the URL.
    url = re.sub(r'&s=\d+(?=&|$)', '', url)
    url += '&s={}'.format(self.page * 60)

    # Back off for five minutes once the failure counter reaches ten.
    if self.count >= 10:
        time.sleep(300)
        self.count = 0

    headers = {'User-Agent': self.get_headers()}
    headers['referer'] = ''
    headers['cookie'] = '[COOKIE STRING TRUNCATED]'
    temp_proxy = get_one_proxy()
    proxies = {'http': temp_proxy.strip(), 'https': temp_proxy.strip()}
    try:
        resp = self.session2.get(url, headers=headers, proxies=proxies,
                                 timeout=self.timeout)
    except Exception as e:
        print(e)
        # NOTE(review): unbounded retry by recursion; a long outage ends in
        # RecursionError. Consider a retry limit.
        return self.download_dell_TMall(url=url, callback=callback)

    result = resp.text
    # Explicit encoding: the page is not ASCII and the platform default
    # codec may be unable to encode it.
    with open('rawtmall.txt', 'w', encoding='utf-8') as f:
        f.write(result)
    print(result)

    html = etree.HTML(result)
    reslist = html.xpath(r'//div[@class="product-iWrap"]')
    print(len(reslist))
    for res in reslist:
        title = self.clean(res.xpath(r'.//p[@class="productTitle"]/a//@title'))
        if '差价' in title:
            continue
        if '英寸' not in title:
            continue
        # First key of keywords_type that occurs in the title, if any.
        matched_type = next((k for k in keywords_type if k in title), None)
        if matched_type is None:
            continue
        print(title, matched_type)
        # NOTE(review): the page counter advances once per matching product,
        # not once per fetched page, as in the original layout. Confirm
        # whether this belongs after the loop.
        self.page += 1
        print('第{}页'.format(self.page))
def comment_detail(self, goodid, url=None, page=1, callback=None, meta=None,
                   keyword=None, goodsname=None, client=None, default=None):
    """Fetch one page of reviews for ``goodid``, buffer them, then request the next page.

    Every review becomes a flat record appended to ``self.results``;
    ``self.save_to_file()`` runs once ``self.record_num`` reaches
    ``self.max_record_per_file``. A page without reviews is retried up to
    five times (counted in ``self.count``) before
    ``self.comment_switch[goodid]`` is set to ``False``.

    Args:
        goodid: Product id whose reviews are requested.
        url: Review-list URL to reuse. When ``None`` (and ``page`` is not 0)
            it is built from a template that starts at the product id.
        page: Page number to request; with ``page == 0`` the URL is taken
            from ``self.get_cookies(goodid)`` instead.
        callback: Continuation used for retries and for the next page;
            defaults to this method.
        meta: Passed through to the continuation.
        keyword: Passed through to the continuation.
        goodsname: Overrides ``self.goodsname`` in the stored records.
        client: Overrides ``self.client`` in the stored records.
        default: Score stored when a review carries none.

    Returns:
        None.
    """
    self.session = random.choice(self.sessionlist)
    print('Page {}'.format(page))
    if callback is None:
        callback = self.comment_detail

    def _continue(next_page):
        # Re-enter through the continuation with otherwise unchanged arguments.
        callback(goodid, url=url, page=next_page, callback=callback,
                 meta=meta, keyword=keyword, goodsname=goodsname,
                 client=client, default=default)

    if page == 0:
        url = self.get_cookies(goodid)
        print(url)
    else:
        stamp = "{}".format(time.time() * 1000).replace('.', '_')
        if url is None:
            # The separator before currentPageNum had been mangled into
            # '¤t' (a mis-decoded '&curren'); restored to '&current'.
            url = ('{}&currentPageNum=1&pageSize=20&rateType=1'
                   '&orderType=feedbackdate&attribute=&sku=&hasSku=false'
                   '&folded=0&_ksTS={}'
                   '&callback=jsonp_tbcrate_reviews_list').format(goodid, stamp)
        url = re.sub(r'&currentPageNum=(\d+?)&',
                     '&currentPageNum={}&'.format(page), url)
        # Refresh the timestamp parameter. The old replacement text wrote a
        # second currentPageNum=<timestamp> and dropped _ksTS altogether.
        url = re.sub(r'&_ksTS=[\d_]+?&', '&_ksTS={}&'.format(stamp), url)
        print(url)

    try:
        headers = {'User-Agent': self.get_headers()}
        temp_proxy = get_one_proxy()
        proxies = {'http': temp_proxy.strip(), 'https': temp_proxy.strip()}
        resp = requests.get(url, headers=headers, proxies=proxies,
                            timeout=self.timeout)
    except Exception as e:
        print(e)
        # NOTE(review): unbounded retry by recursion; consider a limit.
        _continue(page)
        return

    # resp.encoding is None when no charset was announced.
    result = resp.content.decode(resp.encoding or 'utf-8')
    # The payload arrives as JSONP: name( ...json... )
    match = re.search(r'\w+?\((.*?)\)$', result, re.S)
    if not match:
        return
    try:
        # `or []` also covers an explicit null in the reply.
        comments = json.loads(match.group(1)).get("comments") or []
    except (ValueError, AttributeError):
        return

    print(self.count, len(comments))
    if len(comments) == 0:
        if self.count >= 5:
            self.comment_switch[goodid] = False
            return
        self.count += 1
        print(self.count)
        _continue(page)
        return

    self.count = 0
    for i in comments:
        temp = {}
        temp['crawltime'] = datetime.utcnow().strftime('%Y-%m-%d')
        temp['size'] = i.get('productSize', None)
        temp['comment_time'] = i.get('date', None)  # e.g. 2018年11月22日 15:23
        temp['content'] = i.get('content')

        # Normalise photo links to https and drop the thumbnail suffix.
        # A missing or null 'photos' field used to crash map(), and the
        # JSON text '[]' is never empty, so "no images" never became None.
        photo_urls = [
            re.sub(r'^(https:)?//', 'https://',
                   photo.get('url', '')).replace('_400x400.jpg', '')
            for photo in (i.get('photos') or [])
        ]
        temp['img'] = json.dumps(photo_urls) if photo_urls else None

        # NOTE(review): records are labelled 'JD' although the request uses
        # the jsonp_tbcrate_reviews_list callback — confirm the label.
        temp['website'] = 'JD'
        temp['website_url'] = ''
        temp['client'] = self.client if client is None else client
        temp['score'] = i.get('score', default)

        reply_time = (i.get('shareInfo') or {}).get('lastReplyTime', '')
        temp['replytime'] = None if reply_time == '' else reply_time

        temp['md5_id'] = self.md5('{}{}'.format(goodid, i.get('rateId', '')))
        temp['goodsname'] = self.goodsname if goodsname is None else goodsname

        print(temp)
        self.results.append(temp)
        self.record_num += 1
        if self.record_num >= self.max_record_per_file:
            self.save_to_file()

    _continue(page + 1)
    return
def download_dell_Taobao(self, url=None, callback=None):
    """Collect matching product listings page by page into ``self.goodid_list``.

    Each result page embeds its data as ``g_page_config = {...};``. Listings
    are kept when the title contains '英寸', does not contain '差价', matches
    a key of ``keywords_type`` and carries no Tmall badge. Up to five pages
    beyond ``self.startpage`` are followed while pages are full (44 items).
    When the embedded data is missing, the cookie set is rotated and the
    same URL is retried after a pause. Finally the collected list is written
    to ``goodid<startpage>.txt`` as JSON.

    Args:
        url: Listing URL. When ``None`` a built-in URL is chosen depending
            on whether ``self.page`` is 0. A trailing ``s=`` offset is
            replaced by ``self.page * 44``.
        callback: Defaults to ``self.parse_comment``; only forwarded to the
            recursive calls.

    Returns:
        None.
    """
    # NOTE(review): cookies are assigned to self.session, but the request
    # below is sent with self.session2 — confirm which session is intended.
    self.session.cookies = random.choice(self.cookies_dict)
    if callback is None:
        callback = self.parse_comment
    if url is None and self.page == 0:
        url = ''
    if url is None and self.page != 0:
        url = ''
    url = re.sub(r'&s=(\d+?)$', '', url)
    url += '&s={}'.format(self.page * 44)
    rawurl = url

    # Back off for five minutes once the failure counter reaches ten.
    if self.count >= 10:
        time.sleep(300)
        self.count = 0

    headers = {'User-Agent': self.get_headers()}
    headers['cookie'] = '[COOKIE STRING TRUNCATED]'
    temp_proxy = get_one_proxy()
    proxies = {'http': temp_proxy.strip(), 'https': temp_proxy.strip()}
    try:
        resp = self.session2.get(url, headers=headers, proxies=proxies,
                                 timeout=self.timeout)
    except Exception as e:
        print(e)
        # NOTE(review): unbounded retry by recursion; consider a limit.
        return self.download_dell_Taobao(url=url, callback=callback)

    match = re.search(r'g_page_config = ({.*?});', resp.text, re.S)
    if not match:
        # No embedded data: switch to the next cookie set and try again.
        print('正在更换用户...')
        time.sleep(30)
        self.count += 1
        self.cookie_id = (self.cookie_id + 1
                          if self.cookie_id + 1 < len(self.cookies_dict) else 0)
        self.session.cookies = self.cookies_dict[self.cookie_id]
        return self.download_dell_Taobao(url=url, callback=callback)

    self.count = 0
    node = json.loads(match.group(1))
    # Walk mods -> itemlist -> data, tolerating missing or null levels.
    for key in ('mods', 'itemlist', 'data'):
        node = node.get(key) or {}
    items = node.get('auctions') or []
    print(len(items))

    for item in items:
        shopname = item.get('nick')
        price = item.get('view_price')
        detail_url = item.get('detail_url') or ''
        title = item.get('raw_title') or ''
        ids = re.findall(r'\?id=(\d+?)&', detail_url)
        if not ids:
            continue
        goodid = ids[0]
        if '差价' in title:
            continue
        if '英寸' not in title:
            continue
        # First key of keywords_type that occurs in the title, if any.
        matched_type = next((k for k in keywords_type if k in title), None)
        if matched_type is None:
            continue

        # Skip listings carrying the Tmall badge. The old `continue` sat
        # inside the icon loop and therefore never skipped the listing.
        tmall_badges = [icon.get('title', '')
                        for icon in (item.get('icon') or [])
                        if '尚天猫,就购了' in icon.get('title', '')]
        if tmall_badges:
            for badge in tmall_badges:
                print(badge)
            continue

        print(shopname, price, goodid, title, matched_type, sep='\n')
        self.goodid_list.append((shopname, price, goodid, title, matched_type))

    # A full page (44 items) suggests there is another one to fetch.
    if self.page < self.startpage + 5 and len(items) == 44:
        self.page += 1
        url = re.sub(r'&s=(\d+?)$', '', rawurl)
        url += '&s={}'.format(self.page * 44)
        print('url:{}'.format(url))
        if url.startswith('//'):
            url = 'https:' + url
        print(url)
        return self.download_dell_Taobao(url, callback=callback)

    # NOTE(review): the original argument of this message is missing from
    # the source; the number of collected records matches its wording.
    print('淘宝导入完成,共导入{}条数据'.format(len(self.goodid_list)))
    with open('goodid{}.txt'.format(self.startpage), 'w') as f:
        f.write(json.dumps(self.goodid_list))
    print('数据写入完成...')