# -*- coding: utf-8 -*-
import json
import re
import time
import random
import cookielib
import urllib2
import HTMLParser
from datetime import datetime

import requests


def run_cjbrand():
    keyisvalue = rconnection_test.keys(redis_key_phone_w)
    if keyisvalue:
        print keyisvalue  # dump the brand/model search queue
    else:
        print "key not found: %s" % redis_key_phone_w
    while True:
        axw = rconnection_test.lpop(redis_key_phone_w)
        try:
            if axw:
                # strip characters that would break json.loads
                modeljson = json.loads(
                    str(axw).replace("\\", "、").replace(" ", ""))
                url = modeljson["urls"]
                # print url
                search_CHUNTAO_brand(url, modeljson)
            else:
                print "key not found, break"
                break
        except Exception as e:
            print "load url json error : %s" % e
            wr = ErrorLogsFile(
                "load url json error : %s ,,,,, error : %s " % (axw, e))
            wr.saveerrorlog()
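# ErrorLogsFile is used throughout this module but is not defined in this
# section. A minimal sketch of the assumed interface -- the class name and
# saveerrorlog() method come from the call sites above; the log path and
# line format are placeholders, not the original implementation:
class ErrorLogsFile(object):
    def __init__(self, message):
        self.message = message

    def saveerrorlog(self):
        # append one timestamped line per error (hypothetical path/format)
        with open("errorlogs.txt", "a") as f:
            f.write("%s\t%s\n" % (datetime.now(), self.message))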
def search_CHUNTAO_page(urlx, category):
    print urlx
    # e.g. "https://list.tmall.com/search_product.htm?cat=50936015&s={0}"
    url = str(urlx).replace("s=0", "s={0}")
    sesson = requests.session()
    isValue = True
    index = 0
    # first find the page count shown on the page
    while isValue:
        if errorCount < index:
            print "%s: CHUNTAO could not fetch this url : %s" % (datetime.now(), url)
            break
        # pick a random proxy ip
        proxy = rconnection_yz.srandmember(redis_key_proxy)
        proxyjson = json.loads(proxy)
        proxiip = proxyjson["ip"]
        sesson.proxies = {
            'http': 'http://' + proxiip,
            'https': 'https://' + proxiip
        }
        try:
            req = sesson.get(url, timeout=30)
            html = req.text
            req.close()
            isValue = False
            if html:
                # print html
                # find the total page count
                totalye = re.search(r'(?<=data-totalPage=")\d+(?=")', str(html))
                # print totalye
                if totalye:
                    print totalye.group()
                    # search_CHUNTAO(1, url, category)
                    for int_page in range(int(totalye.group())):
                        search_CHUNTAO(int_page, url, category)
                else:
                    print "total page regex did not match, check whether the pattern is still valid"
                    wr = ErrorLogsFile(
                        "total page regex did not match: url:%s" % (url))
                    wr.saveerrorlog()
        except Exception as e:
            isValue = True
            index += 1
            print "errormessage: %s" % e
            if index == errorCount:
                print "search total page category error: %s , %s" % (index, e)
                wr = ErrorLogsFile(
                    "search total page category error: url:%s,errormessage:%s"
                    % (url, e))
                wr.saveerrorlog()
                return
            time.sleep(5)
def search_CHUNTAO(totalpage, urls, category):
    # url = "https://list.tmall.com/search_product.htm?cat=50936015&s=0"
    sesson = requests.session()
    isValue = True
    index = 0
    while isValue:
        if errorCount < index:
            print "%s: CHUNTAO could not fetch this url : %s" % (datetime.now(), urls)
            break
        # pick a random proxy ip
        proxy = rconnection_yz.srandmember(redis_key_proxy)
        proxyjson = json.loads(proxy)
        proxiip = proxyjson["ip"]
        sesson.proxies = {
            'http': 'http://' + proxiip,
            'https': 'https://' + proxiip
        }
        # rebuild the url for this page (40 items per page)
        pagecount = 40 * totalpage
        url = urls.format(pagecount)
        try:
            print "totalpage : %s , url : %s" % (totalpage, url)
            req = sesson.get(url, timeout=30)
            html = req.text
            req.close()
            isValue = False
            if html:
                # extract item name, monthly sales, unit price and shop
                tzurl = re.findall(
                    r'<div class="item-info">[\s\S]*?</div>[\s\S]*?</div>[\s\S]*?</li>',
                    html)
                if len(tzurl) == 0:
                    # fallback pattern; its single capture group makes findall
                    # return the captured strings directly
                    tzurl = re.findall(
                        r'<span class="volume">月销量(.*)</span>', html)
                if tzurl:
                    for ix in tzurl:
                        if len(ix) > 0:
                            # keep only items flagged 村淘优选 ("CunTao selection")
                            search_value = re.search('(?<=title=")村淘优选(?=")',
                                                     str(ix))
                            if search_value is None:
                                continue
                            # find the item id
                            search_id = re.search('(id=(?P<dd>.*?)")', ix)
                            if search_id:
                                # item id from the page snippet
                                spid = search_id.group("dd")
                                # print "spid : %s" % spid
                                if "title=" not in ix:
                                    continue
                                # item name
                                search_spname = re.search(
                                    '(?<=title=").*(?=".*target="_blank">)', ix)
                                if search_spname:
                                    spname = search_spname.group()
                                    # print "spname : %s" % spname
                                    # filter junk listings: a hit on any
                                    # invalid keyword marks the record invalid
                                    isspnameTrue = False
                                    for ia in invalid_keywords:
                                        if ia in spname and "送" not in spname:
                                            isspnameTrue = True
                                    if isspnameTrue:
                                        continue
                                    # print ix
                                    # monthly sales (月销量)
                                    search_yxl = re.search(
                                        '(?<=<span class="volume">月销量).*(?=</span>)',
                                        str(ix))
                                    if search_yxl:
                                        yxl = search_yxl.group()
                                        yxl = str(yxl).replace(" ", "")
                                        # print "search_yxl : %s" % yxl
                                        # unit price
                                        search_price = re.search(
                                            '(?<=class="price-value">).*(?=<)',
                                            str(ix))
                                        if search_price:
                                            price = search_price.group()
                                            # print "price : %s" % price
                                            result_url = CHUNTAO_url.format(spid)
                                            result = ('{"urlweb":"cun","urls":"%s",'
                                                      '"urlleibie":"%s","price":"%s",'
                                                      '"yxl":"%s","spname": "%s"}'
                                                      % (result_url, category,
                                                         price, yxl, spname))
                                            # save the json record to redis
                                            rconnection_test.lpush(
                                                redis_key_phone_w, result)
                                        else:
                                            print "%s: cannot find CHUNTAO price, check whether the regex is still valid: %s" % (datetime.now(), url)
                                            wr = ErrorLogsFile(
                                                "cannot find CHUNTAO price, check whether the regex is still valid: %s" % (url))
                                            wr.saveerrorlog()
                                    else:
                                        print "%s: cannot find CHUNTAO monthly sales, check whether the regex is still valid: %s" % (datetime.now(), url)
                                        wr = ErrorLogsFile(
                                            "cannot find CHUNTAO monthly sales, check whether the regex is still valid: %s" % (url))
                                        wr.saveerrorlog()
                                else:
                                    print "%s: cannot find CHUNTAO spname, check whether the regex is still valid: %s" % (datetime.now(), url)
                                    wr = ErrorLogsFile(
                                        "cannot find CHUNTAO spname, check whether the regex is still valid: %s" % (url))
                                    wr.saveerrorlog()
                            else:
                                print "%s: cannot find CHUNTAO id, check whether the regex is still valid" % datetime.now()
                                wr = ErrorLogsFile(
                                    "cannot find CHUNTAO id, check whether the regex is still valid: %s" % (url))
                                wr.saveerrorlog()
                else:
                    print "%s: CHUNTAO url --- the first regex did not match %s" % (datetime.now(), url)
                    wr = ErrorLogsFile(
                        "CHUNTAO url --- the first regex did not match: %s" % (url))
                    wr.saveerrorlog()
        except Exception as e:
            isValue = True
            index += 1
            if index == errorCount:
                print "request error: %s , %s" % (index, e)
                wr = ErrorLogsFile(
                    "request error: url:%s ,errormessage:%s" % (url, e))
                wr.saveerrorlog()
            time.sleep(5)
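# The result records above are built with plain string interpolation, which
# produces broken JSON whenever spname (or any other field) contains a double
# quote. A sketch of a safer builder using json.dumps; the field names mirror
# the records built in this module, and the helper name is made up for
# illustration:
def build_result(**fields):
    # json.dumps escapes quotes and backslashes inside the values
    return json.dumps(fields, ensure_ascii=False)

# usage sketch:
#   result = build_result(urlweb="cun", urls=result_url, urlleibie=category,
#                         price=price, yxl=yxl, spname=spname)
#   rconnection_test.lpush(redis_key_phone_w, result)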
def search_CHUNTAO_brand(urls, attributes):
    sesson = requests.session()
    isValue = True
    index = 0
    while isValue:
        if errorCount < index:
            print "%s: CHUNTAO could not fetch this url : %s" % (datetime.now(), urls)
            break
        # pick a random proxy ip
        proxy = rconnection_yz.srandmember(redis_key_proxy)
        proxyjson = json.loads(proxy)
        proxiip = proxyjson["ip"]
        sesson.proxies = {
            'http': 'http://' + proxiip,
            'https': 'https://' + proxiip
        }
        try:
            req = sesson.get(urls, timeout=30)
            html = HTMLParser.HTMLParser().unescape(req.text)
            req.close()
            isValue = False
            if html:
                # print html
                # find the model (型号), falling back to the item number (货号)
                model = ""
                tz_model = re.search(r'(?<=型号:).*(?=<)', str(html))
                if tz_model:
                    # strip ASCII and full-width spaces
                    model = str(tz_model.group()).replace(" ", "").replace("　", "")
                if model == "":
                    tz_model2 = re.search(r'(?<=货号:).*(?=<)', str(html))
                    if tz_model2:
                        model = str(tz_model2.group()).replace(" ", "").replace("　", "")
                    else:
                        print "model not found, check whether the regex is still valid: %s , %s" % (datetime.now(), urls)
                        wr = ErrorLogsFile(
                            "model not found, check whether the regex is still valid: %s , %s"
                            % (datetime.now(), urls))
                        wr.saveerrorlog()
                # find the brand (品牌)
                tz_brand = re.search(r'品牌[^"]{0,5}:(?P<dd>.*?)</li>', str(html))
                brand = ""
                if tz_brand:
                    brand = str(tz_brand.group("dd")).replace(" ", "").replace("　", "")
                else:
                    # print str(html)
                    print "brand not found, check whether the regex is still valid: %s , %s" % (datetime.now(), urls)
                    wr = ErrorLogsFile(
                        "brand not found, check whether the regex is still valid: %s , %s"
                        % (datetime.now(), urls))
                    wr.saveerrorlog()
                    isValue = False
                result = ('{"urlweb":"cun","urls":"%s",'
                          '"urlleibie":"%s","price":"%s",'
                          '"yxl":"%s","spname": "%s",'
                          '"brand":"%s","model":"%s"}'
                          % (attributes["urls"], attributes["urlleibie"],
                             attributes["price"], attributes["yxl"],
                             attributes["spname"], brand, model))
                # save the enriched json record to redis
                rconnection_test.lpush(redis_key_phone_result, result)
        except Exception as e:
            print "request error: %s , %s" % (index, e)
            isValue = True
            index += 1
            if index == errorCount:
                print "request error: %s , %s" % (index, e)
                wr = ErrorLogsFile(
                    "request error: url:%s ,errormessage:%s" % (urls, e))
                wr.saveerrorlog()
            time.sleep(5)
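# For reference, a record pushed to redis_key_phone_result by
# search_CHUNTAO_brand has this shape (all values illustrative):
# {"urlweb":"cun","urls":"https://...item.htm?id=123","urlleibie":"phone",
#  "price":"99.00","yxl":"321","spname":"...","brand":"BrandX","model":"M1"}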
def search_TM(totalpage, urls, category):
    # urls = "https://list.tmall.com/search_product.htm?s=0&q=%E7%94%B5%E5%AD%90%E7%A7%B0"
    sesson = requests.session()
    isValue = True
    index = 0
    while isValue:
        if errorCount < index:
            print "%s: TM could not fetch this url : %s" % (datetime.now(), urls)
            break
        # pick a random proxy ip
        redis_key_proxy = random.choice(proxykeys)
        proxy = rconnection_yz.srandmember(redis_key_proxy)
        proxyjson = json.loads(proxy)
        proxiip = proxyjson["ip"]
        sesson.proxies = {'http': 'http://' + proxiip,
                          'https': 'https://' + proxiip}
        # pick a random tmall cookie
        tmcookies = rconnection_test.srandmember(redis_key_tm_cookies)
        # tmcookiejson = json.loads(tmcookies)
        # tmcookie = tmcookiejson["cookie"]
        headers = {
            "User-Agent": "%s" % random.choice(user_agent_list),
            "Accept": "*/*",
            "Referer": "https://www.tmall.com/",
            "Cookie": cookie
        }
        print "begin search_TM"
        # rebuild the url for this page (60 items per page)
        pagecount = 60 * totalpage
        url = urls.format(pagecount)
        try:
            print url
            req = sesson.get(url, headers=headers, timeout=30)
            html = req.text
            req.close()
            isValue = False
            if html:
                tzurl = re.findall(r'<p class="productTitle">[\s\S]*?</p>', html)
                wgh = 0
                if len(tzurl) == 0:
                    wgh = 1
                    tzurl = re.findall(
                        r'(<a href="//detail.tmall.com/item.htm?)+(.*)(</a>)',
                        html)
                    # print html
                if tzurl:
                    for i in tzurl:
                        # the fallback pattern has three groups, so findall
                        # yields tuples whose second element is the tag body
                        if wgh == 1:
                            ix = i[1]
                        else:
                            ix = i
                        if len(ix) > 0:
                            # find the item id
                            search_id = re.search("(id=(?P<dd>.*?))+(&skuId)", ix)
                            if search_id:
                                # item id from the page snippet
                                spid = search_id.group("dd")
                                # print spid
                                if "title=" not in ix:
                                    continue
                                search_spname = re.search("(title=.*)+(>.*)", ix)
                                if search_spname:
                                    spname = search_spname.group()
                                    # filter junk listings: a hit on any
                                    # invalid keyword marks the record invalid
                                    isspnameTrue = False
                                    for ia in invalid_keywords:
                                        if ia in spname and "送" not in spname:
                                            isspnameTrue = True
                                    if isspnameTrue:
                                        continue
                                    result_url = TM_url.format(spid)
                                    # NOTE: these keys are capitalized ("Urls"),
                                    # while run_cjbrand/search_CHUNTAO_brand read
                                    # lowercase keys ("urls")
                                    result = ('{"Urlweb":"TM","Urls":"%s",'
                                              '"Urlleibie":"%s","spbjpinpai": "",'
                                              '"spbjjixing": "",'
                                              '"pc": ""}' % (result_url, category))
                                    # save the json record to redis
                                    rconnection_test.lpush(redis_key_phone_w, result)
                                else:
                                    print "%s: cannot find TM spname, check whether the regex is still valid: %s" % (datetime.now(), url)
                                    wr = ErrorLogsFile(
                                        "cannot find TM spname, check whether the regex is still valid: %s" % (url))
                                    wr.saveerrorlog()
                            else:
                                print "%s: cannot find TM id, check whether the regex is still valid" % datetime.now()
                                wr = ErrorLogsFile(
                                    "cannot find TM id, check whether the regex is still valid: %s" % (url))
                                wr.saveerrorlog()
                else:
                    print "%s: TM url --- the first regex did not match" % datetime.now()
                    wr = ErrorLogsFile(
                        "TM url --- the first regex did not match: %s" % (url))
                    wr.saveerrorlog()
                    time.sleep(5)
                    isValue = True
        except Exception as e:
            isValue = True
            index += 1
            if index == errorCount:
                print "request error: %s , %s" % (index, e)
                wr = ErrorLogsFile(
                    "request error: url:%s ,errormessage:%s" % (url, e))
                wr.saveerrorlog()
            time.sleep(5)
        time.sleep(5)
def search_TM_urllib2(urlx, category):
    print urlx
    # e.g. "https://list.tmall.com/search_product.htm?cat=50936015&s={0}"
    url = str(urlx).replace("s=0", "s={0}")
    isValue = True
    index = 0
    # first find the page count shown on the page
    while isValue:
        if errorCount < index:
            print "%s: TM could not fetch this url : %s" % (datetime.now(), url)
            break
        # pick a random proxy ip
        proxy = rconnection_yz.srandmember(redis_key_proxy)
        proxyjson = json.loads(proxy)
        proxiip = proxyjson["ip"]
        print proxiip
        prxyip = {'http': 'http://' + proxiip, 'https': 'https://' + proxiip}
        proxy_s = urllib2.ProxyHandler(prxyip)
        try:
            # build an opener that routes through the proxy and keeps cookies
            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(proxy_s,
                                          urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)
            req = urllib2.urlopen(url)
            html = req.read()
            req.close()
            print cj
            isValue = False
            if html:
                # print html
                # "共N页" = "N pages in total"
                totalye = re.search(r'(?<=共)\d+(?=页)', html)
                # print totalye
                if totalye:
                    print totalye.group()
                    for int_page in range(int(totalye.group())):
                        print "intpage: %s" % int_page
                        search_TM(int_page, url, category)
                else:
                    print "total page regex did not match, check whether the pattern is still valid"
                    wr = ErrorLogsFile(
                        "total page regex did not match: url:%s" % (url))
                    wr.saveerrorlog()
        except Exception as e:
            isValue = True
            index += 1
            print "errormessage: %s" % e
            if index == errorCount:
                print "search total page category error: %s , %s" % (index, e)
                wr = ErrorLogsFile(
                    "search total page category error: url:%s,errormessage:%s"
                    % (url, e))
                wr.saveerrorlog()
                return
            time.sleep(5)
def search_TM_page(urlx, category):
    print urlx
    # e.g. "https://list.tmall.com/search_product.htm?cat=50936015&s={0}"
    url = str(urlx).replace("s=0", "s={0}")
    sesson = requests.session()
    isValue = True
    index = 0
    isvalued = 0
    # first find the page count shown on the page
    while isValue:
        if errorCount < index:
            print "%s: TM could not fetch this url : %s" % (datetime.now(), url)
            break
        # pick a random proxy ip
        redis_key_proxy = random.choice(proxykeys)
        proxy = rconnection_yz.srandmember(redis_key_proxy)
        proxyjson = json.loads(proxy)
        proxiip = proxyjson["ip"]
        print proxiip
        sesson.proxies = {'http': 'http://' + proxiip,
                          'https': 'https://' + proxiip}
        # pick a random tmall cookie
        tmcookies = rconnection_test.srandmember(redis_key_tm_cookies)
        # tmcookiejson = json.loads(tmcookies)
        # tmcookie = tmcookiejson["cookie"]
        headers = {
            "User-Agent": "%s" % random.choice(user_agent_list),
            "Accept": "*/*",
            "Referer": "https://www.tmall.com/",
            "Cookie": cookie
        }
        try:
            # time.sleep(10)
            req = sesson.get(url, headers=headers, timeout=30)
            html = req.text
            req.close()
            isValue = False
            if html:
                # print html
                # "共N页" = "N pages in total"
                totalye = re.search(r'(?<=共)\d+(?=页)', str(html))
                # print totalye
                if totalye:
                    page = totalye.group()
                    print "page %s" % page
                    for int_page in range(0, int(page)):
                        print "intpage: %s" % int_page
                        search_TM(int_page, url, category)
                else:
                    print "total page regex did not match, check whether the pattern is still valid"
                    wr = ErrorLogsFile(
                        "total page regex did not match: url:%s" % (url))
                    wr.saveerrorlog()
                    # retry with a fresh proxy/cookie until errorCount is reached
                    isValue = True
                    if isvalued == errorCount:
                        return
                    isvalued += 1
                    time.sleep(2)
        except Exception as e:
            isValue = True
            index += 1
            print "errormessage: %s" % e
            if index == errorCount:
                print "search total page category error: %s , %s" % (index, e)
                wr = ErrorLogsFile(
                    "search total page category error: url:%s,errormessage:%s"
                    % (url, e))
                wr.saveerrorlog()
                return
            time.sleep(5)
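# --- assumed module-level setup ----------------------------------------------
# Pipeline: search_CHUNTAO_page / search_TM_page / search_TM_urllib2 resolve the
# page count for a category url; search_CHUNTAO / search_TM push candidate item
# records onto redis_key_phone_w; run_cjbrand pops those records and
# search_CHUNTAO_brand enriches them with brand/model into redis_key_phone_result.
#
# None of the names below are defined in this section; the functions above
# assume them at module level. Every concrete value here is a placeholder
# sketch, not the original setting.
import redis

rconnection_test = redis.StrictRedis(host="127.0.0.1", port=6379, db=0)
rconnection_yz = redis.StrictRedis(host="127.0.0.1", port=6379, db=1)

redis_key_phone_w = "phone:w"            # work queue of item records (placeholder key)
redis_key_phone_result = "phone:result"  # output queue with brand/model (placeholder key)
redis_key_proxy = "proxy:pool"           # set of '{"ip": "host:port"}' entries (placeholder key)
proxykeys = [redis_key_proxy]            # proxy pools to choose from (placeholder)
redis_key_tm_cookies = "tm:cookies"      # set of tmall cookies (placeholder key)

errorCount = 10                          # retries before giving up on a url (assumed)
CHUNTAO_url = "https://detail.tmall.com/item.htm?id={0}"  # detail-url template (placeholder)
TM_url = "https://detail.tmall.com/item.htm?id={0}"       # detail-url template (placeholder)
invalid_keywords = []                    # substrings that mark junk listings (assumed empty)
user_agent_list = ["Mozilla/5.0"]        # User-Agent pool (placeholder)
cookie = ""                              # tmall cookie string (placeholder)

if __name__ == "__main__":
    run_cjbrand()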