def level2(arr_oneurl, arr_onename):
    global KEEPDIR
    allfile = listfiles(KEEPDIR + "/2urls", ".md")
    emptyfile = listfiles(KEEPDIR + "/2urls", "-url.mdxx")
    for two in range(len(arr_oneurl)):
        # Already crawled?
        prefix = str(two + 1)
        if prefix + '-url.mdxx' in emptyfile or (prefix + '-name.md' in allfile and prefix + "-url.md" in allfile):
            logger.info("Already exists! Skipping the level-2 categories of level-1 category #" + prefix + ": " + arr_oneurl[two] + "...")
            continue
        twocontent = ratedownload(arr_oneurl[two])
        if twocontent is None or twocontent == 0:
            continue
        twocontent = twocontent.decode('utf-8', 'ignore')
        arr_twourl, arr_twoname = urlparse(twocontent, level=2)
        logger.warning("Crawling the level-2 categories of level-1 category #" + prefix + ": " + arr_oneurl[two] + "...")
        logger.warning(str(len(arr_oneurl) - two - 1) + " level-1 categories remaining")
        logger.info(arr_twourl)
        savetofile("2urls/" + prefix + "-url.md", arr_twourl)
        savetofile("2urls/" + prefix + "-name.md", arr_twoname)
    logger.warning("Finished crawling all level-2 category urls...")
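# Checkpoint convention sketch, inferred from the checks above (an assumption,
# not confirmed elsewhere in the source):
#   "<prefix>-url.md" + "<prefix>-name.md"  -> category crawled successfully
#   "<prefix>-url.mdxx"                     -> crawled but empty; skip on resume
def _mark_empty_category(prefix):
    # Hypothetical helper: record that category <prefix> has no children so
    # later runs skip it. KEEPDIR and the "2urls" layout match level2 above.
    with open(KEEPDIR + "/2urls/" + prefix + "-url.mdxx", "wt") as f:
        f.write("1")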
def level6(catchfiles=None):
    global KEEPDIR
    # Level-4 url files: 1-1-1-url.md, 1-1-2-url.md, ...
    # Concurrency: if a file list is passed in, crawl only those files
    if not catchfiles:
        level4file = listfiles(KEEPDIR + "/5urls", "url.md")
    else:
        logger.warning("Concurrent file list passed in")
        level4file = catchfiles
    # All files under the level-5 output directory
    level5file = listfiles(KEEPDIR + "/6urls", "md")
    emptyfile = listfiles(KEEPDIR + "/6urls", "-url.mdxx")
    # Iterate over the level-4 files; position is the file index
    for position in range(len(level4file)):
        filename = level4file[position]
        # Filename prefix, e.g. "1-1-1" from "1-1-1-url.md"
        weizhi = filename.split("-url")[0]
        urls = readfile("5urls/" + filename)
        # urlposition is the url index
        for urlposition in range(len(urls)):
            # Already crawled? e.g. 1-1-1-1-url.md
            prefix = str(urlposition + 1)
            if weizhi + '-' + prefix + '-url.mdxx' in emptyfile or (
                    weizhi + '-' + prefix + '-name.md' in level5file and
                    weizhi + '-' + prefix + '-url.md' in level5file):
                logger.info("Already exists! Skipping the level-6 categories of level-4 file #" + str(position + 1) + ": " + filename +
                            ", level-5 category #" + prefix + ": " + urls[urlposition] + "...")
                continue
            fourcontent = ratedownload(urls[urlposition])
            if fourcontent is None or fourcontent == 0:
                continue
            fourcontent = fourcontent.decode('utf-8', 'ignore')
            arr_foururl, arr_fourname = urlparse(fourcontent, level=6)
            logger.warning("Crawling the level-6 categories of level-4 file #" + str(position + 1) + ": " + filename +
                           ", level-5 category #" + prefix + ": " + urls[urlposition] + "...")
            logger.warning(str(len(urls) - urlposition - 1) + " level-5 categories left in this directory, " +
                           str(len(level4file) - position - 1) + " level-4 files queued")
            logger.info(arr_foururl)
            savetofile("6urls/" + weizhi + '-' + prefix + '-url.md', arr_foururl)
            savetofile("6urls/" + weizhi + '-' + prefix + '-name.md', arr_fourname)
    logger.warning("Finished crawling all level-6 category urls...")
    return "ok!!!----"
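# Concurrency sketch. The source only shows that level6 accepts an explicit
# file list via catchfiles; sharding the url files across worker processes is
# an assumed usage pattern, and the helper below is illustrative:
def _level6_concurrent(workers=4):
    from multiprocessing import Pool
    files = listfiles(KEEPDIR + "/5urls", "url.md")
    # Round-robin shards so each worker gets a disjoint slice of the url files
    chunks = [files[w::workers] for w in range(workers)]
    with Pool(workers) as pool:
        pool.map(level6, chunks)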
def level3(catchfiles=None):
    global KEEPDIR
    # Level-2 url files: 1-url.md, 2-url.md, ...
    # Concurrency: if a file list is passed in, crawl only those files
    if not catchfiles:
        level2file = listfiles(KEEPDIR + "/2urls", "url.md")
    else:
        logger.warning("Concurrent file list passed in")
        level2file = catchfiles
    # All files under the level-3 output directory
    level3file = listfiles(KEEPDIR + "/3urls", "md")
    emptyfile = listfiles(KEEPDIR + "/3urls", "-url.mdxx")
    # Iterate over the level-2 files; position is the file index
    for position in range(len(level2file)):
        filename = level2file[position]
        # Filename prefix, e.g. "1" from "1-url.md"
        weizhi = filename.split("-url")[0]
        urls = readfile("2urls/" + filename)
        # urlposition is the url index
        for urlposition in range(len(urls)):
            # Already crawled? e.g. 1-2-url.md
            prefix = str(urlposition + 1)
            if weizhi + "-" + prefix + '-url.mdxx' in emptyfile or (
                    weizhi + '-' + prefix + '-name.md' in level3file and
                    weizhi + '-' + prefix + '-url.md' in level3file):
                logger.info("Already exists! Skipping the level-3 categories of level-1 category #" + weizhi + ": " + filename +
                            ", level-2 category #" + prefix + ": " + urls[urlposition] + "...")
                continue
            threecontent = ratedownload(urls[urlposition])
            if threecontent is None or threecontent == 0:
                continue
            threecontent = threecontent.decode('utf-8', 'ignore')
            arr_threeurl, arr_threename = urlparse(threecontent, level=3)
            logger.warning("Crawling the level-3 categories of level-1 category #" + weizhi + ": " + filename +
                           ", level-2 category #" + prefix + ": " + urls[urlposition] + "...")
            logger.warning(str(len(urls) - urlposition - 1) + " level-2 categories left in this directory, " +
                           str(len(level2file) - position - 1) + " level-1 files queued")
            logger.info(arr_threeurl)
            savetofile("3urls/" + weizhi + '-' + prefix + '-url.md', arr_threeurl)
            savetofile("3urls/" + weizhi + '-' + prefix + '-name.md', arr_threename)
    logger.warning("Finished crawling all level-3 category urls...")
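# Naming sketch: how the per-level files compose (values illustrative).
# A level-2 file "3-url.md" holding five urls produces, one level down:
#   3urls/3-1-url.md, 3urls/3-1-name.md   (children of the first url)
#   ...
#   3urls/3-5-url.md, 3urls/3-5-name.md   (children of the fifth url)
# so weizhi ("3") carries the ancestry path and prefix ("1".."5") is the
# 1-based position of the parent url inside its file.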
def level1():
    global KEEPDIR
    allfile = listfiles(KEEPDIR, ".md")
    if 'onename.md' in allfile and "oneurl.md" in allfile:
        arr_oneurl = readfile("oneurl.md")
        arr_onename = readfile("onename.md")
        logger.warning("Level-1 categories already exist; crawling level-2 urls directly...")
    else:
        # Crawl the level-1 categories from the Best Sellers root page
        firsturl = "https://www.amazon.com/Best-Sellers/zgbs"
        onecontent = ratedownload(firsturl)
        if onecontent is None or onecontent == 0:
            raise RuntimeError("Failed to download the level-1 category page: " + firsturl)
        onecontent = onecontent.decode('utf-8', 'ignore')
        arr_oneurl, arr_onename = urlparse(onecontent)
        savetofile("oneurl.md", arr_oneurl)
        savetofile("onename.md", arr_onename)
        logger.warning("Finished crawling all urls of the level-1 page: " + firsturl + "...")
        logger.info(arr_oneurl)
    return arr_oneurl, arr_onename
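# Driver sketch (an assumption: the level functions live in one module and run
# in sequence; level4/level5 counterparts are implied by level6's use of the
# 4urls/5urls directories but are not shown in this section):
def _crawl_category_tree():
    arr_oneurl, arr_onename = level1()   # level-1 names/urls, cached on disk
    level2(arr_oneurl, arr_onename)      # fills 2urls/<n>-url.md, <n>-name.md
    level3()                             # fills 3urls/<n>-<m>-url.md, ...
    # ... level4()/level5() would fill 4urls/ and 5urls/ before:
    level6()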
def unitlogic(url, mysqlconfig):
    global DATA_DIR
    # Category URL to crawl
    catchurl = url[1]
    # Category name
    catchname = url[2]
    # Page count
    try:
        page = int(url[3])
    except:
        page = 5
    # Category ID
    id = url[0]
    # Big category name
    bigpname = url[4]
    # Database
    db = url[6]
    todays = tool.log.TODAYTIME
    year = todaystring(1)
    if getconfig()["ipinmysql"]:
        where = "mysql"
    else:
        where = "local"
    keepdir = createjia(DATA_DIR + "/data/items/" + year + "/" + bigpname.replace(" ", "") + "/" + todays)
    detaildir = createjia(DATA_DIR + "/data/detail/" + year + "/" + bigpname.replace(" ", "") + "/" + todays + "/" + id)
    listheader = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
        "Accept-Language": "en-US;q=0.8,en;q=0.5",
        "Upgrade-Insecure-Requests": "1",
        'Host': 'www.amazon.com'
    }
    parsecontent = {}
    if fileexsit(keepdir + "/" + id + ".md"):
        with open(keepdir + "/" + id + ".md", "rb") as f:
            parsecontent = stringToObject(f.read().decode("utf-8", "ignore"))
    else:
        listcontent = ratedownload(url=catchurl, where=where, config=mysqlconfig, header=listheader, isdetail=False)
        if listcontent:
            parsecontent, isphone = phonelistparse(listcontent.decode("utf-8", "ignore"))
            if isphone:
                if parsecontent:
                    if phoneinsertlist(parsecontent, url):
                        with open(keepdir + "/" + id + ".md", "wb") as f:
                            f.write(objectToString(parsecontent).encode("utf-8"))
                else:
                    logger.error("Failed to parse mobile list page: " + catchurl)
            else:
                # Amazon served the PC version; fall back to the ajax list pages
                if getconfig()["force"]:
                    try:
                        page = int(getconfig()["forcenum"])
                    except:
                        page = 5
                for i in range(1, min(5, page)):
                    items3 = "/ref=zg_bs_apparel_pg_" + str(i + 1) + "?_encoding=UTF8&ajax=1&pg=" + str(i + 1)
                    items17 = "/ref=zg_bs_apparel_pg_" + str(i + 1) + "?_encoding=UTF8&&isAboveTheFold=0&ajax=1&pg=" + str(i + 1)
                    listheader = {
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                        # "Accept-Encoding": "gzip, deflate, br",
                        "Connection": "keep-alive",
                        "Accept-Language": "en-US;q=0.8,en;q=0.5",
                        "Upgrade-Insecure-Requests": "1",
                        'Referer': catchurl,
                        'Host': 'www.amazon.com',
                        "X-Requested-With": "XMLHttpRequest"
                    }
                    try:
                        content3 = ratedownload(url=catchurl + items3, where=where, config=mysqlconfig, header=listheader)
                        content17 = ratedownload(url=catchurl + items17, where=where, config=mysqlconfig, header=listheader)
                        if content3 == 0 and content17 == 0:
                            break
                        if content3 is None and content17 is None:
                            continue
                        if content3:
                            temp3 = phonetopclistparse(content3)
                            for k in temp3:
                                parsecontent[k] = temp3[k]
                        if content17:
                            temp17 = phonetopclistparse(content17)
                            for j in temp17:
                                parsecontent[j] = temp17[j]
                    except Exception as e:
                        logger.error("Mobile-to-PC list page error on page " + str(i + 1) + ", skipping")
                        logger.error(e, exc_info=1)
                if parsecontent:
                    if phoneinsertlist(parsecontent, url):
                        with open(keepdir + "/" + id + ".md", "wb") as f:
                            f.write(objectToString(parsecontent).encode("utf-8"))
                else:
                    logger.error("List page parse failed after fallback: " + catchurl)
    for asin in parsecontent:
        try:
            # Detail files are named <smallrank>-<asin>
            smallrank = parsecontent[asin][0]
            detailname = str(smallrank) + "-" + asin
            rankeep = detaildir + "/" + detailname
            if fileexsit(rankeep + ".md"):
                logger.warning("Detail page already exists: " + rankeep)
                continue
            if fileexsit(rankeep + ".emd"):
                logger.warning("Exists (page not found)! " + rankeep)
                continue
            detailurl = "https://www.amazon.com/dp/" + asin
            # detailurl = "https://www.amazon.com/gp/product/" + asin
            if fileexsit(rankeep + ".html"):
                with open(rankeep + ".html", "rb") as ff:
                    detailpage = ff.read()
            else:
                detailheader = {
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                    "Connection": "keep-alive",
                    "Accept-Language": "en-US;q=0.8,en;q=0.5",
                    "Upgrade-Insecure-Requests": "1",
                    'Host': 'www.amazon.com'
                }
                detailpage = ratedownload(url=detailurl, where=where, config=mysqlconfig, header=detailheader, isdetail=True)
                if detailpage is None:
                    continue
                if detailpage == 0:
                    with open(rankeep + ".emd", "wt") as f:
                        f.write("1")
                    continue
                with open(rankeep + ".html", "wb") as f:
                    f.write(detailpage)
            try:
                pinfo = phonedetailparse(detailpage.decode("utf-8", "ignore"))
            except:
                try:
                    # Not a mobile page; try the PC parser
                    pinfo = pinfoparse(detailpage.decode("utf-8", "ignore"))
                except:
                    logger.error("PC detail page parse failed: " + detailurl)
                    continue
            try:
                pinfo["smallrank"] = int(smallrank)
            except:
                pinfo["smallrank"] = -1
            pinfo["title"] = parsecontent[asin][3]
            pinfo["price"] = parsecontent[asin][4]
            pinfo["asin"] = asin
            pinfo["url"] = detailurl
            pinfo["img"] = parsecontent[asin][2]
            if len(pinfo["img"]) > 240:
                pinfo["img"] = ""
            pinfo["name"] = catchname
            pinfo["bigname"] = bigpname
            pinfo["id"] = todays + "-" + detailname
            # Insert into the database
            if phoneinsertexsitlist(pinfo, url):
                with open(rankeep + ".md", "wt") as f:
                    f.write("1")
            # A failure here is tolerable
            phoneinsertpmysql(pinfo, db, id)
        except Exception as err:
            logger.error("Error on item " + asin)
            logger.error(err, exc_info=1)
    # Done
    logger.warning(todays + "|" + bigpname + "|" + db + ":" + id + " completed")
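# Usage sketch for the mobile-first unitlogic above. The tuple layout mirrors
# the indexes it reads (0=id, 1=url, 2=name, 3=pages, 4=big category, 6=db);
# every value below and the mysqlconfig shape are illustrative assumptions:
# row = ("1-1", "https://www.amazon.com/Best-Sellers/zgbs/appliances",
#        "Appliances", "5", "Appliances", None, "somedb")
# unitlogic(row, {"host": "localhost", "user": "crawler", "password": "secret"})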
def unitlogic(url, mysqlconfig):
    global DATA_DIR
    # url: ('1-1', 'https://www.amazon.com/Best-Sellers-Appliances-Cooktops/zgbs/appliances/3741261/ref=zg_bs_nav_la_1_la/161-2441050-2846244', 'Cooktops', 2, 5, '1', '1', 'Appliances')
    # Category URL to crawl
    catchurl = url[1]
    # Category name
    catchname = url[2]
    # Category ID
    id = url[0]
    # Big category name
    bigpname = url[4]
    # Page count
    page = url[3]
    # Level
    level = url[5]
    # Database
    db = url[6]
    # e.g. 2016/Appl/20160606/
    todays = tool.log.TODAYTIME
    year = todaystring(1)
    db = getconfig()["dbprefix"] + db
    if not dbexist(db, id, todays):
        return
    if getconfig()["ipinmysql"]:
        where = "mysql"
    else:
        where = "local"
    keepdir = createjia(DATA_DIR + "/data/items/" + year + "/" + bigpname.replace(" ", "") + "/" + todays + "/" + id)
    detaildir = createjia(DATA_DIR + "/data/detail/" + year + "/" + bigpname.replace(" ", "") + "/" + todays + "/" + id)
    detailall = {}
    # Have all list pages been fetched already?
    finish = listfiles(keepdir, ".jinhan")
    pagefinish = False
    if len(finish) >= 1:
        pagefinish = True
    # Set when a page still cannot be fetched after retries
    retryhappen = False
    if getconfig()["force"]:
        try:
            page = int(getconfig()["forcenum"])
        except:
            page = 5
    for i in range(min(page, 5)):
        itempath = keepdir + "/" + str(i + 1) + ".md"
        if fileexsit(itempath):
            logger.warning("Exists: " + id + "(" + str(i + 1) + ")-" + bigpname + ":" + catchname + "(" + str(level) + ") --" + catchurl)
            temp = readfilelist(itempath)
            for line in temp:
                try:
                    temptemp = line.split(",")
                    insertlist(temptemp, url)
                    detailall[temptemp[0]] = temptemp[1]
                except:
                    logger.error("Failed to read list page line: " + line)
                    continue
        else:
            # File missing although crawling finished: there were fewer pages than expected
            if pagefinish:
                break
            logger.warning("Crawling: " + id + "(" + str(i + 1) + ")-" + bigpname + ":" + catchname + "(" + str(level) + ") --" + catchurl)
            # Build the paged urls:
            # ?_encoding=UTF8&pg=1&ajax=1                   -> 3 items
            # ?_encoding=UTF8&pg=1&ajax=1&isAboveTheFold=0  -> 17 items
            # https://www.amazon.com/Best-Sellers-Clothing/zgbs/apparel/ref=zg_bs_apparel_pg_5?_encoding=UTF8&pg=5&ajax=1
            # Referer: https://www.amazon.com/gp/bestsellers/apparel/ref=pd_zg_hrsr_a_1_1
            # X-Requested-With: XMLHttpRequest
            items3 = "/ref=zg_bs_apparel_pg_" + str(i + 1) + "?_encoding=UTF8&ajax=1&pg=" + str(i + 1)
            items17 = "/ref=zg_bs_apparel_pg_" + str(i + 1) + "?_encoding=UTF8&&isAboveTheFold=0&ajax=1&pg=" + str(i + 1)
            listheader = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                # "Accept-Encoding": "gzip, deflate, br",
                "Connection": "keep-alive",
                "Accept-Language": "en-US;q=0.8,en;q=0.5",
                "Upgrade-Insecure-Requests": "1",
                # 'Referer': 'https://www.amazon.com/',
                'Host': 'www.amazon.com'
            }
            content3 = ratedownload(url=catchurl + items3, where=where, config=mysqlconfig, header=listheader)
            content17 = ratedownload(url=catchurl + items17, where=where, config=mysqlconfig, header=listheader)
            if content3 == 0 or content17 == 0:
                break
            if content3 is None or content17 is None:
                retryhappen = True
                continue
            try:
                # e.g. {'91': ['91', 'https://www.amazon.com/dp/B003Z968T0', 'WhisperKOOL® Platinum Split System 80...']}
                temp3 = rateparse(content3)
                temp17 = rateparse(content17)
                if temp3 == {} and temp17 == {}:
                    continue
                with open(itempath, "wb") as f:
                    for k in sorted(temp3.keys()):
                        if insertlist(temp3[k], url):
                            detailall[k] = temp3[k][1]
                        f.write((",".join(temp3[k]) + "\n").encode("utf-8"))
                    for j in sorted(temp17.keys()):
                        if insertlist(temp17[j], url):
                            detailall[j] = temp17[j][1]
                        f.write((",".join(temp17[j]) + "\n").encode("utf-8"))
            except Exception as err:
                logger.error("Failed to parse list page: " + catchurl + ":" + str(i + 1))
                logger.error(err, exc_info=1)
    if not retryhappen and not pagefinish:
        with open(keepdir + "/ko.jinhan", "wt") as f:
            f.write("1")
    for rank in detailall:
        detailname = rank + "-" + detailall[rank]
        rankeep = detaildir + "/" + detailname
        if fileexsit(rankeep + ".md"):
            logger.warning("Exists! " + rankeep)
            continue
        if fileexsit(rankeep + ".emd"):
            logger.warning("Exists (page not found)! " + rankeep)
            continue
        detailurl = "https://www.amazon.com/dp/" + detailall[rank]
        # TODO
        # Optionally skip keeping the local file
        if fileexsit(rankeep + ".html"):
            with open(rankeep + ".html", "rb") as ff:
                detailpage = ff.read()
        else:
            detailheader = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                # "Accept-Encoding": "gzip, deflate, br",
                "Connection": "keep-alive",
                "Accept-Language": "en-US;q=0.8,en;q=0.5",
                "Upgrade-Insecure-Requests": "1",
                # "Cache-Control": "max-age=0",
                # 'Referer': 'https://www.amazon.com/',
                'Host': 'www.amazon.com'
            }
            detailpage = ratedownload(url=detailurl, where=where, config=mysqlconfig, header=detailheader)
            if detailpage is None:
                continue
            if detailpage == 0:
                with open(rankeep + ".emd", "wt") as f:
                    f.write("1")
                continue
            if getconfig()["localkeep"]:
                with open(rankeep + ".html", "wb") as f:
                    f.write(detailpage)
        try:
            pinfo = pinfoparse(detailpage.decode("utf-8", "ignore"))
        except:
            logger.error("Failed to parse detail page: " + detailurl)
            continue
        try:
            pinfo["smallrank"] = int(rank)
        except:
            pinfo["smallrank"] = -1
        pinfo["asin"] = detailall[rank]
        pinfo["url"] = detailurl
        pinfo["name"] = catchname
        pinfo["bigname"] = bigpname
        pinfo["id"] = todays + "-" + detailname
        # Insert into the database; a failure here is tolerable
        insertexsitlist(pinfo, url)
        if insertpmysql(pinfo, db, id):
            with open(rankeep + ".md", "wb") as f:
                f.write(objectToString(pinfo).encode("utf-8"))
    # Done
    logger.warning(todays + "|" + bigpname + "|" + db + ":" + id + " completed")
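# Usage sketch for the PC unitlogic above, reusing the sample tuple from its
# leading comment; the mysqlconfig dict shape is an assumption:
# row = ('1-1',
#        'https://www.amazon.com/Best-Sellers-Appliances-Cooktops/zgbs/appliances/3741261/ref=zg_bs_nav_la_1_la/161-2441050-2846244',
#        'Cooktops', 2, 5, '1', '1', 'Appliances')
# unitlogic(row, {"host": "localhost", "user": "crawler", "password": "secret"})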