def jd_price(url):
    """Fetch the price of the JD product whose detail page is *url*.

    The SKU id is the text between the last ``/`` and the last ``.`` of
    *url*. It is appended to a ``p.3.cn/prices/mgets`` query that carries a
    JSONP callback name built from ``fetch_util.randnum(1000000, 8888888)``.
    The request is sent with *url* as the referer.

    A sample reply looks like::

        jQuery3493581([{"id":"J_10550439205","p":"79.00","m":"199.00","op":"106.00"}]);

    Only the first ``{...}`` object of the reply is parsed.

    Args:
        url: Product page URL; its last path segment (minus the extension)
            is used as the SKU id.

    Returns:
        The value stored under ``"p"`` in the first JSON object of the
        reply, or the string ``'-1.00'`` when the reply is empty, contains
        no parseable object, or has no ``"p"`` key.
    """
    no_price = '-1.00'

    sku_start = url.rfind('/')
    sku_end = url.rfind('.')
    num = url[sku_start + 1:sku_end]

    baseurl = ('http://p.3.cn/prices/mgets?callback=jQuery'
               + str(fetch_util.randnum(1000000, 8888888))
               + '&type=1&area=1_72_4137_0&pdtk=&pduid=531394193'
               + '&pdpin=&pdbp=0&skuIds=J_'
               + str(num))

    response = fetch_util.openurl2(baseurl, refererurl=url)
    # NOTE(review): openurl2's failure value is not visible here; any falsy
    # reply (None or '') is treated as "no price" instead of crashing below.
    if not response:
        return no_price

    # Strip the JSONP wrapper by keeping the span from the first '{' to the
    # first '}'. When either brace is missing the slice is empty or
    # unparseable, and both cases fall through to the sentinel.
    obj_start = response.find('{')
    obj_end = response.find('}')
    payload = response[obj_start:obj_end + 1]
    if not payload:
        return no_price

    try:
        js = json.loads(payload)
    except (ValueError, TypeError):
        # Malformed JSON (JSONDecodeError is a ValueError) or a payload of
        # an unsupported type. KeyboardInterrupt/SystemExit now propagate.
        return no_price

    if isinstance(js, dict) and 'p' in js:
        return js['p']
    return no_price
def func(lists, maincategory, outfile):
    """Process a batch of page URLs and append one result line per URL.

    For every URL a ``PageInfo`` is built and its app name, category and tag
    are read. A falsy category or tag is replaced by ``'unknow'``. A progress
    line ``[i/total] url`` is logged through ``fetch_util.print_log`` and a
    tab-separated record is handed to ``fetch_util.write``.

    Args:
        lists: Collection of page URLs; nothing happens when it is empty
            or falsy.
        maincategory: Label placed before ``>`` and the app name in the
            record.
        outfile: Destination passed through to ``fetch_util.write``.
    """
    if not lists:
        return

    total = len(lists)
    for position, currenturl in enumerate(lists, 1):
        info = PageInfo(currenturl)
        appname = info.getappname()
        category = info.getcategory() or 'unknow'
        tag = info.gettag() or 'unknow'

        # Progress line for the log.
        progress = '[' + str(position) + '/' + str(total) + '] '
        fetch_util.print_log(progress + currenturl)

        # Record layout: url \t main>app \t c:category \t t:tag
        record = currenturl + '\t' + maincategory + '>' + appname
        record = record + '\tc:' + category + '\tt:' + tag
        fetch_util.write(record, outfile)

        # Pause for a random interval before the next page.
        # NOTE(review): the source was whitespace-collapsed; this pause is
        # assumed to sit inside the loop (once per URL) - confirm.
        sleep(fetch_util.randnum(10, 40))
def sleep(self, start, end):
    """Pause for a duration obtained from ``fetch_util.randnum(start, end)``.

    NOTE(review): the inner ``sleep`` is called with a single argument, so it
    presumably resolves to a module-level ``sleep`` (for example one imported
    from ``time``) rather than to this method - confirm against the file's
    imports.

    Args:
        start: First argument forwarded to ``fetch_util.randnum``.
        end: Second argument forwarded to ``fetch_util.randnum``.
    """
    delay = fetch_util.randnum(start, end)
    sleep(delay)
def main():
    """Crawl the configured Wandoujia category pages and write the results.

    Steps, in order:

    1. Expand each top-level category URL via ``MainCategoryUrls.geturls``
       and flatten the results with ``fetch_util.liststolist``.
    2. For every resulting URL, list its page URLs through
       ``ParentPage.getpageurls``.
    3. For every page, read the main category (``'unknow'`` when falsy) and
       the URLs on that page, split them with
       ``fetch_util.task_dispatch(currenturls, 10)`` and run ``func`` on each
       piece in its own thread, waiting for all threads to finish.
    4. Write a separator line to the output file after each page's threads
       finish.

    NOTE(review): the source was whitespace-collapsed, so the nesting of the
    trailing separator write, the "has finish" log and the final sleep was
    reconstructed. Here the separator is written once per processed page, and
    the log plus sleep run once per category URL - confirm against the
    original layout.
    """
    # Output file location.
    outfile = '/Users/Lan/TestDir/out/wandoujia.txt'
    # Top-level category URLs to crawl. Earlier revisions read these values
    # from ``wandoujiaconfig`` or listed categories 382, 388, 402 and 392.
    specurls = ['http://www.wandoujia.com/category/396']

    fetch_util.print_log('update request urls ...')
    allurls = [MainCategoryUrls(specurl).geturls() for specurl in specurls]
    urls = fetch_util.liststolist(allurls)
    fetch_util.print_log('update request urls finished, len: ' + str(len(urls)))

    for url in urls:
        requesturls = ParentPage(url=url).getpageurls()
        for requesturl in requesturls or ():
            page = ParentPageUrl(requesturl)
            # Main category of the current page.
            maincategory = page.getcategory() or 'unknow'
            # URLs that can be requested from the current page.
            currenturls = page.getcurpageurls()
            if not currenturls:
                continue

            # One worker thread per dispatched piece of the URL list.
            workers = []
            for task in fetch_util.task_dispatch(currenturls, 10):
                worker = threading.Thread(
                    target=func, args=(task, maincategory, outfile))
                worker.start()
                workers.append(worker)
            for worker in workers:
                worker.join()

            # Separator line between result batches.
            fetch_util.write('\r\n------ i am line -----\r\n', outfile)

        fetch_util.print_log("has finish: " + url)
        sleep(fetch_util.randnum(10, 30))