def prod_search_job():
    """Scheduled job: drain the TBProdSearchCrawer queue with a thread pool,
    then, if the queue empties before the next scheduled run, seed a fresh
    batch of product-name queries.

    Fix vs. original: local ``next`` renamed to ``next_run`` (shadowed the
    builtin). The dead ``if count < 10: pass`` branch is kept as a no-op to
    preserve behavior, but flagged below.
    """
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "prod_search_job 开始----")
    job = sched.get_job(job_id="prod_search_job")
    # Next scheduled run time encoded as an int YYYYMMDDHHMMSS.
    # NOTE(review): subtracting two such encoded ints is not a true time
    # difference across minute/hour boundaries, so the "< 3" deadline check
    # below is only approximate -- confirm this is acceptable.
    next_run = int(job.next_run_time.strftime('%Y%m%d%H%M%S'))
    clas_name = tbHttp.TBProdSearchCrawer.__name__
    count = BaseHttpGet.getHttpGetPoolCount(clas_name)
    # BUG(flagged, not fixed): the comment in the original said "if the queue
    # is (nearly) empty, add a batch", but the branch body was empty. Kept as
    # a no-op so behavior is unchanged; the actual refill happens after the
    # dispatch loop below.
    if count < 10:
        pass
    # Dispatch queue items to 40 worker threads.
    tpool = MyThreadPool.MyThreadPool(40)
    for i in range(10000):
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if next_run - now < 3:
            # Out of time before the next scheduled run: stop dispatching.
            print(
                time.strftime("%d %H:%M:%S", time.localtime(time.time())), i,
                "prod_search_job 结束--------------------------------------------------------"
            )
            return
        tpool.callInThread(do_http, clas_name)
    # Queue drained early: enqueue a new batch of random product-name queries.
    qlist = tbDao.random_prod_name()
    city = chinaCity.getFristCity()
    for q in qlist:
        # Skip keys that were queried recently (cached in tbpool).
        if tbpool.ProdQuerykeyExist(q):
            continue
        prod = tbHttp.TBProdSearchCrawer()
        prod.pageno = 1
        prod.q = q
        prod.city = city
        BaseHttpGet.pushHttpGet(prod)
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "prod_search_job 提前结束----")
def update_prod_item_job():
    """Scheduled job: drain the TBProdItemCrawer queue with a thread pool,
    then refill it with products whose ``shopid`` is still unset.

    Fixes vs. original: all log labels said "do_update_shop_create_time" /
    "update_shop_create_time_job" (copy-paste from the sibling job) -- they
    now name this job; locals ``next``/``list`` no longer shadow builtins.
    """
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "update_prod_item_job 开始----")
    job = sched.get_job(job_id="update_prod_item_job")
    # Next scheduled run time encoded as an int YYYYMMDDHHMMSS (approximate
    # deadline arithmetic -- see prod_search_job).
    next_run = int(job.next_run_time.strftime('%Y%m%d%H%M%S'))
    clas_name = tbHttp.TBProdItemCrawer.__name__
    count = BaseHttpGet.getHttpGetPoolCount(clas_name)
    # Dispatch queue items to 10 worker threads.
    tpool = MyThreadPool.MyThreadPool(10)
    for i in range(2000):
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if next_run - now < 7:
            # Out of time before the next scheduled run: stop dispatching.
            print(
                time.strftime("%d %H:%M:%S", time.localtime(time.time())), i,
                "update_prod_item_job 结束--------------------------------------------------------"
            )
            return
        tpool.callInThread(do_http, clas_name)
    # NOTE(review): the original printed this "early finish" line both here
    # and again at the end of the function; both prints are preserved.
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "update_prod_item_job 提前结束----")
    # Refill the queue with products that have no shop assigned yet.
    prods = models.TTbShopProd.objects.filter(shopid=None)[0:5000]
    for p in prods:
        http = tbHttp.TBProdItemCrawer()
        http.product_id = p.product_id
        http.uid = p.uid
        BaseHttpGet.pushHttpGet(http)
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "update_prod_item_job 提前结束----")
def update_shop_create_time_job():
    """Scheduled job: refill (when empty) and drain the TBShopCreateTimeCrawer
    queue, which fetches creation times for shops that lack one.

    Fix vs. original: locals ``next``/``list`` renamed so they no longer
    shadow builtins. Log strings are left exactly as they were.
    """
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "do_update_shop_create_time 开始----")
    job = sched.get_job(job_id="update_shop_create_time_job")
    # Next scheduled run time encoded as an int YYYYMMDDHHMMSS (approximate
    # deadline arithmetic -- see prod_search_job).
    next_run = int(job.next_run_time.strftime('%Y%m%d%H%M%S'))
    clas_name = tbHttp.TBShopCreateTimeCrawer.__name__
    count = BaseHttpGet.getHttpGetPoolCount(clas_name)
    # If the queue is empty, seed it with shops missing a creation time.
    if count == 0:
        shops = models.TTbShop.objects.filter(shop_createtime=None)[0:5000]
        for shop in shops:
            http = tbHttp.TBShopCreateTimeCrawer()
            http.shopid = shop.shopid
            http.isProxy = True
            BaseHttpGet.pushHttpGet(http)
    # Dispatch queue items to 5 worker threads.
    tpool = MyThreadPool.MyThreadPool(5)
    for i in range(10000):
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if next_run - now < 10:
            # Out of time before the next scheduled run: stop dispatching.
            print(
                time.strftime("%d %H:%M:%S", time.localtime(time.time())), i,
                "update_shop_create_time_job 结束--------------------------------------------------------"
            )
            return
        tpool.callInThread(do_http, clas_name)
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "do_update_shop_create_time 提前结束----")
def do_http(clasName=None):
    """Pop one pending crawler of class ``clasName`` from the queue and run it.

    A crawler whose ``run()`` returns falsy is pushed back onto the queue so
    another worker can retry it. Nothing happens when the queue is empty.
    """
    task = BaseHttpGet.popHttpGet(clasName)
    if task is None:
        return
    if not task.run():
        # Failed fetch/parse: requeue for a later retry.
        BaseHttpGet.pushHttpGet(task)
def nextQuery(self):
    """Advance this shop-search query to the next city and re-queue itself.

    Resets pagination and clears the queue id before pushing, so the same
    crawler object is accepted back into the run queue.
    """
    ncity = chinaCity.getNextCity(self.city)
    if ncity is not None:
        self.city = ncity
        # id must be cleared, otherwise the object is not re-added to the run queue
        self.id = None
        self.pageno = 1
        BaseHttpGet.pushHttpGet(self)
    # When the query is finished, put the query key into the cache so the same
    # key is not queried again within 3 days.
    # NOTE(review): as written this runs on every call, not only when the city
    # list is exhausted -- confirm ShopQuerykeyExist is safe to call while the
    # query is still in progress.
    tbpool.ShopQuerykeyExist(self.q)
def nextQuery(self):
    """Advance this product-search query to the next city and re-queue itself.

    Resets pagination and clears the queue id before pushing, so the same
    crawler object is accepted back into the run queue.
    """
    n_c = chinaCity.getNextCity(self.city)
    if n_c is not None:
        self.city = n_c
        # id must be cleared, otherwise the object is not re-added to the run queue
        self.id = None
        self.pageno = 1
        BaseHttpGet.pushHttpGet(self)
    # When the query is finished, put the query keyword into the cache.
    # NOTE(review): as written this runs on every call, not only when the city
    # list is exhausted -- confirm ProdQuerykeyExist is safe to call while the
    # query is still in progress.
    tbpool.ProdQuerykeyExist(self.q)
    return
def init_shop_search():
    """Seed the queue with one shop search per city for the first category key."""
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "init_shop_search 开始----")
    cities = chinaCity.listAllCity()
    keyword = tbcategory.getFristQueryKey()
    count = 0
    for city in cities:
        crawler = tbHttp.TBShopSearchCrawer()
        crawler.pageno = 1
        crawler.q = keyword
        crawler.city = city
        # Deterministic id de-duplicates the same (keyword, city) job in the queue.
        crawler.id = "shop_search," + keyword + city
        BaseHttpGet.pushHttpGet(crawler)
        count += 1
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "init_shop_search 结束----", count)
def parse(self, response):
    """Parse one 1024-forum list page and enqueue an info crawler per new movie.

    Returns True when the page was processed (already-seen rows are skipped),
    False when parsing raised.
    """
    try:
        print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "抓取网站(", self.url, ")开始-----")
        soup = BeautifulSoup(response.content.decode("utf-8", 'replace'), "lxml")
        trs = soup.find_all("tr", class_='tr3 t_one', align="center")
        for tr in trs:
            # Skip pinned ("置顶") threads.
            if (str(tr).find("置顶") > 0):
                continue
            namelink = tr.find('h3').find('a')
            if namelink is None:
                continue
            pub_id = namelink.get("id")
            mv = models.XP1024Movie.objects.filter(pub_id=pub_id).first()
            # Only create a record when the movie does not exist yet; existing
            # movies are skipped entirely.
            if mv is None:
                mv = models.XP1024Movie()
                mv.pub_src = "1024xp"
            else:
                continue
            mv.pub_type = self.pub_type
            mv.pub_day = tr.find('a', class_='f10').string.strip()
            mv.pub_name = namelink.string.strip()
            mv.pub_info_url = "/pw/" + namelink.get("href")
            mv.pub_id = pub_id
            # Strip a leading "[category] " prefix from the title, if present.
            catidx = mv.pub_name.find("] ")
            if catidx > 0:
                mv.pub_name = mv.pub_name[catidx + 2:]
            # Enqueue the detail crawler; reusing pub_id as the queue id
            # avoids duplicate detail fetches.
            info = xp1024_info_crawer()
            info.mv = mv
            info.id = pub_id
            #print("pub_id",pub_id)
            BaseHttpGet.pushHttpGet(info)
    except Exception as e:
        print("xp1024_list_crawer数据解析出错:", e)
        return False
    return True
    pass
def xp1024_search_job():
    """Enqueue the first 9 list pages of each forum section, then drain the
    list-crawler queue followed by the info-crawler queue with 2 workers."""
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "xp1024_search_job 开始----")
    # (forum id, section label) pairs; pages 1..9 of each section are queued.
    sections = [
        (5, "亚洲无码"),
        (22, "日本骑兵"),
        (7, "歐美新片"),
    ]
    for fid, pub_type in sections:
        for page in range(1, 10):
            crawler = xp1024Http.xp1024_list_crawer()
            crawler.url = xp_base_url + "/pw/thread.php?fid=" + str(fid) + "&page=" + str(page)
            crawler.pub_type = pub_type
            BaseHttpGet.pushHttpGet(crawler)
    # Drain the list queue first, then the info queue it feeds.
    tpool = MyThreadPool.MyThreadPool(2)
    queue_names = (
        xp1024Http.xp1024_list_crawer.__name__,
        xp1024Http.xp1024_info_crawer.__name__,
    )
    for name in queue_names:
        for _ in range(10000):
            if BaseHttpGet.getHttpGetPoolCount(name) == 0:
                break
            tpool.callInThread(do_http, name)
    tpool.wait()
def parse(self, response):
    """Parse one Taobao shop-search result page and store new shops.

    Extracts the embedded ``g_page_config`` JSON, saves shops not yet in the
    cache/DB, then either re-queues the next page or advances to the next
    query via nextQuery(). Returns True on success, False on parse failure.
    """
    try:
        rettext = response.text
        # A successful response always contains this marker string.
        if rettext.find("g_page_config =") == -1:
            # CRA_COUNT is presumably a module-level request counter used to
            # throttle logging -- TODO confirm where it is incremented.
            if CRA_COUNT % 50 == 0:
                print("数据抓取错误:", rettext, CRA_COUNT)
            return False
        g_pagestr = stringExt.StringExt(rettext).extractLine(
            "g_page_config", "pageName").ExtStr("g_page_config = ").str()
        if g_pagestr is None:
            return False
        # Drop the trailing character (the line's terminating punctuation).
        g_pagestr = g_pagestr[:len(g_pagestr) - 1]
        # No "shopItems" means the result pages are exhausted for this query.
        if g_pagestr.find("shopItems") == -1:
            self.nextQuery()
            return True
        page = json.loads(g_pagestr)
        items = page["mods"]["shoplist"]["data"]["shopItems"]
        itemcount = 0
        sesscount = 0
        for item in items:
            itemcount = itemcount + 1
            shopurl = item["shopUrl"]
            # Shop id is the number between "shop" and ".taobao" in the URL.
            shopid = self.paseInt(shopurl[shopurl.find("shop") + 4:shopurl.find(".taobao")])
            # Skip shops already seen in the cache.
            if tbpool.ShopIdExist(shopid):
                continue
            # Only create a record when the shop does not exist in the DB;
            # existing rows are skipped so old data is not overwritten.
            shop = models.TTbShop.objects.filter(shopid=shopid).first()
            if shop is None:
                sesscount = sesscount + 1
                shop = models.TTbShop()
                shop.shopid = shopid
            else:
                continue
            shop.mainpage = shopurl
            shop.uid = self.paseInt(item["uid"])
            shop.nick = item["nick"]
            shop.user_rate_url = item['userRateUrl']
            shop.title = item['title']
            # shop.shop_score = self.paseInt(item['totalsold'])
            shop.prod_count = self.paseInt(item['procnt'])
            shop.shop_area = item['provcity']
            if item["isTmall"] is True:
                shop.shop_type = "TM"
            else:
                shop.shop_type = "TB"
            shop.save()
            pass
        # If the whole page held nothing new, skip ahead 10 pages.
        if sesscount == 0:
            self.pageno = self.pageno + 10
        # Throttled progress log (original comment said "every 20" but the
        # modulus is 50 -- discrepancy left as found).
        if CRA_COUNT % 50 == 0:
            print("数据抓取结束", self.city, self.q, self.pageno, sesscount)
        # Re-queue the next page, or move to the next query after page 100.
        if self.pageno < 100:
            self.pageno = self.pageno + 1
            # id must be cleared, otherwise the object is not re-added to the run queue
            self.id = None
            BaseHttpGet.pushHttpGet(self)
        else:
            self.nextQuery()
    except Exception as e:
        print("TBShopSearchCrawer数据解析出错:", e)
        return False
    return True
    pass
def parse(self, response):
    """Parse one Taobao product-search result page and store new products.

    Extracts the embedded ``g_page_config`` JSON, saves products not yet in
    the cache/DB, then either re-queues the next page or advances to the next
    query via nextQuery(). Returns True on success, False on parse failure.
    """
    try:
        rettext = response.text
        # A successful response always contains this marker string.
        if rettext.find("g_page_config =") == -1:
            # CRA_COUNT is presumably a module-level request counter used to
            # throttle logging -- TODO confirm where it is incremented.
            if CRA_COUNT % 50 == 0:
                print("数据抓取错误:", rettext, CRA_COUNT)
            return False
        st = stringExt.StringExt(rettext)
        g_pagestr = st.extractLine(
            "g_page_config", "pageName").ExtStr("g_page_config = ").str()
        if g_pagestr is None:
            return False
        # Drop the trailing character (the line's terminating punctuation).
        g_pagestr = g_pagestr[:len(g_pagestr) - 1]
        # No "auctions" means the result pages are exhausted for this query.
        if g_pagestr.find("auctions") == -1:
            self.nextQuery()
            return True
        page = json.loads(g_pagestr)
        items = page["mods"]["itemlist"]["data"]["auctions"]
        itemcount = 0
        sesscount = 0
        for item in items:
            itemcount = itemcount + 1
            product_id = item["nid"]
            product_id = self.paseInt(product_id)
            if product_id is None:
                continue
            # Sales count is shown like "123人..." -- take the number before "人".
            view_sales = 0
            if "view_sales" in item:
                view_sales = stringExt.StringExt(
                    item["view_sales"]).ExtStr("", "人").int()
            if view_sales == 0:
                continue
            # Skip products already seen in the cache.
            if tbpool.prodIdExist(product_id):
                continue
            # Only create a record when the product does not exist in the DB;
            # existing rows are skipped so old data is not overwritten.
            prod = models.TTbShopProd.objects.filter(
                product_id=product_id).first()
            if prod is None:
                prod = models.TTbShopProd()
                prod.product_id = product_id
            else:
                continue
            prod.prod_loc = item["item_loc"]
            prod.name = item["raw_title"]
            prod.uid = item["user_id"]
            prod.view_sales = view_sales
            prod.create_time = time.strftime("%Y%m%d", time.localtime(time.time()))
            prod.update_time = time.strftime("%Y%m%d", time.localtime(time.time()))
            # Price is stored in cents.
            prod.shop_price = self.paseInt(item["view_price"] * 100)
            prod.save()
            sesscount = sesscount + 1
            pass
        # If the whole page held nothing new, jump ahead 100 pages.
        # NOTE(review): pageno + 100 guarantees the "< 100" check below fails,
        # so this effectively forces nextQuery() -- confirm that is intended.
        if sesscount == 0:
            self.pageno = self.pageno + 100
        # Throttled progress log (original comment said "every 20" but the
        # modulus is 50 -- discrepancy left as found).
        if CRA_COUNT % 50 == 0:
            print("数据抓取结束", self.city, self.q, self.pageno, sesscount)
        # Re-queue the next page, or move to the next query after page 100.
        if self.pageno < 100:
            self.pageno = self.pageno + 1
            # id must be cleared, otherwise the object is not re-added to the run queue
            self.id = None
            BaseHttpGet.pushHttpGet(self)
        else:
            self.nextQuery()
    except Exception as e:
        print("TBProdSearchCrawer数据解析出错:", e)
        return False
    return True
    pass