Example 1
def prod_search_job():
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "prod_search_job start----")
    job = sched.get_job(job_id="prod_search_job")
    next = int(job.next_run_time.strftime('%Y%m%d%H%M%S'))
    clasName = tbHttp.TBProdSearchCrawer.__name__
    count = BaseHttpGet.getHttpGetPoolCount(clasName)
    # If the queue is empty, push a new batch into it
    if count < 10:
        pass
    # Start 40 worker threads for processing
    tpool = MyThreadPool.MyThreadPool(40)
    for i in range(10000):
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if next - now < 3:
            print(
                time.strftime("%d %H:%M:%S", time.localtime(time.time())), i,
                "prod_search_job 结束--------------------------------------------------------"
            )
            return
        tpool.callInThread(do_http, clasName)
    # If we finished early, push a new batch of queries for the next run
    qlist = tbDao.random_prod_name()
    city = chinaCity.getFristCity()
    for q in qlist:
        if tbpool.ProdQuerykeyExist(q):
            continue
        prod = tbHttp.TBProdSearchCrawer()
        prod.pageno = 1
        prod.q = q
        prod.city = city
        BaseHttpGet.pushHttpGet(prod)
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "prod_search_job finished early----")
Example 2
def update_prod_item_job():
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "update_prod_item_job start----")
    job = sched.get_job(job_id="update_prod_item_job")
    next = int(job.next_run_time.strftime('%Y%m%d%H%M%S'))
    clasName = tbHttp.TBProdItemCrawer.__name__
    count = BaseHttpGet.getHttpGetPoolCount(clasName)
    # Start worker threads for processing
    tpool = MyThreadPool.MyThreadPool(10)
    for i in range(2000):
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if next - now < 7:
            print(
                time.strftime("%d %H:%M:%S", time.localtime(time.time())), i,
                "update_shop_create_time_job 结束--------------------------------------------------------"
            )
            return
        tpool.callInThread(do_http, clasName)
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "update_prod_item_job finished early----")
    # If the queue is empty, push a new batch into it
    list = models.TTbShopProd.objects.filter(shopid=None)[0:5000]
    for p in list:
        http = tbHttp.TBProdItemCrawer()
        http.product_id = p.product_id
        http.uid = p.uid
        BaseHttpGet.pushHttpGet(http)
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "update_prod_item_job finished early----")
Example 3
def update_shop_create_time_job():
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "update_shop_create_time_job start----")
    job = sched.get_job(job_id="update_shop_create_time_job")
    next = int(job.next_run_time.strftime('%Y%m%d%H%M%S'))
    clasName = tbHttp.TBShopCreateTimeCrawer.__name__
    count = BaseHttpGet.getHttpGetPoolCount(clasName)

    # If the queue is empty, push a new batch into it
    if count == 0:
        list = models.TTbShop.objects.filter(shop_createtime=None)[0:5000]
        for shop in list:
            http = tbHttp.TBShopCreateTimeCrawer()
            http.shopid = shop.shopid
            http.isProxy = True
            BaseHttpGet.pushHttpGet(http)
    # Start worker threads for processing
    tpool = MyThreadPool.MyThreadPool(5)
    for i in range(10000):
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if next - now < 10:
            print(
                time.strftime("%d %H:%M:%S", time.localtime(time.time())), i,
                "update_shop_create_time_job 结束--------------------------------------------------------"
            )
            return
        tpool.callInThread(do_http, clasName)
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "update_shop_create_time_job finished early----")
Example 4
def do_http(clasName=None):
    # Pop one pending request of the given crawler class from the queue.
    http = BaseHttpGet.popHttpGet(clasName)
    if http is None:
        return
    # Run the request; if it failed, push it back for a later retry.
    if not http.run():
        BaseHttpGet.pushHttpGet(http)
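do_http is the unit of work every job hands to the thread pool: pop one queued request for the given crawler class, run it, and push it back if it failed. BaseHttpGet's pool is not shown in these examples, so the sketch below reproduces the same pop/run/requeue pattern with a plain queue.Queue and a made-up Task class, purely as an illustration.

import queue

task_queue = queue.Queue()

class Task:
    """Hypothetical stand-in for a BaseHttpGet request object."""
    def __init__(self, n):
        self.n = n
        self.attempts = 0

    def run(self):
        # Pretend every task fails once before succeeding.
        self.attempts += 1
        return self.attempts > 1

def do_task():
    try:
        task = task_queue.get_nowait()
    except queue.Empty:
        return
    if not task.run():
        task_queue.put(task)  # failed: requeue for a later retry

for i in range(3):
    task_queue.put(Task(i))
for _ in range(10):
    do_task()
print("queue drained:", task_queue.empty())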
Example 5
    def nextQuery(self):
        ncity = chinaCity.getNextCity(self.city)
        if ncity is not None:
            self.city = ncity
            self.id = None  # id must be reset to None, otherwise it will not be re-queued for execution
            self.pageno = 1
            BaseHttpGet.pushHttpGet(self)

        # Once finished, record the query key in the cache; the same key will not be queried again within 3 days
        tbpool.ShopQuerykeyExist(self.q)
Example 6
    def nextQuery(self):
        n_c = chinaCity.getNextCity(self.city)
        if n_c is not None:
            self.city = n_c
            self.id = None  # id must be reset to None, otherwise it will not be re-queued for execution
            self.pageno = 1
            BaseHttpGet.pushHttpGet(self)
        # Once finished, record the query keyword in the cache
        tbpool.ProdQuerykeyExist(self.q)
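Both nextQuery variants walk the city list one request at a time: fetch the city after the current one, reset the id and page number, and requeue the crawler for that city. chinaCity.getNextCity is not shown in these examples; a plausible shape for such a helper, with a made-up city list, is sketched below.

CITIES = ["beijing", "shanghai", "guangzhou", "shenzhen"]

def get_next_city(current):
    # Return the city after `current`, or None once the list is exhausted.
    try:
        idx = CITIES.index(current)
    except ValueError:
        return None
    return CITIES[idx + 1] if idx + 1 < len(CITIES) else None

assert get_next_city("beijing") == "shanghai"
assert get_next_city("shenzhen") is None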
Example 7
def init_shop_search():
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "init_shop_search start----")
    cityl = chinaCity.listAllCity()
    cat = tbcategory.getFristQueryKey()
    count = 0
    for city in cityl:
        tshop = tbHttp.TBShopSearchCrawer()
        tshop.pageno = 1
        tshop.q = cat
        tshop.city = city
        tshop.id = "shop_search," + cat + city
        BaseHttpGet.pushHttpGet(tshop)
        count = count + 1

    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "init_shop_search end----", count)
Example 8
    def parse(self, response):
        try:
            print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
                  "start crawling site (", self.url, ")-----")
            soup = BeautifulSoup(response.content.decode("utf-8", 'replace'),
                                 "lxml")
            trs = soup.find_all("tr", class_='tr3 t_one', align="center")
            for tr in trs:
                # Skip pinned ("置顶") posts
                if str(tr).find("置顶") > 0:
                    continue
                namelink = tr.find('h3').find('a')
                if namelink is None:
                    continue
                pub_id = namelink.get("id")
                mv = models.XP1024Movie.objects.filter(pub_id=pub_id).first()
                # Check whether the movie already exists; if so, skip further processing
                if mv is None:
                    mv = models.XP1024Movie()
                    mv.pub_src = "1024xp"
                else:
                    continue
                mv.pub_type = self.pub_type
                mv.pub_day = tr.find('a', class_='f10').string.strip()
                mv.pub_name = namelink.string.strip()
                mv.pub_info_url = "/pw/" + namelink.get("href")
                mv.pub_id = pub_id
                catidx = mv.pub_name.find("] ")
                if catidx > 0:
                    mv.pub_name = mv.pub_name[catidx + 2:]

                # Crawl the detail page
                info = xp1024_info_crawer()
                info.mv = mv
                # Use pub_id as the queue id to avoid duplicates
                info.id = pub_id
                #print("pub_id",pub_id)
                BaseHttpGet.pushHttpGet(info)

        except Exception as e:
            print("xp1024_list_crawer数据解析出错:", e)
            return False
        return True
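The parse method above walks the forum's listing table with BeautifulSoup: select the result rows, skip pinned posts, and read the title link's id, text and href. A self-contained sketch of that row-walking pattern follows, with a tiny made-up HTML fragment and the standard html.parser backend in place of lxml.

from bs4 import BeautifulSoup

html = """
<table>
  <tr class="tr3 t_one" align="center"><td><h3><a id="p1" href="html_1.html">Movie A</a></h3></td></tr>
  <tr class="tr3 t_one" align="center"><td><h3><a id="p2" href="html_2.html">Movie B</a></h3></td></tr>
</table>
"""

soup = BeautifulSoup(html, "html.parser")
for tr in soup.find_all("tr", class_="tr3 t_one", align="center"):
    link = tr.find("h3").find("a")
    print(link.get("id"), link.string.strip(), link.get("href"))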
Example 9
def xp1024_search_job():
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())), "xp1024_search_job start----")
    for i in range(1, 10):
        http = xp1024Http.xp1024_list_crawer()
        http.url = xp_base_url + "/pw/thread.php?fid=5&page=" + str(i)
        http.pub_type = "亚洲无码"
        BaseHttpGet.pushHttpGet(http)
    for i in range(1, 10):
        http = xp1024Http.xp1024_list_crawer()
        http.url = xp_base_url + "/pw/thread.php?fid=22&page=" + str(i)
        http.pub_type = "日本骑兵"
        BaseHttpGet.pushHttpGet(http)
    for i in range(1, 10):
        http = xp1024Http.xp1024_list_crawer()
        http.url = xp_base_url + "/pw/thread.php?fid=7&page=" + str(i)
        http.pub_type = "歐美新片"
        BaseHttpGet.pushHttpGet(http)

    # Process the queued requests with a small thread pool
    tpool = MyThreadPool.MyThreadPool(2)
    for i in range(10000):
        if BaseHttpGet.getHttpGetPoolCount(xp1024Http.xp1024_list_crawer.__name__)==0:
            break
        tpool.callInThread(do_http, xp1024Http.xp1024_list_crawer.__name__)

    for i in range(10000):
        if BaseHttpGet.getHttpGetPoolCount(xp1024Http.xp1024_info_crawer.__name__) == 0:
            break
        tpool.callInThread(do_http,  xp1024Http.xp1024_info_crawer.__name__)
    tpool.wait()
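After seeding the three boards, the job drains the list-crawler queue and then the detail-crawler queue through the same small pool, finishing with tpool.wait(). MyThreadPool's exact API is not shown here, so the following is only a rough standard-library equivalent of that submit-until-empty-then-wait loop, with a plain queue.Queue standing in for the BaseHttpGet pool.

import queue
from concurrent.futures import ThreadPoolExecutor

work = queue.Queue()
for i in range(20):
    work.put(i)

def handle_one():
    # Pop one item and process it; return immediately if the queue is already empty.
    try:
        item = work.get_nowait()
    except queue.Empty:
        return
    _ = item * item  # placeholder for the real per-item work

with ThreadPoolExecutor(max_workers=2) as pool:
    futures = []
    for _ in range(100):  # bounded submit loop, like the range(10000) loops above
        if work.empty():
            break
        futures.append(pool.submit(handle_one))
    for f in futures:
        f.result()  # block until every submitted task finishes, like tpool.wait()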
Example 10
    def parse(self, response):
        try:
            rettext = response.text
            # A successful response always contains the string below
            if rettext.find("g_page_config =") == -1:
                if CRA_COUNT % 50 == 0:
                    print("数据抓取错误:", rettext, CRA_COUNT)
                return False

            g_pagestr = stringExt.StringExt(rettext).extractLine(
                "g_page_config", "pageName").ExtStr("g_page_config = ").str()
            if g_pagestr is None:
                return False
            g_pagestr = g_pagestr[:len(g_pagestr) - 1]
            # If there is no shopItems key, this query has been fully crawled
            if g_pagestr.find("shopItems") == -1:
                self.nextQuery()
                return True
            page = json.loads(g_pagestr)
            items = page["mods"]["shoplist"]["data"]["shopItems"]
            itemcount = 0
            sesscount = 0
            for item in items:
                itemcount = itemcount + 1
                shopurl = item["shopUrl"]
                shopid = self.paseInt(shopurl[shopurl.find("shop") +
                                              4:shopurl.find(".taobao")])
                # Skip if the shop id is already in the cache
                if tbpool.ShopIdExist(shopid):
                    continue
                # Check the database: skip if the shop already exists; otherwise create a new record so existing data is not overwritten
                shop = models.TTbShop.objects.filter(shopid=shopid).first()
                if shop is None:
                    sesscount = sesscount + 1
                    shop = models.TTbShop()
                    shop.shopid = shopid
                else:
                    continue
                shop.mainpage = shopurl
                shop.uid = self.paseInt(item["uid"])
                shop.nick = item["nick"]
                shop.user_rate_url = item['userRateUrl']
                shop.title = item['title']
                # shop.shop_score = self.paseInt(item['totalsold'])
                shop.prod_count = self.paseInt(item['procnt'])
                shop.shop_area = item['provcity']
                if item["isTmall"] is True:
                    shop.shop_type = "TM"
                else:
                    shop.shop_type = "TB"
                shop.save()
            # If the whole page yielded nothing new, skip ahead 10 pages
            if sesscount == 0:
                self.pageno = self.pageno + 10
            # Print progress once every 50 crawls
            if CRA_COUNT % 50 == 0:
                print("数据抓取结束", self.city, self.q, self.pageno, sesscount)
            # Queue the next page; past page 100, move on to the next keyword
            if self.pageno < 100:
                self.pageno = self.pageno + 1
                self.id = None  # id must be reset to None, otherwise it will not be re-queued for execution
                BaseHttpGet.pushHttpGet(self)
            else:
                self.nextQuery()
        except Exception as e:
            print("TBShopSearchCrawer数据解析出错:", e)
            return False
        return True
Example 11
    def parse(self, response):
        try:
            rettext = response.text
            # A successful response always contains the string below
            if rettext.find("g_page_config =") == -1:
                if CRA_COUNT % 50 == 0:
                    print("数据抓取错误:", rettext, CRA_COUNT)
                return False
            st = stringExt.StringExt(rettext)
            g_pagestr = st.extractLine(
                "g_page_config", "pageName").ExtStr("g_page_config = ").str()

            if g_pagestr is None:
                return False
            g_pagestr = g_pagestr[:len(g_pagestr) - 1]
            # If there is no auctions key, this query has been fully crawled
            if g_pagestr.find("auctions") == -1:
                self.nextQuery()
                return True
            page = json.loads(g_pagestr)
            items = page["mods"]["itemlist"]["data"]["auctions"]
            itemcount = 0
            sesscount = 0
            for item in items:
                itemcount = itemcount + 1
                product_id = item["nid"]
                product_id = self.paseInt(product_id)
                if product_id is None:
                    continue
                view_sales = 0
                if "view_sales" in item:
                    view_sales = stringExt.StringExt(
                        item["view_sales"]).ExtStr("", "人").int()
                if view_sales == 0:
                    continue
                # Skip if the product id is already in the cache
                if tbpool.prodIdExist(product_id):
                    continue
                # Check the database: skip if the product already exists; otherwise create a new record so existing data is not overwritten
                prod = models.TTbShopProd.objects.filter(
                    product_id=product_id).first()
                if prod is None:
                    prod = models.TTbShopProd()
                    prod.product_id = product_id
                else:
                    continue
                prod.prod_loc = item["item_loc"]
                prod.name = item["raw_title"]
                prod.uid = item["user_id"]
                prod.view_sales = view_sales
                prod.create_time = time.strftime("%Y%m%d",
                                                 time.localtime(time.time()))
                prod.update_time = time.strftime("%Y%m%d",
                                                 time.localtime(time.time()))
                prod.shop_price = self.paseInt(item["view_price"] * 100)
                prod.save()
                sesscount = sesscount + 1

            # If the whole page yielded nothing new, jump ahead 100 pages
            if sesscount == 0:
                self.pageno = self.pageno + 100
            # Print progress once every 50 crawls
            if CRA_COUNT % 50 == 0:
                print("数据抓取结束", self.city, self.q, self.pageno, sesscount)
            # Queue the next page; past page 100, move on to the next keyword
            if self.pageno < 100:
                self.pageno = self.pageno + 1
                self.id = None  # id must be reset to None, otherwise it will not be re-queued for execution
                BaseHttpGet.pushHttpGet(self)
            else:
                self.nextQuery()
        except Exception as e:
            print("TBProdSearchCrawer数据解析出错:", e)
            return False
        return True
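Both Taobao parse() methods above recover the g_page_config blob that the search page embeds as a JavaScript assignment, strip the trailing semicolon, and load it as JSON before walking mods.shoplist or mods.itemlist. Below is a standalone sketch of that extraction step, using a regular expression in place of the project's StringExt helper and a made-up HTML snippet; it is an illustration of the idea, not the original code.

import json
import re

html = '<script>g_page_config = {"mods": {"itemlist": {"data": {"auctions": [{"nid": "123"}]}}}};</script>'

# Capture everything between "g_page_config = " and the closing "};".
match = re.search(r'g_page_config\s*=\s*(\{.*\});', html, re.S)
if match:
    page = json.loads(match.group(1))
    print(page["mods"]["itemlist"]["data"]["auctions"])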