Example 1
def getCurrIp():
    L.acquire()
    global currip
    global last_updata_ip
    try:
        if len(currip) > 2 and (time.time() - last_updata_ip) < 60 * 10:
            return currip
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.3'
        }
        print("getCurrIp--")
        # Suppress urllib3 insecure-request warnings
        requests.packages.urllib3.disable_warnings()
        url = "https://ip.cn/"
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'

        ip = stringExt.StringExt(r.text).ExtStr("IP:<code>", "</code>").str()
        if len(ip) > 5 and len(ip) < 20:
            currip = ip
            last_updata_ip = time.time()
            print("currip:", currip)
        pass
    except Exception:
        pass
    finally:
        L.release()
    return currip
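
getCurrIp relies on a module-level lock L and the globals currip / last_updata_ip that the snippet does not show. Below is a minimal sketch of the surrounding module state it appears to assume; only the names come from the snippet, the imports and initial values are guesses.

# Assumed module-level setup for getCurrIp; names taken from the snippet,
# initial values and imports are guesses.
import threading
import time

import requests
# stringExt is this project's own helper module; assumed to be importable here.
import stringExt

L = threading.Lock()    # guards the cached IP
currip = ""             # last IP text fetched from https://ip.cn/
last_updata_ip = 0      # timestamp of the last refresh (name kept as in the snippet)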
Example 2
    def parse(self, response):
        try:
            html = response.text
            if html.find("microscope-data") < 1:
                print("Taobao product page scrape failed (1)")
                time.sleep(2)
                return False
            # "您查看的宝贝不存在" is the page's "the item you are viewing no longer exists" notice
            if html.find("您查看的宝贝不存在") > 1:
                print("Taobao product page scrape failed (2): item no longer exists")
                return False
            st = stringExt.StringExt(html)
            shopid = st.extractLine("microscope-data",
                                    "meta").ExtStr("shopId=", ";").int()
            if shopid is None:
                print("淘宝的商品主页数据抓取失败3")
                models.TTbShopProd.objects.filter(
                    product_id=self.product_id).update(shopid=-1)
                return False
            models.TTbShopProd.objects.filter(uid=self.uid).update(
                shopid=shopid)

            print("淘宝的商品主页数据抓取成功", self.product_id)
        except Exception as e:
            print("TBUserRateCrawer数据解析出错:", e)
            return False
        return True
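
All of these examples lean on the project's own stringExt.StringExt helper, chaining extractLine / ExtStr and finishing with str() / int() / float(). That module is internal to this codebase, so the sketch below is only a guess at its semantics (including the beg= offset argument and the None results the callers test for), written to make the chained calls in the examples easier to read.

# Minimal sketch of a fluent substring extractor in the spirit of stringExt.StringExt.
# Every method name and behavior here is assumed, not taken from the real module.
class StringExtSketch:
    def __init__(self, text):
        self.text = text or ""

    def extractLine(self, beg_key, end_key=None, beg=0):
        # Narrow the working text to the region starting at beg_key (searched
        # from offset beg) and ending at end_key, mirroring how the examples use it.
        start = self.text.find(beg_key, max(beg, 0))
        if start == -1:
            return StringExtSketch("")
        end = self.text.find(end_key, start + len(beg_key)) if end_key else -1
        return StringExtSketch(self.text[start:end] if end != -1 else self.text[start:])

    def ExtStr(self, beg_key, end_key=None):
        # Keep the substring between beg_key and end_key.
        start = self.text.find(beg_key)
        if start == -1:
            return StringExtSketch("")
        start += len(beg_key)
        end = self.text.find(end_key, start) if end_key else -1
        return StringExtSketch(self.text[start:end] if end != -1 else self.text[start:])

    def str(self):
        return self.text or None

    def int(self):
        try:
            return int(self.text.strip())
        except ValueError:
            return None

    def float(self):
        try:
            return float(self.text.strip())
        except ValueError:
            return None


# Chained the same way as shopid in Example 2 (HTML fragment invented for the demo):
html = '<meta name="microscope-data" content="shopId=12345;userId=678"><meta charset="utf-8">'
print(StringExtSketch(html).extractLine("microscope-data", "meta").ExtStr("shopId=", ";").int())  # 12345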
Example 3
    def parse(self, response):
        try:
            shopCreatetime = stringExt.StringExt(response.text).ExtStr(
                "starts\":\"", "\"}").str()
            if shopCreatetime is None:
                print("Failed to extract the shop creation time", response.text)
                return False
            # Update the record in the database
            shop = models.TTbShop.objects.get(shopid=self.shopid)
            shop.shop_createtime = shopCreatetime
            shop.save()
            print("Shop creation time fetched successfully", self.shopid)
        except Exception as e:
            print("TBShopCreateTimeCrawer parse error:", e)
            return False
        return True
Example 4
    def parse(self, response):
        try:
            rettext = response.text
            # A successful response always contains the string below
            if rettext.find("g_page_config =") == -1:
                if CRA_COUNT % 50 == 0:
                    print("Scrape error:", rettext, CRA_COUNT)
                return False

            g_pagestr = stringExt.StringExt(rettext).extractLine(
                "g_page_config", "pageName").ExtStr("g_page_config = ").str()
            if g_pagestr is None:
                return False
            g_pagestr = g_pagestr[:len(g_pagestr) - 1]
            # If there is no shopItems key, this query has been exhausted
            if g_pagestr.find("shopItems") == -1:
                self.nextQuery()
                return True
            page = json.loads(g_pagestr)
            items = page["mods"]["shoplist"]["data"]["shopItems"]
            itemcount = 0
            sesscount = 0
            for item in items:
                itemcount = itemcount + 1
                shopurl = item["shopUrl"]
                shopid = self.paseInt(shopurl[shopurl.find("shop") +
                                              4:shopurl.find(".taobao")])
                # Skip shops already present in the in-memory cache
                if tbpool.ShopIdExist(shopid):
                    continue
                # Skip shops already in the database; otherwise create a new record so existing data is never overwritten
                shop = models.TTbShop.objects.filter(shopid=shopid).first()
                if shop is None:
                    sesscount = sesscount + 1
                    shop = models.TTbShop()
                    shop.shopid = shopid
                else:
                    continue
                shop.mainpage = shopurl
                shop.uid = self.paseInt(item["uid"])
                shop.nick = item["nick"]
                shop.user_rate_url = item['userRateUrl']
                shop.title = item['title']
                # shop.shop_score = self.paseInt(item['totalsold'])
                shop.prod_count = self.paseInt(item['procnt'])
                shop.shop_area = item['provcity']
                if item["isTmall"] is True:
                    shop.shop_type = "TM"
                else:
                    shop.shop_type = "TB"
                shop.save()
            pass
            # If the whole page yielded nothing new, skip ahead 10 pages
            if sesscount == 0:
                self.pageno = self.pageno + 10
            # Log progress only on every 50th request
            if CRA_COUNT % 50 == 0:
                print("Page scrape finished", self.city, self.q, self.pageno, sesscount)
            # Queue the next page; beyond 100 pages, move on to the next keyword
            if self.pageno < 100:
                self.pageno = self.pageno + 1
                self.id = None  # id must be cleared, otherwise the task is not re-queued
                BaseHttpGet.pushHttpGet(self)
            else:
                self.nextQuery()
        except Exception as e:
            print("TBShopSearchCrawer数据解析出错:", e)
            return False
        return True
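
Example 4 derives the numeric shop id by slicing item["shopUrl"] between "shop" and ".taobao" before handing it to self.paseInt. A standalone check of that slicing on a made-up URL (the URL below is illustrative, not taken from the source):

# Illustrative only: the slicing used above, on a hypothetical shop URL.
shopurl = "//shop12345678.taobao.com/index.htm"   # assumed shape of item["shopUrl"]
raw = shopurl[shopurl.find("shop") + 4:shopurl.find(".taobao")]
print(raw)       # "12345678"
print(int(raw))  # 12345678, what self.paseInt() is expected to produce here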
Example 5
    def parse(self, response):
        try:
            html = response.text
            # "卖家信用" ("seller credit") appears on every valid seller profile page
            if html.find("卖家信用") < 1:
                print("Failed to fetch the Taobao seller profile page", html)
                return False
            st = stringExt.StringExt(html)
            shopArea = st.extractLine("<li>所在地区").ExtStr("所在地区:",
                                                         "</li>").str()
            mainCname = st.extractLine(
                "<span id=\"chart-name\" class=\"data\">").ExtStr(
                    "class=\"data\">", "</span>").str()
            if mainCname is None:
                mainCname = st.extractLine("<li>当前主营").ExtStr(
                    "target=\"_blank\">", "</a>").str()

            # Number of raters in the last six months, e.g. "共<span>5056965</span>人"
            commentCount = st.extractLine("共<span>", "</span>人").ExtStr(
                "<span>", "</span>").int()

            # The beg= anchors below are the page's Chinese labels for the
            # item-as-described, seller-service and logistics-quality scores
            itemScore = st.extractLine("<em title",
                                       "class=\"count\"",
                                       beg=html.find("宝贝与描述相符")).ExtStr(
                                           "class=\"count\">",
                                           "</em>").float()
            serviceScore = st.extractLine("<em title",
                                          "class=\"count\"",
                                          beg=html.find("卖家的服务态度")).ExtStr(
                                              "class=\"count\">",
                                              "</em>").float()
            deliveryScore = st.extractLine("<em title",
                                           "class=\"count\"",
                                           beg=html.find("物流服务的质量")).ExtStr(
                                               "class=\"count\">",
                                               "</em>").float()

            sellerCredit = st.extractLine(
                "<span id=\"chart-num\" class=\"data\">").ExtStr(
                    "class=\"data\">", "</span>").int()
            if sellerCredit is None:
                sellerCredit = st.extractLine(
                    "<div class=\"list\">卖家信用").ExtStr("卖家信用:").int()

            # Look up the shop in the database; create a new record if it does not exist yet
            shop = models.TTbShop.objects.filter(shopid=self.shopid).first()
            if shop is None:
                shop = models.TTbShop()
                shop.shopid = self.shopid
            if shopArea is not None:
                shop.shop_area = shopArea
            if mainCname is not None:
                shop.main_cname = mainCname
            if commentCount is not None:
                shop.comment_count = commentCount
            if sellerCredit is not None:
                shop.seller_credit = sellerCredit
            if itemScore is not None:
                shop.item_score = itemScore
            if serviceScore is not None:
                shop.service_score = serviceScore
            if deliveryScore is not None:
                shop.delivery_score = deliveryScore
            shop.save()

            if CRA_COUNT % 10 == 0:
                print("获取淘宝卖家主页数据成功", self.shopid)
        except Exception as e:
            print("TBUserRateCrawer数据解析出错:", e)
            return False
        return True
Example 6
    def parse(self, response):
        try:
            html = response.text
            if html.find("window.shop_config") < 1:
                print("获取商家主页数据失败", html)
                return False

            st = stringExt.StringExt(html)
            shopname_l = st.extractLine("shop-name", "<a class=")
            shopname = shopname_l.ExtStr("<span>", "</span>").str()
            shopurl = shopname_l.ExtStr("href=\"", "\"").str()
            if shopname is None:
                shopname_l = st.extractLine("shop-name-title", "<span")
                shopname = shopname_l.ExtStr("\">", "</span>").str()

            shop_config = st.ExtStr("window.shop_config =", "};").str()
            jobj = json.loads(shop_config + "}")
            userId = jobj["userId"]
            shopId = jobj["shopId"]
            user_nick = urllib.parse.unquote(jobj["user_nick"])
            userRateUrlstr = st.extractLine("<a ",
                                            "//rate.taobao.com/").ExtStr(
                                                "href=\"", "\"").str()
            shopType = "TB"
            if st.indexCount("tmall.com") > 5:
                shopType = "TM"
            # Item-description score (anchored at the page label "描述")
            itemScore = st.extractLine("dsr-num red", "</span>",
                                       html.find("描述")).ExtStr(
                                           "red\">", "</span>").float()
            # Service score (anchored at "服务")
            serviceScore = st.extractLine("dsr-num red", "</span>",
                                          html.find("服务")).ExtStr(
                                              "red\">", "</span>").float()
            # Logistics score (anchored at "物流")
            deliveryScore = st.extractLine("dsr-num red", "</span>",
                                           html.find("物流")).ExtStr(
                                               "red\">", "</span>").float()

            # Look up the shop in the database; create a new record if it does not exist yet
            shop = models.TTbShop.objects.filter(shopid=shopId).first()
            if shop is None:
                shop = models.TTbShop()
                shop.shopid = shopId
            if userId is not None:
                shop.uid = userId
            if shopurl is not None:
                shop.mainpage = shopurl
            if user_nick is not None:
                shop.nick = user_nick
            if userRateUrlstr is not None:
                shop.user_rate_url = userRateUrlstr
            if shopType is not None:
                shop.shop_type = shopType
            if shopname is not None:
                shop.title = shopname
            if itemScore is not None:
                shop.item_score = itemScore
            if serviceScore is not None:
                shop.service_score = serviceScore
            if deliveryScore is not None:
                shop.delivery_score = deliveryScore
            shop.save()

            print("获取商家主页数据成功", self.shopid)
        except Exception as e:
            print("TBShopMainCrawer数据解析出错:", e)
            return False
        return True
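
Example 6 recovers the inline window.shop_config JavaScript object by cutting the text between "window.shop_config =" and "};", then re-appending the closing brace the end marker swallowed before calling json.loads. The same trick on an invented page fragment:

import json
import urllib.parse

# Invented page fragment in the shape the parser above expects.
html = ('<script>window.shop_config = {"userId": 678, "shopId": 12345, '
        '"user_nick": "demo%20shop"};</script>')

start = html.find("window.shop_config =") + len("window.shop_config =")
end = html.find("};", start)            # the end marker consumes the closing brace...
shop_config = html[start:end] + "}"     # ...so it is appended back before parsing
jobj = json.loads(shop_config)
print(jobj["shopId"], jobj["userId"])           # 12345 678
print(urllib.parse.unquote(jobj["user_nick"]))  # demo shop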
Example 7
    def parse(self, response):
        try:
            rettext = response.text
            # A successful response always contains the string below
            if rettext.find("g_page_config =") == -1:
                if CRA_COUNT % 50 == 0:
                    print("Scrape error:", rettext, CRA_COUNT)
                return False
            st = stringExt.StringExt(rettext)
            g_pagestr = st.extractLine(
                "g_page_config", "pageName").ExtStr("g_page_config = ").str()

            if g_pagestr is None:
                return False
            g_pagestr = g_pagestr[:len(g_pagestr) - 1]
            # If there is no auctions key, this query has been exhausted
            if g_pagestr.find("auctions") == -1:
                self.nextQuery()
                return True
            page = json.loads(g_pagestr)
            items = page["mods"]["itemlist"]["data"]["auctions"]
            itemcount = 0
            sesscount = 0
            for item in items:
                itemcount = itemcount + 1
                product_id = item["nid"]
                product_id = self.paseInt(product_id)
                if product_id is None:
                    continue
                view_sales = 0
                if "view_sales" in item:
                    view_sales = stringExt.StringExt(
                        item["view_sales"]).ExtStr("", "人").int()
                if view_sales == 0:
                    continue
                # Skip products already present in the in-memory cache
                if tbpool.prodIdExist(product_id):
                    continue
                # Skip products already in the database; otherwise create a new record so existing data is never overwritten
                prod = models.TTbShopProd.objects.filter(
                    product_id=product_id).first()
                if prod is None:
                    prod = models.TTbShopProd()
                    prod.product_id = product_id
                else:
                    continue
                prod.prod_loc = item["item_loc"]
                prod.name = item["raw_title"]
                prod.uid = item["user_id"]
                prod.view_sales = view_sales
                prod.create_time = time.strftime("%Y%m%d",
                                                 time.localtime(time.time()))
                prod.update_time = time.strftime("%Y%m%d",
                                                 time.localtime(time.time()))
                prod.shop_price = self.paseInt(item["view_price"] * 100)
                prod.save()
                sesscount = sesscount + 1

            pass
            # If the whole page yielded nothing new, jump ahead 100 pages
            if sesscount == 0:
                self.pageno = self.pageno + 100
            # Log progress only on every 50th request
            if CRA_COUNT % 50 == 0:
                print("Page scrape finished", self.city, self.q, self.pageno, sesscount)
            # Queue the next page; beyond 100 pages, move on to the next keyword
            if self.pageno < 100:
                self.pageno = self.pageno + 1
                self.id = None  # id must be cleared, otherwise the task is not re-queued
                BaseHttpGet.pushHttpGet(self)
            else:
                self.nextQuery()
        except Exception as e:
            print("TBProdSearchCrawer数据解析出错:", e)
            return False
        return True
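
Example 7 turns the display string in item["view_sales"] (for example "1234人付款") into a number with ExtStr("", "人").int(). The same step with plain string operations, on an assumed sample value; real listings may use forms such as "1.5万+" that this sketch does not handle:

# Illustrative only: take everything before 人 and convert it to an int,
# as ExtStr("", "人").int() does, assuming a value like the one below.
raw = "1234人付款"   # hypothetical item["view_sales"]
view_sales = int(raw[:raw.find("人")]) if "人" in raw else 0
print(view_sales)    # 1234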
Example 8
            return False
        return True


# Test class
class TestHttpGet(BaseHttpGet.BaseHttpGet):
    url = "http://www.baidu.com/"

    def before(self):
        return False

    def parse(self, response):
        print(response.text)
        return True


if __name__ == '__main__':

    test = TBProdSearchCrawer()
    test.q = "美食"      # search keyword ("food"); kept in Chinese, it is the actual query
    test.city = "广州"   # Guangzhou
    #test.url = "https://rate.taobao.com/user-rate-UMmIGvFQyOFHT.htm?spm=a1z10.1-c-s.0.0.20c375c0IN5YFU"
    #test.run()
    item = {"a": 123}

    print("view_sales" in item)
    if "view_sales" in item:  # guard: the toy dict above has no "view_sales" key
        stringExt.StringExt(item["view_sales"]).ExtStr("", "人").int()
    #test.run()

    pass