def closeWorker(cookie, group="full"):
    """
    关闭 worker
    :return:
    """
    headers = {
        'Cookie': cookie,
        'Origin': "http://rhino.dev.datatub.com",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'User-Agent':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
        'Accept': "*/*",
        'Referer': "http://rhino.dev.datatub.com/",
        'X-Requested-With': "XMLHttpRequest",
        'Connection': "keep-alive"
    }
    setting = Setting()
    setting.headers = headers
    setting.request = "POST"

    res = Downloader(setting=setting).payload(CLOSE_WORKER_URL,
                                              "workerGroup={}".format(group))
    LOG.info("CLOSE WORKER 结果 {}".format(res.json()))
    if res.json()["code"] == 0:
        return True
def clearWorker(cookie):
    headers = {
        'Cookie': cookie,
        'User-Agent':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Host': "rhino.dev.datatub.com"
    }
    setting = Setting()
    setting.headers = headers
    setting.request = "POST"
    res = Downloader(setting=setting).request(CLEAR_WORKER_URL)
    LOG.info("Clear 结果 {}".format(res.json()))
    if res.json()["code"] == 0:
        return True

# Example 3
def buildSite(body):
    """创建站点"""
    payload = url_util.urlencode(body)
    s = Setting(request="POST", headers=HEADERS)
    res = Downloader.payload(MAIN_URL, data=payload, setting=s)
    print(res.json())
    return res.json()["data"]["id"]
def crawlComment(shopId: str):
    post_data_of_comment_list = "lng=113.334699&lat=23.125753&gpsLng=113.334699&gpsLat=23.125753&shopId=0&mtWmPoiId=1113200877771207&startIndex=0&labelId=0&scoreType=0&uuid=16d0af3bce3c8-0b9b6f8144bd7e-29792349-6b82e-16d0af3bce4c8&platform=3&partner=4&originUrl=https%3A%2F%2Fh5.waimai.meituan.com%2Fwaimai%2Fmindex%2Fmenu%3FdpShopId%3D%26mtShopId%3D1113200877771207%26utm_source%3D%26source%3Dshoplist%26initialLat%3D%26initialLng%3D%26actualLat%3D23.125753%26actualLng%3D113.334699&riskLevel=71&optimusCode=10&wm_latitude=0&wm_longitude=0&wm_actual_latitude=23125753&wm_actual_longitude=113334699&openh5_uuid=16d0af3bce3c8-0b9b6f8144bd7e-29792349-6b82e-16d0af3bce4c8&_token=eJyVkWuPmkAUhv8LH%2FyiEYZBBkxMgxeUy64CgrJN04CAIAy4MOBi0%2F%2Fesd3t9msnk5x3nvck5zI%2FmFqLmCngAOLAiOnimpkyYMyNRWbEkIY6ExFJAgIAiUgYMad%2FGc9JgjhiwtpbMtOvEx6OJIH%2F9gA2fX%2BCT8UL9D4yNJrApIRcmynLppPxLchwkI1xnJE2KMenCrN%2FEIuzMorfWByX7Zfo6qTVVYtmA0zeFQAA8hwnIXpoQ2jQEvy9qdr6FM8G77GhqUXWkEFWZiQLCjMgs7%2B6PM8GwYlW%2FY15OAb8BE3gB6M2rTCGUBBlme7kf3pOKxwzdFq8f0zLi2hEt%2FgA%2BQPQGPxrjJaa9zDJh%2FlEv4JWbLJzSVWs90Vuku3trrgpGao3r3GOUFebNPON0lOUxva3Pg4PuzPmxc1NvvbOq2lnutbu7mxBErZs9ZSFWrkxTrlRX7FZQVPZ68K%2BuOgeAP2KxK8F1KTIOjoJCE99%2FszuclWx9GhRrkPrKanCdH7PG%2BQJUeHuF43d99t6LuVruXqSAUg2gDhFB%2F1ee%2BP3bvTCHzwAgfHCHkKz8r0wv%2BhmsnEuE8W6qLKTo3VnJSukv7Ftho1O0nXLNi6Rc3WPfrJZ4IPaqu4xcEvhYEdl7rPJrtnpykJWQ9n3RGfrOjBdqT0xOLsqajXVbc%2BxClkt46tl%2BY6C%2BtcQgrbzyPFY2flLrBRSu%2BREzUyXiutBNuktvo5Xc2ujDvdmMYnPu0DU3IUlkHvHzvn7MEie82XWBbRlRc2zIlxvh%2FiSo9rdnuFwnsYXvEhgcFyZCM2JZ3Tb84z5%2BQvKAQ0P"

    logger.info("抓取 餐品评论 {}".format(shopId))
    res = Downloader(setting=setting).payload(COMMENT_URL,
                                              post_data_of_comment_list)
    jsonObject = res.json()

    try:
        for comment in jsonObject["data"]["list"]:
            comment["mtWmPoiId"] = shopId
            md5 = md5_util.md5("_".join([
                comment["commentTime"], comment["content"],
                str(comment["score"]), comment["deliveryTime"]
            ]))
            comment["_id"] = "_".join([str(comment["userID"]), shopId, md5])
            meituanwaimai_comment_list.save(comment)
    except Exception as e:
        # malformed response (e.g. login expired); skip this shop
        logger.error("failed to parse comments for {}: {}".format(shopId, e))

# Example 5
    def downloadTs(self, thread_num):
        # drain the shared ts queue until every segment is downloaded
        while not self.queue.empty():
            ts = self.queue.get()
            try:
                print(thread_num, "downloading ts", ts)
                ind = ts["tsInd"]
                res = Downloader.get(ts["url"])
                self.data[ind] = res.content
            except Exception as e:
                print("download error", e, ts)

# Example 6
def getSign(i):
    while True:

        x_t = time_util.get_unix_v2()
        HOST = "192.168.1.103"
        row = random_util.get_random_ele_from_list(rows)

        exParams = collections.OrderedDict({
            "action": "ipv",
            "countryCode": "CN",
            "cpuCore": "4",
            "cpuMaxHz": "2265600",
            "from": "search",
            "id": row[0],
            "item_id": row[0],
            "latitude":"23.125712",
            "list_type": "search",
            "longitude":"113.334662",
            "osVersion":"23",
            "phoneType":"Nexus 5",
            "search_action":"initiative",
            "soVersion":"2.0",
            "utdid":"XW/eOqol6igDAO6KQj0b4Q3e"
        })

        data = collections.OrderedDict({
            "detail_v": "3.1.1",
            "exParams": json_util.to_json(exParams),
            "itemNumId": row[0]
        })


        params = collections.OrderedDict({
            "deviceId": "AuI9v8NvMf8kPEACBJUBffn0N6wOeMTO1lYOHPKDqOvh",
            "appKey": "21646297",
            "api": "mtop.taobao.detail.getdetail",
            "data": json_util.to_json(data),
            "utdid": "XW/eOqol6igDAO6KQj0b4Q3e",
            "x-features": "27",
            "ttid": "703304@taobao_android_7.6.0",
            "lng": "113.334662",
            "v": "6.0",
            "sid": None,
            "t": x_t,
            "uid": None,
            "lat": "23.125712",
        })
        try:
            data_ = url_util.quote(json_util.to_json(params))
            sign = Downloader(setting=Setting()).get("http://{}/?data={}".format(HOST, data_)).json()["data"]
            print("{}号线程 测试 itemId[{}] [{}]".format(i, row[0], sign))
            time_util.sleep(3)
        except Exception as e:
            traceback.print_exc()
            print("请求异常 {}".format(e))
def crawlFoodMenus(shopId: str):
    logger.info("抓取 餐品清单 {}".format(shopId))
    post_data_of_food_list["mtWmPoiId"] = shopId
    res = Downloader(setting=setting).post(FOOD_URL, post_data_of_food_list)
    jsonObject = res.json()

    try:
        categoryList = jsonObject["data"]["categoryList"]
        ttfDecrypt(categoryList)

        # extract dish (spu) ids
        for category in categoryList:
            for spu in category["spuList"]:
                spu["_id"] = "{}_{}_{}".format(category["tag"], shopId,
                                               spu["spuId"])
                spu["mtWmPoiId"] = shopId
                spu["categoryName"] = category["categoryName"]
                meituanwaimai_food_list.save(spu)
    except Exception as e:
        traceback.print_exc()
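
# A hedged driver sketch (not in the original) tying the meituan pieces
# together: iterate the mtWmPoiIds collected into shop_list by crawlShopList
# and fetch each shop's menu and comments. The sleep interval is arbitrary.
def crawlShopDetails():
    for shopId in shop_list:
        crawlFoodMenus(shopId)
        crawlComment(shopId)
        time_util.sleep(5)
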
def selectWorker(cookie):
    """
    查询worker
    :return:
    """
    headers = {
        'Cookie': cookie,
        'User-Agent':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Host': "rhino.dev.datatub.com"
    }
    setting = Setting()
    setting.headers = headers
    result = Downloader(setting=setting).get_json(SELECT_WORKER_URL)
    LOG.info("SELECT result: {}".format(result))
    return result["code"] == 0

# Example 9
class TaobaoCrawler(object):

    downloader = Downloader()
    BASE_INFO_URL = "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId={}&sellerId={}&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,upp,activity,fqg,zjys,couponActivity,soldQuantity,page,originalPrice,tradeContract&callback=onSibRequestSuccess"

    def run(self):
        url = "https://item.taobao.com/item.htm?id=593167331763"
        # self.downloader.middleware_list.append(ProxyMiddleWare())
        res = self.downloader.get(url)

        itemId = text_util.get_first_match(res.text, r"itemId\s+:\s+'(\d+)',")
        sellerId = text_util.get_first_match(res.text,
                                             r"sellerId\s+:\s+'(\d+)'")
        shopName = text_util.get_first_match(res.text,
                                             r"shopName\s*:\s*'(.*?)'")
        skuMap = text_util.get_first_match(res.text, r"skuMap\s*:\s*({.*})")
        title = text_util.get_first_match(res.text, r"title\s*:\s*'(.*)'")
        propertyMemoMap = text_util.get_first_match(
            res.text, r"propertyMemoMap\s*:\s*({.*})")

        self.downloader.setting.headers.update({"Referer": url})
        self.downloader.setting.headers.update({
            "Cookie":
            "t=192584b50433a81c5feae77e9e99411f; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; OUTFOX_SEARCH_USER_ID_NCOO=1427643707.8819768; enc=V2PIbfvRYC7hvhCHq8qkNaMekFaEJPNApT08%2FgVaEAQ2OC%2BI2X4ku9sCq5dBhGRyaf7sP3uWnXEnmirxNFKDhQ%3D%3D; cna=4vCbFAVQ8hgCAbc/WcslocCr; cookie2=1931f04989f237d225904534cc89e2a7; _tb_token_=4e1edb04afa8; v=0; miid=1429757782455434771; tk_trace=oTRxOWSBNwn9dPyorMJE%2FoPdY8zZPEr%2FCrvCMS%2BG3sTRRWrQ%2BVVTl09ME1KrXE7g7f5SykjbFjU2EbuQocCrCuXu%2BxnGiDUI4y7SiU8R5wYO2UYEEivSgzo9bmwuwMAMEhtH43hBt535uXkDsXTju7V5XRRxfiOYs5k5VhVmShunGRh%2FOIXRI5LD3ngB8VZblVPU62%2FNCVT0brygusVvRPUvgT3iMfNN3l4HrDoNlJ1N88B%2FsJExCyaSkUuHnRgisCCXwa6iP2ttiJOjfsdh9kgRqJM2cYKE5mdnN7YlWI7MtgU0YitBpzvFoYM9wDlxNIrehSt32D2awKXRliVeBIw%3D; uc3=id2=UUpnjMGWeTDxMA%3D%3D&vt3=F8dBy3MLoylZjTIKqDw%3D&lg2=W5iHLLyFOGW7aA%3D%3D&nk2=suEMAecR; csg=f0359cd1; lgc=%5Cu6768%5Cu7545%5Cu5E05; dnk=%5Cu6768%5Cu7545%5Cu5E05; skt=d1c02800fe0af2e7; existShop=MTU2Njg4NjA4OA%3D%3D; uc4=id4=0%40U2gtHRBkJk9a2SFfxwUCZdl9g6Mj&nk4=0%40sOlUtvsiedjt3d5KnKNpEJI%3D; tracknick=%5Cu6768%5Cu7545%5Cu5E05; _cc_=V32FPkk%2Fhw%3D%3D; tg=0; mt=ci=19_1; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; whl=-1%260%260%261566893100933; _m_h5_tk=ed82048ac357de15b1d9f408c5a87f3b_1567332023191; _m_h5_tk_enc=1ce5e64614f05ae7b3fe320776816210; uc1=cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&cookie21=U%2BGCWk%2F7p4mBoUyS4E9C&existShop=false&pas=0&cookie14=UoTaH0QlXL3bSQ%3D%3D&tag=8&lng=zh_CN; isg=BAUFdlAy9O_vYtC8ivmC0CRlFEiT0L5xhaOb-wdr0zw7niYQwREaJYs_qILN3tEM; l=cBNHS6EVqJJZw89-BOfNVQLf1P_OuIOf1sPP2doM4IB1951TMdIxHHwIzx_Bp3QQE95xUExySDo_2Rnp7yz3rAonhFSjOC0eQ"
        })
        self.downloader.middleware_list = []
        res = self.downloader.get(self.BASE_INFO_URL.format(itemId, sellerId))
        info = json_util.to_python(
            text_util.get_first_match(res.text,
                                      r"onSibRequestSuccess\((.*)\);"))

        print(
            {
                "itemId": itemId,
                "sellerId": sellerId,
                "shopName": shopName.encode('utf-8').decode("unicode-escape"),
                "title": title.encode('utf-8').decode("unicode-escape"),
                "skuMap": json_util.to_python(skuMap),
                "propertyMemoMap": propertyMemoMap,
                "soldTotalCount":
                info["data"]["soldQuantity"]["confirmGoodsCount"],
                "stock": info["data"]["dynStock"]["stock"]
            })
def addWorker(workerNum, group, cookie):
    """
    添加 worker
    :return:
    """
    headers = {
        'Cookie': cookie,
        'User-Agent':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Host': "rhino.dev.datatub.com"
    }
    setting = Setting()
    setting.headers = headers
    result = Downloader(setting=setting).get_json(
        ADD_WORKER_URL.format(workerNum, group))
    LOG.info("ADD result: {}".format(result))
    return result["code"] == 0

# Example 11
class ProxyMiddleWare(MiddleWare):
    """代理 middle ware"""
    downloader = Downloader()

    def __init__(self):
        self.logger = logger_util.get_logger(self.__class__)

    def before(self, url: str, data: Union[str, Dict], setting: Setting):
        bs4 = self.downloader.get_bs4(
            "http://proxy.datastory.com.cn/getADAllHost?id=rhino")
        ip = random_util.get_random_ele_from_list(
            bs4.select("ipserver ips ip"))
        setting.set_proxies(
            ip.select_one("host").text,
            ip.select_one("port").text)
        self.logger.info("使用代理ip {} {}".format(
            ip.select_one("host").text,
            ip.select_one("port").text))

    def after(self, url: str, data: Union[str, Dict], setting: Setting,
              res: Response):
        pass
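
# Attaching the middleware, a sketch following the commented-out line in
# TaobaoCrawler.run above: before() runs ahead of every request and points
# the Setting at a fresh random proxy. The target url here is illustrative.
downloader = Downloader()
downloader.middleware_list.append(ProxyMiddleWare())
print(downloader.get("http://icanhazip.com").text)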
def login():
    headers = {
        'Host': "rhino.dev.datatub.com",
        'Content-Length': "35",
        'Accept': "*/*",
        'Origin': "http://rhino.dev.datatub.com",
        'X-Requested-With': "XMLHttpRequest",
        'User-Agent':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
        'Referer': "http://rhino.dev.datatub.com/login.html",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Connection': "close"
    }

    payload = "userName=changshuai&passWord=123456"
    setting = Setting()
    setting.headers = headers
    setting.request = "POST"
    res = Downloader(setting=setting).payload(LOGIN_URL, payload)
    return res.cookies.get("JSESSIONID")

# Example 13
def test_get_first_match():
    res = Downloader.get_with_bs4("https://www.zhipin.com/job_detail/3265d372b1182c951HR50t2-ElU~.html")
    print(text_util.get_first_match(res.text, "job_id: '(.*?)',"))
for poiId in poiIds:
    setting = Setting()
    setting.headers["Host"] = "i.waimai.meituan.com"
    setting.headers["Accept"] = "application/json"
    setting.headers[
        "Referer"] = "https://h5.waimai.meituan.com/waimai/mindex/menu?dpShopId=&mtShopId={}&utm_source=&source=shoplist&initialLat=23.129112&initialLng=113.264385&actualLat=&actualLng=".format(
            poiId)
    setting.headers["Origin"] = "https://h5.waimai.meituan.com"
    setting.headers[
        "User-Agent"] = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Mobile Safari/537.36"
    setting.headers["Content-Type"] = "application/x-www-form-urlencoded"
    setting.headers[
        "Cookie"] = "uuid=86f1d5d1-b229-4c12-a7bd-10dc5ef16e35; terminal=i; utm_source=; au_trace_key_net=default; wm_order_channel=default; _lx_utm=utm_source%3D60066; cssVersion=2ef84fdd; w_token=Qy1uY6h0RnVOU73sk3xFEAYn9EsAAAAA5wgAAOe8BzkBC_CqjTAFX4W2RnmK7ZF9TKNcPV8HuP7MoY8V0BrOiUilE8Gmjv_IzPBsyA; w_utmz=\"utm_campaign=(direct)&utm_source=5000&utm_medium=(none)&utm_content=(none)&utm_term=(none)\"; openh5_uuid=2E42332BF3FA3E12F5CFCFA99E799888E8883DB85EF252B10CB184761FFDB340; w_latlng=0,0; w_actual_lat=23125756; w_actual_lng=113334698; w_visitid=ef165329-a0cb-4252-aec2-3aec93bd291e"
    setting.headers["Accept-Encoding"] = "gzip, deflate"
    setting.headers["Connection"] = "keep-alive"
    downloader = Downloader(setting=setting)

    post_data = {}
    post_data["geoType"] = "2"
    post_data["mtWmPoiId"] = "{}".format(poiId)
    post_data["dpShopId"] = "-1"
    post_data["source"] = "shoplist"
    post_data["skuId"] = ""
    post_data[
        "uuid"] = "16d14d36a98c8-0e53a36f17ba0e-1a201708-1fa400-16d14d36a98c8"
    post_data["platform"] = "3"
    post_data["partner"] = "4"
    post_data[
        "originUrl"] = "https://h5.waimai.meituan.com/waimai/mindex/menu?dpShopId=&mtShopId={}&utm_source=&source=shoplist&initialLat=23.129112&initialLng=113.264385&actualLat=&actualLng=".format(
            poiId)
    post_data["riskLevel"] = "71"
def crawlShopList():
    post_data_of_shop_list = {
        "startIndex": "0",
        "sortId": "0",
        "multiFilterIds": "",
        "sliderSelectCode": "",
        "sliderSelectMin": "",
        "sliderSelectMax": "",
        "geoType": "2",
        "rankTraceId": "",
        "uuid": "EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C",
        "platform": "3",
        "partner": "4",
        "originUrl": "http://h5.waimai.meituan.com/waimai/mindex/home",
        "riskLevel": "71",
        "optimusCode": "10",
        "wm_latitude": "23129112",
        "wm_longitude": "113264385",
        "wm_actual_latitude": "",
        "wm_actual_longitude": "",
        "openh5_uuid": "EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C",
        "_token":
        ("eJx9kG2PmkAUhf/LJNsvEgcFBExMI7CyuAgi43SlaRoEVORtweFFmv73DjHN7oemyU3uuc+9J3Myv0BlhGA+YSciO2FAE1VgDiZ"
         "jdjwDDCA3uhFms6nMT1lR5KYMCD6ziShLPAOOFdbA/DsviIzET38MYEfnD/Chpjyt4cKgB+BCyPscwoswbv048+NxFsWk9vNxUG"
         "TwgWAW52HUDS2iif7rSItznH89FVUQLUhVR1+OfpD8rKt0MdieuOXTdEXrn2bKH5CKx5MPEQGaNkNDWklmRFkY5mSYafc/cUYz8"
         "LAjf3cb+pE08S0+51RF6ztJNNKc+6XTibKpFPBy9XVXkZZn44gR26VeYGAEnzXdsjOukje2j965xhqFsJDEXlfs/F632esltOvO"
         "sStNJtoqxkQxrboVUdmn2DvlrnfGjZGqfUIi9ohqf71ODzsRWeW9MkoRbRLop5aXYhHjbO+tdi+512FuH2n6i08kf9PqppMZt76"
         "8nirdNYMRQc9q70nu2lFRWzpmYJ52d0vNBadYKdtgyX4rePXWXgsrLXGSBq+yPiMHp+wdQdhgAzrI6HxO6i3prDus8d48q1Fbuc"
         "o1JDDRot3L27ZVJ4p0mJmd6/ZvWQ/rI7lAKLTs3jo0SBZsDY1mr9zOHB3Y0ybcpn2A+aDZh/YWLxcL8PsPSaDfFA=="
         ),
    }

    total = 0
    while True:
        # download and parse
        res = Downloader.post(SHOP_SEARCH_URL.format(time_util.get_unix()),
                              post_data_of_shop_list,
                              setting=setting)
        jsonObject = res.json()

        # stop once enough shops have been collected
        if total > 2000:
            logger.info("shop list exceeded 2000, stopping requests")
            break

        # detect login expiry / API errors
        if "data" not in jsonObject or not isinstance(jsonObject["data"],
                                                      dict):
            logger.error("第[{}]次 登录失效 or 接口异常 ".format(
                post_data_of_shop_list["startIndex"]))
            post_data_of_shop_list["startIndex"] = str(
                int(post_data_of_shop_list["startIndex"]) + 1)
            continue

        # parse and extract
        try:
            for shop in jsonObject["data"]["shopList"]:
                shop["_id"] = shop["mtWmPoiId"]
                ttfDecrypt(shop)
                meituanwaimai_shop_list.save(shop)
                shop_list.add(shop["mtWmPoiId"])
            total += len(jsonObject["data"]["shopList"])

            # write-to-database progress
            logger.info("request [{}]: fetched {} records".format(
                post_data_of_shop_list["startIndex"],
                len(jsonObject["data"]["shopList"])))
        except Exception as e:
            logger.error("request [{}]: parse or insert failed".format(
                post_data_of_shop_list["startIndex"]))
            traceback.print_exc()

        post_data_of_shop_list["startIndex"] = str(
            int(post_data_of_shop_list["startIndex"]) + 1)
        time_util.sleep(5)
def searchShopList(keyword: str,
                   latitude: str,
                   longitude: str,
                   topN: int = 1000):
    post_data = (
        "geoType=2&cityId=1&secondCategoryId=&start=0&queryType=12002&keyword=%E7%B2%A5&categoryType=&entrance"
        "Id=0&uuid=16cb375798ab8-07b01f3c8b2835-1a201708-1fa400-16cb375798bc8&platform=3&partner=4&originUrl=h"
        "ttp%3A%2F%2Fh5.waimai.meituan.com%2Fwaimai%2Fmindex%2Fsearchresults%3FqueryType%3D12002%26keyword%3D%"
        "25E7%25B2%25A5%26entranceId%3D0%26qwTypeId%3D0%26mode%3Dsearch&riskLevel=71&optimusCode=10&wm_latitud"
        "e=23125801&wm_longitude=113334718&wm_actual_latitude=23125801&wm_actual_longitude=113334718&openh5_uu"
        "id=16cb375798ab8-07b01f3c8b2835-1a201708-1fa400-16cb375798bc8&_token=eJxVUNmOolAQ%2FReS7heNcEEWTcyETT"
        "bBVhHByTwgXBZZxMsmdubfh57uSWeSSs5SJ5WqeseQFmJLQACWAFOsgwhbYmBGzBhsijX12KEZhmIXAAAWEFMs%2BM8jAcFNsQtyJ"
        "Gz5kwZgys3JXx%2FGftTfxjcj52N9JLQxgCVNUy1xPKFnvZ8WfjorYNq0fjkLbgX%2BaeFFWobwgdfQR0GCYN3mTf3j3kI02EMFV4"
        "AkCPI1g0N%2FQ%2BHqRWZfBPKFp19h2SC%2FDKAWrojXe%2F%2BR%2FUuLWwhXn8PGA7FxlcIeVxkx%2B0L%2FC5t%2F2hw%2FMmb"
        "rNC5HBvUhP8rNtn%2Fyx05uc1hFSjn3PJ1vhMfO0HW%2FRw8qt5B5Jv1rK6tavAnEoo2jdi%2FmDsW%2B9ZzKUaZtZto1erOZxUHa"
        "yUhlLtlJ0BTLzW9rJhBvvAt5I0SiJVVROpl09mTP1Ar5UGM%2FFCR49wpnT5X6mdsN7vrgXaoyWW8P9aClWaFD70wGcf5oirMrFvB"
        "iaOQOpLQS28KglqH%2F1hXbhaqQFp5Dp67398ICfC1tY%2FawqXbPSSQja4FOnC6r2ZoBar3ZPVwRVybSMQGi%2BWTU0p64XBbMox"
        "ThTdhuIaRSP%2FLSqwLv%2BfXhcyfDmGSUeu0E2XSdilUvXsfxTGjwmXHTh8ZhQiEbZMM8ZxmFL1pLvwPHyNmGoPdEHFf66SSvk8G"
        "m6PT53MtZIinkxswF6DhDYncsf3QLJPQr7Pcfj3reuA%3D%3D")
    data = url_util.unquote_post_data(post_data)
    data["keyword"] = keyword
    data["start"] = "0"
    data["wm_latitude"] = latitude
    data["wm_longitude"] = longitude
    data["wm_actual_latitude"] = latitude
    data["wm_actual_longitude"] = longitude

    fieldList = fields.split(",")
    csv_util.save2csv_v3("poi.csv", fieldList)
    total = 0
    while True:
        jsonObject = None
        try:
            # request and parse
            res = Downloader(setting).post(SEARCH_URL, data)
            jsonObject = res.json()
            logger.info("{} 抓取第{}页 数量:{}".format(
                keyword, data["start"],
                len(jsonObject["data"]["searchPoiList"])))

            # extract data
            total += len(jsonObject["data"]["searchPoiList"])
            ttfDecrypt(jsonObject["data"]["searchPoiList"])
            for poi in jsonObject["data"]["searchPoiList"]:
                poi["_id"] = poi["id"]
                poi["keyword"] = keyword
                poi["url"] = "http://h5.waimai.meituan.com/waimai/mindex/menu?mtShopId={}".format(
                    poi["_id"])
                csv_util.save2csv_v3("poi.csv", poi, fieldList)

            # check whether there is a next page
            if not jsonObject["data"]["hasNextPage"] or total > topN:
                logger.info("page {}: {} pois in total".format(data["start"], total))
                logger.info("pagination crawl finished")
                break
            data["start"] = str(int(data["start"]) + 1)

        except Exception as e:
            traceback.print_exc()
            data["start"] = str(int(data["start"]) + 1)
            logger.error("{}页抓取异常,jsonObject: {}".format(
                data["start"], jsonObject))

# Example 17
# @Time         : 18-2-13 8:46 PM
# @Author       : DioMryang
# @File         : test_downloader.py
# @Description  :
from unittest import TestCase

from dio_core.network.downloader import Downloader
from dio_core.network.downloader.downloader import Setting

page = 0
while True:
    url = (
        "https://www.google.com/search?q=经贸磋商&tbs=cdr:1,cd_min:1/21/2019,cd_max:1/"
        + "28/2019&tbm=nws&start={}")
    setting = Setting()
    setting.set_proxies("116.31.102.3", "57003")

    soup = Downloader.get_with_bs4(url.format(page), setting=setting).soup

    result = soup.select(".l.lLrAF")
    if not result:
        break
    for aTag in soup.select(".l.lLrAF"):
        print(aTag["href"])
    page += 10
    print()

# Example 18
def testTp(body):
    payload = url_util.urlencode(body)
    s = Setting(request="POST", headers=HEADERS)
    res = Downloader.payload(TEST_TEMPLATE_URL, data=payload, setting=s)
    return res.json()["data"]

# Example 19
def buildMainSite(url: str, parentId: int = -1):
    """
    建立主站点
    :param parentId: 父级站点
    :param url: 站点url
    :param domain: 站点domian
    :param name:
    :return: siteId
    """
    soup = Downloader.get_bs4(url)
    title = soup.select_one("title").text.replace("--", "dio").replace("-", "dio").replace("_", "dio")\
        .replace("——", "dio").replace("|", "dio").replace("·", "dio").replace(" ", "dio")
    name = ""
    host = url_util.get_host(url)
    print("host 为" + host)

    for n in title.split("dio"):
        print("site name 为: " + n)
        if input() in ("", "y", "Y", "Yes"):
            name = n.strip()
            break

    if parentId != -1:
        mainSiteName = getSiteInfo(parentId)["name"]
        name = "{}_{}".format(mainSiteName, name)

    if input("是否添加频道后缀") in ("", "y", "Y", "Yes"):
        name = "{}{}".format(name, "频道  ")
    print("输出name为: {}".format(name))
    siteQuery = {
        "name": name,
        "domain": url_util.get_host(url),
        "tags": [],
        "maxDepth": "2",
        "overtime": "",
        "params": {
            "spark.executors.num":
            "1",
            "spark.executor.core.num":
            "1",
            "error_fail":
            "0.9",
            "fail_queue":
            "OFF",
            "rhino.task.unique.manager.class":
            "com.datatub.rhino.framework.component.operatior.manager.unique.RedisListUniqueManager",
            "rhino.task.unique.manager.cache_size":
            "1000",
            "rhino.task.job.info.collector.class":
            "com.datatub.rhino.framework.component.collector.LocalJobInfoCollector"
        },
        "threshold": "",
        "frequency": "",
        "interval": "20",
        "cron": "",
        "template": [],
        "agent": [],
        "category": "3"
    }

    query = {"parentId": parentId, "site": json_util.to_json(siteQuery)}
    siteId = buildSite(query)
    print("输出页面url为: " +
          "http://v3.rhino.datatub.com/#/gather/siteManager?site={}".format(
              siteId))
    return siteId

# Example 20
def main_downloader():
    proxies = [
        ["222.189.191.53", "9999"],
        ["182.111.64.7", "41766"],
        ["115.151.3.16", "9999"],
        ["121.233.206.151", "9999"],
        ["116.209.52.143", "9999"],
        ["1.198.72.234", "9999"],
        ["121.61.1.48", "9999"],
        ["183.148.133.22", "9999"],
        ["115.239.24.166", "9999"],
        ["110.52.235.226", "9999"],
        ["113.122.168.246", "9999"],
        ["59.62.165.99", "808"],
        ["218.91.112.42", "9999"],
        ["111.177.161.70", "9999"],
        ["110.52.235.231", "9999"],
        ["180.116.48.122", "9999"],
        ["113.122.168.23", "9999"],
        ["49.77.59.235", "8118"],
        ["110.52.235.173", "9999"],
        ["111.177.187.211", "9999"],
        ["124.94.192.206", "9999"],
        ["125.123.137.71", "9999"],
        ["121.61.1.222", "9999"],
        ["111.72.154.47", "9999"],
        ["125.123.138.26", "9999"],
        ["110.52.235.244", "9999"],
        ["121.61.24.254", "9999"],
        ["111.177.170.35", "9999"],
        ["42.53.73.131", "9999"],
        ["111.177.180.221", "9999"],
        ["111.177.170.11", "9999"],
        ["60.173.244.133", "41306"],
        ["116.209.59.131", "9999"],
        ["221.235.234.199", "9999"],
        ["110.52.235.76", "9999"],
        ["121.61.24.242", "9999"],
        ["112.87.69.158", "9999"],
        ["59.62.166.60", "9999"],
        ["59.62.166.172", "9999"],
        ["61.184.43.129", "9999"],
        ["110.52.235.70", "808"],
        ["116.209.56.164", "9999"],
        ["171.80.152.26", "9999"],
        ["110.52.235.79", "9999"],
        ["116.209.55.171", "9999"],
        ["116.209.52.190", "9999"],
        ["118.187.58.34", "53281"],
        ["110.52.235.67", "9999"],
        ["115.212.81.84", "8118"],
        ["121.31.158.51", "8123"],
        ["116.209.56.95", "9999"],
        ["116.209.56.179", "9999"],
        ["183.148.145.229", "9999"],
        ["121.61.3.223", "9999"],
        ["101.236.42.63", "8866"],
        ["111.176.31.69", "9999"],
        ["116.209.54.22", "9999"],
        ["116.209.57.233", "9999"],
        ["125.123.136.232", "9999"],
        ["27.29.95.209", "9999"],
        ["116.209.57.22", "9999"],
        ["112.85.174.44", "9999"],
        ["61.183.233.6", "54896"],
        ["116.209.59.150", "9999"],
        ["116.209.55.191", "9999"],
        ["116.209.56.125", "9999"],
        ["125.123.142.141", "9999"],
        ["59.62.167.130", "53128"],
        ["175.148.77.188", "1133"],
        ["116.209.52.177", "9999"],
        ["125.123.138.171", "9999"],
        ["111.181.65.0", "9999"],
        ["1.192.246.197", "9999"],
        ["111.177.179.8", "9999"],
        ["110.52.235.86", "9999"],
        ["120.35.12.105", "3128"],
        ["116.209.57.16", "9999"],
        ["59.45.16.10", "59156"],
        ["111.181.66.158", "9999"],
        ["112.85.130.51", "9999"],
        ["116.208.55.173", "9999"],
        ["115.151.5.177", "9999"],
        ["113.121.147.233", "9999"],
        ["171.80.0.190", "9999"],
        ["110.52.235.139", "9999"],
        ["121.61.3.176", "9999"],
        ["110.52.235.71", "9999"],
        ["110.52.235.114", "9999"],
        ["112.85.165.66", "9999"],
        ["116.209.59.174", "9999"],
        ["121.61.1.9", "9999"],
        ["112.85.174.93", "9999"],
        ["123.163.115.203", "9999"],
        ["180.119.141.144", "9999"],
        ["116.209.54.168", "9999"],
        ["116.209.58.45", "9999"],
        ["125.123.142.215", "9999"],
        ["110.52.235.196", "9999"],
    ]
    for proxy in proxies:
        setting = Setting()
        setting.set_proxies(proxy[0], proxy[1])
        setting.timeout = 10
        setting.repeat = 1
        print("使用代理", proxy)
        try:
            res = Downloader.get("http://icanhazip.com", setting=setting)
            print("success", proxy, res.text, res.status_code)
        except Exception as e:
            print("fail", proxy, e)

# Example 21
from dio_core.network.downloader import Downloader
from dio_core.utils import file_util, time_util

rows = file_util.readRows(
    "/home/changshuai/PycharmProjects/dio_core/Test/Data/kill_job_urls.txt")
for row in rows:
    url = "http://api.rhino.datatub.com/common/job/kill?job_id={}&token=5fa92f2a597cc60201780504be1028a7".format(
        row)
    res = Downloader.get(url)
    print(row, res.text, url)
    time_util.sleep(3)

# Example 22
    def getTsUrls(self):
        # parse the m3u8 playlist; pair each ts url with its index
        res = Downloader.get(self.m3u8)
        result = text_util.get_all_match("#EXTINF:.*,\n(.*)", res.text)
        return zip(range(len(result)), result)
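
# A sketch (not from the original snippets) of how getTsUrls and downloadTs
# from Example 5 could fit together: fill a queue with indexed ts urls, drain
# it from a few threads, then concatenate the segments in playlist order.
# The class shape, thread count, and output handling are assumptions.
import queue
import threading

class M3u8Downloader:

    def __init__(self, m3u8):
        self.m3u8 = m3u8
        self.queue = queue.Queue()
        self.data = {}

    # getTsUrls and downloadTs as defined above ...

    def run(self, out_path, thread_count=4):
        for ind, url in self.getTsUrls():
            self.queue.put({"tsInd": ind, "url": url})
        threads = [
            threading.Thread(target=self.downloadTs, args=(i,))
            for i in range(thread_count)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        with open(out_path, "wb") as f:
            for ind in sorted(self.data):
                f.write(self.data[ind])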

# Example 23
def buildTemplate(body):
    """创建模板"""
    payload = url_util.urlencode(body)
    s = Setting(request="POST", headers=HEADERS)
    res = Downloader.payload(BUILD_TEMPLATE_URL, data=payload, setting=s)
    return res.json()["data"]["id"]

# Example 24
def buildTask(body):
    payload = url_util.urlencode(body)
    s = Setting(request="POST", headers=HEADERS)
    res = Downloader.payload(BUILD_TASK_URL, data=payload, setting=s)
    print(res.json())
    return res.json()["data"]["id"]

# Example 25
from dio_core.network.downloader import Downloader


soup = Downloader.get_with_bs4("http://proxy.datastory.com.cn/getADAllHost?id=rhino").soup
for ip in soup.ipserver.select("ip"):
    print("http\t{}\t{}".format(ip.select_one("host").text, ip.select_one("port").text))

# Example 26
def getSiteInfo(siteId: int):
    url = GET_SITE_INFO_URL.format(siteId)
    s = Setting()
    s.setParams(headers=HEADERS)
    return Downloader.get(url, setting=s).json()["data"]["site"]

# Example 27
import logging

from dio_core.network.downloader import Downloader
from dio_core.utils import time_util

logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

while True:
    soup = Downloader.get_bs4(
        "http://proxy.datastory.com.cn/getADAllHost?id=ss-teg")
    logger.info("count:{}".format(soup.select_one("count").text))

    for proxy in soup.select("ips id"):
        logger.info("proxy: {}".format(proxy.text))
    time_util.sleep(10)
for kwarg in kwargs:
    setting = Setting()
    setting.headers["Host"] = "i.waimai.meituan.com"
    setting.headers["Accept"] = "application/json"
    setting.headers[
        "Referer"] = "https://h5.waimai.meituan.com/waimai/mindex/searchresults?queryType=11002&entranceId=0&keyword=%E5%B9%B2%E6%8D%9E%E8%9E%BA%E8%9B%B3%E7%B2%89&qwTypeId=11002&mode=1"
    setting.headers["Origin"] = "https://h5.waimai.meituan.com"
    setting.headers[
        "User-Agent"] = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Mobile Safari/537.36"
    setting.headers["Content-Type"] = "application/x-www-form-urlencoded"
    setting.headers[
        "Cookie"] = "uuid=86f1d5d1-b229-4c12-a7bd-10dc5ef16e45; terminal=i; utm_source=; au_trace_key_net=default; wm_order_channel=default; _lx_utm=utm_source%3D60066; w_token=Qy1uY6h0RnVOU73sk3xFEAYn9EsAAAAA5wgAAOe8BzkBC_CqjTAFX4W2RnmK7ZF9TKNcPV8HuP7MoY8V0BrOiUilE8Gmjv_IzPBsyA; w_utmz=\"utm_campaign=(direct)&utm_source=5000&utm_medium=(none)&utm_content=(none)&utm_term=(none)\"; w_actual_lat=23125756; w_actual_lng=113334698; openh5_uuid=16d14d36a98c8-0e53a36f17ba0e-1a201708-1fa400-16d14d36a98c8; w_latlng=23129112,113264385; cssVersion=4c2d803d; w_visitid=49523e1f-aa20-46a9-b219-4907cd3201be"
    setting.headers["Accept-Encoding"] = "gzip, deflate"
    setting.headers["Connection"] = "keep-alive"
    downloader = Downloader(setting=setting)
    post_data = {}
    post_data["geoType"] = "2"
    post_data["cityId"] = "1"
    post_data["secondCategoryId"] = ""
    post_data["start"] = "0"
    post_data["queryType"] = "11002"
    post_data["keyword"] = kwarg["keyword"]
    post_data["categoryType"] = ""
    post_data["entranceId"] = "0"
    post_data[
        "uuid"] = "16d14d36a98c8-0e53a36f17ba0e-1a201708-1fa400-16d14d36a98c8"
    post_data["platform"] = "3"
    post_data["partner"] = "4"
    post_data[
        "originUrl"] = "https://h5.waimai.meituan.com/waimai/mindex/searchresults?queryType=11002&entranceId=0&keyword=%E5%B9%B2%E6%8D%9E%E8%9E%BA%E8%9B%B3%E7%B2%89&qwTypeId=11002&mode=1"
import jsonpath

from dio_core.network.downloader import Downloader
from dio_core.utils import json_util, url_util, time_util
from dio_core_test.utils import text_util

keyword = "女装"

for i in range(100):
    html = Downloader.get(
        "https://shopsearch.taobao.com/browse/shop_search.htm?q={}&s={}".
        format(keyword, i * 20)).text
    data = json_util.to_python(
        text_util.get_first_match(html, "g_page_config = (.*);"))
    for shop in jsonpath.jsonpath(data, "$.mods.shoplist.data.shopItems.*"):
        if "shopIcon" in shop and "title" in shop["shopIcon"] and "天猫" in shop[
                "shopIcon"]["title"]:
            print("天猫\t{}\t{}".format(url_util.patch_url(shop["shopUrl"]),
                                      shop["procnt"]))
        else:
            print("淘宝\t{}\t{}".format(url_util.patch_url(shop["shopUrl"]),
                                      shop["procnt"]))
    time_util.sleep(5)