def closeWorker(cookie, group="full"):
    """
    Close workers.
    :return:
    """
    headers = {
        'Cookie': cookie,
        'Origin': "http://rhino.dev.datatub.com",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
        'Accept': "*/*",
        'Referer': "http://rhino.dev.datatub.com/",
        'X-Requested-With': "XMLHttpRequest",
        'Connection': "keep-alive"
    }
    setting = Setting()
    setting.headers = headers
    setting.request = "POST"
    res = Downloader(setting=setting).payload(CLOSE_WORKER_URL, "workerGroup={}".format(group))
    LOG.info("CLOSE WORKER result {}".format(res.json()))
    if res.json()["code"] == 0:
        return True
def clearWorker(cookie):
    headers = {
        'Cookie': "{}".format(cookie),
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Host': "rhino.dev.datatub.com"
    }
    setting = Setting()
    setting.headers = headers
    setting.request = "POST"
    res = Downloader(setting=setting).request(CLEAR_WORKER_URL)
    LOG.info("CLEAR result {}".format(res.json()))
    if res.json()["code"] == 0:
        return True
def buildSite(body):
    """Create a site."""
    payload = url_util.urlencode(body)
    s = Setting(request="POST", headers=HEADERS)
    res = Downloader.payload(MAIN_URL, data=payload, setting=s)
    print(res.json())
    return res.json()["data"]["id"]
def crawlComment(shopId: str):
    post_data_of_comment_list = "lng=113.334699&lat=23.125753&gpsLng=113.334699&gpsLat=23.125753&shopId=0&mtWmPoiId=1113200877771207&startIndex=0&labelId=0&scoreType=0&uuid=16d0af3bce3c8-0b9b6f8144bd7e-29792349-6b82e-16d0af3bce4c8&platform=3&partner=4&originUrl=https%3A%2F%2Fh5.waimai.meituan.com%2Fwaimai%2Fmindex%2Fmenu%3FdpShopId%3D%26mtShopId%3D1113200877771207%26utm_source%3D%26source%3Dshoplist%26initialLat%3D%26initialLng%3D%26actualLat%3D23.125753%26actualLng%3D113.334699&riskLevel=71&optimusCode=10&wm_latitude=0&wm_longitude=0&wm_actual_latitude=23125753&wm_actual_longitude=113334699&openh5_uuid=16d0af3bce3c8-0b9b6f8144bd7e-29792349-6b82e-16d0af3bce4c8&_token=eJyVkWuPmkAUhv8LH%2FyiEYZBBkxMgxeUy64CgrJN04CAIAy4MOBi0%2F%2Fesd3t9msnk5x3nvck5zI%2FmFqLmCngAOLAiOnimpkyYMyNRWbEkIY6ExFJAgIAiUgYMad%2FGc9JgjhiwtpbMtOvEx6OJIH%2F9gA2fX%2BCT8UL9D4yNJrApIRcmynLppPxLchwkI1xnJE2KMenCrN%2FEIuzMorfWByX7Zfo6qTVVYtmA0zeFQAA8hwnIXpoQ2jQEvy9qdr6FM8G77GhqUXWkEFWZiQLCjMgs7%2B6PM8GwYlW%2FY15OAb8BE3gB6M2rTCGUBBlme7kf3pOKxwzdFq8f0zLi2hEt%2FgA%2BQPQGPxrjJaa9zDJh%2FlEv4JWbLJzSVWs90Vuku3trrgpGao3r3GOUFebNPON0lOUxva3Pg4PuzPmxc1NvvbOq2lnutbu7mxBErZs9ZSFWrkxTrlRX7FZQVPZ68K%2BuOgeAP2KxK8F1KTIOjoJCE99%2FszuclWx9GhRrkPrKanCdH7PG%2BQJUeHuF43d99t6LuVruXqSAUg2gDhFB%2F1ee%2BP3bvTCHzwAgfHCHkKz8r0wv%2BhmsnEuE8W6qLKTo3VnJSukv7Ftho1O0nXLNi6Rc3WPfrJZ4IPaqu4xcEvhYEdl7rPJrtnpykJWQ9n3RGfrOjBdqT0xOLsqajXVbc%2BxClkt46tl%2BY6C%2BtcQgrbzyPFY2flLrBRSu%2BREzUyXiutBNuktvo5Xc2ujDvdmMYnPu0DU3IUlkHvHzvn7MEie82XWBbRlRc2zIlxvh%2FiSo9rdnuFwnsYXvEhgcFyZCM2JZ3Tb84z5%2BQvKAQ0P"
    logger.info("Crawling dish comments {}".format(shopId))
    res = Downloader(setting=setting).payload(COMMENT_URL, post_data_of_comment_list)
    jsonObject = res.json()
    try:
        for comment in jsonObject["data"]["list"]:
            comment["mtWmPoiId"] = shopId
            md5 = md5_util.md5("_".join([
                comment["commentTime"], comment["content"],
                str(comment["score"]), comment["deliveryTime"]
            ]))
            comment["_id"] = "_".join([str(comment["userID"]), shopId, md5])
            meituanwaimai_comment_list.save(comment)
    except Exception:
        # Skip shops whose comment payload is missing or malformed.
        pass
def downloadTs(self, thread_num):
    while not self.queue.empty():
        ts = self.queue.get()
        try:
            print(thread_num, "downloading ts", ts)
            ind = ts["tsInd"]
            res = Downloader.get(ts["url"])
            self.data[ind] = res.content
        except Exception as e:
            print("download error", e, ts)
def getSign(i):
    while True:
        x_t = time_util.get_unix_v2()
        HOST = "192.168.1.103"
        row = random_util.get_random_ele_from_list(rows)
        exParams = collections.OrderedDict({
            "action": "ipv",
            "countryCode": "CN",
            "cpuCore": "4",
            "cpuMaxHz": "2265600",
            "from": "search",
            "id": row[0],
            "item_id": row[0],
            "latitude": "23.125712",
            "list_type": "search",
            "longitude": "113.334662",
            "osVersion": "23",
            "phoneType": "Nexus 5",
            "search_action": "initiative",
            "soVersion": "2.0",
            "utdid": "XW/eOqol6igDAO6KQj0b4Q3e"
        })
        data = collections.OrderedDict({
            "detail_v": "3.1.1",
            "exParams": json_util.to_json(exParams),
            "itemNumId": row[0]
        })
        params = collections.OrderedDict({
            "deviceId": "AuI9v8NvMf8kPEACBJUBffn0N6wOeMTO1lYOHPKDqOvh",
            "appKey": "21646297",
            "api": "mtop.taobao.detail.getdetail",
            "data": json_util.to_json(data),
            "utdid": "XW/eOqol6igDAO6KQj0b4Q3e",
            "x-features": "27",
            "ttid": "703304@taobao_android_7.6.0",
            "lng": "113.334662",
            "v": "6.0",
            "sid": None,
            "t": x_t,
            "uid": None,
            "lat": "23.125712",
        })
        try:
            data_ = url_util.quote(json_util.to_json(params))
            sign = Downloader(setting=Setting()).get("http://{}/?data={}".format(HOST, data_)).json()["data"]
            print("thread {}: tested itemId[{}] [{}]".format(i, row[0], sign))
            time_util.sleep(3)
        except Exception as e:
            traceback.print_exc()
            print("request error {}".format(e))
def crawlFoodMenus(shopId: str):
    logger.info("Crawling food menus {}".format(shopId))
    post_data_of_food_list["mtWmPoiId"] = shopId
    res = Downloader(setting=setting).post(FOOD_URL, post_data_of_food_list)
    jsonObject = res.json()
    try:
        categoryList = jsonObject["data"]["categoryList"]
        ttfDecrypt(categoryList)
        # Crawl dish ids
        for category in categoryList:
            for spu in category["spuList"]:
                spu["_id"] = "{}_{}_{}".format(category["tag"], shopId, spu["spuId"])
                spu["mtWmPoiId"] = shopId
                spu["categoryName"] = category["categoryName"]
                meituanwaimai_food_list.save(spu)
    except Exception as e:
        traceback.print_exc()
def selectWorker(cookie):
    """
    Query workers.
    :return:
    """
    headers = {
        'Cookie': "{}".format(cookie),
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Host': "rhino.dev.datatub.com"
    }
    setting = Setting()
    setting.headers = headers
    json = Downloader(setting=setting).get_json(SELECT_WORKER_URL)
    LOG.info("SELECT result {}".format(json))
    if json["code"] == 0:
        return True
class TaobaoCrawler(object):

    downloader = Downloader()
    BASE_INFO_URL = "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId={}&sellerId={}&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,upp,activity,fqg,zjys,couponActivity,soldQuantity,page,originalPrice,tradeContract&callback=onSibRequestSuccess"

    def run(self):
        url = "https://item.taobao.com/item.htm?id=593167331763"
        # self.downloader.middleware_list.append(ProxyMiddleWare())
        res = self.downloader.get(url)
        itemId = text_util.get_first_match(res.text, r"itemId\s+:\s+'(\d+)',")
        sellerId = text_util.get_first_match(res.text, r"sellerId\s+:\s+'(\d+)'")
        shopName = text_util.get_first_match(res.text, r"shopName\s*:\s*'(.*?)'")
        skuMap = text_util.get_first_match(res.text, r"skuMap\s*:\s*({.*})")
        title = text_util.get_first_match(res.text, r"title\s*:\s*'(.*)'")
        propertyMemoMap = text_util.get_first_match(
            res.text, r"propertyMemoMap\s*:\s*({.*})")

        self.downloader.setting.headers.update({"Referer": url})
        self.downloader.setting.headers.update({
            "Cookie": "t=192584b50433a81c5feae77e9e99411f; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; OUTFOX_SEARCH_USER_ID_NCOO=1427643707.8819768; enc=V2PIbfvRYC7hvhCHq8qkNaMekFaEJPNApT08%2FgVaEAQ2OC%2BI2X4ku9sCq5dBhGRyaf7sP3uWnXEnmirxNFKDhQ%3D%3D; cna=4vCbFAVQ8hgCAbc/WcslocCr; cookie2=1931f04989f237d225904534cc89e2a7; _tb_token_=4e1edb04afa8; v=0; miid=1429757782455434771; tk_trace=oTRxOWSBNwn9dPyorMJE%2FoPdY8zZPEr%2FCrvCMS%2BG3sTRRWrQ%2BVVTl09ME1KrXE7g7f5SykjbFjU2EbuQocCrCuXu%2BxnGiDUI4y7SiU8R5wYO2UYEEivSgzo9bmwuwMAMEhtH43hBt535uXkDsXTju7V5XRRxfiOYs5k5VhVmShunGRh%2FOIXRI5LD3ngB8VZblVPU62%2FNCVT0brygusVvRPUvgT3iMfNN3l4HrDoNlJ1N88B%2FsJExCyaSkUuHnRgisCCXwa6iP2ttiJOjfsdh9kgRqJM2cYKE5mdnN7YlWI7MtgU0YitBpzvFoYM9wDlxNIrehSt32D2awKXRliVeBIw%3D; uc3=id2=UUpnjMGWeTDxMA%3D%3D&vt3=F8dBy3MLoylZjTIKqDw%3D&lg2=W5iHLLyFOGW7aA%3D%3D&nk2=suEMAecR; csg=f0359cd1; lgc=%5Cu6768%5Cu7545%5Cu5E05; dnk=%5Cu6768%5Cu7545%5Cu5E05; skt=d1c02800fe0af2e7; existShop=MTU2Njg4NjA4OA%3D%3D; uc4=id4=0%40U2gtHRBkJk9a2SFfxwUCZdl9g6Mj&nk4=0%40sOlUtvsiedjt3d5KnKNpEJI%3D; tracknick=%5Cu6768%5Cu7545%5Cu5E05; _cc_=V32FPkk%2Fhw%3D%3D; tg=0; mt=ci=19_1; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; whl=-1%260%260%261566893100933; _m_h5_tk=ed82048ac357de15b1d9f408c5a87f3b_1567332023191; _m_h5_tk_enc=1ce5e64614f05ae7b3fe320776816210; uc1=cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&cookie21=U%2BGCWk%2F7p4mBoUyS4E9C&existShop=false&pas=0&cookie14=UoTaH0QlXL3bSQ%3D%3D&tag=8&lng=zh_CN; isg=BAUFdlAy9O_vYtC8ivmC0CRlFEiT0L5xhaOb-wdr0zw7niYQwREaJYs_qILN3tEM; l=cBNHS6EVqJJZw89-BOfNVQLf1P_OuIOf1sPP2doM4IB1951TMdIxHHwIzx_Bp3QQE95xUExySDo_2Rnp7yz3rAonhFSjOC0eQ"
        })
        self.downloader.middleware_list = []
        res = self.downloader.get(self.BASE_INFO_URL.format(itemId, sellerId))
        info = json_util.to_python(
            text_util.get_first_match(res.text, r"onSibRequestSuccess\((.*)\);"))

        print({
            "itemId": itemId,
            "sellerId": sellerId,
            "shopName": shopName.encode('utf-8').decode("unicode-escape"),
            "title": title.encode('utf-8').decode("unicode-escape"),
            "skuMap": json_util.to_python(skuMap),
            "propertyMemoMap": propertyMemoMap,
            "soldTotalCount": info["data"]["soldQuantity"]["confirmGoodsCount"],
            "stock": info["data"]["dynStock"]["stock"]
        }, end="\n")
def addWorker(workerNum, group, cookie):
    """
    Add workers.
    :return:
    """
    headers = {
        'Cookie': "{}".format(cookie),
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Host': "rhino.dev.datatub.com"
    }
    setting = Setting()
    setting.headers = headers
    json = Downloader(setting=setting).get_json(
        ADD_WORKER_URL.format(workerNum, group))
    LOG.info("ADD result {}".format(json))
    if json["code"] == 0:
        return True
class ProxyMiddleWare(MiddleWare):
    """Proxy middleware: picks a random proxy before each request."""

    downloader = Downloader()

    def __init__(self):
        self.logger = logger_util.get_logger(self.__class__)

    def before(self, url: str, data: Union[str, Dict], setting: Setting):
        bs4 = self.downloader.get_bs4(
            "http://proxy.datastory.com.cn/getADAllHost?id=rhino")
        ip = random_util.get_random_ele_from_list(bs4.select("ipserver ips ip"))
        setting.set_proxies(ip.select_one("host").text, ip.select_one("port").text)
        self.logger.info("Using proxy ip {} {}".format(
            ip.select_one("host").text, ip.select_one("port").text))

    def after(self, url: str, data: Union[str, Dict], setting: Setting, res: Response):
        pass
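# A minimal usage sketch for the middleware above, following the (commented-out)
# middleware_list hook seen in TaobaoCrawler.run. The target URL is only a
# placeholder taken from the proxy-testing script elsewhere in this repo.
downloader = Downloader()
downloader.middleware_list.append(ProxyMiddleWare())  # before() picks a random proxy per request
res = downloader.get("http://icanhazip.com")
print(res.text)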
def login():
    headers = {
        'Host': "rhino.dev.datatub.com",
        'Content-Length': "35",
        'Accept': "*/*",
        'Origin': "http://rhino.dev.datatub.com",
        'X-Requested-With': "XMLHttpRequest",
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
        'Referer': "http://rhino.dev.datatub.com/login.html",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Connection': "close"
    }
    payload = "userName=changshuai&passWord=123456"
    setting = Setting()
    setting.headers = headers
    setting.request = "POST"
    res = Downloader(setting=setting).payload(LOGIN_URL, payload)
    return res.cookies.get("JSESSIONID")
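# A hedged sketch of how the Rhino worker helpers in this file might be chained.
# Assumptions: the helpers expect a full Cookie header value ("JSESSIONID=..."),
# and the worker count (5) and group name ("full") are illustrative only.
cookie = "JSESSIONID={}".format(login())
if addWorker(5, "full", cookie):       # spin up workers
    selectWorker(cookie)               # check worker status
if closeWorker(cookie, group="full"):  # shut the group down
    clearWorker(cookie)                # clean up afterwards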
def test_get_first_match():
    res = Downloader.get_with_bs4(
        "https://www.zhipin.com/job_detail/3265d372b1182c951HR50t2-ElU~.html")
    print(text_util.get_first_match(res.text, "job_id: '(.*?)',"))
for poiId in poiIds:
    setting = Setting()
    setting.headers["Host"] = "i.waimai.meituan.com"
    setting.headers["Accept"] = "application/json"
    setting.headers["Referer"] = "https://h5.waimai.meituan.com/waimai/mindex/menu?dpShopId=&mtShopId={}&utm_source=&source=shoplist&initialLat=23.129112&initialLng=113.264385&actualLat=&actualLng=".format(poiId)
    setting.headers["Origin"] = "https://h5.waimai.meituan.com"
    setting.headers["User-Agent"] = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Mobile Safari/537.36"
    setting.headers["Content-Type"] = "application/x-www-form-urlencoded"
    setting.headers["Cookie"] = "uuid=86f1d5d1-b229-4c12-a7bd-10dc5ef16e35; terminal=i; utm_source=; au_trace_key_net=default; wm_order_channel=default; _lx_utm=utm_source%3D60066; cssVersion=2ef84fdd; w_token=Qy1uY6h0RnVOU73sk3xFEAYn9EsAAAAA5wgAAOe8BzkBC_CqjTAFX4W2RnmK7ZF9TKNcPV8HuP7MoY8V0BrOiUilE8Gmjv_IzPBsyA; w_utmz=\"utm_campaign=(direct)&utm_source=5000&utm_medium=(none)&utm_content=(none)&utm_term=(none)\"; openh5_uuid=2E42332BF3FA3E12F5CFCFA99E799888E8883DB85EF252B10CB184761FFDB340; w_latlng=0,0; w_actual_lat=23125756; w_actual_lng=113334698; w_visitid=ef165329-a0cb-4252-aec2-3aec93bd291e"
    setting.headers["Accept-Encoding"] = "gzip, deflate"
    setting.headers["Connection"] = "keep-alive"
    downloader = Downloader(setting=setting)

    post_data = {}
    post_data["geoType"] = "2"
    post_data["mtWmPoiId"] = "{}".format(poiId)
    post_data["dpShopId"] = "-1"
    post_data["source"] = "shoplist"
    post_data["skuId"] = ""
    post_data["uuid"] = "16d14d36a98c8-0e53a36f17ba0e-1a201708-1fa400-16d14d36a98c8"
    post_data["platform"] = "3"
    post_data["partner"] = "4"
    post_data["originUrl"] = "https://h5.waimai.meituan.com/waimai/mindex/menu?dpShopId=&mtShopId={}&utm_source=&source=shoplist&initialLat=23.129112&initialLng=113.264385&actualLat=&actualLng=".format(poiId)
    post_data["riskLevel"] = "71"
def crawlShopList():
    post_data_of_shop_list = {
        "startIndex": "0",
        "sortId": "0",
        "multiFilterIds": "",
        "sliderSelectCode": "",
        "sliderSelectMin": "",
        "sliderSelectMax": "",
        "geoType": "2",
        "rankTraceId": "",
        "uuid": "EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C",
        "platform": "3",
        "partner": "4",
        "originUrl": "http://h5.waimai.meituan.com/waimai/mindex/home",
        "riskLevel": "71",
        "optimusCode": "10",
        "wm_latitude": "23129112",
        "wm_longitude": "113264385",
        "wm_actual_latitude": "",
        "wm_actual_longitude": "",
        "openh5_uuid": "EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C",
        "_token": ("eJx9kG2PmkAUhf/LJNsvEgcFBExMI7CyuAgi43SlaRoEVORtweFFmv73DjHN7oemyU3uuc+9J3Myv0BlhGA+YSciO2FAE1VgDiZ"
                   "jdjwDDCA3uhFms6nMT1lR5KYMCD6ziShLPAOOFdbA/DsviIzET38MYEfnD/Chpjyt4cKgB+BCyPscwoswbv048+NxFsWk9vNxUG"
                   "TwgWAW52HUDS2iif7rSItznH89FVUQLUhVR1+OfpD8rKt0MdieuOXTdEXrn2bKH5CKx5MPEQGaNkNDWklmRFkY5mSYafc/cUYz8"
                   "DAjf3cb+pE08S0+51RF6ztJNNKc+6XTibKpFPBy9XVXkZZn44gR26VeYGAEnzXdsjOukje2j965xhqFsJDEXlfs/F632esltOvO"
                   "sStNJtoqxkQxrboVUdmn2DvlrnfGjZGqfUIi9ohqf71ODzsRWeW9MkoRbRLop5aXYhHjbO+tdi+512FuH2n6i08kf9PqppMZt76"
                   "8nirdNYMRQc9q70nu2lFRWzpmYJ52d0vNBadYKdtgyX4rePXWXgsrLXGSBq+yPiMHp+wdQdhgAzrI6HxO6i3prDus8d48q1Fbuc"
                   "o1JDDRot3L27ZVJ4p0mJmd6/ZvWQ/rI7lAKLTs3jo0SBZsDY1mr9zOHB3Y0ybcpn2A+aDZh/YWLxcL8PsPSaDfFA=="),
    }
    total = 0
    while True:
        # Download and parse
        res = Downloader.post(SHOP_SEARCH_URL.format(time_util.get_unix()),
                              post_data_of_shop_list, setting=setting)
        jsonObject = res.json()

        # Stop once enough shops have been collected
        if total > 2000:
            logger.info("Shop list exceeded 2000, stopping requests")
            break

        # Check for an expired session or API error
        if "data" not in jsonObject or not isinstance(jsonObject["data"], dict):
            logger.error("Request [{}]: login expired or API error".format(
                post_data_of_shop_list["startIndex"]))
            post_data_of_shop_list["startIndex"] = str(
                int(post_data_of_shop_list["startIndex"]) + 1)
            continue

        # Parse and extract
        try:
            for shop in jsonObject["data"]["shopList"]:
                shop["_id"] = shop["mtWmPoiId"]
                ttfDecrypt(shop)
                meituanwaimai_shop_list.save(shop)
                shop_list.add(shop["mtWmPoiId"])
            total += len(jsonObject["data"]["shopList"])
            # Saved to database
            logger.info("Request [{}]: fetched {} records".format(
                post_data_of_shop_list["startIndex"],
                len(jsonObject["data"]["shopList"])))
        except Exception as e:
            # logger.error("Request [{}]: parse or insert failed".format(
            #     post_data_of_shop_list["startIndex"]))
            traceback.print_exc()
        post_data_of_shop_list["startIndex"] = str(
            int(post_data_of_shop_list["startIndex"]) + 1)
        time_util.sleep(5)
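# A hedged driver sketch showing how crawlShopList, crawlFoodMenus, and crawlComment
# could be chained. It assumes shop_list is the module-level set that crawlShopList
# populates; the sleep interval is an illustrative throttle, not part of the original code.
crawlShopList()
for shopId in shop_list:
    crawlFoodMenus(shopId)
    crawlComment(shopId)
    time_util.sleep(2)  # illustrative throttle between shops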
def searchShopList(keyword: str, latitude: str, longitude: str, topN: int = 1000):
    post_data = (
        "geoType=2&cityId=1&secondCategoryId=&start=0&queryType=12002&keyword=%E7%B2%A5&categoryType=&entrance"
        "Id=0&uuid=16cb375798ab8-07b01f3c8b2835-1a201708-1fa400-16cb375798bc8&platform=3&partner=4&originUrl=h"
        "ttp%3A%2F%2Fh5.waimai.meituan.com%2Fwaimai%2Fmindex%2Fsearchresults%3FqueryType%3D12002%26keyword%3D%"
        "25E7%25B2%25A5%26entranceId%3D0%26qwTypeId%3D0%26mode%3Dsearch&riskLevel=71&optimusCode=10&wm_latitud"
        "e=23125801&wm_longitude=113334718&wm_actual_latitude=23125801&wm_actual_longitude=113334718&openh5_uu"
        "id=16cb375798ab8-07b01f3c8b2835-1a201708-1fa400-16cb375798bc8&_token=eJxVUNmOolAQ%2FReS7heNcEEWTcyETT"
        "bBVhHByTwgXBZZxMsmdubfh57uSWeSSs5SJ5WqeseQFmJLQACWAFOsgwhbYmBGzBhsijX12KEZhmIXAAAWEFMs%2BM8jAcFNsQtyJ"
        "Gz5kwZgys3JXx%2FGftTfxjcj52N9JLQxgCVNUy1xPKFnvZ8WfjorYNq0fjkLbgX%2BaeFFWobwgdfQR0GCYN3mTf3j3kI02EMFV4"
        "AkCPI1g0N%2FQ%2BHqRWZfBPKFp19h2SC%2FDKAWrojXe%2F%2BR%2FUuLWwhXn8PGA7FxlcIeVxkx%2B0L%2FC5t%2F2hw%2FMmb"
        "rNC5HBvUhP8rNtn%2Fyx05uc1hFSjn3PJ1vhMfO0HW%2FRw8qt5B5Jv1rK6tavAnEoo2jdi%2FmDsW%2B9ZzKUaZtZto1erOZxUHa"
        "yUhlLtlJ0BTLzW9rJhBvvAt5I0SiJVVROpl09mTP1Ar5UGM%2FFCR49wpnT5X6mdsN7vrgXaoyWW8P9aClWaFD70wGcf5oirMrFvB"
        "iaOQOpLQS28KglqH%2F1hXbhaqQFp5Dp67398ICfC1tY%2FawqXbPSSQja4FOnC6r2ZoBar3ZPVwRVybSMQGi%2BWTU0p64XBbMox"
        "ThTdhuIaRSP%2FLSqwLv%2BfXhcyfDmGSUeu0E2XSdilUvXsfxTGjwmXHTh8ZhQiEbZMM8ZxmFL1pLvwPHyNmGoPdEHFf66SSvk8G"
        "m6PT53MtZIinkxswF6DhDYncsf3QLJPQr7Pcfj3reuA%3D%3D")
    data = url_util.unquote_post_data(post_data)
    data["keyword"] = keyword
    data["start"] = "0"
    data["wm_latitude"] = latitude
    data["wm_longitude"] = longitude
    data["wm_actual_latitude"] = latitude
    data["wm_actual_longitude"] = longitude

    fieldList = fields.split(",")
    csv_util.save2csv_v3("poi.csv", fieldList)

    total = 0
    while True:
        jsonObject = None
        try:
            # Request and parse
            res = Downloader(setting).post(SEARCH_URL, data)
            jsonObject = res.json()
            logger.info("{} page {} fetched, count: {}".format(
                keyword, data["start"],
                len(jsonObject["data"]["searchPoiList"])))

            # Extract data
            total += len(jsonObject["data"]["searchPoiList"])
            ttfDecrypt(jsonObject["data"]["searchPoiList"])
            for poi in jsonObject["data"]["searchPoiList"]:
                poi["_id"] = poi["id"]
                poi["keyword"] = keyword
                poi["url"] = "http://h5.waimai.meituan.com/waimai/mindex/menu?mtShopId={}".format(poi["_id"])
                csv_util.save2csv_v3("poi.csv", poi, fieldList)

            # Check whether to page on
            if not jsonObject["data"]["hasNextPage"] or total > topN:
                logger.info("Page {}, total {}".format(data["start"], total))
                logger.info("Pagination finished")
                break
            data["start"] = str(int(data["start"]) + 1)
        except Exception as e:
            traceback.print_exc()
            data["start"] = str(int(data["start"]) + 1)
            logger.error("Page {} fetch error, jsonObject: {}".format(
                data["start"], jsonObject))
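# A quick usage sketch for searchShopList. The keyword (粥, the same value URL-encoded
# in the template payload above) and the coordinates are taken from this file and are
# illustrative only; results are written to poi.csv via csv_util.
searchShopList("粥", "23125801", "113334718", topN=500)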
# @Time   : 18-2-13 8:46 PM
# @Author : DioMryang
# @File   : test_downloader.py
from unittest import TestCase
# @Description :
from dio_core.network.downloader import Downloader
from dio_core.network.downloader.downloader import Setting

page = 0
while True:
    url = ("https://www.google.com/search?q=经贸磋商&tbs=cdr:1,cd_min:1/21/2019,cd_max:1/"
           "28/2019&tbm=nws&start={}")
    setting = Setting()
    setting.set_proxies("116.31.102.3", "57003")
    soup = Downloader.get_with_bs4(url.format(page), setting=setting).soup
    result = soup.select(".l.lLrAF")
    if not result:
        break
    for aTag in soup.select(".l.lLrAF"):
        print(aTag["href"])
    page += 10
    print()
def testTp(body):
    payload = url_util.urlencode(body)
    s = Setting(request="POST", headers=HEADERS)
    res = Downloader.payload(TEST_TEMPLATE_URL, data=payload, setting=s)
    return res.json()["data"]
def buildMainSite(url: str, parentId: int = -1):
    """
    Create a main site.
    :param parentId: parent site id
    :param url: site url
    :return: siteId
    """
    soup = Downloader.get_bs4(url)
    title = soup.select_one("title").text.replace("--", "dio").replace("-", "dio").replace("_", "dio")\
        .replace("——", "dio").replace("|", "dio").replace("·", "dio").replace(" ", "dio")
    name = ""
    host = url_util.get_host(url)
    print("host: " + host)
    for n in title.split("dio"):
        print("candidate site name: " + n)
        if input() in ("", "y", "Y", "Yes"):
            name = n.strip()
            break
    if parentId != -1:
        mainSiteName = getSiteInfo(parentId)["name"]
        name = "{}_{}".format(mainSiteName, name)
        if input("Append channel suffix? ") in ("", "y", "Y", "Yes"):
            name = "{}{}".format(name, "频道 ")  # append the "频道" (channel) suffix
    print("Final name: {}".format(name))

    siteQuery = {
        "name": name,
        "domain": url_util.get_host(url),
        "tags": [],
        "maxDepth": "2",
        "overtime": "",
        "params": {
            "spark.executors.num": "1",
            "spark.executor.core.num": "1",
            "error_fail": "0.9",
            "fail_queue": "OFF",
            "rhino.task.unique.manager.class": "com.datatub.rhino.framework.component.operatior.manager.unique.RedisListUniqueManager",
            "rhino.task.unique.manager.cache_size": "1000",
            "rhino.task.job.info.collector.class": "com.datatub.rhino.framework.component.collector.LocalJobInfoCollector"
        },
        "threshold": "",
        "frequency": "",
        "interval": "20",
        "cron": "",
        "template": [],
        "agent": [],
        "category": "3"
    }
    query = {"parentId": parentId, "site": json_util.to_json(siteQuery)}
    siteId = buildSite(query)
    print("Site manager url: " +
          "http://v3.rhino.datatub.com/#/gather/siteManager?site={}".format(siteId))
    return siteId
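# A hedged sketch of how the Rhino build helpers in this file (buildMainSite,
# buildTemplate, buildTask) might be combined. The parent id, URL, and the
# template/task body dicts below are placeholders; their real schema isn't shown here.
channelSiteId = buildMainSite("http://news.example.com/finance/", parentId=12345)  # hypothetical parent site
templateBody = {"siteId": channelSiteId}  # placeholder fields
templateId = buildTemplate(templateBody)
taskBody = {"siteId": channelSiteId, "templateId": templateId}  # placeholder fields
taskId = buildTask(taskBody)
print(channelSiteId, templateId, taskId)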
def main_downloader():
    proxies = [
        ["222.189.191.53", "9999"], ["182.111.64.7", "41766"], ["115.151.3.16", "9999"], ["121.233.206.151", "9999"],
        ["116.209.52.143", "9999"], ["1.198.72.234", "9999"], ["121.61.1.48", "9999"], ["183.148.133.22", "9999"],
        ["115.239.24.166", "9999"], ["110.52.235.226", "9999"], ["113.122.168.246", "9999"], ["59.62.165.99", "808"],
        ["218.91.112.42", "9999"], ["111.177.161.70", "9999"], ["110.52.235.231", "9999"], ["180.116.48.122", "9999"],
        ["113.122.168.23", "9999"], ["49.77.59.235", "8118"], ["110.52.235.173", "9999"], ["111.177.187.211", "9999"],
        ["124.94.192.206", "9999"], ["125.123.137.71", "9999"], ["121.61.1.222", "9999"], ["111.72.154.47", "9999"],
        ["125.123.138.26", "9999"], ["110.52.235.244", "9999"], ["121.61.24.254", "9999"], ["111.177.170.35", "9999"],
        ["42.53.73.131", "9999"], ["111.177.180.221", "9999"], ["111.177.170.11", "9999"], ["60.173.244.133", "41306"],
        ["116.209.59.131", "9999"], ["221.235.234.199", "9999"], ["110.52.235.76", "9999"], ["121.61.24.242", "9999"],
        ["112.87.69.158", "9999"], ["59.62.166.60", "9999"], ["59.62.166.172", "9999"], ["61.184.43.129", "9999"],
        ["110.52.235.70", "808"], ["116.209.56.164", "9999"], ["171.80.152.26", "9999"], ["110.52.235.79", "9999"],
        ["116.209.55.171", "9999"], ["116.209.52.190", "9999"], ["118.187.58.34", "53281"], ["110.52.235.67", "9999"],
        ["115.212.81.84", "8118"], ["121.31.158.51", "8123"], ["116.209.56.95", "9999"], ["116.209.56.179", "9999"],
        ["183.148.145.229", "9999"], ["121.61.3.223", "9999"], ["101.236.42.63", "8866"], ["111.176.31.69", "9999"],
        ["116.209.54.22", "9999"], ["116.209.57.233", "9999"], ["125.123.136.232", "9999"], ["27.29.95.209", "9999"],
        ["116.209.57.22", "9999"], ["112.85.174.44", "9999"], ["61.183.233.6", "54896"], ["116.209.59.150", "9999"],
        ["116.209.55.191", "9999"], ["116.209.56.125", "9999"], ["125.123.142.141", "9999"], ["59.62.167.130", "53128"],
        ["175.148.77.188", "1133"], ["116.209.52.177", "9999"], ["125.123.138.171", "9999"], ["111.181.65.0", "9999"],
        ["1.192.246.197", "9999"], ["111.177.179.8", "9999"], ["110.52.235.86", "9999"], ["120.35.12.105", "3128"],
        ["116.209.57.16", "9999"], ["59.45.16.10", "59156"], ["111.181.66.158", "9999"], ["112.85.130.51", "9999"],
        ["116.208.55.173", "9999"], ["115.151.5.177", "9999"], ["113.121.147.233", "9999"], ["171.80.0.190", "9999"],
        ["110.52.235.139", "9999"], ["121.61.3.176", "9999"], ["110.52.235.71", "9999"], ["110.52.235.114", "9999"],
        ["112.85.165.66", "9999"], ["116.209.59.174", "9999"], ["121.61.1.9", "9999"], ["112.85.174.93", "9999"],
        ["123.163.115.203", "9999"], ["180.119.141.144", "9999"], ["116.209.54.168", "9999"], ["116.209.58.45", "9999"],
        ["125.123.142.215", "9999"], ["110.52.235.196", "9999"],
    ]
    for proxy in proxies:
        setting = Setting()
        setting.set_proxies(proxy[0], proxy[1])
        setting.timeout = 10
        setting.repeat = 1
        print("using proxy", proxy)
        try:
            res = Downloader.get("http://icanhazip.com", setting=setting)
            print("success", proxy, res.text, res.status_code)
        except Exception as e:
            print("fail")
from dio_core.network.downloader import Downloader
from dio_core.utils import file_util, time_util

rows = file_util.readRows(
    "/home/changshuai/PycharmProjects/dio_core/Test/Data/kill_job_urls.txt")
for row in rows:
    url = "http://api.rhino.datatub.com/common/job/kill?job_id={}&token=5fa92f2a597cc60201780504be1028a7".format(row)
    res = Downloader.get(url)
    print(row, res.text, url)
    time_util.sleep(3)
def getTsUrls(self):
    res = Downloader.get(self.m3u8)
    result = text_util.get_all_match("#EXTINF:.*,\n(.*)", res.text)
    return zip(range(len(result)), result)
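# A hedged sketch of how getTsUrls and downloadTs (defined earlier) might be wired
# together. The class name, the queue/data attribute setup, the thread count, and the
# output filename are assumptions inferred from how those two methods use
# self.queue / self.data / self.m3u8; they are not part of the original code.
import queue
import threading


class M3u8Downloader(object):  # hypothetical container for the two methods above

    def __init__(self, m3u8_url):
        self.m3u8 = m3u8_url
        self.queue = queue.Queue()
        self.data = {}

    # Reuse the module-level functions above as methods of this sketch class.
    getTsUrls = getTsUrls
    downloadTs = downloadTs

    def run(self, threads=4, out_path="out.ts"):
        # Fill the queue with segment jobs, download them concurrently,
        # then concatenate the segments in index order.
        for ind, ts_url in self.getTsUrls():
            self.queue.put({"tsInd": ind, "url": ts_url})
        workers = [threading.Thread(target=self.downloadTs, args=(i,)) for i in range(threads)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
        with open(out_path, "wb") as f:
            for ind in sorted(self.data):
                f.write(self.data[ind])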
def buildTemplate(body):
    """Create a template."""
    payload = url_util.urlencode(body)
    s = Setting(request="POST", headers=HEADERS)
    res = Downloader.payload(BUILD_TEMPLATE_URL, data=payload, setting=s)
    return res.json()["data"]["id"]
def buildTask(body):
    payload = url_util.urlencode(body)
    s = Setting(request="POST", headers=HEADERS)
    res = Downloader.payload(BUILD_TASK_URL, data=payload, setting=s)
    print(res.json())
    return res.json()["data"]["id"]
from dio_core.network.downloader import Downloader

soup = Downloader.get_with_bs4("http://proxy.datastory.com.cn/getADAllHost?id=rhino").soup
for ip in soup.ipserver.select("ip"):
    print("http\t{}\t{}".format(ip.select_one("host").text, ip.select_one("port").text))
def getSiteInfo(siteId: int):
    url = GET_SITE_INFO_URL.format(siteId)
    s = Setting()
    s.setParams(headers=HEADERS)
    return Downloader.get(url, setting=s).json()["data"]["site"]
import logging

from dio_core.network.downloader import Downloader
from dio_core.utils import time_util

logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

while True:
    soup = Downloader.get_bs4("http://proxy.datastory.com.cn/getADAllHost?id=ss-teg")
    logger.info("count:{}".format(soup.select_one("count").text))
    for proxy in soup.select("ips id"):
        logger.info("proxy: {}".format(proxy.text))
    time_util.sleep(10)
for kwarg in kwargs:
    setting = Setting()
    setting.headers["Host"] = "i.waimai.meituan.com"
    setting.headers["Accept"] = "application/json"
    setting.headers["Referer"] = "https://h5.waimai.meituan.com/waimai/mindex/searchresults?queryType=11002&entranceId=0&keyword=%E5%B9%B2%E6%8D%9E%E8%9E%BA%E8%9B%B3%E7%B2%89&qwTypeId=11002&mode=1"
    setting.headers["Origin"] = "https://h5.waimai.meituan.com"
    setting.headers["User-Agent"] = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Mobile Safari/537.36"
    setting.headers["Content-Type"] = "application/x-www-form-urlencoded"
    setting.headers["Cookie"] = "uuid=86f1d5d1-b229-4c12-a7bd-10dc5ef16e45; terminal=i; utm_source=; au_trace_key_net=default; wm_order_channel=default; _lx_utm=utm_source%3D60066; w_token=Qy1uY6h0RnVOU73sk3xFEAYn9EsAAAAA5wgAAOe8BzkBC_CqjTAFX4W2RnmK7ZF9TKNcPV8HuP7MoY8V0BrOiUilE8Gmjv_IzPBsyA; w_utmz=\"utm_campaign=(direct)&utm_source=5000&utm_medium=(none)&utm_content=(none)&utm_term=(none)\"; w_actual_lat=23125756; w_actual_lng=113334698; openh5_uuid=16d14d36a98c8-0e53a36f17ba0e-1a201708-1fa400-16d14d36a98c8; w_latlng=23129112,113264385; cssVersion=4c2d803d; w_visitid=49523e1f-aa20-46a9-b219-4907cd3201be"
    setting.headers["Accept-Encoding"] = "gzip, deflate"
    setting.headers["Connection"] = "keep-alive"
    downloader = Downloader(setting=setting)

    post_data = {}
    post_data["geoType"] = "2"
    post_data["cityId"] = "1"
    post_data["secondCategoryId"] = ""
    post_data["start"] = "0"
    post_data["queryType"] = "11002"
    post_data["keyword"] = kwarg["keyword"]
    post_data["categoryType"] = ""
    post_data["entranceId"] = "0"
    post_data["uuid"] = "16d14d36a98c8-0e53a36f17ba0e-1a201708-1fa400-16d14d36a98c8"
    post_data["platform"] = "3"
    post_data["partner"] = "4"
    post_data["originUrl"] = "https://h5.waimai.meituan.com/waimai/mindex/searchresults?queryType=11002&entranceId=0&keyword=%E5%B9%B2%E6%8D%9E%E8%9E%BA%E8%9B%B3%E7%B2%89&qwTypeId=11002&mode=1"
import jsonpath

from dio_core.network.downloader import Downloader
from dio_core.utils import json_util, url_util, time_util
from dio_core_test.utils import text_util

keyword = "女装"
for i in range(100):
    html = Downloader.get(
        "https://shopsearch.taobao.com/browse/shop_search.htm?q={}&s={}".format(keyword, i * 20)).text
    data = json_util.to_python(
        text_util.get_first_match(html, "g_page_config = (.*);"))
    for shop in jsonpath.jsonpath(data, "$.mods.shoplist.data.shopItems.*"):
        # Shops whose icon title contains "天猫" are Tmall shops; the rest are Taobao shops.
        if "shopIcon" in shop and "title" in shop["shopIcon"] and "天猫" in shop["shopIcon"]["title"]:
            print("天猫\t{}\t{}".format(url_util.patch_url(shop["shopUrl"]), shop["procnt"]))
        else:
            print("淘宝\t{}\t{}".format(url_util.patch_url(shop["shopUrl"]), shop["procnt"]))
    time_util.sleep(5)