def closeWorker(cookie, group="full"):
    """Close the workers of a worker group.

    Sends ``workerGroup=<group>`` to ``CLOSE_WORKER_URL`` through a
    ``Downloader`` whose ``Setting`` is configured for ``POST``, logs the
    decoded JSON reply and reports whether its ``code`` field is 0.

    :param cookie: value sent as the ``Cookie`` request header.
    :param group: worker group name placed in the form body
        (default ``"full"``).
    :return: ``True`` when the reply's ``code`` equals 0, ``False``
        otherwise (this path used to fall through and return ``None``).
    """
    headers = {
        'Cookie': cookie,
        'Origin': "http://rhino.dev.datatub.com",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
        'Accept': "*/*",
        'Referer': "http://rhino.dev.datatub.com/",
        'X-Requested-With': "XMLHttpRequest",
        'Connection': "keep-alive"
    }
    setting = Setting()
    setting.headers = headers
    setting.request = "POST"
    res = Downloader(setting=setting).payload(CLOSE_WORKER_URL, "workerGroup={}".format(group))
    # Decode the body once and reuse it for both the log line and the check.
    result = res.json()
    LOG.info("CLOSE WORKER 结果 {}".format(result))
    return result["code"] == 0
def clearWorker(cookie):
    """Clear workers via ``CLEAR_WORKER_URL``.

    Issues the request through a ``Downloader`` whose ``Setting`` is
    configured for ``POST``, logs the decoded JSON reply and reports
    whether its ``code`` field is 0.

    :param cookie: value formatted into the ``Cookie`` request header.
    :return: ``True`` when the reply's ``code`` equals 0, ``False``
        otherwise (this path used to fall through and return ``None``).
    """
    headers = {
        'Cookie': "{}".format(cookie),
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Host': "rhino.dev.datatub.com"
    }
    setting = Setting()
    setting.headers = headers
    setting.request = "POST"
    res = Downloader(setting=setting).request(CLEAR_WORKER_URL)
    # Decode the body once and reuse it for both the log line and the check.
    result = res.json()
    LOG.info("Clear 结果 {}".format(result))
    return result["code"] == 0
def init(url):
    """Enqueue every ``.ts`` segment URL of the video embedded in ``url``.

    Reads the ``src`` attribute of the ``.yunbofang video`` element on the
    page, downloads the text behind that address, takes the last
    ``index<N>.ts`` match found in it and puts one
    ``{"url": ..., "ind": i}`` item on the module-level ``q`` for every
    ``i`` from 0 to ``N`` inclusive.

    :param url: address of the page that embeds the video player.
    :raises IndexError: when no ``index<N>.ts`` entry is matched
        (assuming ``text_util.get_all_match`` returns an empty list then;
        TODO confirm).
    """
    soup = Downloader.get_with_bs4(url).soup
    m3u8_url = soup.select_one(".yunbofang video").attrs["src"]

    s = Setting()
    s.returnFailReq = True
    text = Downloader.get(m3u8_url, setting=s).text

    # Raw strings keep ``\d`` a regex escape instead of an invalid
    # string-literal escape; the pattern text itself is unchanged.
    result = text_util.get_all_match(r"index\d+.ts", text)
    ts = result[-1]
    d = text_util.get_first_match(ts, r"(\d+)")

    # Everything before "index" in the playlist URL is the shared prefix
    # of the segment URLs.
    url_front = m3u8_url.split("index")[0]
    for i in range(int(d) + 1):
        q.put({"url": url_front + "index" + str(i) + ".ts", "ind": i})
def buildSite(body):
    """Create a site and return its id.

    ``body`` is encoded with ``url_util.urlencode`` and sent to
    ``MAIN_URL`` using a ``POST`` setting carrying ``HEADERS``. The decoded
    JSON reply is printed, and its ``data.id`` value is returned.
    """
    form = url_util.urlencode(body)
    post_setting = Setting(request="POST", headers=HEADERS)
    response = Downloader.payload(MAIN_URL, data=form, setting=post_setting)
    print(response.json())
    created = response.json()["data"]
    return created["id"]
def selectWorker(cookie):
    """Query the workers via ``SELECT_WORKER_URL``.

    Fetches the URL as JSON through a ``Downloader`` carrying the given
    cookie, logs the reply and reports whether its ``code`` field is 0.

    :param cookie: value formatted into the ``Cookie`` request header.
    :return: ``True`` when the reply's ``code`` equals 0, ``False``
        otherwise (this path used to fall through and return ``None``).
    """
    headers = {
        'Cookie': "{}".format(cookie),
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Host': "rhino.dev.datatub.com"
    }
    setting = Setting()
    setting.headers = headers
    # Named ``result`` rather than ``json`` so the stdlib module name is
    # not shadowed.
    result = Downloader(setting=setting).get_json(SELECT_WORKER_URL)
    # ``LOG.info`` matches the other worker helpers in this file;
    # ``LOG.log`` would need a level as its first argument if LOG is a
    # standard ``logging.Logger`` (NOTE(review): confirm LOG's type).
    LOG.info("SELECT 结果 {}".format(result))
    return result["code"] == 0
def addWorker(workerNum, group, cookie):
    """Add workers to a group.

    Formats ``workerNum`` and ``group`` into ``ADD_WORKER_URL``, fetches it
    as JSON through a ``Downloader`` carrying the given cookie, logs the
    reply and reports whether its ``code`` field is 0.

    :param workerNum: first value substituted into ``ADD_WORKER_URL``.
    :param group: second value substituted into ``ADD_WORKER_URL``.
    :param cookie: value formatted into the ``Cookie`` request header.
    :return: ``True`` when the reply's ``code`` equals 0, ``False``
        otherwise (this path used to fall through and return ``None``).
    """
    headers = {
        'Cookie': "{}".format(cookie),
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Host': "rhino.dev.datatub.com"
    }
    setting = Setting()
    setting.headers = headers
    # Named ``result`` rather than ``json`` so the stdlib module name is
    # not shadowed.
    result = Downloader(setting=setting).get_json(
        ADD_WORKER_URL.format(workerNum, group))
    LOG.info("ADD 结果 {}".format(result))
    return result["code"] == 0
def login(userName="changshuai", passWord="123456"):
    """Log in at ``LOGIN_URL`` and return the ``JSESSIONID`` cookie.

    The form body is built from the two arguments and sent through a
    ``Downloader`` whose ``Setting`` is configured for ``POST``. With the
    default arguments the body and the ``Content-Length`` header are
    identical to the previously hard-coded values.

    :param userName: account name sent as the ``userName`` form field.
    :param passWord: password sent as the ``passWord`` form field.
    :return: whatever ``res.cookies.get("JSESSIONID")`` yields for the
        login response.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlencode

    # NOTE(review): the defaults keep the original embedded credentials
    # for backward compatibility; prefer passing them in explicitly.
    payload = urlencode({"userName": userName, "passWord": passWord})

    headers = {
        'Host': "rhino.dev.datatub.com",
        # urlencode output is pure ASCII, so its character count is also
        # its byte length ("35" for the default credentials).
        'Content-Length': str(len(payload)),
        'Accept': "*/*",
        'Origin': "http://rhino.dev.datatub.com",
        'X-Requested-With': "XMLHttpRequest",
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
        'Referer': "http://rhino.dev.datatub.com/login.html",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Connection': "close"
    }

    setting = Setting()
    setting.headers = headers
    setting.request = "POST"
    res = Downloader(setting=setting).payload(LOGIN_URL, payload)
    return res.cookies.get("JSESSIONID")
def getSiteInfo(siteId: int):
    """Return the ``data.site`` entry of the site-info reply for ``siteId``.

    ``siteId`` is formatted into ``GET_SITE_INFO_URL`` and the result is
    fetched with ``Downloader.get`` using a setting that carries
    ``HEADERS``.
    """
    target = GET_SITE_INFO_URL.format(siteId)
    request_setting = Setting()
    request_setting.setParams(headers=HEADERS)
    response = Downloader.get(target, setting=request_setting)
    reply = response.json()
    return reply["data"]["site"]
def testTp(body):
    """Submit ``body`` to ``TEST_TEMPLATE_URL`` and return the reply's ``data``.

    ``body`` is encoded with ``url_util.urlencode`` and sent using a
    ``POST`` setting carrying ``HEADERS``.
    """
    form = url_util.urlencode(body)
    post_setting = Setting(request="POST", headers=HEADERS)
    response = Downloader.payload(TEST_TEMPLATE_URL, data=form, setting=post_setting)
    reply = response.json()
    return reply["data"]
def buildTask(body):
    """Create a task and return its id.

    ``body`` is encoded with ``url_util.urlencode`` and sent to
    ``BUILD_TASK_URL`` using a ``POST`` setting carrying ``HEADERS``. The
    decoded JSON reply is printed, and its ``data.id`` value is returned.
    """
    form = url_util.urlencode(body)
    post_setting = Setting(request="POST", headers=HEADERS)
    response = Downloader.payload(BUILD_TASK_URL, data=form, setting=post_setting)
    print(response.json())
    created = response.json()["data"]
    return created["id"]
def buildTemplate(body):
    """Create a template and return its id.

    ``body`` is encoded with ``url_util.urlencode`` and sent to
    ``BUILD_TEMPLATE_URL`` using a ``POST`` setting carrying ``HEADERS``;
    the ``data.id`` value of the decoded JSON reply is returned.
    """
    form = url_util.urlencode(body)
    post_setting = Setting(request="POST", headers=HEADERS)
    response = Downloader.payload(BUILD_TEMPLATE_URL, data=form, setting=post_setting)
    created = response.json()["data"]
    return created["id"]
SHOP_SEARCH_URL = "http://i.waimai.meituan.com/openh5/homepage/poilist?_={}" FOOD_URL = "http://i.waimai.meituan.com/openh5/poi/food" COMMENT_URL = "http://i.waimai.meituan.com/openh5/poi/comments" # mongodb 配置 client = pymongo.MongoClient(host='localhost', port=27017) db = client['meituan'] meituanwaimai_shop_list = db['meituanwaimai_shop_list_v1'] meituanwaimai_search_list = db['meituanwaimai_search_list'] meituanwaimai_food_list = db['meituanwaimai_food_list_v1'] meituanwaimai_comment_list = db['meituanwaimai_comment_list_v1'] decrypt_collection = db["meituanwaimai_decrypt"] decrypt = decrypt_collection.find_one({"_id": 1})["match"] setting = Setting() setting.headers = { "Origin": "http://h5.waimai.meituan.com", "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Mobile Safari/537.36", "Referer": "http://h5.waimai.meituan.com/waimai/mindex/home", "Host": "i.waimai.meituan.com", # "Cookie": '_lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=16cae674b36c8-01bf1cb1fd7b27-29792349-5df1a-16cae674b36c8; _ga=GA1.3.2021950140.1566294101; _gid=GA1.3.138356466.1566294101; terminal=i; w_utmz="utm_campaign=(direct)&utm_source=5000&utm_medium=(none)&utm_content=(none)&utm_term=(none)"; w_uuid=23UFuT0gtxcARBkc-Il6MOl2sihlNPJY7at6eTLmNTIUTgjS2XMaI4KAEWZGc_jm; utm_source=0; wx_channel_id=0; JSESSIONID=ok0xdngf40ahhx0dbvhg0atd; webp=1; __mta=45921691.1566294104072.1566294104072.1566294104072.1; w_addr=; w_actual_lat=23124630; w_actual_lng=113361990; wm_order_channel=default; utm_source=; au_trace_key_net=default; iuuid=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C; token=yDDjkm8ceYOgcuQ4qyaFMy3swNQAAAAA5wgAAHkBWapv9B8SL5PDs_OYrNT8vbNXv7Ua7Ty5u_9n3opjKT5T8ZXLpsrAm3n9Xf-cIg; mt_c_token=yDDjkm8ceYOgcuQ4qyaFMy3swNQAAAAA5wgAAHkBWapv9B8SL5PDs_OYrNT8vbNXv7Ua7Ty5u_9n3opjKT5T8ZXLpsrAm3n9Xf-cIg; 
oops=yDDjkm8ceYOgcuQ4qyaFMy3swNQAAAAA5wgAAHkBWapv9B8SL5PDs_OYrNT8vbNXv7Ua7Ty5u_9n3opjKT5T8ZXLpsrAm3n9Xf-cIg; userId=64012031; cssVersion=e09c1174; _lxsdk=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C; openh5_uuid=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C; uuid=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C; w_token=yDDjkm8ceYOgcuQ4qyaFMy3swNQAAAAA5wgAAHkBWapv9B8SL5PDs_OYrNT8vbNXv7Ua7Ty5u_9n3opjKT5T8ZXLpsrAm3n9Xf-cIg; openh5_uuid=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C,_lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=16cae674b36c8-01bf1cb1fd7b27-29792349-5df1a-16cae674b36c8; _ga=GA1.3.2021950140.1566294101; _gid=GA1.3.138356466.1566294101; terminal=i; w_utmz="utm_campaign=(direct)&utm_source=5000&utm_medium=(none)&utm_content=(none)&utm_term=(none)"; w_uuid=23UFuT0gtxcARBkc-Il6MOl2sihlNPJY7at6eTLmNTIUTgjS2XMaI4KAEWZGc_jm; utm_source=0; wx_channel_id=0; JSESSIONID=ok0xdngf40ahhx0dbvhg0atd; webp=1; __mta=45921691.1566294104072.1566294104072.1566294104072.1; w_addr=; w_actual_lat=23124630; w_actual_lng=113361990; wm_order_channel=default; utm_source=; au_trace_key_net=default; iuuid=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C; token=yDDjkm8ceYOgcuQ4qyaFMy3swNQAAAAA5wgAAHkBWapv9B8SL5PDs_OYrNT8vbNXv7Ua7Ty5u_9n3opjKT5T8ZXLpsrAm3n9Xf-cIg; mt_c_token=yDDjkm8ceYOgcuQ4qyaFMy3swNQAAAAA5wgAAHkBWapv9B8SL5PDs_OYrNT8vbNXv7Ua7Ty5u_9n3opjKT5T8ZXLpsrAm3n9Xf-cIg; oops=yDDjkm8ceYOgcuQ4qyaFMy3swNQAAAAA5wgAAHkBWapv9B8SL5PDs_OYrNT8vbNXv7Ua7Ty5u_9n3opjKT5T8ZXLpsrAm3n9Xf-cIg; userId=64012031; cssVersion=e09c1174; _lxsdk=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C; openh5_uuid=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C; uuid=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C; w_token=yDDjkm8ceYOgcuQ4qyaFMy3swNQAAAAA5wgAAHkBWapv9B8SL5PDs_OYrNT8vbNXv7Ua7Ty5u_9n3opjKT5T8ZXLpsrAm3n9Xf-cIg; 
openh5_uuid=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C; uuid=86f1d5d1-b229-4c12-a7bd-10dc5ef16e45; terminal=i; w_utmz="utm_campaign=(direct)&utm_source=5000&utm_medium=(none)&utm_content=(none)&utm_term=(none)"; utm_source=; au_trace_key_net=default; wm_order_channel=default; _lx_utm=utm_source%3D60066; cssVersion=d70fc3f0; w_actual_lat=23125752; w_actual_lng=113334715; w_latlng=0,0; w_token=yDDjkm8ceYOgcuQ4qyaFMy3swNQAAAAA5wgAAHkBWapv9B8SL5PDs_OYrNT8vbNXv7Ua7Ty5u_9n3opjKT5T8ZXLpsrAm3n9Xf-cIg; openh5_uuid=EDAF663058DC10DD9DA74BF73550627E0BA82B4963E47B9FAEA941817D9F2F1C; w_visitid=58b6dcfb-0534-4b2a-9faa-a6f00f06d908', "Accept": "application/json", "Content-Type": "application/x-www-form-urlencoded", "Accept-Language": "zh-CN,zh;q=0.9", } post_data_of_food_list = { "geoType": "2",
def main_downloader(proxies=None):
    """Probe HTTP proxies by fetching http://icanhazip.com through each one.

    For every ``[host, port]`` pair a fresh ``Setting`` is created with
    that proxy, ``timeout = 10`` and ``repeat = 1``. The outcome is
    printed: ``success`` with the response text and status code, or
    ``fail`` with the proxy and the error that was raised.

    :param proxies: optional iterable of ``[host, port]`` pairs to probe.
        When omitted (``None``) the built-in list below is used, which is
        the behaviour of the original zero-argument call.
    :return: ``None``; results are only printed.
    """
    if proxies is None:
        proxies = [
            ["222.189.191.53", "9999"], ["182.111.64.7", "41766"], ["115.151.3.16", "9999"],
            ["121.233.206.151", "9999"], ["116.209.52.143", "9999"], ["1.198.72.234", "9999"],
            ["121.61.1.48", "9999"], ["183.148.133.22", "9999"], ["115.239.24.166", "9999"],
            ["110.52.235.226", "9999"], ["113.122.168.246", "9999"], ["59.62.165.99", "808"],
            ["218.91.112.42", "9999"], ["111.177.161.70", "9999"], ["110.52.235.231", "9999"],
            ["180.116.48.122", "9999"], ["113.122.168.23", "9999"], ["49.77.59.235", "8118"],
            ["110.52.235.173", "9999"], ["111.177.187.211", "9999"], ["124.94.192.206", "9999"],
            ["125.123.137.71", "9999"], ["121.61.1.222", "9999"], ["111.72.154.47", "9999"],
            ["125.123.138.26", "9999"], ["110.52.235.244", "9999"], ["121.61.24.254", "9999"],
            ["111.177.170.35", "9999"], ["42.53.73.131", "9999"], ["111.177.180.221", "9999"],
            ["111.177.170.11", "9999"], ["60.173.244.133", "41306"], ["116.209.59.131", "9999"],
            ["221.235.234.199", "9999"], ["110.52.235.76", "9999"], ["121.61.24.242", "9999"],
            ["112.87.69.158", "9999"], ["59.62.166.60", "9999"], ["59.62.166.172", "9999"],
            ["61.184.43.129", "9999"], ["110.52.235.70", "808"], ["116.209.56.164", "9999"],
            ["171.80.152.26", "9999"], ["110.52.235.79", "9999"], ["116.209.55.171", "9999"],
            ["116.209.52.190", "9999"], ["118.187.58.34", "53281"], ["110.52.235.67", "9999"],
            ["115.212.81.84", "8118"], ["121.31.158.51", "8123"], ["116.209.56.95", "9999"],
            ["116.209.56.179", "9999"], ["183.148.145.229", "9999"], ["121.61.3.223", "9999"],
            ["101.236.42.63", "8866"], ["111.176.31.69", "9999"], ["116.209.54.22", "9999"],
            ["116.209.57.233", "9999"], ["125.123.136.232", "9999"], ["27.29.95.209", "9999"],
            ["116.209.57.22", "9999"], ["112.85.174.44", "9999"], ["61.183.233.6", "54896"],
            ["116.209.59.150", "9999"], ["116.209.55.191", "9999"], ["116.209.56.125", "9999"],
            ["125.123.142.141", "9999"], ["59.62.167.130", "53128"], ["175.148.77.188", "1133"],
            ["116.209.52.177", "9999"], ["125.123.138.171", "9999"],
            ["111.181.65.0", "9999"], ["1.192.246.197", "9999"], ["111.177.179.8", "9999"],
            ["110.52.235.86", "9999"], ["120.35.12.105", "3128"], ["116.209.57.16", "9999"],
            ["59.45.16.10", "59156"], ["111.181.66.158", "9999"], ["112.85.130.51", "9999"],
            ["116.208.55.173", "9999"], ["115.151.5.177", "9999"], ["113.121.147.233", "9999"],
            ["171.80.0.190", "9999"], ["110.52.235.139", "9999"], ["121.61.3.176", "9999"],
            ["110.52.235.71", "9999"], ["110.52.235.114", "9999"], ["112.85.165.66", "9999"],
            ["116.209.59.174", "9999"], ["121.61.1.9", "9999"], ["112.85.174.93", "9999"],
            ["123.163.115.203", "9999"], ["180.119.141.144", "9999"], ["116.209.54.168", "9999"],
            ["116.209.58.45", "9999"], ["125.123.142.215", "9999"], ["110.52.235.196", "9999"],
        ]

    for proxy in proxies:
        setting = Setting()
        setting.set_proxies(proxy[0], proxy[1])
        setting.timeout = 10
        setting.repeat = 1
        print("使用代理", proxy)
        try:
            res = Downloader.get("http://icanhazip.com", setting=setting)
            print("success", proxy, res.text, res.status_code)
        except Exception as e:
            # Best-effort probe: one dead proxy must not stop the run, but
            # report which proxy failed and why instead of a bare "fail".
            print("fail", proxy, e)
# @Time : 18-2-13 下午8:46 # @Author : DioMryang # @File : test_downloader.py from unittest import TestCase # @Description : from dio_core.network.downloader import Downloader from dio_core.network.downloader.downloader import Setting page = 0 while True: url = ( "https://www.google.com/search?q=经贸磋商&tbs=cdr:1,cd_min:1/21/2019,cd_max:1/" + "28/2019&tbm=nws&start={}") setting = Setting() setting.set_proxies("116.31.102.3", "57003") soup = Downloader.get_with_bs4(url.format(page), setting=setting).soup result = soup.select(".l.lLrAF") if not result: break for aTag in soup.select(".l.lLrAF"): print(aTag["href"]) page += 10 print()
import time import traceback from dio_core.network.downloader.downloader import Downloader from dio_core.network.downloader.downloader import Setting from dio_core.utils import file_util, parse_util, json_util from dio_core_test.utils import text_util setting = Setting() setting.headers["Cookie"] = ("DSCKID=91aecd4c-9d62-49ad-ae1b-9eb177c787ac; JSESSIONID=5AD6666BE97FEC415491055AFAFA60FE;" " seraph.rememberme.cookie=13124%3A5ad60cddb478faeca22570e7f156f07e5138011a; atlassian.xsr" "f.token=BP2B-R8C4-N6CQ-HZD4_9ab65e787a0932dd3b1abcc52a792e40d154415c_lin; jira.editor.use" "r.mode=wysiwyg") setting.htmlParse = True rows = list(file_util.readRows("/home/changshuai/PycharmProjects/dio_core/dio_core_test/Data/JIRA_LIST.txt")) allMsg = [] for url in rows: try: res = Downloader.get(url, setting) repeatsText = text_util.get_first_match(res.text, "WRM._unparsedData\[\"activity-panel-pipe-id\"\]=\"(.*)\";") repeats = repeatsText.encode("utf-8").decode("unicode-escape").encode("utf-8").decode("unicode-escape").replace("\\/", "/") soup = parse_util.get_bs4_soup(repeats.strip("\"")) [_.extract() for _ in res.soup.select_one("#description-val").select(".user-hover")] msgInfo = { "title": res.soup.select_one("#summary-val").text.strip(), "id": text_util.get_first_match(url, "/(CP-\d+)").strip(),