from dio_core.network.downloader import Downloader
from dio_core.utils import json_util, url_util

# Note: getSiteInfo and buildSite are assumed to be defined elsewhere in this
# module; they are not shown in this fragment.


def buildMainSite(url: str, parentId: int = -1):
    """
    Build a main site.
    :param url: site url
    :param parentId: parent site id (-1 means no parent)
    :return: siteId
    """
    soup = Downloader.get_bs4(url)
    # Normalize the common title separators to a single sentinel ("dio"),
    # then split on it to enumerate candidate site names.
    title = soup.select_one("title").text.replace("--", "dio").replace("-", "dio")\
        .replace("_", "dio").replace("——", "dio").replace("|", "dio")\
        .replace("·", "dio").replace(" ", "dio")
    name = ""
    host = url_util.get_host(url)
    print("host: " + host)
    # Offer each candidate to the operator; empty input or y/Y/Yes accepts it.
    for n in title.split("dio"):
        print("site name: " + n)
        if input() in ("", "y", "Y", "Yes"):
            name = n.strip()
            break
    if parentId != -1:
        # Sub-site: prefix the chosen name with the parent site's name.
        mainSiteName = getSiteInfo(parentId)["name"]
        name = "{}_{}".format(mainSiteName, name)
        if input("Append channel suffix? ") in ("", "y", "Y", "Yes"):
            name = "{}{}".format(name, "频道")  # "频道" means "channel"
    print("name: {}".format(name))
    siteQuery = {
        "name": name,
        "domain": host,
        "tags": [],
        "maxDepth": "2",
        "overtime": "",
        "params": {
            "spark.executors.num": "1",
            "spark.executor.core.num": "1",
            "error_fail": "0.9",
            "fail_queue": "OFF",
            "rhino.task.unique.manager.class": "com.datatub.rhino.framework.component.operatior.manager.unique.RedisListUniqueManager",
            "rhino.task.unique.manager.cache_size": "1000",
            "rhino.task.job.info.collector.class": "com.datatub.rhino.framework.component.collector.LocalJobInfoCollector"
        },
        "threshold": "",
        "frequency": "",
        "interval": "20",
        "cron": "",
        "template": [],
        "agent": [],
        "category": "3"
    }
    query = {"parentId": parentId, "site": json_util.to_json(siteQuery)}
    siteId = buildSite(query)
    print("site page url: " +
          "http://v3.rhino.datatub.com/#/gather/siteManager?site={}".format(siteId))
    return siteId
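# A minimal usage sketch (an assumption, not part of the original source):
# first register a main site, then register one of its channels under it.
# The URLs below are placeholders.
mainSiteId = buildMainSite("http://news.example.com")
channelSiteId = buildMainSite("http://news.example.com/sports", parentId=mainSiteId)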
import logging

from dio_core.network.downloader import Downloader
from dio_core.utils import time_util

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# Poll the proxy service every 10 seconds and log the current proxy pool.
while True:
    soup = Downloader.get_bs4(
        "http://proxy.datastory.com.cn/getADAllHost?id=ss-teg")
    logger.info("count: {}".format(soup.select_one("count").text))
    for proxy in soup.select("ips id"):
        logger.info("proxy: {}".format(proxy.text))
    time_util.sleep(10)
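# Optional sketch (an assumption, not in the original script): as written,
# records only go to log.txt. To mirror them to the console as well, a
# StreamHandler can be attached before entering the loop, reusing the same
# formatter.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
logger.addHandler(console)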