Example #1
from dio_core.network.downloader import Downloader
# json_util and url_util are assumed to live in dio_core.utils,
# the same package that provides time_util below.
from dio_core.utils import json_util, url_util


def buildMainSite(url: str, parentId: int = -1):
    """
    Build the main site.
    :param url: site url
    :param parentId: parent site id; -1 creates a top-level site
    :return: siteId
    """
    soup = Downloader.get_bs4(url)
    # Normalize the common title separators to the sentinel "dio" so the
    # page title can be split into candidate site names.
    title = soup.select_one("title").text
    for sep in ("--", "——", "-", "_", "|", "·", " "):
        title = title.replace(sep, "dio")
    name = ""
    host = url_util.get_host(url)
    print("host: " + host)

    # Ask the operator to confirm one of the candidate names.
    for n in title.split("dio"):
        print("site name: " + n)
        if input("use this name? [Y/n] ") in ("", "y", "Y", "Yes"):
            name = n.strip()
            break

    # Prefix child sites with their parent site's name.
    if parentId != -1:
        mainSiteName = getSiteInfo(parentId)["name"]
        name = "{}_{}".format(mainSiteName, name)

    # "频道" means "channel"; the suffix string is kept verbatim as stored.
    if input("append the channel suffix? [Y/n] ") in ("", "y", "Y", "Yes"):
        name = "{}{}".format(name, "频道  ")
    print("name: {}".format(name))
    siteQuery = {
        "name": name,
        "domain": host,
        "tags": [],
        "maxDepth": "2",
        "overtime": "",
        "params": {
            "spark.executors.num": "1",
            "spark.executor.core.num": "1",
            "error_fail": "0.9",
            "fail_queue": "OFF",
            "rhino.task.unique.manager.class":
                "com.datatub.rhino.framework.component.operatior.manager.unique.RedisListUniqueManager",
            "rhino.task.unique.manager.cache_size": "1000",
            "rhino.task.job.info.collector.class":
                "com.datatub.rhino.framework.component.collector.LocalJobInfoCollector"
        },
        "threshold": "",
        "frequency": "",
        "interval": "20",
        "cron": "",
        "template": [],
        "agent": [],
        "category": "3"
    }

    # Register the site and print a link to its management page.
    query = {"parentId": parentId, "site": json_util.to_json(siteQuery)}
    siteId = buildSite(query)
    print("site page url: " +
          "http://v3.rhino.datatub.com/#/gather/siteManager?site={}".format(
              siteId))
    return siteId
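
A minimal usage sketch, assuming getSiteInfo and buildSite are defined in the same module and the Rhino service is reachable; the news-site URLs are illustrative:

# Hypothetical usage: create a top-level site, then a channel under it.
mainSiteId = buildMainSite("http://news.example.com/")
channelSiteId = buildMainSite("http://news.example.com/sports/", parentId=mainSiteId)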
Example #2
import logging

from dio_core.network.downloader import Downloader
from dio_core.utils import time_util

# Send INFO-level records from this module's logger to a local file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

# Poll the proxy endpoint every 10 seconds and log the reported count
# along with each proxy entry.
while True:
    soup = Downloader.get_bs4(
        "http://proxy.datastory.com.cn/getADAllHost?id=ss-teg")
    logger.info("count: {}".format(soup.select_one("count").text))

    for proxy in soup.select("ips id"):
        logger.info("proxy: {}".format(proxy.text))
    time_util.sleep(10)
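
For reference, a minimal sketch of how the polled proxies might be consumed, assuming each ips/id entry is a "host:port" string as the selectors above suggest; the requests call and the target URL are illustrative:

import requests

# Hypothetical consumer: route one request through the first proxy entry.
soup = Downloader.get_bs4("http://proxy.datastory.com.cn/getADAllHost?id=ss-teg")
ips = [tag.text for tag in soup.select("ips id")]
if ips:
    proxies = {"http": "http://" + ips[0]}
    resp = requests.get("http://example.com", proxies=proxies, timeout=10)
    logger.info("proxy {} returned status {}".format(ips[0], resp.status_code))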