import base64
import inspect
import json
import os
import uuid

import requests

# hlog (logging helper), api_config (endpoint configuration), session and Film
# (the SQLAlchemy session/model used by get_film_list) are project-specific
# objects assumed to be importable from the surrounding project.


def get_start_url():
    """
    Read the crawler configuration and fetch the start URL.
    :return: url: the URL to crawl; num: the number of pages to crawl
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    headers = {
        "content-type": "application/json"
    }

    url = "http://%s:%s%s"%(
        api_config.source_host,
        api_config.source_port,
        api_config.source_uri
    )
    hlog.var("url", url)
    response = requests.get(url=url, headers= headers)
    if 200 == response.status_code:
        response_string = response.text
        response_json = json.loads(response_string)
        if "SUCCESS" == response_json["code"]:
            hlog.info("获取起始url成功")
            hlog.exit_func(func_name)
            """
            返回结果需要返回
            {
                "code": "SUCCESS",
                "message": "成功",
                "result": {
                    "url": "https://www.zhipin.com/c101270100-p100199/",
                    "num": 3
                    }
            }
            现在返回的是
            {
                "code": "SUCCESS",
                "message": "成功",
                "result": "https://www.zhipin.com/c101270100-p100199/"
            }
            """
            return response_json["result"], 3

    hlog.debug("获取起始url失败,请检查网络")
    hlog.exit_func(func_name)
    return "", 0
def get_film_list():
    """
    获取电影信息
    :return:
    """

    from utils import get_urls

    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    film_list = list()
    film_objs = session.query(Film).all()

    for obj in film_objs:
        film_id = obj.id
        hlog.var('film_id', film_id)

        location_list = get_urls('Location', film_id)
        people_list = get_urls('People', film_id)
        specie_list = get_urls('Specie', film_id)
        vehicle_list = get_urls('Vehicle', film_id)

        film = {
            "id": obj.id,
            "title": obj.title,
            "description": obj.description,
            "director": obj.director,
            "producer": obj.producer,
            "release_date": obj.release_date,
            "rt_score": obj.rt_score,
            "url": obj.url,
            "people": people_list,
            "species": specie_list,
            "locations": location_list,
            "vehicles": vehicle_list
        }

        film_list.append(film)

    hlog.info("读取电影信息成功。")
    hlog.exit_func(func_name)

    return film_list
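
# A minimal usage sketch (illustrative assumption, not part of the original
# project): dump the list built by get_film_list() to a JSON file. default=str
# keeps the dump working if release_date is stored as a date object.
def _example_dump_film_list(path="films.json"):
    films = get_film_list()
    with open(path, "w", encoding="utf-8") as fp:
        json.dump(films, fp, ensure_ascii=False, indent=2, default=str)
    return len(films)
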
def get_platform(url):
    """
    Extract the main part of the site's domain from a URL.
    :param url: the URL to parse
    :return: the platform name, or "" if it cannot be determined
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)
    hlog.var("url", url)

    platform = ""
    try:
        domain = url.split("/")[2]       # e.g. "www.zhipin.com"
        platform = domain.split(".")[1]  # e.g. "zhipin"
        hlog.var("platform", platform)
        hlog.info("Got the main part of the site domain successfully")
    except (AttributeError, IndexError):
        hlog.debug("Failed to get the main part of the site domain")

    hlog.exit_func(func_name)
    return platform
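
# A minimal sketch of the expected behaviour of get_platform(), using the sample
# listing URL from get_start_url() above (illustrative helper, not part of the
# original project).
def _example_platform_from_url():
    # url.split("/")[2] -> "www.zhipin.com"; its second dot-separated label is
    # taken as the platform name.
    assert get_platform("https://www.zhipin.com/c101270100-p100199/") == "zhipin"
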
def send_data(source_url, htmlString, platform):
    """
    Post a crawl result to the target API.
    :param source_url: the URL that was crawled
    :param htmlString: the crawled page as an HTML string
    :param platform: the platform (site) name
    :return:
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    hlog.var("source_url", source_url)

    if "" == htmlString:
        hlog.exit_func(func_name)
        return

    spider_uuid = uuid.uuid1()
    hlog.var("spider_uuid", spider_uuid)

    encodedBytes = base64.b64encode(htmlString.encode("utf-8"))
    encodedStr = str(encodedBytes, "utf-8")

    data = {
        "url": source_url,
        "spiderUuid": str(spider_uuid),
        "platform": platform,
        "htmlString": encodedStr
    }

    headers = {
        "content-type": "application/json"
    }

    url = "http://%s:%s%s"%(
        api_config.target_host,
        api_config.target_port,
        api_config.target_uri
    )

    response = requests.post(url=url, data=json.dumps(data), headers= headers)
    hlog.info("发送结果完成,返回状态%s"%response.status_code)

    hlog.exit_func(func_name)
def crawl_url(url):
    """
    Crawl a URL by rendering it with the spider.js Node script.
    :param url: the URL to crawl
    :return: the full HTML string of the page, or "" on failure
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    # spider.js is expected to print a JSON object such as {"code": ..., "data": ...};
    # note that the URL is interpolated directly into the shell command.
    output = os.popen("node spider.js %s" % url).read()
    result = json.loads(output)
    hlog.info("Crawl finished, returned status %s" % result["code"])

    html = ""
    if result["code"] == "success":
        html = result["data"]
    else:
        hlog.debug("The crawl returned an error")

    hlog.exit_func(func_name)
    return html
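
# A hedged end-to-end sketch tying the helpers above together: fetch the start
# URL, render each page with crawl_url(), and post non-empty results with
# send_data(). _example_crawl_once is an illustrative driver, not part of the
# original project.
def _example_crawl_once():
    url, num = get_start_url()
    for _ in range(num):
        html = crawl_url(url)
        if html:
            send_data(url, html, get_platform(url))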