Example #1
0
def get_start_url():
    """
    Fetch the crawl starting point from the configured source API.

    The endpoint is built from ``api_config.source_host``,
    ``api_config.source_port`` and ``api_config.source_uri``.

    Two shapes of a successful payload are accepted:

    * ``{"code": "SUCCESS", "result": {"url": "...", "num": 3}}`` -- the
      intended format, where ``num`` is the number of pages to crawl;
    * ``{"code": "SUCCESS", "result": "..."}`` -- the format the API returns
      today; the page count then falls back to 3.

    :return: a ``(url, num)`` tuple; ``("", 0)`` when the request fails, the
        status code is not 200, the body is not valid JSON, or the payload
        does not describe a successful result
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    # Page count used while the API only returns the bare URL.
    default_num = 3

    headers = {
        "content-type": "application/json"
    }

    url = "http://%s:%s%s" % (
        api_config.source_host,
        api_config.source_port,
        api_config.source_uri
    )
    hlog.var("url", url)

    response_json = None
    try:
        # Bound the wait so an unresponsive server cannot hang the caller.
        response = requests.get(url=url, headers=headers, timeout=30)
        if 200 == response.status_code:
            response_json = json.loads(response.text)
    except (requests.RequestException, ValueError):
        # Network failure or a non-JSON body: use the failure path below
        # instead of propagating the exception.
        response_json = None

    is_success = (
        isinstance(response_json, dict)
        and "SUCCESS" == response_json.get("code")
    )
    if is_success:
        result = response_json.get("result")
        if isinstance(result, dict):
            start_url = result.get("url")
            num = result.get("num", default_num)
        else:
            start_url, num = result, default_num

        # An empty or missing URL is treated as a failure.
        if start_url:
            hlog.info("获取起始url成功")
            hlog.exit_func(func_name)
            return start_url, num

    hlog.debug("获取起始url失败,请检查网络")
    hlog.exit_func(func_name)
    return "", 0
Example #2
0
def get_film_list():
    """
    Build a plain-dict description of every film stored in the database.

    For each ``Film`` row the related entries are looked up through
    ``utils.get_urls`` (called with the entity name and the film id) and
    attached under the ``locations``, ``people``, ``species`` and
    ``vehicles`` keys.

    :return: list of dicts, one per film, in the order the query returns them
    """

    # Imported locally, as in the original (presumably to avoid an import
    # cycle with ``utils`` -- confirm before moving it to module level).
    from utils import get_urls

    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    films = []
    for record in session.query(Film).all():
        film_id = record.id
        hlog.var('film_id', film_id)

        # Related entities are fetched in a fixed order.
        locations = get_urls('Location', film_id)
        people = get_urls('People', film_id)
        species = get_urls('Specie', film_id)
        vehicles = get_urls('Vehicle', film_id)

        films.append({
            "id": record.id,
            "title": record.title,
            "description": record.description,
            "director": record.director,
            "producer": record.producer,
            "release_date": record.release_date,
            "rt_score": record.rt_score,
            "url": record.url,
            "people": people,
            "species": species,
            "locations": locations,
            "vehicles": vehicles,
        })

    hlog.info("读取电影信息成功。")
    hlog.exit_func(func_name)

    return films
Example #3
0
    def __init__(self, url, num):
        """
        Record the crawl target and page limit, then log the settings.

        :param url: URL to start from; kept unchanged as ``primeval_url``,
            while ``start_url`` holds the part before the first "?"
        :param num: stored as ``max_page``
        """
        self.primeval_url = url
        # Everything from the first "?" onwards is dropped.
        self.start_url = url.partition("?")[0]
        self.max_page = num
        self.platform = "boss直聘"

        logged = (
            ("SpiderZhipin.primeval_url", "primeval_url"),
            ("SpiderZhipin.start_url", "start_url"),
            ("SpiderZhipin.max_page", "max_page"),
            ("SpiderZhipin.platform", "platform"),
        )
        for label, attr in logged:
            hlog.var(label, getattr(self, attr))
Example #4
0
    def __init__(self):
        """
        Log the message-queue settings and create the Kafka consumer.

        The consumer subscribes to ``mq_config.topic`` in consumer group
        ``mq_config.group`` and connects to ``mq_config.host:mq_config.port``.
        """
        hlog.info("初始化消息队列连接")

        for label, attr in (
            ('mq_host', 'host'),
            ('mq_port', 'port'),
            ('mq_group', 'group'),
            ('mq_topic', 'topic'),
        ):
            hlog.var(label, getattr(mq_config, attr))

        broker = '{}:{}'.format(mq_config.host, mq_config.port)
        self.consumer = KafkaConsumer(
            mq_config.topic,
            group_id=mq_config.group,
            bootstrap_servers=[broker],
        )
Example #5
0
def get_platfrom(url):
    """
    Extract the main label of a site's domain from a URL.

    The host is taken as the third "/"-separated piece of ``url`` and the
    result is the second "."-separated label of that host, e.g. "zhipin"
    for "https://www.zhipin.com/...".

    :param url: absolute URL including the scheme
    :return: the domain label, or "" when it cannot be extracted (``url``
        is not a string, has no host part, or the host contains no ".")
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)
    hlog.var("url", url)

    # Defined up front so the failure path returns a value instead of
    # raising UnboundLocalError at the final return.
    platfrom = ""
    try:
        domain = url.split("/")[2]
        platfrom = domain.split(".")[1]
    except (AttributeError, IndexError, TypeError):
        hlog.debug("获取网站域名主体失败")
    else:
        # Success is logged only when extraction actually succeeded.
        hlog.var("platfrom", platfrom)
        hlog.info("获取网站域名主体成功")

    hlog.exit_func(func_name)
    return platfrom
Example #6
0
def send_data(source_url, htmlString, platform):
    """
    Post one crawled page to the configured target API.

    The HTML is UTF-8 encoded, base64-wrapped and sent as JSON together
    with the source URL, the platform and a freshly generated UUID. Nothing
    is sent when ``htmlString`` is the empty string.

    :param source_url: URL the page was crawled from
    :param htmlString: crawled page content as a string
    :param platform: platform the page belongs to
    :return: None
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    hlog.var("source_url", source_url)

    if htmlString == "":
        # Nothing was crawled, so there is nothing to report.
        hlog.exit_func(func_name)
        return

    spider_uuid = uuid.uuid1()
    hlog.var("spider_uuid", spider_uuid)

    html_b64 = base64.b64encode(htmlString.encode("utf-8")).decode("utf-8")

    payload = {
        "url": source_url,
        "spiderUuid": str(spider_uuid),
        "platform": platform,
        "htmlString": html_b64,
    }
    request_headers = {"content-type": "application/json"}
    endpoint = "http://%s:%s%s" % (
        api_config.target_host,
        api_config.target_port,
        api_config.target_uri,
    )

    response = requests.post(
        url=endpoint,
        data=json.dumps(payload),
        headers=request_headers,
    )
    hlog.info("发送结果完成,返回状态%s" % (response.status_code,))

    hlog.exit_func(func_name)
Example #7
0
def add_film():
    """
    Create a new film record from the submitted form fields.

    Reads ``id``, ``title``, ``description``, ``director``, ``producer``,
    ``release_date``, ``rt_score`` and ``url`` from ``request.form`` and
    logs each value. A field that is absent from the form is read as
    ``None``.

    :return: ``{"status": "1", ...}`` when a film with the same id already
        exists, ``{"status": "0", ...}`` after the new film is committed
    :raises Exception: whatever the database layer raises while saving; the
        session is rolled back before the exception is re-raised
    """
    field_names = (
        'id',
        'title',
        'description',
        'director',
        'producer',
        'release_date',
        'rt_score',
        'url',
    )
    fields = {name: request.form.get(name) for name in field_names}

    for name in field_names:
        hlog.var('_' + name, fields[name])

    # Only existence matters, so fetch at most one row.
    existing = session.query(Film).filter(Film.id == fields['id']).first()
    if existing is not None:
        return {"status": "1", "message": "电影编号已经存在"}

    try:
        session.add(Film(**fields))
        session.commit()
    except Exception:
        # A failed flush/commit leaves the session unusable until it is
        # rolled back; ``session`` is a module-level object here, so without
        # this every later call would fail as well.
        session.rollback()
        raise

    return {"status": "0", "message": "新增成功"}