def get_start_url(timeout=10):
    """Fetch the crawl start URL from the configured source API.

    Sends a GET request to ``http://<source_host>:<source_port><source_uri>``
    (values read from ``api_config``) and inspects the JSON reply.

    Two reply shapes are accepted when ``code`` is ``"SUCCESS"``:

    * ``"result"`` is a plain URL string (what the service returns today);
      the number of pages then falls back to 3.
    * ``"result"`` is ``{"url": ..., "num": ...}`` (the planned shape);
      both values are taken from the reply.

    :param timeout: seconds to wait for the source API before giving up
    :return: tuple ``(url, num)``; ``("", 0)`` when the request fails, the
        body is not valid JSON, or the reply is not a success
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    # Page count used while the service only returns a bare URL string.
    default_num = 3

    headers = {"content-type": "application/json"}
    url = "http://%s:%s%s" % (
        api_config.source_host,
        api_config.source_port,
        api_config.source_uri,
    )
    hlog.var("url", url)

    payload = None
    try:
        # Without a timeout an unresponsive host would block forever.
        response = requests.get(url=url, headers=headers, timeout=timeout)
        if 200 == response.status_code:
            payload = json.loads(response.text)
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: fall through to the failure path.
        payload = None

    if (
        isinstance(payload, dict)
        and "SUCCESS" == payload.get("code")
        and "result" in payload
    ):
        result = payload["result"]
        if isinstance(result, dict):
            start_url = result.get("url", "")
            num = result.get("num", default_num)
        else:
            start_url = result
            num = default_num
        hlog.info("获取起始url成功")
        hlog.exit_func(func_name)
        return start_url, num

    hlog.debug("获取起始url失败,请检查网络")
    hlog.exit_func(func_name)
    return "", 0
def get_film_list():
    """Return every stored film as a list of plain dicts.

    Each dict carries the film's own columns plus four related lists
    (``people``, ``species``, ``locations``, ``vehicles``) obtained from
    ``utils.get_urls`` for that film's id.

    :return: list of film dicts, one per ``Film`` row
    """
    # Imported inside the function, exactly as the original does.
    from utils import get_urls

    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    films = []
    rows = session.query(Film).all()
    for row in rows:
        film_id = row.id
        hlog.var('film_id', film_id)

        # Same lookup order as before: locations, people, species, vehicles.
        locations = get_urls('Location', film_id)
        people = get_urls('People', film_id)
        species = get_urls('Specie', film_id)
        vehicles = get_urls('Vehicle', film_id)

        films.append({
            "id": row.id,
            "title": row.title,
            "description": row.description,
            "director": row.director,
            "producer": row.producer,
            "release_date": row.release_date,
            "rt_score": row.rt_score,
            "url": row.url,
            "people": people,
            "species": species,
            "locations": locations,
            "vehicles": vehicles,
        })

    hlog.info("读取电影信息成功。")
    hlog.exit_func(func_name)
    return films
def __init__(self, url, num):
    """Store the crawl settings on the instance and log them.

    :param url: the URL as received; kept unchanged in ``primeval_url``
    :param num: stored as ``max_page``
    """
    self.primeval_url = url
    # Keep only the part of the URL before the first "?".
    self.start_url = url.partition("?")[0]
    self.max_page = num
    self.platform = "boss直聘"

    logged = (
        ("SpiderZhipin.primeval_url", self.primeval_url),
        ("SpiderZhipin.start_url", self.start_url),
        ("SpiderZhipin.max_page", self.max_page),
        ("SpiderZhipin.platform", self.platform),
    )
    for label, value in logged:
        hlog.var(label, value)
def __init__(self):
    """Log the message-queue settings and create the Kafka consumer.

    The consumer subscribes to ``mq_config.topic`` in group
    ``mq_config.group`` and connects to ``mq_config.host:mq_config.port``.
    """
    hlog.info("初始化消息队列连接")

    settings = (
        ('mq_host', mq_config.host),
        ('mq_port', mq_config.port),
        ('mq_group', mq_config.group),
        ('mq_topic', mq_config.topic),
    )
    for label, value in settings:
        hlog.var(label, value)

    bootstrap_server = '{}:{}'.format(mq_config.host, mq_config.port)
    self.consumer = KafkaConsumer(
        mq_config.topic,
        group_id=mq_config.group,
        bootstrap_servers=[bootstrap_server],
    )
def get_platfrom(url):
    """Extract the main domain label from a URL.

    The host is taken as the third ``/``-separated piece of ``url`` and the
    result is the second ``.``-separated label of that host, e.g.
    ``"https://www.zhipin.com/x"`` yields ``"zhipin"``.

    :param url: the URL to inspect
    :return: the extracted label, or ``""`` when ``url`` does not have the
        expected shape (previously this path raised ``UnboundLocalError``
        after logging success)
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)
    hlog.var("url", url)

    try:
        domain = url.split("/")[2]
        platfrom = domain.split(".")[1]
    except (AttributeError, IndexError, TypeError):
        # url is not a string, or has too few "/" or "." separated parts.
        hlog.debug("获取网站域名主体失败")
        hlog.exit_func(func_name)
        return ""

    hlog.var("platfrom", platfrom)
    hlog.info("获取网站域名主体成功")
    hlog.exit_func(func_name)
    return platfrom
def send_data(source_url, htmlString, platform, timeout=10):
    """POST a crawled page to the configured target API.

    The HTML is UTF-8 encoded, base64 encoded, and sent as JSON together
    with the source URL, the platform and a freshly generated UUID to
    ``http://<target_host>:<target_port><target_uri>`` (from ``api_config``).

    :param source_url: the URL that was crawled
    :param htmlString: the crawled page as a string; nothing is sent when it
        is empty or ``None``
    :param platform: platform the page belongs to
    :param timeout: seconds to wait for the target API before giving up
    :return: None
    :raises requests.RequestException: when the request fails or times out
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)
    hlog.var("source_url", source_url)

    # Covers None as well as "", which would otherwise crash on .encode().
    if not htmlString:
        hlog.exit_func(func_name)
        return

    spider_uuid = uuid.uuid1()
    hlog.var("spider_uuid", spider_uuid)

    encoded_html = base64.b64encode(htmlString.encode("utf-8")).decode("utf-8")
    data = {
        "url": source_url,
        "spiderUuid": str(spider_uuid),
        "platform": platform,
        "htmlString": encoded_html,
    }
    headers = {"content-type": "application/json"}
    url = "http://%s:%s%s" % (
        api_config.target_host,
        api_config.target_port,
        api_config.target_uri,
    )

    try:
        # Without a timeout an unresponsive host would block forever.
        response = requests.post(
            url=url,
            data=json.dumps(data),
            headers=headers,
            timeout=timeout,
        )
    except requests.RequestException:
        # Keep enter/exit logging balanced, then let the caller decide.
        hlog.debug("发送结果失败,请检查网络")
        hlog.exit_func(func_name)
        raise

    hlog.info("发送结果完成,返回状态%s" % response.status_code)
    hlog.exit_func(func_name)
def add_film():
    """Create a film from the submitted form fields.

    Reads ``id``, ``title``, ``description``, ``director``, ``producer``,
    ``release_date``, ``rt_score`` and ``url`` from ``request.form``.

    :return: ``{"status": "1", ...}`` when a film with that id already
        exists, otherwise ``{"status": "0", ...}`` after the insert
    :raises Exception: whatever the commit raised, after the session has
        been rolled back
    """
    _id = request.form.get('id')
    _title = request.form.get('title')
    _description = request.form.get('description')
    _director = request.form.get('director')
    _producer = request.form.get('producer')
    _release_date = request.form.get('release_date')
    _rt_score = request.form.get('rt_score')
    _url = request.form.get('url')

    hlog.var('_id', _id)
    hlog.var('_title', _title)
    hlog.var('_description', _description)
    hlog.var('_director', _director)
    hlog.var('_producer', _producer)
    hlog.var('_release_date', _release_date)
    hlog.var('_rt_score', _rt_score)
    hlog.var('_url', _url)

    # Only existence matters, so fetch at most one row.
    old_film = session.query(Film).filter(Film.id == _id).first()
    if old_film is not None:
        return {"status": "1", "message": "电影编号已经存在"}

    film = Film(
        id=_id,
        title=_title,
        description=_description,
        director=_director,
        producer=_producer,
        release_date=_release_date,
        rt_score=_rt_score,
        url=_url,
    )
    session.add(film)
    try:
        session.commit()
    except Exception:
        # A failed commit leaves the session unusable until it is rolled
        # back; roll back here so later uses of the session still work.
        session.rollback()
        raise

    return {"status": "0", "message": "新增成功"}