def get_consumer():
    """Return the shared Consumer instance, creating it on first use.

    Uses ``Consumer.last_obj`` as a class-level singleton slot; when the
    connection attempt fails the slot is reset to None so a later call
    can retry.

    :return: the shared Consumer, or None when the connection failed.
    """
    if not Consumer.last_obj:
        try:
            Consumer.last_obj = Consumer()
        except Exception:
            # Was `except BaseException`, which also swallowed
            # KeyboardInterrupt/SystemExit; Exception is the widest
            # class that should be caught here.
            hlog.debug("消息队列初始化连接异常")
            Consumer.last_obj = None
        else:
            hlog.info("消息队列连接完成")
    return Consumer.last_obj
def __init__(self):
    """Open the Kafka consumer connection described by ``mq_config``."""
    hlog.info("初始化消息队列连接")
    # Trace every connection parameter before connecting.
    for label, value in (('mq_host', mq_config.host),
                         ('mq_port', mq_config.port),
                         ('mq_group', mq_config.group),
                         ('mq_topic', mq_config.topic)):
        hlog.var(label, value)
    bootstrap = '{}:{}'.format(mq_config.host, mq_config.port)
    self.consumer = KafkaConsumer(mq_config.topic,
                                  group_id=mq_config.group,
                                  bootstrap_servers=[bootstrap])
def read_file(filename):
    """Read a file's raw bytes.

    :param filename: path of the file to read.
    :return: file content as ``bytes``, or None when the path does not
             exist or is not a regular file.
    """
    hlog.info('read file %s' % filename)
    # Use a distinct name for the Path: the original shadowed `file`
    # (first the Path, then the open file object).
    path = Path(filename)
    if not (path.exists() and path.is_file()):
        # (Removed a leftover duplicate `exists() and is_file()`
        # expression statement that had no effect.)
        hlog.error('文件 %s 不存在' % filename)
        return None
    # read_bytes() opens and closes the file itself — same result as
    # open(mode='rb') + read().
    return path.read_bytes()
def get_start_url():
    """Fetch the crawl start URL from the configured source API.

    :return: (url, num) — the start URL and the number of pages to
             crawl; ("", 0) on any failure (non-200 status or a
             non-"SUCCESS" response code).
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)
    headers = {
        "content-type": "application/json"
    }
    # Source endpoint is assembled from api_config (host/port/uri).
    url = "http://%s:%s%s"%(
        api_config.source_host,
        api_config.source_port,
        api_config.source_uri
    )
    hlog.var("url", url)
    response = requests.get(url=url, headers= headers)
    if 200 == response.status_code:
        response_string = response.text
        response_json = json.loads(response_string)
        if "SUCCESS" == response_json["code"]:
            hlog.info("获取起始url成功")
            hlog.exit_func(func_name)
            # NOTE(review): per the inline note below, the API currently
            # returns the URL string directly in "result" instead of a
            # {"url": ..., "num": ...} object, so the crawl count is
            # hard-coded to 3 here as a workaround — confirm whether the
            # API has been fixed before changing this.
            """ 返回结果需要返回 { "code": "SUCCESS", "message": "成功", "result": { "url": "https://www.zhipin.com/c101270100-p100199/", "num": 3 } } 现在返回的是 { "code": "SUCCESS", "message": "成功", "result": "https://www.zhipin.com/c101270100-p100199/" } """
            return response_json["result"], 3
    # Failure path: network error status or unexpected payload.
    hlog.debug("获取起始url失败,请检查网络")
    hlog.exit_func(func_name)
    return "", 0
def get_film_list():
    """Return every Film row as a dict, including related URL lists."""
    from utils import get_urls
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    def to_record(obj):
        # Build one serializable dict per film; related entities come
        # back as lists of URLs from the shared get_urls helper.
        film_id = obj.id
        hlog.var('film_id', film_id)
        location_list = get_urls('Location', film_id)
        people_list = get_urls('People', film_id)
        specie_list = get_urls('Specie', film_id)
        vehicle_list = get_urls('Vehicle', film_id)
        return {
            "id": obj.id,
            "title": obj.title,
            "description": obj.description,
            "director": obj.director,
            "producer": obj.producer,
            "release_date": obj.release_date,
            "rt_score": obj.rt_score,
            "url": obj.url,
            "people": people_list,
            "species": specie_list,
            "locations": location_list,
            "vehicles": vehicle_list
        }

    film_list = [to_record(obj) for obj in session.query(Film).all()]
    hlog.info("读取电影信息成功。")
    hlog.exit_func(func_name)
    return film_list
def get_platfrom(url):
    """Extract the main domain token ("platform") from a URL.

    E.g. "https://www.zhipin.com/..." -> "zhipin".

    :param url: full URL including the scheme ("http://host/path").
    :return: the second dot-separated component of the host, or ""
             when the URL cannot be parsed.
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)
    hlog.var("url", url)
    # Initialize the result: the original left `platfrom` unbound on a
    # parse failure, turning the final `return` into a NameError.
    platfrom = ""
    try:
        domain = url.split("/")[2]
        platfrom = domain.split(".")[1]
        hlog.var("platfrom", platfrom)
    except Exception:
        # Narrowed from a bare `except:` (which also caught
        # KeyboardInterrupt/SystemExit).
        hlog.debug("获取网站域名主体失败")
    else:
        # Only log success when parsing actually succeeded; the
        # original logged it unconditionally.
        hlog.info("获取网站域名主体成功")
    hlog.exit_func(func_name)
    return platfrom
def send_data(source_url, htmlString, platform):
    """POST a crawled page (base64-encoded) to the target API.

    :param source_url: the URL that was crawled.
    :param htmlString: raw HTML of the page; nothing is sent when empty.
    :param platform: identifier of the site the page came from.
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)
    hlog.var("source_url", source_url)
    # Nothing crawled -> nothing to send.
    if htmlString == "":
        hlog.exit_func(func_name)
        return
    spider_uuid = uuid.uuid1()
    hlog.var("spider_uuid", spider_uuid)
    # Ship the HTML as base64 text so it survives JSON transport.
    encoded_html = base64.b64encode(htmlString.encode("utf-8")).decode("utf-8")
    payload = {
        "url": source_url,
        "spiderUuid": str(spider_uuid),
        "platform": platform,
        "htmlString": encoded_html
    }
    target = "http://{}:{}{}".format(api_config.target_host,
                                     api_config.target_port,
                                     api_config.target_uri)
    response = requests.post(url=target,
                             data=json.dumps(payload),
                             headers={"content-type": "application/json"})
    hlog.info("发送结果完成,返回状态%s"%response.status_code)
    hlog.exit_func(func_name)
def crawl_url(url):
    """Run the Node spider against one URL and return its HTML.

    :param url: the URL to crawl.
    :return: the page's HTML string, or "" when the spider reports
             failure.
    """
    # Local import: only this function needs subprocess.
    import subprocess
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)
    # Pass the URL as an argv element (shell=False) instead of splicing
    # it into a shell command line with os.popen — a crawled/configured
    # URL containing shell metacharacters could otherwise inject
    # arbitrary commands.
    proc = subprocess.run(["node", "spider.js", url],
                          capture_output=True, text=True)
    jsonString = json.loads(proc.stdout)
    hlog.info("爬取完成,返回状态%s" % jsonString["code"])
    html = ""
    if "success" == jsonString["code"]:
        html = jsonString["data"]
    else:
        hlog.debug("爬虫爬取有误")
    hlog.exit_func(func_name)
    return html
def send(title, text, att_map=None, receivers=None, name=None):
    """Send a plain-text mail with optional attachments via SMTP.

    :param title: mail subject.
    :param text: plain-text body.
    :param att_map: optional {filename: file_content} attachments; the
                    original crashed with TypeError when this was None.
    :param receivers: optional list of recipient addresses
                      (defaults to config.receivers).
    :param name: optional display name for the From header
                 (defaults to config.sender).
    """
    name = name if name else config.sender
    message = MIMEMultipart()
    message['Subject'] = Header(title, 'utf-8')
    message['From'] = Header(name, 'utf-8')
    message.attach(MIMEText(text, 'plain', 'utf-8'))

    # Attach files ONCE, before the receiver loop: the original attached
    # them inside the loop, duplicating every attachment for each
    # additional receiver.
    if att_map:
        for filename, file in att_map.items():
            att = MIMEText(file, 'base64', 'utf-8')
            att["Content-Type"] = 'application/octet-stream'
            att.add_header('Content-Disposition', 'attachment',
                           filename=('gbk', '', filename))
            message.attach(att)

    smtp_obj = smtplib.SMTP()
    try:
        hlog.info('connect to %s:%s' % (config.mail_host, config.mail_port1))
        smtp_obj.connect(config.mail_host, config.mail_port1)
    except smtplib.SMTPServerDisconnected as e:
        # Primary port refused/dropped — retry on the fallback port.
        hlog.info(e)
        hlog.info('try another port')
        hlog.info('connect to %s:%s' % (config.mail_host, config.mail_port2))
        smtp_obj.connect(config.mail_host, config.mail_port2)
    hlog.info('login with %s:%s' % (config.mail_user, config.mail_pass))
    smtp_obj.login(config.mail_user, config.mail_pass)

    receivers = receivers if receivers else config.receivers
    try:
        for receiver in receivers:
            # Replace — not append — the To header each iteration;
            # `message['To'] = ...` adds a duplicate header every time.
            # __delitem__ is a no-op when the header is absent.
            del message['To']
            message['To'] = Header(receiver, 'utf-8')
            try:
                smtp_obj.sendmail(
                    config.sender,
                    receiver,
                    message.as_string())
                hlog.info('send success')
            except smtplib.SMTPException as e:
                hlog.error(e)
    finally:
        # Always close the connection, even if sending raises.
        smtp_obj.quit()