def get_and_store(self, url, depth):
    """Download ``url``, store the page, then follow the links it contains.

    A URL already present in ``self.url_set`` is skipped. Otherwise the URL
    is recorded, fetched with browser-like headers, and written to the
    database at ``self.dbfile`` when ``self.keyword`` is empty or occurs in
    the page. Finally ``self.get_hyperlink`` is called with ``depth - 1``.

    Any failure while fetching (network error, timeout, HTTP error status)
    is logged at critical level and the method returns without storing.

    :param url: URL to download
    :param depth: link depth of ``url``
    """
    # De-duplicate: never fetch the same URL twice.
    if url in self.url_set:
        log.debug("{} has been crawled".format(url))
        return
    self.url_set.add(url)
    log.debug("add {} to url_set".format(url))

    # Send browser-like headers with the request.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/57.0.2987.110 Safari/537.36",
    }

    try:
        log.warning("get {}".format(url))
        r = requests.get(url, headers=headers, timeout=10)
        # Use the detected encoding so that r.text is decoded correctly.
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        html = r.content  # raw bytes: this is what gets parsed and stored
        text = r.text     # decoded body: used for text keyword matching
    except Exception as e:
        log.critical("Failed to get {} depth:{} error:{}".format(
            url, depth, e), exc_info=True)
        return

    # Parse the HTML.
    soup = BeautifulSoup(html, 'lxml')

    # Search bytes with a bytes keyword and text with a text keyword;
    # mixing the two raises TypeError on Python 3.
    haystack = html if isinstance(self.keyword, bytes) else text

    # Open the database; the finally clause guarantees it is closed on
    # every path, including "keyword not found" and insert failures.
    db = DataStore(self.dbfile)
    try:
        if self.keyword == "":
            db.insert(url, str(None), html)
        elif self.keyword in haystack:
            db.insert(url, self.keyword, html)
        else:
            log.warning("Cannot find {} in {}".format(self.keyword, url))
    finally:
        db.close()

    # Collect the links on this page, one level shallower.
    self.get_hyperlink(url, soup, depth - 1)
from datetime import datetime

import config
from azure_helper import EventHubHelper
from cube_parser import CubeParser
from database import DataStore

logging.config.fileConfig('log.config')
logger = logging.getLogger(config.logger_name)


def myExceptionHook(exctype, value, traceback):
    """Log an uncaught exception's value, then run the default hook.

    The value is written to the module logger at error level and the
    three arguments are forwarded unchanged to ``sys.__excepthook__``.
    """
    logger.error(value)
    sys.__excepthook__(exctype, value, traceback)


if __name__ == '__main__':
    # Route uncaught exceptions through the logger as well.
    sys.excepthook = myExceptionHook
    print("Running at %s" % datetime.utcnow())

    # Fetch the readings and report the timestamp of the first entry.
    readings = CubeParser().get_data()
    first_reading = readings[0]
    print("Latest data: %s" % first_reading["time"])

    if not config.use_event_hub:
        # Database path: convert the readings to rows and send them.
        store = DataStore(config.db_server, config.db_name,
                          config.db_user, config.db_password)
        formatted_rows = store.parse_to_db_format(readings)
        store.send_request(formatted_rows)
    else:
        # Event Hub path: send the readings as one JSON document.
        payload = json.dumps(readings)
        EventHubHelper.send_request(
            config.servicebus_namespace,
            config.eventhub_name,
            config.eventhub_key_name,
            config.eventhub_private_key,
            payload,
        )
# Configure logging from the 'log.config' file and fetch the logger named in config.
logging.config.fileConfig('log.config')
logger = logging.getLogger(config.logger_name)


def myExceptionHook(exctype, value, traceback):
    """Log an uncaught exception's value at error level, then call the default hook."""
    logger.error(value)
    sys.__excepthook__(exctype, value, traceback)


if __name__ == '__main__':
    # Install the logging hook for uncaught exceptions.
    sys.excepthook = myExceptionHook
    print("Running IFTTT checker at %s" % datetime.utcnow())
    store = DataStore(config.db_server, config.db_name, config.db_user, config.db_password)
    rows = store.getSensorBatteryStatuses()
    # Hour of day in UTC (0-23), read once and reused for every row.
    current_hour = datetime.utcnow().hour
    for row in rows:
        # Each row is read positionally: id, battery, cable.
        # NOTE(review): presumably battery is a percentage and cable is 1
        # when a charger is connected -- confirm against DataStore.
        sensor_id = row[0]
        battery = row[1]
        cable = row[2]
        # Request charging only when battery is at most 15, cable is 0 and
        # the UTC hour is 20 or later.
        if battery <= 15 and cable == 0 and current_hour > 19:
            logger.debug("Request charging %s (%s : %s)" % (sensor_id, battery, cable))
            # The event name is the sensor id followed by config.ifttt_event_on.
            IFTTT.sendEvent(config.ifttt_api_key, sensor_id + config.ifttt_event_on)
        # Stop charging when nearing 100
        if cable == 1 and battery > 96:
logging.config.fileConfig('log.config')
logger = logging.getLogger(config.logger_name)


def myExceptionHook(exctype, value, traceback):
    """Log an uncaught exception's value, then run the default hook.

    The value is written to the module logger at error level and the
    three arguments are forwarded unchanged to ``sys.__excepthook__``.
    """
    logger.error(value)
    sys.__excepthook__(exctype, value, traceback)


if __name__ == '__main__':
    # Route uncaught exceptions through the logger as well.
    sys.excepthook = myExceptionHook
    print("Running IFTTT checker at %s" % datetime.utcnow())

    data_store = DataStore(
        config.db_server,
        config.db_name,
        config.db_user,
        config.db_password,
    )
    statuses = data_store.getSensorBatteryStatuses()

    # Hour of day in UTC, sampled once after the statuses are loaded.
    hour_utc = datetime.utcnow().hour

    for status in statuses:
        sensor_id, battery, cable = status[0], status[1], status[2]

        # Skip unless battery is at most 15, cable is 0 and the UTC hour
        # is 20 or later (same checks, same evaluation order).
        if not (battery <= 15 and cable == 0 and hour_utc > 19):
            continue

        message = "Request charging %s (%s : %s)" % (sensor_id, battery, cable)
        logger.debug(message)
        event_name = sensor_id + config.ifttt_event_on
        IFTTT.sendEvent(config.ifttt_api_key, event_name)