def check(self):
    """Check our monitoring target"""
    result = False
    result_log = "{url}: {result}"
    try:
        start = time()
        contents = self._fetch_contents()
        end = time()
        elapsed = "{0:.3f}".format(end - start)  # seconds
        if self._check_contents(contents, self.settings):
            log.info(result_log.format(
                url=self.url,
                result="GOOD {0} s response time".format(elapsed)
            ))
            result = elapsed
        else:
            log.warning(result_log.format(
                url=self.url,
                result="BAD, Content mismatch, {0} s response time".format(
                    elapsed
                )
            ))
    except IOError as e:
        log.error(result_log.format(
            url=self.url,
            result="BAD, Connection error: {0}".format(e)
        ))
    return result
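# A minimal usage sketch (hedged): Monitor is constructed with a URL and a
# settings dict (as in _start_monitoring below); check() returns the
# formatted elapsed time on success and False on a content mismatch or
# connection error, so callers can treat the result as truthy/falsy. The
# settings contents here are hypothetical.
monitor = Monitor("http://example.com", {"poll_seconds": 60})
elapsed = monitor.check()
if elapsed:
    print("response time: {0} s".format(elapsed))
else:
    print("check failed")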
def start(self, args):
    """Start up all of the application components"""
    enable_log()

    # Make sure the results have some data in them before the web requests
    # come in
    self.results = App._init_results(self.config)

    if args.poll_seconds:
        self.config.override_monitor_setting(
            'poll_seconds', args.poll_seconds
        )

    if self.config.log_file:
        set_log_file(self.config.log_file)
        log.debug("Opening log file {}".format(self.config.log_file))

    self._start_monitoring()

    if self.config.http_port:
        log.info("Starting web frontend on port {}".format(
            self.config.http_port
        ))
        self._start_frontend()
def consume_item(self, event):
    log.info('uploading to Dropbox: %s -> %s' % (event.source_absolute,
                                                 event.target_absolute))
    self.send_progress(event.source_absolute, 0.0)
    try:
        self._upload(event, event.target_absolute)
    except IOError as e:
        # file was deleted immediately
        log.warning('upload failed: %s' % e)
    self.send_progress(event.source_absolute, 1.0)
def consume_item(self, event):
    log.info('uploading to GoogleDrive: %s -> %s' % (event.source_absolute,
                                                     event.target_absolute))
    self.send_progress(event.source_absolute, 0.0)
    # TODO handle dir/file removal
    try:
        if event.isdir:
            self._path_to_ids(event.target_absolute, create_missing=True)
        else:
            self._put_file(event.source_absolute, event.target_absolute)
    except IOError as e:
        # file was deleted immediately?
        log.warning('upload failed: %s' % e)
    finally:
        self.send_progress(event.source_absolute, 1.0)
def _start_monitoring(self):
    """Start monitoring the URLs"""
    for url in self.config.monitors:
        log.info("Monitoring {} every {} seconds".format(
            url, self.config.monitors[url]["poll_seconds"]
        ))
        self.monitors.append({
            "monitor": Monitor(url, self.config.monitors[url]),
            "nextRun": time()
        })

    def _loop():
        """Monitoring loop"""
        while True:
            now = time()
            for item in self.monitors:
                # Is it time to run this monitor yet?
                if item["nextRun"] < now:
                    # Run it
                    monitor = item["monitor"]
                    url = monitor.get_url()
                    result = monitor.check()

                    # Update results
                    self.results[url] = result

                    # Update next run time
                    settings = monitor.get_settings()
                    item["nextRun"] = time() + float(
                        settings["poll_seconds"])

            # Wait a while to not eat all CPU
            sleep(1)

    self.thread = Thread(target=_loop)
    # For some reason this is needed for the main process to catch
    # CTRL+C properly
    self.thread.daemon = True
    self.thread.start()
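# A self-contained sketch (hedged) of the same "nextRun" scheduling idea used
# by _loop() above, outside the App class. The jobs list and tick() names are
# hypothetical, not part of the original code.
from time import time, sleep

jobs = [{"name": "a", "interval": 2.0, "nextRun": time()},
        {"name": "b", "interval": 5.0, "nextRun": time()}]

def tick():
    now = time()
    for job in jobs:
        if job["nextRun"] < now:
            print("running", job["name"])
            job["nextRun"] = time() + job["interval"]

for _ in range(10):   # bounded here; the original loops forever
    tick()
    sleep(1)          # coarse 1 s resolution, same trade-off as above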
def speedy_parameter_load(pdicts, params):
    global child_pids
    assert len(pdicts) > 4
    signal.signal(signal.SIGINT, signal_handler)
    for i in xrange(4):
        sample = pdicts[i * len(pdicts) / 4: (i + 1) * len(pdicts) / 4]
        cpid = os.fork()
        if cpid:
            child_pids.append(cpid)
        else:
            log.info('Child Process Launched')
            engine = create_engine(conn_string)
            Session = sessionmaker()
            Session.configure(bind=engine)
            session = Session()
            linear_parameter_load(sample, params, session)
            session.close()
            sys.exit(0)
    for i, cpid in enumerate(child_pids):
        os.waitpid(cpid, 0)
        log.info('Child %s Finished' % i)
    child_pids = []
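# A self-contained sketch of the fork/waitpid fan-out used above, minus the
# SQLAlchemy specifics. Python 2 idiom (xrange, integer division) to match
# the original; the helper name parallel_map_slices is hypothetical.
import os
import sys

def parallel_map_slices(items, work, n_workers=4):
    """Fork n_workers children; each processes one slice of items."""
    pids = []
    for i in xrange(n_workers):
        chunk = items[i * len(items) / n_workers:
                      (i + 1) * len(items) / n_workers]
        pid = os.fork()
        if pid:
            pids.append(pid)   # parent: remember the child pid
        else:
            work(chunk)        # child: process its slice only
            sys.exit(0)        # child must not fall through to the parent loop
    for pid in pids:
        os.waitpid(pid, 0)     # block until every child has exited

# Note: as in speedy_parameter_load, each child should open its own database
# engine/session after the fork, since connections do not survive fork().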
def end_callback():
    log.info('\n********** wwa end **********')
    export_data.main()
def __parse_article_list(self, article_list, __biz, is_first_page=False):
    """
    @summary: Parse the article list
    ---------
    @param article_list: article list info (str)
    ---------
    @result: True / None (True: keep fetching; None: stop fetching)
    """
    # log.debug(tools.dumps_json(article_list))

    # Parse the article info out of the JSON content
    def parse_article_info(article_info, comm_msg_info):
        if not article_info:
            return

        # log.debug(tools.dumps_json(article_info))

        title = article_info.get("title")
        digest = article_info.get("digest")
        url = article_info.get("content_url").replace("\\", "").replace(
            "amp;", "")
        # link of the article being quoted
        source_url = article_info.get("source_url").replace("\\", "")
        cover = article_info.get("cover").replace("\\", "")
        subtype = article_info.get("subtype")
        is_multi = article_info.get("is_multi")
        author = article_info.get("author")
        copyright_stat = article_info.get("copyright_stat")
        duration = article_info.get("duration")
        del_flag = article_info.get("del_flag")
        type = comm_msg_info.get("type")
        publish_time = tools.timestamp_to_date(
            comm_msg_info.get("datetime"))
        sn = tools.get_param(url, "sn")

        if sn:
            # Cache the article info
            article_data = {
                "title": title,
                "digest": digest,
                "url": url,
                "source_url": source_url,
                "cover": cover,
                "subtype": subtype,
                "is_multi": is_multi,
                "author": author,
                "copyright_stat": copyright_stat,
                "duration": duration,
                "del_flag": del_flag,
                "type": type,
                "publish_time": publish_time,
                "sn": sn,
                "__biz": __biz,
                "spider_time": tools.get_current_date(),
            }
            return article_data

    # log.debug(tools.dumps_json(article_list))

    article_list = tools.get_json(article_list)
    article_list_data = []

    publish_time = None
    is_need_get_more = True

    article_list = article_list.get("list", [])
    is_first_article = True
    for article in article_list:
        comm_msg_info = article.get("comm_msg_info", {})

        publish_timestamp = comm_msg_info.get("datetime")
        publish_time = tools.timestamp_to_date(publish_timestamp)

        # Record the latest publish time
        if is_first_page and is_first_article:
            self._task_manager.record_new_last_article_publish_time(
                __biz, publish_time)
            is_first_article = False

            # On the first page, check whether the newest article is recent;
            # if nothing has been published recently, it is a zombie account
            if publish_timestamp and self._task_manager.is_zombie_account(
                    publish_timestamp):
                log.info("Account {} is a zombie account; no longer monitored".format(__biz))
                self._task_manager.sign_account_is_zombie(
                    __biz, publish_time)
                is_need_get_more = False
                break

        # Compare times: if we have reached the publish time recorded by the
        # last crawl, stop
        is_reach = self._task_manager.is_reach_last_article_publish_time(
            __biz, publish_time)
        if is_reach:
            log.info("Reached the last recorded publish time; account {} is done".format(__biz))
            new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                __biz)
            self._task_manager.update_account_last_publish_time(
                __biz, new_last_publish_time)
            is_need_get_more = False
            break
        elif is_reach is None:
            log.info(
                "Account {} was clicked by hand at spider startup; skipping its "
                "history and moving on to the accounts in the monitor pool".format(__biz))
            return

        article_type = comm_msg_info.get("type")
        if article_type != 49:
            # 49 is the common image-and-text message; other types (text,
            # audio, video) have inconsistent formats and are not collected
            continue

        # Check whether the publish time is inside the crawl time range
        publish_time_status = self._task_manager.is_in_crawl_time_range(
            publish_time)
        if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
            log.info("Account {} is past the crawl time range; done".format(__biz))
            new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                __biz)
            self._task_manager.update_account_last_publish_time(
                __biz, new_last_publish_time)
            is_need_get_more = False
            break
        elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
            log.info("Account {}: publish time {} has not reached the crawl "
                     "time range; skipped".format(__biz, publish_time))
            continue

        # Inside the time range.
        # An official account can publish several image-and-text messages at
        # once. The first image-and-text message:
        app_msg_ext_info = article.get("app_msg_ext_info", {})
        article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
        if article_data:
            article_list_data.append(article_data)

        # Image-and-text messages attached the same day
        multi_app_msg_item_list = app_msg_ext_info.get(
            "multi_app_msg_item_list", [])
        for multi_app_msg_item in multi_app_msg_item_list:
            article_data = parse_article_info(multi_app_msg_item,
                                              comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

    if article_list_data:
        data_pipeline.save_article_list(article_list_data)

    if is_need_get_more:
        return publish_time
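# The helper tools.get_param above is assumed to pull one query parameter out
# of a URL. A sketch of a compatible helper with the standard library
# (hypothetical code, not the project's own):
try:
    from urllib.parse import urlparse, parse_qs  # Python 3
except ImportError:
    from urlparse import urlparse, parse_qs      # Python 2

def get_param(url, key):
    """Return the first value of ?key=... in url, or None."""
    values = parse_qs(urlparse(url).query).get(key)
    return values[0] if values else None

# get_param("https://mp.weixin.qq.com/s?sn=abc123", "sn") -> "abc123"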
def end_callback():
    log.info('\n********** template end **********')
def end_callback():
    log.info('\n********** spider_main end **********')
    task_status.is_doing = False
def end_callback():
    export_data.main()
    log.info('\n********** VA_APP end **********')
def get_database_data(self, host):
    log.info("Fetching interface info from the database")
    session = self.__db_engine.creat_session()
    # Fetch all interfaces under this server
    whichService = session.query(Server.id).filter(
        or_(Server.qaURL == host, Server.devURL == host),
        and_(Server.serverStatus == 1)).first()
    if whichService == [] or whichService is None:
        servername = ''
        for i in self.env.keys():
            if self.env.get(i) == host:
                servername = i
                break
        serverid = session.query(Server).filter(
            and_(Server.serverName == servername,
                 Server.serverStatus == 1)).first()
        if config.get('updateActionAPI') in ('DEV_API', 'DEV'):
            devurl = host
            qaurl = config.get('hosts').get('QA').get(servername)
        else:
            qaurl = host
            devurl = config.get('hosts').get('DEV').get(servername)
        if serverid == [] or serverid is None:
            addserver = Server(serverName=servername,
                               swaggerURI='/v2/api-docs',
                               qaURL=qaurl,
                               devURL=devurl,
                               serverStatus=1)
            session.add(addserver)
        else:
            log.info('The database already has a server with the same name: %s' % servername)
        session.commit()
        whichService = session.query(Server.id).filter(
            or_(Server.qaURL == host, Server.devURL == host)).first()
    whichService = whichService.id
    apis = session.query(Interface.id, Interface.apiPath,
                         Interface.apiDesc,
                         Interface.apiRequestMethod).filter(
        and_(Interface.apiServerId == whichService,
             Interface.apiStatus == 1)).all()
    api_result = {}
    # Fetch all parameter info for the interfaces
    for api in apis:
        api_id = api[0]
        api_path = api[1]
        api_desc = api[2]
        api_method = api[3]
        # Assemble the request/response parameter info to return
        api_result.update({
            api_path: {
                "desc": api_desc,
                "method": api_method,
                "inParameter": None,
                "outParameter": None
            }
        })
        # Assemble the request field info
        try:
            in_parameter = session.query(
                InputParameters.inParameter).filter(
                and_(InputParameters.apiId == api_id,
                     InputParameters.inParameterStatus == 1)).one()
            api_result.get(api_path).update(
                {"inParameter": json.loads(in_parameter[0])})
        except orm.exc.NoResultFound:
            api_result.get(api_path).update({"inParameter": None})
        # Assemble the response field info
        try:
            out_parameter = session.query(
                ExtractParameters.extParameter).filter(
                and_(ExtractParameters.apiId == api_id,
                     ExtractParameters.extParameterStatus == 1)).one()
            api_result.get(api_path).update(
                {"outParameter": json.loads(out_parameter[0])})
        except orm.exc.NoResultFound:
            api_result.get(api_path).update({"outParameter": None})
    self.__db_engine.close_session()
    return api_result
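# The block above implements a get-or-create for Server rows by hand. A
# generic SQLAlchemy sketch of the same pattern (the helper name is
# hypothetical, not part of the project):
def get_or_create(session, model, defaults=None, **filters):
    """Return (instance, created). Look up by **filters, insert if absent."""
    instance = session.query(model).filter_by(**filters).first()
    if instance:
        return instance, False
    params = dict(filters)
    params.update(defaults or {})
    instance = model(**params)
    session.add(instance)
    session.commit()
    return instance, True

# usage: server, created = get_or_create(session, Server,
#                                        defaults={'serverStatus': 1},
#                                        serverName=servername)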
def get_all_hot():
    '''
    @summary: Site-wide hot topics
    ---------
    @param :
    ---------
    @result:
    '''
    url = 'http://192.168.60.38:8001/hotspot_al/interface/getHotAnalysis?type=0'
    json = tools.get_json_by_requests(url, headers=HEADERS)
    datas = json['data']
    hot_count = 0

    # URL for fetching related news
    root_url = 'http://192.168.60.38:8001/hotspot_al/interface/getHotRelateInfo?ids=%s'

    for data in datas:
        sql = 'select sequence.nextval, SEQ_IOPM_ARTICLE.nextval from dual'
        result = oracledb.find(sql)[0]
        hot_id = result[0]
        article_id = result[1]

        def export_callback(execute_type, sql, data_json):
            if execute_type != ExportData.EXCEPTION:
                infoIds = data['infoIds']
                url = root_url % infoIds
                json = tools.get_json_by_requests(url, headers=HEADERS)
                articles = json['data']

                # Reference: full field mapping of an article record
                # "EMOTION": 'vint_3',
                # "ACCOUNT": null,
                # "WEIGHT": 0,
                # "TITLE": "str_title",
                # "URL": "str_url",
                # "MAY_INVALID": ,
                # "CLUES_IDS": "",
                # "WEBSITE_NAME": "str_site",
                # "KEYWORDS_COUNT": 1,
                # "HOST": "str_site",
                # "INFO_TYPE": 'int_type',
                # "COMMENT_COUNT": null,
                # "HOT_ID": "vint_%d" % hot_id,
                # "REVIEW_COUNT": null,
                # "UUID": "73ec16038e074530ff109e3cfad2594c",
                # "ID": 'vint_%d' % article_id,
                # "IS_VIP": null,
                # "IMAGE_URL": 'str_picture',
                # "KEYWORDS": "str_keywords",
                # "KEYWORD_CLUES_ID": "{"中央电视台":"88758"}",
                # "RELEASE_TIME": "date_pubtime",
                # "AUTHOR": "江门日报",
                # "CONTENT": "clob_content",
                # "RECORD_TIME": 'vdate_%s' % tools.get_current_date(),
                # "UP_COUNT": 'vint_null'
                key_map = {
                    'id': 'int_dataId',
                    'content': 'clob_content',
                    'url': 'str_url',
                    'website_name': 'str_site',
                    'image_url': 'str_picture',
                    'release_time': 'date_pubtime',
                    'keywords': 'str_keywords',
                    'emotion': 'str_emotion',
                    'host': 'str_site',
                    'title': 'str_title',
                    'info_type': 'int_type',
                    'hot_id': "vint_%d" % hot_id,
                    'record_time': 'vdate_%s' % tools.get_current_date()
                }

                export_data.export_to_oracle(
                    key_map=key_map,
                    aim_table='TAB_IOPM_ARTICLE_INFO',
                    unique_key='url',
                    datas=articles,
                    unique_key_mapping_source_key={'url': 'str_url'},
                    sync_to_es=True)

        # Export the nationwide hot-topic data
        key_map = {
            'id': 'vint_%d' % hot_id,
            'title': 'str_kw',
            'hot': 'int_hot',
            'hot_type': 'vint_0',
            'record_time': 'vdate_%s' % tools.get_current_date()
        }
        # print(data['kw'])
        hot_count += export_data.export_to_oracle(
            key_map=key_map,
            aim_table='TAB_IOPM_HOT_INFO',
            unique_key='title',
            datas=data,
            callback=export_callback,
            sync_to_es=True)

    log.info('''
    Exported %d site-wide hot topics in total
    ''' % hot_count)
def end_callback():
    # Update keyword status: done
    log.info('\n********** live_app end **********')
def begin_callback():
    log.info('\n********** live_app begin **********')
    db.delete('LiveApp_urls', {})
def tearDown(self):
    # Runs after each test
    self.end = time()
    log.info('Elapsed time in seconds: %s', "%.5f" % (self.end - self.start))
    return super().tearDown()
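# tearDown above assumes a matching setUp stamped self.start. A minimal
# sketch of the pair (hedged; the class name is hypothetical and the
# original setUp is not shown here):
from time import time
import unittest

class TimedTestCase(unittest.TestCase):
    def setUp(self):
        self.start = time()  # stamp taken before each test
        return super().setUp()

    def tearDown(self):
        print('Elapsed time in seconds: %.5f' % (time() - self.start))
        return super().tearDown()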
def tearDownClass(self):
    # Runs once after all tests
    log.info('Ending %s', self.__name__)
def setUpClass(self):
    # Runs once before all tests
    log.info('Setting up %s', self.__name__)
    self.maxDiff = None
    # self.sortTestMethodsUsing = None
    self.client = app.test_client()
def add_latest_action(self):
    for key, host in self.hosts.items():
        log.info(key)
        self.create_action(key, host)
def handle_sync_progress(self, syncer, file, progress):
    log.info("%s: %s %s" % (syncer.name, progress, file))
    self.progress_callback(syncer, file, progress)
def data_check(self):
    # Checker
    def api_info_checker(key: str, database_data):
        if key == self.api_key_map_method:
            tmp_str = "request method"
            tmp_method_new = self.new_api_method.get("cmp_result")
            tmp_method_del = self.del_api_method.get("cmp_result")
            tmp_method_update = self.update_api_method.get("cmp_result")
            old_name = "old_" + key
        elif key == self.api_key_map_inParameter:
            tmp_str = "request parameters"
            tmp_method_new = self.new_in_parameter.get("cmp_result")
            tmp_method_del = self.del_in_parameter.get("cmp_result")
            tmp_method_update = self.update_in_parameter.get("cmp_result")
            old_name = "old_" + key
        elif key == self.api_key_map_outParameter:
            tmp_str = "response parameters"
            tmp_method_new = self.new_ext_parameter.get("cmp_result")
            tmp_method_del = self.del_ext_parameter.get("cmp_result")
            tmp_method_update = self.update_ext_parameter.get("cmp_result")
            old_name = "old_" + key
        elif key == self.api_key_map_desc:
            tmp_str = "description"
            tmp_method_new = self.new_api_desc.get("cmp_result")
            tmp_method_del = self.del_api_desc.get("cmp_result")
            tmp_method_update = self.update_api_desc.get("cmp_result")
            old_name = "old_" + key
        else:
            raise KeyError("Invalid key passed in")

        if self.swagger_data.get(check_api).get(
                key) != self.database_data.get(check_api).get(key):
            # Field was deleted on the server
            if self.swagger_data.get(check_api).get(key) is None:
                log.info("Interface %s: %s was deleted" % (check_api, tmp_str))
                self.check_result.get("update_info").get(check_api).get(
                    "cmp_result").append(tmp_method_del)
                self.check_result.get("update_info").get(check_api).update(
                    {old_name: self.database_data.get(check_api).get(key)})
            # Field was added on the server
            elif self.database_data.get(check_api).get(key) is None:
                log.info("Interface %s: %s was added" % (check_api, tmp_str))
                self.check_result.get("update_info").get(check_api).get(
                    "cmp_result").append(tmp_method_new)
                self.check_result.get("update_info").get(check_api).update(
                    {old_name: self.database_data.get(check_api).get(key)})
            # Field was modified
            else:
                log.info("Interface %s: %s was updated" % (check_api, tmp_str))
                self.check_result.get("update_info").get(check_api).get(
                    "cmp_result").append(tmp_method_update)
                self.check_result.get("update_info").get(check_api).update(
                    {old_name: self.database_data.get(check_api).get(key)})

    log.info("Comparing database data against the server response")
    if self.database_data != self.swagger_data:
        # Walk everything with the server response as the source of truth,
        # deleting entries once they have been compared
        for check_api in list(self.swagger_data.keys()):
            # Check whether the interface exists in the database
            try:
                if check_api in self.database_data.keys():
                    log.debug("Checking interface %s for changes" % check_api)
                    if self.swagger_data.get(
                            check_api) != self.database_data.get(check_api):
                        # Interface changed; record the change in the result
                        self.check_result.get("update_info").update(
                            {check_api: self.swagger_data.get(check_api)})
                        self.check_result.get("update_info").get(
                            check_api).update({"cmp_result": []})
                        # Check the interface description
                        # api_info_checker(self.api_key_map_desc, self.database_data)
                        # Check the interface method
                        api_info_checker(self.api_key_map_method,
                                         self.database_data)
                        # Check the request parameters
                        api_info_checker(self.api_key_map_inParameter,
                                         self.database_data)
                        # Check the response parameters
                        api_info_checker(self.api_key_map_outParameter,
                                         self.database_data)
                    # After checking, drop the interface from both sides so
                    # the next pass does not see it again
                    del self.swagger_data[check_api]
                    del self.database_data[check_api]
                else:
                    # Interface missing from the database: treat as new data
                    log.debug("Found new interface %s" % check_api)
                    self.check_result.get("new_info").update(
                        {check_api: self.swagger_data.get(check_api)})
                    self.check_result.get("new_info").get(
                        check_api).update(self.new_api)
                    del self.swagger_data[check_api]
            except AttributeError:
                log.error("The database has no swagger address configured, "
                          "or the address is wrong")
                exit(0)
        # Whatever is left in database_data was deleted on the server
        if len(self.database_data) > 0:
            log.debug("Found deleted interfaces: %s" % ",".join(
                list(self.database_data.keys())))
            self.check_result.get("del_info").update(self.database_data)
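# data_check above classifies interfaces into new / updated / deleted by
# mutating the two dicts in place. A compact sketch of the same three-way
# classification on plain dicts (the helper name is hypothetical):
def diff_dicts(server, database):
    """Return (new, updated, deleted) entries between two {api: info} dicts."""
    new = {k: v for k, v in server.items() if k not in database}
    deleted = {k: v for k, v in database.items() if k not in server}
    updated = {k: v for k, v in server.items()
               if k in database and database[k] != v}
    return new, updated, deleted

# usage: new, updated, deleted = diff_dicts(swagger_data, database_data)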
def updata_database(self, keys, host):
    """
    Apply inserts/updates/deletes to the database using the verified data
    :param dict keys:
    :param str host:
    :return:
    """
    log.info('Updating interface info in the database')
    session = self.__db_engine.creat_session()
    serverid = session.query(Server.id).filter(
        or_(Server.qaURL == host, Server.devURL == host),
        and_(Server.serverStatus == 1)).first()
    for status in keys.keys():
        if status == 'new_info':
            new_info = keys.get(status)
            for api in new_info.keys():
                apiparameter = new_info.get(api)
                desc = str(apiparameter.get('desc'))
                method = str(apiparameter.get('method'))
                apiId = session.query(Interface.id).filter(
                    and_(Interface.apiPath == api,
                         Interface.apiStatus == 1,
                         Interface.apiServerId == serverid.id)).first()
                if apiId is None:
                    ifs = Interface(apiPath=api,
                                    apiServerId=serverid.id,
                                    apiDesc=desc,
                                    apiRequestMethod=method,
                                    apiStatus=1)
                    session.add(ifs)
                    session.commit()
                    apiId = session.query(Interface.id).filter(
                        and_(Interface.apiPath == api,
                             Interface.apiStatus == 1,
                             Interface.apiServerId == serverid.id)).first()
                inParameter = json.dumps(apiparameter.get('inParameter'))
                extParameter = json.dumps(apiparameter.get('outParameter'))
                ips = InputParameters(apiId=apiId.id,
                                      inParameter=inParameter,
                                      inParameterStatus=1)
                eps = ExtractParameters(apiId=apiId.id,
                                        extParameter=extParameter,
                                        extParameterStatus=1)
                session.add_all([ips, eps])
                session.commit()
        elif status == 'update_info':
            update_info = keys.get(status)
            for api in update_info.keys():
                apiparameter = update_info.get(api)
                apiId = session.query(Interface.id).filter(
                    and_(Interface.apiPath == api,
                         Interface.apiStatus == 1,
                         Interface.apiServerId == serverid.id)).first()
                inParameter = json.dumps(apiparameter.get('inParameter'))
                extParameter = json.dumps(apiparameter.get('outParameter'))
                ips = session.query(InputParameters).filter(
                    and_(InputParameters.apiId == apiId.id,
                         InputParameters.inParameterStatus == 1)).first()
                ips.inParameter = inParameter
                eps = session.query(ExtractParameters).filter(
                    and_(ExtractParameters.apiId == apiId.id,
                         ExtractParameters.extParameterStatus == 1)).first()
                eps.extParameter = extParameter
                session.commit()
        elif status == 'del_info':
            del_info = keys.get(status)
            for api in del_info.keys():
                apiId = session.query(Interface).filter(
                    and_(Interface.apiPath == api,
                         Interface.apiStatus == 1,
                         Interface.apiServerId == serverid.id)).first()
                ips = session.query(InputParameters).filter(
                    and_(InputParameters.apiId == apiId.id,
                         InputParameters.inParameterStatus == 1)).first()
                ips.inParameterStatus = 2
                eps = session.query(ExtractParameters).filter(
                    and_(ExtractParameters.apiId == apiId.id,
                         ExtractParameters.extParameterStatus == 1)).first()
                eps.extParameterStatus = 2
                apiId.apiStatus = 2
                session.commit()
    self.__db_engine.close_session()
# encoding=utf-8
import sys
sys.path.append("..")

import utils.tools as tools
from utils.log import log
from base.collector import Collector
from base.root_url import AddRootUrl
from html_parser.parser_control import PaserControl


def init():
    pass


if __name__ == '__main__':
    log.info("--------begin--------")

    addRootUrl = AddRootUrl()
    addRootUrl.start()

    coll = Collector()
    coll.start()

    paserCount = int(tools.getConfValue("html_parser", "parser_count"))
    while paserCount:
        paser = PaserControl()
        paser.start()
        paserCount -= 1
def begin_callback():
    log.info('\n********** VA_APP begin **********')
    db = MongoDB()
    db.delete('VAApp_urls', {})
def begin_callback():
    # mongo_db = MongoDB()
    # mongo_db.update('ZHEJIANG_APP_urls', {'depth': 0}, {'status': 0})
    log.info('\n********** spider_main begin **********')
def begin_callback():
    log.info('\n********** spider_main begin **********')
def end_callback():
    log.info('\n********** spider_main end **********')
def begin_callback():
    log.info('\n********** template begin **********')
    db.delete('op_urls', {})
    db.delete('op_content_info', {})
def begin_callback():
    log.info('\n********** spider_article begin **********')
def deal_article_list(self, req_url, text):
    """
    @summary: Fetch the article list. There are two cases:
        1. Viewing the history page for the first time returns HTML, which
           includes the account info
        2. Pulling down for more returns JSON
        The article list itself is JSON in both cases, with the same format
    Crawl approach:
        1. For the first format, parse the article content directly and
           build the URL of the next (JSON-format) page
        2. For the second format, parse the JSON directly
    ---------
    @param data:
    ---------
    @result:
    """
    try:
        # Check whether the account is banned; banned accounts have no
        # article list
        __biz = tools.get_param(req_url, "__biz")
        if "list" in text:
            # Take the article list out of the HTML format
            if "action=home" in req_url:
                # Parse the account info
                self.__parse_account_info(text, req_url)

                # Parse the article list
                regex = "msgList = '(.*?})';"
                article_list = tools.get_info(text, regex, fetch_one=True)
                article_list = article_list.replace("&quot;", '"')
                publish_time = self.__parse_article_list(
                    article_list, __biz, is_first_page=True)

                # Check whether there are more articles; if not, move on to
                # the next account, otherwise pull down for more
                regex = "can_msg_continue = '(\d)'"
                can_msg_continue = tools.get_info(text, regex,
                                                  fetch_one=True)
                if can_msg_continue == "0":  # no more articles
                    log.info("Reached the bottom of the list, no more articles; "
                             "account {} is done".format(__biz))
                    new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                        __biz)
                    if not new_last_publish_time:
                        # Mark as a zombie account
                        log.info("Account {} is a zombie account; no longer monitored".format(__biz))
                        self._task_manager.sign_account_is_zombie(__biz)
                    else:
                        self._task_manager.update_account_last_publish_time(
                            __biz, new_last_publish_time)

                elif publish_time:
                    # Build the pull-down-for-more history URL and jump to it
                    # Take appmsg_token from the HTML
                    regex = 'appmsg_token = "(.*?)";'
                    appmsg_token = tools.get_info(text, regex,
                                                  fetch_one=True)

                    # Take the other parameters from the URL
                    __biz = tools.get_param(req_url, "__biz")
                    pass_ticket = tools.get_param(req_url, "pass_ticket")

                    next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                        __biz=__biz,
                        offset=10,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token,
                    )
                    return self._task_manager.get_task(
                        next_page_url,
                        tip="Crawling the list: next_offset {} reached {}".format(
                            10, publish_time),
                    )

            else:  # JSON format
                text = tools.get_json(text)
                article_list = text.get("general_msg_list", {})
                publish_time = self.__parse_article_list(
                    article_list, __biz)

                # Check whether there are more articles; if not, move on to
                # the next account, otherwise pull down for more
                can_msg_continue = text.get("can_msg_continue")
                if not can_msg_continue:  # no more articles
                    log.info("Reached the bottom of the list, no more articles; "
                             "account {} is done".format(__biz))
                    new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                        __biz)
                    self._task_manager.update_account_last_publish_time(
                        __biz, new_last_publish_time)

                elif publish_time:
                    # Build the pull-down-for-more history URL and jump to it
                    # Take the parameters from the URL
                    __biz = tools.get_param(req_url, "__biz")
                    pass_ticket = tools.get_param(req_url, "pass_ticket")
                    appmsg_token = tools.get_param(req_url, "appmsg_token")

                    # Take offset from the JSON
                    offset = text.get("next_offset", 0)

                    next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                        __biz=__biz,
                        offset=offset,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token,
                    )
                    return self._task_manager.get_task(
                        next_page_url,
                        tip="Crawling the list: next_offset {} reached {}".format(
                            offset, publish_time),
                    )
        else:
            # This __biz account has been banned
            self._task_manager.sign_account_is_zombie(__biz)

    except Exception as e:
        log.exception(e)

    return self._task_manager.get_task()
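# tools.get_info above is assumed to be a thin regex helper. A sketch of a
# compatible implementation with the standard library (hypothetical code,
# not the project's own):
import re

def get_info(text, regex, fetch_one=False):
    """Return the first capture group if fetch_one, else all matches."""
    if fetch_one:
        match = re.search(regex, text)
        return match.group(1) if match else ""
    return re.findall(regex, text)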
def end_callback():
    log.info('\n********** spider_article end **********')
def begin_callback():
    # db.update('WWA_app_urls', {'depth': 0}, {'status': 0})
    db.delete('WWA_search_app_urls')
    log.info('\n********** wwa begin **********')
    ('/app/managelist', ManageAllHandler),
    ('/app/manage/changepasswd', ManageChangePasswordHandler),
    ('/app/recordlist', RecordAllHandler),
    ('/app/audio', AudioHandler),
    ('/app/audiolist', AudioAllHandler),
    ('/app/upload', UploadHandler),
    ('/app/events', EventsHandler),
    ('/server/releasealarm', ReleaseAlarmHandler),
    # ('/static/(.*)', StaticHandler),
    ('/.*', RedirectHandler),
], **settings)

# chrome --allow-running-insecure-content
# usage from http://stackoverflow.com/questions/8045698/https-python-client
# openssl genrsa -out privatekey.pem 2048
# openssl req -new -key privatekey.pem -out certrequest.csr
# openssl x509 -req -in certrequest.csr -signkey privatekey.pem -out certificate.pem

if __name__ == "__main__":
    threading.Thread(target=alarm_sync).start()
    http_server = tornado.httpserver.HTTPServer(
        application,
        # ssl_options={
        #     "certfile": os.path.join("./", "certificate.pem"),
        #     "keyfile": os.path.join("./", "privatekey.pem"),
        # }
    )
    http_server.listen(SERVERPORT)
    log.info("server start")
    tornado.ioloop.IOLoop.instance().start()
test_file_name = '/home/h4ct1c/omnisync/local/test2'
test_event = InotifyEvent(
    None,
    {'source': '/home/h4ct1c/omnisync/local/',
     'syncers': ['GoogleDrive'],
     'target': '/omniSync'},
    mask=None,
    file_name=os.path.basename(test_file_name),
    base_path=os.path.dirname(test_file_name),
    source_absolute=test_file_name,
    isdir=False
)

drive = GoogleDrive(
    progress_callback=lambda syncer, file, progress:
        log.info("%s: %s %s" % (syncer.name, progress, file))
)
drive.get_credentials()
drive.authorize()

for item in drive.walk():
    print(item)

# drive.consume_item(test_event)
# drive._create_folder('omniSync')
# drive._path_to_ids('/omniSync/', create_missing=True)
# drive._get_file('omnisync')

# region modline
# vim: set tabstop=4 shiftwidth=4 expandtab:
# vim: foldmethod=marker foldmarker=region,endregion:
def begin_callback():
    log.info('\n********** VA begin **********')
def _upload(self, event, dropbox_path):
    if event.isdir:
        if event.type != 'CREATE':
            return
        try:
            self.client.file_create_folder(dropbox_path)
        except dropbox.rest.ErrorResponse as e:
            log.exception(e)
        finally:
            return
    with open(event.source_absolute, 'rb') as file:
        self._put_file(file, event.source_absolute, dropbox_path)

# endregion

if __name__ == '__main__':
    sys.path = sys.path[1:]
    import dropbox

    remote = Dropbox(
        progress_callback=lambda syncer, path, progress:
            log.info("%s: %s %s" % (syncer.name, progress, path))
    )
    remote.init()
    remote.walk('/')

# region modline
# vim: set tabstop=4 shiftwidth=4 expandtab:
# vim: foldmethod=marker foldmarker=region,endregion:
# endregion
def initialize_saf(database='data/objects_20131126_112742.xls'):
    global models
    CSVModel.clear()
    csv_docs = xls_parse_from_url(database)
    log.info('Loaded %s' % database)
    model_instances = {}
    for k, doc in csv_docs.iteritems():
        try:
            csv_model = CSVModel(doc).create_model('saf_%s' % k)
            models[csv_model.__name__] = csv_model
            model_instances[csv_model.__name__] = csv_model.from_csv(doc)
            log.info("Parsed sheet %s" % k)
        except ArgumentError:
            log.exception("Couldn't load %s" % k)
            continue
        except TypeError:
            log.exception("Couldn't load %s" % k)
            continue

    from model.refs.saf_instrument_ref import SAFInstrumentRef
    log.info("Dropping SAF Views")
    drop_saf_instrument_view(engine)
    drop_qc_view(engine)
    log.info("Dropping SAF Models")
    CSVModel.drop_all(engine)
    log.info("Creating SAF Models")
    CSVModel.create_all(engine)
    log.info("Creating SAF Views")
    initialize_saf_instrument_view(engine)
    initialize_qc_view(engine)

    for k, v in model_instances.iteritems():
        for inst in v:
            session.add(inst)
        try:
            session.commit()
        except Exception:
            session.rollback()
            from traceback import print_exc
            print_exc()
            raise
        log.info('Initialized %s' % k)
    log.info('Initialized SAF Data instances')

    instruments = model_instances['saf_instrument']
    instruments = [(i.id, i.data_product_list) for i in instruments]
    log.info("Loaded instruments into memory")
    if engine.name == 'postgresql':
        speedy_saf_ref(instruments)
    else:
        linear_saf_ref(instruments, session)
def initialize_database(path=MASTER_DOC):
    global models
    CSVModel.clear()
    csv_docs = xls_parse_from_url(path)
    log.info('Downloaded %s' % path)
    model_instances = {}
    for k, doc in csv_docs.iteritems():
        if k in ['IDMap', 'AllScenarios']:
            continue
        try:
            csv_model = CSVModel(doc).create_model(k)
            models[csv_model.__name__] = csv_model
            model_instances[k] = csv_model.from_csv(doc)
            log.info("Parsed sheet %s" % k)
        except ArgumentError:
            log.exception("Couldn't load %s" % k)
            continue
        except TypeError:
            log.exception("Couldn't load %s" % k)
            continue

    # We want a late load so that the order is preserved and deterministic
    from model.refs.parameter_ref import ParameterRef
    log.info('Dropping view')
    drop_dp_view(engine)
    drop_view(engine)
    CSVModel.drop_all(engine)
    CSVModel.create_all(engine)
    log.info('Creating view')
    initialize_view(engine)
    initialize_dp_view(engine)

    for k, v in model_instances.iteritems():
        for inst in v:
            session.add(inst)
        try:
            session.commit()
        except Exception:
            session.rollback()
            from traceback import print_exc
            print_exc()
        log.info("Initialized %s" % k)

    log.info("Initializing Parameter References and Associations")
    pdicts = [(pdict.scenario, pdict.id, pdict.parameter_ids)
              for pdict in model_instances['ParameterDictionary']]
    log.info("Loaded ParameterDictionary into memory")
    params = {p.id: p.scenario for p in model_instances['ParameterDefs']}
    log.info("Loaded Parameters into Memory")
    if engine.name == 'postgresql':
        speedy_parameter_load(pdicts, params)
    else:
        linear_parameter_load(pdicts, params, session)
def end_callback():
    # Update keyword status: done
    log.info('\n********** VA end **********')
def telemetry_query(datatype, amount, session):
    """
    datatype selects which data to query from TelemetryModel; it can be
    "GOM", "RTC", "RPI", "GYRO", "THERMO", "PRESSURE", or "ALL".
    Logs the `amount` most recent entries in the database for the datatype.
    A SQLAlchemy session must be passed in.
    """
    try:
        if datatype == "ALL":
            entries = session.query(TelemetryModel).all()
            length = len(entries)
            for entry in range(length - amount, length):
                logger.info(entries[entry])

        elif datatype == "GOM":
            GOM_query = session.query(
                TelemetryModel.time_polled,
                TelemetryModel.GOM_vboost1,
                TelemetryModel.GOM_vboost2,
                TelemetryModel.GOM_vboost3,
                TelemetryModel.GOM_vbatt,
                TelemetryModel.GOM_curin1,
                TelemetryModel.GOM_curin2,
                TelemetryModel.GOM_curin3,
                TelemetryModel.GOM_cursun,
                TelemetryModel.GOM_cursys,
                TelemetryModel.GOM_reserved1,
                TelemetryModel.GOM_curout1,
                TelemetryModel.GOM_curout2,
                TelemetryModel.GOM_curout3,
                TelemetryModel.GOM_curout4,
                TelemetryModel.GOM_curout5,
                TelemetryModel.GOM_curout6,
                TelemetryModel.GOM_outputs,
                TelemetryModel.GOM_latchup1,
                TelemetryModel.GOM_latchup2,
                TelemetryModel.GOM_latchup3,
                TelemetryModel.GOM_latchup4,
                TelemetryModel.GOM_latchup5,
                TelemetryModel.GOM_latchup6,
                TelemetryModel.GOM_wdt_i2c_time_left,
                TelemetryModel.GOM_wdt_gnd_time_left,
                TelemetryModel.GOM_counter_wdt_i2c,
                TelemetryModel.GOM_counter_wdt_gnd,
                TelemetryModel.GOM_counter_boot,
                TelemetryModel.GOM_bootcause,
                TelemetryModel.GOM_battmode,
                TelemetryModel.GOM_temp1,
                TelemetryModel.GOM_temp2,
                TelemetryModel.GOM_temp3,
                TelemetryModel.GOM_temp4,
                TelemetryModel.GOM_pptmode,
                TelemetryModel.GOM_reserved2,
            ).all()
            length = len(GOM_query)
            for entry in range(length - amount, length):
                logger.info(GOM_query[entry])

        elif datatype == "RTC":
            RTC_query = session.query(
                TelemetryModel.time_polled,
                TelemetryModel.RTC_measurement_taken).all()
            length = len(RTC_query)
            for entry in range(length - amount, length):
                logger.info(RTC_query[entry])

        elif datatype == "RPI":
            RPI_query = session.query(
                TelemetryModel.time_polled,
                TelemetryModel.RPI_cpu,
                TelemetryModel.RPI_ram,
                TelemetryModel.RPI_dsk,
                TelemetryModel.RPI_tmp,
                TelemetryModel.RPI_boot,
                TelemetryModel.RPI_uptime,
            ).all()
            length = len(RPI_query)
            for entry in range(length - amount, length):
                logger.info(RPI_query[entry])

        elif datatype == "GYRO":
            GYRO_query = session.query(
                TelemetryModel.time_polled,
                TelemetryModel.GYRO_gyr_x,
                TelemetryModel.GYRO_gyr_y,
                TelemetryModel.GYRO_gyr_z,
                TelemetryModel.GYRO_acc_x,
                TelemetryModel.GYRO_acc_y,
                TelemetryModel.GYRO_acc_z,
                TelemetryModel.GYRO_mag_x,
                TelemetryModel.GYRO_mag_y,
                TelemetryModel.GYRO_mag_z,
                TelemetryModel.GYRO_temperature).all()
            length = len(GYRO_query)
            for entry in range(length - amount, length):
                logger.info(GYRO_query[entry])

        elif datatype == "THERMO":
            THERMO_query = session.query(
                TelemetryModel.time_polled,
                TelemetryModel.THERMOCOUPLE_temperature).all()
            length = len(THERMO_query)
            for entry in range(length - amount, length):
                logger.info(THERMO_query[entry])

        elif datatype == "PRESSURE":
            PRESSURE_query = session.query(
                TelemetryModel.time_polled,
                TelemetryModel.PRESSURE_pressure).all()
            length = len(PRESSURE_query)
            for entry in range(length - amount, length):
                logger.info(PRESSURE_query[entry])
    except Exception:
        logger.error("error during telemetry_query")
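# A minimal usage sketch (hedged): assumes TelemetryModel is a declarative
# model whose tables exist in the database reached by the engine; the
# SQLite URL below is only for illustration.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine("sqlite:///telemetry.db")  # hypothetical database
Session = sessionmaker(bind=engine)
session = Session()

telemetry_query("GYRO", 5, session)  # log the 5 most recent gyro rows
session.close()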