class Fetcher(object):

    def __init__(self, queue_name, sub_type, get_db_dict=None, save_db_dict=None):
        # config = __import__("FetchConfig")
        # get_db_dict = config.QYXX_GET_DB
        # save_db_dict = config.QYXX_PUT_DB
        self.logger = Logging(__name__)
        # queue_name = queue_name  # for debug
        self.__get_db = DBManager.getInstance(get_db_dict["type"],
                                              queue_name,
                                              port=get_db_dict["port"],
                                              host=get_db_dict["host"])
        # self.__save_db = DBManager.getInstance(get_db_dict["type"],
        #                                        queue_name,
        #                                        port=get_db_dict["port"],
        #                                        host=get_db_dict["host"])
        self.queue_name = queue_name
        self.__data_dic = {}

    def get(self):
        item = self.__get_db.get()
        # self.__get_db.save(item)  # keep a temporary copy, for debugging
        if item:
            self.__data_dic = json.loads(item)
        return self.__data_dic

    def update(self, *args, **kwargs):
        if args:
            data = filter(lambda x: isinstance(x, dict), args)
            map(lambda x: self.__data_dic.update(x), data)
        if kwargs:
            self.__data_dic.update(kwargs)

    def save(self, data_dict=None):
        if data_dict:
            self.__get_db.save(data_dict)
        else:
            self.__get_db.save(self.__data_dic)

    def backup(self):
        k = "html_" + str(os.getpid()) + "_" + self.queue_name
        self.__get_db.keySet(k, self.__data_dic)

    def hget(self, key=None):
        item = self.__get_db.hget(key=key)
        if item:
            self.__data_dic = json.loads(item)
            return self.__data_dic
        else:
            self.logger.warning("%s hash is empty, please check", self.queue_name)
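# Usage sketch (added for illustration; not part of the original source). It shows the
# intended get -> update -> backup -> save cycle of Fetcher. The queue name "qyxx" and the
# db dict mirror what the work() controllers below build from DBConfig.ini.
def _example_fetcher_usage():
    src_db_dict = {
        'type': confGetterFunc("DBConfig.ini", 'html_db', 'type').lower(),
        'host': confGetterFunc("DBConfig.ini", 'html_db', 'host').lower(),
        'port': int(confGetterFunc("DBConfig.ini", 'html_db', 'port')),
    }
    fetch = Fetcher("qyxx", "qyxx", get_db_dict=src_db_dict, save_db_dict=src_db_dict)
    record = fetch.get()            # returns the last cached dict ({} at first) when the queue is empty
    if record:
        fetch.update(parsed=False)  # merge extra keys into the cached record
        fetch.backup()              # snapshot under "html_<pid>_<queue_name>" before processing
        fetch.save()                # push the cached record back to the queue
    return record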
def __init__(self, pinyin, version=None):
    self.pinyin = pinyin
    self.ua = cu.get_user_agent()
    self.version = version
    self.logging = Logging(name=pinyin)
    self.recChar = None
    self.yzm_count = 0
    cg = ConfigGet('Config.ini')
    opt = cg.get("setting", "debug", "false")
    self.debug = 1 if opt.lower() == "true" else 0
class CrawlerController(object):
    """
    Controller for the different subcontrol types, such as qyxx and xgxx.
    The parameters come from the outer input; the subcontrol instance is
    loaded dynamically.
    """

    def __init__(self, paras):
        """
        Initialise properties from the parameter instance.
        :param paras:
        :return:
        """
        self.subcontrol = paras.getProperty("subcontrol")
        self.inst_name = paras.getProperty("inst_name")
        self.process_num = paras.getProperty("process_num")
        self.logger = Logging("qyxx_process_info", stream_flag=False)
        self.logger.info("logger address")

    def loadSubControl(self):
        sub_ctrl_inst = ClassFactory.getClassInst(self.subcontrol)
        return sub_ctrl_inst
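# Usage sketch (illustrative only; not in the original source). CrawlerController just needs
# an object exposing getProperty(), so a minimal hypothetical stub stands in for the real
# parameter instance; the subcontrol name "QyxxControl" is likewise a placeholder.
class _FakeParas(object):
    def __init__(self, props):
        self._props = props

    def getProperty(self, key):
        return self._props.get(key)


def _example_controller_usage():
    paras = _FakeParas({"subcontrol": "QyxxControl", "inst_name": "qyxx", "process_num": 4})
    controller = CrawlerController(paras)
    sub_ctrl = controller.loadSubControl()  # instantiated dynamically via ClassFactory
    return sub_ctrl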
def work(self, pro_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=pro_type)
    log.info("Process begin")

    pro_type = pro_type.lower()
    queue_name = pro_type
    module_name = pro_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name, package_name="Parser", pinyin=pro_type.lower())
    # nb_module_name = pro_type.capitalize() + "Nb" + "Handler"
    # nb_handler = ClassFactory.getClassInst(module_name, package_name="Parser", pinyin=pro_type.lower())

    normal_table = pro_type + "_data"
    err_table = normal_table + "_error"

    db_inst = DBManager.getInstance(des_db_dict["type"], normal_table, host=des_db_dict["host"], port=des_db_dict["port"])  # ssdb, holds parsed data
    # kfk_inst = DBManager.getInstance("kafka", "qyxx_html", host="spider7", port=9092)
    debug_normal_table = "new_" + pro_type.lower() + "_data"
    db_debug = DBManager.getInstance("mongo", debug_normal_table, host="spider7", port=27037)  # mongo, local copy for debugging

    # fetch = Fetcher(queue_name.lower(), "qyxx")  # enable this if not debug
    fetch = Fetcher(queue_name, "qyxx", get_db_dict=src_db_dict, save_db_dict=des_db_dict)  # debug
    while True:
        try:
            # source_dict = fetch.hget()
            source_dict = fetch.get()
            if source_dict:
                # copy the seed info into the parsed data
                if source_dict.has_key("bbd_seed"):
                    seed_dict = {"bbd_seed": source_dict["bbd_seed"]}
                if source_dict.has_key("BBD_SEED"):
                    seed_dict = {"bbd_seed": source_dict["BBD_SEED"]}
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ING, seed_dict)
                log.info(log_info)
                # fetch.backup()  # avoid data loss if the process exits abnormally
                res_dict = UniField.cloneNeedColumns(source_dict)

                log.info("start to a new seed %s", source_dict)
                # debug
                # db_inst.changeTable("new_" + pro_type.lower())
                # db_inst.save(source_dict)
                # rowkey = source_dict["rowkey"]
                # db_inst.hset(rowkey, source_dict)
                # db_inst.changeTable("new_" + pro_type.lower() + "_processed")
                # db_inst.save(source_dict)

                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    db_inst.changeTable(normal_table)
                    res_dict = UniField.unifyParseResult(res_dict)
                    # for debug
                    db_debug.save(res_dict)
                    # db_inst.save(res_dict)
                    # kfk_inst.save(source_dict)
                    # print "kfk size:", kfk_inst.size()
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC, seed_dict)
                    log.info(log_info)
                else:
                    db_inst.changeTable(err_table)
                    res_dict["html"] = source_dict
                    # db_inst.save(res_dict)
                    db_debug.save(res_dict)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO, seed_dict)
                    log.info(log_info)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", pro_type)
                time.sleep(10)
        except Exception as e:
            print str(e)
            raise Exception(e)
def work(pro_type, seed=None):

    def storeResult(src_dict, company_dict=None):
        # if company_dict.has_key(u"名称"):
        #     src_dict.update({"company_name": company_dict[u"名称"]})
        # src_dict.update({"values": company_dict})
        src_dict = UniField.unifyRequestResult(src_dict, pro_type)
        if src_dict.has_key("rowkey"):
            rowkey = src_dict["rowkey"]
            print "统一字段后 rowkey=", rowkey
            src_dict.update({"BBD_SEED": seed.getDict()})
            if src_dict["status"] == 0:
                db_inst.changeTable("new_" + pro_type)
                db_inst.hset(rowkey, src_dict)
                db_inst.save(src_dict)
            else:
                db_inst.changeTable("new_" + pro_type + "_error")
                db_inst.hset(rowkey, src_dict)
                db_inst.save(src_dict)
            print "rowkey=", rowkey
        else:
            print "No Rowkey ,抓取后的结果为:", src_dict

    def crawlerKeyWordList(keyword_list):
        """
        Crawl the keywords one by one: if the first yields nothing, try the next;
        if the last one still fails, record the seed info to ssdb.
        :param keyword_list:
        :return:
        """
        keyword_num = len(keyword_list)
        for keyword in keyword_list:
            seed_status = inst.crawl(keyword)
            if seed_status.access_type == SeedAccessType.OK:  # success, log the message
                # log.info("End seed with keyword %s", keyword)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, seed.getDict())
                log.info(log_info)
                break
            elif seed_status.access_type != SeedAccessType.OK and keyword_num > 0:
                keyword_num -= 1
                # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, seed.getDict())
                log.info("Use Key word [%s] get company failed", keyword)
                continue
            else:
                seed.update(status=seed_status.access_type)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, seed.getDict())
                log.info(log_info)
                seed.save()

    try:
        from CommonLib.Logging import Logging
        log = Logging(name=pro_type)
        log.info("Process begin for %s", pro_type)

        module_name = "Crawler" + pro_type.capitalize()
        pro_type = pro_type.lower()
        inst = ClassFactory.getClassInst(module_name, package_name="qyxx_all", pinyin=pro_type, callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance("ssdb", "new_" + pro_type, host="spider5", port=57888)

        if seed is None:
            seed = Seed(pro_type)
            seed.get()
        else:
            seed = seed
        # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, seed.getDict())
        # log.info("start to a new seed %s", log_info)
        # if seed.url_status:
        #     seed_status = inst.crawlUrl(seed.url, seed.name)
        #     if seed_status.access_type == SeedAccessType.OK:  # success, log the message
        #         log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, seed.getDict())
        #         log.info(log_info)
        #     else:  # the url did not work, fall back to the keyword list
        #         log.info(" Url get company info failed [%s]", pro_type)
        #         keyword_list = seed.values
        #         crawlerKeyWordList(keyword_list)
        # else:
        keyword_list = seed.values
        crawlerKeyWordList(keyword_list)
    except Exception as e:
        print str(e)
def work(bbd_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type
    nb_module_name = bbd_type.capitalize() + "Nb" + "Handler"
    nb_handler = ClassFactory.getClassInst(nb_module_name, package_name="Parser", pinyin=bbd_type.lower())

    bbd_table = "qyxx_data_nb"
    bbd_src_table = "qyxx_html_nb"
    normal_table = bbd_type + "_data" + "_nb"
    err_table = normal_table + "_error"
    # html_normal_table = bbd_type + "_src" + "_nb"

    des_db_inst = DBManager.getInstance(des_db_dict["type"], bbd_table, host=des_db_dict["host"], port=des_db_dict["port"])  # holds parsed data
    err_db_inst = DBManager.getInstance(src_db_dict["type"], err_table, host=src_db_dict["host"], port=src_db_dict["port"])

    fetch = Fetcher(queue_name + "_nbxx", "qyxx", get_db_dict=src_db_dict, save_db_dict=des_db_dict)  # debug
    while True:
        try:
            source_dict = fetch.hget()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                if res_dict.has_key("year"):
                    res_dict["_id"] = UniField.updateId(res_dict['_id'], res_dict['year'])
                # log.info("start to a new seed %s", seed_dict)
                res_dict = nb_handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict, bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table, str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table, str(des_db_inst.size()))
                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC, seed_dict)
                    # log.info(log_info)
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO, seed_dict)
                    # log.info(log_info)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            source_dict["data"] = res_dict
            err_db_inst.save(source_dict)
            raise Exception(e)
def work(bbd_type, value_list=None):
    conf_file = "DBConfig.ini"
    db_conf_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }

    def getNbxxDict(src_dict):
        nbxx_key_list = filter(lambda x: x.startswith("qynb_"), src_dict.keys())
        nbxx_list = map(lambda x: {x: src_dict.pop(x)}, nbxx_key_list)
        return nbxx_list

    def getYear(nb_dict):
        key = nb_dict.keys()[0]
        year = key.split("_")[1]
        return year

    def storeResult(src_dict, company_dict=None):
        """
        Callback invoked by the crawler to store data to ssdb.
        :param src_dict:
        :param company_dict:
        :return:
        """
        try:
            if src_dict["status"] == 0:
                src_dict = UniField.unifyRequestResult(src_dict, bbd_type)
                if src_dict.has_key("rowkey"):
                    rowkey = src_dict["rowkey"]
                    nbxx_list = getNbxxDict(src_dict)
                    nb_year_list = []  # used to notify the solr interface
                    for nb_item in nbxx_list:
                        # split each annual report into its own record with its own rowkey and put it into the hash
                        year = getYear(nb_item)
                        nb_year_list.append(year)
                        nbxx_dict = UniField.cloneNeedColumns(src_dict)
                        nbxx_dict.update({"bbd_seed": bbd_seed_dict})
                        nbxx_dict.update(nb_item)
                        db_inst.changeTable(bbd_type + "_nbxx")
                        nb_rk = rowkey + "|_|" + year
                        nbxx_dict["rowkey"] = nb_rk
                        nbxx_dict["year"] = year
                        db_inst.hset(nb_rk, nbxx_dict)
                        log.info(u"存储 %s 年年报 成功,rowkey 为 [ %s ]", year, nb_rk)

                    zch = src_dict["rowkey_dict"]["company_zch"]
                    company_name = src_dict["rowkey_dict"]["company_name"]
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, bbd_seed_dict)
                    log.info(log_info)
                    src_dict.update({"bbd_seed": bbd_seed_dict})
                    db_inst.changeTable(bbd_type)
                    db_inst.save(src_dict)
                    log.info(u" ,rowkey 为 [ %s ]", rowkey)
                    NbxxApiControler().nbUpdate(company_name=company_name, pinyin=bbd_type, zch=zch, years_list=nb_year_list)
                else:
                    raise Exception("No rowkey")
            else:
                db_inst.changeTable(bbd_type + "_error")
                db_inst.save(src_dict)
        except Exception as e:
            log.info(str(e))
            db_inst.changeTable(bbd_type + "_error")
            db_inst.save(src_dict)
            log.info(u"存储抓取网页原文 失败,rowkey 为 [ %s ]", rowkey)

    def crawlerKeyWordList(keyword_list):
        """
        Crawl the keywords one by one: if the first yields nothing, try the next;
        if the last one still fails, record the seed info to ssdb.
        :param keyword_list:
        :return:
        """
        try:
            keyword_num = len(keyword_list)
            for keyword in keyword_list:
                keyword_num -= 1
                seed_status = inst.crawl(keyword)
                if seed_status.access_type == SeedAccessType.OK:  # success, log the message
                    # log.info("End seed with keyword %s", keyword)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict)
                    log.info(log_info)
                    log.info(u"种子抓取成功:)")
                    break
                elif seed_status.access_type != SeedAccessType.OK and keyword_num > 0:
                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                    log.info(u"种子抓取失败,关键字 [%s]", keyword)
                    continue
                else:
                    seed.update(status=seed_status.access_type)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                    log.info(log_info)
                    log.info(u"种子抓取失败,存储到队列,种子状态为 %s", str(seed_status))
                    seed.save()
        except Exception as e:
            log.info(str(e))
            raise Exception(u"种子抓取过程中遇到异常")

    ##################################################################################################################################
    try:
        from CommonLib.Logging import Logging
        log = Logging(name=bbd_type)
        log.info("Process begin for %s,logger=%s", bbd_type, str(log))

        module_name = "Crawler" + bbd_type.capitalize()
        bbd_type = bbd_type.lower()
        inst = ClassFactory.getClassInst(module_name, package_name="qyxx_all", pinyin=bbd_type, callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance(db_conf_dict["type"], bbd_type, host=db_conf_dict["host"], port=db_conf_dict["port"])
        bbd_seed_dict = {}
        if value_list:
            for keywd_list in value_list:
                crawlerKeyWordList(keywd_list)
        else:
            seed = Seed(bbd_type)
            while True:
                seed.get()
                bbd_seed_dict = seed.getDict()
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, bbd_seed_dict)
                log.info("starting a new seed %s", log_info)
                if seed.url_status:
                    seed_status = inst.crawlUrl(seed.url, seed.name)
                    if seed_status.access_type == SeedAccessType.OK:  # success, log the message
                        log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict)
                        log.info(log_info)
                    else:  # the url did not work, fall back to the keyword list
                        log.info(" Url get company info failed [%s]", bbd_type)
                        keyword_list = seed.values
                        crawlerKeyWordList(keyword_list)
                else:
                    keyword_list = seed.values
                    crawlerKeyWordList(keyword_list)
    except Exception as e:
        log.info(str(e))
        seed.save()
        raise Exception(e)
class Seed(object):

    def __init__(self, queue_name):
        self.__db = DBManager.getInstance(f(seed_cf, 'seed_db', 'type'), queue_name,
                                          port=f(seed_cf, 'seed_db', 'port'),
                                          host=f(seed_cf, 'seed_db', 'host'))
        # self.__solr = DBManager.getInstance('solr', 'seed', server=["spider7:8983", "spider7:8984", "spider7:8985"])
        self.queue_name = queue_name
        from CommonLib.Logging import Logging
        self.log = Logging(name=queue_name)
        self.__data_dic = {}

    def get(self):
        self.__data__()

    @property
    def __get__(self):
        assert hasattr(self.__db, 'get'), '%s has no attribute get' % self.__db
        for flag in ['_temp', '_bug', '_nonstatic', '_static', '_error', '_manyan']:
            self.__db.changeTable(self.queue_name + flag)
            data = self.__db.get()
            if data:
                self.__backup__(data)
                return data
        warnings.warn(self.queue_name + flag + u"队列为空")
        time.sleep(100)
        return self.__get__

    def __backup__(self, data):
        pid = os.getpid()
        self.__db.keySet('seed_%s' % str(pid) + '_' + self.queue_name, data)

    def getDict(self):
        return self.__data_dic

    def __data__(self):
        self.__data_dic = {}
        data = self.__get__.decode("UTF-8", "ignore").strip()
        if not isinstance(data, dict):
            if data.startswith('{') and data.endswith('}'):
                self.__data_dic.update(json.loads(data))
            elif len(data) < 2 or len(data) > 100:
                self.log.error('%s Seed length Error!' % data)
                return
            elif re.match(r"^\d{%d}" % len(data), data):
                self.__data_dic[u"zch"] = data
            elif re.search(u'[\u4e00-\u9fa5].+', unicode(data)):
                self.__data_dic['name'] = data
            elif re.match(u'[\d|\w]{%d}' % len(data), data):
                self.__data_dic['xydm'] = data
            else:
                self.__data_dic['name'] = data
        else:
            self.__data_dic.update(data)

        if 'url' in self.__data_dic:
            self.url_status = True
            self.__setattr__('url', self.__data_dic['url'])
            if 'data' in self.__data_dic:
                self.__setattr__('data', self.__data_dic['data'])
        else:
            self.url_status = False
        self.__setattr__('values',
                         map(lambda x: self.__data_dic[x],
                             filter(lambda x: self.__data_dic.get(x), ['xydm', 'zch', 'name', ])))

    def update(self, *args, **kwargs):
        if args:
            data = filter(lambda x: isinstance(x, dict), args)
            map(lambda x: self.__data_dic.update(x), data)
        if kwargs:
            self.__data_dic.update(kwargs)

    def __save__(self, flag_name):
        """
        for ssdb
        :param flag_name:
        :return:
        """
        self.__db.select_queue(self.queue_name + flag_name)
        self.__db.save(self.__data_dic)
        self.__data_dic = {}

    # def __savesolr__(self):
    #     self.__data_dic['do_time'] = time.strftime("%Y-%m-%d")
    #     self.__data_dic['type'] = self.queue_name
    #     for old_k in self.__data_dic:
    #         if old_k == 'id' or old_k.endswith('_seed'):
    #             continue
    #         else:
    #             self.__data_dic[old_k + '_seed'] = self.__data_dic.pop(old_k)
    #
    #     if 'id' not in self.__data_dic:
    #         key_list = ['xydm_seed', 'zch_seed', 'name_seed', 'url_seed']
    #         ids = filter(lambda x: x in self.__data_dic, key_list)
    #         if ids:
    #             ids = map(lambda x: self.__solr.find({x: self.__data_dic[x]})['docs'], ids)[0]
    #             if ids:
    #                 self.__data_dic['id'] = ids[0][0]['id']
    #     self.__solr.update(self.__data_dic)
    #     self.__data_dic = {}

    def save(self):
        status = int(self.__data_dic.get('status', 0))
        if status == 0:
            pass  # self.__savesolr__()
        elif status == SeedAccessType.ERROR:
            self.__save__('_error')
        elif status == SeedAccessType.NON_COMPANY:
            self.__save__('_noncompany')
        elif status == SeedAccessType.INCOMPLETE:
            self.__save__('_temp')
        elif status == SeedAccessType.NO_TARGET_SOURCE:
            self.__save__('_nosource')
        else:
            self.log.info('Status Error!')
            self.__save__('_unknown')
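# Usage sketch (illustrative only; not part of the original source). A Seed drains the
# "<queue>_temp/_bug/_nonstatic/..." tables in order, normalises the raw value into
# {"zch"|"name"|"xydm": ...}, and exposes it through getDict()/values. On failure the
# caller tags a SeedAccessType status and save() routes the seed to the matching queue.
def _example_seed_usage():
    seed = Seed("qyxx")
    seed.get()                      # pop and normalise one raw seed
    keywords = seed.values          # e.g. [xydm, zch, name], only the keys that exist
    if not keywords:
        seed.update(status=SeedAccessType.INCOMPLETE)
        seed.save()                 # lands in the "qyxx_temp" queue
    return seed.getDict()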
def work(bbd_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type
    module_name = bbd_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name, package_name="xgxx", pinyin=bbd_type.lower())

    bbd_table = bbd_type + "_data"
    bbd_src_table = bbd_table + "_src"
    err_table = bbd_table + "_error"
    # html_normal_table = bbd_type + "_src" + "_nb"

    des_db_inst = DBManager.getInstance(des_db_dict["type"], bbd_table, host=des_db_dict["host"], port=des_db_dict["port"])  # holds parsed data
    err_db_inst = DBManager.getInstance(src_db_dict["type"], err_table, host=src_db_dict["host"], port=src_db_dict["port"])

    if bbd_type == 'jyyc':
        while True:
            for province in ['anhui', 'beijing', 'chongqing', 'fujian', 'gansu', 'guangdong', 'guizhou',
                             'hainan', 'hebei', 'heilongjiang', 'henan', 'hubei', 'hunan', 'jiangsu',
                             'jiangxi', 'jilin', 'liaoning', 'neimenggu', 'ningxia', 'qinghai', 'shanghai',
                             'shangxixian', 'sichuan', 'tianjin', 'xinjiang', 'xizang', 'yunnan',
                             'zhejiang', 'zongju', 'shandong']:
                jyyc_queue = 'jyyc_{}'.format(province)
                fetch = Fetcher(jyyc_queue, "xgxx", get_db_dict=src_db_dict, save_db_dict=des_db_dict)
                while True:
                    try:
                        source_dict = fetch.get()
                        if source_dict:
                            res_dict = UniField.cloneNeedColumns(source_dict)
                            res_dict = handler.parse(source_dict, res_dict, province)
                            if res_dict["status"] == 0:
                                res_dict = UniField.unifyParseResult(res_dict, bbd_table=bbd_table)
                                des_db_inst.changeTable(bbd_table)
                                des_db_inst.save(res_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table, str(des_db_inst.size()))
                                des_db_inst.changeTable(bbd_src_table)
                                des_db_inst.save(source_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table, str(des_db_inst.size()))
                            else:
                                source_dict["data"] = res_dict
                                err_db_inst.save(source_dict)
                        else:
                            log.info(u"解析%s队列为空, 进入下一队列", jyyc_queue)
                            break
                    except Exception as e:
                        log.info(str(e))
                        source_dict["data"] = res_dict
                        err_db_inst.save(source_dict)
                        raw_input('ssss')
                        raise Exception(e)
            log.info(u'解析完一轮, 一个小时后进入下一轮')
            time.sleep(1 * 60 * 60)

    fetch = Fetcher(queue_name, "xgxx", get_db_dict=src_db_dict, save_db_dict=des_db_dict)  # debug
    while True:
        try:
            source_dict = fetch.get()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                # log.info("start to a new seed %s", seed_dict)
                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict, bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table, str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table, str(des_db_inst.size()))
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            source_dict["data"] = res_dict
            err_db_inst.save(source_dict)
            raise Exception(e)
def work(bbd_type, need_seed=True, value_list=None):
    """
    Main entry of the outer crawler control. Responsibilities:
    1. initialise the crawler class
    2. initialise the DB connections
    3. fetch seeds
    4. store the data returned by the crawler
    5. store the seed info of failed crawls
    :param bbd_type: queue name used for storage; it also determines the crawler module name, so handle with care *****
    :param need_seed: when False, the crawler is started directly without fetching seeds
    :param value_list: seed values for the crawler, used for manual debugging
    :return:
    """
    conf_file = "DBConfig.ini"
    db_conf_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    seed_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'seed_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'seed_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'seed_db', 'port'))
        }

    def storeResult(src_dict, company_dict=None):
        """
        Callback invoked by the crawler to store data to ssdb.
        :param src_dict:
        :param company_dict:
        :return:
        """
        try:
            if "bbd_tmp_queue" in src_dict:
                queue_name = src_dict["bbd"]
            if src_dict["status"] == 0:
                src_dict = UniField.unifyRequestResult(src_dict, bbd_type)
                if "rowkey" in src_dict.keys():
                    src_dict.update({"bbd_seed": bbd_seed_dict})
                    if 'table_name' in src_dict:
                        table_name = src_dict.get('table_name')
                        db_inst.changeTable(table_name)
                    else:
                        db_inst.changeTable(bbd_type)
                    db_inst.save(src_dict)
                else:
                    raise Exception("No rowkey")
            else:
                db_inst.changeTable(bbd_type + "_error")
                db_inst.save(src_dict)
        except Exception as e:
            log.info(str(e))
            db_inst.changeTable(bbd_type + "_error")
            db_inst.save(src_dict)

    def crawlerKeyWordList(keyword_list):
        """
        Crawl the keywords one by one: if the first yields nothing, try the next;
        if the last one still fails, record the seed info to ssdb.
        :param keyword_list:
        :return:
        """
        keyword_num = len(keyword_list)
        for keyword in keyword_list:
            seed_status = inst.crawl(keyword)
            if seed_status.access_type == SeedAccessType.OK:  # success, log the message
                # log.info("End seed with keyword %s", keyword)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict)
                log.info(log_info)
                break
            elif seed_status.access_type != SeedAccessType.OK and keyword_num > 0:
                keyword_num -= 1
                # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                log.info("Use Key word [%s] get company failed", keyword)
                continue
            else:
                seed.update(status=seed_status.access_type)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                log.info(log_info)
                seed.save()

    ##################################################################################################################################
    try:
        from CommonLib.Logging import Logging
        log = Logging(name=bbd_type)
        log.info("Process begin for %s,logger=%s", bbd_type, str(log))

        module_name = "Crawler" + bbd_type.capitalize()
        bbd_type = bbd_type.lower()
        inst = ClassFactory.getClassInst(module_name, package_name="xgxx", pinyin=bbd_type, callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance(db_conf_dict["type"], bbd_type, host=db_conf_dict["host"], port=db_conf_dict["port"])

        if not need_seed:
            inst.crawl()
        else:
            bbd_seed_dict = {}
            if value_list:
                for keywd_list in value_list:
                    crawlerKeyWordList(keywd_list)
            else:
                seed_db_inst = DBManager.getInstance(seed_db_dict["type"], bbd_type, host=seed_db_dict["host"], port=seed_db_dict["port"])
                while True:
                    bbd_seed_dict = seed_db_inst.get()
                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, bbd_seed_dict)
                    # log.info("start to a new seed %s", log_info)
                    seed_status = inst.crawl(bbd_seed_dict)
                    if seed_status.access_type == SeedAccessType.OK:  # success, log the message
                        log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict)
                        log.info(log_info)
                    else:  # the crawl failed, save the seed to the error queue
                        log.info(u"种子抓取失败,存取到相应队列 [%s]", bbd_type)
                        seed_db_inst.changeTable(bbd_type + "_error")
                        seed_db_inst.save(bbd_seed_dict)
    except Exception as e:
        raise Exception(e)
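# Invocation sketch (illustrative only; the bbd_type value and keyword are hypothetical
# placeholders). The xgxx outer control above supports three modes: seedless crawling,
# manual keyword lists for debugging, and the default loop that consumes seeds from seed_db.
if __name__ == "__main__":
    # work("jyyc", need_seed=False)                      # crawler drives itself, no seed queue
    # work("jyyc", value_list=[[u"some company name"]])  # manual debugging with keyword lists
    work("jyyc")                                         # normal mode: pull seeds from seed_db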
class ParserBase:
    """
    ParserBase is the base parser that provides common implementations for parsers
    @version: 1.0
    @author: david ding
    @modify:
    """

    def __init__(self, pinyin):
        """
        Initiate the parameters.
        """
        self.pinyin = pinyin
        self.log = Logging(name=pinyin)
        self.result_collection = None
        self.json_mapper_config = dict()
        self.ignore_key_list = list()
        self.jbxx_web = None  # holds the basic-information WebContent

    def appendJsonMapperConfig(self, key, value):
        if not key or not value or not isinstance(value, dict):
            return
        self.json_mapper_config[key] = value

    def parseCommon(self, result, rslt_mapper_config=None):
        if not result:
            return None
        self.result_collection = list()
        self.rslt_mapper_config = rslt_mapper_config
        self.current_key = None
        if isinstance(result, dict):
            self.parseDict(result)
        elif isinstance(result, list):
            self.parseList(result)
        mapper_config = rslt_mapper_config if rslt_mapper_config else transform
        company = self.resultMap(self.result_collection, mapper_config)
        company_cleaned = self.cleanUp(company)
        # extract and attach the url of the basic-information page
        if self.jbxx_web:
            company_cleaned['bbd_url'] = self.jbxx_web.url
        return company_cleaned

    def parseDict(self, result):
        if not result:
            return None
        if self.current_key:
            web = WebContent.getInstanceFromDictionary(result)
            if web:
                if self.current_key.startswith("jbxx_"):
                    self.jbxx_web = web
                return self.parseWebContentByKey(web)
        for key in result:
            if key in self.ignore_key_list:
                continue
            if key.endswith('_html') or key.endswith('_json'):
                self.current_key = key
            else:
                self.current_key = None
            value = result[key]
            if isinstance(value, dict):
                self.parseDict(value)
            elif isinstance(value, list):
                self.parseList(value)

    def parseList(self, result):
        if not result:
            return None
        for value in result:
            if isinstance(value, dict):
                self.parseDict(value)
            elif isinstance(value, list):
                self.parseList(value)

    def parseWebContentByKey(self, web):
        if not web or not isinstance(web, WebContent):
            return
        if web.status_code != 200 or web.time_out or not web.body:
            return
        result_list = None
        if self.current_key.endswith('_html'):
            result_list = self.parseHtmlTable(web.body)
        elif self.current_key.endswith('_json'):
            if not self.current_key:
                self.log.error(u"解析json内容 %s 时无对应的key,请检查结果集结构!" % web.body)
            if not self.json_mapper_config:
                self.log.error(u"解析json内容 %s 时使用key=%s无对应的config" % (web.body, self.current_key))
            elif self.current_key and self.current_key in self.json_mapper_config:
                result_list = self.parseJson(web.body, self.json_mapper_config[self.current_key])
            else:
                self.log.error(u"解析json内容 %s 时使用key=%s无对应的config" % (web.body, self.current_key))
        if result_list:
            self.result_collection.extend(result_list)

    def parseWebContentByType(self, web):
        if not web or not isinstance(web, WebContent):
            return
        if web.status_code != 200 or web.time_out or not web.body:
            return
        result_list = None
        if web.content_type == WebContentType.HTML:
            result_list = self.parseHtmlTable(web.body)
        elif web.content_type == WebContentType.JSON:
            if not self.current_key:
                self.log.error(u"解析json内容 %s 时无对应的key,请检查结果集结构!" % web.body)
            if not self.json_mapper_config:
                self.log.error(u"解析json内容 %s 时使用key=%s无对应的config" % (web.body, self.current_key))
            elif self.current_key and self.current_key in self.json_mapper_config:
                result_list = self.parseJson(web.body, self.json_mapper_config[self.current_key])
            else:
                self.log.error(u"解析json内容 %s 时使用key=%s无对应的config" % (web.body, self.current_key))
        if result_list:
            self.result_collection.extend(result_list)

    def parseHtmlTable(self, html):
        """
        Parse html-table style data into the standard key/value form.
        :param html: the html table page to parse
        :return:
        """
        parser = TableParseUtil(html)
        info_list = parser.parse()
        self.log.info(u"本次模块解析结果:\n %s", json.dumps(info_list))
        return info_list

    def parseJson(self, json_obj, mapper_config):
        """
        Parse the json page content.
        :param json_obj: json string or json object
        :param mapper_config: mapping dictionary
        :return:
        """
        if not json_obj:
            return None
        if isinstance(json_obj, basestring):
            json_obj = json.loads(json_obj)
        parser = JsonParseUtil()
        info_list = parser.parse(json_obj, mapper_config)
        if not info_list:
            return None
        self.log.info(u"本次模块解析结果:\n %s", json.dumps(info_list))
        return info_list

    def resultMap(self, result_list, mapper_config):
        """
        Collect the crawl results and call ParserMapper to apply the mapping.
        :param result_list: result set
        :param mapper_config: mapping config
        :return:
        """
        company_mapped = ParserMapper.doMap(mapper_config, result_list)
        result_json = json.dumps(company_mapped, ensure_ascii=False)
        self.log.info(u"企业信息映射结果:\n" + result_json)
        return company_mapped

    def cleanUp(self, company):
        if not company or not isinstance(company, dict):
            return None
        company_clean = dict()
        for key, value in company.items():
            if key in self.ignore_key_list:
                continue
            if isinstance(value, dict):
                v_list = list()
                v_list.append(value)
                company_clean[key] = v_list
            else:
                company_clean[key] = value

        # remove the ignored compound keys, e.g. gdxx.u'详情'
        for key in self.ignore_key_list:
            if '.' not in key:
                continue
            keys = key.split('.')
            recr_dict = company_clean
            for k in keys:
                if k not in recr_dict:
                    if isinstance(recr_dict, list):
                        for dc in recr_dict:
                            if k in dc:
                                del dc[k]
                    break
                if isinstance(recr_dict[k], basestring):
                    if k == keys[-1]:
                        del recr_dict[k]
                    break
                else:
                    recr_dict = recr_dict[k]

        result_json = json.dumps(company_clean, ensure_ascii=False)
        self.log.info(u"企业信息初步清理后结果:\n" + result_json)

        # check whether the output still contains keys that should have been mapped but were not
        for key, value in company_clean.items():
            if isinstance(value, list):
                if key not in ['gdxx', 'bgxx', 'baxx', 'fzjg', 'xzcf']:
                    self.log.warning(u"[%s] 可能需要映射,请检查parser_map_config!" % key)
        return company_clean
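# Subclassing sketch (illustrative only; the handler name, mapper config and status codes
# follow the conventions used elsewhere in this repo but are not part of the original source).
# Concrete handlers register per-key json mapper configs and ignore lists, then feed the
# crawled result set through parseCommon().
class _ExampleHandler(ParserBase):
    def __init__(self, pinyin):
        ParserBase.__init__(self, pinyin)
        self.ignore_key_list.append(u"详情")
        # hypothetical mapping for pages stored under the "gdxx_json" key
        self.appendJsonMapperConfig("gdxx_json", {"name": "shareholder_name"})

    def parse(self, source_dict, res_dict):
        company = self.parseCommon(source_dict)
        if company:
            res_dict.update(company)
            res_dict["status"] = 0     # 0 is treated as success by the work() loops above
        else:
            res_dict["status"] = -1
        return res_dict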
class ParserNbBase:
    """
    ParserNbBase is the annual-report base parser that provides common implementations for parsers
    @version: 1.0
    @author: david ding
    @modify:
    """

    def __init__(self, pinyin):
        """
        Initiate the parameters.
        """
        self.pinyin = pinyin
        self.log = Logging(name=pinyin)
        self.result_collection = None
        self.json_mapper_config = list()
        self.result_mapper_config = dict()
        self.ignore_key_list = list()
        self.result = dict()

    def appendJsonMapperConfig(self, value):
        if not value or not isinstance(value, dict):
            return
        self.json_mapper_config.append(value)

    def parseCommon(self, pages, rslt_mapper_config=None):
        if not pages:
            return None
        self.result_collection = list()
        self.result_mapper_config = rslt_mapper_config if rslt_mapper_config else transform
        self.current_key = None
        self.parseDict(pages)
        self.resultCollect()
        return {u"qynb": self.result}

    def parseDict(self, result):
        if not result:
            return None
        if self.current_key:
            web = WebContent.getInstanceFromDictionary(result)
            if web:
                return self.parseWebContent(web)
        for key in result:
            if key in self.ignore_key_list:
                continue
            if key.endswith('_html') or key.endswith('_json'):
                self.current_key = key
                self.resultCollect()
            else:
                self.current_key = None
            value = result[key]
            if isinstance(value, dict):
                self.parseDict(value)
            elif isinstance(value, list):
                self.parseList(value)

    def resultCollect(self):
        if not self.result_collection:
            return
        company = self.resultMap(self.result_collection, self.result_mapper_config)
        # company_cleaned = self.cleanUp(company)
        self.result = company

    def parseList(self, result):
        if not result:
            return None
        for value in result:
            if isinstance(value, dict):
                self.parseDict(value)
            elif isinstance(value, list):
                self.parseList(value)

    def parseWebContent(self, web):
        if not web or not isinstance(web, WebContent):
            return
        if web.status_code != 200 or web.time_out or not web.body:
            return
        result_list = None
        if self.current_key.endswith('_html'):
            result_list = self.parseHtmlTable(web.body)
        elif self.current_key.endswith('_json'):
            for mc in self.json_mapper_config:
                result_list = self.parseJson(web.body, mc)
                if result_list:
                    break
        if result_list:
            self.result_collection.extend(result_list)

    def parseHtmlTable(self, html):
        """
        Parse html-table style data into the standard key/value form.
        :param html: the html table page to parse
        :return:
        """
        parser = TableParseUtil(html)
        info_list = parser.parse()
        res_list = self.cleanComplexKey(info_list)
        self.log.info(u"本次模块解析结果:\n %s", json.dumps(res_list))
        return res_list

    def cleanComplexKey(self, info_list):
        if not info_list:
            return
        res_list = list()
        for info_dict in info_list:
            temp_dict = dict()
            for k, v in info_dict.items():
                arr = k.split('.')
                if len(arr) >= 3:
                    key_new = "%s.%s" % (arr[-2], arr[-1])
                    temp_dict[key_new] = v
            if temp_dict:
                res_list.append(temp_dict)
            else:
                res_list.append(info_dict)
        return res_list

    def parseJson(self, json_obj, mapper_config):
        """
        Parse the json page content.
        :param json_obj: json string or json object
        :param mapper_config: mapping dictionary
        :return:
        """
        if not json_obj:
            return None
        if isinstance(json_obj, basestring):
            json_obj = json.loads(json_obj)
        parser = JsonParseUtil()
        info_list = parser.parse(json_obj, mapper_config)
        if not info_list:
            return None
        self.log.info(u"本次模块解析结果:\n %s", json.dumps(info_list))
        return info_list

    def resultMap(self, result_list, mapper_config):
        """
        Collect the crawl results and call ParserMapper to apply the mapping.
        :param result_list: result set
        :param mapper_config: mapping config
        :return:
        """
        company_mapped = ParserMapper.doMap(mapper_config, result_list)
        result_json = json.dumps(company_mapped, ensure_ascii=False)
        self.log.info(u"企业信息映射结果:\n" + result_json)
        return company_mapped

    def cleanUp(self, company):
        if not company or not isinstance(company, dict):
            return None
        company_clean = dict()
        for key, value in company.items():
            if key in self.ignore_key_list:
                continue
            if isinstance(value, dict):
                v_list = list()
                v_list.append(value)
                company_clean[key] = v_list
            else:
                company_clean[key] = value

        # remove the ignored compound keys, e.g. gdxx.u'详情'
        for key in self.ignore_key_list:
            if '.' not in key:
                continue
            keys = key.split('.')
            recr_dict = company_clean
            for k in keys:
                if k not in recr_dict:
                    if isinstance(recr_dict, list):
                        for dc in recr_dict:
                            if k in dc:
                                del dc[k]
                    break
                if isinstance(recr_dict[k], basestring):
                    if k == keys[-1]:
                        del recr_dict[k]
                    break
                else:
                    recr_dict = recr_dict[k]

        result_json = json.dumps(company_clean, ensure_ascii=False)
        self.log.info(u"企业信息初步清理后结果:\n" + result_json)

        # check whether the output still contains keys that should have been mapped but were not
        for key, value in company_clean.items():
            if isinstance(value, list):
                if key not in ['gdxx', 'bgxx', 'baxx', 'fzjg', 'xzcf']:
                    self.log.warning(u"[%s] 可能需要映射,请检查parser_map_config!" % key)
        return company_clean