Code Example #1
File: Seed.py Project: chybot/crawler
 def __init__(self, queue_name):
     self.__db = DBManager.getInstance(f(seed_cf, 'seed_db', 'type'),
                                       queue_name,
                                       port=f(seed_cf, 'seed_db', 'port'),
                                       host=f(seed_cf, 'seed_db', 'host'))
     #self.__solr = DBManager.getInstance('solr','seed',server=["spider7:8983","spider7:8984","spider7:8985"])
     self.queue_name = queue_name
     from CommonLib.Logging import Logging
     self.log = Logging(name=queue_name)
     self.__data_dic = {}
Code Example #2
class Fetcher(object):
    def __init__(self,
                 queue_name,
                 sub_type,
                 get_db_dict=None,
                 save_db_dict=None):
        # config = __import__("FetchConfig")
        # get_db_dict = config.QYXX_GET_DB
        # save_db_dict = config.QYXX_PUT_DB
        self.logger = Logging(__name__)
        # queue_name = queue_name # for debug
        self.__get_db = DBManager.getInstance(get_db_dict["type"],
                                              queue_name,
                                              port=get_db_dict["port"],
                                              host=get_db_dict["host"])
        # self.__save_db = DBManager.getInstance(get_db_dict["type"],
        #                                        queue_name,
        #                                        port = get_db_dict["port"],
        #                                        host = get_db_dict["host"])
        self.queue_name = queue_name
        self.__data_dic = {}

    def get(self):
        item = self.__get_db.get()
        # self.__get_db.save(item)  # stash temporarily, for debugging
        if item:
            self.__data_dic = json.loads(item)
            return self.__data_dic

    def update(self, *args, **kwargs):
        for data in args:
            if isinstance(data, dict):
                self.__data_dic.update(data)
        if kwargs:
            self.__data_dic.update(kwargs)

    def save(self, data_dict=None):
        if data_dict:
            self.__get_db.save(data_dict)
        else:
            self.__get_db.save(self.__data_dic)

    def backup(self):
        k = "html_" + str(os.getpid()) + "_" + self.queue_name
        self.__get_db.keySet(k, self.__data_dic)

    def hget(self, key=None):
        item = self.__get_db.hget(key=key)
        if item:
            self.__data_dic = json.loads(item)
            return self.__data_dic
        else:
            self.logger.warning("%s hash is empty, please check",
                                self.queue_name)
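
A minimal usage sketch for Fetcher (hedged: the host and port below are hypothetical; only the get/update/save round-trip shown in the class is assumed):

src_db = {"type": "ssdb", "host": "spider7", "port": 8888}  # hypothetical endpoint
fetch = Fetcher("qyxx", "qyxx", get_db_dict=src_db, save_db_dict=src_db)
item = fetch.get()          # JSON-decoded dict, or None when the queue is empty
if item:
    fetch.update(status=0)  # merge extra fields into the internal dict
    fetch.save()            # persist the merged dict through the db instance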
Code Example #3
 def __init__(self, pinyin):
     """
     Initialize the parameters.
     """
     self.pinyin = pinyin
     self.log = Logging(name=pinyin)
     self.result_collection = None
     self.json_mapper_config = dict()
     self.ignore_key_list = list()
     self.jbxx_web = None                # holds the basic-info WebContent
Code Example #4
File: CrawlerController.py Project: chybot/crawler
    def __init__(self, paras):
        """
        init property use the para instance
        :param paras:
        :return:
        """
        self.subcontrol = paras.getProperty("subcontrol")
        self.inst_name = paras.getProperty("inst_name")
        self.process_num = paras.getProperty("process_num")

        self.logger = Logging("qyxx_process_info", stream_flag=False)
        self.logger.info("logger address")
Code Example #5
 def __init__(self,
              queue_name,
              sub_type,
              get_db_dict=None,
              save_db_dict=None):
     # config = __import__("FetchConfig")
     # get_db_dict = config.QYXX_GET_DB
     # save_db_dict = config.QYXX_PUT_DB
     self.logger = Logging(__name__)
     # queue_name = queue_name # for debug
     self.__get_db = DBManager.getInstance(get_db_dict["type"],
                                           queue_name,
                                           port=get_db_dict["port"],
                                           host=get_db_dict["host"])
     # self.__save_db = DBManager.getInstance(get_db_dict["type"],
     #                                        queue_name,
     #                                        port = get_db_dict["port"],
     #                                        host = get_db_dict["host"])
     self.queue_name = queue_name
     self.__data_dic = {}
Code Example #6
File: HolderUtil.py Project: chybot/crawler
 def __init__(self, pinyin, version=None):
     self.pinyin = pinyin
     self.ua = cu.get_user_agent()
     self.version = version
     self.logging = Logging(name=pinyin)
     self.recChar = None
     self.yzm_count = 0
     cg = ConfigGet('Config.ini')
     opt = cg.get("setting", "debug", "false")
     self.debug = 1 if opt.lower() == "true" else 0
Code Example #7
File: CrawlerController.py Project: chybot/crawler
class CrawlerController(object):
    """
    class for control different type of subcontrol
    such as qyxx, xgxx
    paramater is the outer input
    get subcontrol instance dynamically
    """
    def __init__(self, paras):
        """
        init property use the para instance
        :param paras:
        :return:
        """
        self.subcontrol = paras.getProperty("subcontrol")
        self.inst_name = paras.getProperty("inst_name")
        self.process_num = paras.getProperty("process_num")

        self.logger = Logging("qyxx_process_info", stream_flag=False)
        self.logger.info("logger address")

    def loadSubControl(self):
        sub_ctrl_inst = ClassFactory.getClassInst(self.subcontrol)
        return sub_ctrl_inst
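
A hedged driver sketch for CrawlerController; the Paras stand-in below is hypothetical and only assumes the getProperty(name) accessor that __init__ relies on:

class Paras(object):  # hypothetical parameter holder
    def __init__(self, props):
        self.props = props

    def getProperty(self, name):
        return self.props.get(name)

paras = Paras({"subcontrol": "QyxxControl",  # hypothetical subcontrol class name
               "inst_name": "qyxx",
               "process_num": 4})
ctrl = CrawlerController(paras)
sub_ctrl = ctrl.loadSubControl()  # resolved dynamically via ClassFactory.getClassInst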
Code Example #8
def work(self, pro_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=pro_type)

    log.info("Process begin")

    pro_type = pro_type.lower()
    queue_name = pro_type

    module_name = pro_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name,
                                        package_name="Parser",
                                        pinyin=pro_type.lower())

    # nb_module_name = pro_type.capitalize() +"Nb" + "Handler"
    # nb_handler  = ClassFactory.getClassInst(module_name, package_name = "Parser", pinyin=pro_type.lower())

    normal_table = pro_type + "_data"
    err_table = normal_table + "_error"

    # ssdb instance for the parsed data
    db_inst = DBManager.getInstance(des_db_dict["type"],
                                    normal_table,
                                    host=des_db_dict["host"],
                                    port=des_db_dict["port"])

    # kfk_inst = DBManager.getInstance("kafka", "qyxx_html", host = "spider7", port = 9092)

    # mongo instance keeps a local copy of the data for debugging
    debug_normal_table = "new_" + pro_type.lower() + "_data"
    db_debug = DBManager.getInstance("mongo",
                                     debug_normal_table,
                                     host="spider7",
                                     port=27037)

    # fetch = Fetcher(queue_name.lower(), "qyxx")  # enable this when not debugging

    fetch = Fetcher(queue_name,
                    "qyxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug

    while True:
        try:
            # source_dict = fetch.hget()
            source_dict = fetch.get()

            if source_dict:
                # copy the seed info into the parsed data
                seed_dict = {}
                if "bbd_seed" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["bbd_seed"]}
                if "BBD_SEED" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["BBD_SEED"]}
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ING,
                                    seed_dict)
                log.info(log_info)
                # fetch.backup()  # guards against data loss if the process exits abnormally
                res_dict = UniField.cloneNeedColumns(source_dict)
                log.info("starting a new seed %s", source_dict)

                #debug
                # db_inst.changeTable("new_"+pro_type.lower())
                # db_inst.save(source_dict);
                # rowkey=source_dict["rowkey"]
                # db_inst.hset(rowkey,source_dict)
                # db_inst.changeTable("new_"+pro_type.lower()+"_processed")
                # db_inst.save(source_dict)
                res_dict = handler.parse(source_dict, res_dict)

                if res_dict["status"] == 0:
                    db_inst.changeTable(normal_table)
                    res_dict = UniField.unifyParseResult(res_dict)

                    #for debug
                    db_debug.save(res_dict)

                    # db_inst.save(res_dict)
                    # kfk_inst.save(source_dict)
                    # print "kfk size:",kfk_inst.size()
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC,
                                        seed_dict)
                    log.info(log_info)
                else:
                    db_inst.changeTable(err_table)
                    res_dict["html"] = source_dict

                    # db_inst.save(res_dict)
                    db_debug.save(res_dict)

                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO,
                                        seed_dict)
                    log.info(log_info)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", pro_type)
                time.sleep(10)
        except Exception as e:
            print str(e)
            raise
Code Example #9
File: QyxxWorker.py Project: chybot/crawler
def work(pro_type, seed=None):
    def storeResult(src_dict, company_dict=None):
        # if company_dict.has_key(u"名称"):
        #     src_dict.update({"company_name": company_dict[u"名称"]})
        #     src_dict.update({"values":company_dict})

        src_dict = UniField.unifyRequestResult(src_dict, pro_type)
        if "rowkey" in src_dict:
            rowkey = src_dict["rowkey"]
            print "rowkey after field unification =", rowkey
            src_dict.update({"BBD_SEED": seed.getDict()})
            if src_dict["status"] == 0:
                db_inst.changeTable("new_" + pro_type)
                db_inst.hset(rowkey, src_dict)
                db_inst.save(src_dict)
            else:
                db_inst.changeTable("new_" + pro_type + "_error")
                db_inst.hset(rowkey, src_dict)
                db_inst.save(src_dict)
            print "rowkey=", rowkey
        else:
            print "No rowkey; crawl result:", src_dict

    def crawlerKeyWordList(keyword_list):
        """
        一次抓取关键词,如果第一个抓不到,尝试第二个,如果最后一个还是没成功,记录种子信息到ssdb
        :param keyword_list:
        :return:
        """
        keyword_num = len(keyword_list)
        for keyword in keyword_list:
            seed_status = inst.crawl(keyword)
            if seed_status.access_type == SeedAccessType.OK:  # success, log it
                # log.info("End seed with keyword %s", keyword)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC,
                                    seed.getDict())
                log.info(log_info)
                break
            elif seed_status.access_type != SeedAccessType.OK and keyword_num > 0:
                keyword_num -= 1
                # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, seed.getDict())
                log.info("Use Key word [%s] get company failed", keyword)
                continue
            else:
                seed.update(status=seed_status.access_type)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO,
                                    seed.getDict())
                log.info(log_info)
                seed.save()

    try:
        from CommonLib.Logging import Logging
        log = Logging(name=pro_type)
        log.info("Process begin for %s", pro_type)

        module_name = "Crawler" + pro_type.capitalize()
        pro_type = pro_type.lower()
        inst = ClassFactory.getClassInst(module_name,
                                         package_name="qyxx_all",
                                         pinyin=pro_type,
                                         callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance("ssdb",
                                        "new_" + pro_type,
                                        host="spider5",
                                        port=57888)
        if seed is None:
            seed = Seed(pro_type)
            seed.get()

        # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, seed.getDict())
        # log.info("starting a new seed %s", log_info)
        # Optional url-first path, kept for reference:
        # if seed.url_status:
        #     seed_status = inst.crawlUrl(seed.url, seed.name)
        #     if seed_status.access_type == SeedAccessType.OK:  # success, log it
        #         log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, seed.getDict())
        #         log.info(log_info)
        #     else:  # url crawl failed, fall back to the keyword list
        #         log.info(" Url get company info failed  [%s]", pro_type)
        #         crawlerKeyWordList(seed.values)

        keyword_list = seed.values
        crawlerKeyWordList(keyword_list)

    except Exception as e:
        print str(e)
Code Example #10
File: QyxxNbParseWorker.py Project: chybot/crawler
def work(bbd_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type

    nb_module_name = bbd_type.capitalize() + "Nb" + "Handler"
    nb_handler = ClassFactory.getClassInst(nb_module_name,
                                           package_name="Parser",
                                           pinyin=bbd_type.lower())

    bbd_table = "qyxx_data_nb"
    bbd_src_table = "qyxx_html_nb"
    normal_table = bbd_type + "_data" + "_nb"
    err_table = normal_table + "_error"
    # html_normal_table = bbd_type+"_src"+"_nb"

    des_db_inst = DBManager.getInstance(des_db_dict["type"],
                                        bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])  # stores parsed data
    err_db_inst = DBManager.getInstance(src_db_dict["type"],
                                        err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    fetch = Fetcher(queue_name + "_nbxx",
                    "qyxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug

    while True:
        try:
            source_dict = fetch.hget()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                if "year" in res_dict:
                    res_dict["_id"] = UniField.updateId(
                        res_dict['_id'], res_dict['year'])
                # log.info("start to a new seed %s",seed_dict)

                res_dict = nb_handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict,
                                                         bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table,
                             str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table,
                             str(des_db_inst.size()))
                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC, seed_dict)
                    # log.info(log_info)
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)

                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO, seed_dict)
                    # log.info(log_info)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            source_dict["data"] = res_dict
            err_db_inst.save(source_dict)
            raise
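
Workers like this block in an infinite loop, so they are typically fanned out over processes; a sketch with the standard multiprocessing module (the queue name and process count are illustrative):

from multiprocessing import Process

if __name__ == "__main__":
    procs = [Process(target=work, args=("qyxx",)) for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()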
Code Example #11
File: QyxxReqWorker.py Project: chybot/crawler
def work(bbd_type, value_list=None):
    conf_file = "DBConfig.ini"
    db_conf_dict = \
        {
            'type':confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host':confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port':int(confGetterFunc(conf_file, 'html_db', 'port'))
        }

    def getNbxxDict(src_dict):
        nbxx_key_list = filter(lambda x: x.startswith("qynb_"),
                               src_dict.keys())
        nbxx_list = map(lambda x: {x: src_dict.pop(x)}, nbxx_key_list)
        return nbxx_list

    def getYear(nb_dict):
        key = nb_dict.keys()[0]
        year = key.split("_")[1]
        return year

    def storeResult(src_dict, company_dict=None):
        """
        回调函数,由爬虫调用,存储数据到ssdb
        :param src_dict:
        :param company_dict:
        :return:
        """
        try:
            rowkey = None
            if src_dict["status"] == 0:
                src_dict = UniField.unifyRequestResult(src_dict, bbd_type)
                if "rowkey" in src_dict:
                    rowkey = src_dict["rowkey"]

                    nbxx_list = getNbxxDict(src_dict)
                    nb_year_list = []  # years reported to the solr interface
                    for nb_item in nbxx_list:
                        # split the annual reports into separate entries with
                        # their own rowkeys and put them into the hash
                        year = getYear(nb_item)
                        nb_year_list.append(year)
                        nbxx_dict = UniField.cloneNeedColumns(src_dict)
                        nbxx_dict.update({"bbd_seed": bbd_seed_dict})
                        nbxx_dict.update(nb_item)
                        db_inst.changeTable(bbd_type + "_nbxx")
                        nb_rk = rowkey + "|_|" + year
                        nbxx_dict["rowkey"] = nb_rk
                        nbxx_dict["year"] = year
                        db_inst.hset(nb_rk, nbxx_dict)
                        log.info(u"Stored annual report for year %s; rowkey [ %s ]", year, nb_rk)
                    zch = src_dict["rowkey_dict"]["company_zch"]
                    company_name = src_dict["rowkey_dict"]["company_name"]

                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING,
                                        bbd_seed_dict)
                    log.info(log_info)
                    src_dict.update({"bbd_seed": bbd_seed_dict})
                    db_inst.changeTable(bbd_type)
                    db_inst.save(src_dict)
                    log.info(u" ,rowkey 为 [ %s ]", rowkey)
                    NbxxApiControler().nbUpdate(company_name=company_name,
                                                pinyin=bbd_type,
                                                zch=zch,
                                                years_list=nb_year_list)

                else:
                    raise Exception("No rowkey")
            else:
                db_inst.changeTable(bbd_type + "_error")
                db_inst.save(src_dict)
        except Exception as e:
            log.info(str(e))
            db_inst.changeTable(bbd_type + "_error")
            db_inst.save(src_dict)

            log.info(u"存储抓取网页原文 失败,rowkey 为 [ %s ]", rowkey)

    def crawlerKeyWordList(keyword_list):
        """
        一次抓取关键词,如果第一个抓不到,尝试第二个,如果最后一个还是没成功,记录种子信息到ssdb
        :param keyword_list:
        :return:
        """
        try:
            keyword_num = len(keyword_list)
            for keyword in keyword_list:
                keyword_num -= 1
                seed_status = inst.crawl(keyword)
                if seed_status.access_type == SeedAccessType.OK:  # success, log it
                    # log.info("End seed with keyword %s", keyword)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC,
                                        bbd_seed_dict)
                    log.info(log_info)
                    log.info(u"种子抓取成功:)")
                    break
                elif seed_status.access_type != SeedAccessType.OK and keyword_num > 0:

                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                    log.info(u"种子抓取失败,关键字 [%s]", keyword)
                    continue
                else:
                    seed.update(status=seed_status.access_type)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO,
                                        bbd_seed_dict)
                    log.info(log_info)
                    log.info(u"种子抓取失败,存储到队列,种子状态为 %s", str(seed_status))
                    seed.save()
        except Exception as e:
            log.info(str(e))
            raise Exception(u"种子抓取过程中遇到异常")

    ##################################################################################################################################
    try:
        from CommonLib.Logging import Logging
        log = Logging(name=bbd_type)
        log.info("Process begin for %s,logger=%s", bbd_type, str(log))

        module_name = "Crawler" + bbd_type.capitalize()
        bbd_type = bbd_type.lower()
        inst = ClassFactory.getClassInst(module_name,
                                         package_name="qyxx_all",
                                         pinyin=bbd_type,
                                         callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance(db_conf_dict["type"],
                                        bbd_type,
                                        host=db_conf_dict["host"],
                                        port=db_conf_dict["port"])
        bbd_seed_dict = {}
        if value_list:
            for keywd_list in value_list:
                crawlerKeyWordList(keywd_list)
        else:
            seed = Seed(bbd_type)

            while True:
                seed.get()
                bbd_seed_dict = seed.getDict()
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, bbd_seed_dict)
                log.info("starting a new seed %s", log_info)
                if seed.url_status:
                    seed_status = inst.crawlUrl(seed.url, seed.name)
                    if seed_status.access_type == SeedAccessType.OK:  # success, log it
                        log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC,
                                            bbd_seed_dict)
                        log.info(log_info)
                    else:  # url crawl failed; fall back to the keyword list
                        log.info(" Url get company info failed  [%s]",
                                 bbd_type)
                        keyword_list = seed.values
                        crawlerKeyWordList(keyword_list)
                else:
                    keyword_list = seed.values
                    crawlerKeyWordList(keyword_list)
    except Exception as e:
        log.info(str(e))
        seed.save()
        raise Exception(e)
Code Example #12
File: Seed.py Project: chybot/crawler
class Seed(object):
    def __init__(self, queue_name):
        self.__db = DBManager.getInstance(f(seed_cf, 'seed_db', 'type'),
                                          queue_name,
                                          port=f(seed_cf, 'seed_db', 'port'),
                                          host=f(seed_cf, 'seed_db', 'host'))
        #self.__solr = DBManager.getInstance('solr','seed',server=["spider7:8983","spider7:8984","spider7:8985"])
        self.queue_name = queue_name
        from CommonLib.Logging import Logging
        self.log = Logging(name=queue_name)
        self.__data_dic = {}

    def get(self):
        self.__data__()

    @property
    def __get__(self):
        assert hasattr(self.__db, 'get'), '%s has no attribute get' % self.__db
        for flag in [
                '_temp', '_bug', '_nonstatic', '_static', '_error', '_manyan'
        ]:
            self.__db.changeTable(self.queue_name + flag)
            data = self.__db.get()
            if data:
                self.__backup__(data)
                return data
            warnings.warn(self.queue_name + flag + u" queue is empty")
        time.sleep(100)
        return self.__get__

    def __backup__(self, data):
        pid = os.getpid()
        self.__db.keySet('seed_%s' % str(pid) + '_' + self.queue_name, data)

    def getDict(self):
        return self.__data_dic

    def __data__(self):
        self.__data_dic = {}
        data = self.__get__.decode("UTF-8", "ignore").strip()
        if not isinstance(data, dict):
            if data.startswith('{') and data.endswith('}'):
                self.__data_dic.update(json.loads(data))
            elif len(data) < 2 or len(data) > 100:
                self.log.error('%s Seed length Error!' % data)
                return
            elif re.match(r"^\d{%d}" % len(data), data):
                self.__data_dic[u"zch"] = data
            elif re.search(u'[\u4e00-\u9fa5].+', unicode(data)):
                self.__data_dic['name'] = data
            elif re.match(u'[\d|\w]{%d}' % len(data), data):
                self.__data_dic['xydm'] = data
            else:
                self.__data_dic['name'] = data
        else:
            self.__data_dic.update(data)

        if 'url' in self.__data_dic:
            self.url_status = True
            self.__setattr__('url', self.__data_dic['url'])
            if 'data' in self.__data_dic:
                self.__setattr__('data', self.__data_dic['data'])
        else:
            self.url_status = False
            self.__setattr__(
                'values',
                map(
                    lambda x: self.__data_dic[x],
                    filter(lambda x: self.__data_dic.get(x), [
                        'xydm',
                        'zch',
                        'name',
                    ])))

    def update(self, *args, **kwargs):
        for data in args:
            if isinstance(data, dict):
                self.__data_dic.update(data)
        if kwargs:
            self.__data_dic.update(kwargs)

    def __save__(self, flag_name):
        """
        for ssdb
        :param flag_name:
        :return:
        """
        self.__db.select_queue(self.queue_name + flag_name)
        self.__db.save(self.__data_dic)
        self.__data_dic = {}

    # def __savesolr__(self):
    #     self.__data_dic['do_time'] = time.strftime("%Y-%m-%d")
    #     self.__data_dic['type'] = self.queue_name
    #     for old_k in self.__data_dic:
    #         if old_k == 'id' or old_k.endswith('_seed'):
    #             continue
    #         else:
    #             self.__data_dic[old_k+'_seed'] = self.__data_dic.pop(old_k)
    #
    #     if 'id' not in self.__data_dic:
    #         key_list=['xydm_seed','zch_seed','name_seed','url_seed']
    #         ids = filter(lambda x:x in self.__data_dic,key_list)
    #         if ids:
    #             ids = map(lambda x:self.__solr.find({x:self.__data_dic[x]})['docs'],ids)[0]
    #             if ids:
    #                 self.__data_dic['id']=ids[0][0]['id']
    #     self.__solr.update(self.__data_dic)
    #     self.__data_dic = {}

    def save(self):
        status = int(self.__data_dic.get('status', 0))
        if status == 0:
            pass
            #self.__savesolr__()
        elif status == SeedAccessType.ERROR:
            self.__save__('_error')
        elif status == SeedAccessType.NON_COMPANY:
            self.__save__('_noncompany')
        elif status == SeedAccessType.INCOMPLETE:
            self.__save__('_temp')
        elif status == SeedAccessType.NO_TARGET_SOURCE:
            self.__save__('_nosource')
        else:
            self.log.info('Status Error!')
            self.__save__('_unknown')
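
A round-trip sketch for Seed, reusing the "qyxx" queue name from the other examples; get() polls the suffixed sub-queues and save() routes the seed by its status:

seed = Seed("qyxx")
seed.get()                    # pops from qyxx_temp/_bug/_nonstatic/... and backs it up
info = seed.getDict()         # normalized dict with name / zch / xydm / url fields
seed.update(status=SeedAccessType.ERROR)
seed.save()                   # status ERROR routes the seed into qyxx_error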
Code Example #13
File: XgxxParseWorker.py Project: chybot/crawler
def work(bbd_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type

    module_name = bbd_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name,
                                        package_name="xgxx",
                                        pinyin=bbd_type.lower())

    bbd_table = bbd_type + "_data"
    bbd_src_table = bbd_table + "_src"
    err_table = bbd_table + "_error"
    # html_normal_table = bbd_type+"_src"+"_nb"

    des_db_inst = DBManager.getInstance(des_db_dict["type"],
                                        bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])  # stores parsed data
    err_db_inst = DBManager.getInstance(src_db_dict["type"],
                                        err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    if bbd_type == 'jyyc':
        while True:
            for province in [
                    'anhui', 'beijing', 'chongqing', 'fujian', 'gansu',
                    'guangdong', 'guizhou', 'hainan', 'hebei', 'heilongjiang',
                    'henan', 'hubei', 'hunan', 'jiangsu', 'jiangxi', 'jilin',
                    'liaoning', 'neimenggu', 'ningxia', 'qinghai', 'shanghai',
                    'shangxixian', 'sichuan', 'tianjin', 'xinjiang', 'xizang',
                    'yunnan', 'zhejiang', 'zongju', 'shandong'
            ]:
                jyyc_queue = 'jyyc_{}'.format(province)
                fetch = Fetcher(jyyc_queue,
                                "xgxx",
                                get_db_dict=src_db_dict,
                                save_db_dict=des_db_dict)
                while True:
                    try:
                        source_dict = fetch.get()
                        if source_dict:
                            res_dict = UniField.cloneNeedColumns(source_dict)
                            res_dict = handler.parse(source_dict, res_dict,
                                                     province)
                            if res_dict["status"] == 0:
                                res_dict = UniField.unifyParseResult(
                                    res_dict, bbd_table=bbd_table)
                                des_db_inst.changeTable(bbd_table)
                                des_db_inst.save(res_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                                         bbd_table, str(des_db_inst.size()))
                                des_db_inst.changeTable(bbd_src_table)
                                des_db_inst.save(source_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                                         bbd_src_table,
                                         str(des_db_inst.size()))
                            else:
                                source_dict["data"] = res_dict
                                err_db_inst.save(source_dict)
                        else:
                            log.info(u"解析%s队列为空, 进入下一队列", jyyc_queue)
                            break
                    except Exception as e:
                        log.info(str(e))
                        source_dict["data"] = res_dict
                        err_db_inst.save(source_dict)
                        raise
            log.info(u'Finished one full round; next round starts in an hour')
            time.sleep(1 * 60 * 60)

    fetch = Fetcher(queue_name,
                    "xgxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug
    while True:
        try:
            source_dict = fetch.get()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                # log.info("start to a new seed %s",seed_dict)
                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict,
                                                         bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table,
                             str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table,
                             str(des_db_inst.size()))
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            source_dict["data"] = res_dict
            err_db_inst.save(source_dict)
            raise
Code Example #14
File: XgxxReqWorker.py Project: chybot/crawler
def work(bbd_type, need_seed=True, value_list=None):
    """
    爬虫外部控制主函数,包括一下功能:
    1. 初始化爬虫类
    2. 初始化DB连接
    3. 获取种子
    4. 存储爬虫返回的数据
    5. 存储爬取异常的种子信息
    :param bbd_type: 爬虫存储的队列名,也会关联到爬虫模块名,注意*****
    :param value_list: 爬虫种子信息,手动调试使用
    :return:
    """
    conf_file = "DBConfig.ini"
    db_conf_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    seed_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'seed_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'seed_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'seed_db', 'port'))
        }

    def storeResult(src_dict, company_dict=None):
        """
        回调函数,由爬虫调用,存储数据到ssdb
        :param src_dict:
        :param company_dict:
        :return:
        """
        try:
            if "bbd_tmp_queue" in src_dict:
                queue_name = src_dict["bbd"]

            if src_dict["status"] == 0:
                src_dict = UniField.unifyRequestResult(src_dict, bbd_type)
                if "rowkey" in src_dict.keys():
                    src_dict.update({"bbd_seed": bbd_seed_dict})
                    if 'table_name' in src_dict:
                        table_name = src_dict.get('table_name')
                        db_inst.changeTable(table_name)
                    else:
                        db_inst.changeTable(bbd_type)
                    db_inst.save(src_dict)
                else:
                    raise Exception("No rowkey")
            else:
                db_inst.changeTable(bbd_type + "_error")
                db_inst.save(src_dict)
        except Exception as e:
            log.info(str(e))
            db_inst.changeTable(bbd_type + "_error")
            db_inst.save(src_dict)

    def crawlerKeyWordList(keyword_list):
        """
        一次抓取关键词,如果第一个抓不到,尝试第二个,如果最后一个还是没成功,记录种子信息到ssdb
        :param keyword_list:
        :return:
        """
        keyword_num = len(keyword_list)
        for keyword in keyword_list:
            seed_status = inst.crawl(keyword)
            if seed_status.access_type == SeedAccessType.OK:  # success, log it
                # log.info("End seed with keyword %s", keyword)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict)
                log.info(log_info)
                break
            elif seed_status.access_type != SeedAccessType.OK and keyword_num > 0:
                keyword_num -= 1
                # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                log.info("Use Key word [%s] get company failed", keyword)
                continue
            else:
                seed.update(status=seed_status.access_type)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                log.info(log_info)
                seed.save()

    ##################################################################################################################################
    try:
        from CommonLib.Logging import Logging
        log = Logging(name=bbd_type)
        log.info("Process begin for %s,logger=%s", bbd_type, str(log))

        module_name = "Crawler" + bbd_type.capitalize()
        bbd_type = bbd_type.lower()
        inst = ClassFactory.getClassInst(module_name,
                                         package_name="xgxx",
                                         pinyin=bbd_type,
                                         callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance(db_conf_dict["type"],
                                        bbd_type,
                                        host=db_conf_dict["host"],
                                        port=db_conf_dict["port"])
        if not need_seed:
            inst.crawl()
        else:
            bbd_seed_dict = {}
            if value_list:
                for keywd_list in value_list:
                    crawlerKeyWordList(keywd_list)
            else:
                seed_db_inst = DBManager.getInstance(seed_db_dict["type"],
                                                     bbd_type,
                                                     host=seed_db_dict["host"],
                                                     port=seed_db_dict["port"])
                while True:
                    bbd_seed_dict = seed_db_inst.get()
                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, bbd_seed_dict)
                    # log.info("start to a new seed %s",log_info)
                    seed_status = inst.crawl(bbd_seed_dict)
                    if seed_status.access_type == SeedAccessType.OK:  # success, log it
                        log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict)
                        log.info(log_info)
                    else:  # crawl failed; save the seed to the corresponding error queue
                        log.info(u"Seed crawl failed; saving it to the corresponding queue [%s]", bbd_type)
                        seed_db_inst.changeTable(bbd_type + "_error")
                        seed_db_inst.save(bbd_seed_dict)

    except Exception:
        raise
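
The need_seed and value_list hooks above allow manual runs that bypass the seed queue; a hedged sketch (the type name and keywords are illustrative):

# crawl two keyword lists directly, without touching the seed db
work("jyyc", value_list=[[u"keyword-a"], [u"keyword-b"]])
# or start a crawler that needs no seeds at all
work("jyyc", need_seed=False)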
Code Example #15
class ParserBase(object):
    """
    ParserBase is the base parser to provide common implements for parsers
    @version:1.0
    @author:david ding
    @modify:
    """
    def __init__(self, pinyin):
        """
        Initialize the parameters.
        """
        self.pinyin = pinyin
        self.log = Logging(name=pinyin)
        self.result_collection = None
        self.json_mapper_config = dict()
        self.ignore_key_list = list()
        self.jbxx_web = None                # holds the basic-info WebContent

    def appendJsonMapperConfig(self, key, value):
        if not key or not value or not isinstance(value, dict):
            return
        self.json_mapper_config[key] = value

    def parseCommon(self, result, rslt_mapper_config=None):
        if not result:
            return None
        self.result_collection = list()
        self.rslt_mapper_config = rslt_mapper_config
        self.current_key = None
        if isinstance(result, dict):
            self.parseDict(result)
        elif isinstance(result, list):
            self.parseList(result)
        mapper_config = rslt_mapper_config if rslt_mapper_config else transform
        company = self.resultMap(self.result_collection, mapper_config)
        company_cleaned = self.cleanUp(company)
        # extract and attach the basic-info page url
        if self.jbxx_web:
            company_cleaned['bbd_url'] = self.jbxx_web.url
        return company_cleaned

    def parseDict(self, result):
        if not result:
            return None
        if self.current_key:
            web = WebContent.getInstanceFromDictionary(result)
            if web:
                if self.current_key.startswith("jbxx_"):
                    self.jbxx_web = web
                return self.parseWebContentByKey(web)
        for key in result:
            if key in self.ignore_key_list:
                continue
            if key.endswith('_html') or key.endswith('_json'):
                self.current_key = key
            else:
                self.current_key = None
            value = result[key]
            if isinstance(value, dict):
                self.parseDict(value)
            elif isinstance(value, list):
                self.parseList(value)

    def parseList(self, result):
        if not result:
            return None
        for value in result:
            if isinstance(value, dict):
                self.parseDict(value)
            elif isinstance(value, list):
                self.parseList(value)

    def parseWebContentByKey(self, web):
        if not web or not isinstance(web, WebContent):
            return
        if web.status_code != 200 or web.time_out or not web.body:
            return
        result_list = None
        if self.current_key.endswith('_html'):
            result_list = self.parseHtmlTable(web.body)
        elif self.current_key.endswith('_json'):
            if not self.json_mapper_config:
                self.log.error(u"No matching config while parsing json content %s with key=%s" % (web.body, self.current_key))
            elif self.current_key in self.json_mapper_config:
                result_list = self.parseJson(web.body, self.json_mapper_config[self.current_key])
            else:
                self.log.error(u"No matching config while parsing json content %s with key=%s" % (web.body, self.current_key))
        if result_list:
            self.result_collection.extend(result_list)

    def parseWebContentByType(self, web):
        if not web or not isinstance(web, WebContent):
            return
        if web.status_code != 200 or web.time_out or not web.body:
            return
        result_list = None
        if web.content_type == WebContentType.HTML:
            result_list = self.parseHtmlTable(web.body)
        elif web.content_type == WebContentType.JSON:
            if not self.current_key:
                self.log.error(u"No corresponding key while parsing json content %s; check the result-set structure!" % web.body)
            if not self.json_mapper_config:
                self.log.error(u"No matching config while parsing json content %s with key=%s" % (web.body, self.current_key))
            elif self.current_key and self.current_key in self.json_mapper_config:
                result_list = self.parseJson(web.body, self.json_mapper_config[self.current_key])
            else:
                self.log.error(u"No matching config while parsing json content %s with key=%s" % (web.body, self.current_key))
        if result_list:
            self.result_collection.extend(result_list)

    def parseHtmlTable(self, html):
        """
        解析html table型的数据,解析为键值对的标准形式
        :param html: 待解析的html table页面
        :return:
        """
        parser = TableParseUtil(html)
        info_list = parser.parse()
        self.log.info("本次模块解析结果:\n %s", json.dumps(info_list))
        return info_list

    def parseJson(self, json_obj, mapper_config):
        """
        解析json页面内容
        :param json_obj:json字符串或json对象
        :param mapper_config:映射字典
        :return:
        """
        if not json_obj:
            return None
        if isinstance(json_obj, basestring):
            json_obj = json.loads(json_obj)
        parser = JsonParseUtil()
        info_list = parser.parse(json_obj, mapper_config)
        if not info_list:
            return None
        self.log.info("本次模块解析结果:\n %s", json.dumps(info_list))
        return info_list

    def resultMap(self, result_list, mapper_config):
        """
        抓取结果收集,调用CrawlerMapper实现映射
        :param result_list:结果集
        :param mapper_config:映射文件
        :return:
        """
        company_mapped = ParserMapper.doMap(mapper_config, result_list)
        result_json = json.dumps(company_mapped, ensure_ascii=False)
        self.log.info(u"企业信息映射结果:\n" + result_json)
        return company_mapped

    def cleanUp(self, company):
        if not company or not isinstance(company, dict):
            return None
        company_clean = dict()
        for key, value in company.items():
            if key in self.ignore_key_list:
                continue
            if isinstance(value, dict):
                v_list = list()
                v_list.append(value)
                company_clean[key] = v_list
            else:
                company_clean[key] = value
        # clean ignored entries inside compound keys, e.g. gdxx.u'详情'
        for key in self.ignore_key_list:
            if '.' not in key:
                continue
            keys = key.split('.')
            recr_dict = company_clean
            for k in keys:
                if k not in recr_dict:
                    if isinstance(recr_dict, list):
                        for dc in recr_dict:
                            if k in dc:
                                del dc[k]
                    break
                if isinstance(recr_dict[k], basestring):
                    if k == keys[-1]:
                        del recr_dict[k]
                    break
                else:
                    recr_dict = recr_dict[k]
        result_json = json.dumps(company_clean, ensure_ascii=False)
        self.log.info(u"Company info after initial cleanup:\n" + result_json)
        # check the output for keys that should have been mapped but were not
        for key, value in company_clean.items():
            if isinstance(value, list):
                if key not in ['gdxx', 'bgxx', 'baxx', 'fzjg', 'xzcf']:
                    self.log.warning(u"[%s] may need mapping; check parser_map_config!" % key)
        return company_clean
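
A minimal subclass sketch, assuming a hypothetical "demo" source; a concrete handler mostly seeds ignore_key_list and json_mapper_config before delegating to parseCommon (the {result_key: mapping_dict} config shape follows parseWebContentByKey above, but the mapping content itself is assumed):

class DemoHandler(ParserBase):  # hypothetical parser
    def __init__(self):
        ParserBase.__init__(self, "demo")
        self.ignore_key_list.append(u"rowkey_dict")
        # mapping for pages stored under the "jbxx_json" key (assumed shape)
        self.appendJsonMapperConfig("jbxx_json", {u"entName": u"company_name"})

    def parse(self, source_dict, res_dict):
        company = self.parseCommon(source_dict)
        if company:
            res_dict.update(company)
        return res_dict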
Code Example #16
class ParserNbBase(object):
    """
    ParserNbBase is the annual report base parser to provide common implements for parsers
    @version:1.0
    @author:david ding
    @modify:
    """
    def __init__(self, pinyin):
        """
        Initialize the parameters.
        """
        self.pinyin = pinyin
        self.log = Logging(name=pinyin)
        self.result_collection = None
        self.json_mapper_config = list()
        self.result_mapper_config = dict()
        self.ignore_key_list = list()
        self.result = dict()

    def appendJsonMapperConfig(self, value):
        if not value or not isinstance(value, dict):
            return
        self.json_mapper_config.append(value)

    def parseCommon(self, pages, rslt_mapper_config=None):
        if not pages:
            return None
        self.result_collection = list()
        self.result_mapper_config = rslt_mapper_config if rslt_mapper_config else transform
        self.current_key = None
        self.parseDict(pages)
        self.resultCollect()
        return {u"qynb": self.result}

    def parseDict(self, result):
        if not result:
            return None
        if self.current_key:
            web = WebContent.getInstanceFromDictionary(result)
            if web:
                return self.parseWebContent(web)
        for key in result:
            if key in self.ignore_key_list:
                continue
            if key.endswith('_html') or key.endswith('_json'):
                self.current_key = key
                self.resultCollect()
            else:
                self.current_key = None
            value = result[key]
            if isinstance(value, dict):
                self.parseDict(value)
            elif isinstance(value, list):
                self.parseList(value)

    def resultCollect(self):
        if not self.result_collection:
            return
        company = self.resultMap(self.result_collection, self.result_mapper_config)
        # company_cleaned = self.cleanUp(company)
        self.result = company

    def parseList(self, result):
        if not result:
            return None
        for value in result:
            if isinstance(value, dict):
                self.parseDict(value)
            elif isinstance(value, list):
                self.parseList(value)

    def parseWebContent(self, web):
        if not web or not isinstance(web, WebContent):
            return
        if web.status_code != 200 or web.time_out or not web.body:
            return
        result_list = None
        if self.current_key.endswith('_html'):
            result_list = self.parseHtmlTable(web.body)
        elif self.current_key.endswith('_json'):
            for mc in self.json_mapper_config:
                result_list = self.parseJson(web.body, mc)
                if result_list:
                    break
        if result_list:
            self.result_collection.extend(result_list)

    def parseHtmlTable(self, html):
        """
        解析html table型的数据,解析为键值对的标准形式
        :param html: 待解析的html table页面
        :return:
        """
        parser = TableParseUtil(html)
        info_list = parser.parse()
        res_list = self.cleanComplexKey(info_list)
        self.log.info(u"本次模块解析结果:\n %s", json.dumps(res_list))
        return res_list

    def cleanComplexKey(self, info_list):
        if not info_list:
            return
        res_list = list()
        for info_dict in info_list:
            temp_dict = dict()
            for k, v in info_dict.items():
                arr = k.split('.')
                if len(arr) >= 3:
                    key_new = "%s.%s" % (arr[-2], arr[-1])
                    temp_dict[key_new] = v
            if temp_dict:
                res_list.append(temp_dict)
            else:
                res_list.append(info_dict)
        return res_list

    def parseJson(self, json_obj, mapper_config):
        """
        解析json页面内容
        :param json_obj:json字符串或json对象
        :param mapper_config:映射字典
        :return:
        """
        if not json_obj:
            return None
        if isinstance(json_obj, basestring):
            json_obj = json.loads(json_obj)
        parser = JsonParseUtil()
        info_list = parser.parse(json_obj, mapper_config)
        if not info_list:
            return None
        self.log.info(u"本次模块解析结果:\n %s", json.dumps(info_list))
        return info_list

    def resultMap(self, result_list, mapper_config):
        """
        抓取结果收集,调用CrawlerMapper实现映射
        :param result_list:结果集
        :param mapper_config:映射文件
        :return:
        """
        company_mapped = ParserMapper.doMap(mapper_config, result_list)
        result_json = json.dumps(company_mapped, ensure_ascii=False)
        self.log.info(u"企业信息映射结果:\n" + result_json)
        return company_mapped

    def cleanUp(self, company):
        if not company or not isinstance(company, dict):
            return None
        company_clean = dict()
        for key, value in company.items():
            if key in self.ignore_key_list:
                continue
            if isinstance(value, dict):
                v_list = list()
                v_list.append(value)
                company_clean[key] = v_list
            else:
                company_clean[key] = value
        # clean ignored entries inside compound keys, e.g. gdxx.u'详情'
        for key in self.ignore_key_list:
            if '.' not in key:
                continue
            keys = key.split('.')
            recr_dict = company_clean
            for k in keys:
                if k not in recr_dict:
                    if isinstance(recr_dict, list):
                        for dc in recr_dict:
                            if k in dc:
                                del dc[k]
                    break
                if isinstance(recr_dict[k], basestring):
                    if k == keys[-1]:
                        del recr_dict[k]
                    break
                else:
                    recr_dict = recr_dict[k]
        result_json = json.dumps(company_clean, ensure_ascii=False)
        self.log.info(u"Company info after initial cleanup:\n" + result_json)
        # check the output for keys that should have been mapped but were not
        for key, value in company_clean.items():
            if isinstance(value, list):
                if key not in ['gdxx', 'bgxx', 'baxx', 'fzjg', 'xzcf']:
                    self.log.warning(u"[%s] may need mapping; check parser_map_config!" % key)
        return company_clean