Code example #1

import collections
import json
import re
import sys
import traceback

import lxml.html

# Downloader and QiParser are project-local helpers (import paths assumed):
# Downloader provides cached HTTP access and login, QiParser extracts fields
# from the fetched HTML.
from downloader import Downloader
from qiparser import QiParser
class Qichacha(object):
    def get_info_url(self, tab, key_num, name, page=None):
        # names containing "NONAME" are replaced with a URL-encoded space
        if "NONAME" in name:
            name = "%20"
        if page is None or page == 1:
            ret = "http://www.qichacha.com/company_getinfos?unique={key_num}&companyname={name}&tab={tab}".format(
                key_num=key_num, name=name, tab=tab)
        else:
            ret = "http://www.qichacha.com/company_getinfos?unique={key_num}&companyname={name}&tab={tab}&page={page}".format(
                key_num=key_num, name=name, page=page, tab=tab)

        #if self.config.get('debug'):
        #    print (ret)

        return ret
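
    # URL-shape sketch (values are hypothetical): page 1 and the "base" tab
    # omit the page parameter, later pages append it:
    #
    #   get_info_url("base", "abc123", "ExampleCo")
    #   -> .../company_getinfos?unique=abc123&companyname=ExampleCo&tab=base
    #   get_info_url("touzi", "abc123", "ExampleCo", page=2)
    #   -> .../company_getinfos?unique=abc123&companyname=ExampleCo&tab=touzi&page=2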

    def __init__(self,
                 config,
                 batch_id=None,
                 groups=None,
                 refresh=False,
                 request=True,
                 cache_only=False):
        if batch_id is None:
            batch_id = "qichacha0831"
        if config is None:
            raise ValueError("missing config")

        self.config = config
        self.list_url = "http://www.qichacha.com/search?key={key}&index={index}&p={page}&province={province}"
        #self.base_url = "http://www.qichacha.com/company_getinfos?unique={key_num}&companyname={name}&tab=base"
        #self.invest_url = "http://www.qichacha.com/company_getinfos?unique={key_num}&companyname={name}&tab=touzi&p={page}"
        #self.legal_url = "http://www.qichacha.com/company_getinfos?unique={key_num}&companyname={name}&tab=susong&p={page}"

        #self.VIP_MAX_PAGE_NUM = 500
        #self.MAX_PAGE_NUM = 10
        #self.NUM_PER_PAGE = config.get('NUM_PER_PAGE',20 )  #10
        self.INDEX_LIST_PERSON = [4, 6, 14]
        self.INDEX_LIST_ORG = [2]
        self.PROVINCE_LIST = {
            "AH": [
                1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18, 24, 25,
                26, 29
            ],
            "BJ": [],
            "CQ": [],
            "FJ": [1, 2, 3, 4, 5, 6, 7, 8, 9, 22],
            "GD": [
                1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                51, 52, 53
            ],
            "GS": [
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 21, 22, 23, 24,
                26, 27
            ],
            "GX": [],
            "GZ": [],
            "HAIN": [],
            "HB": [],
            "HEN": [],
            "HLJ": [],
            "HUB": [],
            "HUN": [],
            "JL": [],
            "JS": [],
            "JX": [],
            "LN": [],
            "NMG": [],
            "NX": [],
            "QH": [],
            "SAX": [],
            "SC": [],
            "SD": [],
            "SH": [],
            "SX": [],
            "TJ": [],
            "XJ": [],
            "XZ": [],
            "YN": [],
            "ZJ": []
        }

        self.downloader = Downloader(config=config,
                                     request=request,
                                     batch_id=batch_id,
                                     groups=groups,
                                     refresh=refresh,
                                     cache_only=cache_only)
        self.downloader.login()
        self.parser = QiParser()
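
    # Construction sketch (hypothetical config values; Downloader/QiParser
    # are project-local, and a valid qichacha.com login is assumed):
    #
    #   config = {"MAX_LIMIT": 5000, "debug": False}
    #   q = Qichacha(config, batch_id="qichacha0831")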

    # def list_person_search(self, person_list, limit=None, refresh=False):
    #     """.. :py:method::
    #         need to catch exception of download error
    #
    #     :param person_list: str or list type, search keyword
    #     :param limit: result number of every search keyword
    #     :rtype: {keyword1: {data: {name1: {}, name2: {}, ...}, metadata:{}},
    #               keyword2: {}, ...}
    #     """
    #     return self._list_keyword_search(person_list, self.INDEX_LIST_PERSON, limit, refresh )
    #
    # def list_corporate_search(self, corporate_list, limit=None, refresh=False):
    #     """.. :py:method::
    #         need to catch exception of download error
    #
    #     :param corporate_list: str or list type, search keyword
    #     :param limit: result number of every search keyword
    #     :rtype: {keyword1: {data: {name1: {}, name2: {}, ...}, metadata:{}},
    #               keyword2: {}, ...}
    #     """
    #     return self._list_keyword_search(corporate_list, self.INDEX_LIST_ORG, limit, refresh )

    def list_keyword_search(self,
                            keyword_list,
                            index_list,
                            limit=None,
                            refresh=False,
                            skip_index_max=None):
        """.. :py:method::
            对这样词语列表搜索结果的返回http://www.qichacha.com/search?key=%E5%8C%BB%E8%8D%AF&index=0

        :parameter keyword_list: 要搜索的词语列表
        :parameter index_list: 搜索条件的index编号列表
        :parameter limit: 搜索返回结果数限制,vip-5000,free-1000
        :rtype: hash, key is company, value is result json.
        """
        if not isinstance(keyword_list, list):
            keyword_list = [keyword_list]

        # remember whether the caller set an explicit limit; the auto-expand
        # branch below needs this after `limit` has been normalized
        auto_expand = limit is None
        if limit is None:
            limit = self.config["MAX_LIMIT"]
        else:
            limit = min(limit, self.config["MAX_LIMIT"])

        result = {}
        for idx, keyword in enumerate(keyword_list):
            summary_dict = {}
            metadata_dict = collections.Counter()
            sum_e = 0
            sum_a = 0
            for index in index_list:
                result_info = self.get_keyword_search_result_info(
                    keyword, index, refresh)
                index_expect = result_info["total"]

                #max_page = (limit - 1) // result_info["max_page_num"] + 1  #self.NUM_PER_PAGE + 1

                metadata_dict["expect"] += index_expect
                metadata_dict["i{}_e".format(index)] = index_expect
                #metadata_dict["total_[index:{}]_expect".format(index)]=cnt

                summary_dict_by_index = {}
                if skip_index_max and index_expect >= skip_index_max:
                    print(" ---- undersample [{}][index:{}] 5000+ results".
                          format(keyword, index))
                    self.list_keyword_search_onepass(keyword, index, "", limit,
                                                     metadata_dict,
                                                     summary_dict_by_index,
                                                     refresh)
                elif auto_expand and index_expect >= self.config["MAX_LIMIT"]:
                    print(
                        " ---- expand [{}][index:{}] auto expand by province , expect {} "
                        .format(keyword, index, index_expect))
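                    # assumption: per-province queries each stay under the
                    # site's result cap, so their union recovers more of the
                    # full result set than one capped nationwide query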
                    for province in self.PROVINCE_LIST:
                        self.list_keyword_search_onepass(
                            keyword, index, province, limit, metadata_dict,
                            summary_dict_by_index, refresh)
                elif index_expect > 0:
                    self.list_keyword_search_onepass(keyword, index, "", limit,
                                                     metadata_dict,
                                                     summary_dict_by_index,
                                                     refresh)
                else:
                    print(
                        " ---- skip [{}][index:{}] no expected result".format(
                            keyword, index))
                summary_dict.update(summary_dict_by_index)
                #metadata_dict["i{}_actual".format(index)]=len(summary_dict_by_index)
                i_sum_e = metadata_dict["i{}_sum_e".format(index)]
                sum_e += i_sum_e
                sum_a += metadata_dict["i{}_sum_a".format(index)]
                if index_expect != i_sum_e:
                    print("[{}][index:{}] expect {} but sum_e is {}".format(
                        keyword, index, index_expect, i_sum_e))

            result[keyword] = {"data": summary_dict, "metadata": metadata_dict}
            metadata_dict["actual"] = len(summary_dict)
            if sum_e == sum_a:
                print(u" ---- ok [{}] {} ".format(
                    keyword,
                    json.dumps(metadata_dict,
                               ensure_ascii=False,
                               sort_keys=True)))
            else:
                print(u"[{}] {} ".format(
                    keyword,
                    json.dumps(metadata_dict,
                               ensure_ascii=False,
                               sort_keys=True)))

            #print ( json.dumps(summary_dict.keys(), ensure_ascii=False) )
        #print(json.dumps(result, ensure_ascii=False))
        return result
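
    # Result-shape sketch for list_keyword_search: per keyword, "data" maps
    # company name -> parsed summary item, and "metadata" is a Counter with
    # keys like expect/actual and per-index i{n}_e / i{n}_sum_e / i{n}_sum_a
    # (expected vs. actually fetched totals), e.g.
    #
    #   result = q.list_keyword_search("医药", q.INDEX_LIST_ORG, limit=100)
    #   result["医药"]["data"]      # {company_name: item, ...}
    #   result["医药"]["metadata"]  # Counter({"expect": ..., "actual": ..., ...})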

    def list_keyword_search_onepass(self, keyword, index, province, limit,
                                    metadata_dict, summary_dict_onepass,
                                    refresh):
        """.. :py:method::
            对这样单个词搜索结果的返回http://www.qichacha.com/search?key=%E5%8C%BB%E8%8D%AF&index=0

        :parameter keyword: 要搜索的词语
        :parameter index: 搜索条件的index编号
        :parameter province: 搜索条件的省份(拼音开头字母?)
        :parameter limit: 搜索返回结果数限制,vip-5000,free-1000
        :rtype: hash, key is company, value is result json.
        """
        summary_dict_local = {}
        cnt_expect = 0
        cnt_items = 0

        for page in range(1, 1000):  # hard safety bound on pagination

            url = self.list_url.format(key=keyword,
                                       index=index,
                                       page=page,
                                       province=province)

            source = self.downloader.access_page_with_cache(
                url,
                groups="v0531,search,index{}".format(index),
                refresh=refresh)

            if not source:
                # no more results, cannot get data
                break

            #if self.config.get("debug"):
            #    print (source)

            if "nodata.png" in source:
                # no more results, cannot get data
                break

            tree = lxml.html.fromstring(source)

            if page == 1:
                result_info = self.parser.parse_search_result_info(tree)
                cnt_expect = result_info["total"]
                metadata_dict["i{}_sum_e".format(index)] += cnt_expect
                metadata_dict["num_per_page"] = result_info["num_per_page"]

                #metadata_dict["total_[index:{}]_expect2".format(index)]+=cnt
                #metadata_dict["total_[index:{}][省:{}]_expect2".format(index, province)]=cnt
                if cnt_expect >= self.config["MAX_LIMIT"]:
                    msg = " ---- todo [{}][index:{}][省:{}] TO BE EXPAND , expect {}, ".format(
                        keyword, index, province, cnt_expect)
                    print(msg)
                    metadata_dict["todo_expand"] += 1
                else:
                    if self.config.get("debug"):
                        msg = "---- regular [{}][index:{}][省:{}], expect {}, ".format(
                            keyword, index, province, cnt_expect)
                        print(msg)

            if tree.cssselect("div.noresult .noface"):
                break

            items = self.parser.parse_search_result(tree)
            cnt_items += len(items)
            #print (page, len(temp), json.dumps(temp, ensure_ascii=False))
            for item in items:
                name = item['name']
                summary_dict_local[name] = item

            if cnt_items >= cnt_expect:
                break

            if cnt_items >= limit:
                break

            #if self.config.get("debug"):
            #    print (len(items), page)
            #if len(items)<self.NUM_PER_PAGE:
            #    break

        #if province:
        metadata_dict["i{}_sum_a".format(index)] += cnt_items
        cnt_actual = len(summary_dict_local)
        summary_dict_onepass.update(summary_dict_local)
        if cnt_expect == 0 or cnt_actual == 0 or cnt_expect != cnt_actual:
            url = self.list_url.format(key=keyword,
                                       index=index,
                                       page=0,
                                       province=province)
            msg = " ---- check [{}][{}], expect {} .....  {} items, {} actual".format(
                keyword, url, cnt_expect, cnt_items, cnt_actual)
            print(msg)
            #print ( json.dumps(summary_dict_local.keys(), ensure_ascii=False) )

    def get_keyword_search_result_info(self, keyword, index, refresh=False):
        """.. :py:method::

        :param keyword: search keyword
        :rtype: json
        """
        url = self.list_url.format(key=keyword,
                                   index=index,
                                   page=1,
                                   province="")

        source = self.downloader.access_page_with_cache(
            url, groups="v0531,search,index{}".format(index), refresh=refresh)
        if not source:
            # callers index into this result, so return a zeroed info dict
            # rather than a bare 0
            return {"total": 0, "keyword": keyword, "index": index}

        #print (url, source)
        tree = lxml.html.fromstring(source)

        result_info = self.parser.parse_search_result_info(tree)
        result_info["keyword"] = keyword
        result_info["index"] = index
        return result_info

    def input_name_output_id(self, name):
        """.. :py:method::

        :param name: standard company name
        :rtype: qichacha id or None
        """
        url = self.list_url.format(key=name, index=0, page=1, province="")
        try:
            source = self.downloader.access_page_with_cache(
                url, refresh=True).replace('<em>', '').replace('</em>', '')
            tree = lxml.html.fromstring(source)
        except Exception:
            # fall back to the cached copy if the fresh fetch fails
            source = self.downloader.access_page_with_cache(url)
            tree = lxml.html.fromstring(source)

        if tree.cssselect('.table-search-list') and tree.cssselect(
                '.tp2_tit a'):
            items = tree.cssselect('.table-search-list')
            for i in items:
                #from lxml import etree as ET
                #print ("v3",  ET.tostring(i, pretty_print=True))
                if not i.xpath('.//*[@class="tp2_tit clear"]/a/text()'):
                    continue
                item = {}
                item['name'] = i.xpath(
                    './/*[@class="tp2_tit clear"]/a/text()')[0]
                # print(item['name'])
                item['href'] = i.xpath(
                    './/*[@class="tp2_tit clear"]/a/@href')[0]
                item['status'] = i.xpath(
                    './/*[@class="tp5 text-center"]/a/span/text()')[0]
                item['key_num'] = item['href'].split('firm_')[1].split(
                    '.shtml')[0]
                if item['name'] == name:
                    return item['key_num']
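
    # Extraction sketch: detail links have the form .../firm_<key_num>.shtml
    # (as the split above assumes), e.g.
    #   "http://www.qichacha.com/firm_abc123.shtml"
    #       .split('firm_')[1].split('.shtml')[0]  ->  "abc123"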

    def _crawl_company_detail_by_name_id(self, name, key_num):
        """.. :py:method::
            给定company的name和key_num,获取该公司的详情内容,包括子公司

        :rtype: {name: {"name": name,
                        "key_num", key_num,
                        "info": {},
                        "shareholders": {},
                       }
                }
        """
        url = self.get_info_url("base", key_num, name)
        source = self.downloader.access_page_with_cache(url)
        if not source:
            return {}
        try:
            tree = lxml.html.fromstring(source)
        except Exception:
            if self.config.get("debug"):
                print(source)
            traceback.print_exc(file=sys.stdout)
            return {}

        all_info = self.parser.parse_detail(tree)
        all_info["info"]["name"] = name
        all_info.update({"name": name, "key_num": key_num})
        return {name: all_info}

    def crawl_company_detail(self, name, key_num=None, subcompany=True):
        """.. :py:method::

        :param name: standard company name
        :param key_num: qichacha company id; if not provided,
                it is looked up by searching the website
        :param subcompany: whether to crawl subcompanies
        :rtype: json of this company info
        """
        if key_num is None:
            key_num = self.input_name_output_id(name)
            if key_num is None:
                return

        name_info_dict = self._crawl_company_detail_by_name_id(name, key_num)

        if subcompany is True:
            invest_info_dict = self.crawl_company_investment(name, key_num)
            if invest_info_dict is not None:
                name_info_dict[name]["invests"] = list(
                    invest_info_dict[name].values())
        return name_info_dict

    def _crawl_company_investment_single_page(self,
                                              name,
                                              key_num,
                                              page,
                                              max_page_num=None):
        """.. :py:method::
            if parameter page is 1, parameter max_page_num must be []
        """
        if not hasattr(self, "_re_page_num"):
            setattr(self, "_re_page_num",
                    re.compile("javascript:getTabList\((\d+)"))

        url = self.get_info_url("touzi", key_num, name, page=page)
        source = self.downloader.access_page_with_cache(url)
        if not source:
            return
        try:
            tree = lxml.html.fromstring(source)
        except Exception:
            if self.config.get("debug"):
                print(source)
            traceback.print_exc(file=sys.stdout)
            return

        if tree.cssselect("div.noresult .noface"):
            return

        if page == 1 and max_page_num == []:
            if tree.cssselect(".pagination #ajaxpage") == []:
                max_page_num.append(1)
            else:
                page_num = [1]
                for i in tree.cssselect(".pagination #ajaxpage"):
                    match = self._re_page_num.match(i.get("href") or "")
                    if match:
                        page_num.append(int(match.group(1)))
                max_page_num.append(max(page_num))

        return self.parser.parse_company_investment(tree)
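
    # Paging sketch: on the first call pass max_page_num=[] so this method
    # can report the page count back through it (an out-parameter), e.g.
    #
    #   max_page_num = []
    #   first = q._crawl_company_investment_single_page(name, key_num, 1, max_page_num)
    #   # max_page_num is now [n]; pages 2..n remain to be fetched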

    def crawl_company_investment(self, name, key_num):
        """.. :py:method::

        :param name: standard company name
        :param key_num: qichacha company id
        :rtype: {name: {sub_name1: {"name": sub_name1, "key_num": key_num},
                        sub_name2: {"name": sub_name2, "key_num": key_num}, ...}}
        """
        max_page_num = []
        invest_dict = self._crawl_company_investment_single_page(
            name, key_num, 1, max_page_num)
        if not invest_dict:
            return
        if len(max_page_num) > 0:
            max_page_num = max_page_num[0]

        for page_idx in range(2, max_page_num + 1):
            more_invest_dict = self._crawl_company_investment_single_page(
                name, key_num, page_idx)
            if more_invest_dict:
                invest_dict.update(more_invest_dict)

        return {name: invest_dict}

    def _parse_invests_inqueue(self, name, key_num, already_crawled_names,
                               next_layer_name_id_set, all_name_info_dict):

        # crawl_company_detail returns None when the id lookup fails
        name_info_dict = self.crawl_company_detail(name,
                                                   key_num,
                                                   subcompany=True) or {}
        already_crawled_names.add(name)
        if "invests" in name_info_dict.get(name, {}):
            next_layer_name_id_set.update(
                [(i["name"], i["key_num"]) for i in name_info_dict[name]["invests"]\
                    if i["name"] not in already_crawled_names]
            )
        all_name_info_dict.update(name_info_dict)

    def crawl_company_expand(self, name, key_num=None, limit=None):
        """.. :py:method::
            爬取一个公司的子孙公司和父辈公司
        """
        if key_num is None:
            key_num = self.input_name_output_id(name)
            if key_num is None:
                return

        company_raw_one = {}
        temp = self.crawl_descendant_company(name, key_num, limit=limit)
        if temp:
            company_raw_one.update(temp)

        temp = self.crawl_ancestors_company(name, key_num, limit=limit)
        if temp:
            company_raw_one.update(temp)
        return company_raw_one

    def crawl_descendant_company(self, name, key_num=None, limit=None):
        """.. :py:method::
            This company (detail, invests) is the first layer, its subcompanies
            (detail, invests) are the second layer, and so on.

        :param name: standard company name
        :param key_num: qichacha company id
        :param limit: maximum number of descendant layers; None means unlimited
        """
        if key_num is None:
            key_num = self.input_name_output_id(name)
            if key_num is None:
                return
        if limit is None:
            limit = 2**40  # effectively unlimited depth

        already_crawled_names = set()
        all_name_info_dict = {}
        next_layer_name_id_set = set([(name, key_num)])

        while limit > 0 and next_layer_name_id_set:
            this_layer_name_id_set = next_layer_name_id_set
            next_layer_name_id_set = set()

            while this_layer_name_id_set:
                try:
                    sub_name, sub_key_num = this_layer_name_id_set.pop()
                    if sub_name in already_crawled_names:
                        continue

                    self._parse_invests_inqueue(sub_name, sub_key_num,
                                                already_crawled_names,
                                                next_layer_name_id_set,
                                                all_name_info_dict)
                except KeyError:
                    # defensive: pop() on an emptied set; note this also
                    # swallows KeyErrors raised while parsing
                    break
            limit -= 1

        return all_name_info_dict
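
    # Traversal sketch: each outer iteration consumes one breadth-first
    # layer, and `limit` counts layers, so limit=2 covers the company itself
    # plus its direct subcompanies (company name is hypothetical):
    #
    #   q.crawl_descendant_company("ExampleCo", limit=2)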

    def _parse_shareholders_inqueue(self, name, key_num, already_crawled_names,
                                    next_layer_name_id_set,
                                    all_name_info_dict):

        # crawl_company_detail returns None when the id lookup fails
        name_info_dict = self.crawl_company_detail(name,
                                                   key_num,
                                                   subcompany=True) or {}
        already_crawled_names.add(name)

        for shareholder in name_info_dict.get(name, {}).get("shareholders", []):
            if shareholder["link"] is not None:
                if shareholder["name"] not in already_crawled_names:
                    # str.rstrip strips a character set, not a suffix, so
                    # remove the ".shtml" extension explicitly
                    link = shareholder["link"]
                    if link.endswith(".shtml"):
                        link = link[:-len(".shtml")]
                    key_num = link.rsplit("_", 1)[-1]
                    next_layer_name_id_set.add((shareholder["name"], key_num))
        all_name_info_dict.update(name_info_dict)

    def crawl_ancestors_company(self, name, key_num=None, limit=None):
        """.. :py:method::
            This company (detail, invests) is the first layer, its shareholders
            (detail, invests) are the second layer, shareholders of shareholders
            are the third layer, and so on.

        :param name: standard company name
        :param key_num: qichacha company id
        :param limit: maximum number of ancestor layers; None means unlimited
        """
        if key_num is None:
            key_num = self.input_name_output_id(name)
            if key_num is None:
                return
        if limit is None:
            limit = 2**40

        already_crawled_names = set()
        all_name_info_dict = {}
        next_layer_name_id_set = set([(name, key_num)])

        while limit > 0 and next_layer_name_id_set:
            this_layer_name_id_set = next_layer_name_id_set
            next_layer_name_id_set = set()

            while this_layer_name_id_set:
                try:
                    name, key_num = this_layer_name_id_set.pop()
                    if name in already_crawled_names:
                        continue

                    self._parse_shareholders_inqueue(name, key_num,
                                                     already_crawled_names,
                                                     next_layer_name_id_set,
                                                     all_name_info_dict)
                except KeyError:
                    break
            limit -= 1
        return all_name_info_dict
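
# End-to-end sketch for Code example #1 (names and config are hypothetical):
#
#   q = Qichacha({"MAX_LIMIT": 5000}, batch_id="qichacha0831")
#   hits = q.list_keyword_search(["医药"], q.INDEX_LIST_ORG, limit=100)
#   graph = q.crawl_company_expand("ExampleCo", limit=2)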
Code example #2

# This earlier variant assumes the same imports and project-local
# Downloader/QiParser helpers as Code example #1.
class Qichacha(object):
    def __init__(self,
                 config,
                 batch_id=None,
                 groups=None,
                 refresh=False,
                 request=True):
        if batch_id is None:
            batch_id = "qichacha0601"
        if config is None:
            raise ValueError("missing config")

        self.config = config
        self.list_url = "http://www.qichacha.com/search?key={key}&index={index}&p={page}&province={province}"
        self.base_url = "http://www.qichacha.com/company_base?unique={key_num}&companyname={name}"
        self.invest_url = "http://www.qichacha.com/company_touzi?unique={key_num}&companyname={name}&p={page}"

        #self.VIP_MAX_PAGE_NUM = 500
        #self.MAX_PAGE_NUM = 10
        self.NUM_PER_PAGE = 10
        self.INDEX_LIST_PERSON = [4, 6, 14]
        self.INDEX_LIST_ORG = [2]
        self.PROVINCE_LIST = {
            "AH": [
                1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18, 24, 25,
                26, 29
            ],
            "BJ": [],
            "CQ": [],
            "FJ": [1, 2, 3, 4, 5, 6, 7, 8, 9, 22],
            "GD": [
                1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                51, 52, 53
            ],
            "GS": [
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 21, 22, 23, 24,
                26, 27
            ],
            "GX": [],
            "GZ": [],
            "HAIN": [],
            "HB": [],
            "HEN": [],
            "HLJ": [],
            "HUB": [],
            "HUN": [],
            "JL": [],
            "JS": [],
            "JX": [],
            "LN": [],
            "NMG": [],
            "NX": [],
            "QH": [],
            "SAX": [],
            "SC": [],
            "SD": [],
            "SH": [],
            "SX": [],
            "TJ": [],
            "XJ": [],
            "XZ": [],
            "YN": [],
            "ZJ": []
        }

        self.downloader = Downloader(config=config,
                                     request=request,
                                     batch_id=batch_id,
                                     groups=groups,
                                     refresh=refresh)
        self.downloader.login()
        self.parser = QiParser()

    # def list_person_search(self, person_list, limit=None, refresh=False):
    #     """.. :py:method::
    #         need to catch exception of download error
    #
    #     :param person_list: str or list type, search keyword
    #     :param limit: result number of every search keyword
    #     :rtype: {keyword1: {data: {name1: {}, name2: {}, ...}, metadata:{}},
    #               keyword2: {}, ...}
    #     """
    #     return self._list_keyword_search(person_list, self.INDEX_LIST_PERSON, limit, refresh )
    #
    # def list_corporate_search(self, corporate_list, limit=None, refresh=False):
    #     """.. :py:method::
    #         need to catch exception of download error
    #
    #     :param corporate_list: str or list type, search keyword
    #     :param limit: result number of every search keyword
    #     :rtype: {keyword1: {data: {name1: {}, name2: {}, ...}, metadata:{}},
    #               keyword2: {}, ...}
    #     """
    #     return self._list_keyword_search(corporate_list, self.INDEX_LIST_ORG, limit, refresh )

    def list_keyword_search(self,
                            keyword_list,
                            index_list,
                            limit=None,
                            refresh=False,
                            skip_index_max=None):
        if not isinstance(keyword_list, list):
            keyword_list = [keyword_list]

        if limit is None:
            max_page = self.config["MAX_PAGE_NUM"]
        else:
            max_page = (limit - 1) // self.NUM_PER_PAGE + 1
            max_page = min(self.config["MAX_PAGE_NUM"], max_page)
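            # e.g. limit=25, NUM_PER_PAGE=10 -> (25 - 1) // 10 + 1 = 3 pages,
            # then capped at config["MAX_PAGE_NUM"]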

        result = {}
        for idx, keyword in enumerate(keyword_list):
            summary_dict = {}
            metadata_dict = collections.Counter()
            sum_e = 0
            sum_a = 0
            for index in index_list:

                index_expect = self.get_keyword_search_count(
                    keyword, index, refresh)
                metadata_dict["expect"] += index_expect
                metadata_dict["i{}_e".format(index)] = index_expect
                #metadata_dict["total_[index:{}]_expect".format(index)]=cnt

                summary_dict_by_index = {}
                if skip_index_max and index_expect >= skip_index_max:
                    print(" ---- undersample [{}][index:{}] 5000+ results".
                          format(keyword, index))
                    self.list_keyword_search_onepass(keyword, index, "",
                                                     max_page, metadata_dict,
                                                     summary_dict_by_index,
                                                     refresh)
                elif limit is None and index_expect >= self.config[
                        "MAX_PAGE_NUM"] * self.NUM_PER_PAGE:
                    print(
                        " ---- expand [{}][index:{}] auto expand by province , expect {} "
                        .format(keyword, index, index_expect))
                    for province in self.PROVINCE_LIST:
                        self.list_keyword_search_onepass(
                            keyword, index, province, max_page, metadata_dict,
                            summary_dict_by_index, refresh)
                elif index_expect > 0:
                    self.list_keyword_search_onepass(keyword, index, "",
                                                     max_page, metadata_dict,
                                                     summary_dict_by_index,
                                                     refresh)
                else:
                    print(
                        " ---- skip [{}][index:{}] no expected result".format(
                            keyword, index))
                summary_dict.update(summary_dict_by_index)
                #metadata_dict["i{}_actual".format(index)]=len(summary_dict_by_index)
                i_sum_e = metadata_dict["i{}_sum_e".format(index)]
                sum_e += i_sum_e
                sum_a += metadata_dict["i{}_sum_a".format(index)]
                if index_expect != i_sum_e:
                    print("[{}][index:{}] expect {} but sum_e is {}".format(
                        keyword, index, index_expect, i_sum_e))

            result[keyword] = {"data": summary_dict, "metadata": metadata_dict}
            metadata_dict["actual"] = len(summary_dict)
            if sum_e == sum_a:
                print(u" ---- ok [{}] {} ".format(
                    keyword,
                    json.dumps(metadata_dict,
                               ensure_ascii=False,
                               sort_keys=True)))
            else:
                print(u"[{}] {} ".format(
                    keyword,
                    json.dumps(metadata_dict,
                               ensure_ascii=False,
                               sort_keys=True)))

            #print ( json.dumps(summary_dict.keys(), ensure_ascii=False) )
        #print(json.dumps(result, ensure_ascii=False))
        return result

    def list_keyword_search_onepass(self, keyword, index, province, max_page,
                                    metadata_dict, summary_dict_onepass,
                                    refresh):
        summary_dict_local = {}
        cnt_expect = 0
        cnt_items = 0

        for page in range(1, max_page + 1):

            url = self.list_url.format(key=keyword,
                                       index=index,
                                       page=page,
                                       province=province)

            source = self.downloader.access_page_with_cache(
                url,
                groups="v0531,search,index{}".format(index),
                refresh=refresh)
            if not source:
                # no more results, cannot get data
                break

            tree = lxml.html.fromstring(source)

            if page == 1:
                cnt_expect = self.parser.parse_search_result_count(tree)
                metadata_dict["i{}_sum_e".format(index)] += cnt_expect
                #metadata_dict["total_[index:{}]_expect2".format(index)]+=cnt
                #metadata_dict["total_[index:{}][省:{}]_expect2".format(index, province)]=cnt
                if cnt_expect >= self.config[
                        "MAX_PAGE_NUM"] * self.NUM_PER_PAGE:
                    msg = " ---- todo [{}][index:{}][省:{}] TO BE EXPAND , expect {}, ".format(
                        keyword, index, province, cnt_expect)
                    print(msg)
                    metadata_dict["todo_expand"] += 1
                elif province:
                    #msg = "[{}][index:{}][省:{}], expect {}, ".format( keyword,index, province, cnt_expect)
                    #print (msg, end="")
                    pass

            if tree.cssselect("div.noresult .noface"):
                break

            items = self.parser.parse_search_result(tree)
            cnt_items += len(items)
            #print (page, len(temp), json.dumps(temp, ensure_ascii=False))
            for item in items:
                name = item['name']
                summary_dict_local[name] = item

            if cnt_items >= cnt_expect:
                break

            if len(items) < self.NUM_PER_PAGE:
                break

        #if province:
        metadata_dict["i{}_sum_a".format(index)] += cnt_items
        cnt_actual = len(summary_dict_local)
        summary_dict_onepass.update(summary_dict_local)
        if cnt_expect == 0 or cnt_actual == 0 or cnt_expect != cnt_actual:
            url = self.list_url.format(key=keyword,
                                       index=index,
                                       page=0,
                                       province=province)
            msg = " ---- check [{}][{}], expect {} .....  {} items, {} actual".format(
                keyword, url, cnt_expect, cnt_items, cnt_actual)
            print(msg)
            #print ( json.dumps(summary_dict_local.keys(), ensure_ascii=False) )

    def get_keyword_search_count(self, keyword, index, refresh=False):
        """.. :py:method::

        :param keyword: search keyword
        :rtype: count
        """
        url = self.list_url.format(key=keyword,
                                   index=index,
                                   page=1,
                                   province="")

        source = self.downloader.access_page_with_cache(
            url, groups="v0531,search,index{}".format(index), refresh=refresh)
        if not source:
            return 0

        #print (url, source)
        tree = lxml.html.fromstring(source)

        return self.parser.parse_search_result_count(tree)

    def input_name_output_id(self, name):
        """.. :py:method::

        :param name: standard company name
        :rtype: qichacha id or None
        """
        url = self.list_url.format(key=name, index=0, page=1, province="")
        try:
            source = self.downloader.access_page_with_cache(url)
            tree = lxml.html.fromstring(source)
        except Exception:
            # retry once on a transient failure
            source = self.downloader.access_page_with_cache(url)
            tree = lxml.html.fromstring(source)

        if tree.cssselect("div.noresult .noface"):
            return

        for i in tree.cssselect("#searchlist"):
            # compare as text: encoding to bytes would never equal a str
            # name on Python 3
            searched_name = i.cssselect(".name")[0].text_content().strip()
            if searched_name == name:
                link = i.cssselect(".list-group-item")[0].attrib["href"]
                # str.rstrip strips a character set, not a suffix, so remove
                # the ".shtml" extension explicitly
                if link.endswith(".shtml"):
                    link = link[:-len(".shtml")]
                return link.rsplit("_", 1)[-1]

    def _crawl_company_detail_by_name_id(self, name, key_num):
        """
        :rtype: {name: {"name": name,
                        "key_num", key_num,
                        "info": {},
                        "shareholders": {},
                       }
                }
        """
        url = self.base_url.format(name=name, key_num=key_num)
        try:
            source = self.downloader.access_page_with_cache(url)
            tree = lxml.html.fromstring(source)
        except Exception:
            # retry once on a transient failure
            source = self.downloader.access_page_with_cache(url)
            tree = lxml.html.fromstring(source)

        all_info = self.parser.parse_detail(tree)
        all_info["info"]["name"] = name
        all_info.update({"name": name, "key_num": key_num})
        return {name: all_info}

    def crawl_company_detail(self, name, key_num=None, subcompany=True):
        """.. :py:method::

        :param name: standard company name
        :param key_num: qichacha company id; if not provided,
                it is looked up by searching the website
        :param subcompany: whether to crawl subcompanies
        :rtype: json of this company info
        """
        if key_num is None:
            key_num = self.input_name_output_id(name)
            if key_num is None:
                return

        name_info_dict = self._crawl_company_detail_by_name_id(name, key_num)

        if subcompany is True:
            invest_info_dict = self.crawl_company_investment(name, key_num)
            if invest_info_dict is not None:
                name_info_dict[name]["invests"] = list(
                    invest_info_dict[name].values())
        return name_info_dict

    def _crawl_company_investment_single_page(self,
                                              name,
                                              key_num,
                                              page,
                                              max_page_num=None):
        """.. :py:method::
            if parameter page is 1, parameter max_page_num must be []
        """
        if not hasattr(self, "_re_page_num"):
            setattr(self, "_re_page_num",
                    re.compile("javascript:touzilist\((\d+)\)"))

        url = self.invest_url.format(key_num=key_num, name=name, page=page)
        try:
            source = self.downloader.access_page_with_cache(url)
            tree = lxml.html.fromstring(source)
        except Exception:
            # retry once on a transient failure
            source = self.downloader.access_page_with_cache(url)
            tree = lxml.html.fromstring(source)

        if tree.cssselect("div.noresult .noface"):
            return

        if page == 1 and max_page_num == []:
            if tree.cssselect(".pagination #ajaxpage") == []:
                max_page_num.append(1)
            else:
                page_num = [1]
                for i in tree.cssselect(".pagination #ajaxpage"):
                    match = self._re_page_num.match(i.get("href") or "")
                    if match:
                        page_num.append(int(match.group(1)))
                max_page_num.append(max(page_num))

        return self.parser.parse_company_investment(tree)

    def crawl_company_investment(self, name, key_num):
        """.. :py:method::

        :param name: standard company name
        :param key_num: qichacha company id
        :rtype: {name: {sub_name1: {"name": sub_name1, "key_num": key_num},
                        sub_name2: {"name": sub_name2, "key_num": key_num}, ...}}
        """
        max_page_num = []
        invest_dict = self._crawl_company_investment_single_page(
            name, key_num, 1, max_page_num)
        if not invest_dict:
            return
        if len(max_page_num) > 0:
            max_page_num = max_page_num[0]

        for page_idx in range(2, max_page_num + 1):
            # guard against None: a failed page fetch returns nothing
            more_invest_dict = self._crawl_company_investment_single_page(
                name, key_num, page_idx)
            if more_invest_dict:
                invest_dict.update(more_invest_dict)
        return {name: invest_dict}

    def _parse_invests_inqueue(self, name, key_num, already_crawled_names,
                               next_layer_name_id_set, all_name_info_dict):

        # crawl_company_detail returns None when the id lookup fails
        name_info_dict = self.crawl_company_detail(name,
                                                   key_num,
                                                   subcompany=True) or {}
        already_crawled_names.add(name)
        if "invests" in name_info_dict.get(name, {}):
            next_layer_name_id_set.update(
                [(i["name"], i["key_num"]) for i in name_info_dict[name]["invests"]\
                    if i["name"] not in already_crawled_names]
            )
        all_name_info_dict.update(name_info_dict)

    def crawl_descendant_company(self, name, key_num=None, limit=None):
        """.. :py:method::
            This company (detail, invests) is the first layer, its subcompanies
            (detail, invests) are the second layer, and so on.

        :param name: standard company name
        :param key_num: qichacha company id
        :param limit: maximum number of descendant layers; None means unlimited
        """
        if key_num is None:
            key_num = self.input_name_output_id(name)
            if key_num is None:
                return
        if limit is None:
            limit = 2**40

        already_crawled_names = set()
        all_name_info_dict = {}
        next_layer_name_id_set = set([(name, key_num)])

        while limit > 0 and next_layer_name_id_set:
            this_layer_name_id_set = next_layer_name_id_set
            next_layer_name_id_set = set()

            while this_layer_name_id_set:
                try:
                    sub_name, sub_key_num = this_layer_name_id_set.pop()
                    if sub_name in already_crawled_names:
                        continue

                    self._parse_invests_inqueue(sub_name, sub_key_num,
                                                already_crawled_names,
                                                next_layer_name_id_set,
                                                all_name_info_dict)
                except KeyError:
                    break
            limit -= 1

        return all_name_info_dict

    def _parse_shareholders_inqueue(self, name, key_num, already_crawled_names,
                                    next_layer_name_id_set,
                                    all_name_info_dict):

        # crawl_company_detail returns None when the id lookup fails
        name_info_dict = self.crawl_company_detail(name,
                                                   key_num,
                                                   subcompany=True) or {}
        already_crawled_names.add(name)

        for shareholder in name_info_dict.get(name, {}).get("shareholders", []):
            if shareholder["link"] is not None:
                if shareholder["name"] not in already_crawled_names:
                    # str.rstrip strips a character set, not a suffix, so
                    # remove the ".shtml" extension explicitly
                    link = shareholder["link"]
                    if link.endswith(".shtml"):
                        link = link[:-len(".shtml")]
                    key_num = link.rsplit("_", 1)[-1]
                    next_layer_name_id_set.add((shareholder["name"], key_num))
        all_name_info_dict.update(name_info_dict)

    def crawl_ancestors_company(self, name, key_num=None, limit=None):
        """.. :py:method::
            This company (detail, invests) is the first layer, its shareholders
            (detail, invests) are the second layer, shareholders of shareholders
            are the third layer, and so on.

        :param name: standard company name
        :param key_num: qichacha company id
        :param limit: maximum number of ancestor layers; None means unlimited
        """
        if key_num is None:
            key_num = self.input_name_output_id(name)
            if key_num is None:
                return
        if limit is None:
            limit = 2**40

        already_crawled_names = set()
        all_name_info_dict = {}
        next_layer_name_id_set = set([(name, key_num)])

        while limit > 0 and next_layer_name_id_set:
            this_layer_name_id_set = next_layer_name_id_set
            next_layer_name_id_set = set()

            while this_layer_name_id_set:
                try:
                    name, key_num = this_layer_name_id_set.pop()
                    if name in already_crawled_names:
                        continue

                    self._parse_shareholders_inqueue(name, key_num,
                                                     already_crawled_names,
                                                     next_layer_name_id_set,
                                                     all_name_info_dict)
                except KeyError:
                    break
            limit -= 1
        return all_name_info_dict
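
# Usage sketch for Code example #2 (hypothetical values): this earlier
# variant pages by MAX_PAGE_NUM * NUM_PER_PAGE instead of MAX_LIMIT and
# fetches details via the company_base / company_touzi endpoints above.
#
#   q = Qichacha({"MAX_PAGE_NUM": 500}, batch_id="qichacha0601")
#   hits = q.list_keyword_search(["医药"], q.INDEX_LIST_ORG, limit=100)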