Beispiel #1
0
    def get_money(self, content, esm_index, content_length):
        """Collect the money amounts that follow keyword hits in ``content``.

        ``esm_index.query(content)`` supplies the hits; for each hit the value
        at ``hit[0][1]`` is used as the start of a ``content_length``-long
        window of ``content`` that is searched for an amount.

        Two passes are made over the hits:

        1. look for a number immediately followed by the 万 suffix;
        2. only if pass 1 produced nothing, look for a bare number and append
           the 万 suffix when the window contains "万元".

        Returns:
            A list of ``(amount, unit)`` tuples, where ``amount`` is the
            ``unicode`` form of what ``MoneyParser.transfer_money`` returned.
            Empty when the index reports no hits or no amount is found.
        """
        hits = esm_index.query(content)
        if not hits:
            return []

        def window_after(hit):
            # Slice of the text that starts at the offset stored in the hit.
            start = hit[0][1]
            return content[start:start + content_length]

        amounts = []

        # Pass 1: amounts written with an explicit 万 suffix.
        # NOTE(review): the "." in both patterns is unescaped, so it matches
        # any character rather than only a decimal point -- confirm intent.
        for hit in hits:
            found = toolsutil.re_find_one('\d+.\d+万', window_after(hit))
            if not found:
                continue
            value, unit = MoneyParser().transfer_money(found)
            amounts.append((unicode(value), unit))

        if amounts:
            return amounts

        # Pass 2: bare numbers; the unit is inferred from the surrounding text.
        for hit in hits:
            text = window_after(hit)
            mentions_wan_yuan = '万元' in text
            found = toolsutil.re_find_one('\d+.\d+', text)
            if not found:
                continue
            if mentions_wan_yuan:
                found = found + '万'
            value, unit = MoneyParser().transfer_money(found)
            amounts.append((unicode(value), unit))

        return amounts
Beispiel #2
0
    def format_extract_data(self, extract_data, topic_id):
        """Build the entity record for one extracted document.

        Works on a deep copy of ``extract_data`` and fills in four fields:

        * ``code``: the digits found in the extracted "code" value, or "" (a
          warning is logged) when none are found.
        * ``publish_time``: when the extracted value is a string whose
          characters minus the last three parse as an integer, that integer is
          formatted as local time ``%Y-%m-%d %H:%M:%S``; any other value is
          passed through unchanged.
        * ``notice_id`` / ``abstract``: derived from "content" when that key
          is present (``abstract`` is the first 256 characters of the stripped
          content); otherwise both are None.

        Args:
            extract_data: dict of extracted fields.
            topic_id: not used by this method.

        Returns:
            The updated deep copy of ``extract_data``.
        """
        entity_data = copy.deepcopy(extract_data)

        # "code" may be present with a None value; treat that like a missing key.
        raw_code = (extract_data.get("code") or "").strip()
        code = toolsutil.re_find_one(r"\d+", raw_code)
        if not code:
            code = ""
            self.log.warning("cann't extract code from [%s]" % raw_code)

        publish_time = extract_data.get("publish_time")
        try:
            # Drop the last three characters before converting to an integer.
            seconds = int(publish_time[:-3])
            publish_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime(seconds))
        except (TypeError, ValueError, OverflowError, OSError):
            # Missing, non-string, non-numeric or out-of-range value: keep the
            # extracted publish_time untouched.
            pass

        notice_id = None
        abstract = None
        if "content" in entity_data:
            content = (entity_data.get("content") or "").strip()
            notice_id = self.get_notice_id(content)
            abstract = content[0:256]

        entity_data["notice_id"] = notice_id
        entity_data["abstract"] = abstract
        entity_data["publish_time"] = publish_time
        entity_data["code"] = code

        return entity_data
Beispiel #3
0
 def get_bulletin_date(self, content):
     """Extract the bulletin date from the start of the body text.

     Only the first 100 characters of ``content`` are searched for a
     ``YYYY-M-D`` style date.

     Returns:
         The result of ``toolsutil.norm_date_time`` for the date that was
         found, or "" when no date is found.
     """
     # The stray debug ``print`` of the text was removed: it polluted stdout
     # and could raise on consoles that cannot encode the text.
     head = unicode(content)[:100]
     date_ret = toolsutil.re_find_one(r'\d{4}-\d{1,2}-\d{1,2}', head)
     if date_ret:
         return toolsutil.norm_date_time(date_ret)
     return ""
Beispiel #4
0
 def get_court(self, content):
     """Return the court found in ``content``.

     The text is split on ``self.seps`` and each non-empty piece is matched
     against ``self.court_regex``; the first match no longer than
     ``self.max_court_length`` is returned. When no piece qualifies, the
     result of ``self.get_court2nd`` on the whole text is returned instead.
     """
     text = unicode(content)
     for segment in toolsutil.my_split(text, self.seps):
         if segment:
             candidate = toolsutil.re_find_one(self.court_regex,
                                               unicode(segment))
             if candidate and len(candidate) <= self.max_court_length:
                 return candidate
     # Nothing usable in any segment: fall back to the secondary extractor.
     return self.get_court2nd(text)
Beispiel #5
0
    def get_region_from_phone(self, content):
        """Return the region most often implied by phone numbers in ``content``.

        ``self.phone_index.query(content)`` supplies the hits; for each hit the
        value at ``hit[0][0]`` is used as the start of a
        ``self.content_length``-long window that is searched for a number. The
        number is mapped to a region via
        ``self.province_parser.get_region_from_phonenum`` and the regions are
        tallied.

        Returns:
            The region with the highest count, or '' when no region was found.
        """
        region_counts = {}
        for hit in self.phone_index.query(content) or ():
            start = hit[0][0]
            window = content[start:start + self.content_length]
            # NOTE(review): with Python `re` alternation semantics the second
            # alternative can never win, because `\d+` already matches wherever
            # `\d+-\d+` could start -- confirm whether the full number was
            # meant to be captured.
            phone_num = toolsutil.re_find_one('\d+|\d+-\d+', window)
            if not phone_num:
                continue
            region = self.province_parser.get_region_from_phonenum(phone_num)
            if region:
                region_counts[region] = region_counts.get(region, 0) + 1

        if not region_counts:
            return ''
        # max() returns the first maximal item, which is the same element the
        # former stable reverse sort placed first.
        return max(region_counts.items(), key=lambda item: item[1])[0]
Beispiel #6
0
    def get_litigant(self, keyword_list, content_list):
        """Extract litigant names that follow the given keywords.

        For every entry of ``keyword_list`` the element at index 1 is used as
        a prefix. Each prefix yields one pattern per suffix in
        ``self.wenshu_conf.company_end_list`` (prefix, then a captured
        non-space run ending in the suffix) plus one generic pattern (prefix,
        then a captured non-space run). Every distinct pattern is tried on
        every item of ``content_list``; matches shorter than
        ``self.company_length_limit`` characters are kept.

        Returns:
            The kept matches after ``self.norm_entity_list`` has processed
            them.
        """
        candidates = []
        for keyword in keyword_list:
            prefix = keyword[1]
            candidates.extend(
                prefix + '(\S+' + suffix + ')'
                for suffix in self.wenshu_conf.company_end_list)
            candidates.append(prefix + '(\S+)')

        # De-duplicate; the resulting order is whatever the set yields.
        unique_patterns = list(set(candidates))

        matches = []
        for text in content_list:
            for pattern in unique_patterns:
                found = toolsutil.re_find_one(unicode(pattern), text)
                if found and len(unicode(found)) < self.company_length_limit:
                    matches.append(found)

        return self.norm_entity_list(matches)
Beispiel #7
0
    def get_last_table(self, last_table_str):
        """Return the name of the most recent backup collection for a topic.

        Refreshes ``self.collection_names`` from ``self.db.db`` and inspects
        every collection whose name contains ``last_table_str``. A name
        qualifies when it has more than two "_"-separated parts and its last
        two parts, joined by a space, form a ``YYYY-MM-DD HH:MM:SS`` timestamp.

        Args:
            last_table_str: substring that identifies the topic's backups.

        Returns:
            The qualifying collection name with the latest timestamp (the
            first one listed wins a tie), or "" when none qualifies.
        """
        self.collection_names = self.db.db.collection_names()

        latest_name = ""
        latest_seconds = None
        for collection in self.collection_names:
            if last_table_str not in collection:
                continue
            parts = collection.split("_")
            if len(parts) <= 2:
                continue
            time_str = parts[-2] + " " + parts[-1]
            if not toolsutil.re_find_one(
                    r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", time_str):
                continue
            try:
                seconds = time.mktime(
                    time.strptime(time_str, '%Y-%m-%d %H:%M:%S'))
            except (ValueError, OverflowError):
                # Looks like a timestamp but is not a valid one (for example
                # month 13); skip it instead of aborting the whole lookup.
                continue
            if latest_seconds is None or seconds > latest_seconds:
                latest_seconds = seconds
                latest_name = collection

        return latest_name
Beispiel #8
0
    def deal_time(self, src_data_list, key_list):
        """Normalize the time fields named in ``key_list`` on every record.

        For each record and key, the value is converted with ``str``. When the
        result is longer than 10 characters and consists only of digits, the
        last three digits are dropped and the rest is formatted as local time
        ``%Y-%m-%d %H:%M:%S``. Every other value is passed through
        ``toolsutil.norm_date_time``.

        The records are modified in place. A key that is missing or whose
        value cannot be converted is left untouched.

        Args:
            src_data_list: iterable of dict-like records.
            key_list: names of the fields to normalize.

        Returns:
            A new list holding the same (now updated) record objects.
        """
        new_data_list = []
        for iter_data in src_data_list:
            for key in key_list:
                try:
                    # NOTE(review): under Python 2, str() raises on unicode
                    # values containing non-ASCII characters, so such values
                    # end up skipped by the handler below -- confirm that this
                    # is intended.
                    time_value = str(iter_data[key])
                    digits = toolsutil.re_find_one(u'\d+', time_value)
                    if len(time_value) > 10 and digits == time_value:
                        seconds = int(time_value[:-3])
                        iter_data[key] = time.strftime(
                            "%Y-%m-%d %H:%M:%S", time.localtime(seconds))
                    else:
                        iter_data[key] = toolsutil.norm_date_time(
                            iter_data[key])
                except Exception:
                    # Best effort: keep the original value. Unlike a bare
                    # ``except:``, this lets KeyboardInterrupt and SystemExit
                    # propagate.
                    pass
            new_data_list.append(iter_data)

        return new_data_list
Beispiel #9
0
    def format_extract_data(self, extract_data, topic_id):
        '''Turn one extracted-data dict into a cleaned entity dict.

        Works on a deep copy of ``extract_data``; a falsy input yields ``{}``.
        ``topic_id`` is not used in this method.

        Steps, in order:
          1. drop keys listed in ``gsxx_conf.gsxx_key_list`` whose value is None;
          2. flatten the "base_info" key/value rows into top-level fields via
             ``self.mapping_conf``;
          3. when any registration-code field is present: normalize the
             shareholder / contributor details, split "code" into
             "unified_social_credit_code" or "registered_code", parse the
             registered capital, build and normalize "period", and fill in
             province / city / delete flag / company name;
          4. normalize "changerecords" (also collecting "used_name_list"),
             "invested_companies", "investor_change", "business_status" and
             "registered_code";
          5. predict "industry" when it is missing;
          6. remove every field whose value is None or a blank string.

        Returns the resulting dict.

        The code relies on Python 2 behavior (``has_key``, ``unicode``,
        ``basestring``, and ``items()`` returning a list).
        '''

        # NOTE(review): is_baseinfo_page is assigned in several places below
        # but never read inside this method.
        is_baseinfo_page = False

        entity_data = {}
        if extract_data:
            entity_data = copy.deepcopy(extract_data)

            # Step 1: drop configured keys that were extracted as None.
            for key in gsxx_conf.gsxx_key_list:
                if entity_data.has_key(key) and entity_data.get(key) == None:
                    entity_data.pop(key)

            # Step 2: flatten "base_info". Each row pairs a "key<suffix>" entry
            # (the label) with a "value<suffix>" entry (the content).
            if entity_data.has_key("base_info"):
                is_baseinfo_page = True
                entity_data.pop("base_info")
                for item in extract_data["base_info"]:
                    for key, value in item.items():
                        base_key = key[:3]
                        base_value = "value" + key[3:]
                        if base_key == "key" and item.has_key(base_value):
                            # From here on `key` is the label text, not the
                            # dict key it was read from.
                            key = item[key].encode("utf8")
                            if item.get(base_value) == None:
                                continue
                            value = item.get(base_value, "").encode("utf8")
                            # Candidates: the whole label/value pair first,
                            # then, when both are tab-separated with the same
                            # number of parts, each part pair.
                            key_values = [(key, value)]
                            key_pars = key.split("\t")
                            value_pars = value.split("\t")
                            if len(key_pars) > 1 and len(key_pars) == len(
                                    value_pars):
                                index = 0
                                while index < len(key_pars):
                                    key_values.append(
                                        (key_pars[index], value_pars[index]))
                                    index += 1
                            for key, value in key_values:
                                value = value.strip()
                                if key in self.mapping_conf:
                                    # Do not overwrite a field that is already
                                    # set on the entity.
                                    if not entity_data.has_key(
                                            self.mapping_conf[key]):
                                        entity_data[
                                            self.mapping_conf[key]] = value
                                    # A "code" value of 18 or more characters
                                    # is stored as the unified social credit
                                    # code, anything shorter as the
                                    # registered code.
                                    if self.mapping_conf[key] == 'code':
                                        if value != None and len(value) >= 18:
                                            entity_data[
                                                "unified_social_credit_code"] = value
                                        else:
                                            entity_data[
                                                "registered_code"] = value
                                    # Only the first mapped candidate of a row
                                    # is used.
                                    break
            company = entity_data.get("company", "")
            # Step 3: treat the page as a base-info page when any
            # registration-code field is present.
            if entity_data.has_key("unified_social_credit_code") or \
                    entity_data.has_key("registered_code") or \
                    entity_data.has_key("code"):
                is_baseinfo_page = True
                if company == "":
                    self.log.error("base info without company " +
                                   json.dumps(entity_data))

                # Normalize amounts (deal_data) and times (deal_time) in each
                # shareholder's subscription / paid-in detail lists.
                if entity_data.has_key("shareholder_information"):
                    shareholder_information = entity_data.get(
                        "shareholder_information")
                    new_shareholder_information = []
                    for each in shareholder_information:
                        if each.has_key("subscription_detail") and each.get(
                                "subscription_detail") != None:
                            each["subscription_detail"] = self.deal_data(
                                each.get("subscription_detail", []),
                                ["subscription_amount"])
                            each["subscription_detail"] = self.deal_time(
                                each.get("subscription_detail", []), [
                                    "subscription_time",
                                    "subscription_publish_time"
                                ])

                        if each.has_key("paied_detail"
                                        ) and each.get("paied_detail") != None:
                            each["paied_detail"] = self.deal_data(
                                each.get("paied_detail", []), ["paied_amount"])
                            each["paied_detail"] = self.deal_time(
                                each.get("paied_detail", []),
                                ["paied_time", "paied_publish_time"])

                        new_shareholder_information.append(each)

                    entity_data["shareholder_information"] = self.deal_data(
                        new_shareholder_information,
                        ["subscription_amount", "paied_amount"])

                # Same processing as above, applied to
                # "contributor_information" (the local variable names are
                # reused from the shareholder branch).
                if entity_data.has_key("contributor_information"):
                    shareholder_information = entity_data.get(
                        "contributor_information")
                    new_shareholder_information = []
                    for each in shareholder_information:
                        if each.has_key("subscription_detail") and each.get(
                                "subscription_detail") != None:
                            each["subscription_detail"] = self.deal_data(
                                each.get("subscription_detail", []),
                                ["subscription_amount"])
                            each["subscription_detail"] = self.deal_time(
                                each.get("subscription_detail", []), [
                                    "subscription_time",
                                    "subscription_publish_time"
                                ])

                        if each.has_key("paied_detail"
                                        ) and each.get("paied_detail") != None:
                            each["paied_detail"] = self.deal_data(
                                each.get("paied_detail", []), ["paied_amount"])
                            each["paied_detail"] = self.deal_time(
                                each.get("paied_detail", []),
                                ["paied_time", "paied_publish_time"])

                        new_shareholder_information.append(each)

                    entity_data["contributor_information"] = self.deal_data(
                        new_shareholder_information,
                        ["subscription_amount", "paied_amount"])

                # A top-level "code" is copied (not moved) into one of the two
                # specific code fields. Note that this branch tests for
                # exactly 18 characters, while the base_info branch above
                # tests for 18 or more.
                if entity_data.has_key("code"):
                    value = entity_data["code"]
                    # entity_data.pop("code")
                    if len(value) == 18:
                        entity_data["unified_social_credit_code"] = value
                    else:
                        entity_data["registered_code"] = value

                # Keep the raw capital text in "src_registered_capital" and
                # store the parsed amount and unit separately.
                src_registered_capital = entity_data.get(
                    'src_registered_capital')
                if not src_registered_capital:
                    src_registered_capital = entity_data.get(
                        'registered_capital')
                if src_registered_capital:
                    entity_data[
                        'src_registered_capital'] = src_registered_capital
                    registered_capital, registered_capital_unit = self.parser_tool.money_parser.transfer_money(
                        src_registered_capital)
                    entity_data["registered_capital"] = registered_capital
                    entity_data[
                        "registered_capital_unit"] = registered_capital_unit

                # Merge "period_from" / "period_to" into a single "period"
                # string. When "period_to" is present but None, it is dropped
                # and no "period" is written by this branch.
                if entity_data.has_key("period_from"):
                    start_time = self.parser_tool.date_parser.get_date_list(
                        entity_data["period_from"])
                    entity_data.pop("period_from")
                    if entity_data.has_key("period_to"):
                        if entity_data.get("period_to") == None:
                            entity_data.pop("period_to")
                        else:
                            end_time = self.parser_tool.date_parser.get_date_list(
                                entity_data["period_to"])
                            entity_data["period"] = toolsutil.norm_date(
                                start_time) + u"至" + toolsutil.norm_date(
                                    end_time)
                            entity_data.pop("period_to")
                    else:
                        entity_data["period"] = toolsutil.norm_date(
                            start_time) + u"至"

                # Re-normalize "period": a start/end pair first, then a lone
                # start date, otherwise the placeholder "--".
                if entity_data.has_key("period"):
                    period = entity_data.get("period", "")
                    ret = toolsutil.re_findone(self.period_regex, period)
                    if ret and len(ret) == 2:
                        start_time = toolsutil.norm_date(ret[0])
                        end_time = toolsutil.norm_date(ret[1])
                        period = start_time + u"至" + end_time
                    else:
                        ret2 = toolsutil.re_findone(self.period_regex2, period)
                        if ret2:
                            start_time = toolsutil.norm_date(ret2)
                            period = start_time + u"至"
                        else:
                            period = u"--"
                    entity_data["period"] = period

                if company != "":
                    # Extracted province / city win over the computed ones.
                    province, city = self.cal_province_city(entity_data)
                    entity_data["province"] = entity_data.get(
                        "province") if entity_data.get(
                            "province") else province
                    entity_data["city"] = entity_data.get(
                        "city") if entity_data.get("city") else city

                    # Companies rejected by filter_company are flagged, not
                    # removed.
                    if not self.filter_company(company):
                        entity_data["delete"] = 1

                    # NOTE(review): as written, both replace() calls map a
                    # character to itself; presumably the first arguments
                    # were full-width parentheses -- confirm against the
                    # original source.
                    entity_data['company'] = company.replace('(', '(').replace(
                        ')', ')')

            # Step 4: normalize change records and collect former company
            # names from name-change entries.
            if entity_data.has_key("changerecords"):
                changerecords_list = []
                used_name_list = []
                for item in entity_data["changerecords"]:
                    change_item = item.get("change_item", "")
                    change_item = unicode(change_item)
                    if change_item in gsxx_conf.used_name_change_item_list:
                        after_name = unicode(item.get("after_content", ""))
                        befor_name = unicode(item.get("before_content", ""))

                        checked_after_name = self.check_name(
                            company, after_name)
                        checked_befor_name = self.check_name(
                            company, befor_name)

                        if checked_after_name:
                            used_name_list.append(checked_after_name)
                        if checked_befor_name:
                            used_name_list.append(checked_befor_name)

                    # String dates go through the date parser. Other values
                    # are stringified and, when they are all digits and
                    # longer than 10 characters, formatted as local time
                    # after dropping the last three digits.
                    change_date = item.get("change_date", "")
                    if isinstance(change_date, basestring):
                        change_date = toolsutil.norm_date_time(
                            self.parser_tool.date_parser.get_date_list(
                                item.get("change_date", "")))
                    else:
                        change_date = str(change_date)
                        ret = toolsutil.re_find_one(u'\d+', change_date)
                        if len(change_date) > 10 and ret == change_date:
                            tmp = int(change_date[:-3])
                            data_value = time.strftime("%Y-%m-%d %H:%M:%S",
                                                       time.localtime(tmp))
                            change_date = data_value

                    item["change_date"] = change_date
                    changerecords_list.append(item)
                # De-duplicate the collected names (order is not preserved).
                used_name_list = list(set(used_name_list))
                entity_data["used_name_list"] = used_name_list
                entity_data["changerecords"] = changerecords_list

            # Split each investment amount into a value and a unit.
            if entity_data.has_key("invested_companies"):
                invested_companies_list = []
                for single in entity_data["invested_companies"]:
                    invest_amount, invest_amount_unit = self.parser_tool.money_parser.transfer_money(
                        single.get("invest_amount", ""))
                    single["invest_amount"] = invest_amount
                    single["invest_amount_unit"] = invest_amount_unit
                    invested_companies_list.append(single)
                entity_data["invested_companies"] = invested_companies_list

            if entity_data.has_key("investor_change"):
                entity_data["investor_change"] = self.deal_time(
                    entity_data.get("investor_change", []), ["change_date"])

            # NOTE(review): as written, this replace() maps "," to itself;
            # presumably the first argument was a full-width comma -- confirm.
            if entity_data.has_key("business_status"):
                entity_data["business_status"] = entity_data.get(
                    "business_status", "").replace(',', ',')

            # A "registered_code" of 18 or more characters is blanked; a
            # missing one is set to "". Blank strings are then removed by the
            # final cleanup loop below.
            if entity_data.has_key("registered_code"):
                value = entity_data.get("registered_code", "")
                if len(value.strip()) >= 18:
                    entity_data["registered_code"] = ""
            else:
                entity_data["registered_code"] = ""

            # Step 5: predict the industry from the company name when none
            # was extracted.
            if not entity_data.get("industry"):
                entity_data[
                    "industry"] = self.parser_tool.industry_parser.predict(
                        company)

            # Step 6: drop None and blank-string fields. Deleting while
            # looping is safe here only because items() returns a list in
            # Python 2.
            for key, value in entity_data.items():
                if value is None or (isinstance(value, basestring)
                                     and value.strip() == ''):
                    del entity_data[key]

        return entity_data
Beispiel #10
0
    def do_parser(self, content, title):
        """Extract the companies involved in a bid notice.

        Looks up candidate-winner, winner, tenderer and agent companies in
        ``content``, refines the tenderer and agent with an entrustment
        phrase, falls back to ``title`` for the tenderer, and passes
        everything through ``self.filter_company_list``.

        Returns:
            The dict produced by ``self.filter_company_list`` with its
            "zhaobiao" and "agent" values and every element of its
            "zhongbiao" and "houxuan_zhongbiao" lists converted to
            ``unicode``.
        """
        # Step 1: keyword-based lookup of each kind of company.
        candidate_winner_list = self.get_company_list(
            content, self.houxuanzhongbiao_company_index,
            bid_conf.zhongbiao_company_end_list, self.zhongbiao_offset)
        winner_list = self.get_company_list(
            content, self.zhongbiao_company_index,
            bid_conf.zhongbiao_company_end_list, self.zhongbiao_offset)
        tenderer_list = self.get_company_list(
            content, self.zhaobiao_company_index,
            bid_conf.zhaobiao_company_end_list, self.zhaobiao_offset)
        agent_list = self.get_company_list(
            content, self.agent_company_index,
            bid_conf.agent_company_end_list, self.agent_offset)

        # With several tenderer candidates, prefer the first whose name does
        # not contain u'公司'; otherwise take the first candidate.
        tenderer = None
        if tenderer_list:
            tenderer = tenderer_list[0]
            if len(tenderer_list) >= 2:
                for candidate in tenderer_list:
                    if u'公司' not in unicode(candidate):
                        tenderer = candidate
                        break

        agent = agent_list[0] if agent_list else ''

        # Step 2: entrustment phrase. This runs whether or not step 1 found
        # anything; a non-empty parse result replaces the step-1 value.
        content = unicode(content)
        entrust = toolsutil.re_find_one(u'(\S*)受(\S+)委托', content)
        if entrust:
            parsed_agent = self.company_parser.get_one_company(
                entrust[0], bid_conf.agent_company_end_list)
            parsed_tenderer = self.company_parser.get_one_company(
                entrust[1], bid_conf.zhaobiao_company_end_list)
            agent = parsed_agent or agent
            tenderer = parsed_tenderer or tenderer

        # Step 3: still no tenderer -- try the title.
        if not tenderer:
            tenderer = self.company_parser.get_one_company(
                title, bid_conf.zhaobiao_company_end_list)

        result_data = self.filter_company_list(tenderer, winner_list,
                                               candidate_winner_list, agent)

        # Convert every reported name to unicode, updating the lists in place.
        for field in ("zhaobiao", "agent"):
            result_data[field] = unicode(result_data[field])
        for field in ("zhongbiao", "houxuan_zhongbiao"):
            names = result_data[field]
            for position, name in enumerate(names):
                names[position] = unicode(name)

        return result_data
Beispiel #11
0
    def deal_time(self, src_data_list, key_list):
        """Normalize the time fields named in ``key_list`` on every record.

        For each record and key, the value is converted with ``str``. When the
        result is longer than 10 characters and consists only of digits, the
        last three digits are dropped and the rest is formatted as local time
        ``%Y-%m-%d %H:%M:%S``. Every other value is passed through
        ``toolsutil.norm_date_time``.

        The records are modified in place. A key that is missing or whose
        value cannot be converted is left untouched.

        Args:
            src_data_list: iterable of dict-like records.
            key_list: names of the fields to normalize.

        Returns:
            A new list holding the same (now updated) record objects.
        """
        new_data_list = []
        for iter_data in src_data_list:
            for key in key_list:
                try:
                    # NOTE(review): under Python 2, str() raises on unicode
                    # values containing non-ASCII characters, so such values
                    # end up skipped by the handler below -- confirm that this
                    # is intended.
                    time_value = str(iter_data[key])
                    digits = toolsutil.re_find_one(u'\d+', time_value)
                    if len(time_value) > 10 and digits == time_value:
                        seconds = int(time_value[:-3])
                        iter_data[key] = time.strftime(
                            "%Y-%m-%d %H:%M:%S", time.localtime(seconds))
                    else:
                        iter_data[key] = toolsutil.norm_date_time(
                            iter_data[key])
                except Exception:
                    # Best effort: keep the original value. Unlike a bare
                    # ``except:``, this lets KeyboardInterrupt and SystemExit
                    # propagate.
                    pass
            new_data_list.append(iter_data)

        return new_data_list

# if __name__ == '__main__':
#     import sys
#     sys.path.append('../../')
#     topic_id = 136
#     import pytoml
#     from conf import get_config
#     from bdp.i_crawler.i_extractor.ttypes import BaseInfo, CrawlInfo, ExtractInfo, PageParseInfo
#     with open('../../entity.toml', 'rb') as config:
#         config = pytoml.load(config)
#     conf = get_config(config)
#     import common
#     from entity_extractor_route import EntityExtractorRoute
#     route = EntityExtractorRoute()
#     topic_info = route.all_topics.get(topic_id, None)
#     obj = AnnualReportsExtractor(topic_info, common.log)
#     extract_data = {
#     "base_info": [
#         {
#             "key": "统一社会信用代码/注册号",
#             "value": "911400001123101349"
#         },
#         {
#             "key": "企业名称",
#             "value": "华晋焦煤有限责任公司"
#         },
#         {
#             "key": "企业通信地址",
#             "value": "山西省吕梁市离市区久安路57号"
#         },
#         {
#             "key": "邮政编码",
#             "value": "033000"
#         },
#         {
#             "key": "企业联系电话",
#             "value": "0358-8296368"
#         },
#         {
#             "key": "企业电子邮箱",
#             "value": "*****@*****.**"
#         },
#         {
#             "key": "从业人数",
#             "value": "企业选择不公示"
#         },
#         {
#             "key": "企业经营状态",
#             "value": "开业"
#         },
#         {
#             "key": "是否有网站或网店",
#             "value": "是"
#         },
#         {
#             "key": "有限责任公司本年度是否发生股东股权转让",
#             "value": "否"
#         },
#         {
#             "key": "是否有投资信息或购买其他公司股权",
#             "value": "有"
#         },
#         {
#             "key": "对外提供保证担保信息",
#             "value": "否"
#         }
#     ],
#     "enterprise_asset_status_information": [
#         {
#             "key": "资产总额",
#             "value": "企业选择不公示"
#         },
#         {
#             "key": "所得者权益合计",
#             "value": "企业选择不公示"
#         },
#         {
#             "key": "营业总收入",
#             "value": "企业选择不公示"
#         },
#         {
#             "key": "利润总额",
#             "value": "企业选择不公示"
#         },
#         {
#             "key": "营业总收入中主营业务收入",
#             "value": "企业选择不公示"
#         },
#         {
#             "key": "净利润",
#             "value": "企业选择不公示"
#         },
#         {
#             "key": "纳税总额",
#             "value": "企业选择不公示"
#         },
#         {
#             "key": "负债总额",
#             "value": "企业选择不公示"
#         }
#     ],
#     "invested_companies": [
#         {
#             "company_name": "华晋煤层气综合利用有限责任公司",
#             "registered_code": "140000110106177"
#         },
#         {
#             "company_name": "山西华晋吉宁煤业有限责任公司",
#             "registered_code": "140000105974138"
#         },
#         {
#             "company_name": "山西华晋明珠煤业有限责任公司",
#             "registered_code": "140000206970138"
#         },
#         {
#             "company_name": "山西焦煤华晋寨圪塔能源有限责任公司",
#             "registered_code": "141000000074580"
#         },
#         {
#             "company_name": "石太铁路客运专线有限责任公司",
#             "registered_code": "140100103043161"
#         },
#         {
#             "company_name": "山西汾河焦煤股份有限公司",
#             "registered_code": "140000100099469"
#         },
#         {
#             "company_name": "山西焦煤集团汾河物业管理有限公司",
#             "registered_code": "140100103047124"
#         },
#         {
#             "company_name": "山西焦煤集团房地产开发有限公司",
#             "registered_code": "140100103020695"
#         },
#         {
#             "company_name": "山西焦煤交通能源投资有限公司",
#             "registered_code": "140000110111179"
#         }
#     ],
#     "province": "山西",
#     "shareholder_information": [
#         {
#             "paied_amount": "42354.798018",
#             "paied_time": "1204646400000",
#             "paied_type": "货币",
#             "shareholder_name": "山西焦煤集团有限责任公司",
#             "subscription_amount": "42354.798018",
#             "subscription_time": "1204646400000",
#             "subscription_type": "货币"
#         },
#         {
#             "paied_amount": "40693.825547",
#             "paied_time": "2017年6月4日",
#             "paied_type": "货币",
#             "shareholder_name": "中国中煤能源股份有限公司",
#             "subscription_amount": "40693.825547",
#             "subscription_time": "1204646400000",
#             "subscription_type": "货币"
#         }
#     ],
#     "websites": [
#         {
#             "name": "华晋焦煤有限责任公司",
#             "site": "http://www.sx.xinhuanet.com/qyzx/hjjm/",
#             "type": "网站"
#         }
#     ],
#     "year": "2015年度"
# }
#     src_url = "www.baidu.com"
#     data = json.dumps(extract_data)
#     extract_info = ExtractInfo(ex_status=2, extract_data=data)
#     base_info = BaseInfo(url=src_url)
#     parser_info = PageParseInfo(base_info=base_info, extract_info=extract_info)
#     entity_data = obj.entity_extract(parser_info, extract_data)
#     #data = obj.after_extract(src_url, entity_data, extract_data)
#     print data
#     for key, value in entity_data.items():
#         if isinstance(value, list):
#             for i in value:
#                 print key, ":", i
#         elif isinstance(value, dict):
#             for key2, value2 in value.items():
#                 print key2, ":", value2
#         else:
#             print key, ":", value
Beispiel #12
0
    def get_parser_data(self, content, bulletin_type):
        """Extract the plaintiffs and defendants named in a bulletin.

        Spaces are removed from ``content`` and the text is split into rows.
        Plaintiffs come from the first row matched by any regex in
        ``self.plaintiff_regex_list``. How defendants are found depends on
        whether ``bulletin_type`` is listed in ``self.bulletin_type_list``.

        Returns:
            A dict with the keys "plaintiff_list", "defendant_list" and
            "bulletin_type" (the latter passed through unchanged).
        """
        plaintiffs = []
        defendants = []
        compact = unicode(content).replace(" ", "")
        # NOTE(review): the separator list contains ',' twice as written;
        # presumably one of them was a full-width comma -- confirm.
        rows = toolsutil.my_split(compact,
                                  [',', ',', '。', '\r\n', '\t'])

        # 1. Plaintiffs: stop at the first row / regex combination that hits.
        for row in rows:
            for plaintiff_regex in self.plaintiff_regex_list:
                hit = toolsutil.re_findone(plaintiff_regex, unicode(row))
                if hit:
                    plaintiffs = toolsutil.my_split(hit, self.litiants_seps)
                    break
            else:
                # No regex matched this row; try the next one.
                continue
            break

        # 2. Defendants.
        if unicode(bulletin_type) not in self.bulletin_type_list:
            # Other bulletin types: the first row that splits into exactly
            # two parts on the delimiters supplies the defendants from its
            # first part.
            # NOTE(review): the split pattern lists ':' twice as written;
            # presumably one was a full-width colon -- confirm.
            for row in toolsutil.my_split(compact,
                                          ['。', '\r\n', '\t', ',']):
                pieces = re.split(':|:|;', row)
                if len(pieces) != 2:
                    continue
                defendants = toolsutil.my_split(pieces[0], self.litiants_seps)
                cleanup_words = (fygg_conf.defendant_keyword_list +
                                 fygg_conf.plaintiff_keyword_list)
                plaintiffs, defendants = self.format_litigant(
                    plaintiffs, defendants, cleanup_words)
                break
        else:
            for row in rows:
                row_matched = False

                for defendant_regex in self.defendant_regex_list:
                    hit = toolsutil.re_findone(defendant_regex, unicode(row))
                    if not hit:
                        continue
                    # A match containing u'你' takes the names from the first
                    # row instead of from the match itself.
                    source = rows[0] if u'你' in unicode(hit) else hit
                    defendants = toolsutil.my_split(source,
                                                    self.litiants_seps)
                    # Without any plaintiff so far, the names just found are
                    # recorded as plaintiffs instead.
                    if plaintiffs == []:
                        plaintiffs, defendants = defendants, []
                    row_matched = True
                    break

                # The plain patterns are tried on the same row even when a
                # regex above already matched; a hit here replaces the
                # defendants.
                for defendant_pattern in fygg_conf.defendant_pattern_list:
                    hit = toolsutil.re_find_one(defendant_pattern,
                                                unicode(row))
                    if hit:
                        defendants = toolsutil.my_split(
                            hit, self.litiants_seps)
                        row_matched = True
                        break

                if row_matched:
                    break

            plaintiffs, defendants = self.format_litigant(
                plaintiffs, defendants, fygg_conf.litigant_replace_str_list)

        return {
            "plaintiff_list": plaintiffs,
            "defendant_list": defendants,
            "bulletin_type": bulletin_type,
        }
Beispiel #13
0
    def profit_format_extract_data(self, extract_data):
        '''Build an entity record from extracted profit-statement data.

        The input is deep-copied and never modified. On the copy:

        * ``year_month`` is scanned for a year (``self.year_mode_list``),
          a month (``self.month_mode_map``) and a quarter keyword
          (``self.quarter_map``) to build ``publish_time``;
        * ``code`` is reduced to its first run of digits, when one is found;
        * ``money_unit`` is replaced by the first entry of
          ``self.money_type_list`` it contains, or by the default unit;
        * ``info`` is removed and its ``key<suffix>`` / ``value<suffix>``
          pairs are collected into ``data_info``; labels containing a key
          of ``self.profit_mapping_conf`` are also stored under the mapped
          field name.

        :param extract_data: dict of raw extracted fields.
        :return: the enriched copy, always carrying ``data_info``,
            ``title``, ``caibao_type`` and ``publish_time``.
        '''
        month = year = publish_time = ''
        entity_data = copy.deepcopy(extract_data)

        if 'year_month' in entity_data:
            year_month = entity_data['year_month']
            for year_mode in self.year_mode_list:
                if year_mode in year_month:
                    year = year_mode
                    break

            for month_key, month_value in self.month_mode_map.items():
                if month_key in year_month:
                    month = month_value
                    break

            # A quarter keyword, when present, replaces the month found above.
            for quarter_key in self.quarter_map:
                if quarter_key in year_month:
                    month = quarter_key
                    break

        if month and year:
            publish_time = year + month

        if 'code' in entity_data:
            code = toolsutil.re_find_one(r'\d+', entity_data['code'])
            if code:
                entity_data["code"] = code

        if 'money_unit' in entity_data:
            money_unit = entity_data["money_unit"]
            for money_type in self.money_type_list:
                if money_type in unicode(money_unit):
                    entity_data["money_unit"] = money_type
                    break
            else:
                # No known unit found in the text: use the default unit.
                entity_data["money_unit"] = u'元'

        # Empty unless ``month`` ended up being a key of quarter_map.
        caibao_type = self.quarter_map.get(month, "")
        title = publish_time + caibao_type + u'利润表'

        data_info = {}
        if "info" in entity_data:
            entity_data.pop("info")

            for item in extract_data["info"]:
                for field_name, raw_label in item.items():
                    # Only "key<suffix>" fields that have a matching
                    # "value<suffix>" field form a label/amount pair.
                    if field_name[:3] != "key":
                        continue
                    value_field = "value" + field_name[3:]
                    if value_field not in item:
                        continue

                    label = raw_label.encode("utf8")
                    amount = item[value_field].encode("utf8")

                    # The unsplit pair is always kept; when both sides hold
                    # the same number (>1) of tab-separated cells, each
                    # cell pair is recorded as well.
                    pairs = [(label, amount)]
                    label_cells = label.split("\t")
                    amount_cells = amount.split("\t")
                    if (len(label_cells) > 1 and
                            len(label_cells) == len(amount_cells)):
                        pairs.extend(zip(label_cells, amount_cells))

                    for pair_label, pair_amount in pairs:
                        pair_label = pair_label.strip().replace(" ", "")
                        # Skip labels that contain a decimal number.
                        if toolsutil.re_find_one(u'\\d+\\.\\d+', pair_label):
                            continue
                        data_info[pair_label] = pair_amount
                        for key_conf, value_conf in self.profit_mapping_conf.items():
                            if key_conf in pair_label:
                                entity_data[value_conf] = pair_amount
                                break

        entity_data["data_info"] = data_info
        entity_data["title"] = title
        entity_data["caibao_type"] = caibao_type
        entity_data["publish_time"] = publish_time

        return entity_data