class HeTongExtractor(object):
    def __init__(self):
        self.html_parser = Parser()
        self.config = None
        self.name = {}
        self.month = None
        self.money = None
        self.shangxian = None
        self.xiaxian = None
        # with codecs.open(FLAGS.resource.config_file, encoding='utf-8', mode='r') as fp:  # open config
        #     self.config = json.loads(fp.read())
        # self.table_dict_field_pattern_dict = {}
        # for table_dict_field in self.config['table_dict']['fields']:
        #     self.table_dict_field_pattern_dict[table_dict_field['fieldName']] = \
        #         TableDictFieldPattern(field_name=table_dict_field['fieldName'],
        #                               convert_method=table_dict_field['convertMethod'],
        #                               pattern=table_dict_field['pattern'],
        #                               col_skip_pattern=table_dict_field['colSkipPattern'] if 'colSkipPattern' in table_dict_field else None,
        #                               row_skip_pattern=table_dict_field['rowSkipPattern'] if 'rowSkipPattern' in table_dict_field else None)

    def extract_from_html_dir(self, html_dir_path):
        map = {
            "公告id": [],
            "甲方": [],
            "乙方": [],
            "项目名称": [],
            "合同名称": [],
            "合同金额上限": [],
            "合同金额下限": [],
            "联合体成员": []
        }
        config = load_config(FLAGS.resource.config_file2)
        with open(FLAGS.resource.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        with tf.Session(config=tf_config) as sess:
            model = create_model(sess, Model, FLAGS.resource.ckpt_dir,
                                 load_word2vec, config, id_to_char, False)
            trans = model.trans.eval()
            for html_id in tqdm(os.listdir(html_dir_path)):
                self._extract_from_html_dir(html_dir_path, html_id, map, sess, trans,
                                            model, id_to_tag, tag_to_id, char_to_id)
        dataframe = pd.DataFrame(data=map,
                                 columns=["公告id", "甲方", "乙方", "项目名称", "合同名称",
                                          "合同金额上限", "合同金额下限", "联合体成员"],
                                 dtype=None, copy=False)
        if os.path.exists('ht_result.csv'):
            os.remove('ht_result.csv')
        dataframe.to_csv("ht_result.csv", encoding="utf_8_sig")

    def _extract_from_html_dir(self, html_dir_path, html_id, map, sess, trans, model,
                               id_to_tag, tag_to_id, char_to_id):
        record_list = []
        for record in self.extract(os.path.join(html_dir_path, html_id), html_id, sess,
                                   trans, model, id_to_tag, tag_to_id, char_to_id):
            if record is not None and \
                    record.yifang is not None:
                record_list.append("%s\t%s" % (html_id[:-5], record.to_result()))
        for record in record_list:
            records = record.split('\t')
            map['公告id'].append(records[0])
            map['甲方'].append(records[1])
            map['乙方'].append(records[2])
            map['项目名称'].append(records[3])
            map['合同名称'].append(records[4])
            map['合同金额上限'].append(records[5])
            map['合同金额下限'].append(records[6])
            map['联合体成员'].append(records[7])

    def extract(self, html_dir_path, html_id, sess, trans, model, id_to_tag, tag_to_id,
                char_to_id):
        # Note: html_dir_path here is actually the full path to a single HTML file.
        self.shangxian = self.xiaxian = None
        rs = []
        # 1. Parse the non-table (other) content
        print(html_dir_path)
        self._extract_shangxiaxian(self.html_parser.parse_content(html_dir_path))
        # 2. Parse ...
        map = return_html_entity(html_id, sess, trans, model, id_to_tag, tag_to_id,
                                 char_to_id)
        for file, res in map.items():
            for line, gg in res.items():
                rs.append(HeTongRecord(gg["jiafang"], gg["yifang"], gg["xiangmu"],
                                       gg["hetong"], self.shangxian, self.xiaxian))
        return rs

    def _extract_shangxiaxian(self, paragraphs):
        for para in paragraphs:
            targets = re.finditer(
                r"(中标|合同)总?(价|金额|额)(总计|合计|:|:)?(约为|为)?(人民币|US\$)?(共计)?约?(?P<num>\d{1,15})",
                para)
            # targets = re.finditer(r"(中标|合同)总?(价|金额|额)(总计|合计|:|:)?为?(人民币)?(?P<num>\d{1,15})(元)", para)
            for target in targets:
                print(target)
                num = target.group('num')
                if num != "0" and num != "1":
                    self.xiaxian = self.shangxian = num
                    if num == "8340":
                        self.xiaxian = self.shangxian = "83400000"
                    break
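# Illustrative sketch (not part of the original pipeline): how the amount regex used by
# _extract_shangxiaxian behaves on a hypothetical sentence. The sample text and the helper
# name below are assumptions added for demonstration only.
def _demo_shangxiaxian_regex():
    import re
    sample = "本次中标金额为人民币83400000元。"  # hypothetical announcement sentence
    pattern = r"(中标|合同)总?(价|金额|额)(总计|合计|:|:)?(约为|为)?(人民币|US\$)?(共计)?约?(?P<num>\d{1,15})"
    match = re.search(pattern, sample)
    # The named group 'num' captures only the bare digit run, here "83400000";
    # the extractor assigns that value to both shangxian and xiaxian (upper/lower bound).
    return match.group("num") if match else None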
class DingZengExtractor(object):
    def __init__(self):
        self.html_parser = Parser()
        self.config = None
        self.name = {}
        self.month = None
        self.money = None
        with codecs.open(FLAGS.resource.config_file, encoding='utf-8', mode='r') as fp:  # open config
            self.config = json.loads(fp.read())
        self.table_dict_field_pattern_dict = {}
        for table_dict_field in self.config['table_dict']['fields']:
            self.table_dict_field_pattern_dict[table_dict_field['fieldName']] = \
                TableDictFieldPattern(field_name=table_dict_field['fieldName'],
                                      convert_method=table_dict_field['convertMethod'],
                                      pattern=table_dict_field['pattern'],
                                      col_skip_pattern=table_dict_field['colSkipPattern'] if 'colSkipPattern' in table_dict_field else None,
                                      row_skip_pattern=table_dict_field['rowSkipPattern'] if 'rowSkipPattern' in table_dict_field else None)

    def extract_from_html_dir(self, html_dir_path):
        map = {"公告id": [], "增发对象": [], "增发数量": [], "增发金额": [], "锁定期": [], "认购方式": []}
        for html_id in tqdm(os.listdir(html_dir_path)):
            self._extract_from_html_dir(html_dir_path, html_id, map)
        dataframe = pd.DataFrame(data=map,
                                 columns=["公告id", "增发对象", "增发数量", "增发金额", "锁定期", "认购方式"],
                                 dtype=None, copy=False)
        if os.path.exists('dz_result.csv'):
            os.remove('dz_result.csv')
        dataframe.to_csv("dz_result.csv", encoding="utf_8_sig")

    def _extract_from_html_dir(self, html_dir_path, html_id, map):
        record_list = []
        for record in self.extract(os.path.join(html_dir_path, html_id)):
            if record is not None and \
                    record.name is not None:
                print("record ", record.to_result())
                record_list.append("%s\t%s" % (html_id[:-5], record.to_result()))
        for record in record_list:
            records = record.split('\t')
            map['公告id'].append(records[0])
            map['增发对象'].append(records[1])
            map['增发数量'].append(records[2])
            map['增发金额'].append(records[3])
            map['锁定期'].append(records[4])
            map['认购方式'].append(records[5])
        return record_list

    def extract(self, html_file_path):
        rs = []
        # 1. Parse plain-text paragraphs (currently commented out)
        # paragraphs = self.html_parser.parse_text(html_file_path)
        # self._extract_money(paragraphs)
        # self._extract_time(paragraphs)
        # rs_paragraphs = self._extract_from_paragraphs(paragraphs)
        # 2. Parse tables
        for table_dict in self.html_parser.parse_table(html_file_path):
            rs.extend(self._extract_from_table_dict(table_dict))
        return rs

    def _extract_money(self, paragraphs):
        for para in paragraphs:
            targets = re.findall(r"(以)?现金(方式)?.{0,5}?认购", para)
            if len(targets) > 0:
                self.money = "现金"
                break

    def _extract_time(self, paragraphs):
        for para in paragraphs:
            targets = re.finditer(r"(本次)?(发行结束之日起|限售期为)[,,]?(?P<month>.{1,3}?)个月(内不得转让)?", para)
            for target in targets:
                print(target)
                self.month = target.group('month')
                if self.month == "三十六":
                    self.month = '36'
                if self.month == "十二":
                    self.month = '12'
                if self.month == "六十":
                    self.month = '60'
                return

    def _extract_from_table_dict(self, table_dict):
        # check none
        rs = []
        if table_dict is None or len(table_dict) <= 0:
            return rs
        # 1. Assume the first row is the header and use the configured rules to decide
        #    which field each column holds. A column is accepted only when
        #    is_match_pattern is True and is_match_col_skip_pattern is False.
        head_row = table_dict[0]
        col_length = len(head_row)
        row_length = len(table_dict)
        field_col_dict = {}
        skip_row_set = set()
        danwei = {'shuliang': '', 'jine': ''}  # unit suffixes for the quantity/amount columns
        print(" head", head_row)
        for i in range(col_length):
            text = head_row[i]
            for (field_name, table_dict_field_pattern) in self.table_dict_field_pattern_dict.items():
                col_good, _danwei = table_dict_field_pattern.is_match_pattern(text)
                if col_good and not table_dict_field_pattern.is_match_col_skip_pattern(text):
                    if field_name not in field_col_dict:
                        field_col_dict[field_name] = i
                        if field_name in ["jine", "shuliang"]:
                            danwei[field_name] = _danwei if _danwei else ""
                    # Scan this column's values row by row; if a cell matches
                    # row_skip_pattern, drop the whole row.
                    for j in range(1, row_length):
                        try:
                            text = table_dict[j][i]
                            if table_dict_field_pattern.is_match_row_skip_pattern(text):
                                skip_row_set.add(j)
                        except KeyError:
                            pass
        if len(field_col_dict) <= 0:
            return rs
        # 2. Walk every valid row and build a record
        for row_index in range(1, row_length):
            if row_index in skip_row_set:
                continue
            record = DingZengRecord(None, None, None, None, None)
            for (field_name, col_index) in field_col_dict.items():
                try:
                    text = table_dict[row_index][col_index]
                    print(" text", text)
                    if field_name == 'duixiang':
                        record.name = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                    elif field_name == 'shuliang':
                        record.shuliang = self.table_dict_field_pattern_dict.get(field_name).convert(
                            normalize(text + danwei["shuliang"] + "股"))
                    elif field_name == 'jine':
                        record.jine = self.table_dict_field_pattern_dict.get(field_name).convert(
                            normalize(text + danwei["jine"] + "元"))
                    elif field_name == 'rengoufangshi':
                        record.money = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                        if not record.money:
                            record.money = self.money
                    elif field_name == 'suodingqi':
                        record.time = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                        if not record.time:
                            record.time = self.month
                    else:
                        pass
                except KeyError:
                    pass
            rs.append(record)
        return rs
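# Sketch of the configuration shape that DingZengExtractor.__init__ reads from
# config['table_dict']['fields']. The key names (fieldName, convertMethod, pattern,
# colSkipPattern, rowSkipPattern) and the field names come from the code above; the
# concrete pattern and convertMethod values below are placeholders, not the real config.
_EXAMPLE_TABLE_DICT_CONFIG = {
    "table_dict": {
        "fields": [
            {"fieldName": "duixiang", "convertMethod": "text", "pattern": "发行对象|增发对象"},
            {"fieldName": "shuliang", "convertMethod": "number", "pattern": "认购数量|获配股数",
             "rowSkipPattern": "合计|小计"},
            {"fieldName": "jine", "convertMethod": "number", "pattern": "认购金额|获配金额",
             "colSkipPattern": "比例"},
            {"fieldName": "rengoufangshi", "convertMethod": "text", "pattern": "认购方式"},
            {"fieldName": "suodingqi", "convertMethod": "text", "pattern": "锁定期|限售期"},
        ]
    }
}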
# Alternative HeTongExtractor variant (regex- and return_html_entity-based).
class HeTongExtractor(object):
    def __init__(self):
        self.html_parser = Parser()
        self.config = None
        self.name = {}
        self.month = None
        self.money = None
        with codecs.open(FLAGS.resource.config_file, encoding='utf-8', mode='r') as fp:  # open config
            self.config = json.loads(fp.read())
        self.table_dict_field_pattern_dict = {}
        for table_dict_field in self.config['table_dict']['fields']:
            self.table_dict_field_pattern_dict[table_dict_field['fieldName']] = \
                TableDictFieldPattern(field_name=table_dict_field['fieldName'],
                                      convert_method=table_dict_field['convertMethod'],
                                      pattern=table_dict_field['pattern'],
                                      col_skip_pattern=table_dict_field['colSkipPattern'] if 'colSkipPattern' in table_dict_field else None,
                                      row_skip_pattern=table_dict_field['rowSkipPattern'] if 'rowSkipPattern' in table_dict_field else None)

    def extract(self, html_file_path):
        rs = []
        # 1. Parse plain-text paragraphs
        paragraphs = self.html_parser.parse_text(html_file_path)
        self._extract_shangxiaxian(paragraphs)
        # self._extract_time(paragraphs)
        # rs_paragraphs = self._extract_from_paragraphs(paragraphs)
        # 1. Parse everything
        map = return_html_entity(FLAGS)
        for file, res in map.items():
            for line, gg in res.items():
                rs.append(HeTongRecord(file, gg["jiafang"], gg["yifang"], gg["hetong"], gg["xiangmu"]))
        # 2. Parse tables (currently commented out)
        # for table_dict in self.html_parser.parse_table(html_file_path):
        #     rs.extend(self._extract_from_table_dict(table_dict))
        return rs

    def _extract_shangxiaxian(self, paragraphs):
        for para in paragraphs:
            targets = re.finditer(
                r"(中标|合同)总?(价|金额|额)(总计|合计|:|:)?为?(人民币)?(?P<num>.{1,15})(亿|万)?元", para)
            for target in targets:
                print(target)
                self.shangxian = target.group('num')
                self.xiaxian = target.group('num')
                return

    # def _extract_money(self, paragraphs):
    #     for para in paragraphs:
    #         targets = re.findall(r"(以)?现金(方式)?.{0,5}?认购", para)
    #         if len(targets) > 0:
    #             self.money = "现金"
    #             break

    # def _extract_time(self, paragraphs):
    #     for para in paragraphs:
    #         targets = re.finditer(r"(本次)?(发行结束之日起|限售期为)[,,]?(?P<month>.{1,3}?)个月(内不得转让)?", para)
    #         for target in targets:
    #             print(target)
    #             self.month = target.group('month')
    #             if self.month == "三十六":
    #                 self.month = '36'
    #             if self.month == "十二":
    #                 self.month = '12'
    #             if self.month == "六十":
    #                 self.month = '60'
    #             return

    def _extract_from_table_dict(self, table_dict):
        # check none
        rs = []
        if table_dict is None or len(table_dict) <= 0:
            return rs
        # 1. Assume the first row is the header and use the configured rules to decide
        #    which field each column holds. A column is accepted only when
        #    is_match_pattern is True and is_match_col_skip_pattern is False.
        head_row = table_dict[0]
        col_length = len(head_row)
        row_length = len(table_dict)
        field_col_dict = {}
        skip_row_set = set()
        danwei = {'shuliang': '', 'jine': ''}  # unit suffixes for the quantity/amount columns
        print(" head", head_row)
        for i in range(col_length):
            text = head_row[i]
            for (field_name, table_dict_field_pattern) in self.table_dict_field_pattern_dict.items():
                col_good, _danwei = table_dict_field_pattern.is_match_pattern(text)
                if col_good and not table_dict_field_pattern.is_match_col_skip_pattern(text):
                    if field_name not in field_col_dict:
                        field_col_dict[field_name] = i
                        if field_name in ["jine", "shuliang"]:
                            danwei[field_name] = _danwei if _danwei else ""
                    # Scan this column's values row by row; if a cell matches
                    # row_skip_pattern, drop the whole row.
                    for j in range(1, row_length):
                        try:
                            text = table_dict[j][i]
                            if table_dict_field_pattern.is_match_row_skip_pattern(text):
                                skip_row_set.add(j)
                        except KeyError:
                            pass
        if len(field_col_dict) <= 0:
            return rs
        # 2. Walk every valid row and build a record
        for row_index in range(1, row_length):
            if row_index in skip_row_set:
                continue
            record = DingZengRecord(None, None, None, None, None)
            for (field_name, col_index) in field_col_dict.items():
                try:
                    text = table_dict[row_index][col_index]
                    if field_name == 'duixiang':
                        record.name = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                    elif field_name == 'shuliang':
                        record.shuliang = self.table_dict_field_pattern_dict.get(field_name).convert(
                            normalize(text + danwei["shuliang"] + "股"))
                    elif field_name == 'jine':
                        record.jine = self.table_dict_field_pattern_dict.get(field_name).convert(
                            normalize(text + danwei["jine"] + "元"))
                    elif field_name == 'rengoufangshi':
                        record.money = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                        if not record.money:
                            record.money = self.money
                    elif field_name == 'suodingqi':
                        record.time = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                        if not record.time:
                            record.time = self.month
                    else:
                        pass
                except KeyError:
                    pass
            rs.append(record)
        return rs

    def _extract_from_paragraphs(self, paragraphs):
        self.clearComAbbrDict()
        change_records = []
        # change_after_records = []
        record_list = []
        for para in paragraphs:
            change_records_para, change_after_records_para = self.__extract_from_paragraph(para)
            change_records += change_records_para
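# Minimal sketch of the interface the extractors above assume from TableDictFieldPattern.
# This is NOT the real implementation (which lives elsewhere in the repo); it only mirrors
# the constructor arguments and the methods/return shapes used by _extract_from_table_dict:
# is_match_pattern returns a (matched, unit) pair, the skip checks return booleans, and
# convert normalizes a cell value. The returned unit of None is an assumption here.
import re

class _TableDictFieldPatternSketch(object):
    def __init__(self, field_name, convert_method, pattern,
                 col_skip_pattern=None, row_skip_pattern=None):
        self.field_name = field_name
        self.convert_method = convert_method
        self.pattern = re.compile(pattern)
        self.col_skip_pattern = re.compile(col_skip_pattern) if col_skip_pattern else None
        self.row_skip_pattern = re.compile(row_skip_pattern) if row_skip_pattern else None

    def is_match_pattern(self, text):
        # The callers unpack a 2-tuple and treat the second element as an optional unit
        # string pulled from the header (e.g. "万" in "数量(万股)"); this sketch returns None.
        return bool(self.pattern.search(text)), None

    def is_match_col_skip_pattern(self, text):
        return bool(self.col_skip_pattern.search(text)) if self.col_skip_pattern else False

    def is_match_row_skip_pattern(self, text):
        return bool(self.row_skip_pattern.search(text)) if self.row_skip_pattern else False

    def convert(self, text):
        # The real convert() applies convert_method-specific normalization; passthrough here.
        return text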
class ZengJianChiExtractor(object):
    def __init__(self, ner_model_dir_path, ner_blacklist_file_path):
        self.html_parser = Parser()
        self.config = None
        self.ner_tagger = NERTagger.NERTagger(ner_model_dir_path, ner_blacklist_file_path)
        self.com_abbr_dict = {}
        self.com_full_dict = {}
        self.com_abbr_ner_dict = {}
        with codecs.open(FLAGS.resource.config_file, encoding='utf-8', mode='r') as fp:
            self.config = json.loads(fp.read())
        self.table_dict_field_pattern_dict = {}
        for table_dict_field in self.config['table_dict']['fields']:
            field_name = table_dict_field['fieldName']
            if field_name is None:
                continue
            convert_method = table_dict_field['convertMethod']
            if convert_method is None:
                continue
            pattern = table_dict_field['pattern']
            if pattern is None:
                continue
            col_skip_pattern = None
            if 'colSkipPattern' in table_dict_field:
                col_skip_pattern = table_dict_field['colSkipPattern']
            row_skip_pattern = None
            if 'rowSkipPattern' in table_dict_field:
                row_skip_pattern = table_dict_field['rowSkipPattern']
            self.table_dict_field_pattern_dict[field_name] = \
                TableDictFieldPattern(field_name=field_name,
                                      convert_method=convert_method,
                                      pattern=pattern,
                                      col_skip_pattern=col_skip_pattern,
                                      row_skip_pattern=row_skip_pattern)

    def extract_from_html_dir(self, html_dir_path):
        map = {
            "公告id": [],
            "股东全称": [],
            "股东简称": [],
            "变动截止日期": [],
            "变动价格": [],
            "变动数量": [],
            "变动后持股数": [],
            "变动后持股比例": []
        }
        for html_id in tqdm(os.listdir(html_dir_path)):
            self._extract_from_html_dir(html_dir_path, html_id, map)
        dataframe = pd.DataFrame(data=map,
                                 columns=["公告id", "股东全称", "股东简称", "变动截止日期",
                                          "变动价格", "变动数量", "变动后持股数", "变动后持股比例"],
                                 dtype=None, copy=False)
        if os.path.exists('zjc_result.csv'):
            os.remove('zjc_result.csv')
        dataframe.to_csv("zjc_result.csv", encoding="utf_8_sig")

    def _extract_from_html_dir(self, html_dir_path, html_id, map):
        record_list = []
        for record in self.extract(os.path.join(html_dir_path, html_id)):
            if record is not None and \
                    record.shareholderFullName is not None and \
                    len(record.shareholderFullName) > 1 and \
                    record.finishDate is not None and \
                    len(record.finishDate) >= 6 and \
                    record.shareholderFullName not in ["股东名称", "董事、监事、高级管理人员本次增持情况", "久立集团增持前后持股明细", "久立集团以及董事、监事、高级管理人员本次增持总体情况"]:
                record_list.append("%s\t%s" % (html_id[:-5], record.to_result()))
        for record in record_list:
            records = record.split('\t')
            map['公告id'].append(records[0])
            map['股东全称'].append(records[1])
            map['股东简称'].append(records[2])
            map['变动截止日期'].append(records[3])
            map['变动价格'].append(records[4])
            map['变动数量'].append(records[5])
            map['变动后持股数'].append(records[6])
            map['变动后持股比例'].append(records[7])
        return record_list

    def extract(self, html_file_path):
        # 1. Parse the table dicts
        rs = []
        paragraphs = self.html_parser.parse_content(html_file_path)
        rs_paragraphs = self._extract_from_paragraphs(paragraphs)
        for table_dict in self.html_parser.parse_table(html_file_path):
            rs_table = self._extract_from_table_dict(table_dict)
            if len(rs_table) > 0:
                if len(rs) > 0:
                    self.__mergeRecord(rs, rs_table)
                    break
                else:
                    rs.extend(rs_table)
        # 2. If no table dict yielded records, fall back to the text-based records
        if len(rs) <= 0:
            return rs_paragraphs
        else:
            for record in rs:
                full_company_name, abbr_company_name = self.getShareholder(record.shareholderFullName)
                if full_company_name is not None and len(full_company_name) > 0 \
                        and abbr_company_name is not None and len(abbr_company_name) > 0:
                    record.shareholderFullName = full_company_name
                    record.shareholderShortName = abbr_company_name
                else:
                    record.shareholderShortName = record.shareholderFullName
            return rs

    def _extract_from_table_dict(self, table_dict):
        # print(table_dict)
        rs = []
        if table_dict is None or len(table_dict) <= 0:
            return rs
        row_length = len(table_dict)
        field_col_dict = {}
        skip_row_set = set()
        # 1. Assume the first row is the header and use the configured rules to decide
        #    which field each column holds. A column is accepted only when
        #    is_match_pattern is True and is_match_col_skip_pattern is False.
        head_row = table_dict[0]
        col_length = len(head_row)
        danwei = {'sharePrice': '', 'shareNum': ''}
        for i in range(col_length):
            text = head_row[i]
            for (field_name, table_dict_field_pattern) in self.table_dict_field_pattern_dict.items():
                col_good, _danwei = table_dict_field_pattern.is_match_pattern(text)
                if col_good and not table_dict_field_pattern.is_match_col_skip_pattern(text):
                    if field_name not in field_col_dict:
                        field_col_dict[field_name] = i
                        if field_name in ["sharePrice", "shareNum"]:
                            danwei[field_name] = _danwei if _danwei else ""
                            if _danwei is not None:
                                print(field_name, _danwei)
                    for j in range(1, row_length):
                        try:
                            text = table_dict[j][i]
                            if table_dict_field_pattern.is_match_row_skip_pattern(text):
                                skip_row_set.add(j)
                        except KeyError:
                            pass
        if len(field_col_dict) <= 0:
            return rs
        # 2. Walk every valid row and build a record
        for row_index in range(1, row_length):
            if row_index in skip_row_set:
                continue
            record = ZengJianChiRecord(None, None, None, None, None, None, None)
            for (field_name, col_index) in field_col_dict.items():
                try:
                    text = table_dict[row_index][col_index]
                    if field_name == 'shareholderFullName':
                        record.shareholderFullName = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                    elif field_name == 'finishDate':
                        record.finishDate = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                    elif field_name == 'sharePrice':
                        record.sharePrice = self.table_dict_field_pattern_dict.get(field_name).convert(
                            normalize(text + danwei["sharePrice"] + "元"))
                    elif field_name == 'shareNum':
                        record.shareNum = self.table_dict_field_pattern_dict.get(field_name).convert(
                            normalize(text + danwei["shareNum"] + "股"))
                    elif field_name == 'shareNumAfterChg':
                        record.shareNumAfterChg = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                    elif field_name == 'sharePcntAfterChg':
                        record.sharePcntAfterChg = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                    else:
                        pass
                except KeyError:
                    pass
            rs.append(record)
        return rs

    def _extract_from_paragraphs(self, paragraphs):
        self.clearComAbbrDict()
        change_records = []
        change_after_records = []
        record_list = []
        for para in paragraphs:
            change_records_para, change_after_records_para = self.__extract_from_paragraph(para)
            change_records += change_records_para
            change_after_records += change_after_records_para
        self.__mergeRecord(change_records, change_after_records)
        for record in change_records:
            record_list.append(record)
        return record_list

    def __extract_from_paragraph(self, paragraph):
        tag_res = self.ner_tagger.ner(paragraph, self.com_abbr_ner_dict)
        tagged_str = tag_res.get_tagged_str()
        if self.___extract_company_name(tagged_str) > 0:
            tag_res = self.ner_tagger.ner(paragraph, self.com_abbr_ner_dict)
            tagged_str = tag_res.get_tagged_str()
        # extract
        change_records = self.___extract_change(tagged_str)
        change_after_records = self.___extract_change_after(tagged_str)
        return change_records, change_after_records

    def ___extract_company_name(self, paragraph):
        # print(paragraph)
        targets = re.finditer(
            r"<org>(?P<com>.{1,28}?)</org>[((].{0,5}?简称:?[\"“](?P<com_abbr>.{2,6}?)[\"”][))]",
            paragraph)
        size_before = len(self.com_abbr_ner_dict)
        for target in targets:
            com_abbr = target.group("com_abbr")
            com_name = target.group("com")
            if com_abbr != None and com_name != None:
                self.com_abbr_dict[com_abbr] = com_name
                self.com_full_dict[com_name] = com_abbr
                self.com_abbr_ner_dict[com_abbr] = "Ni"
        return len(self.com_abbr_ner_dict) - size_before

    def ___extract_change(self, paragraph):
        records = []
        targets = re.finditer(
            r"(出售|减持|增持|买入)了?[^,。,:;!??]*?(股票|股份).{0,30}?<num>(?P<share_num>.{1,20}?)</num>股",
            paragraph)
        for target in targets:
            share_num = target.group("share_num")
            start_pos = target.start()
            end_pos = target.end()
            # Find the company
            pat_com = re.compile(r"<org>(.*?)</org>")
            m_com = pat_com.findall(paragraph, 0, end_pos)
            shareholder = ""
            if m_com != None and len(m_com) > 0:
                shareholder = m_com[-1]
            else:
                pat_person = re.compile(r"<person>(.*?)</person>")
                m_person = pat_person.findall(paragraph, 0, end_pos)
                if m_person != None and len(m_person) > 0:
                    shareholder = m_person[-1]
            # Find the date
            pat_date = re.compile(r"<date>(.*?)</date>")
            m_date = pat_date.findall(paragraph, 0, end_pos)
            change_date = ""
            if m_date != None and len(m_date) > 0:
                change_date = m_date[-1]
            # Find the change price
            pat_price = re.compile(
                r"(均价|平均(增持|减持|成交)?(价格|股价))(:|:)?<num>(?P<share_price>.*?)</num>")
            m_price = pat_price.search(paragraph, start_pos)
            share_price = ""
            if m_price != None:
                share_price = m_price.group("share_price")
            if shareholder == None or len(shareholder) == 0:
                continue
            full_name, short_name = self.getShareholder(shareholder)
            records.append(
                ZengJianChiRecord(full_name, short_name, change_date, share_price, share_num, "", ""))
        return records

    def ___extract_change_after(self, paragraph):
        records = []
        targets = re.finditer(
            r"(增持后|减持后|变动后).{0,30}?持有.{0,30}?<num>(?P<share_num_after>.*?)</num>(股|万股|百万股|亿股)",
            paragraph)
        for target in targets:
            share_num_after = target.group("share_num_after")
            start_pos = target.start()
            end_pos = target.end()
            # Find the company
            pat_com = re.compile(r"<org>(.*?)</org>")
            m_com = pat_com.findall(paragraph, 0, end_pos)
            shareholder = ""
            if m_com != None and len(m_com) > 0:
                shareholder = m_com[-1]
            else:
                pat_person = re.compile(r"<person>(.*?)</person>")
                m_person = pat_person.findall(paragraph, 0, end_pos)
                if m_person != None and len(m_person) > 0:
                    shareholder = m_person[-1]
            # Find the post-change shareholding percentage
            pat_percent_after = re.compile(r"<percent>(?P<share_percent>.*?)</percent>")
            m_percent_after = pat_percent_after.search(paragraph, start_pos)
            share_percent_after = ""
            if m_percent_after != None:
                share_percent_after = m_percent_after.group("share_percent")
            if shareholder == None or len(shareholder) == 0:
                continue
            full_name, short_name = self.getShareholder(shareholder)
            records.append(
                ZengJianChiRecord(full_name, short_name, "", "", "", share_num_after, share_percent_after))
        return records

    def __mergeRecord(self, changeRecords, changeAfterRecords):
        if len(changeRecords) == 0 or len(changeAfterRecords) == 0:
            return
        last_record = None
        for record in changeRecords:
            if last_record != None and record.shareholderFullName != last_record.shareholderFullName:
                self.___mergeChangeAfterInfo(last_record, changeAfterRecords)
            last_record = record
        self.___mergeChangeAfterInfo(last_record, changeAfterRecords)

    def ___mergeChangeAfterInfo(self, changeRecord, changeAfterRecords):
        for record in changeAfterRecords:
            if record.shareholderFullName == changeRecord.shareholderFullName:
                changeRecord.shareNumAfterChg = record.shareNumAfterChg
                changeRecord.sharePcntAfterChg = record.sharePcntAfterChg

    def clearComAbbrDict(self):
        self.com_abbr_dict = {}
        self.com_full_dict = {}
        self.com_abbr_ner_dict = {}

    def getShareholder(self, shareholder):
        # Normalize between company full name and abbreviation
        if shareholder in self.com_full_dict:
            return shareholder, self.com_full_dict.get(shareholder, "")
        if shareholder in self.com_abbr_dict:
            return self.com_abbr_dict.get(shareholder, ""), shareholder
        # A natural-person shareholder needs no abbreviation
        return shareholder, ""
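# Hedged usage sketch: how these extractors are driven end to end. The directory and model
# paths below are hypothetical placeholders; the real values come from FLAGS.resource and
# the project's data layout, which are defined outside this file.
if __name__ == "__main__":
    # Announcement HTML files for the shareholding-change task (paths are assumptions).
    zjc_html_dir = "./data/zengjianchi/html"
    ner_model_dir = "./models/ner"
    ner_blacklist = "./resources/ner_blacklist.txt"

    # ZengJianChiExtractor writes zjc_result.csv; HeTongExtractor and DingZengExtractor
    # follow the same pattern and write ht_result.csv / dz_result.csv respectively.
    extractor = ZengJianChiExtractor(ner_model_dir, ner_blacklist)
    extractor.extract_from_html_dir(zjc_html_dir)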