# coding=utf-8
import os
import re
from pyltp import Segmentor
from htmlconvert2text import convert2txt
# from anago.utils import load_data_and_labels
class tokenization_entis():
def __init__(self):
self.LTP_DATA_DIR = "/home/mm/Downloads/ltp_data_v3.4.0/"
self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
        self.segmentor = Segmentor()  # create the segmentor instance
        self.segmentor.load(self.cws_model_path)  # load the segmentation model
        self.train_res = self.read_train_res()  # read the labelled result text (brackets are normalised inside)
# self.all_co_names = self.FDDC_co_list()
def read_train_res(self):
        with open('/home/mm/Documents/aliyun-FDDC-2018-Financial-Challenge-/chongzu.train',
                  encoding='utf-8') as rf:  # assuming the training file is UTF-8 encoded
train_res = rf.read()
train_res = re.sub(r'\(', '(', train_res)
train_res = re.sub(r'\)', ')', train_res)
return train_res
def ltp_segmentor_release(self):
self.segmentor.release()
def tokenize_enti(self, path11):
texx, entity_string = convert2txt(path11)
# sentences = re.split(r'。', texx)
# sentences.sort(key=len, reverse=True)
entities = list(set(re.split(r'[\s~、,;/]', entity_string)))
entities.sort(key=len)
        # keep only the "short_name~full_name" pairs, deduplicated and sorted longest first
        entities_arrows_list = sorted({x for x in re.split(r'\s+', entity_string) if '~' in x},
                                      key=len, reverse=True)
        # pull out the result rows for this announcement and drop the trailing newline
patt_index = re.findall(r'\d{4,10}', path11)[0]
res_rows = re.findall(r'(?<=\n){}[^\n]+(?=\n)'.format(patt_index), self.train_res)
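        # Illustrative note: judging from the range(6) / tab-split handling below, each matched row is
        # assumed to hold six tab-separated columns, the first one being the announcement number taken
        # from the file path, roughly (hypothetical values):
        #   "1234567<TAB><short name><TAB><full name><TAB><col 3><TAB><col 4><TAB><col 5>"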
        # The block below tidies up train_res.
        # Walk the result rows; wherever a short name or a full name appears, add its matching counterpart.
        """The main goal is to patch the train_res file: it uses the short name in some places and the
        full name in others. So that both forms are available for labelling, a regex extracts the
        matching short/full name; values joined by '、' are split apart and extracted as labelling
        targets as well. The shorter strings are matched first here, and after segmentation the longer
        ones are merged back first. This works fine."""
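        # Hypothetical illustration (names invented) of the "~" pairs handled below:
        #   an entity_string item such as  "短称A/短称B~某某科技股份有限公司,其他内容"
        #   splits into  niki  = "短称A/短称B"  (short names, '/' or '、' separated)
        #   and          fullna = "某某科技股份有限公司,其他内容", so fullna_first = "某某科技股份有限公司"
        # The pairing loops below make sure that whichever form a result row uses, its counterpart is
        # recorded in res_paired as well, so both can be labelled later.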
        res_paired = {}  # temporary dict holding the amended train_res values
for x in range(len(res_rows)):
res_row = res_rows[x]
for y in range(6):
res_paired[str(x) + str(y)] = [re.split(r'\t', res_row)[y]]
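        # Layout sketch (inferred from the loop above): res_paired maps a "<row><column>" key to a
        # list of tag strings, e.g. res_paired['23'] starts as [<column 3 of result row 2>] and grows
        # as the paired short/full names are appended below.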
for arrow_str in entities_arrows_list:
for index, result_row in enumerate(res_rows):
for indi, res_value in enumerate(re.split(r'\t', result_row)):
if indi in [0, 1, 4, 5]:
continue
res_value_list = res_value.split('、')
for res_value_split in res_value_list:
if res_value_split in entities and res_value_split in arrow_str:
                            # find the paired short or full name and add it; values that are an equity
                            # stake / valuation method / amount are added directly, then continue
niki, fullna = re.split(r'~', arrow_str)
fullna_first = fullna.split(',')[0]
niki_split_list = re.split(r'[/、]', niki)
                            # the matching full name must satisfy three conditions: length, the comma
                            # split, and containing the short name's characters
                            if res_value_split in niki_split_list \
                                    and len(fullna_first) < 18 \
                                    and re.search(re.sub(r'(?<=[^屄\s])', r'\\s?', res_value_split), fullna_first):
                                res_paired[str(index) + str(indi)].append(fullna_first)
                            """When looking up the short name from the full name, avoid generic words:
                            公司/本公司/上市公司/发起人/申请人 and the like; candidates containing
                            those words are filtered out."""
if res_value_split == fullna_first:
                                # the matching short name must be contained in the full name, be no
                                # longer than about four characters, and not be identical to it
for niki_split in niki_split_list:
                                    if re.search(re.sub(r'(?<=[^屄\s])', r'\\s?', fullna_first), niki_split) \
                                            and not re.search(r'(^公司$|^本公司$|环境$|^上市公司$|人$|资产|标的|交易|对方|发行|对象|股东|对手|单位)',
                                                              re.sub(r'\s', '', niki_split)):
res_paired[str(index) + str(indi)].append(niki_split)
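        # Note on the regex trick used above (and again in the tagging loop below): substituting the
        # empty position after every non-space, non-屄 character with the literal text '\s?' turns a
        # plain string into a whitespace-tolerant pattern. Hypothetical two-character example 'AB':
        #   re.sub(r'(?<=[^屄\s])', r'\\s?', 'AB')  ->  'A\s?B\s?'
        # which also matches 'A B' in the space-joined output of the segmenter.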
        # iterate over every sentence of the announcement and feed each one to the model
# words_n_words = ''
# for i in sentences:
words = self.segmentor.segment(entity_string)
words = ' '.join(words)
        # segmentation should use a better strategy with longer spans, avoiding sentences that are too
        # short and the wasted work of looping over them repeatedly
        # # The block below merges all target entities back together, first splitting off items like
        # # '55%股权' (55% equity),
        # for ent in entities:
        #     # remove the spaces inside every entity occurring in words, using two layers of sub
        #     # (regexes really do need plenty of comments)
        #     """ re.sub(r'(?<=\w)(?=\w)', r'\\s?', ent) inserts '\s?' between every pair of characters
        #     of the entity, so the pattern matches the sequence even when the segmenter has inserted
        #     spaces between the characters (the segmented text is joined with spaces). Wherever words
        #     contains such a sequence, it is replaced with the space-free form."""
        #     if len(ent) > 1:
        #         if not re.search(r'([\d.]+%的?(?:股权|股份|权益))', ent):  # no equity keyword: build the space-tolerant pattern directly
        #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\\s?', ent)
        #         elif len(ent) > 7:  # equity keyword present and the string is long: strip the stake part and keep the leading entity
        #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\\s?', re.sub(r'的?[\d.]+%的?(股权|股份|权益)', '', ent))
        #         else:
        #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\\s?', ent)
        #         # the next line re-merges every entity from the list that the segmenter may have split,
        #         # putting it on its own line; used at test time
        #         words = re.sub(r'{}'.format(patt_ent), ' ' + ent + ' ', words)
        # # then turn every space into a newline so that words becomes one token per line
        # words = re.sub(r'\s', '\n', words)
        # words = re.sub(r'\n+', '\n', words)
        """Suffix every token in words that is a result value with a tab plus the result column index;
        every other token gets a tab plus the letter 'o'. The intent is sound: it makes the targets
        easier for the model to find, and the model no longer has to decide where an entity starts and
        ends. But the regex for that is too hard; I cannot extract every suitable entity, and missing
        labels would result, so that part of the task is handed back to the model after all."""
# for x in range(len(res_rows)):
# for y in range(6):
# index = str(x)+str(y)
# tags_list = res_paired[index]
for index, tags_list in res_paired.items():
            # each inner list may hold one or more members; loop over them so that the '、'-separated
            # values are labelled as well, without disturbing the entity strings that are already merged
for sub_res in sorted(tags_list, key=len, reverse=True):
if not index.endswith('0') and len(sub_res) > 1:
                    patt_sub_res = re.sub(r'(?<=[^屄\s])', r'\\s?', sub_res)
if re.search(r'{}'.format(patt_sub_res), words):
spliter = re.findall(patt_sub_res, words)[0]
words_split_list = re.split(spliter, words)
spliter_tagged = re.sub(r'\s', '屄{}'.format(index[1]), spliter)
words = spliter_tagged.join(words_split_list)
# print(words)
# words=re.sub(patt_sub_res, sub_res)
# words= re.sub(r'{}(?=\n)'.format(sub_res), '\n{}\t{}\n'.format(sub_res, index), words)
        # the train_res labels are in place; now tag everything else as 'o' by appending a tab and 'o'
        # to the lines that do not end in a digit
words = re.sub(r'\s', '\to\n', words)
words = re.sub(r'(?<=屄\d)', '\n', words)
words = re.sub(r'屄', '\t', words)
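        # Rough sketch of the final layout (hypothetical tokens; an entity from result column 2 that
        # the segmenter split into two tokens, everything else tagged 'o'):
        #   some_token<TAB>o
        #   entity_part1<TAB>2
        #   entity_part2<TAB>2
        #   another_token<TAB>o
        # i.e. one token per line in a CoNLL-style "token<TAB>tag" format, ready for a sequence tagger.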
return words
# words_n_words += words
#
# # print(words)
# with open('/home/mm/FDDC_datasets_dir/tokenized_datasets_for_anago/chongzu/' + res_paired['00'][0] + '.txt',
# 'w') as af:
# af.write(words)
# print(path11.split("/")[-1])
# # print(words)
# # print("%%%%%%%%%%%%%%%%%%%%%%%%this is {}".format(i))
if __name__ == "__main__":
path_in = "/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/"
path_out = '/home/mm/FDDC_datasets_dir/tokenized_datasets_for_anago/chongzu/'
tnt = tokenization_entis()
done_list = os.listdir(path_out)
comming_list = os.listdir(path_in)
for i in comming_list:
dest_name = i.split('.')[0] + '.txt'
if dest_name in done_list:
continue
else:
            print("now processing {}".format(i))
            try:
                words = tnt.tokenize_enti(path_in + i)
                # write the tagged output so the done_list check above has files to skip
                # (assumption: the output file name mirrors the input name, as dest_name implies)
                with open(path_out + dest_name, 'w', encoding='utf-8') as wf:
                    wf.write(words)
            except Exception as e:
                print("something went wrong while processing {}: {}".format(i, e))
                continue
tnt.ltp_segmentor_release()
# def tokenit(path):
#     path1 = path
#     tnt = tokenization_entis()
#     for htm in os.listdir(path1):
#         tnt.tokenize_enti(path1 + htm)
# # multiprocessing variant, to run more efficiently on Aliyun
# if __name__=="__main__":
# import multiprocessing as mp
# process_jobs = []
# path = "/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/"
# for i in range(5):
#         p = mp.Process(target=tokenit, args=(path,))  # pass the callable and its args; do not call it here
# process_jobs.append(p)
# p.start()
# p.join()