def init_train_test_data(path):
    """
    Extract all annotated data as dicts of the form
    {'sent': original sentence, 'entities': [(entity, type), ...]}.
    :param path: path to the annotated text file
    :return: list of {'sent': ..., 'entities': ...} dicts
    """
    pp = []
    paras = []
    result = []
    with open(path, 'r', encoding='utf-8') as fp:
        for line in fp:
            paras += policy_util.cut_sentence(line)
    for para in paras:
        # Strip annotation markers and Latin/digit noise from the sentence.
        p = re.sub(r"[A-Za-z0-9$#*\][]", "", para)
        entity = extract_mark_label(para)
        # Text inside Chinese quotation marks is treated as an Event entity.
        match = re.search(r'“.*”', p)
        if match:
            event = match.group()
            entity.append((event[1:-1], "Event"))
        # Keep only short sentences that carry at least two entities.
        if len(p) > 100 or len(entity) < 2:
            continue
        pp.append(p)
        result.append({'sent': p, 'entities': entity})
    print(len(set(pp)))  # number of distinct sentences kept
    return result

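# Hedged usage sketch; the path below is a placeholder, not the project's file:
#
#     samples = init_train_test_data('data/labeled_policy.txt')
#     print(samples[0])
#     # -> {'sent': '...', 'entities': [('...', 'Event'), ...]}
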
def build_vocabulary(data: pd.DataFrame):
    """Build a character-to-index vocabulary, ordered by descending frequency."""
    counter = Counter()
    for index, row in data.iterrows():
        for sent in policy_util.cut_sentence(row['content']):
            counter.update(sent)
    vocabulary = {}
    # most_common() with no argument returns every character, most frequent first.
    for char, _ in counter.most_common():
        vocabulary[char] = len(vocabulary)
    return vocabulary

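# The vocabulary maps characters to dense integer ids by frequency, the usual
# input for a char-level sequence model. A minimal encoding sketch; the helper
# name and the unknown-id convention are my own, not the project's:
def encode_sentence(sent, vocabulary, unk_id=None):
    # Map each character to its id; unseen characters fall back to unk_id.
    if unk_id is None:
        unk_id = len(vocabulary)  # reserve one id past the known range for OOV
    return [vocabulary.get(ch, unk_id) for ch in sent]
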
def get_vectorization_init(data):
    """Cut every document into sentences and tokenize them, dropping punctuation."""
    sents = []
    for index, row in data.iterrows():
        for para in policy_util.cut_sentence(row['content']):
            # Keep every token whose POS flag is not 'w' (punctuation); note
            # that jieba.posseg usually tags punctuation as 'x', so adjust if needed.
            sent = [word.word for word in posseg.cut(para) if word.flag != 'w']
            sents.append(sent)
    return sents

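# get_vectorization_init returns lists of tokens per sentence, which is the
# input shape a word2vec-style trainer expects. A hedged sketch assuming
# gensim; the hyperparameters and save path are illustrative, not the project's:
def train_word2vec_demo(data):
    from gensim.models import Word2Vec
    sents = get_vectorization_init(data)
    model = Word2Vec(sentences=sents, vector_size=100, window=5,
                     min_count=2, workers=4)
    model.save('policy_word2vec.model')  # placeholder path
    return model
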
def get_entity_para(content, entity):
    """Return the paragraphs mentioning at least two of the given entities,
    paired with the entities found in each paragraph."""
    if not content:
        return [], []
    result = []
    include_entity = []
    for para in policy_util.cut_sentence(content):
        entities = [e for e in entity if para.find(e) >= 0]
        if len(entities) >= 2:
            result.append(para)
            include_entity.append(entities)
    return result, include_entity

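# Hedged usage sketch; the entity list below is hypothetical:
#
#     paras, entity_lists = get_entity_para(document_text,
#                                           ['专项资金', '高新技术企业', '申报单位'])
#     for para, ents in zip(paras, entity_lists):
#         print(ents, '->', para)
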
def tf_idf_statistic(data):
    """
    Score the extracted terms of each document with TF-IDF.
    :param data: DataFrame with a 'content' column
    :return: (feature words, document-term weight matrix)
    """
    all_word = []
    for index, row in data.iterrows():
        paras = policy_util.cut_sentence(row['content'])
        words = extract_entity(paras)
        all_word.append(' '.join(words))
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(all_word))
    # On scikit-learn >= 1.2 use get_feature_names_out() instead.
    word = vectorizer.get_feature_names()
    weight = tfidf.toarray()
    return word, weight

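# word and weight align by column index, so the top-weighted terms of a
# document can be read off with a sort. A small sketch (names are mine):
def top_terms_demo(data, doc_idx=0, k=10):
    import numpy as np
    word, weight = tf_idf_statistic(data)
    for j in np.argsort(weight[doc_idx])[::-1][:k]:
        print(word[j], round(float(weight[doc_idx][j]), 4))
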
def extract_entity(paragraphes):
    """
    Extract noun-like terms from a list of paragraphs.
    :param paragraphes: list of paragraph strings
    :return: one space-joined term string per sentence
    """
    extract_term = []
    for para in paragraphes:
        for sent in policy_util.cut_sentence(para):
            term = []
            for s in jieba.posseg.cut(sent.strip()):
                sege = str(s.flag)
                # Keep noun-like tags ('n*') and abbreviations ('j'),
                # and drop anything verbal ('v*').
                if (sege.find('n') >= 0 or sege.find('j') >= 0) and sege.find('v') < 0:
                    term.append(s.word)
            extract_term.append(term)
    # Join with spaces so CountVectorizer sees clean tokens rather than
    # the bracketed repr that str(list) would produce.
    return [' '.join(term) for term in extract_term]

def condition_extract(data):
    """Collect the paragraphs that state conditions (percentages, amounts in
    yuan, day counts, working-day counts) and dump them as JSON Lines."""
    json_res = []
    for index, row in data.iterrows():
        paras = policy_util.cut_sentence(row['content'])
        # Keep paragraphs mentioning percentages, yuan amounts, days or working days.
        res = [para for para in paras
               if re.search(r"\d+%|\d+.*元|\d+天|\d+工作日", para)]
        conditions = '|'.join(res)
        print(index)
        print(conditions)
        r = {
            'policy_id': index + 1,
            'content': str(row['content_html']),
            'condition': conditions
        }
        json_res.append(json.dumps(r, ensure_ascii=False))
    with open("data/policy_content.json", 'w', encoding='utf-8') as fp:
        for j in json_res:
            fp.write(j + '\n')

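# The output file holds one JSON object per line (JSON Lines), so it can be
# read back line by line; a minimal sketch:
def load_conditions_demo(path="data/policy_content.json"):
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]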