import json
from collections import defaultdict
from itertools import groupby
from pathlib import Path

import jieba
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


# Variant 1: character-level BERT subject extraction, filtered through the KB
# alias dictionary (kb2id) and the training-set labeling statistics (freq).
def extract_items(text_in):
    _X1 = [bert_vocab.get(c, bert_vocab.get('[UNK]')) for c in text_in]
    _X1_MASK = [1] * len(_X1)
    _X1 = torch.tensor([_X1], dtype=torch.long, device=device)  # [1,s1]
    _X1_MASK = torch.tensor([_X1_MASK], dtype=torch.long, device=device)
    _X1_SEG = torch.zeros(*_X1.size(), dtype=torch.long, device=device)
    with torch.no_grad():
        _k1, _k2, _ = subject_model(device, _X1, _X1_SEG, _X1_MASK)  # _k1:[1,s]
    _k1 = _k1[0, :].detach().cpu().numpy()
    _k2 = _k2[0, :].detach().cpu().numpy()
    # head/tail pointers: positions whose scores clear the thresholds
    _k1, _k2 = np.where(_k1 > 0.3)[0], np.where(_k2 > 0.5)[0]
    _subjects = []
    if len(_k1) and len(_k2):
        for i in _k1:
            j = _k2[_k2 >= i]
            if len(j) > 0:
                j = j[0]  # nearest tail at or after the head
                _subject = text_in[i:j + 1]
                if _subject in kb2id:
                    # keep the span unless the stats say it is never labeled
                    if _subject not in freq or freq[_subject]['per'] > 0:
                        _subjects.append((_subject, str(i), str(j + 1)))
    # supplement subjects the model missed with high-precision dictionary matches
    for _s in match2(text_in):
        if _s[0] in freq and freq[_s[0]]['per'] > 0.8:
            _subjects.append(_s)
    return list(set(_subjects))
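# `match2` is called throughout this file but not defined in it. Below is a
# minimal sketch of what it presumably does, assuming it enumerates every
# substring of the input that appears in the KB alias dictionary (the keys of
# `kb2id`) and yields (mention, start_offset, end_offset) triples with string
# offsets, which is how its results are unpacked above and below. The name
# `match2_sketch` and the brute-force scan are illustrative assumptions, not
# the repository's actual implementation.
def match2_sketch(text_in, mention_dic):
    out = []
    for n in range(len(text_in), 0, -1):  # longer candidates first
        for i in range(len(text_in) - n + 1):
            cand = text_in[i:i + n]
            if cand in mention_dic:
                out.append((cand, str(i), str(i + n)))
    return out
# e.g. match2_sketch('《王者荣耀》是一款游戏', kb2id)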
# Variant 2: same extraction as variant 1, plus a pruning pass that uses the
# group statistics to drop overlapping candidates sharing a start or end offset.
def extract_items(text_in):
    _X1 = [bert_vocab.get(c, bert_vocab.get('[UNK]')) for c in text_in]
    _X1_MASK = [1] * len(_X1)
    _X1 = torch.tensor([_X1], dtype=torch.long, device=device)  # [1,s1]
    _X1_MASK = torch.tensor([_X1_MASK], dtype=torch.long, device=device)
    _X1_SEG = torch.zeros(*_X1.size(), dtype=torch.long, device=device)
    with torch.no_grad():
        _k1, _k2, _ = subject_model(device, _X1, _X1_SEG, _X1_MASK)  # _k1:[1,s]
    _k1 = _k1[0, :].detach().cpu().numpy()
    _k2 = _k2[0, :].detach().cpu().numpy()
    _k1, _k2 = np.where(_k1 > 0.3)[0], np.where(_k2 > 0.5)[0]
    _subjects = []
    if len(_k1) and len(_k2):
        for i in _k1:
            j = _k2[_k2 >= i]
            if len(j) > 0:
                j = j[0]
                _subject = text_in[i:j + 1]
                if _subject in kb2id:
                    if _subject not in freq or freq[_subject]['per'] > 0:
                        _subjects.append((_subject, str(i), str(j + 1)))
    # supplement subjects with high-precision dictionary matches
    for _s in match2(text_in):
        if _s[0] in freq and freq[_s[0]]['per'] > 0.8:
            _subjects.append(_s)
    _subjects = list(set(_subjects))
    _subjects_new = _subjects.copy()
    # prune overlapping candidates: when two spans share a start (or end)
    # offset, keep the variant that training statistics say is labeled more often
    for _s, _s_s, _s_e in _subjects:
        for _i, _i_s, _i_e in _subjects:
            if _s_s == _i_s and _s_e != _i_e and _s in group:
                if (group[_s]['group_labeled_per'] > 1.5 * group[_s]['s_same_per']
                        and (_i, _i_s, _i_e) in _subjects_new):
                    _subjects_new.remove((_i, _i_s, _i_e))
                if (group[_s]['s_same_per'] > 1.5 * group[_s]['group_labeled_per']
                        and (_s, _s_s, _s_e) in _subjects_new):
                    _subjects_new.remove((_s, _s_s, _s_e))
            if _s_s != _i_s and _s_e == _i_e and _s in group:
                if (group[_s]['group_labeled_per'] > 1.5 * group[_s]['e_same_per']
                        and (_i, _i_s, _i_e) in _subjects_new):
                    _subjects_new.remove((_i, _i_s, _i_e))
                if (group[_s]['e_same_per'] > 1.5 * group[_s]['group_labeled_per']
                        and (_s, _s_s, _s_e) in _subjects_new):
                    _subjects_new.remove((_s, _s_s, _s_e))
    return list(set(_subjects_new))
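# A tiny worked example of the pruning pass above, with made-up statistics
# (real numbers come from group() at the bottom of this file). Two candidates
# share a start offset; the stats say '王者荣耀' is labeled whole far more often
# than cut short at the same start, so the shorter span is dropped.
def _prune_demo():
    group = {'王者荣耀': {'group_labeled_per': 0.9, 's_same_per': 0.1, 'e_same_per': 0.0}}
    subjects = [('王者荣耀', '0', '4'), ('王者', '0', '2')]
    kept = subjects.copy()
    for _s, _s_s, _s_e in subjects:
        for _i, _i_s, _i_e in subjects:
            if _s_s == _i_s and _s_e != _i_e and _s in group:
                if (group[_s]['group_labeled_per'] > 1.5 * group[_s]['s_same_per']
                        and (_i, _i_s, _i_e) in kept):
                    kept.remove((_i, _i_s, _i_e))
    return kept  # [('王者荣耀', '0', '4')]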
# Variant 3: adds jieba tokenization so pretrained word vectors (seq2vec) can
# be fed to the subject model alongside the character-level BERT inputs;
# candidate spans are filtered through kb2id only.
def extract_items(text_in):
    _x1_tokens = jieba.lcut(text_in)
    _x1 = ''.join(_x1_tokens)
    assert len(_x1) == len(text_in)
    _X1 = [bert_vocab.get(c, bert_vocab.get('[UNK]')) for c in _x1]
    _X1_MASK = [1] * len(_X1)
    _X1 = torch.tensor([_X1], dtype=torch.long, device=device)  # [1,s1]
    _X1_MASK = torch.tensor([_X1_MASK], dtype=torch.long, device=device)
    _X1_SEG = torch.zeros(*_X1.size(), dtype=torch.long, device=device)
    _X1_WV = torch.tensor(seq2vec([_x1_tokens]), dtype=torch.float32, device=device)
    with torch.no_grad():
        _k1, _k2, _x1_hs, _x1_h = subject_model('x1', device, _X1_WV, _X1, _X1_SEG, _X1_MASK)  # _k1:[1,s]
    _k1 = _k1[0, :].detach().cpu().numpy()
    _k2 = _k2[0, :].detach().cpu().numpy()
    _k1, _k2 = np.where(_k1 > 0.3)[0], np.where(_k2 > 0.5)[0]
    _subjects = []
    if len(_k1) and len(_k2):
        for i in _k1:
            j = _k2[_k2 >= i]
            if len(j) > 0:
                j = j[0]
                _subject = text_in[i:j + 1]
                if _subject in kb2id:
                    _subjects.append((_subject, str(i), str(j + 1)))
    # supplement subjects with high-precision dictionary matches
    for _s in match2(text_in):
        if _s[0] in freq and freq[_s[0]]['per'] > 0.8:
            _subjects.append(_s)
    return list(set(_subjects))
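# `seq2vec` is not defined in this file either. Judging from the [b,s1,200]
# shape comment in variant 4 below, it turns a batch of jieba token lists into
# 200-d word vectors aligned with the character-level BERT inputs, i.e. each
# token's vector repeated once per character. A sketch under those assumptions;
# the `word_vectors` lookup table is hypothetical (the real code may use gensim
# KeyedVectors or similar).
def seq2vec_sketch(token_lists, word_vectors=None, dim=200):
    batch = []
    for tokens in token_lists:
        vecs = []
        for tok in tokens:
            v = (word_vectors[tok] if word_vectors is not None and tok in word_vectors
                 else np.zeros(dim, dtype=np.float32))
            vecs.extend([v] * len(tok))  # repeat per character for char-level alignment
        batch.append(np.stack(vecs) if vecs else np.zeros((0, dim), dtype=np.float32))
    max_len = max(b.shape[0] for b in batch)  # right-pad to the batch maximum
    out = np.zeros((len(batch), max_len, dim), dtype=np.float32)
    for i, b in enumerate(batch):
        out[i, :b.shape[0]] = b
    return out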
# Variant 4: the full pipeline. Extract subject spans as in variant 3, then
# rank each subject's candidate KB entries by scoring the sentence against the
# entry description with object_model, keeping the best-scoring kbid.
def extract_items(text_in):
    _x1_tokens = jieba.lcut(text_in)
    _x1 = ''.join(_x1_tokens)
    assert len(_x1) == len(text_in)
    _X1 = [bert_vocab.get(c, bert_vocab.get('[UNK]')) for c in _x1]
    _X1_MASK = [1] * len(_X1)
    _X1 = torch.tensor([_X1], dtype=torch.long, device=device)  # [1,s1]
    _X1_MASK = torch.tensor([_X1_MASK], dtype=torch.long, device=device)
    _X1_SEG = torch.zeros(*_X1.size(), dtype=torch.long, device=device)
    _X1_WV = torch.tensor(seq2vec([_x1_tokens]), dtype=torch.float32, device=device)
    with torch.no_grad():
        _k1, _k2, _x1_hs, _x1_h = subject_model('x1', device, _X1_WV, _X1, _X1_SEG, _X1_MASK)  # _k1:[1,s]
    _k1 = _k1[0, :].detach().cpu().numpy()
    _k2 = _k2[0, :].detach().cpu().numpy()
    _k1, _k2 = np.where(_k1 > 0.3)[0], np.where(_k2 > 0.5)[0]
    _subjects = []
    if len(_k1) and len(_k2):
        for i in _k1:
            j = _k2[_k2 >= i]
            if len(j) > 0:
                j = j[0]
                _subject = text_in[i:j + 1]
                _subjects.append((_subject, str(i), str(j + 1)))
    # supplement subjects with high-precision dictionary matches
    for _s in match2(text_in):
        if _s[0] in freq and freq[_s[0]]['per'] > 0.8:
            _subjects.append(_s)
    _subjects = list(set(_subjects))
    if not _subjects:
        return []
    R = []
    _X2, _X2_MASK, _Y, _X2_wv = [], [], [], []
    _S, _IDXS = [], {}
    for _sub in _subjects:
        if _sub[0] in ['的']:  # skip the bare particle '的'
            continue
        _y = np.zeros(len(text_in))
        _y[int(_sub[1]):int(_sub[2])] = 1  # mark the subject span
        _IDXS[_sub] = kb2id.get(_sub[0], [])
        # keep at most 16 candidate KB links per subject
        for idx, i in enumerate(_IDXS[_sub]):
            if idx > 15:
                break
            _x2 = id2kb[i]['subject_desc']
            _x2_tokens = jieba.lcut(_x2)
            _x2 = ''.join(_x2_tokens)
            _x2 = [bert_vocab.get(c, bert_vocab.get('[UNK]')) for c in _x2]
            _x2_mask = [1] * len(_x2)
            _X2.append(_x2)
            _X2_MASK.append(_x2_mask)
            _Y.append(_y)
            _S.append(_sub)
            _X2_wv.append(_x2_tokens)
    if _X2:
        _O = []
        _X2 = torch.tensor(seq_padding(_X2), dtype=torch.long)  # [b,s2]
        _X2_MASK = torch.tensor(seq_padding(_X2_MASK), dtype=torch.long)
        _X2_SEG = torch.zeros(*_X2.size(), dtype=torch.long)
        _Y = torch.tensor(seq_padding(_Y), dtype=torch.float32)
        # broadcast the single sentence encoding across the candidate batch
        _X1_HS = _x1_hs.expand(_X2.size(0), -1, -1)  # [b,s1,h]
        _X1_H = _x1_h.expand(_X2.size(0), -1)  # [b,s1]
        _X1_MASK = _X1_MASK.expand(_X2.size(0), -1)  # [b,s1]
        _X1_wv = _X1_WV.expand(_X2.size(0), -1, -1)  # [b,s1,200]
        _X2_wv = torch.tensor(seq2vec(_X2_wv), dtype=torch.float32)
        eval_dataloader = DataLoader(
            TensorDataset(_X2, _X2_SEG, _X2_MASK, _X1_HS, _X1_H, _X1_MASK, _Y, _X1_wv, _X2_wv),
            batch_size=64)
        for batch in eval_dataloader:
            batch = tuple(t.to(device) for t in batch)
            _X2, _X2_SEG, _X2_MASK, _X1_HS, _X1_H, _X1_MASK, _Y, _X1_wv, _X2_wv = batch
            with torch.no_grad():
                _x2, _x2_h = subject_model('x2', None, None, None, None, None, _X2, _X2_SEG, _X2_MASK)
                _o, _, _ = object_model(_X1_HS, _X1_H, _X1_MASK, _Y, _x2, _x2_h, _X2_MASK, _X1_wv, _X2_wv)  # _o:[b,1]
            _O.extend(_o.detach().cpu().numpy())
        # candidates were appended per subject in order, so consecutive groupby works
        for k, v in groupby(zip(_S, _O), key=lambda x: x[0]):
            v = np.array([j[1] for j in v])
            kbid = _IDXS[k][np.argmax(v)]
            R.append((k[0], k[1], kbid))
    return list(set(R))
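# `seq_padding` above is also external to this file. A minimal sketch under the
# usual convention for this kind of pipeline: right-pad every sequence with
# zeros to the longest length in the batch so they stack into one array.
def seq_padding_sketch(seqs, pad=0):
    max_len = max(len(s) for s in seqs)
    return np.array([list(s) + [pad] * (max_len - len(s)) for s in seqs])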
# freq(): for every dictionary-matched mention in train.json, count how often
# it appears (exp) vs. how often that exact (mention, offset) pair is actually
# labeled (labeled), and store the labeled/exp ratio (per) used above.
def freq():
    id2kb = {}
    for l in tqdm((Path(data_dir) / 'kb_data').open()):
        _ = json.loads(l)
        subject_id = _['subject_id']
        subject_alias = list(set([_['subject']] + _.get('alias', [])))
        subject_alias = [sa.lower() for sa in subject_alias]
        subject_desc = ''
        for i in _['data']:
            if '摘要' in i['predicate']:  # '摘要' = "abstract": prefer it as the description
                subject_desc = i['object']
                break
            else:
                subject_desc += f'{i["predicate"]}:{i["object"]}\n'
        subject_desc = subject_desc[:300].lower()
        if subject_desc:
            id2kb[subject_id] = {
                'subject_alias': subject_alias,
                'subject_desc': subject_desc
            }
    kb2id = defaultdict(list)  # subject alias -> [sid1, sid2, ...]
    for i, j in tqdm(id2kb.items()):
        for k in j['subject_alias']:
            kb2id[k].append(i)
    train_data = (Path(data_dir) / 'train.json').open()
    freq_dic = defaultdict(dict)
    cnt = 0  # examples whose labeled mentions are not all covered by match2
    for i, l in tqdm(enumerate(train_data)):
        l = json.loads(l)
        t = l['text']
        exp_words = [(k, sidx) for k, sidx, _ in match2(t)]
        labeled_words = [(m['mention'], m['offset']) for m in l['mention_data']]
        if not set(exp_words).issuperset(set(labeled_words)):
            cnt += 1
        for w, start_idx in exp_words:
            freq_dic[w]['exp'] = freq_dic[w].get('exp', 0) + 1
            if (w, start_idx) in labeled_words:
                freq_dic[w]['labeled'] = freq_dic[w].get('labeled', 0) + 1
    # if match_rules(word): 21825
    # if word != '': 16347
    print(f'cnt: {cnt}')
    for w in freq_dic:
        if 'labeled' not in freq_dic[w]:
            freq_dic[w]['labeled'] = 0
        freq_dic[w]['per'] = freq_dic[w]['labeled'] / freq_dic[w]['exp']
    with (Path(data_dir) / 'el_freq_dic_1.json').open('w') as p:
        json.dump(freq_dic, p, ensure_ascii=False)
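# How the output of freq() is presumably consumed: the extract_items() variants
# above read a module-level `freq` dict mapping each mention to
# {'exp', 'labeled', 'per'}. A loading sketch (the function name is ours; the
# path matches the one written by freq()):
def load_freq_sketch():
    with (Path(data_dir) / 'el_freq_dic_1.json').open() as f:
        return json.load(f)
# e.g. freq = load_freq_sketch(); freq[w]['per'] is the fraction of dictionary
# matches of mention w that were actually labeled in train.json.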
# group(): for dictionary matches that overlap a labeled mention (same start,
# different end, or same end, different start), count how often the longer
# match is labeled whole (group_labeled) vs. truncated (s_same / e_same), and
# store the ratios used by the pruning pass in the second extract_items().
def group():
    id2kb = {}
    for l in tqdm((Path(data_dir) / 'kb_data').open(), desc='kb_data'):
        _ = json.loads(l)
        subject_id = _['subject_id']
        subject_alias = list(set([_['subject']] + _.get('alias', [])))
        subject_alias = [sa.lower() for sa in subject_alias]
        subject_desc = ''
        for i in _['data']:
            if '摘要' in i['predicate']:  # '摘要' = "abstract": prefer it as the description
                subject_desc = i['object']
                break
            else:
                subject_desc += f'{i["predicate"]}:{i["object"]}\n'
        subject_desc = subject_desc[:300].lower()
        if subject_desc:
            id2kb[subject_id] = {
                'subject_alias': subject_alias,
                'subject_desc': subject_desc
            }
    kb2id = defaultdict(list)  # subject alias -> [sid1, sid2, ...]
    for i, j in tqdm(id2kb.items()):
        for k in j['subject_alias']:
            kb2id[k].append(i)
    train_data = (Path(data_dir) / 'train.json').open()
    cnt = 0
    tmp_dic = {}
    for i, l in tqdm(enumerate(train_data), desc='train_data 1'):
        l = json.loads(l)
        t = l['text']
        exp_words = [(k, sidx, int(sidx) + len(k)) for k, sidx, _ in match2(t)]
        labeled_words = [(m['mention'], m['offset'],
                          int(m['offset']) + len(m['mention']))
                         for m in l['mention_data']]
        if not set(exp_words).issuperset(set(labeled_words)):
            for lw, lw_s, lw_e in labeled_words:
                for ew, ew_s, ew_e in exp_words:
                    # labeled mention is a strict prefix of the dictionary match
                    if lw_s == ew_s and lw_e != ew_e and ew.startswith(lw):
                        if ew not in tmp_dic:
                            tmp_dic[ew] = defaultdict(list)
                        tmp_dic[ew]['s_same'].append(lw)
                    # labeled mention is a strict suffix of the dictionary match
                    if lw_e == ew_e and lw_s != ew_s and ew.endswith(lw):
                        if ew not in tmp_dic:
                            tmp_dic[ew] = defaultdict(list)
                        tmp_dic[ew]['e_same'].append(lw)
            cnt += 1
    # second pass: count total occurrences and whole-mention labelings for the
    # words collected above
    for i, l in tqdm(enumerate((Path(data_dir) / 'train.json').open()),
                     desc='train_data 2'):
        l = json.loads(l)
        ews = [k for k, _, _ in match2(l['text'])]
        lws = [m['mention'] for m in l['mention_data']]
        for w in ews:
            if w in tmp_dic:
                tmp_dic[w]['group_exp'].append(1)
        for w in lws:
            if w in tmp_dic:
                tmp_dic[w]['group_labeled'].append(1)
    tmp_sum_dic = defaultdict(dict)
    for w in tqdm(tmp_dic, desc='tmp_sum_dic'):
        tmp_sum_dic[w]['group_exp_cnt'] = len(tmp_dic[w]['group_exp'])
        tmp_sum_dic[w]['group_labeled_cnt'] = len(tmp_dic[w]['group_labeled'])
        tmp_sum_dic[w]['s_same_cnt'] = len(tmp_dic[w]['s_same'])
        tmp_sum_dic[w]['e_same_cnt'] = len(tmp_dic[w]['e_same'])
        tmp_sum_dic[w]['group_labeled_per'] = (tmp_sum_dic[w]['group_labeled_cnt']
                                               / tmp_sum_dic[w]['group_exp_cnt'])
        tmp_sum_dic[w]['s_same_per'] = (tmp_sum_dic[w]['s_same_cnt']
                                        / tmp_sum_dic[w]['group_exp_cnt'])
        tmp_sum_dic[w]['e_same_per'] = (tmp_sum_dic[w]['e_same_cnt']
                                        / tmp_sum_dic[w]['group_exp_cnt'])
    with (Path(data_dir) / 'el_group_word.json').open('w') as tmp_p:
        json.dump(tmp_sum_dic, tmp_p, ensure_ascii=False)
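# Likewise for group(): the pruning pass in the second extract_items() variant
# reads a module-level `group` dict mapping each mention to the ratios computed
# above. A loading sketch (function name is ours; the path matches the one
# written by group()):
def load_group_sketch():
    with (Path(data_dir) / 'el_group_word.json').open() as f:
        return json.load(f)
# e.g. group = load_group_sketch(); group[w]['group_labeled_per'] is the share
# of w's dictionary matches labeled as the whole mention, while 's_same_per' /
# 'e_same_per' are the shares where only a prefix / suffix of w was labeled.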