def test_p2t_dataset(r_path: str, w_path: str) -> List[Dict]:
    """Build the pivot-to-text test set.

    Input records are expected to carry at least the keys:
    value, text, field, lpos, rpos, pivot, entity.

    Reads ``r_path`` (jsonl; the first record is a header/statistic entry and
    is skipped), maps every record to ``{'source': pivot, 'target': text}``,
    writes the result — prefixed with a ``{'length': ...}`` statistic record —
    to ``w_path``, and returns the written list.

    :param r_path: path of the source jsonl file.
    :param w_path: path of the jsonl file to write.
    :returns: the list of records that was written (statistic record first).
    """
    # The first record of the source file is metadata, not data.
    ori_datas = loads(open(r_path))[1:]
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    for d in ori_datas:
        datas.append({'source': d['pivot'], 'target': d['text']})
    dumps(datas, open(w_path, 'w'))
    # Fix: signature promises List[Dict] but the original returned None.
    return datas
def train_p2t_dataset(r_path: str, w_path: str, index: List[int]) -> List[Dict]:
    """Build the pivot-to-text training set.

    Input records are expected to carry at least the keys:
    value, text, field, lpos, rpos, pivot, entity.

    Records whose position appears in ``index`` are transformed through
    ``get_filter_data``; all other records use the raw entity string as the
    source side. The result — prefixed with a ``{'length': ...}`` statistic
    record — is written to ``w_path`` and returned.

    :param r_path: path of the source jsonl file (first record is metadata
        and is skipped).
    :param w_path: path of the jsonl file to write.
    :param index: positions (0-based, after skipping the header) of records
        that get the filtered treatment.
    :returns: the list of records that was written (statistic record first).
    """
    ori_datas = loads(open(r_path))[1:]
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    # A set gives O(1) membership tests inside the loop.
    selected = set(index)
    for i, d in enumerate(ori_datas):
        if i in selected:
            datas.append(get_filter_data(d))
        else:
            datas.append({'source': d['entity'], 'target': d['text']})
    dumps(datas, open(w_path, 'w'))
    # Fix: signature promises List[Dict] but the original returned None.
    return datas
def print_result_into_file(self, model_outputs: Dict, dataset: Dataset):
    """Write model token selections to disk.

    For each test record, keeps only the tokens whose model output id is
    positive, then produces two files under ``self.data_path``:

    - ``predict-{scale}.txt``: one selected word sequence per line;
    - ``test.predict.{scale}.jsonl``: the records of ``test.p2t.jsonl`` with
      ``source``/``field``/``lpos``/``rpos`` replaced by the selected tokens.

    :param model_outputs: must contain ``'output_ids'``, one id sequence per
        test record, aligned with the record's space-separated tokens.
    :param dataset: iterated via ``dataset.read()``; each item must provide
        space-separated ``value``, ``field``, ``lpos`` and ``rpos`` strings.
    """
    model_ids = model_outputs['output_ids']

    # Collect the token-aligned columns of every test record.
    sources, fields, lposs, rposs = [], [], [], []
    for data in dataset.read():
        sources.append(data['value'].split(' '))
        fields.append(data['field'].split(' '))
        lposs.append(data['lpos'].split(' '))
        rposs.append(data['rpos'].split(' '))

    # Keep, per record, only the tokens the model marked (id > 0).
    model_words, kept_fields, kept_lposs, kept_rposs = [], [], [], []
    for ids, source, field, lpos, rpos in zip(model_ids, sources, fields,
                                              lposs, rposs):
        def _select(tokens):
            # zip truncates to the shorter of ids/tokens, matching the
            # original alignment behavior. `token_id` avoids shadowing
            # the builtin `id`.
            return ' '.join(t for token_id, t in zip(ids, tokens)
                            if token_id > 0)

        model_words.append(_select(source))
        kept_fields.append(_select(field))
        kept_lposs.append(_select(lpos))
        kept_rposs.append(_select(rpos))

    with open(
            os.path.join(self.data_path,
                         'predict-{0}.txt'.format(self.scale)), 'w') as f:
        print('\n'.join(model_words), file=f)

    # Re-read the original test records and overwrite the token columns
    # with the model selections; index [1:] skips the statistic header.
    ori_datas = loads(open(os.path.join(self.data_path, 'test.p2t.jsonl')))
    for words, record, fld, lp, rp in zip(model_words, ori_datas[1:],
                                          kept_fields, kept_lposs,
                                          kept_rposs):
        record['source'] = words
        record['field'] = fld
        record['lpos'] = lp
        record['rpos'] = rp
    dumps(
        ori_datas,
        open(
            os.path.join(self.data_path,
                         'test.predict.{0}.jsonl'.format(self.scale)), 'w'))
def test_t2p_dataset(r_path: str, w_path: str) -> List[Dict]:
    """Build the table-to-pivot test set.

    Copies the keys ``value, label, field, lpos, rpos`` from each input
    record, writes the result — prefixed with a ``{'length': ...}``
    statistic record — to ``w_path``, and returns it.

    :param r_path: path of the source jsonl file (first record is metadata
        and is skipped).
    :param w_path: path of the jsonl file to write.
    :returns: the list of records that was written (statistic record first).
    """
    ori_datas = loads(open(r_path))[1:]
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    keys = ('value', 'label', 'field', 'lpos', 'rpos')
    for d in ori_datas:
        datas.append({k: d[k] for k in keys})
    dumps(datas, open(w_path, 'w'))
    # Fix: signature promises List[Dict] but the original returned None.
    return datas
def test_parallel_dataset(r_path: str, w_path: str) -> List[Dict]:
    """Build the end-to-end parallel test set.

    Maps each record to ``source`` (its value) and ``target`` (its text),
    carrying ``field``/``lpos``/``rpos`` along unchanged. Writes the result —
    prefixed with a ``{'length': ...}`` statistic record — to ``w_path`` and
    returns it.

    :param r_path: path of the source jsonl file (first record is metadata
        and is skipped).
    :param w_path: path of the jsonl file to write.
    :returns: the list of records that was written (statistic record first).
    """
    ori_datas = loads(open(r_path))[1:]
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    for d in ori_datas:
        data = {'source': d['value'], 'target': d['text']}
        for k in ('field', 'lpos', 'rpos'):
            data[k] = d[k]
        datas.append(data)
    dumps(datas, open(w_path, 'w'))
    # Fix: signature promises List[Dict] but the original returned None.
    return datas
# NOTE(review): the statements down to `return datas` are the tail of a
# function whose `def` line (and the origin of `source_datas`/`target_datas`)
# is outside this view; the indentation below is reconstructed.
    datas = []
    for s, t in zip(source_datas, target_datas):
        # Lower-case both sides of each parallel pair.
        datas.append({'source': s.lower(), 'target': t.lower()})
    return datas


def get_aner_map(path: str) -> List:
    """Load a torch-serialized (.t7) map, decoding strings as UTF-8."""
    datas = torchfile.load(path, utf8_decode_strings=True)
    return datas


if __name__ == '__main__':
    # Convert the PWKP anonymized-NER splits to jsonl and the .t7 entity
    # map to json.
    train_datas = transform(
        os.path.join(pwkp_data_path, 'PWKP_108016.tag.80.aner.train'))
    test_datas = transform(
        os.path.join(pwkp_data_path, 'PWKP_108016.tag.80.aner.test'))
    valid_datas = transform(
        os.path.join(pwkp_data_path, 'PWKP_108016.tag.80.aner.valid'))
    aner_datas = get_aner_map(
        os.path.join(pwkp_data_path, 'PWKP_108016.tag.80.aner.map.t7'))
    jsonl.dumps(train_datas,
                open(os.path.join(pwkp_data_path, 'train.jsonl'), 'w'))
    jsonl.dumps(test_datas,
                open(os.path.join(pwkp_data_path, 'test.jsonl'), 'w'))
    # NOTE(review): the "valid" split is deliberately written as dev.jsonl —
    # confirm downstream code expects the "dev" name.
    jsonl.dumps(valid_datas,
                open(os.path.join(pwkp_data_path, 'dev.jsonl'), 'w'))
    json.dump(aner_datas,
              open(os.path.join(pwkp_data_path, 'aner.json'), 'w'))
# NOTE(review): the statements down to `return datas` are the tail of a
# function (presumably `construct_noise_corpus`, judging by the __main__
# block) whose `def` line is outside this view; indentation reconstructed.
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    count = 0
    src_len, tgt_len = [], []
    for d in ori_datas:
        # `add_noise` semantics are defined elsewhere — presumably perturbs
        # the source with probability `noise_prob`; verify against its def.
        data = {
            'source': add_noise(d['source'], noise_prob),
            'target': d['target']
        }
        datas.append(data)
        # Token counts (space-separated) feed the length report below.
        src_len.append(len(data['source'].split(' ')))
        tgt_len.append(len(data['target'].split(' ')))
        count += 1
        if count % 100000 == 0:
            print(count)  # progress heartbeat every 100k records
    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ',
          sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))
    return datas


if __name__ == '__main__':
    # Build the noised pivot-to-text training corpus and persist it.
    train_noise_datas = construct_noise_corpus(join('train.p2t.jsonl'),
                                               noise_prob=0.2)
    dumps(train_noise_datas,
          open(os.path.join(data_path, 'train.noise.jsonl'), 'w'))
# NOTE(review): this chunk starts mid-function — the enclosing `def`
# (apparently `transform`, per the __main__ block) and the loop header that
# `data = extract_pivot(data)` belongs to are outside this view; the
# indentation below is reconstructed.
        data = extract_pivot(data)
        datas.append(data)
        # Token counts (space-separated) feed the length report below.
        src_len.append(len(data['value'].split(' ')))
        tgt_len.append(len(data['text'].split(' ')))
        count += 1
        if count % 10000 == 0:
            print(count)  # progress heartbeat every 10k records
    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ',
          sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))
    return datas


if __name__ == '__main__':
    # Transform each split, then persist both as plain-text pivot files and
    # as jsonl.
    train_datas = transform(os.path.join(data_path, 'train'))
    test_datas = transform(os.path.join(data_path, 'test'))
    valid_datas = transform(os.path.join(data_path, 'valid'))
    write_into_file(os.path.join(data_path, 'train.pivot'), train_datas)
    write_into_file(os.path.join(data_path, 'test.pivot'), test_datas)
    write_into_file(os.path.join(data_path, 'valid.pivot'), valid_datas)
    jsonl.dumps(train_datas,
                open(os.path.join(data_path, 'train.pivot.jsonl'), 'w'))
    jsonl.dumps(test_datas,
                open(os.path.join(data_path, 'test.pivot.jsonl'), 'w'))
    jsonl.dumps(valid_datas,
                open(os.path.join(data_path, 'valid.pivot.jsonl'), 'w'))
# NOTE(review): the statements down to `return datas` are the tail of a
# function whose `def` line is outside this view; indentation reconstructed.
    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ',
          sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))
    return datas


if __name__ == '__main__':
    # Pick an index subset of the training data, then build both stages of
    # the pipeline — table-to-pivot (t2p) and pivot-to-text (p2t) — for the
    # train/test/valid splits, and persist each as jsonl.
    # ("partion" is a typo in the helper's name; kept as-is since it is a
    # runtime identifier defined elsewhere.)
    indexes = get_partion_index(join('train.jsonl'), 10000)
    write_into_file(join('index.txt'), indexes)
    # Only the training split receives the index subset; test/valid are
    # built over all records.
    train_t2p_datas = construct_table2pivot(join('train.pivot.jsonl'), indexes)
    test_t2p_datas = construct_table2pivot(join('test.pivot.jsonl'))
    valid_t2p_datas = construct_table2pivot(join('valid.pivot.jsonl'))
    train_p2t_datas = construct_pivot2text(join('train.pivot.jsonl'), indexes)
    test_p2t_datas = construct_pivot2text(join('test.pivot.jsonl'))
    valid_p2t_datas = construct_pivot2text(join('valid.pivot.jsonl'))
    dumps(train_t2p_datas,
          open(os.path.join(data_path, 'train.t2p.jsonl'), 'w'))
    dumps(test_t2p_datas,
          open(os.path.join(data_path, 'test.t2p.jsonl'), 'w'))
    dumps(valid_t2p_datas,
          open(os.path.join(data_path, 'valid.t2p.jsonl'), 'w'))
    dumps(train_p2t_datas,
          open(os.path.join(data_path, 'train.p2t.jsonl'), 'w'))
    dumps(test_p2t_datas,
          open(os.path.join(data_path, 'test.p2t.jsonl'), 'w'))
    dumps(valid_p2t_datas,
          open(os.path.join(data_path, 'valid.p2t.jsonl'), 'w'))