Code example #1
File: construct_corpus.py  Project: lancopku/Pivot
def test_p2t_dataset(r_path: str, w_path: str) -> List[Dict]:
    '''
    value, text, field, lpos, rpos, pivot, entity
    '''
    ori_datas = loads(open(r_path))[1:]  # skip the leading statistics record these scripts prepend

    statistic = {'length': len(ori_datas)}
    datas = [statistic]

    for d in ori_datas:
        datas.append({'source': d['pivot'], 'target': d['text']})

    dumps(datas, open(w_path, 'w'))
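Note that loads and dumps in these snippets are not the standard-library json functions; they are the repository's own JSON Lines helpers, whose definitions are not shown on this page. A minimal sketch of compatible helpers, assuming one JSON object per line (hypothetical, only for running the examples standalone):

import json
from typing import IO, List, Dict


def loads(f: IO) -> List[Dict]:
    # Parse one JSON object per line; blank lines are ignored.
    return [json.loads(line) for line in f if line.strip()]


def dumps(datas: List[Dict], f: IO) -> None:
    # Write each record as a single JSON line.
    for d in datas:
        f.write(json.dumps(d, ensure_ascii=False) + '\n')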
Code example #2
File: construct_corpus.py  Project: lancopku/Pivot
def train_p2t_dataset(r_path: str, w_path: str,
                      index: List[int]) -> List[Dict]:
    '''
    value, text, field, lpos, rpos, pivot, entity
    '''
    ori_datas = loads(open(r_path))[1:]

    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    index = set(index)

    for i, d in enumerate(ori_datas):
        if i in index:
            #datas.append({'source': d['pivot'], 'target': d['text']})
            datas.append(get_filter_data(d))
        else:
            datas.append({'source': d['entity'], 'target': d['text']})

    dumps(datas, open(w_path, 'w'))
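A usage sketch (the paths and index values below are hypothetical): rows whose position appears in index are written in their filtered pivot form via get_filter_data, which is defined elsewhere in construct_corpus.py, while all other rows fall back to their entity annotation.

# Hypothetical call; the real paths come from the project's data directory.
train_p2t_dataset('train.pivot.jsonl', 'train.p2t.jsonl', index=[0, 3, 7])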
Code example #3
File: pivot.py  Project: lancopku/Pivot
    def print_result_into_file(self, model_outputs: Dict, dataset: Dataset):
        model_ids = model_outputs['output_ids']
        sources, model_words = [], []
        fields, lposs, rposs = [], [], []
        _fields, _lposs, _rposs = [], [], []

        for data in dataset.read():
            sources.append(data['value'].split(' '))
            fields.append(data['field'].split(' '))
            lposs.append(data['lpos'].split(' '))
            rposs.append(data['rpos'].split(' '))

        for ids, source, field, lpos, rpos in zip(model_ids, sources, fields,
                                                  lposs, rposs):
            words = [s for id, s in zip(ids, source) if id > 0]
            _field = [s for id, s in zip(ids, field) if id > 0]
            _lpos = [s for id, s in zip(ids, lpos) if id > 0]
            _rpos = [s for id, s in zip(ids, rpos) if id > 0]
            model_words.append(' '.join(words))
            _fields.append(' '.join(_field))
            _lposs.append(' '.join(_lpos))
            _rposs.append(' '.join(_rpos))

        with open(
                os.path.join(self.data_path,
                             'predict-{0}.txt'.format(self.scale)), 'w') as f:
            print('\n'.join(model_words), file=f)

        ori_datas = loads(open(os.path.join(self.data_path, 'test.p2t.jsonl')))
        for m, d, f, l, r in zip(model_words, ori_datas[1:], _fields, _lposs,
                                 _rposs):
            d['source'] = m
            d['field'] = f
            d['lpos'] = l
            d['rpos'] = r

        dumps(
            ori_datas,
            open(
                os.path.join(self.data_path,
                             'test.predict.{0}.jsonl'.format(self.scale)),
                'w'))
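The filtering in the zip loop keeps a token (and its field, lpos and rpos tags) only where the model emitted a positive id at that position. A toy illustration with made-up values:

ids = [1, 0, 1]                     # model output ids, one per source token
source = ['obama', 'born', '1961']  # the corresponding value tokens
words = [s for id, s in zip(ids, source) if id > 0]
# words == ['obama', '1961'], joined as 'obama 1961'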
Code example #4
File: construct_corpus.py  Project: lancopku/Pivot
def test_t2p_dataset(r_path: str, w_path: str) -> List[Dict]:
    '''
    value, label, field, lpos, rpos
    '''
    ori_datas = loads(open(r_path))[1:]

    statistic = {'length': len(ori_datas)}
    datas = [statistic]

    for d in ori_datas:
        data = {
            'value': d['value'],
            'label': d['label'],
            'field': d['field'],
            'lpos': d['lpos'],
            'rpos': d['rpos']
        }
        datas.append(data)

    dumps(datas, open(w_path, 'w'))
Code example #5
File: construct_corpus.py  Project: lancopku/Pivot
def test_parallel_dataset(r_path: str, w_path: str) -> List[Dict]:
    '''
    source: value;
    target: text.
    '''
    ori_datas = loads(open(r_path))[1:]

    statistic = {'length': len(ori_datas)}
    datas = [statistic]

    for d in ori_datas:
        data = {
            'source': d['value'],
            'target': d['text'],
            'field': d['field'],
            'lpos': d['lpos'],
            'rpos': d['rpos']
        }
        datas.append(data)

    dumps(datas, open(w_path, 'w'))
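Side by side, the two test writers differ only in the prediction target: test_t2p_dataset keeps the token-level label sequence, while test_parallel_dataset pairs the raw value sequence with the reference text. Schematically (placeholder values, keys taken from the code above):

# One record of the t2p test set (table-to-pivot):
#   {'value': ..., 'label': ..., 'field': ..., 'lpos': ..., 'rpos': ...}
# One record of the parallel test set (table-to-text):
#   {'source': ..., 'target': ..., 'field': ..., 'lpos': ..., 'rpos': ...}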
Code example #6
File: process_pkwp.py  Project: lancopku/WEAN
    datas = []

    for s, t in zip(source_datas, target_datas):
        datas.append({'source': s.lower(), 'target': t.lower()})

    return datas


def get_aner_map(path: str) -> List:
    datas = torchfile.load(path, utf8_decode_strings=True)

    return datas


if __name__ == '__main__':
    train_datas = transform(
        os.path.join(pwkp_data_path, 'PWKP_108016.tag.80.aner.train'))
    test_datas = transform(
        os.path.join(pwkp_data_path, 'PWKP_108016.tag.80.aner.test'))
    valid_datas = transform(
        os.path.join(pwkp_data_path, 'PWKP_108016.tag.80.aner.valid'))
    aner_datas = get_aner_map(
        os.path.join(pwkp_data_path, 'PWKP_108016.tag.80.aner.map.t7'))

    jsonl.dumps(train_datas,
                open(os.path.join(pwkp_data_path, 'train.jsonl'), 'w'))
    jsonl.dumps(test_datas,
                open(os.path.join(pwkp_data_path, 'test.jsonl'), 'w'))
    jsonl.dumps(valid_datas,
                open(os.path.join(pwkp_data_path, 'dev.jsonl'), 'w'))
    json.dump(aner_datas, open(os.path.join(pwkp_data_path, 'aner.json'), 'w'))
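The .t7 anonymization map is a Torch7-serialized file; torchfile.load reads it, and utf8_decode_strings=True makes Lua strings come back as Python str instead of bytes. A quick standalone inspection sketch using the same file name as above:

import torchfile

aner = torchfile.load('PWKP_108016.tag.80.aner.map.t7', utf8_decode_strings=True)
print(type(aner))  # the exact structure depends on how the map was saved from Lua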
Code example #7
    statistic = {'length': len(ori_datas)}
    datas = [statistic]

    count = 0
    src_len, tgt_len = [], []

    for d in ori_datas:
        data = {
            'source': add_noise(d['source'], noise_prob),
            'target': d['target']
        }
        datas.append(data)
        src_len.append(len(data['source'].split(' ')))
        tgt_len.append(len(data['target'].split(' ')))
        count += 1
        if count % 100000 == 0:
            print(count)

    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ',
          sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))

    return datas


if __name__ == '__main__':
    train_noise_datas = construct_noise_corpus(join('train.p2t.jsonl'),
                                               noise_prob=0.2)
    dumps(train_noise_datas,
          open(os.path.join(data_path, 'train.noise.jsonl'), 'w'))
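add_noise is not shown in this snippet; from the call site it takes a whitespace-tokenised source string and a noise probability. A plausible sketch, assuming the noise is random token dropping with probability noise_prob (an assumption for illustration, not the repository's actual implementation):

import random


def add_noise(source: str, noise_prob: float) -> str:
    # Drop each token independently with probability noise_prob,
    # but never return an empty string.
    tokens = source.split(' ')
    kept = [t for t in tokens if random.random() >= noise_prob]
    return ' '.join(kept) if kept else source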
Code example #8
        data = extract_pivot(data)
        datas.append(data)
        src_len.append(len(data['value'].split(' ')))
        tgt_len.append(len(data['text'].split(' ')))
        count += 1
        if count % 10000 == 0:
            print(count)

    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ',
          sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))

    return datas


if __name__ == '__main__':
    train_datas = transform(os.path.join(data_path, 'train'))
    test_datas = transform(os.path.join(data_path, 'test'))
    valid_datas = transform(os.path.join(data_path, 'valid'))

    write_into_file(os.path.join(data_path, 'train.pivot'), train_datas)
    write_into_file(os.path.join(data_path, 'test.pivot'), test_datas)
    write_into_file(os.path.join(data_path, 'valid.pivot'), valid_datas)

    jsonl.dumps(train_datas,
                open(os.path.join(data_path, 'train.pivot.jsonl'), 'w'))
    jsonl.dumps(test_datas,
                open(os.path.join(data_path, 'test.pivot.jsonl'), 'w'))
    jsonl.dumps(valid_datas,
                open(os.path.join(data_path, 'valid.pivot.jsonl'), 'w'))
Code example #9
    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ',
          sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))

    return datas


if __name__ == '__main__':
    indexes = get_partion_index(join('train.jsonl'), 10000)
    write_into_file(join('index.txt'), indexes)

    train_t2p_datas = construct_table2pivot(join('train.pivot.jsonl'), indexes)
    test_t2p_datas = construct_table2pivot(join('test.pivot.jsonl'))
    valid_t2p_datas = construct_table2pivot(join('valid.pivot.jsonl'))

    train_p2t_datas = construct_pivot2text(join('train.pivot.jsonl'), indexes)
    test_p2t_datas = construct_pivot2text(join('test.pivot.jsonl'))
    valid_p2t_datas = construct_pivot2text(join('valid.pivot.jsonl'))

    dumps(train_t2p_datas, open(os.path.join(data_path, 'train.t2p.jsonl'),
                                'w'))
    dumps(test_t2p_datas, open(os.path.join(data_path, 'test.t2p.jsonl'), 'w'))
    dumps(valid_t2p_datas, open(os.path.join(data_path, 'valid.t2p.jsonl'),
                                'w'))

    dumps(train_p2t_datas, open(os.path.join(data_path, 'train.p2t.jsonl'),
                                'w'))
    dumps(test_p2t_datas, open(os.path.join(data_path, 'test.p2t.jsonl'), 'w'))
    dumps(valid_p2t_datas, open(os.path.join(data_path, 'valid.p2t.jsonl'),
                                'w'))