Example #1
def construct_noise_corpus(path: str, noise_prob: float) -> List[Dict]:
    '''
    Build a denoising corpus: each source is a noised copy of the original
    source string, the target is unchanged. Element 0 of the returned list
    is a statistics header.
    '''

    ori_datas = loads(open(path))[1:]

    statistic = {'length': len(ori_datas)}
    datas = [statistic]

    count = 0
    src_len, tgt_len = [], []

    for d in ori_datas:
        data = {
            'source': add_noise(d['source'], noise_prob),
            'target': d['target']
        }
        datas.append(data)
        src_len.append(len(data['source'].split(' ')))
        tgt_len.append(len(data['target'].split(' ')))
        count += 1
        if count % 100000 == 0:
            print(count)

    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ',
          sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))

    return datas
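
All of the snippets on this page assume from typing import Dict, List at module level, plus a few project helpers that are not shown: loads/dumps for the jsonlines-with-header format and, here, add_noise. A minimal sketch of those helpers, purely as an assumption so the example is self-contained (the real implementations may differ):

import json
import random
from typing import Dict, IO, List

def loads(f: IO) -> List[Dict]:
    # Hypothetical reader: one JSON object per line, element 0 being the
    # {'length': N} statistics header that every caller strips with [1:].
    return [json.loads(line) for line in f]

def dumps(datas: List[Dict], f: IO) -> None:
    # Hypothetical counterpart: write one JSON object per line.
    f.write('\n'.join(json.dumps(d, ensure_ascii=False) for d in datas))

def add_noise(sentence: str, noise_prob: float) -> str:
    # Hypothetical word dropout: drop each token with probability
    # noise_prob; the real project may permute or replace tokens instead.
    words = [w for w in sentence.split(' ') if random.random() >= noise_prob]
    return ' '.join(words) if words else sentence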
Example #2
def construct_table2pivot(path: str, index: List[int] = None) -> List[Dict]:
    '''
    Build the table-to-pivot dataset, optionally restricted to the records
    selected by `index`. Element 0 of the returned list is a statistics
    header.
    '''

    ori_datas = loads(open(path))[1:]
    if index is not None:
        ori_datas = partion_list(ori_datas, index)

    statistic = {'length': len(ori_datas)}
    datas = [statistic]

    count = 0
    src_len, tgt_len = [], []

    for d in ori_datas:
        data = {
            'value': d['value'],
            'label': d['label'],
            'field': d['field'],
            'lpos': d['lpos'],
            'rpos': d['rpos']
        }
        datas.append(data)
        src_len.append(len(data['value'].split(' ')))
        tgt_len.append(len(data['label'].split(' ')))
        count += 1
        if count % 100000 == 0:
            print(count)

    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ',
          sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))

    return datas
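
construct_table2pivot (and construct_super_corpus in Example #4) delegate record selection to partion_list, which is defined elsewhere in the project. Judging from how it is called, it keeps the records at the given positions; a stand-in under that assumption:

from typing import Dict, List

def partion_list(datas: List[Dict], index: List[int]) -> List[Dict]:
    # Assumed behaviour: keep only the records at the positions in `index`,
    # preserving corpus order.
    chosen = set(index)
    return [d for i, d in enumerate(datas) if i in chosen]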
Example #3
def construct_pivot2text(path: str, index: List[int] = None) -> List[Dict]:
    '''
    Build the pivot-to-text dataset. Records whose position is in `index`
    (or all records, when `index` is None) use the annotated pivot as the
    source; the rest fall back to the entity string. Element 0 of the
    returned list is a statistics header.
    '''

    ori_datas = loads(open(path))[1:]
    if index is not None:
        index = set(index)

    statistic = {'length': len(ori_datas)}
    datas = [statistic]

    count = 0
    src_len, tgt_len = [], []

    for i, d in enumerate(ori_datas):
        if index is None or i in index:
            data = {'source': d['pivot'], 'target': d['text']}
        else:
            data = {'source': d['entity'], 'target': d['text']}
        datas.append(data)
        src_len.append(len(data['source'].split(' ')))
        tgt_len.append(len(data['target'].split(' ')))
        count += 1
        if count % 100000 == 0:
            print(count)

    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ',
          sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))

    return datas
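
A hypothetical call, using get_partion_index from Example #10 below (the path and sample size are placeholders):

index = get_partion_index('corpus.jsonl', 10000)
datas = construct_pivot2text('corpus.jsonl', index)
# datas[0] is the {'length': N} header; in datas[1:], only the sampled
# records expose the gold pivot as the source, the rest use the entity.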
Example #4
def construct_super_corpus(path: str, index: List[int] = None) -> List[Dict]:
    '''
    Wrap the raw records behind a statistics header, optionally restricted
    to the records selected by `index`.
    '''
    ori_datas = loads(open(path))[1:]
    if index is not None:
        ori_datas = partion_list(ori_datas, index)

    statistic = {'length': len(ori_datas)}
    datas = [statistic] + ori_datas

    return datas
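
With the previously undefined `index` name turned into an optional parameter, the function follows the same convention as the other constructors: element 0 of the returned list is always the {'length': N} header, so consumers skip it. For instance (hypothetical path):

datas = construct_super_corpus('corpus.jsonl')
assert datas[0] == {'length': len(datas) - 1}
records = datas[1:]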
Example #5
def test_p2t_dataset(r_path: str, w_path: str) -> None:
    '''
    Input record fields: value, text, field, lpos, rpos, pivot, entity.
    Writes pivot->text test pairs to `w_path`.
    '''
    ori_datas = loads(open(r_path))[1:]

    statistic = {'length': len(ori_datas)}
    datas = [statistic]

    for d in ori_datas:
        datas.append({'source': d['pivot'], 'target': d['text']})

    dumps(datas, open(w_path, 'w'))
Example #6
def train_p2t_dataset(r_path: str, w_path: str,
                      index: List[int]) -> None:
    '''
    Input record fields: value, text, field, lpos, rpos, pivot, entity.
    Records at the positions in `index` go through get_filter_data; the
    rest use the entity string as source. Writes the result to `w_path`.
    '''
    ori_datas = loads(open(r_path))[1:]

    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    index = set(index)

    for i, d in enumerate(ori_datas):
        if i in index:
            #datas.append({'source': d['pivot'], 'target': d['text']})
            datas.append(get_filter_data(d))
        else:
            datas.append({'source': d['entity'], 'target': d['text']})

    dumps(datas, open(w_path, 'w'))
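
get_filter_data is defined elsewhere in the project. The commented-out line above suggests it produces a pivot-to-text pair, so a stand-in under that assumption could be:

from typing import Dict

def get_filter_data(d: Dict) -> Dict:
    # Assumed behaviour, inferred from the commented-out line it replaced;
    # the real helper may additionally filter or normalise tokens.
    return {'source': d['pivot'], 'target': d['text']}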
Example #7
    def print_result_into_file(self, model_outputs: Dict, dataset: Dataset):
        '''
        Align the per-token model outputs with the source annotations, write
        the predicted words to a text file, and write the updated records to
        a jsonlines file.
        '''
        model_ids = model_outputs['output_ids']
        sources, model_words = [], []
        fields, lposs, rposs = [], [], []
        _fields, _lposs, _rposs = [], [], []

        for data in dataset.read():
            sources.append(data['value'].split(' '))
            fields.append(data['field'].split(' '))
            lposs.append(data['lpos'].split(' '))
            rposs.append(data['rpos'].split(' '))

        for ids, source, field, lpos, rpos in zip(model_ids, sources, fields,
                                                  lposs, rposs):
            # Keep a token and its aligned annotations only when the model
            # emitted a positive id for it.
            words = [s for token_id, s in zip(ids, source) if token_id > 0]
            _field = [s for token_id, s in zip(ids, field) if token_id > 0]
            _lpos = [s for token_id, s in zip(ids, lpos) if token_id > 0]
            _rpos = [s for token_id, s in zip(ids, rpos) if token_id > 0]
            model_words.append(' '.join(words))
            _fields.append(' '.join(_field))
            _lposs.append(' '.join(_lpos))
            _rposs.append(' '.join(_rpos))

        with open(
                os.path.join(self.data_path,
                             'predict-{0}.txt'.format(self.scale)), 'w') as f:
            print('\n'.join(model_words), file=f)

        ori_datas = loads(open(os.path.join(self.data_path, 'test.p2t.jsonl')))
        for m, d, f, l, r in zip(model_words, ori_datas[1:], _fields, _lposs,
                                 _rposs):
            d['source'] = m
            d['field'] = f
            d['lpos'] = l
            d['rpos'] = r

        dumps(
            ori_datas,
            open(
                os.path.join(self.data_path,
                             'test.predict.{0}.jsonl'.format(self.scale)),
                'w'))
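
The zip-and-filter idiom above keeps a token exactly when its output id is positive, and the field/lpos/rpos labels stay aligned because all four lists are filtered by the same mask. A toy illustration:

ids = [1, 0, 2, 0]
source = ['John', 'Smith', '1984', 'UK']
field = ['name', 'name', 'year', 'country']

kept_words = [s for token_id, s in zip(ids, source) if token_id > 0]
kept_fields = [s for token_id, s in zip(ids, field) if token_id > 0]
# kept_words  == ['John', '1984']
# kept_fields == ['name', 'year']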
Example #8
def test_t2p_dataset(r_path: str, w_path: str) -> None:
    '''
    Input record fields: value, label, field, lpos, rpos.
    Writes the table-to-pivot test records to `w_path`.
    '''
    ori_datas = loads(open(r_path))[1:]

    statistic = {'length': len(ori_datas)}
    datas = [statistic]

    for d in ori_datas:
        data = {
            'value': d['value'],
            'label': d['label'],
            'field': d['field'],
            'lpos': d['lpos'],
            'rpos': d['rpos']
        }
        datas.append(data)

    dumps(datas, open(w_path, 'w'))
Example #9
def test_parallel_dataset(r_path: str, w_path: str) -> None:
    '''
    source: value; target: text. The field/lpos/rpos annotations are kept
    alongside each pair. Writes the records to `w_path`.
    '''
    ori_datas = loads(open(r_path))[1:]

    statistic = {'length': len(ori_datas)}
    datas = [statistic]

    for d in ori_datas:
        data = {
            'source': d['value'],
            'target': d['text'],
            'field': d['field'],
            'lpos': d['lpos'],
            'rpos': d['rpos']
        }
        datas.append(data)

    dumps(datas, open(w_path, 'w'))
Example #10
def get_partion_index(path: str, limit: int) -> List[int]:
    '''
    Sample `limit` distinct record positions, uniformly at random, from the
    corpus at `path`.
    '''
    ori_datas = loads(open(path))[1:]
    index = list(range(len(ori_datas)))
    random.shuffle(index)
    return index[:limit]
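
Taken together, a plausible end-to-end use of these helpers (file names and the sample size are placeholders):

# Sample 50k record positions, then build train/test files so that the
# sampled training records use the gold pivot and the rest fall back to
# the entity string.
index = get_partion_index('train.jsonl', 50000)
train_p2t_dataset('train.jsonl', 'train.p2t.jsonl', index)
test_p2t_dataset('test.jsonl', 'test.p2t.jsonl')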