Example #1
# Snippet context (assumed): codecs, OrderedDict and fastNLP's DataSet/Const are
# imported earlier, and json_file_iter, char_vocab, target_vocab, predictor,
# remove_blank and label_link_dict are built by the surrounding function.
with codecs.open(predict_output_json_file_name, mode='w',
                 encoding='utf8') as fw_json, \
        codecs.open(predict_output_file_name, mode='w',
                    encoding='utf8') as fw:
    for i, row_json in enumerate(json_file_iter):
        if i % 100 == 0:
            logger.info('predict row:{}'.format(i))
        sentence = row_json.get('sentence', '')
        keywords = row_json.get('keywords', '')
        text = remove_blank('{}{}'.format(sentence, keywords))
        input_data = []
        test_data = [list(text)]
        # logger.info('test_data len:{}'.format(len(test_data)))
        # Preprocess the input into a single-row fastNLP DataSet.
        dataset = DataSet({Const.INPUT: test_data})
        dataset.add_seq_len(field_name=Const.INPUT)
        dataset.set_input(Const.INPUT, Const.INPUT_LEN)
        char_vocab.index_dataset(dataset, field_name=Const.INPUT)
        # logger.info('dataset after preprocessing:\n{}'.format(dataset[:5]))
        # features = [Const.INPUT, Const.INPUT_LEN]
        batch_output = predictor.predict(
            data=dataset, seq_len_field_name=Const.INPUT_LEN)
        # logger.info('batch_output : {}'.format(batch_output))
        pred_results = batch_output.get('pred')
        # logger.info('pred results:{}'.format(pred_results[:5]))
        label_id = pred_results[0]
        label_desc = target_vocab.to_word(label_id)
        # Assemble the prediction into the required output format.
        row_data = OrderedDict()
        row_data['id'] = row_json['id']
        row_data['label'] = label_link_dict[label_desc]
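The example is truncated before the assembled row is written out. Below is a minimal sketch of a helper that could persist one row to the two open handles, assuming fw_json takes one JSON object per line and fw a plain id/label pair; the helper name and the output layout are illustrative, not taken from the original project.

import json

def write_row(row_data, fw_json, fw):
    # Hypothetical helper: serialize the assembled row to both output files.
    fw_json.write(json.dumps(row_data, ensure_ascii=False) + '\n')
    fw.write('{}\t{}\n'.format(row_data['id'], row_data['label']))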
Example #2
print(
    f"In total {len(target_word2bpes)} target words, {len(train_word2bpes)} training words."
)

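# Read the pad id and the lg_dict / lg_shift / train_lg_shift attributes cached on the data bundle.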
pad_id = data_bundle.pad_id
lg_dict = getattr(data_bundle, 'lg_dict')
lg_shifts = getattr(data_bundle, 'lg_shift')
train_lg_shifts = getattr(data_bundle, 'train_lg_shift')

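# Merge every split whose name contains 'train' into a single DataSet.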
train_data = DataSet()
for name, ds in data_bundle.iter_datasets():
    if 'train' in name:
        for ins in ds:
            train_data.append(ins)

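# Add sequence lengths, mark the input/target fields, and pad 'input' with pad_id.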
train_data.add_seq_len('input')
train_data.set_input('input', 'language_ids')
train_data.set_target('target')
train_data.set_pad_val('input', pad_id)

clip_max_length(train_data, data_bundle, max_sent_len=50)

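# Build the project's joint BERT reverse-dictionary model for three languages.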
model = JointBertReverseDict(pre_name,
                             train_word2bpes,
                             target_word2bpes,
                             pad_id=pad_id,
                             num_languages=3)

if torch.cuda.is_available():
    model.cuda()
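Neither example shows the training step itself. Below is a minimal sketch of how training could be launched with fastNLP's generic Trainer, assuming the model follows the usual fastNLP convention of returning a dict with a 'pred' key; the loss, optimizer and hyperparameters here are placeholders, not the project's actual choices.

import torch
from torch import optim
from fastNLP import Trainer, CrossEntropyLoss

# Illustrative training setup under the assumptions stated above.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
trainer = Trainer(train_data=train_data,
                  model=model,
                  optimizer=optimizer,
                  loss=CrossEntropyLoss(pred='pred', target='target'),
                  batch_size=32,
                  n_epochs=5,
                  device=0 if torch.cuda.is_available() else None)
trainer.train()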