# NOTE(review): this line is a whitespace-collapsed FRAGMENT of a prediction
# loop and is truncated at both ends: it opens mid-argument-list (the leading
# `with codecs.open(` is outside this view) and the loop body continues past
# the end (row_data is assembled but never written here). It is left
# byte-identical below; it must be re-indented against the full file, not in
# isolation.
#
# What the visible code does (grounded in the tokens below):
#   - opens two UTF-8 output files (fw_json, fw) via codecs.open in 'w' mode;
#   - iterates json_file_iter, logging progress every 100 rows;
#   - concatenates each row's 'sentence' + 'keywords' and strips blanks via
#     remove_blank; wraps the characters as a single-sample fastNLP DataSet
#     under Const.INPUT, adds seq_len, marks input fields, and indexes the
#     characters through char_vocab;
#   - runs predictor.predict(...), takes batch_output['pred'][0] as the label
#     id, maps it to a word with target_vocab.to_word, and builds an
#     OrderedDict with the row's 'id' and label_link_dict[label_desc].
#   - `input_data = []` is assigned but not used in the visible span —
#     presumably used (or dead) further down; verify against the full loop body.
#
# WARNING: in this collapsed single-line form, everything from the first `#`
# token onward is parsed as one Python comment, so the trailing statements
# (dataset construction, predict call, row_data assembly) are dead as stored —
# the line is not runnable until newlines/indentation are restored.
predict_output_json_file_name, mode='w', encoding='utf8') as fw_json, codecs.open(predict_output_file_name, mode='w', encoding='utf8') as fw: for i, row_json in enumerate(json_file_iter): if i % 100 == 0: logger.info('predict row:{}'.format(i)) sentence = row_json.get('sentence', '') keywords = row_json.get('keywords', '') text = remove_blank('{}{}'.format(sentence, keywords)) input_data = [] test_data = [list(text)] # logger.info('test_data len:{}'.format(len(test_data))) # logger.warn('输入数据预处理') dataset = DataSet({Const.INPUT: test_data}) dataset.add_seq_len(field_name=Const.INPUT) dataset.set_input(Const.INPUT, Const.INPUT_LEN) char_vocab.index_dataset(dataset, field_name=Const.INPUT) # logger.info('处理后dataset:\n{}'.format(dataset[:5])) # features = [Const.INPUT, Const.INPUT_LEN] batch_output = predictor.predict( data=dataset, seq_len_field_name=Const.INPUT_LEN) # logger.info('batch_output : {}'.format(batch_output)) pred_results = batch_output.get('pred') # logger.info('pred results:{}'.format(pred_results[:5])) label_id = pred_results[0] label_desc = target_vocab.to_word(label_id) # 组装成所需格式 row_data = OrderedDict() row_data['id'] = row_json['id'] row_data['label'] = label_link_dict[label_desc]
# NOTE(review): this span arrived whitespace-collapsed onto one line; the
# statement *tokens* below are byte-identical to the original, only newlines,
# indentation, and comments have been added. The reconstruction assumes all
# statements sit at top level and that the four train_data.* configuration
# calls belong AFTER the aggregation loop (run once, not per-instance) — the
# conventional fastNLP setup order; confirm against the original file.

# Report vocabulary coverage before building the model.
print(
    f"In total {len(target_word2bpes)} target words, {len(train_word2bpes)} words."
)

# Unpack bundle-level metadata. pad_id is the BPE padding token id used both
# for field padding and by the model; the lg_* attributes are fetched but not
# used in this visible span — presumably consumed further down; verify.
pad_id = data_bundle.pad_id
lg_dict = getattr(data_bundle, 'lg_dict')
lg_shifts = getattr(data_bundle, 'lg_shift')
train_lg_shifts = getattr(data_bundle, 'train_lg_shift')

# Merge every dataset whose name contains 'train' (e.g. per-language splits)
# into a single training DataSet, instance by instance.
train_data = DataSet()
for name, ds in data_bundle.iter_datasets():
    if 'train' in name:
        for ins in ds:
            train_data.append(ins)

# Configure fastNLP fields on the merged set: sequence lengths for batching,
# which fields feed the model ('input', 'language_ids'), which is the target,
# and the pad value for the 'input' field.
train_data.add_seq_len('input')
train_data.set_input('input', 'language_ids')
train_data.set_target('target')
train_data.set_pad_val('input', pad_id)

# Truncate overly long sentences; max_sent_len=50 is a hard-coded cap here.
clip_max_length(train_data, data_bundle, max_sent_len=50)

# Build the joint multilingual reverse-dictionary model; num_languages=3 is
# hard-coded — presumably matches the languages in data_bundle; confirm.
model = JointBertReverseDict(pre_name, train_word2bpes, target_word2bpes,
                             pad_id=pad_id, num_languages=3)
if torch.cuda.is_available():
    model.cuda()