def _format_to_bert(params, sent_count=5):
    """Convert one JSON file of documents into a saved list of BERT records.

    The work is skipped entirely when ``save_file`` already exists, so a
    partially completed run can be restarted without redoing finished files.

    Args:
        params: A 3-item sequence ``(json_file, args, save_file)``.
            ``json_file`` is the path of the JSON input, which must decode
            to an iterable of dicts with the keys ``'docId'``, ``'src'`` and
            ``'tgt'``. ``args`` is handed to ``BertData`` and must expose
            ``oracle_mode`` (``'greedy'`` or ``'combination'``).
            ``save_file`` is the path written with ``torch.save``.
        sent_count: Forwarded as the third argument to the oracle selection
            function (presumably the number of sentences to select --
            confirm against ``greedy_selection``/``combination_selection``).

    Returns:
        None. The result is written to ``save_file``.

    Raises:
        ValueError: If ``args.oracle_mode`` is not a supported mode.
    """
    json_file, args, save_file = params
    if os.path.exists(save_file):
        logger.info('Ignore %s' % save_file)
        return

    # Resolve the selection strategy once, up front. Previously an unknown
    # mode left ``oracle_ids`` unbound and failed with an opaque
    # UnboundLocalError on the first document.
    if args.oracle_mode == 'greedy':
        select_oracle = greedy_selection
    elif args.oracle_mode == 'combination':
        select_oracle = combination_selection
    else:
        raise ValueError(
            "Unsupported oracle_mode %r (expected 'greedy' or 'combination')"
            % (args.oracle_mode,)
        )

    bert = BertData(args)

    logger.info('Processing %s' % json_file)
    # Context manager guarantees the input handle is closed even if the
    # JSON is malformed.
    with open(json_file) as json_in:
        jobs = json.load(json_in)

    datasets = []
    for d in jobs:
        doc_id, source, tgt = d['docId'], d['src'], d['tgt']
        oracle_ids = select_oracle(source, tgt, sent_count)
        b_data = bert.preprocess(source, tgt, oracle_ids)
        if b_data is None:
            # preprocess() returned nothing for this document; skip it.
            continue
        indexed_tokens, labels, segments_ids, cls_ids, src_txt, tgt_txt = b_data
        datasets.append({
            "doc_id": doc_id,
            "src": indexed_tokens,
            "labels": labels,
            "segs": segments_ids,
            'clss': cls_ids,
            'src_txt': src_txt,
            "tgt_txt": tgt_txt,
        })

    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    # Drop the reference before collecting so the records can actually be
    # reclaimed by the explicit gc pass.
    datasets = []
    gc.collect()