Ejemplo n.º 1
0
def _format_to_bert(params, sent_count=5):
    json_file, args, save_file = params
    if (os.path.exists(save_file)):
        logger.info('Ignore %s' % save_file)
        return

    bert = BertData(args)

    logger.info('Processing %s' % json_file)
    jobs = json.load(open(json_file))
    datasets = []
    for d in jobs:
        doc_id, source, tgt = d['docId'], d['src'], d['tgt']
        if (args.oracle_mode == 'greedy'):
            oracle_ids = greedy_selection(source, tgt, sent_count)
        elif (args.oracle_mode == 'combination'):
            oracle_ids = combination_selection(source, tgt, sent_count)
        #print(oracle_ids)
        b_data = bert.preprocess(source, tgt, oracle_ids)
        if (b_data is None):
            continue
        indexed_tokens, labels, segments_ids, cls_ids, src_txt, tgt_txt = b_data
        #print(labels)
        b_data_dict = {
            "doc_id": doc_id,
            "src": indexed_tokens,
            "labels": labels,
            "segs": segments_ids,
            'clss': cls_ids,
            'src_txt': src_txt,
            "tgt_txt": tgt_txt
        }
        datasets.append(b_data_dict)
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    datasets = []
    gc.collect()