Example #1
def do_format_to_bert(args):
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    print(time.perf_counter())
    data_builder.format_to_bert(args)
    print(time.perf_counter())
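This helper assumes an args namespace carrying the preprocessing options. A minimal sketch of building one, using the field values that appear in Example #2 (all paths are illustrative):

from argparse import Namespace

# illustrative values only, mirroring the settings used in Example #2
args = Namespace(
    mode='format_to_bert',
    dataset='test',
    raw_path='./files/json',
    save_path='./files/bert.pt/',
    log_file='./files/logs/output.log',
    oracle_mode='greedy',
    map_path='./files/data/',
    shard_size=2000,
    min_nsents=3,
    max_nsents=100,
    min_src_ntokens=5,
    max_src_ntokens=200,
    lower=True,
    n_cpus=2,
)
do_format_to_bert(args)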
Example #2
def summarize(payload, num_sen=3):
    print(payload)
    payload = json.loads(payload)
    articles = payload.get("articles")

    log_file = './files/logs/output.log'
    model_path = './files/models/cnndm_bertsum_classifier_best.pt'
    results_path = './files/results/'
    json_path = './files/json'

    bert_data, all_sentences = sentence_splitter.get_articles_json(articles)

    with open(os.path.join(json_path, "test.1.json"), 'w') as f:
        json.dump(bert_data, f)


    # Format to BERT
    args = Namespace()
    args.dataset = 'test'
    args.raw_path = json_path
    args.save_path = './files/bert.pt/'
    args.log_file = log_file
    args.oracle_mode = 'greedy'
    args.map_path = './files/data/'
    args.shard_size = 2000
    args.min_nsents = 3
    args.max_nsents = 100
    args.min_src_ntokens = 5
    args.max_src_ntokens = 200
    args.lower = True
    args.n_cpus = 2

    data_builder.format_to_bert(args)
    # Rename the shard to the single-file name the data loader expects
    shutil.move("./files/bert.pt/test.1.bert.pt", "./files/bert.pt/.test.pt")

    # Get the predictions
    args = Namespace()
    args.encoder = 'classifier'
    args.mode = 'test'
    args.bert_data_path = './files/bert.pt/'
    args.model_path = './files/models/'
    args.result_path = results_path
    args.temp_dir = './temp'
    args.batch_size = 1000
    args.use_interval = True
    args.large = False
    args.hidden_size = 128
    args.ff_size = 512
    args.heads = 4
    args.inter_layers = 2
    args.rnn_size = 512
    args.param_init = 0
    args.param_init_glorot = True
    args.dropout = 0.1
    args.optim = 'adam'
    args.lr = 1
    args.beta1 = 0.9
    args.beta2 = 0.999
    args.decay_method = ''
    args.warmup_steps = 8000
    args.max_grad_norm = 0
    args.save_checkpoint_steps = 5
    args.accum_count = 1
    args.world_size = 1
    args.report_every = 1
    args.train_steps = 1000
    args.recall_eval = False
    args.visible_gpus = '-1'
    args.gpu_ranks = '0'
    args.log_file = log_file
    args.dataset = ''
    args.seed = 358
    args.test_all = False
    args.model_name = model_path
    args.train_from = ''
    args.report_rouge = True
    args.block_trigram = True
    args.num_sen = num_sen

    # parse the comma-separated gpu_ranks string and pin the visible CUDA devices
    args.gpu_ranks = [int(i) for i in args.gpu_ranks.split(',')]
    os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus

    #init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    device_id = 0 if device == "cuda" else -1

    cp = args.model_name
    #step = int(cp.split('.')[-2].split('_')[-1])
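    # the step value is hard-coded; it determines the name of the results
    # file read back below ("_step1000000.candidate")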
    step = 1000000

    test(args, device_id, cp, step)

    # Format the output. At this stage all summaries are lower-cased (hence
    # the '_lw' suffix); map each sentence back to its original casing.
    with open(os.path.join(results_path, "_step1000000.candidate"), 'r') as f:
        summaries = f.read()
    output = []
    for count, summary_lw in enumerate(summaries.splitlines()):
        sentences = all_sentences[count]
        for sentence_lw in summary_lw.split('<q>'):
            sentence = match_sentence(sentences, sentence_lw)
            output.append(sentence.strip())
    return {"output": output}
Example #3
    # parser.add_argument("-valid_src_path", default='data/train-small/short_text_t.txt')
    # parser.add_argument("-valid_tgt_path", default='data/train-small/summary_t.txt')
    # parser.add_argument("-test_src_path", default='data/test/short_text_t.txt')
    # parser.add_argument("-test_tgt_path", default='data/test/summary_t.txt')

    parser.add_argument("-train_src_path", default='small_data/train/short_text.txt')
    parser.add_argument("-train_tgt_path", default='small_data/train/summary.txt')
    parser.add_argument("-valid_src_path", default='small_data/train/short_text.txt')
    parser.add_argument("-valid_tgt_path", default='small_data/train/summary.txt')
    parser.add_argument("-test_src_path", default='small_data/test/short_text.txt')
    parser.add_argument("-test_tgt_path", default='small_data/test/summary.txt')

    parser.add_argument('-min_nsents', default=3, type=int)
    parser.add_argument('-max_nsents', default=100, type=int)
    parser.add_argument('-min_src_ntokens', default=0, type=int)
    parser.add_argument('-max_src_ntokens', default=200, type=int)

    parser.add_argument("-lower", type=str2bool, nargs='?', const=True, default=True)

    parser.add_argument('-log_file', default='')

    parser.add_argument('-dataset', nargs='+', default=['train', 'valid', 'test'], help='train, valid or test; default will process all datasets')

    parser.add_argument('-n_cpus', default=2, type=int)


    args = parser.parse_args()
    init_logger(args.log_file)
    result = data_builder.tokenize(args)
    data_builder.format_to_bert(args, result)
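str2bool is referenced by the -lower flag but not defined in these excerpts. A common shape for such an argparse helper is sketched below; the scripts here may use a different variant:

import argparse

def str2bool(v):
    # accept the usual truthy/falsy spellings on the command line
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')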
Example #4
    parser = argparse.ArgumentParser()
    parser.add_argument("-mode",
                        default='',
                        type=str,
                        help='format_to_lines or format_to_bert')
    parser.add_argument("-oracle_mode",
                        default='greedy',
                        type=str,
                        help='how to generate oracle summaries: greedy or combination; '
                             'combination generates more accurate oracles but takes much longer')
    parser.add_argument("-map_path", default='../data/')
    parser.add_argument("-raw_path", default='../my_json_data/')
    parser.add_argument("-save_path", default='../bert_data_final/')
    parser.add_argument("-shard_size", default=2000, type=int)
    parser.add_argument('-min_nsents', default=3, type=int)
    parser.add_argument('-max_nsents', default=100, type=int)
    parser.add_argument('-min_src_ntokens', default=5, type=int)
    parser.add_argument('-max_src_ntokens', default=200, type=int)
    parser.add_argument("-lower",
                        type=str2bool,
                        nargs='?',
                        const=True,
                        default=True)
    parser.add_argument('-log_file', default='../../logs/preprocess.log')
    parser.add_argument(
        '-dataset',
        default='test',
        help='train, valid or test; default will process all datasets')
    parser.add_argument('-n_cpus', default=2, type=int)
    args = parser.parse_args()
    data_builder.format_to_bert(args)
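The same parser can also be driven programmatically instead of from sys.argv, which is handy for testing. A minimal sketch, assuming the parser built above is in scope:

# all defaults, i.e. -dataset 'test' read from ../my_json_data/
args = parser.parse_args([])
# or override selected options explicitly
args = parser.parse_args(['-dataset', 'test', '-n_cpus', '4'])
data_builder.format_to_bert(args)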
Example #5
def summarize(text):
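    # write the input as a one-off CNN/DM-style story; "tim" is a dummy
    # @highlight target so the preprocessing pipeline has a summary to read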
    with io.open('../raw_stories/test.story', 'w', encoding="utf8") as file:
        file.write(text.strip() + "\n\n@highlight\n\n" + "tim")

    # TOKENIZE
    # raw_stories -> merged_stories_tokenized
    parser = argparse.ArgumentParser()
    parser.add_argument("-mode",
                        default='',
                        type=str,
                        help='format_to_lines or format_to_bert')
    parser.add_argument("-oracle_mode",
                        default='greedy',
                        type=str,
                        help='how to generate oracle summaries: greedy or combination; '
                             'combination generates more accurate oracles but takes much longer')
    parser.add_argument("-map_path", default='../data/')
    parser.add_argument("-raw_path", default='../raw_stories/')
    parser.add_argument("-save_path", default='../merged_stories_tokenized/')
    parser.add_argument("-shard_size", default=2000, type=int)
    parser.add_argument('-min_nsents', default=3, type=int)
    parser.add_argument('-max_nsents', default=100, type=int)
    parser.add_argument('-min_src_ntokens', default=5, type=int)
    parser.add_argument('-max_src_ntokens', default=200, type=int)
    parser.add_argument("-lower",
                        type=str2bool,
                        nargs='?',
                        const=True,
                        default=True)
    parser.add_argument('-log_file', default='../logs/cnndm.log')
    parser.add_argument(
        '-dataset',
        default='',
        help='train, valid or test; default will process all datasets')
    parser.add_argument('-n_cpus', default=2, type=int)
    args = parser.parse_args()
    data_builder.tokenize(args)

    # FORMAT TO LINES
    # merged_stories_tokenized -> my_json_data
    parser = argparse.ArgumentParser()
    parser.add_argument("-mode",
                        default='',
                        type=str,
                        help='format_to_lines or format_to_bert')
    parser.add_argument("-oracle_mode",
                        default='greedy',
                        type=str,
                        help='how to generate oracle summaries: greedy or combination; '
                             'combination generates more accurate oracles but takes much longer')
    parser.add_argument("-map_path", default='../data/')
    parser.add_argument("-raw_path", default='../merged_stories_tokenized/')
    parser.add_argument("-save_path", default='../my_json_data/')
    parser.add_argument("-shard_size", default=2000, type=int)
    parser.add_argument('-min_nsents', default=3, type=int)
    parser.add_argument('-max_nsents', default=100, type=int)
    parser.add_argument('-min_src_ntokens', default=5, type=int)
    parser.add_argument('-max_src_ntokens', default=200, type=int)
    parser.add_argument("-lower",
                        type=str2bool,
                        nargs='?',
                        const=True,
                        default=True)
    parser.add_argument('-log_file', default='../logs/cnndm.log')
    parser.add_argument(
        '-dataset',
        default='',
        help='train, valid or test; default will process all datasets')
    parser.add_argument('-n_cpus', default=2, type=int)
    args = parser.parse_args()
    data_builder.format_to_lines_only_test(args)

    # FORMAT TO BERT
    # my_json_data -> bert_data_final
    parser = argparse.ArgumentParser()
    parser.add_argument("-mode",
                        default='',
                        type=str,
                        help='format_to_lines or format_to_bert')
    parser.add_argument("-oracle_mode",
                        default='greedy',
                        type=str,
                        help='how to generate oracle summaries: greedy or combination; '
                             'combination generates more accurate oracles but takes much longer')
    parser.add_argument("-map_path", default='../data/')
    parser.add_argument("-raw_path", default='../my_json_data/')
    parser.add_argument("-save_path", default='../bert_data_final/')
    parser.add_argument("-shard_size", default=2000, type=int)
    parser.add_argument('-min_nsents', default=3, type=int)
    parser.add_argument('-max_nsents', default=100, type=int)
    parser.add_argument('-min_src_ntokens', default=5, type=int)
    parser.add_argument('-max_src_ntokens', default=200, type=int)
    parser.add_argument("-lower",
                        type=str2bool,
                        nargs='?',
                        const=True,
                        default=True)
    parser.add_argument('-log_file', default='../../logs/preprocess.log')
    parser.add_argument(
        '-dataset',
        default='test',
        help='train, valid or test; default will process all datasets')
    parser.add_argument('-n_cpus', default=2, type=int)
    args = parser.parse_args()
    data_builder.format_to_bert(args)

    # GENERATE SUMMARY
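    # model_args, model, device, device_id and step are assumed to be
    # initialized elsewhere (e.g. at module import time)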
    test_iter = data_loader.Dataloader(model_args,
                                       load_dataset(model_args,
                                                    'test',
                                                    shuffle=False),
                                       model_args.batch_size,
                                       device,
                                       shuffle=False,
                                       is_test=True)
    trainer = build_trainer(model_args, device_id, model, None)
    result_string = trainer.test(test_iter, step)

    os.remove("../raw_stories/test.story")
    os.remove("../merged_stories_tokenized/test.story.json")
    os.remove("../my_json_data/test.0.json")
    os.remove("../bert_data_final/test.0.bert.pt")

    return result_string
def do_format_to_bert(args):
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    print(time.perf_counter())
    args.mode = "format_to_bert"
    data_builder.format_to_bert(args)
    print(time.perf_counter())
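For reference, a minimal usage sketch for the summarize pipeline above (the input text is illustrative, and the ../raw_stories/, ../merged_stories_tokenized/, ../my_json_data/ and ../bert_data_final/ directories must already exist):

article = (
    "The quick brown fox jumped over the lazy dog. "
    "It was widely reported as the laziest dog in the county. "
) * 5
print(summarize(article))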