import argparse
import io
import os
import time

# Repo-local helpers (data_builder, data_loader, load_dataset, build_trainer,
# init_logger) are assumed to be importable; exact module paths depend on the
# project layout.


def do_tokenize(args):
    # time.clock() was deprecated and removed in Python 3.8;
    # time.perf_counter() is the recommended replacement for timing.
    print(time.perf_counter())
    data_builder.tokenize(args)
    print(time.perf_counter())
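# str2bool, used by the -lower flags below, is assumed to come from this
# repo's argument utilities; a minimal sketch of the conventional
# implementation, in case it is not already in scope:
def str2bool(v):
    # Map common truthy/falsy strings to bool for argparse flags.
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')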
parser.add_argument('-max_nsents', default=100, type=int)
parser.add_argument('-min_src_ntokens', default=5, type=int)
parser.add_argument('-max_src_ntokens', default=200, type=int)
parser.add_argument("-lower", type=str2bool, nargs='?', const=True, default=True)
parser.add_argument('-log_file', default='../logs/cnndm.log')
parser.add_argument('-dataset', default='',
                    help='train, valid or test; the default processes all datasets')
parser.add_argument('-n_cpus', default=2, type=int)
args = parser.parse_args()
data_builder.tokenize(args)

# FORMAT TO LINES
# merged_stories_tokenized -> my_json_data
parser = argparse.ArgumentParser()
parser.add_argument("-mode", default='', type=str,
                    help='format_to_lines or format_to_bert')
parser.add_argument("-oracle_mode", default='greedy', type=str,
                    help='how to generate oracle summaries: greedy or combination; '
                         'combination produces more accurate oracles but takes much longer')
# parser.add_argument("-train_tgt_path", default='data/train-large/summary_t.txt')
# parser.add_argument("-valid_src_path", default='data/train-small/short_text_t.txt')
# parser.add_argument("-valid_tgt_path", default='data/train-small/summary_t.txt')
# parser.add_argument("-test_src_path", default='data/test/short_text_t.txt')
# parser.add_argument("-test_tgt_path", default='data/test/summary_t.txt')
parser.add_argument("-train_src_path", default='small_data/train/short_text.txt')
parser.add_argument("-train_tgt_path", default='small_data/train/summary.txt')
parser.add_argument("-valid_src_path", default='small_data/train/short_text.txt')
parser.add_argument("-valid_tgt_path", default='small_data/train/summary.txt')
parser.add_argument("-test_src_path", default='small_data/test/short_text.txt')
parser.add_argument("-test_tgt_path", default='small_data/test/summary.txt')
parser.add_argument('-min_nsents', default=3, type=int)
parser.add_argument('-max_nsents', default=100, type=int)
parser.add_argument('-min_src_ntokens', default=0, type=int)
parser.add_argument('-max_src_ntokens', default=200, type=int)
parser.add_argument("-lower", type=str2bool, nargs='?', const=True, default=True)
parser.add_argument('-log_file', default='')
parser.add_argument('-dataset', nargs='+', default=['train', 'valid', 'test'],
                    help='train, valid or test; the default processes all datasets')
parser.add_argument('-n_cpus', default=2, type=int)
args = parser.parse_args()
init_logger(args.log_file)
result = data_builder.tokenize(args)
data_builder.format_to_bert(args, result)
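# Hypothetical invocation of the fragment above (the script name is an
# assumption; the flags are those registered on the parser). Because -dataset
# uses nargs='+', it accepts one or more splits:
#
#   python preprocess.py -dataset train valid -n_cpus 4 -log_file logs/prep.log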
def summarize(text):
    # Wrap the raw text in the CNN/DM .story format; "tim" is a throwaway
    # @highlight target so the preprocessing pipeline has a reference summary.
    with io.open('../raw_stories/test.story', 'w', encoding="utf8") as file:
        file.write(text.strip() + "\n\n@highlight\n\n" + "tim")

    # TOKENIZE
    # raw_stories -> merged_stories_tokenized
    parser = argparse.ArgumentParser()
    parser.add_argument("-mode", default='', type=str,
                        help='format_to_lines or format_to_bert')
    parser.add_argument("-oracle_mode", default='greedy', type=str,
                        help='how to generate oracle summaries: greedy or combination; '
                             'combination produces more accurate oracles but takes much longer')
    parser.add_argument("-map_path", default='../data/')
    parser.add_argument("-raw_path", default='../raw_stories/')
    parser.add_argument("-save_path", default='../merged_stories_tokenized/')
    parser.add_argument("-shard_size", default=2000, type=int)
    parser.add_argument('-min_nsents', default=3, type=int)
    parser.add_argument('-max_nsents', default=100, type=int)
    parser.add_argument('-min_src_ntokens', default=5, type=int)
    parser.add_argument('-max_src_ntokens', default=200, type=int)
    parser.add_argument("-lower", type=str2bool, nargs='?', const=True, default=True)
    parser.add_argument('-log_file', default='../logs/cnndm.log')
    parser.add_argument('-dataset', default='',
                        help='train, valid or test; the default processes all datasets')
    parser.add_argument('-n_cpus', default=2, type=int)
    args = parser.parse_args()
    data_builder.tokenize(args)

    # FORMAT TO LINES
    # merged_stories_tokenized -> my_json_data
    parser = argparse.ArgumentParser()
    parser.add_argument("-mode", default='', type=str,
                        help='format_to_lines or format_to_bert')
    parser.add_argument("-oracle_mode", default='greedy', type=str,
                        help='how to generate oracle summaries: greedy or combination; '
                             'combination produces more accurate oracles but takes much longer')
    parser.add_argument("-map_path", default='../data/')
    parser.add_argument("-raw_path", default='../merged_stories_tokenized/')
    parser.add_argument("-save_path", default='../my_json_data/')
    parser.add_argument("-shard_size", default=2000, type=int)
    parser.add_argument('-min_nsents', default=3, type=int)
    parser.add_argument('-max_nsents', default=100, type=int)
    parser.add_argument('-min_src_ntokens', default=5, type=int)
    parser.add_argument('-max_src_ntokens', default=200, type=int)
    parser.add_argument("-lower", type=str2bool, nargs='?', const=True, default=True)
    parser.add_argument('-log_file', default='../logs/cnndm.log')
    parser.add_argument('-dataset', default='',
                        help='train, valid or test; the default processes all datasets')
    parser.add_argument('-n_cpus', default=2, type=int)
    args = parser.parse_args()
    data_builder.format_to_lines_only_test(args)

    # FORMAT TO BERT
    # my_json_data -> bert_data_final
    parser = argparse.ArgumentParser()
    parser.add_argument("-mode", default='', type=str,
                        help='format_to_lines or format_to_bert')
    parser.add_argument("-oracle_mode", default='greedy', type=str,
                        help='how to generate oracle summaries: greedy or combination; '
                             'combination produces more accurate oracles but takes much longer')
    parser.add_argument("-map_path", default='../data/')
    parser.add_argument("-raw_path", default='../my_json_data/')
    parser.add_argument("-save_path", default='../bert_data_final/')
    parser.add_argument("-shard_size", default=2000, type=int)
    parser.add_argument('-min_nsents', default=3, type=int)
    parser.add_argument('-max_nsents', default=100, type=int)
    parser.add_argument('-min_src_ntokens', default=5, type=int)
    parser.add_argument('-max_src_ntokens', default=200, type=int)
    parser.add_argument("-lower", type=str2bool, nargs='?', const=True, default=True)
    parser.add_argument('-log_file', default='../../logs/preprocess.log')
    parser.add_argument('-dataset', default='test',
                        help='train, valid or test; the default processes all datasets')
    parser.add_argument('-n_cpus', default=2, type=int)
    args = parser.parse_args()
    data_builder.format_to_bert(args)

    # GENERATE SUMMARY
    # model_args, device, device_id, model, and step are module-level globals
    # set up when the trained model is loaded.
    test_iter = data_loader.Dataloader(model_args,
                                       load_dataset(model_args, 'test', shuffle=False),
                                       model_args.batch_size, device,
                                       shuffle=False, is_test=True)
    trainer = build_trainer(model_args, device_id, model, None)
    result_string = trainer.test(test_iter, step)

    # Clean up the intermediate artifacts produced for this single story.
    os.remove("../raw_stories/test.story")
    os.remove("../merged_stories_tokenized/test.story.json")
    os.remove("../my_json_data/test.0.json")
    os.remove("../bert_data_final/test.0.bert.pt")
    return result_string
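# A minimal usage sketch for summarize() (the article text is a placeholder).
# The function writes a temporary .story file, runs the tokenize ->
# format_to_lines -> format_to_bert stages, decodes with the loaded model, and
# removes its intermediate files before returning the summary string.
if __name__ == '__main__':
    article = ("The quick brown fox jumped over the lazy dog. "
               "It then ran into the forest and was never seen again.")
    print(summarize(article))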