def evaluate(args):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    model = BertAbs.from_pretrained("remi/bertabs-finetuned-extractive-abstractive-summarization")
    model.to(args.device)
    model.eval()

    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
    }

    if args.compute_rouge:
        reference_summaries = []
        generated_summaries = []

        import nltk
        import rouge

        nltk.download("punkt")
        rouge_evaluator = rouge.Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=True,
            length_limit=args.beam_size,
            length_limit_type="words",
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True,
        )

    # these (unused) arguments are defined to keep the compatibility
    # with the legacy code and will be deleted in a next iteration.
    args.result_path = ""
    args.temp_dir = ""

    data_iterator = build_data_iterator(args, tokenizer)
    predictor = build_predictor(args, tokenizer, symbols, model)

    logger.info("***** Running evaluation *****")
    logger.info(" Number examples = %d", len(data_iterator.dataset))
    logger.info(" Batch size = %d", args.batch_size)
    logger.info("")
    logger.info("***** Beam Search parameters *****")
    logger.info(" Beam size = %d", args.beam_size)
    logger.info(" Minimum length = %d", args.min_length)
    logger.info(" Maximum length = %d", args.max_length)
    logger.info(" Alpha (length penalty) = %.2f", args.alpha)
    logger.info(" Trigrams %s be blocked", ("will" if args.block_trigram else "will NOT"))

    for batch in tqdm(data_iterator):
        batch_data = predictor.translate_batch(batch)
        translations = predictor.from_batch(batch_data)
        summaries = [format_summary(t) for t in translations]
        save_summaries(summaries, args.summaries_output_dir, batch.document_names)

        if args.compute_rouge:
            reference_summaries += batch.tgt_str
            generated_summaries += summaries

    if args.compute_rouge:
        scores = rouge_evaluator.get_scores(generated_summaries, reference_summaries)
        str_scores = format_rouge_scores(scores)
        save_rouge_scores(str_scores)
        print(str_scores)
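# Hypothetical usage sketch (an addition, not part of the original script): evaluate()
# above only reads attributes off `args`, so it can be driven with a plain
# argparse.Namespace. The field names and values below are assumptions inferred from
# the attributes the function accesses (device, batch_size, beam_size, min_length,
# max_length, alpha, block_trigram, compute_rouge, summaries_output_dir); the input
# directory name and the importability of evaluate() and its helpers from the bertabs
# example code are likewise assumed.
from argparse import Namespace

import torch

if __name__ == "__main__":
    args = Namespace(
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        batch_size=4,
        beam_size=5,
        min_length=50,
        max_length=200,
        alpha=0.95,            # length penalty used by beam search
        block_trigram=True,    # block repeated trigrams during decoding
        compute_rouge=True,
        documents_dir="./cnn_stories",           # assumed input directory for build_data_iterator
        summaries_output_dir="./summaries_out",  # where save_summaries() writes files
    )
    evaluate(args)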
def evaluate(args):
    # load tokenizer and model from a local fine-tuned checkpoint directory
    # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    tokenizer = BertTokenizer.from_pretrained("/data/wanyao/ghproj_d/transformers/summarization/")
    # sys.exit()
    print('load model...')
    # config = BertAbsConfig.from_json_file('/data/wanyao/ghproj_d/transformers/summarization/config.json')
    # model = BertAbs.from_pretrained("/data/wanyao/ghproj_d/transformers/summarization/", config=config)
    model = BertAbs.from_pretrained("/data/wanyao/ghproj_d/transformers/summarization/")
    # model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
    model.to(args.device)
    model.eval()

    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
    }

    if args.compute_rouge:
        reference_summaries = []
        generated_summaries = []

        import rouge
        import nltk

        # nltk.download("punkt")
        rouge_evaluator = rouge.Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=True,
            length_limit=args.beam_size,
            length_limit_type="words",
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True,
        )

    # these (unused) arguments are defined to keep the compatibility
    # with the legacy code and will be deleted in a next iteration.
    args.result_path = ""
    args.temp_dir = ""

    data_iterator = build_data_iterator(args, tokenizer)
    predictor = build_predictor(args, tokenizer, symbols, model)

    logger.info("***** Running evaluation *****")
    logger.info(" Number examples = %d", len(data_iterator.dataset))
    logger.info(" Batch size = %d", args.batch_size)
    logger.info("")
    logger.info("***** Beam Search parameters *****")
    logger.info(" Beam size = %d", args.beam_size)
    logger.info(" Minimum length = %d", args.min_length)
    logger.info(" Maximum length = %d", args.max_length)
    logger.info(" Alpha (length penalty) = %.2f", args.alpha)
    logger.info(" Trigrams %s be blocked", ("will" if args.block_trigram else "will NOT"))

    iterator = 0
    for batch in data_iterator:
        # print('batch-: ', batch)
        print('batch-src: ', batch.src.size())
        print(batch.src)
        batch_data = predictor.translate_batch(batch)
        translations = predictor.from_batch(batch_data)
        summaries = [format_summary(t) for t in translations]
        save_summaries(summaries, args.summaries_output_dir, batch.document_names)

        if args.compute_rouge:
            reference_summaries += batch.tgt_str
            generated_summaries += summaries

        logging.info('iterator: {}'.format(iterator))
        iterator += 1
        if iterator >= 1:
            # debug: stop after the first batch
            break

    if args.compute_rouge:
        print('generated_summaries: ', generated_summaries)
        print('reference_summaries: ', reference_summaries)
        scores = rouge_evaluator.get_scores(generated_summaries, reference_summaries)
        str_scores = format_rouge_scores(scores)
        save_rouge_scores(str_scores)
        print(str_scores)
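# Minimal standalone sketch (an addition, not from the original snippet): shows how the
# `symbols` dict above resolves to concrete token ids. BertAbs repurposes unused slots of
# the BERT vocabulary as decoder BOS/EOS markers. This uses the public bert-base-uncased
# vocabulary instead of the local checkpoint path referenced above.
from transformers import BertTokenizer

if __name__ == "__main__":
    tok = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    symbols = {
        "BOS": tok.vocab["[unused0]"],
        "EOS": tok.vocab["[unused1]"],
        "PAD": tok.vocab["[PAD]"],
    }
    # prints the integer ids used for the three special symbols
    print(symbols)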
# df = df.drop_duplicates()
df = df.drop_duplicates(subset="abstract", keep="first")
# drop NaNs
df = df.dropna()
# convert abstracts to lowercase
df["abstract"] = df["abstract"].str.lower()
# show the shape and first 5 rows of the new dataframe
print(df.shape)
df.head()

# Make model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
model.to(args['device'])
model.eval()

symbols = {
    "BOS": tokenizer.vocab["[unused0]"],
    "EOS": tokenizer.vocab["[unused1]"],
    "PAD": tokenizer.vocab["[PAD]"],
}

data_iterator = build_data_iterator(args, tokenizer)
predictor = build_predictor(args, tokenizer, symbols, model)

if args['compute_rouge']:
    reference_summaries = []
    generated_summaries = []
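# Self-contained illustration (an addition) of the dataframe clean-up performed above,
# run on a tiny synthetic dataframe. The "abstract" column name matches the snippet;
# everything else is made up for the example.
import pandas as pd

if __name__ == "__main__":
    df = pd.DataFrame(
        {
            "title": ["A", "B", "B-dup", "C"],
            "abstract": ["First ABSTRACT.", "Second abstract.", "Second abstract.", None],
        }
    )
    df = df.drop_duplicates(subset="abstract", keep="first")  # keep the first of duplicate abstracts
    df = df.dropna()                                          # drop rows with missing values
    df["abstract"] = df["abstract"].str.lower()               # lowercase to match the uncased tokenizer
    print(df.shape)
    print(df.head())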
def evaluate(args):
    # the tokenizer stores the model's vocabulary and provides methods for encoding/decoding
    # strings into lists of token indices to be fed to the model
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    # load the bertabs-finetuned-cnndm extractive/abstractive summarization checkpoint
    model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
    model.to(args.device)
    model.eval()

    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
    }

    if args.compute_rouge:
        reference_summaries = []
        generated_summaries = []

        import rouge
        import nltk

        nltk.download("punkt")
        # creates the ROUGE evaluator used for model evaluation
        rouge_evaluator = rouge.Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=True,
            length_limit=args.beam_size,
            length_limit_type="words",
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True,
        )

    # these (unused) arguments are defined to keep the compatibility
    # with the legacy code and will be deleted in a next iteration.
    args.result_path = ""
    args.temp_dir = ""

    # builds an iterator over batches of encoded documents
    data_iterator = build_data_iterator(args, tokenizer)
    # builds the beam-search translator, which scores candidate summaries with the
    # GNMTGlobalScorer class in modeling_bertabs.py
    predictor = build_predictor(args, tokenizer, symbols, model)

    # log the evaluation setup and the beam-search parameters
    logger.info("***** Running evaluation *****")
    logger.info(" Number examples = %d", len(data_iterator.dataset))
    logger.info(" Batch size = %d", args.batch_size)
    logger.info("")
    logger.info("***** Beam Search parameters *****")
    logger.info(" Beam size = %d", args.beam_size)
    logger.info(" Minimum length = %d", args.min_length)
    logger.info(" Maximum length = %d", args.max_length)
    logger.info(" Alpha (length penalty) = %.2f", args.alpha)
    logger.info(" Trigrams %s be blocked", ("will" if args.block_trigram else "will NOT"))

    for batch in tqdm(data_iterator):
        # generate summaries for one batch of data
        batch_data = predictor.translate_batch(batch)
        # convert the raw beam-search output into per-document translations
        translations = predictor.from_batch(batch_data)
        # transform the output of `from_batch` into nicely formatted summaries
        summaries = [format_summary(t) for t in translations]
        save_summaries(summaries, args.summaries_output_dir, batch.document_names)

        # collect reference and generated summaries for ROUGE scoring
        if args.compute_rouge:
            reference_summaries += batch.tgt_str
            generated_summaries += summaries

    # compute ROUGE scores for generated vs. reference summaries
    if args.compute_rouge:
        scores = rouge_evaluator.get_scores(generated_summaries, reference_summaries)
        str_scores = format_rouge_scores(scores)
        save_rouge_scores(str_scores)
        print(str_scores)
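# Hedged sketch (an addition): the same py-rouge evaluator configuration as above, run on a
# toy hypothesis/reference pair. With apply_avg=True, get_scores() returns a dict keyed by
# metric ("rouge-1", "rouge-2", "rouge-l") holding precision/recall/F1 values. The fixed
# length_limit=10 stands in for the args.beam_size value used in the function.
import rouge

if __name__ == "__main__":
    evaluator = rouge.Rouge(
        metrics=["rouge-n", "rouge-l"],
        max_n=2,
        limit_length=True,
        length_limit=10,
        length_limit_type="words",
        apply_avg=True,
        apply_best=False,
        alpha=0.5,
        weight_factor=1.2,
        stemming=True,
    )
    generated = ["the cat sat on the mat"]
    reference = ["a cat was sitting on the mat"]
    print(evaluator.get_scores(generated, reference))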