def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True,
                        help='Input raw text file.')
    parser.add_argument("--output_file", type=str, required=True,
                        help='Output MindRecord file.')
    parser.add_argument("--num_splits", type=int, default=1,
                        help='The number of partitions the MindRecord file will be split into.')
    parser.add_argument("--max_length", type=int, required=True,
                        help='Maximum sequence length.')
    parser.add_argument("--vocab_file", type=str, required=True,
                        help='url of gpt2-vocab.json')
    parser.add_argument("--merge_file", type=str, required=True,
                        help='url of gpt2-merges.txt')
    args = parser.parse_args()

    tokenizer = Tokenizer(vocab_file=args.vocab_file, merge_file=args.merge_file)

    input_file = args.input_file
    logging.info("***** Reading from input files *****")
    logging.info("Input File: %s", input_file)

    output_file = args.output_file
    logging.info("***** Writing to output files *****")
    logging.info("Output File: %s", output_file)

    writer = FileWriter(output_file, args.num_splits)
    data_schema = {"input_ids": {"type": "int64", "shape": [-1]},
                   "input_mask": {"type": "int64", "shape": [-1]},
                   "input_length": {"type": "int64", "shape": [-1]},
                   }
    writer.add_schema(data_schema, "lambada-schema")

    total_written = 0
    total_read = 0

    logging.info("***** Reading from %s *****", input_file)
    with open(input_file, "r") as f:
        while True:
            line = f.readline()
            if not line:
                break
            total_read += 1
            if total_read % 500 == 0:
                logging.info("%d ...", total_read)

            output = create_instance(tokenizer, line, args.max_length)
            features = write_instance_to_file(writer, instance=output)
            total_written += 1

            # log the first 20 instances for a quick sanity check
            if total_written <= 20:
                logging.info("***** Example *****")
                logging.info("input tokens: %s", tokenizer.decode(output["input_ids"][:-1]))
                logging.info("label tokens: %s", tokenizer.decode(output["input_ids"][1:]))

                for feature_name in features.keys():
                    feature = features[feature_name]
                    logging.info("%s: %s", feature_name, feature)

    writer.commit()
    logging.info("Wrote %d total instances", total_written)
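
# Script entry point. A minimal sketch, assuming this function lives in a standalone
# preprocessing script; the file name and argument values below are illustrative only:
#
#   python create_lambada_data.py --input_file=lambada_test.txt \
#       --output_file=lambada-test.mindrecord --num_splits=1 --max_length=1024 \
#       --vocab_file=gpt2-vocab.json --merge_file=gpt2-merges.txt
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()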
def clip_article(input_path, out_path, hint, max_length):
    """
    Clip the article so that each sample (article + summary) does not exceed max_length tokens.
    """
    tokenizer = Tokenizer()
    cnt = 0
    with open(input_path, "r") as r, open(out_path, "a+") as a:
        line = r.readline()
        while line:
            pos = line.rfind(hint)
            article = line[:pos]
            summary = line[pos:]
            if len(tokenizer.encode(line)) > max_length:
                # keep only as many article tokens as the summary leaves room for
                l_article = tokenizer.encode(article)[:max_length - len(tokenizer.encode(summary))]
                article = tokenizer.decode(l_article) + " "
            if cnt % 1000 == 0:
                print(article + summary)
                print("==============================")
            cnt += 1
            a.write(article + summary)
            line = r.readline()
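
# Usage sketch for clip_article (hedged; the paths, hint string, and max_length below are
# illustrative assumptions, not values from the source). Each input line is expected to be
# "article + hint + remainder"; the article part is truncated so that the encoded sample
# fits into max_length tokens:
#
#   clip_article(input_path="lambada_raw.txt",
#                out_path="lambada_clipped.txt",
#                hint=" ",
#                max_length=1024)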
def do_eval(dataset=None, network=None, metric=None, load_checkpoint_path="",
            eval_type=None, generate_length_Dynamically=True):
    """
    Do evaluation on the LAMBADA task.

    Args:
        dataset: the eval dataset.
        network: the network with loss.
        metric: the evaluation method, one of ["accuracy", "ppl"].
        load_checkpoint_path: the file path of the saved finetune model checkpoint.
        eval_type: "zero-shot" or "finetuned", deciding how checkpoint parameters are mapped.
        generate_length_Dynamically: whether to decide the generation length dynamically.
    """
    if load_checkpoint_path == "":
        raise ValueError("Finetune model is missing: the evaluation task must load a finetune model!")

    tokenizer = Tokenizer(vocab_file='./src/utils/pretrain-data/gpt2-vocab.json',
                          merge_file='./src/utils/pretrain-data/gpt2-merges.txt')

    if metric.lower() == "accuracy":
        print("Prepare to calculate the accuracy score ...")
        callback = LastWordAccuracy(smooth=False)
        gpt2_loss = network(config=gpt2_net_cfg,
                            is_training=False,
                            use_one_hot_embeddings=False)
        gpt2_loss.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)
        final_param_dict = {}
        for k, v in param_dict.items():
            final_param_dict['gpt2.gpt2.' + k] = v
        model = Model(gpt2_loss)
        columns_list = ["input_ids", "input_mask", "label_ids"]

        # tie the final linear layer to the gpt2 token embedding table
        final_param_dict['gpt2.dense1.weight'] = param_dict['gpt2_embedding_lookup.embedding_table']
        load_param_into_net(gpt2_loss, final_param_dict)

        print("============= Testing LAMBADA ACC =============")
        cnt = 0
        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, label_ids = input_data

            print("=========== LAMBADA Accuracy Test iteration:{} ==========".format(cnt))
            print("input_ids_shape: {}".format(input_ids.shape))    # (batch_size, seq_len)
            print("input_mask_shape: {}".format(input_mask.shape))
            print("label_ids_shape: {}".format(label_ids.shape))

            logits = model.predict(input_ids, input_mask)    # (batch_size, seq_len, vocab_size)
            print("=" * 40)

            output_str = generate_for_LAMBADA_numpy_topk(
                decoder=model,
                input_ids=input_ids,
                logits=logits,
                tokenizer=tokenizer,
                max_iterations=200,
                generate_length_dynamically=generate_length_Dynamically,
                stop_word_file="src/utils/pretrain-data/stopwords.txt")
            label_str = get_wholeword_label_str(input_ids=input_ids,
                                                config=gpt2_net_cfg,
                                                tokenizer=tokenizer)

            print("==============================================")
            print("output_str:", output_str)
            print("label_str:", label_str)
            callback.update(output_str, label_str)
            cnt += 1

        print("==============================================")
        eval_result_print(metric, callback)
        print("************** Testing Finished **************")

    elif metric.lower() == "ppl":
        print("Prepare to calculate the ppl score ...")
        # the ppl metric can be calculated from the loss, so the only difference is 'is_training'
        gpt2_loss = GPT2Lambada(config=gpt2_net_cfg,
                                is_training=False,
                                use_one_hot_embeddings=False)
        gpt2_loss.set_train(False)
        model = Model(gpt2_loss)
        param_dict = load_checkpoint(load_checkpoint_path)

        if eval_type == "zero-shot":
            final_param_dict = {}
            for k, v in param_dict.items():
                final_param_dict['gpt2.gpt2.' + k] = v
            # tie the final linear layer to the gpt2 token embedding table
            final_param_dict['gpt2.dense1.weight'] = param_dict['gpt2_embedding_lookup.embedding_table']
            load_param_into_net(gpt2_loss, final_param_dict)
            print("load pretrained parameter successfully!\n")
        elif eval_type == "finetuned":
            load_param_into_net(gpt2_loss, param_dict)
            print("load finetuned parameter successfully!\n")

        columns_list = ["input_ids", "input_mask", "label_ids"]
        num_data = 0
        total_loss = 0.0

        print("================= Testing LAMBADA PPL =================")
        for data in dataset.create_dict_iterator():
            print("=========== LAMBADA PPL Test iteration:{} ==========".format(num_data))
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, label_ids = input_data

            print("input_ids_shape: {}".format(input_ids.shape))
            print("input_mask_shape: {}".format(input_mask.shape))
            print("label_ids_shape: {}".format(label_ids.shape))

            logits = model.predict(input_ids, input_mask)    # (batch_size, seq_len, vocab_size)

            # [(left_pos, right_pos)] of the last word inside input_ids
            last_word_range_ = get_lastword_range(input_ids=input_ids,
                                                  config=gpt2_net_cfg,
                                                  tokenizer=tokenizer)
            last_word_range = (last_word_range_[0][0] + 1, last_word_range_[0][1] + 1)
            last_word_logits_start_pos = last_word_range[0] - 1
            last_word_logits_end_pos = last_word_range[1] - 1
            last_word_token_len = last_word_range[1] - last_word_range[0]
            print(" | Last word token length:", last_word_token_len)

            # (batch_size=1, x=last word token num)
            label_ids = extract_last_word_input_ids(input_ids=input_ids,
                                                    seq_pos=last_word_range)
            gold_logits = logits[::, last_word_logits_start_pos:last_word_logits_end_pos:1, ::]

            label_ids = P.Reshape()(label_ids, (-1,))    # (x,)
            print("label ids: ", label_ids)
            gold_logits = P.Reshape()(gold_logits, (-1, gpt2_net_cfg.vocab_size))

            label_word_ids = label_ids.asnumpy().tolist()
            label_word = tokenizer.decode(label_word_ids)
            print("label word: ", label_word)

            cross_entropy = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
            loss = cross_entropy(gold_logits, label_ids)
            loss = loss.asnumpy()
            print(" | Loss: {:.6f}".format(float(loss)))

            num_data += 1
            total_loss += loss
            avg_loss = total_loss / num_data
            print(" | Current AVG loss:", avg_loss)
            print(" | Current AVG ppl:", math.exp(avg_loss))

        ppl = math.exp(avg_loss)
        print("-----------------------------------------")
        print(" PPL: {:.6f}".format(ppl))
        print("************** Testing Finished **************")

    else:
        raise ValueError("metric method not supported, support: [accuracy, ppl]")
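

def ppl_from_logits(gold_logits, label_ids):
    """Illustrative numpy sketch (not part of the original evaluation flow) of the perplexity
    math used in the "ppl" branch above: PPL = exp(mean token-level cross-entropy) over the
    gold last-word tokens. The helper name and the numpy-only implementation are assumptions.

    Args:
        gold_logits: numpy array of shape (num_last_word_tokens, vocab_size).
        label_ids: numpy int array of shape (num_last_word_tokens,) holding the gold token ids.
    """
    import numpy as np
    # numerically stable log-softmax over the vocabulary dimension
    shifted = gold_logits - gold_logits.max(axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    # negative log-likelihood of each gold token; perplexity is exp of the mean
    nll = -log_probs[np.arange(label_ids.shape[0]), label_ids]
    return float(np.exp(nll.mean()))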