Example #1
# Imports needed by this example; argparse and logging are standard library and
# FileWriter comes from MindSpore. Tokenizer, create_instance and
# write_instance_to_file are assumed to be provided by the repository's own modules.
import argparse
import logging

from mindspore.mindrecord import FileWriter


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True, help="Input raw text file.")
    parser.add_argument("--output_file", type=str, required=True, help="Output MindRecord file.")
    parser.add_argument("--num_splits", type=int, default=1,
                        help="Number of partitions the MindRecord file will be split into.")
    parser.add_argument("--max_length", type=int, required=True, help="Maximum sequence length.")
    parser.add_argument("--vocab_file", type=str, required=True, help="Path to gpt2-vocab.json.")
    parser.add_argument("--merge_file", type=str, required=True, help="Path to gpt2-merges.txt.")
    args = parser.parse_args()

    tokenizer = Tokenizer(vocab_file=args.vocab_file, merge_file=args.merge_file)

    input_file = args.input_file
    logging.info("***** Reading from input files *****")
    logging.info("Input File: %s", input_file)

    output_file = args.output_file
    logging.info("***** Writing to output files *****")
    logging.info("Output File: %s", output_file)

    writer = FileWriter(output_file, args.num_splits)
    data_schema = {"input_ids": {"type": "int64", "shape": [-1]},
                   "input_mask": {"type": "int64", "shape": [-1]},
                   "input_length": {"type": "int64", "shape": [-1]},
                   }
    writer.add_schema(data_schema, "lambada-schema")

    total_written = 0
    total_read = 0

    logging.info("***** Reading from  %s *****", input_file)
    with open(input_file, "r") as f:
        while True:
            line = f.readline()
            if not line:
                break
            total_read += 1
            if total_read % 500 == 0:
                logging.info("%d ...", total_read)

            # Tokenize the line, build the feature dict, and append it to the
            # MindRecord file.
            output = create_instance(tokenizer, line, args.max_length)
            features = write_instance_to_file(writer, instance=output)
            total_written += 1

            if total_written <= 20:
                logging.info("***** Example *****")
                logging.info("input tokens: %s", tokenizer.decode(output["input_ids"][:-1]))
                logging.info("label tokens: %s", tokenizer.decode(output["input_ids"][1:]))

                for feature_name in features.keys():
                    feature = features[feature_name]
                    logging.info("%s: %s", feature_name, feature)

    writer.commit()
    logging.info("Wrote %d total instances", total_written)
Example #2
def clip_article(input_path, out_path, hint, max_length):
    """
    Clip the article portion of any sample (article + summary) whose tokenized
    length exceeds max_length, so that the clipped sample fits within the limit.
    """
    tokenizer = Tokenizer()
    cnt = 0
    with open(input_path, "r") as r, open(out_path, "a+") as a:
        line = r.readline()
        while line:
            # Split each line into article and summary at the last occurrence
            # of the hint marker (e.g. a "TL;DR:" string).
            pos = line.rfind(hint)
            article = line[:pos]
            summary = line[pos:]
            # If the whole sample is too long, keep only as many article
            # tokens as the budget allows after accounting for the summary.
            if len(tokenizer.encode(line)) > max_length:
                l_article = tokenizer.encode(
                    article)[:max_length - len(tokenizer.encode(summary))]
                article = tokenizer.decode(l_article) + " "
            if cnt % 1000 == 0:
                print(article + summary)
                print("==============================")
            cnt += 1
            a.write(article + summary)
            line = r.readline()
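
A hypothetical invocation of clip_article is sketched below; the file paths, the "TL;DR:" hint and the length budget are placeholders chosen for illustration, not values taken from the repository.

# Hypothetical usage of clip_article; paths, hint string and max_length are
# illustrative placeholders (1024 matches the GPT-2 context length).
if __name__ == "__main__":
    clip_article(input_path="summarization_train_raw.txt",
                 out_path="summarization_train_clipped.txt",
                 hint="TL;DR:",
                 max_length=1024)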
Example #3
def do_eval(dataset=None,
            network=None,
            metric=None,
            load_checkpoint_path="",
            eval_type=None,
            generate_length_dynamically=True):
    """
    Do evaluation on the LAMBADA task.

    Args:
        dataset: the evaluation dataset.
        network: the network with loss.
        metric: the evaluation method, either "accuracy" or "ppl".
        load_checkpoint_path: path to the saved fine-tuned model checkpoint.
        eval_type: "zero-shot" or "finetuned"; selects how the checkpoint is loaded.
        generate_length_dynamically: whether to choose the generation length dynamically.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Fine-tuned model is missing: the evaluation task must load a fine-tuned checkpoint!")

    tokenizer = Tokenizer(
        vocab_file='./src/utils/pretrain-data/gpt2-vocab.json',
        merge_file='./src/utils/pretrain-data/gpt2-merges.txt')

    if metric.lower() == "accuracy":
        print("Prepare to calculate the accuracy score ...")
        callback = LastWordAccuracy(smooth=False)
        gpt2_loss = network(config=gpt2_net_cfg,
                            is_training=False,
                            use_one_hot_embeddings=False)

        gpt2_loss.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)
        # Re-key the pretrained parameters with the prefix expected by the
        # evaluation network.
        final_param_dict = {}
        for k, v in param_dict.items():
            final_param_dict['gpt2.gpt2.' + k] = v

        model = Model(gpt2_loss)
        columns_list = ["input_ids", "input_mask", "label_ids"]
        # Tie the final dense layer's weight to the GPT-2 token embedding table.
        final_param_dict['gpt2.dense1.weight'] = param_dict[
            'gpt2_embedding_lookup.embedding_table']
        load_param_into_net(gpt2_loss, final_param_dict)
        print("============= Testing LAMBADA ACC =============")
        cnt = 0

        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, label_ids = input_data
            print("=========== LAMBADA Accuracy Test iteration:{}==========".
                  format(cnt))
            print("input_ids_shape: {}".format(
                input_ids.shape))  # (batch_size,seq_len)
            print("input_mask_shape: {}".format(input_mask.shape))
            print("label_ids_shape: {}".format(label_ids.shape))

            logits = model.predict(
                input_ids, input_mask)  # (batch_size,seq_len,vocab_size)
            print("=" * 40)

            output_str = generate_for_LAMBADA_numpy_topk(
                decoder=model,
                input_ids=input_ids,
                logits=logits,
                tokenizer=tokenizer,
                max_iterations=200,
                generate_length_dynamically=generate_length_dynamically,
                stop_word_file="src/utils/pretrain-data/stopwords.txt")

            label_str = get_wholeword_label_str(input_ids=input_ids,
                                                config=gpt2_net_cfg,
                                                tokenizer=tokenizer)
            # print("logits shape: {}".format(logits.shape))
            # print("logits: \n{}".format(logits))
            # print("===================================")
            print("==============================================")
            print("output_str:", output_str)
            print("label_str", label_str)
            callback.update(output_str, label_str)
            # callback.update(logits, label_ids)
            cnt += 1
        print("==============================================")
        eval_result_print(metric, callback)
        print("************** Testing Finished **************")

    elif metric.lower() == "ppl":
        print("Prepare to calculate the ppl score ...")
        # ppl is computed directly from the loss, so the only change from training is is_training=False
        gpt2_loss = GPT2Lambada(config=gpt2_net_cfg,
                                is_training=False,
                                use_one_hot_embeddings=False)
        gpt2_loss.set_train(False)
        model = Model(gpt2_loss)

        param_dict = load_checkpoint(load_checkpoint_path)

        if eval_type == "zero-shot":
            # Re-key the pretrained parameters with the prefix expected by the
            # evaluation network, then tie the final dense layer's weight to
            # the GPT-2 token embedding table.
            final_param_dict = {}
            for k, v in param_dict.items():
                final_param_dict['gpt2.gpt2.' + k] = v
            final_param_dict['gpt2.dense1.weight'] = param_dict[
                'gpt2_embedding_lookup.embedding_table']
            load_param_into_net(gpt2_loss, final_param_dict)
            print("Loaded pretrained parameters successfully!\n")

        elif eval_type == "finetuned":
            load_param_into_net(gpt2_loss, param_dict)
            print("load finetuned parameter successfully!\n")

        columns_list = ["input_ids", "input_mask", "label_ids"]

        num_data = 0
        total_ppl = 0.0
        total_loss = 0.0
        print("================= Testing LAMBADA PPL =================")
        for data in dataset.create_dict_iterator():
            print("=========== LAMBADA PPL Test iteration:{}==========".format(
                num_data))
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, label_ids = input_data
            print("input_ids_shape: {}".format(input_ids.shape))
            print("input_mask_shape: {}".format(input_mask.shape))
            print("label_ids_shape: {}".format(label_ids.shape))

            logits = model.predict(
                input_ids, input_mask)  # (batch_size,seq_len,vocab_size)
            # print("*"*30)
            last_word_range_ = get_lastword_range(
                input_ids=input_ids, config=gpt2_net_cfg,
                tokenizer=tokenizer)  #[(left_pos,right_pos)]
            last_word_range = (last_word_range_[0][0] + 1,
                               last_word_range_[0][1] + 1)
            last_word_logits_start_pos = last_word_range[0] - 1
            last_word_logits_end_pos = last_word_range[1] - 1
            last_word_token_len = last_word_range[1] - last_word_range[0]

            # print(" | last word range:", last_word_range)
            print(" | Last word token length:", last_word_token_len)

            # print(last_word_ids)
            # last_word_ids = P.Reshape()(last_word_ids,(-1,)).asnumpy().tolist()
            # print(last_word_ids)

            label_ids = extract_last_word_input_ids(
                input_ids=input_ids,
                seq_pos=last_word_range)  #(batch_size=1,x=lastword token num)

            gold_logits = logits[::, last_word_logits_start_pos:
                                 last_word_logits_end_pos:1, ::]

            # Flatten labels and logits so that the loss is computed only over
            # the last-word token positions.
            label_ids = P.Reshape()(label_ids, (-1,))  # (x,)
            print("label ids: ", label_ids)

            gold_logits = P.Reshape()(gold_logits,
                                      (-1, gpt2_net_cfg.vocab_size))
            label_word_ids = label_ids.asnumpy().tolist()
            label_word = tokenizer.decode(label_word_ids)
            print("label word: ", label_word)

            # Mean cross entropy over the last-word token positions.
            cross_entropy = SoftmaxCrossEntropyWithLogits(sparse=True,
                                                          reduction='mean')
            loss = cross_entropy(gold_logits, label_ids)

            loss = loss.asnumpy()
            print(" | Loss: {:.6f}".format(float(loss)))

            num_data += 1
            total_loss += loss
            avg_loss = total_loss / num_data

            print(" | Current AVG loss:", avg_loss)
            print(" | Current AVG ppl:", math.exp(avg_loss))

        ppl = math.exp(avg_loss)
        print("-----------------------------------------")
        print(" PPL: {:.6f}".format(ppl))
        print("************** Testing Finished **************")
    else:
        raise ValueError(
            "Unsupported metric method; supported metrics: [accuracy, ppl]")