Example #1
def __init__(self):
    # Load the custom word-segmentation dictionaries
    dicts.init()
    self.data_process = DataPressing()
    # Stop words
    self.stop_words = load_stop_words()
    # Stock name / stock code pairs; the stock codes also get some transformations, e.g.
    _, self.stocks_df = dicts.load_stock_data()
    self.tokenizer = Tokenizer(self.data_process, self.stop_words)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True, help='Input raw text file. ')
    parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file. ')
    parser.add_argument("--num_splits", type=int, default=1,
                        help='The MindRecord file will be split into the number of partition. ')
    parser.add_argument("--max_length", type=int, required=True, help='Maximum sequence length. ')
    parser.add_argument("--vocab_file", type=str, required=True, default='', help='url of gpt2-vocab.json ')
    parser.add_argument("--merge_file", type=str, required=True, default='', help='url of gpt2-merges.txt ')
    args = parser.parse_args()

    tokenizer = Tokenizer(vocab_file=args.vocab_file, merge_file=args.merge_file)

    input_file = args.input_file
    logging.info("***** Reading from input files *****")
    logging.info("Input File: %s", input_file)

    output_file = args.output_file
    logging.info("***** Writing to output files *****")
    logging.info("Output File: %s", output_file)

    writer = FileWriter(output_file, args.num_splits)
    data_schema = {"input_ids": {"type": "int64", "shape": [-1]},
                   "input_mask": {"type": "int64", "shape": [-1]},
                   "input_length": {"type": "int64", "shape": [-1]},
                   }
    writer.add_schema(data_schema, "lambada-schema")

    total_written = 0
    total_read = 0

    logging.info("***** Reading from  %s *****", input_file)
    with open(input_file, "r") as f:
        while True:
            line = f.readline()
            if not line:
                break
            total_read += 1
            if total_read % 500 == 0:
                logging.info("%d ...", total_read)

            output = create_instance(tokenizer, line, args.max_length)
            features = write_instance_to_file(writer, instance=output)
            total_written += 1

            if total_written <= 20:
                logging.info("***** Example *****")
                logging.info("input tokens: %s", tokenizer.decode(output["input_ids"][:-1]))
                logging.info("label tokens: %s", tokenizer.decode(output["input_ids"][1:]))

                for feature_name in features.keys():
                    feature = features[feature_name]
                    logging.info("%s: %s", feature_name, feature)

    writer.commit()
    logging.info("Wrote %d total instances", total_written)
def clip_article(input_path, out_path, hint, max_length):
    """
    Clip articles whose sample (article + summary) exceeds max_length.
    """
    tokenizer = Tokenizer()
    cnt = 0
    with open(input_path, "r") as r, open(out_path, "a+") as a:
        line = r.readline()
        while line:
            pos = line.rfind(hint)
            article = line[:pos]
            summary = line[pos:]
            if len(tokenizer.encode(line)) > max_length:
                l_article = tokenizer.encode(
                    article)[:max_length - len(tokenizer.encode(summary))]
                article = tokenizer.decode(l_article) + " "
            if cnt % 1000 == 0:
                print(article + summary)
                print("==============================")
            cnt += 1
            a.write(article + summary)
            line = r.readline()
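
A hypothetical call to clip_article; the file paths are placeholders, and the "TL;DR:" hint matches the tldr_str default used by the summarization evaluation below:

clip_article(input_path="data/cnn_dailymail.txt",        # placeholder path
             out_path="data/cnn_dailymail_clipped.txt",  # placeholder path
             hint="TL;DR:",
             max_length=1024)                            # GPT-2 context length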


def list2tensor(lst, dtype=mstype.float32):
    return Tensor(np.array(lst), dtype=dtype)
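
A quick sanity check for list2tensor; this assumes MindSpore is installed and that mstype is mindspore.common.dtype, as the signature suggests:

import numpy as np
import mindspore.common.dtype as mstype
from mindspore import Tensor

ids = list2tensor([[15496, 995]], dtype=mstype.int32)  # 1x2 int32 Tensor of token ids
probs = list2tensor([0.1, 0.9])                        # defaults to float32
print(ids.shape, probs.dtype)                          # (1, 2) Float32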


if __name__ == '__main__':
    print('*'*65)
    print('We are now in testing mode for GPT2 Interactive Generation Demo')
    print('*'*65)
    print('Set Running Env and Load Model')
    gpt2, config = set_env(mode="GPU", device_id=2)
    

    tokenizer = Tokenizer(vocab_file='./src/utils/pretrain-data/gpt2-vocab.json',
                          merge_file='./src/utils/pretrain-data/gpt2-merges.txt')

    generate_length = 10
    sample = Sample(gpt2, generate_length=generate_length, tokenizer=tokenizer,
                    model_config=config, topk_num=100, topp_prob=0.85,
                    min_tokens_to_keep=1, temperature=0.85, demo_mode=True)
    beam_search = BeamSearch(decoder=gpt2, model_config=config, tokenizer=tokenizer, beam_size=3)
    
    official_unicorn_demo = "In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English."

    """
    In the sparse and frozen wilds of Alaska, it is not often that a tasty meal jumps almost straight into your mouth. But that was the case for one bear while hunting salmon in the\xa0Brooks River, which runs through the Katmai National Park, in southern Alaska. However, the dozy creature was unable to take advantage of his good fortune, letting the juicy fish slip away, even after it hit him on the nose. Scroll down for video . Fish supper: A bear hunting salmon in Alaska eyes up his dinner as a fish leaps straight at him while swimming up stream in order to reach its breeding grounds . Staring at defeat: This salmon's number appears to be up as it comes face to face with a hungry bear along the\xa0Brooks River in the\xa0Katmai National Park, Alaska . As close as it gets: As the two creatures come face to face, it looks as if the bear is about to enjoy the most hassle-free meal of its life . Sockeye salmon, which are native to Alaska, migrate up rivers during the spring in order to reach the breeding grounds where they were born in order to spawn. The fish, which spend the rest of the year out in the ocean, will swim against the current in order to reach the spawning grounds, leaping through waterfalls, which is where the bears wait. While the salmon are very fast and difficult to catch underwater, after they leap into the air they have no way of changing course, and so a relatively easy to pick out of the air. Husband and wife photography team Juergen and Christine Sohns captured the moment the bear let his prey get away. The salmon will not eat during their battle upstream, and will undergo a huge transformation, changing from grey to bright red, with their lower lip extending and their head turning green. Swing and a miss: However, nothing is a simple as it seems, and at the very last moment the bear makes a crucial error of judgement, and the Sockeye salmon is allowed to continue its journey . Second time unlucky: Photographer\xa0Juergen Sohns explained that once the fish are in the air they cannot change direction, which should make them easy to catch, but not for this bear, as another fish slips away . Once they reach the breeding grounds, usually a freshwater lake, they will mate, before perishing shortly afterwards. These images were captured by Juergen and Christine Sohns, who travelled to Alaska to photograph the salmon migration. Mr Sohns, 56, took the photo and said: 'The bear was just waiting at the best position in the falls to catch the fish when it was leaping.' He said that, while the bear was unlucky on this occasion, he did have more success after moving further up the river. Mr Sohns and his wife, from Germany, are veteran wildlife photographers, and over the last 20 years have travelled to every continent on Earth photographing wildlife. Fish season: Sockeye salmon spend most of the year out at sea, but during spring they attempt to swim back up rivers to breed, making them easy targets for bears and eagles3020099 . <|endoftext|>
    """

    """
["Sunderland are trailing former Wigan striker Franco di Santo. The Argentine is playing for Werder Bremen and has scored 13 goals in 22 games this season. The Bundesliga side want £8million for the 26-year-old who Sunderland sporting director Lee Congerton knows well from his time at Chelsea. Sunderland are considering a summer move for in-form Werder Bremen forward Franco Di Santo . The Argentine has been in superb form for his club this season, netting 13 goals in 22 games . Di Santo began his senior career with Chilean side Audax Italiano in 2006, before catching the Blues' eye two years later. However, he failed to make an impact at Stamford Bridge and following a similarly ineffectual loan spell at Blackburn Rovers, was sold to Wigan in 2010. He spent three seasons with the Lancashire-based outfit, scoring 13 goals in 97 appearances. Di Santo was an unused substitute during the club's FA Cup final victory over Manchester City in 2013, before being released at the end of that season. He made the move to the Bundesliga in August 2013 and has appeared to fulfil some of his early promise. Di Santo previously played for Chelsea but struggled to make an impact at Stamford Bridge and was sold . Di Santo played for Wigan for three seasons, scoring\xa013 goals in 97 appearances before being released. TL;DR: <|endoftext|>"]
[DEBUG INFO] len_str:[287]
    """
def do_eval(dataset=None, network=None, metric=None, load_checkpoint_path="",
            eval_load_param_mode="zero-shot", generation_config_path="", tokenizer_file=""):
    """
    Do evaluation on summarization
    Args:
        dataset: the eval dataset.
        network:  the network with loss.
        metric: the evaluation method.
        load_checkpoint_path: the file path which saved finetune model checkpoint.
    """
    if load_checkpoint_path == "":
        raise ValueError("Finetune model missed, evaluation task must load finetune model!")
    if metric.lower() == "rouge":
        print("Prepare to calculate the Rouge score ...")
        callback = Rouge()
        
        # initialize network and load params
        gpt2_loss = network(config=gpt2_net_cfg,
                            is_training=False,
                            use_one_hot_embeddings=False)
        gpt2_loss.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)
       
        # get the reorganized param_dict and load params into the network
        reorganized_param_dict = modify_paramdict(param_dict, mode=eval_load_param_mode, model_prefix="gpt2.")
        load_param_into_net(gpt2_loss, reorganized_param_dict)


        # load nn.Cell into Model and initialize tokenizer and Sample
        model = Model(gpt2_loss)
        tokenizer = Tokenizer(vocab_file=tokenizer_file + 'gpt2-vocab.json',
                              merge_file=tokenizer_file + 'gpt2-merges.txt')
        generate_config = GenerationConfig(file_path=generation_config_path)
        TL_DR = generate_config.get_arg("tldr") if generate_config.get_arg("tldr") is not None else True
        tldr_str = generate_config.get_arg("tldr_str") if generate_config.get_arg("tldr_str") is not None else "TL;DR:"
        #sample = Sample(model,tokenizer=tokenizer,model_config=gpt2_net_cfg,topk_num = topk,topp_prob=topp,
        #min_tokens_to_keep=1,demo_mode=False,temperature=temperature,append_eos=append_eos)

        # load data and run text generation
        columns_list = ["input_ids", "input_mask", "label_ids"]
        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, label_ids = input_data

            print("input_ids shape: {}".format(input_ids.shape))
            print("label_ids shape: {}".format(label_ids.shape))
            print("="*15," Summrization Testing ","="*15)
           
            hypo, ref = generate_for_CNN_DAILYMAIL(model, input_ids,
                                                   select_sentence=3,
                                                   TL_DR=TL_DR,
                                                   tldr_str=tldr_str,
                                                   tokenizer=tokenizer,
                                                   generate_config=generate_config)

            print("REF str:\n ",ref,"\nHYPO str:\n",hypo,"\n")

            for i in range(gpt2_net_cfg.batch_size):
                hypo[i] = clean_hypo(hypo[i])
            
            for i in range(gpt2_net_cfg.batch_size):
                hypo[i] = hypo[i].lower()
                ref[i] = ref[i].lower()
            
            callback.update(hypo, ref)

        print("="*35)
        eval_result_print(metric, callback)
        print("="*35)
        print("*"*15," Summrization Testing Finished","*"*15)
    
    else:
        raise ValueError("metric method not supported in summarization, support: [Rouge]")
Example #6
def do_eval(dataset=None,
            network=None,
            metric=None,
            load_checkpoint_path="",
            eval_type=None,
            tokenizer_file_path="",
            generate_length=1,
            top_k=1,
            top_p=1.0,
            temperature=1.0):
    """
    Do evaluation on Translation
    Args:
        dataset: the eval dataset.
        network:  the network with loss.
        metric: the evaluation method.
        load_checkpoint_path: the file path which saved finetune model checkpoint.

    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")
    if metric.lower() == "bleu":
        print("Prepare to calculate the BLEU score ...")

        gpt2_translation = network(config=gpt2_net_cfg,
                                   is_training=False,
                                   use_one_hot_embeddings=False)
        gpt2_translation.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)

        if eval_type == "zero-shot":
            final_param_dict = {}
            for name, _ in param_dict.items():
                final_param_dict['gpt2.' + name] = param_dict[name]
            final_param_dict['dense1.weight'] = param_dict[
                'gpt2_embedding_lookup.embedding_table']
            load_param_into_net(gpt2_translation, final_param_dict)
            print("load pretrained parameter successfully!\n")
        elif eval_type == "finetuned":
            load_param_into_net(gpt2_translation, param_dict)
            print("load finetuned parameter successfully!\n")
        else:
            raise ValueError(
                "Evaluation type missed, eval_type should be [zero-shot, finetuned]"
            )

        model = Model(gpt2_translation)
        tokenizer = Tokenizer(
            vocab_file=tokenizer_file_path + 'gpt2-vocab.json',
            merge_file=tokenizer_file_path + 'gpt2-merges.txt')
        callback = BLEU(tokenizer)
        translation_generator = GenerateForTranslation(
            decoder=model,
            config=gpt2_net_cfg,
            tokenizer=tokenizer,
            generate_length=1,
            use_hint=True,
            select_first_sentence=True,
            topk_num=top_k,
            topp_prob=float(top_p),
            temperature=float(temperature))

        columns_list = ["input_ids", "input_mask", "label_ids"]
        print("==================== [BLEU] Testing ====================")
        num_data = 1
        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, label_ids = input_data

            print("| Data count: {}".format(num_data *
                                            gpt2_net_cfg.batch_size))
            print("input_ids shape: {}".format(input_ids.shape))
            print("input_mask shape: {}".format(input_mask.shape))
            print("label_ids shape: {}".format(label_ids.shape))

            ts_predict_list, ref_list = translation_generator.generate_for_translation(
                input_ids)
            print("| Batch Reference translation:\n{}\n".format(ref_list))
            if ref_list == '' or ref_list is None:
                print("Sorry ref_list is None, skip it!")
                continue
            else:
                print(" | Batch Predict translation:\n{}\n".format(
                    ts_predict_list))
                callback.update(ref_list, ts_predict_list)
                num_data += 1
                print("\n\n")

        print("**************************************************************")
        eval_result_print(metric, callback)
        print("********************** Testing Finished **********************")
    else:
        raise ValueError(
            "metric method not supported in translation, support: [BLEU]")
Example #7
def do_eval(dataset=None, network=None, metric=None, load_checkpoint_path="", eval_type=None, tokenizer_file_path="",
            generate_length=1, top_k=1, top_p=1.0, temperature=1.0):
    """
    Do evaluation on Reading Comprehension
    Args:
        dataset: the eval dataset.
        network:  the network with loss.
        metric: the evaluation method.
        load_checkpoint_path: the file path which saved finetune model checkpoint.

    """
    if load_checkpoint_path == "":
        raise ValueError("Finetune model missed, evaluation task must load finetune model!")
    if metric.lower() == "f1":
        print("Prepare to calculate the F1 score ...")

        gpt2_rc = network(config=gpt2_net_cfg,
                          is_training=False,
                          use_one_hot_embeddings=False)
        gpt2_rc.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)

        if eval_type == "zero-shot":
            final_param_dict = {}
            for name, _ in param_dict.items():
                final_param_dict['gpt2.' + name] = param_dict[name]
            final_param_dict['dense1.weight'] = param_dict['gpt2_embedding_lookup.embedding_table']
            load_param_into_net(gpt2_rc, final_param_dict)
            print("load pretrained parameter successfully!\n")
        elif eval_type == "finetuned":
            load_param_into_net(gpt2_rc, param_dict)
            print("load finetuned parameter successfully!\n")
        else:
            raise ValueError("Evaluation type missed, eval_type should be [zero-shot, finetuned]")

        model = Model(gpt2_rc)
        tokenizer = Tokenizer(vocab_file=tokenizer_file_path + 'gpt2-vocab.json',
                              merge_file=tokenizer_file_path + 'gpt2-merges.txt')
        callback = F1()
        rc_generator = GenerateForReadComprehension(decoder=model,
                                                    config=gpt2_net_cfg,
                                                    tokenizer=tokenizer,
                                                    generate_length=generate_length,
                                                    topk_num=top_k,
                                                    topp_prob=float(top_p),
                                                    temperature=float(temperature)
                                                    )

        columns_list = ["input_ids", "input_mask", "label_ids"]
        print("==================== [F1] Testing ====================")
        num_data = 0
        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, _, label_ids = input_data

            print("input_ids shape: {}".format(input_ids.shape))
            print("label_ids shape: {}".format(label_ids.shape))

            passage, pred_answer, gold_answer = rc_generator.generate_for_read_comprehension(input_ids)

            for batch_id in range(gpt2_net_cfg.batch_size):
                print("============== [F1] {}  ================".format(num_data + 1))
                print(" | Passage:{}".format(passage[batch_id]))
                print(" | Gold_answer:{}".format(gold_answer[batch_id]))
                print(" | Pred_answer:{}".format(pred_answer[batch_id]))

                pred = callback.get_normalize_answer_token(pred_answer[batch_id])
                gold = callback.get_normalize_answer_token(gold_answer[batch_id])

                callback.update(pred, gold)
                num_data += 1

            average_f1_score = callback.f1_score / num_data
            print("==============  Evaluation  =================")
            print("|   Avg F1 Score:{:.8f}".format(average_f1_score))
            print("=============================================\n\n")

        print("********************** Testing Finished **********************")
    else:
        raise ValueError("metric method not supported in Reading Comprehension task, support: [F1]")


def load_stock_data():
    # stock_dict collects stock names (SESNAME) and codes (SYMBOL);
    # st_new_path is the CSV path defined elsewhere in the module.
    stock_dict = []
    stocks_df = pd.read_csv(st_new_path, encoding='utf-8')
    # stock_df.append(stocks_df.set_index('SESNAME'))
    for index, row in stocks_df.iterrows():
        stock_dict.append(row.SESNAME)
        stock_dict.append(row.SYMBOL)
    return stock_dict, stocks_df


_, stocks_df = load_stock_data()

# Identify the stock entities mentioned in comments:
# tokenize the discussion, then extract the stock entities from the comments.
data_process = DataPressing()
dict_init = dicts.init()
stop_words = load_stop_words()
tokenizer = Tokenizer(data_process, stop_words)


# Organize the stock codes
stocks_df = stocks_df.set_index('SESNAME')
# print('stocks_df %s' % stocks_df)


def cut_process(text):
    """
    Data processing: word segmentation and stock-entity extraction.
    :param text:
    :return:
    """
    # Word segmentation
    dicts.init()
def do_eval(dataset=None, network=None, metric=None, load_checkpoint_path=""):
    """
    Do evaluation on translation
    Args:
        dataset: the eval dataset.
        network:  the network with loss.
        metric: the evaluation method.
        load_checkpoint_path: the file path which saved finetune model checkpoint.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")
    if metric.lower() == "bleu":
        print("Prepare to calculate the BLEU score ...")

        gpt2_loss = network(config=gpt2_net_cfg,
                            is_training=False,
                            use_one_hot_embeddings=False)

        gpt2_loss.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)
        reorganized_param_dict = dict()
        for netName in param_dict:
            reorganized_param_dict['gpt2.' + netName] = param_dict[netName]
        reorganized_param_dict['lm_head.weight'] = param_dict[
            'gpt2_embedding_lookup.embedding_table']
        load_param_into_net(gpt2_loss, reorganized_param_dict)

        # for item in gpt2_loss.get_parameters():
        #     print('name: ', item.data.name)
        model = Model(gpt2_loss)
        tokenizer = Tokenizer(
            vocab_file='./src/utils/pretrain-data/gpt2-vocab.json',
            merge_file='./src/utils/pretrain-data/gpt2-merges.txt')
        callback = BLEU(tokenizer)
        sample = Sample(model,
                        tokenizer=tokenizer,
                        model_config=gpt2_net_cfg,
                        topk_num=0,
                        topp_prob=0.92,
                        min_tokens_to_keep=1,
                        demo_mode=False,
                        early_stop=True)
        columns_list = ["input_ids", "input_mask", "label_ids"]
        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, label_ids = input_data

            print("input_ids shape: {}".format(input_ids.shape))
            print("label_ids shape: {}".format(label_ids.shape))
            print("============= Translation Testing =============")

            # input_str, ref_str = sample.extract_string_from_tensor(input_ids, mode="pair")
            hypo, ref = sample.generate_for_Translation(
                input_ids, max_generate_length=150)
            print("REF str:\n ", ref, "\nHYPO str:\n", hypo, "\n")
            #print("LENGTH: ",len(ref[1]),"   and   ",len(hypo[1]),"\n")
            callback.update(ref, hypo)
        print("==============================================")
        eval_result_print(metric, callback)
        print("==============================================")
        print("************** Translation Testing Finished **************")

    else:
        raise ValueError(
            "metric method not supported in translation, support: [BLEU]")
Example #10
# query = tokenization('/Users/yiiyuanliu/Desktop/nlp/demo/articles/关于降压药的五个问题.txt')
# query_bow = dictionary.doc2bow(query)
# print query_bow
#
#
# # Text similarity calculation
# # Based on the accumulated events, first compute word vectors or tf-idf values for all
# # events, then compute the similarity between newly arrived events and the recent ones
# lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=2)

if __name__ == '__main__':
    import dicts
    data_processing = data_process.DataPressing()
    dict_init = dicts.init()
    stop_words = load_stop_words()
    t = Tokenizer(data_processing, stop_words)
    stock_dict = dicts.stock_dict
    print(["大智慧".decode("utf8")])
    a = ["大智慧".decode("utf8")]
    print(len(a[0]))
    # print(["【今日题材】".decode("utf8")])

    # file = open('file_name.txt', 'w')
    # file.write(str(raw_documents))
    # file.close()

    # Remove noise words
    print(
        data_processing.no_remove(
            "【今日题材】[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的"))
Example #11
class DiscussParser(object):
    """
    讨论解析器
    """
    def __init__(self):
        # 加载分词自定义词典
        dicts.init()
        self.data_process = DataPressing()
        # 停用词
        self.stop_words = load_stop_words()
        # 股票-股票代码对, 并且对股票代码做一些变换,比如
        _, self.stocks_df = dicts.load_stock_data()
        self.tokenizer = Tokenizer(self.data_process, self.stop_words)

    def __cut_process(self, text):
        """
        Data processing: word segmentation and stock-entity extraction.
        :param text:
        :return:
        """
        print('cut_process PID: %s   parent PID: %s' % (os.getpid(), os.getppid()))
        # Word segmentation.
        # The DataFrame is handled by multiple processes, so the class is declared inside each
        # process; otherwise the child processes cannot re-initialize the dictionaries when
        # calling token.
        text_list = self.tokenizer.token(text)
        # print("text_list %s" % text_list)
        # Extract the stock entities mentioned in text and convert them to stock codes
        stock_list = self.data_process.find_stocks(text_list, self.stocks_df)
        # stock_list = ','.join(stock_list)  # for display
        return stock_list

    def tmp_func(self, tmp_df, column="text"):
        """
        apply函数封装
        :param column: 需要处理的列名
        :param tmp_df:
        :return:
        """
        print('tmp_func进程: %sd   父进程ID:%s' % (os.getpid(), os.getppid()))
        tmp_df['stock_list'] = tmp_df[column].apply(self.__cut_process)
        return tmp_df

    @staticmethod
    def __apply_parallel(df_grouped, func):
        """
        Process the DataFrame with multiple processes.
        :param df_grouped:
        :param func:
        :return:
        """
        print('apply_parallel PID: %s   parent PID: %s' %
              (os.getpid(), os.getppid()))
        num_cpu = multiprocessing.cpu_count()

        # With no extra Parallel arguments the program runs multi-process, but the dictionaries are not loaded:
        # res_list = Parallel(n_jobs=num_cpu - 2)(delayed(func)(group) for name, group in df_grouped)
        # With only the prefer argument it is still a single process:
        # res_list = Parallel(n_jobs=num_cpu - 2, prefer="threads")(delayed(func)(group) for name, group in df_grouped)
        # With only the backend argument the dictionaries load successfully:
        # res_list = Parallel(n_jobs=num_cpu - 2, backend="multiprocessing")(delayed(func)(group) for name, group in df_grouped)
        # With both arguments set the dictionaries load and the runtime is slightly shorter:
        res_list = Parallel(n_jobs=(num_cpu - 2),
                            backend="multiprocessing",
                            prefer="threads")(delayed(func)(group)
                                              for name, group in df_grouped)
        return pd.concat(res_list)

    def run(self, target_df):
        """
        Main entry point for the multiprocess pipeline.
        :param target_df:
        :return:
        """
        # print('run PID: %s   parent PID: %s' % (os.getpid(), os.getppid()))
        # Group the input data by index
        df_grouped = target_df.groupby(target_df.index)
        res_df = self.__apply_parallel(df_grouped, self.tmp_func)
        return res_df
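
A hypothetical end-to-end run of DiscussParser on a toy DataFrame; the "text" column matches the tmp_func default, and the row contents are placeholders. The __main__ guard matters because __apply_parallel spawns worker processes:

import pandas as pd

if __name__ == '__main__':
    parser = DiscussParser()
    toy_df = pd.DataFrame({"text": ["placeholder discussion 1",
                                    "placeholder discussion 2"]})
    res_df = parser.run(toy_df)  # adds a 'stock_list' column to each row
    print(res_df[["text", "stock_list"]])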
Example #12
def do_eval(dataset=None,
            network=None,
            metric=None,
            load_checkpoint_path="",
            eval_type=None,
            generate_length_Dynamically=True):
    """
    Do eval
    Args:
        dataset: the eval dataset.
        network:  the network with loss.
        metric: the evaluation method.
        load_checkpoint_path: the file path which saved finetune model checkpoint.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")

    tokenizer = Tokenizer(
        vocab_file='./src/utils/pretrain-data/gpt2-vocab.json',
        merge_file='./src/utils/pretrain-data/gpt2-merges.txt')

    if metric.lower() == "accuracy":
        print("Prepare to calculate the accuracy score ...")
        # callback = Accuracy()
        # callback = LastWordAccuracy()
        # callback = LastTokenAccuracy()
        callback = LastWordAccuracy(smooth=False)
        gpt2_loss = network(config=gpt2_net_cfg,
                            is_training=False,
                            use_one_hot_embeddings=False)

        gpt2_loss.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)
        final_param_dict = {}
        for k, v in param_dict.items():
            final_param_dict['gpt2.gpt2.' + k] = param_dict[k]

        model = Model(gpt2_loss)
        columns_list = ["input_ids", "input_mask", "label_ids"]
        # set the weights of final linear weights to weights of gpt2 token embedding
        final_param_dict['gpt2.dense1.weight'] = param_dict[
            'gpt2_embedding_lookup.embedding_table']
        load_param_into_net(gpt2_loss, final_param_dict)
        print("============= Testing LAMBADA ACC =============")
        cnt = 0

        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, label_ids = input_data
            print("=========== LAMBADA Accuracy Test iteration:{}==========".
                  format(cnt))
            # input_ids = Tensor(input_ids, mindspore.int32)
            # input_mask = Tensor(input_mask, mindspore.int32)
            # label_ids = Tensor(label_ids, mindspore.int32)
            print("input_ids_shape: {}".format(
                input_ids.shape))  # (batch_size,seq_len)
            print("input_mask_shape: {}".format(input_mask.shape))
            print("label_ids_shape: {}".format(label_ids.shape))

            logits = model.predict(
                input_ids, input_mask)  # (batch_size,seq_len,vocab_size)
            print("=" * 40)

            output_str = generate_for_LAMBADA_numpy_topk(
                decoder=model,
                input_ids=input_ids,
                logits=logits,
                tokenizer=tokenizer,
                max_iterations=200,
                generate_length_dynamically=generate_length_Dynamically,
                stop_word_file="src/utils/pretrain-data/stopwords.txt")

            label_str = get_wholeword_label_str(input_ids=input_ids,
                                                config=gpt2_net_cfg,
                                                tokenizer=tokenizer)
            # print("logits shape: {}".format(logits.shape))
            # print("logits: \n{}".format(logits))
            # print("===================================")
            print("==============================================")
            print("output_str:", output_str)
            print("label_str", label_str)
            callback.update(output_str, label_str)
            # callback.update(logits, label_ids)
            cnt += 1
        print("==============================================")
        eval_result_print(metric, callback)
        print("************** Testing Finished **************")

    elif metric.lower() == "ppl":
        print("Prepare to calculate the ppl score ...")
        # ppl metric can be calculated by using the loss, so the difference is 'is_training'
        gpt2_loss = GPT2Lambada(config=gpt2_net_cfg,
                                is_training=False,
                                use_one_hot_embeddings=False)
        gpt2_loss.set_train(False)
        model = Model(gpt2_loss)

        param_dict = load_checkpoint(load_checkpoint_path)

        if eval_type == "zero-shot":
            final_param_dict = {}
            for k, v in param_dict.items():
                final_param_dict['gpt2.gpt2.' + k] = param_dict[k]
            # set the weights of final linear weights to weights of gpt2 token embedding
            final_param_dict['gpt2.dense1.weight'] = param_dict[
                'gpt2_embedding_lookup.embedding_table']
            load_param_into_net(gpt2_loss, final_param_dict)
            print("load pretrained parameter successfully!\n")

        elif eval_type == "finetuned":
            load_param_into_net(gpt2_loss, param_dict)
            print("load finetuned parameter successfully!\n")

        columns_list = ["input_ids", "input_mask", "label_ids"]

        num_data = 0
        total_ppl = 0.0
        total_loss = 0.0
        print("================= Testing LAMBADA PPL =================")
        for data in dataset.create_dict_iterator():
            print("=========== LAMBADA PPL Test iteration:{}==========".format(
                num_data))
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, label_ids = input_data
            print("input_ids_shape: {}".format(input_ids.shape))
            print("input_mask_shape: {}".format(input_mask.shape))
            print("label_ids_shape: {}".format(label_ids.shape))

            logits = model.predict(
                input_ids, input_mask)  # (batch_size,seq_len,vocab_size)
            # print("*"*30)
            last_word_range_ = get_lastword_range(
                input_ids=input_ids, config=gpt2_net_cfg,
                tokenizer=tokenizer)  #[(left_pos,right_pos)]
            last_word_range = (last_word_range_[0][0] + 1,
                               last_word_range_[0][1] + 1)
            last_word_logits_start_pos = last_word_range[0] - 1
            last_word_logits_end_pos = last_word_range[1] - 1
            last_word_token_len = last_word_range[1] - last_word_range[0]

            # print(" | last word range:", last_word_range)
            print(" | Last word token length:", last_word_token_len)

            # print(last_word_ids)
            # last_word_ids = P.Reshape()(last_word_ids,(-1,)).asnumpy().tolist()
            # print(last_word_ids)

            label_ids = extract_last_word_input_ids(
                input_ids=input_ids,
                seq_pos=last_word_range)  #(batch_size=1,x=lastword token num)

            gold_logits = logits[::, last_word_logits_start_pos:
                                 last_word_logits_end_pos:1, ::]

            label_ids = P.Reshape()(label_ids, (-1, ))  # (x,)
            print("label ids: ", label_ids)
            # print("labels ids shape:",label_ids.shape)
            # print("gold logits shape:",gold_logits.shape)

            gold_logits = P.Reshape()(gold_logits,
                                      (-1, gpt2_net_cfg.vocab_size))
            # print("gold logits:",gold_logits)
            # print("gold logits shape :",gold_logits.shape)
            label_word_ids = label_ids.asnumpy().tolist()
            # generate_ids = np.argmax(gold_logits.asnumpy().tolist())
            # print(type(generate_ids))
            # generate_ids = generate_ids.tolist()
            # print(generate_ids)
            label_word = tokenizer.decode(label_word_ids)
            print("label word: ", label_word)
            # generate_word = tokenizer.decode([generate_ids])

            # print("generate word:", generate_word)

            cross_entropy = SoftmaxCrossEntropyWithLogits(sparse=True,
                                                          reduction='mean')
            loss = cross_entropy(gold_logits, label_ids)

            # loss = model.predict(input_ids, input_mask, label_ids)

            loss = loss.asnumpy()
            print(" | Loss: {:.6f}".format(float(loss)))

            num_data += 1
            total_loss += loss
            avg_loss = total_loss / num_data

            print(" | Current AVG loss:", avg_loss)
            print(" | Current AVG ppl:", math.exp(avg_loss))

        ppl = math.exp(avg_loss)
        # avg_ppl = total_loss / num_data
        print("-----------------------------------------")
        print(" PPL: {:.6f}".format(ppl))
        print("************** Testing Finished **************")
    else:
        raise ValueError(
            "metric method not supported, support: [accuracy, ppl]")
def do_eval(dataset=None,
            network=None,
            metric=None,
            load_checkpoint_path="",
            eval_type=None,
            tokenizer_file="",
            top_k=None,
            top_p=None,
            temperature=None,
            generate_length=None):
    """
    Do evaluation on summarization
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")
    if metric.lower() == "rouge":
        print("Prepare to calculate the Rouge score ...")
        callback = Rouge()

        gpt2_loss = network(config=gpt2_net_cfg,
                            is_training=False,
                            use_one_hot_embeddings=False)
        gpt2_loss.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)

        reorganized_param_dict = modify_paramdict(param_dict,
                                                  mode=eval_type,
                                                  model_prefix="gpt2.")
        load_param_into_net(gpt2_loss, reorganized_param_dict)

        # load nn.Cell into Model and initiate tokenizer and Sample
        model = Model(gpt2_loss)
        tokenizer = Tokenizer(vocab_file=tokenizer_file + 'gpt2-vocab.json',
                              merge_file=tokenizer_file + 'gpt2-merges.txt')

        # load data and process text generation
        columns_list = ["input_ids", "input_mask", "label_ids"]

        summarization_generator = GenerateForSummarization(
            model,
            config=gpt2_net_cfg,
            tokenizer=tokenizer,
            select_sentence=3,
            eval_type=eval_type,
            topk=top_k,
            topp=float(top_p),
            temperature=float(temperature),
            generate_length=generate_length)
        num_data = 1
        print(
            "==================== [Summarization] Testing ====================")
        for data in dataset.create_dict_iterator():
            input_data = []
            for value in columns_list:
                input_data.append(data[value])
            input_ids, _, label_ids = input_data
            print(" | [ROUGE] number : {} / {} ".format(
                num_data, dataset.get_dataset_size()))
            print("input_ids shape: {}".format(input_ids.shape))
            print("label_ids shape: {}".format(label_ids.shape))

            hypothesis, ref = summarization_generator.generate_for_summarization(
                input_ids)
            if ref[0] == '' or ref[0] is None:
                print("Sorry ref_list is None, skip it!")
                continue

            print("REF str:\n ", ref, "\nHYPO str:\n", hypothesis, "\n")
            for batch_idx in range(gpt2_net_cfg.batch_size):
                hypothesis[batch_idx] = clean_hypo(hypothesis[batch_idx])
            for batch_idx in range(gpt2_net_cfg.batch_size):
                hypothesis[batch_idx] = hypothesis[batch_idx].lower()
                ref[batch_idx] = ref[batch_idx].lower()

            callback.update(hypothesis, ref)
            num_data += 1

        print("\n\n")
        print("**********************************************************")
        eval_result_print(metric, callback)
        print("******************** Testing Finished ********************")

    else:
        raise ValueError(
            "metric method not supported in summarization, support: [Rouge]")
Example #14
def do_eval(dataset=None,
            network=None,
            metric=None,
            load_checkpoint_path="",
            eval_type=None,
            stop_word_file="",
            generate_length_dynamic=True,
            tokenizer_file_path=""):
    """
    Do eval
    Args:
        dataset: the eval dataset.
        network:  the network with loss.
        metric: the evaluation method.
        load_checkpoint_path: the file path which saved finetune model checkpoint.
        eval_type: the eval type, i.e. zero-shot, finetuned.
        generate_length_dynamic (bool): True for the generate length is dynamic, False for fixed. Default: True.
        tokenizer_file_path: the tokenizer file path for vocab file and merge file.
        stop_word_file: stop word file for calculating Accuracy.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")

    tokenizer = Tokenizer(vocab_file=tokenizer_file_path + 'gpt2-vocab.json',
                          merge_file=tokenizer_file_path + 'gpt2-merges.txt')

    gpt2_lambada = network(config=gpt2_net_cfg,
                           is_training=False,
                           use_one_hot_embeddings=False)
    gpt2_lambada.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)

    if eval_type == "zero-shot":
        final_param_dict = {}
        for name, _ in param_dict.items():
            final_param_dict['gpt2.gpt2.' + name] = param_dict[name]
        final_param_dict['gpt2.dense1.weight'] = param_dict[
            'gpt2_embedding_lookup.embedding_table']
        load_param_into_net(gpt2_lambada, final_param_dict)
        print("load pretrained parameter successfully!\n")
    elif eval_type == "finetuned":
        load_param_into_net(gpt2_lambada, param_dict)
        print("load finetuned parameter successfully!\n")

    model = Model(gpt2_lambada)

    if metric.lower() == "accuracy":
        print("Prepare to calculate the accuracy score ...")

        callback = LastWordAccuracy()
        columns_list = ["input_ids", "input_mask", "input_length"]
        print("==================== [ACC] Testing ====================")
        lambada_generator = GenerateForLambada(
            decoder=model,
            config=gpt2_net_cfg,
            tokenizer=tokenizer,
            generate_length_dynamic=generate_length_dynamic,
            max_iterations=200,
            stop_word_file=stop_word_file)

        num_data = 1
        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, input_length = input_data
            print("| [ACC] number : {} / {} ".format(
                num_data, dataset.get_dataset_size()))

            logits = model.predict(input_ids, input_mask)
            predict_str = lambada_generator.generate_for_lambada(
                input_ids=input_ids, logits=logits, input_length=input_length)
            label_str = get_final_word_label(input_ids=input_ids,
                                             input_length=input_length,
                                             tokenizer=tokenizer)
            callback.update(predict_str, label_str)
            eval_result_print(metric, callback)
            num_data += 1

        print("\n\n")
        print("**********************************************************")
        eval_result_print(metric, callback)
        print("******************** Testing Finished ********************")

    elif metric.lower() == "ppl":
        print("Prepare to calculate the ppl score ...")
        cross_entropy = CrossEntropyCalculationWithMask(
            is_training=True,
            num_labels=gpt2_net_cfg.vocab_size,
            config=gpt2_net_cfg)
        columns_list = ["input_ids", "input_mask", "input_length"]
        num_data = 1
        total_loss = 0.0
        print("==================== [PPL] Testing ====================")
        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, input_length = input_data
            print("| [PPL] number : {} / {} ".format(
                num_data, dataset.get_dataset_size()))

            logits = model.predict(
                input_ids, input_mask)  # (batch_size, seq_len, vocab_size)
            avg_batch_loss = calculate_final_word_loss(logits,
                                                       gpt2_net_cfg.batch_size,
                                                       input_ids, input_length,
                                                       cross_entropy)

            total_loss += avg_batch_loss
            avg_total_loss = total_loss / num_data
            print(" | Current AVG loss:", avg_total_loss)
            print(" | Current AVG ppl:", math.exp(avg_total_loss))
            num_data += 1

        print("\n\n")
        print("**********************************************************")
        print("Average PPL: {:.6f}".format(math.exp(avg_total_loss)))
        print("******************** Testing Finished ********************")

    else:
        raise ValueError(
            "metric method not supported, support: [accuracy, ppl]")