    def prepare_config_and_inputs(self):
        # Keep all ids >= 3 so they stay clear of the special tokens below.
        input_ids = ids_tensor([self.batch_size, self.seq_length],
                               self.vocab_size).clamp(min=3)
        input_ids[:, -1] = self.eos_token_id  # Eos Token

        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length],
                                       self.vocab_size)

        config = PegasusConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            encoder_ffn_dim=self.intermediate_size,
            decoder_ffn_dim=self.intermediate_size,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
        )
        inputs_dict = prepare_pegasus_inputs_dict(config, input_ids,
                                                  decoder_input_ids)
        return config, inputs_dict
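
# The ids_tensor and prepare_pegasus_inputs_dict helpers used above come from
# the test utilities and are not shown in this listing. A minimal sketch of
# what they might look like, assuming the usual convention of random token ids
# and pad-token-derived attention masks (an assumption, not the verbatim
# helpers):
import torch

def ids_tensor(shape, vocab_size):
    # Random LongTensor of token ids in [0, vocab_size).
    return torch.randint(0, vocab_size, shape, dtype=torch.long)

def prepare_pegasus_inputs_dict(config, input_ids, decoder_input_ids):
    # Mask out pad tokens on both the encoder and decoder sides.
    return {
        "input_ids": input_ids,
        "decoder_input_ids": decoder_input_ids,
        "attention_mask": input_ids.ne(config.pad_token_id),
        "decoder_attention_mask": decoder_input_ids.ne(config.pad_token_id),
    }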

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

        attention_mask = None
        if self.use_attention_mask:
            attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)

        lm_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

        config = PegasusConfig(
            vocab_size=self.vocab_size,
            d_model=self.d_model,
            decoder_layers=self.decoder_layers,
            decoder_ffn_dim=self.decoder_ffn_dim,
            encoder_attention_heads=self.encoder_attention_heads,
            decoder_attention_heads=self.decoder_attention_heads,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            use_cache=self.use_cache,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.decoder_start_token_id,
            max_position_embeddings=self.max_position_embeddings,
            is_encoder_decoder=self.is_encoder_decoder,
        )

        return (
            config,
            input_ids,
            attention_mask,
            lm_labels,
        )
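
# A hedged usage sketch for the tuple this tester returns; `tester` stands in
# for an instance of the surrounding tester class. PegasusForCausalLM is the
# decoder-only head in transformers, but wiring it to this exact tester is an
# illustration, not part of the original source.
import torch
from transformers import PegasusForCausalLM

config, input_ids, attention_mask, lm_labels = tester.prepare_config_and_inputs()
model = PegasusForCausalLM(config).eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask, labels=lm_labels)
print(outputs.loss, outputs.logits.shape)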
Example #3
    def __init__(self, model: str = None):
        log.info(model)
        torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        log.info(torch_device)
        if model is None:
            model = "t5"
        self.modelName = model
        # path to all the files that will be used for inference
        self.path = f"./app/api/{model}/"
        self.model_path = self.path + "pytorch_model.bin"
        self.config_path = self.path + "config.json"

        # Select the correct model based on the passed model name; defaults to t5.
        if model == "t5":
            self.config = T5Config.from_json_file(self.config_path)
            self.model = T5ForConditionalGeneration(self.config)
            self.tokenizer = T5Tokenizer.from_pretrained(self.path)
            self.model.load_state_dict(torch.load(self.model_path, map_location=torch_device))
            self.model.eval()
        elif model == "google/pegasus-newsroom":
            self.config = PegasusConfig.from_json_file(self.config_path)
            self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = PegasusTokenizer.from_pretrained(model)
        elif model == "facebook/bart-large-cnn":
            self.config = BartConfig.from_json_file(self.config_path)
            self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = BartTokenizer.from_pretrained(model)
        else:
            raise ValueError(f"Unsupported model: {model}")

        self.text = str()
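
    # The inference entry point is not part of this excerpt. A minimal sketch
    # of how a summarize method might sit on top of these attributes; the
    # method name and generation parameters are illustrative, not from the
    # original source.
    def summarize(self, text: str, max_length: int = 150) -> str:
        inputs = self.tokenizer(text, truncation=True, return_tensors="pt")
        summary_ids = self.model.generate(inputs["input_ids"].to(self.model.device),
                                          num_beams=4,
                                          max_length=max_length,
                                          early_stopping=True)
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)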
Example #4
    def __init__(self, parent):
        self.config = PegasusConfig(
            vocab_size=99,
            d_model=24,
            encoder_layers=2,
            decoder_layers=2,
            encoder_attention_heads=2,
            decoder_attention_heads=2,
            encoder_ffn_dim=32,
            decoder_ffn_dim=32,
            max_position_embeddings=48,
            add_final_layer_norm=True,
        )

    def get_pipeline_config(self):
        return PegasusConfig(
            vocab_size=200,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            encoder_ffn_dim=self.intermediate_size,
            decoder_ffn_dim=self.intermediate_size,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=200,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
        )
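
# A hedged smoke test for the tiny config defined in __init__ above; building
# the model straight from the config gives randomly initialized weights, which
# is all these test-sized settings are meant for. The tester class name here
# is assumed from context.
from transformers import PegasusForConditionalGeneration

tester = PegasusModelTester(parent=None)
model = PegasusForConditionalGeneration(tester.config).eval()
print(sum(p.numel() for p in model.parameters()))  # tiny by construction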
Example #6
def main():
    pagesus_pretrain_path = './page_arciv/'
    tokenizer = PegasusTokenizer.from_pretrained(pagesus_pretrain_path)
    config_path = os.path.join(pagesus_pretrain_path, 'config.json')
    psus_config = PegasusConfig.from_json_file(config_path)
    MAX_LEN = 1024
    decode_max_len = 256
    data = load_data('./final_test_data_list.json')
    model = build_model(pagesus_pretrain_path, psus_config, MAX_LEN,
                        decode_max_len)
    model.load_weights('./pagesus_section/best_model.hdf5')
    autotitle = AutoTitle(start_id=tokenizer.pad_token_id,
                          end_id=tokenizer.eos_token_id,
                          maxlen=256,
                          max_decode_len=decode_max_len,
                          model=model)

    result = just_predict(autotitle, tokenizer, MAX_LEN, data)
    with open('./pred_result.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(result, ensure_ascii=False, cls=NpEncoder))
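
# just_predict serializes its result with a custom NpEncoder that is not
# shown here. A common implementation that makes numpy types JSON-serializable;
# this is an assumption about the original helper, not its verbatim code.
import json

import numpy as np

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        # Fall back to native Python types for numpy scalars and arrays.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)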
def convert_pegasus_to_bart(
        tf_weights: dict,
        cfg_updates: dict) -> PegasusForConditionalGeneration:
    cfg_kwargs = DEFAULTS.copy()
    cfg_kwargs.update(cfg_updates)

    cfg = PegasusConfig(**cfg_kwargs)  # use the merged defaults, not just the updates
    bart = PegasusForConditionalGeneration(cfg)
    sd = bart.model.state_dict()
    mapping = {}
    for k, v in tf_weights.items():
        new_k = rename_state_dict_key(k)
        if new_k not in sd:
            raise ValueError(
                f"could not find new key {new_k} in state dict. (converted from {k})"
            )

        if "dense" in k or "proj" in new_k:
            v = v.T
        mapping[new_k] = torch.tensor(v, dtype=sd[new_k].dtype)
        assert v.shape == sd[new_k].shape, f"{new_k}, {k}, {v.shape}, {sd[new_k].shape}"
    # make sure embedding.padding_idx is respected
    mapping["shared.weight"][cfg.pad_token_id] = torch.zeros_like(
        mapping["shared.weight"][cfg.pad_token_id + 1])
    mapping["encoder.embed_tokens.weight"] = mapping["shared.weight"]
    mapping["decoder.embed_tokens.weight"] = mapping["shared.weight"]
    empty_biases = {
        k: torch.zeros_like(v)
        for k, v in sd.items() if k.endswith("bias") and k not in mapping
    }
    mapping.update(**empty_biases)
    missing, extra = bart.model.load_state_dict(mapping, strict=False)
    unexpected_missing = [
        k for k in missing if k not in
        ["encoder.embed_positions.weight", "decoder.embed_positions.weight"]
    ]
    assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}"
    assert extra == [], f"no matches found for the following tf keys {extra}"
    return bart
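
# rename_state_dict_key is assumed by the converter above but not included in
# this excerpt. Helpers like it usually apply an ordered substitution table;
# the (tf_fragment, torch_fragment) pairs below are illustrative only, since
# the real table depends on the TF checkpoint layout.
PATTERNS = [
    ("/", "."),
    ("layer_norm", "layernorm"),
    ("kernel", "weight"),
]

def rename_state_dict_key(k: str) -> str:
    for tf_name, torch_name in PATTERNS:
        k = k.replace(tf_name, torch_name)
    return k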
Example #8
def main():
    pagesus_pretrain_path = './page_arciv/'
    tokenizer = PegasusTokenizer.from_pretrained(pagesus_pretrain_path)
    config_path = os.path.join(pagesus_pretrain_path, 'config.json')
    psus_config = PegasusConfig.from_json_file(config_path)
    MAX_LEN = 1920
    decode_max_len = 600
    batch_size = 2
    data = load_data(
        '/home_zyz/abstract_generate/final_abdata/union_add_noabs_cleaned_1920.json'
    )
    random.shuffle(data)
    print(len(data))
    print(data[0][0])
    print(data[0][1])
    valid_data = data[:5]
    train_data = data[5:]
    train_generator = data_generator(train_data, batch_size, MAX_LEN,
                                     decode_max_len, tokenizer)

    K.clear_session()
    strategy = tf.distribute.MirroredStrategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
    with strategy.scope():
        model = build_model(pagesus_pretrain_path, psus_config, MAX_LEN,
                            decode_max_len)

    epochs = 50
    autotitle = AutoTitle(start_id=tokenizer.pad_token_id,
                          end_id=tokenizer.eos_token_id,
                          maxlen=599,
                          max_decode_len=decode_max_len,
                          model=model)
    evaluator = Evaluator(tokenizer, MAX_LEN, autotitle, valid_data)
    model.fit(train_generator.forfit(),
              steps_per_epoch=len(train_generator) - 1,
              epochs=epochs,
              callbacks=[evaluator])
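
# load_data and data_generator are project helpers outside this excerpt.
# Given how `data` is indexed above (data[0][0] as source text, data[0][1] as
# target summary), a loader of roughly this shape is assumed; the JSON field
# names are hypothetical.
import json

def load_data(path):
    with open(path, encoding='utf-8') as f:
        records = json.load(f)
    return [(r['text'], r['summary']) for r in records]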
Example #9
def get_parameter_number(model):
    # Only the return line of this helper survived in the source; the body is
    # reconstructed as the standard total/trainable parameter count.
    total_num = sum(p.numel() for p in model.parameters())
    trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return {'Total': total_num, 'Trainable': trainable_num}


text = "四海网讯,近日,有媒体报道称:章子怡真怀孕了!报道还援引知情人士消息称,“章子怡怀孕大概四五个月,预产期是年底前后,现在已经不接工作了。”这到底是怎么回事?消息是真是假?针对此消息,23日晚8时30分," \
       "华西都市报记者迅速联系上了与章子怡家里关系极好的知情人士,这位人士向华西都市报记者证实说:“子怡这次确实怀孕了。她已经36岁了,也该怀孕了。章子怡怀上汪峰的孩子后,子怡的父母亲十分高兴。子怡的母亲," \
       "已开始悉心照料女儿了。子怡的预产期大概是今年12月底。”当晚9时,华西都市报记者为了求证章子怡怀孕消息,又电话联系章子怡的亲哥哥章子男,但电话通了," \
       "一直没有人接听。有关章子怡怀孕的新闻自从2013年9月份章子怡和汪峰恋情以来,就被传N遍了!不过,时间跨入2015年,事情却发生着微妙的变化。2015年3月21日,章子怡担任制片人的电影《从天儿降》开机," \
       "在开机发布会上几张合影,让网友又燃起了好奇心:“章子怡真的怀孕了吗?”但后据证实,章子怡的“大肚照”只是影片宣传的噱头。过了四个月的7月22日,《太平轮》新一轮宣传,章子怡又被发现状态不佳,不时深呼吸," \
       "不自觉想捂住肚子,又觉得不妥。然后在8月的一天,章子怡和朋友吃饭,在酒店门口被风行工作室拍到了,疑似有孕在身!今年7月11日,汪峰本来在上海要举行演唱会,后来因为台风“灿鸿”取消了。而消息人士称," \
       "汪峰原来打算在演唱会上当着章子怡的面宣布重大消息,而且章子怡已经赴上海准备参加演唱会了,怎知遇到台风,只好延期,相信9月26日的演唱会应该还会有惊喜大白天下吧。 "

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained(
    'uer/pegasus-base-chinese-cluecorpussmall')

config = PegasusConfig()
# config.activation_dropout = 0.1
config.activation_function = 'relu'
config.d_model = 384
config.decoder_attention_heads = 6
config.decoder_ffn_dim = 1536
config.decoder_start_token_id = 101
config.decoder_layers = 6
config.dropout = 0.0  # disable dropout so repeated predictions are deterministic
config.encoder_attention_heads = 6
config.encoder_ffn_dim = 1536
config.encoder_layers = 6
config.forced_eos_token_id = 102
config.scale_embedding = True
config.vocab_size = 21128
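
# A hedged sketch of putting the config above to use. Whether this hand-built
# config can be passed unchanged to from_pretrained for this checkpoint is an
# assumption, and the generation settings are illustrative.
model = PegasusForConditionalGeneration.from_pretrained(
    'uer/pegasus-base-chinese-cluecorpussmall', config=config).to(device)
model.eval()

inputs = tokenizer(text, truncation=True, max_length=512, return_tensors='pt').to(device)
with torch.no_grad():
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=64)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))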