def get_model_tokenizer(model_name):
    import torch
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if "pegasus" in model_name:
        # it's a Pegasus model
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    elif "bart-large" in model_name:
        # its a bart-model
        from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    elif "bart-custom-large" in model_name:
        from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    else:
        # T5 or DistilBART
        # AutoModelWithLMHead is deprecated; AutoModelForSeq2SeqLM covers
        # T5- and BART-style encoder-decoder models.
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
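A minimal usage sketch for the helper above; the checkpoint name is just an example of a Pegasus model that would take the first branch:

model, tokenizer = get_model_tokenizer("google/pegasus-xsum")
inputs = tokenizer(["The tower is 324 metres tall."], truncation=True,
                   padding="longest", return_tensors="pt").to(model.device)
summary_ids = model.generate(**inputs)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))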
Example #2
def compute(sm):
    import torch
    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    # Load the Pegasus model
    model_name = 'google/pegasus-xsum'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    sm_len = len(sm)

    sen_list = splitText(sm, sm_len)  # Get sections to be summarized

    try:
        batches = []
        for s in sen_list:  # Preparation
            batch = tokenizer.prepare_seq2seq_batch(
                [s], truncation=True, padding='longest').to(torch_device)
            batches.append(batch)
    except Exception:  # tokenization failed; give up on this text
        return ""

    temp = []
    for b in batches:  # Summary generation
        translated = model.generate(**b)
        temp.append(translated)

    final_summary = []
    for t in temp:  # Put together the summaries from the different sections
        final_summary.append(
            tokenizer.batch_decode(t, skip_special_tokens=True)[0])

    return final_summary
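compute depends on a splitText helper that is not shown here; a minimal stand-in, assuming all it does is chunk the input into sections short enough to summarize:

def splitText(text, text_len, max_chars=2000):
    # Hypothetical replacement for the missing helper: naive fixed-size chunks.
    return [text[i:i + max_chars] for i in range(0, text_len, max_chars)]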
Example #3
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer


class ParaPhrasing:
    """Loads the Pegasus paraphrase model for text augmentation."""
    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)

    @staticmethod
    def paraphrases(input_text, num_return_sequences=10, num_beams=10):
        """
        Generates variations of a given sentence or text.

        :param input_text: sentence or text
        :param num_return_sequences: Number of variations to be returned
        :param num_beams: Number of beams for beam search. 1 means no beam search
        :return: list of variations of the input text
        """
        if isinstance(input_text, str):
            input_text = [input_text]
        batch = ParaPhrasing.tokenizer.prepare_seq2seq_batch(
            input_text, truncation=True, padding='longest',
            max_length=60).to(ParaPhrasing.torch_device)
        translated = ParaPhrasing.model.generate(
            **batch,
            max_length=60,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            temperature=1.5)
        tgt_text = ParaPhrasing.tokenizer.batch_decode(
            translated, skip_special_tokens=True)
        return tgt_text
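Usage sketch for the class above (the input sentence is arbitrary; num_return_sequences must not exceed num_beams):

variants = ParaPhrasing.paraphrases("The weather is nice today.",
                                    num_return_sequences=3, num_beams=5)
print(variants)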
Example #4
    def __init__(self, model: str = None):
        log.info(model)
        torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        log.info(torch_device)
        if model is None:
            model = "t5"
        self.modelName = model
        # path to all the files that will be used for inference
        self.path = f"./app/api/{model}/"
        self.model_path = self.path + "pytorch_model.bin"
        self.config_path = self.path + "config.json"

        # Select the correct model based on the passed model name. Default: t5
        if model == "t5":
            self.config = T5Config.from_json_file(self.config_path)
            self.model = T5ForConditionalGeneration(self.config)
            self.tokenizer = T5Tokenizer.from_pretrained(self.path)
            self.model.eval()
            self.model.load_state_dict(torch.load(self.model_path, map_location=torch_device))
        elif model == "google/pegasus-newsroom":
            self.config = PegasusConfig.from_json_file(self.config_path)
            # self.model = PegasusForConditionalGeneration(self.config)
            # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
            self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = PegasusTokenizer.from_pretrained(model)
        elif model == "facebook/bart-large-cnn":
            self.config = BartConfig.from_json_file(self.config_path)
            # self.model = PegasusForConditionalGeneration(self.config)
            # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
            self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = BartTokenizer.from_pretrained(model)
        else:
            raise ValueError(f"Model '{model}' is not supported")

        self.text = str()
Example #5
    def _test_TFPegasus(self, size, large=False):
        import numpy as np
        from transformers import PegasusTokenizer, TFPegasusModel
        tokenizer = PegasusTokenizer.from_pretrained(size)
        model = TFPegasusModel.from_pretrained(size)
        input_ids = \
            tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids
        decoder_input_ids = \
            tokenizer("Studies show that", return_tensors="tf").input_ids

        input_dict = {
            "input_ids": input_ids,
            "decoder_input_ids": decoder_input_ids
        }

        # this comes from TFPegasusEncoder/Decoder like:
        #   self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
        # While this is meant to come from the config, tf reports these as
        # model inputs; this might be new in tensorflow-2.4.2, we did not
        # notice it before.
        extra_input = {
            "tf_pegasus_model/model/decoder/mul/y:0":
            np.array([32.], dtype=np.float32),
            "tf_pegasus_model/model/encoder/mul/y:0":
            np.array([32.], dtype=np.float32)
        }
        spec, input_dict = self.spec_and_pad(
            input_dict, max_length=model.config.max_length)
        outputs = ["last_hidden_state"]
        self.run_test(model,
                      input_dict,
                      input_signature=spec,
                      outputs=outputs,
                      large=large,
                      extra_input=extra_input)
Example #6
    def exec(self, text):
        import re
        import torch
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer

        src_text = [text]
        model_name = self.model
        #model_name = 'google/pegasus-xsum'
        #model_name = 'google/pegasus-large'
        #model_name = 'google/pegasus-cnn_dailymail'
        #model_name = 'google/pegasus-pubmed'
        #model_name = 'google/pegasus-wikihow'
        #model_name = 'google/pegasus-newsroom'
        #model_name = 'google/pegasus-multi_news'
        #model_name = 'google/pegasus-reddit_tifu'
        #model_name = 'google/pegasus-arxiv'

        torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        batch = tokenizer.prepare_seq2seq_batch(
            src_text, truncation=True, padding='longest').to(torch_device)
        result = model.generate(**batch)
        tgt_text = tokenizer.batch_decode(result, skip_special_tokens=True)
        if self.model == "google/pegasus-cnn_dailymail":
            tgt_text[0] = re.sub('<n>', ' ', tgt_text[0])

        return tgt_text[0]
Example #7
 def __init__(self, config):
     self.model_name = 'google/pegasus-reddit_tifu'
     self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(f"using device: {self.device}")
     self.tokenizer = PegasusTokenizer.from_pretrained(self.model_name,
                                                       force_download=True)
     self.model = PegasusForConditionalGeneration.from_pretrained(
         self.model_name, force_download=True).to(self.device)
Example #8
    def load_model(self):
        model = PegasusForConditionalGeneration.from_pretrained(
            os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'model'))
        tokenizer = PegasusTokenizer.from_pretrained(
            os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'tokenizer'))
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        return model, tokenizer, device
Example #9
 def __init__(self, args, device):
     super().__init__(args, device)
     assert args.pretrained_model_name in self.PRETRAINED_MODEL_NAMES
     self.pretrained_model_name = args.pretrained_model_name
     logging.info(f'Loading Pegasus ({self.pretrained_model_name})')
     self.model = PegasusForConditionalGeneration.from_pretrained(
         self.pretrained_model_name).to(self.device)
     self.tokenizer: PegasusTokenizer = PegasusTokenizer.from_pretrained(
         self.pretrained_model_name)
Example #10
def generate_summary(text, model_name):
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(
        text, truncation=True, padding="longest",
        return_tensors="pt").to(torch_device)
    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
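A possible call to the function above, using one of the public Pegasus checkpoints:

print(generate_summary(
    "PG&E scheduled the blackouts in response to forecasts for high winds.",
    "google/pegasus-xsum"))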
Example #11
def generate_summary(context):
    model_name = 'google/pegasus-xsum'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
    batch = tokenizer.prepare_seq2seq_batch(src_texts=[context],
                                            truncation=True,
                                            padding='max_length',
                                            return_tensors="pt")
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
Example #12
def get_summary(text):
    import torch
    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    try:
        model_name = 'google/pegasus-xsum'
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
        src_text = [text]
        batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
        translated = model.generate(**batch)
        target = tokenizer.batch_decode(translated, skip_special_tokens=True)
    except Exception:
        print("API Error occurred")
        return -100
    return target[0]
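Usage sketch; the input text is arbitrary and -100 signals a failure:

result = get_summary("Scientists have discovered a new species of beetle in the Amazon.")
if result != -100:
    print(result)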
Example #13
    def single_document_summarization(self, src_text):
        tokenizer = PegasusTokenizer.from_pretrained(self.model_name)
        model = PegasusForConditionalGeneration.from_pretrained(
            self.model_name).to(self.torch_device)
        batch = tokenizer(src_text,
                          truncation=True,
                          padding=True,
                          return_tensors='pt').to(self.torch_device)

        translated = model.generate(**batch)
        generated_summary = tokenizer.batch_decode(translated,
                                                   skip_special_tokens=True)
        return generated_summary
Example #14
    def test_tokenization_pegasus(self):
        # Given
        self.base_tokenizer = PegasusTokenizer.from_pretrained(
            'google/pegasus-cnn_dailymail', cache_dir=self.test_dir)
        self.rust_tokenizer = PyPegasusTokenizer(get_from_cache(
            'https://cdn.huggingface.co/google/pegasus-cnn_dailymail/spiece.model'
        ),
                                                 do_lower_case=False)

        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        # Note: the original sentence piece tokenizer strips trailing spaces
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a.strip() for example in self.examples],
            max_len=256,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust,
                  baseline) in enumerate(zip(output_rust, output_baseline)):
            if rust.token_ids != baseline['input_ids']:
                # Accept a pure reordering of the same tokens; any length or
                # multiset difference is a real mismatch.
                if len(rust.token_ids) != len(baseline['input_ids']) or \
                        Counter(rust.token_ids) != Counter(baseline['input_ids']):
                    raise AssertionError(
                        f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                        f'Sentence a: {self.examples[idx].text_a} \n'
                        f'Sentence b: {self.examples[idx].text_b} \n'
                        f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                        f'Rust: {rust.token_ids} \n'
                        f'Python {baseline["input_ids"]}')
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
Example #15
def load_BART_or_PEGASUS(mname):
    if 'bart' in mname.lower():
        from transformers import BartTokenizer, BartForConditionalGeneration

        model = BartForConditionalGeneration.from_pretrained(mname)
        tokenizer = BartTokenizer.from_pretrained(mname)
    elif 'pegasus' in mname.lower():
        from transformers import PegasusTokenizer, PegasusForConditionalGeneration

        model = PegasusForConditionalGeneration.from_pretrained(mname)
        tokenizer = PegasusTokenizer.from_pretrained(mname)
    else:
        raise NotImplementedError("UNKOWN model name.")
    return model, tokenizer
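Usage sketch; both names below are public Hugging Face checkpoints:

model, tokenizer = load_BART_or_PEGASUS('facebook/bart-large-cnn')
# or: model, tokenizer = load_BART_or_PEGASUS('google/pegasus-xsum')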
Example #16
def summarizeP(src_text, variant="xsum", device=None):
    import torch
    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    model_name = "google/pegasus-" + variant
    torch_device = ('cuda' if torch.cuda.is_available() else
                    'cpu') if device is None else device
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(src_text,
                                            truncation=True,
                                            padding='longest').to(torch_device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
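Example call (sketch; any Pegasus variant name such as "xsum" or "large" works):

print(summarizeP(["The quick brown fox jumps over the lazy dog."], variant="xsum"))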
Example #17
def load_data(dataset_dir, data_name, tokenizer_name='bart-large-cnn',
              batch_size=7, split='test', max_sample_num=34, max_length=500):
    if data_name == 'xsum':
        dataset = load_dataset(data_name, cache_dir=dataset_dir, split=split)
        print("Assume only use one subset of the dataset")
        if len(dataset) > max_sample_num:
            dataset = dataset.shuffle()
    elif data_name == 'cnndm' or data_name == "cnn_dailymail":
        # dataset = load_dataset('cnn_dailymail', '3.0.0', cache_dir=dataset_dir, split=split)
        # import tensorflow_datasets as tfds
        # cnndm_dir = '/mnt/data0/user/data/better_cnndm/formal_data/test'
        dataset = yield_cnndm()
    else:
        raise NotImplementedError("Unkown dataset")

    if 'bart' in tokenizer_name:
        tokenizer = BartTokenizer.from_pretrained(tokenizer_name)
    elif 'gpt' in tokenizer_name:
        from transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif 'pegasus' in tokenizer_name:
        from transformers import PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(tokenizer_name)
        print("Load PEGASUS tokenizer...")
    else:
        raise NotImplementedError

    cur_src_txt, cur_tgt_txt = [], []
    cnt = 0
    for example in dataset:
        if data_name == 'xsum':
            doc = example[dataset_meta[data_name]['key_doc']]
            summary = example[dataset_meta[data_name]['key_sum']]
        elif data_name == 'cnn_dailymail' or data_name == 'cnndm':
            doc, summary = example
        else:
            raise NotImplementedError
        cur_src_txt.append(doc)
        cur_tgt_txt.append(summary)
        if len(cur_src_txt) == batch_size:
            assert len(cur_src_txt) == len(cur_tgt_txt)
            batch = tokenizer.prepare_seq2seq_batch(cur_src_txt, tgt_texts=cur_tgt_txt, max_length=max_length,
                                                    truncation=True, padding='longest', return_tensors='pt')

            yield batch
            cur_src_txt, cur_tgt_txt = [], []
        cnt += 1
        if cnt > max_sample_num:
            break
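A sketch of how the generator above might be consumed. load_dataset is from the datasets library, while dataset_meta and yield_cnndm are helpers defined elsewhere in the original file; the fully-qualified tokenizer name is an assumption (the bare 'bart-large-cnn' default would not resolve on the Hub):

for batch in load_data('./data', 'xsum', tokenizer_name='facebook/bart-large-cnn',
                       batch_size=4, max_sample_num=8):
    print(batch['input_ids'].shape)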
Example #18
def convert_pegasus_ckpt_to_pytorch(ckpt_path, save_dir):
    from pathlib import Path

    # save tokenizer first
    dataset = Path(ckpt_path).parent.name
    desired_max_model_length = max_model_length[dataset]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=desired_max_model_length)
    assert tok.model_max_length == desired_max_model_length
    tok.save_pretrained(save_dir)

    # convert model
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    cfg_updates = dict(max_length=max_gen_length[dataset],
                       length_penalty=expected_alpha.get(dataset, 0.8))
    torch_model = convert_pegasus_to_bart(tf_weights, cfg_updates)
    torch_model.save_pretrained(save_dir)
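Hypothetical invocation (the checkpoint path and output directory are placeholders; get_tf_weights_as_numpy and convert_pegasus_to_bart come from the original conversion script):

convert_pegasus_ckpt_to_pytorch('./ckpt/aeslc/model.ckpt-32000', './pegasus-aeslc-pt')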
Example #19
def execute_pegasus_augmentation(data, file_path) -> pd.DataFrame:
    MODEL_NAME = var.PARAPHRASING_MODEL
    tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
    model = PegasusForConditionalGeneration.from_pretrained(MODEL_NAME).to(torch_device)
    train = data.copy()
    train = train[['summary', 'sentiment']]
    number_sequences = 10
    train['paraphrased text'] = train['summary'].progress_apply(
        get_response,
        num_return_sequences=number_sequences,
        tokenizer=tokenizer,
        model=model)
    generated = train.explode('paraphrased text')
    generated = generated.dropna()
    generated.to_csv('{}-Processed-Summarized-Augmented.csv'.format(file_path), index=False)
    return generated
Example #20
def to_pytorch(ckpt_path, save_path):
    dataset = Path(ckpt_path).parent.name
    desired_max_model_length = task_params[f"sum_{dataset}"]["n_pos"]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=desired_max_model_length)
    assert tok.model_max_length == desired_max_model_length
    tok.save_pretrained(save_path)
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    cfg_updates = task_params[f"sum_{dataset}"]
    if dataset == "large":
        cfg_updates["task_params"] = task_params
    torch_model = convert_pegasus(tf_weights, cfg_updates)
    torch_model.save_pretrained(save_path)
    sd = torch_model.state_dict()
    sd.pop("model.decoder.embed_positions.weight")
    sd.pop("model.encoder.embed_positions.weight")
    torch.save(sd, Path(save_path) / "pytorch_model.bin")
Example #21
class ParaPhrasing:
    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    @staticmethod
    def paraphrases(input_text, num_return_sequences=10, num_beams=10):
        if isinstance(input_text, str):
            input_text = [input_text]
        batch = ParaPhrasing.tokenizer.prepare_seq2seq_batch(input_text, truncation=True, padding='longest',
                                                             max_length=60).to(
            ParaPhrasing.torch_device)
        translated = ParaPhrasing.model.generate(**batch, max_length=60, num_beams=num_beams,
                                                 num_return_sequences=num_return_sequences, temperature=1.5)
        tgt_text = ParaPhrasing.tokenizer.batch_decode(translated, skip_special_tokens=True)
        return tgt_text
Example #22
def generate_summary(text):

    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    # Create tokenizer
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    # load pretrained model
    model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum")

    # convert into tokens (number representation of text)
    tokens = tokenizer(text,
                       truncation=True,
                       padding="longest",
                       return_tensors="pt")
    summary = model.generate(**tokens)
    # decode the generated ids, skipping special tokens such as </s>
    summarized = tokenizer.decode(summary[0], skip_special_tokens=True)
    return summarized
Example #23
    def test_pegasus_xsum_summary(self):
        model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
        tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

        src_text = [
            """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""",
            """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a  re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """,
        ]

        tgt_text = [
            "California's largest electricity provider has turned off power to hundreds of thousands of customers.",
            "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.",
        ]

        inputs = tokenizer(src_text, return_tensors="np", truncation=True, max_length=512, padding=True)
        translated_tokens = model.generate(**inputs, num_beams=2).sequences
        decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        assert tgt_text == decoded
Example #24
def main():
    pagesus_pretrain_path = './page_arciv/'
    tokenizer = PegasusTokenizer.from_pretrained(pagesus_pretrain_path)
    config_path = os.path.join(pagesus_pretrain_path, 'config.json')
    psus_config = PegasusConfig.from_json_file(config_path)
    MAX_LEN = 1024
    decode_max_len = 256
    data = load_data('./final_test_data_list.json')
    model = build_model(pagesus_pretrain_path, psus_config, MAX_LEN,
                        decode_max_len)
    model.load_weights('./pagesus_section/best_model.hdf5')
    autotitle = AutoTitle(start_id=tokenizer.pad_token_id,
                          end_id=tokenizer.eos_token_id,
                          maxlen=256,
                          max_decode_len=decode_max_len,
                          model=model)

    result = just_predict(autotitle, tokenizer, MAX_LEN, data)
    with open('./pred_result.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(result, ensure_ascii=False, cls=NpEncoder))
Example #25
def convert_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str):
    # save tokenizer first
    dataset = Path(ckpt_path).parent.name
    desired_max_model_length = task_specific_params[
        f"summarization_{dataset}"]["max_position_embeddings"]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=desired_max_model_length)
    assert tok.model_max_length == desired_max_model_length
    tok.save_pretrained(save_dir)

    # convert model
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    cfg_updates = task_specific_params[f"summarization_{dataset}"]
    if dataset == "large":
        cfg_updates["task_specific_params"] = task_specific_params
    torch_model = convert_pegasus(tf_weights, cfg_updates)
    torch_model.save_pretrained(save_dir)
    sd = torch_model.state_dict()
    sd.pop("model.decoder.embed_positions.weight")
    sd.pop("model.encoder.embed_positions.weight")
    torch.save(sd, Path(save_dir) / "pytorch_model.bin")
Example #26
def index(request):
    if request.method == 'POST':
        form = textForm(request.POST, request.FILES)
        if form.is_valid():
            _type = form.cleaned_data['_type']
            text = form.cleaned_data['text']
            percent = form.cleaned_data['percent']
            if (text == ""):
                file = request.FILES['file']
                text = ''
                for line in file:
                    text += line.decode()
            tokenized_sentence = sent_tokenize(text)
            if (_type == 'Extractive'):
                summary = summarize(tokenized_sentence, percent)
                return render(request, 'summary/summary.html', {
                    'text': text,
                    'summary': summary,
                    'percent': percent
                })
            elif (_type == 'Abstractive'):
                model_name = 'google/pegasus-xsum'
                torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
                tokenizer = PegasusTokenizer.from_pretrained(model_name)
                model = PegasusForConditionalGeneration.from_pretrained(
                    model_name).to(torch_device)
                batch = tokenizer.prepare_seq2seq_batch(
                    [text], truncation=True,
                    padding='longest').to(torch_device)
                translated = model.generate(**batch)
                summary = tokenizer.batch_decode(translated,
                                                 skip_special_tokens=True)
                return render(
                    request, 'summary/summary.html', {
                        'text': text,
                        'summary': summary[0],
                        'percent': "Not Applicable"
                    })
    return render(request, 'summary/index.html', {'form': textForm()})
Example #27
def main():
    pagesus_pretrain_path = './page_arciv/'
    tokenizer = PegasusTokenizer.from_pretrained(pagesus_pretrain_path)
    config_path = os.path.join(pagesus_pretrain_path, 'config.json')
    psus_config = PegasusConfig.from_json_file(config_path)
    MAX_LEN = 1920
    decode_max_len = 600
    batch_size = 2
    data = load_data(
        '/home_zyz/abstract_generate/final_abdata/union_add_noabs_cleaned_1920.json'
    )
    random.shuffle(data)
    print(len(data))
    print(data[0][0])
    print(data[0][1])
    valid_data = data[:5]
    train_data = data[5:]
    train_generator = data_generator(train_data, batch_size, MAX_LEN,
                                     decode_max_len, tokenizer)

    K.clear_session()
    strategy = tf.distribute.MirroredStrategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
    with strategy.scope():
        model = build_model(pagesus_pretrain_path, psus_config, MAX_LEN,
                            decode_max_len)

    epochs = 50
    autotitle = AutoTitle(start_id=tokenizer.pad_token_id,
                          end_id=tokenizer.eos_token_id,
                          maxlen=599,
                          max_decode_len=decode_max_len,
                          model=model)
    evaluator = Evaluator(tokenizer, MAX_LEN, autotitle, valid_data)
    model.fit(train_generator.forfit(),
              steps_per_epoch=len(train_generator) - 1,
              epochs=epochs,
              callbacks=[evaluator])
Example #28
def run_one_fig(spec, args, num_samples=300):
    print(f"--{spec}--")
    CUR_DIR = os.path.join(args.prob_meta_dir, spec)
    args.cur_dir = CUR_DIR
    files = os.listdir(CUR_DIR)
    random.shuffle(files)
    files = files[:num_samples]

    BOS_TOKEN = 0
    print(args.spec_name)
    if 'pegasus' in args.model_name:
        from transformers import PegasusTokenizer

        bpe_tokenizer = PegasusTokenizer.from_pretrained(args.model_name)
        EOS_TOK_IDs = [106, bpe_tokenizer.eos_token_id, 2]  # <n>
    elif 'gpt' in args.model_name:
        from transformers import GPT2Tokenizer

        bpe_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        EOS_TOK_IDs = [bpe_tokenizer.eos_token_id]
    elif 'bart' in args.model_name:
        from transformers import BartTokenizer

        bpe_tokenizer = BartTokenizer.from_pretrained(args.model_name)
        EOS_TOK_IDs = [bpe_tokenizer.eos_token_id]
    else:
        raise NotImplementedError
    # process_data_single(args, files[0], eos_token_ids=EOS_TOK_IDs)
    len_samples = len(files)
    cpu_cnt = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=cpu_cnt) as pool:
        results = pool.starmap(process_data_single, zip([args] * len_samples, files, [EOS_TOK_IDs] * len_samples))
    output = list(itertools.chain.from_iterable(results))
    print(f"Samples: {len(output)}")
    output = proceed_data(10, output)
    return output
Example #29
    def remove_example_from_description(text):
        if '## Example' in text:
            text = re.sub(r'## Example(.*)', '', text)
            text = re.sub(r"\`\`\`.*?\`\`\`", '', text, flags=re.DOTALL)
        return text

    for i, doc in enumerate(docs):
        markdown_without_example = remove_example_from_description(doc['markdown_description'])
        docs[i]['markdown_without_example'] = markdown_without_example
        # LOGGER.debug(markdown_without_example)

    # Generate 1 sentence summaries for the models
    if not args.quick_run:
        from transformers import PegasusTokenizer, PegasusForConditionalGeneration
        mname = "google/pegasus-large"
        model = PegasusForConditionalGeneration.from_pretrained(mname)
        tok = PegasusTokenizer.from_pretrained(mname)

        def summarise(text):
            batch = tok.prepare_seq2seq_batch(src_texts=[text])  # don't need tgt_text for inference
            gen = model.generate(**batch)
            return tok.batch_decode(gen, skip_special_tokens=True)[0]

        for i, doc in enumerate(docs):
            if 'short_description' not in docs[i].keys():
                short_description = summarise(doc['description'])
                docs[i]['short_description'] = short_description
                # LOGGER.debug(short_description)

    vi_client = ViClient(os.environ['VH_USERNAME'], os.environ['VH_API_KEY'])
    ids = vi_client.get_field_across_documents('_id', docs)
    if args.reset_collection:
Example #30
                                    data['pred_distributions'],
                                    data['logits'],
                                    data['input_doc'],
                                    BOS_TOKEN=bos_token_id,
                                    layer_num=lay_num)
        results += result
    result_in_arry = np.asarray(results)
    return result_in_arry.T


if __name__ == '__main__':
    print("Looking at  attention")
    if 'pegasus' in MODEL_NAME:
        from transformers import PegasusTokenizer

        bpe_tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
        EOS_TOK_IDs = [106, bpe_tokenizer.eos_token_id]  # <n>
        bos_token_id = 0
    else:
        raise NotImplementedError
    # visualize_distribution(None,None)
    files = os.listdir(CUR_DIR)
    random.shuffle(files)
    files = files[:20]

    if True:
        all_outputs = []
        for layer_num in range(16):
            print(f"Layer :{layer_num}")
            output_array = run_trial(layer_num, files)
            all_outputs.append(output_array)