def get_model_tokenizer(model_name):
    import torch
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if "pegasus" in model_name:
        # it's a Pegasus model
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    elif "bart-large" in model_name:
        # its a bart-model
        from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    elif "bart-custom-large" in model_name:
        from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    else:
        # T5 or DistilBART
        # AutoModelWithLMHead is deprecated; AutoModelForSeq2SeqLM covers
        # T5- and BART-style encoder-decoder models.
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
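A minimal usage sketch for the helper above; the checkpoint name is just an example of a Pegasus model that would take the first branch:

model, tokenizer = get_model_tokenizer("google/pegasus-xsum")
inputs = tokenizer(["The tower is 324 metres tall."], truncation=True,
                   padding="longest", return_tensors="pt").to(model.device)
summary_ids = model.generate(**inputs)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))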
Example #2
def compute(sm):
    import torch
    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    # Load the Pegasus model
    model_name = 'google/pegasus-xsum'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    sm_len = len(sm)

    sen_list = splitText(sm, sm_len)  # Get sections to be summarized

    try:
        batches = []
        for s in sen_list:  # Preparation
            batch = tokenizer.prepare_seq2seq_batch(
                [s], truncation=True, padding='longest').to(torch_device)
            batches.append(batch)
    except Exception:  # tokenization failed; give up on this text
        return ""

    temp = []
    for b in batches:  # Summary generation
        translated = model.generate(**b)
        temp.append(translated)

    final_summary = []
    for t in temp:  # Put together the summaries from the different sections
        final_summary.append(
            tokenizer.batch_decode(t, skip_special_tokens=True)[0])

    return final_summary
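compute depends on a splitText helper that is not shown here; a minimal stand-in, assuming all it does is chunk the input into sections short enough to summarize:

def splitText(text, text_len, max_chars=2000):
    # Hypothetical replacement for the missing helper: naive fixed-size chunks.
    return [text[i:i + max_chars] for i in range(0, text_len, max_chars)]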
Example #3
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer


class ParaPhrasing:
    """Loads the Pegasus paraphrase model for text augmentation."""
    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)

    @staticmethod
    def paraphrases(input_text, num_return_sequences=10, num_beams=10):
        """
        Generates variations of a given sentence or text.

        :param input_text: sentence or text
        :param num_return_sequences: Number of variations to be returned
        :param num_beams: Number of beams for beam search. 1 means no beam search
        :return: list of variations of the input text
        """
        if isinstance(input_text, str):
            input_text = [input_text]
        batch = ParaPhrasing.tokenizer.prepare_seq2seq_batch(
            input_text, truncation=True, padding='longest',
            max_length=60).to(ParaPhrasing.torch_device)
        translated = ParaPhrasing.model.generate(
            **batch,
            max_length=60,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            temperature=1.5)
        tgt_text = ParaPhrasing.tokenizer.batch_decode(
            translated, skip_special_tokens=True)
        return tgt_text
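Usage sketch for the class above (the input sentence is arbitrary; num_return_sequences must not exceed num_beams):

variants = ParaPhrasing.paraphrases("The weather is nice today.",
                                    num_return_sequences=3, num_beams=5)
print(variants)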
Example #4
    def __init__(self, model: str = None):
        log.info(model)
        torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        log.info(torch_device)
        if model is None:
            model = "t5"
        self.modelName = model
        # path to all the files that will be used for inference
        self.path = f"./app/api/{model}/"
        self.model_path = self.path + "pytorch_model.bin"
        self.config_path = self.path + "config.json"

        # Select the correct model based on the passed model name. Default: t5
        if model == "t5":
            self.config = T5Config.from_json_file(self.config_path)
            self.model = T5ForConditionalGeneration(self.config)
            self.tokenizer = T5Tokenizer.from_pretrained(self.path)
            self.model.eval()
            self.model.load_state_dict(torch.load(self.model_path, map_location=torch_device))
        elif model == "google/pegasus-newsroom":
            self.config = PegasusConfig.from_json_file(self.config_path)
            # self.model = PegasusForConditionalGeneration(self.config)
            # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
            self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = PegasusTokenizer.from_pretrained(model)
        elif model == "facebook/bart-large-cnn":
            self.config = BartConfig.from_json_file(self.config_path)
            # self.model = PegasusForConditionalGeneration(self.config)
            # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
            self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = BartTokenizer.from_pretrained(model)
        else:
            raise ValueError(f"Model '{model}' is not supported")

        self.text = str()
Example #5
    def _test_TFPegasus(self, size, large=False):
        import numpy as np
        from transformers import PegasusTokenizer, TFPegasusModel
        tokenizer = PegasusTokenizer.from_pretrained(size)
        model = TFPegasusModel.from_pretrained(size)
        input_ids = \
            tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids
        decoder_input_ids = \
            tokenizer("Studies show that", return_tensors="tf").input_ids

        input_dict = {
            "input_ids": input_ids,
            "decoder_input_ids": decoder_input_ids
        }

        # this comes from TFPegasusEncoder/Decoder like:
        #   self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
        # While this is meant to come from the config, tf reports these as
        # model inputs; this might be new in tensorflow-2.4.2, we did not
        # notice it before.
        extra_input = {
            "tf_pegasus_model/model/decoder/mul/y:0":
            np.array([32.], dtype=np.float32),
            "tf_pegasus_model/model/encoder/mul/y:0":
            np.array([32.], dtype=np.float32)
        }
        spec, input_dict = self.spec_and_pad(
            input_dict, max_length=model.config.max_length)
        outputs = ["last_hidden_state"]
        self.run_test(model,
                      input_dict,
                      input_signature=spec,
                      outputs=outputs,
                      large=large,
                      extra_input=extra_input)
Example #6
    def exec(self, text):
        import re
        import torch
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer

        src_text = [text]
        model_name = self.model
        #model_name = 'google/pegasus-xsum'
        #model_name = 'google/pegasus-large'
        #model_name = 'google/pegasus-cnn_dailymail'
        #model_name = 'google/pegasus-pubmed'
        #model_name = 'google/pegasus-wikihow'
        #model_name = 'google/pegasus-newsroom'
        #model_name = 'google/pegasus-multi_news'
        #model_name = 'google/pegasus-reddit_tifu'
        #model_name = 'google/pegasus-arxiv'

        torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        batch = tokenizer.prepare_seq2seq_batch(
            src_text, truncation=True, padding='longest').to(torch_device)
        result = model.generate(**batch)
        tgt_text = tokenizer.batch_decode(result, skip_special_tokens=True)
        if self.model == "google/pegasus-cnn_dailymail":
            tgt_text[0] = re.sub('<n>', ' ', tgt_text[0])

        return tgt_text[0]
Example #7
 def __init__(self, config):
     self.model_name = 'google/pegasus-reddit_tifu'
     self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(f"using device: {self.device}")
     self.tokenizer = PegasusTokenizer.from_pretrained(self.model_name,
                                                       force_download=True)
     self.model = PegasusForConditionalGeneration.from_pretrained(
         self.model_name, force_download=True).to(self.device)
Example #8
    def load_model(self):
        model = PegasusForConditionalGeneration.from_pretrained(
            os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'model'))
        tokenizer = PegasusTokenizer.from_pretrained(
            os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'tokenizer'))
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        return model, tokenizer, device
Example #9
 def __init__(self, args, device):
     super().__init__(args, device)
     assert args.pretrained_model_name in self.PRETRAINED_MODEL_NAMES
     self.pretrained_model_name = args.pretrained_model_name
     logging.info(f'Loading Pegasus ({self.pretrained_model_name})')
     self.model = PegasusForConditionalGeneration.from_pretrained(
         self.pretrained_model_name).to(self.device)
     self.tokenizer: PegasusTokenizer = PegasusTokenizer.from_pretrained(
         self.pretrained_model_name)
Example #10
def generate_summary(text, model_name):
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(
        text, truncation=True, padding="longest",
        return_tensors="pt").to(torch_device)
    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
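A possible call to the function above, using one of the public Pegasus checkpoints:

print(generate_summary(
    "PG&E scheduled the blackouts in response to forecasts for high winds.",
    "google/pegasus-xsum"))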
Example #11
def generate_summary(context):
    model_name = 'google/pegasus-xsum'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
    batch = tokenizer.prepare_seq2seq_batch(src_texts=[context],
                                            truncation=True,
                                            padding='max_length',
                                            return_tensors="pt")
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
Example #12
def get_summary(text):
    import torch
    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    try:
        model_name = 'google/pegasus-xsum'
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
        src_text = [text]
        batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
        translated = model.generate(**batch)
        target = tokenizer.batch_decode(translated, skip_special_tokens=True)
    except Exception:
        print("API Error occurred")
        return -100
    return target[0]
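Usage sketch; the input text is arbitrary and -100 signals a failure:

result = get_summary("Scientists have discovered a new species of beetle in the Amazon.")
if result != -100:
    print(result)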
Example #13
    def single_document_summarization(self, src_text):
        tokenizer = PegasusTokenizer.from_pretrained(self.model_name)
        model = PegasusForConditionalGeneration.from_pretrained(
            self.model_name).to(self.torch_device)
        batch = tokenizer(src_text,
                          truncation=True,
                          padding=True,
                          return_tensors='pt').to(self.torch_device)

        translated = model.generate(**batch)
        generated_summary = tokenizer.batch_decode(translated,
                                                   skip_special_tokens=True)
        return generated_summary
Example #14
    def test_tokenization_pegasus(self):
        # Given
        self.base_tokenizer = PegasusTokenizer.from_pretrained(
            'google/pegasus-cnn_dailymail', cache_dir=self.test_dir)
        self.rust_tokenizer = PyPegasusTokenizer(get_from_cache(
            'https://cdn.huggingface.co/google/pegasus-cnn_dailymail/spiece.model'
        ),
                                                 do_lower_case=False)

        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        # Note: the original sentence piece tokenizer strips trailing spaces
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a.strip() for example in self.examples],
            max_len=256,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust,
                  baseline) in enumerate(zip(output_rust, output_baseline)):
            if rust.token_ids != baseline['input_ids']:
                # Accept a pure reordering of the same tokens; any length or
                # multiset difference is a real mismatch.
                if len(rust.token_ids) != len(baseline['input_ids']) or \
                        Counter(rust.token_ids) != Counter(baseline['input_ids']):
                    raise AssertionError(
                        f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                        f'Sentence a: {self.examples[idx].text_a} \n'
                        f'Sentence b: {self.examples[idx].text_b} \n'
                        f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                        f'Rust: {rust.token_ids} \n'
                        f'Python {baseline["input_ids"]}')
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
Example #15
def load_BART_or_PEGASUS(mname):
    if 'bart' in mname.lower():
        from transformers import BartTokenizer, BartForConditionalGeneration

        model = BartForConditionalGeneration.from_pretrained(mname)
        tokenizer = BartTokenizer.from_pretrained(mname)
    elif 'pegasus' in mname.lower():
        from transformers import PegasusTokenizer, PegasusForConditionalGeneration

        model = PegasusForConditionalGeneration.from_pretrained(mname)
        tokenizer = PegasusTokenizer.from_pretrained(mname)
    else:
        raise NotImplementedError("UNKOWN model name.")
    return model, tokenizer
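Usage sketch; both names below are public Hugging Face checkpoints:

model, tokenizer = load_BART_or_PEGASUS('facebook/bart-large-cnn')
# or: model, tokenizer = load_BART_or_PEGASUS('google/pegasus-xsum')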
Example #16
def summarizeP(src_text, variant="xsum", device=None):
    import torch
    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    model_name = "google/pegasus-" + variant
    torch_device = ('cuda' if torch.cuda.is_available() else
                    'cpu') if device is None else device
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(src_text,
                                            truncation=True,
                                            padding='longest').to(torch_device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
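Example call (sketch; any Pegasus variant name such as "xsum" or "large" works):

print(summarizeP(["The quick brown fox jumps over the lazy dog."], variant="xsum"))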
Example #17
def load_data(dataset_dir, data_name, tokenizer_name='bart-large-cnn',
              batch_size=7, split='test', max_sample_num=34, max_length=500):
    if data_name == 'xsum':
        dataset = load_dataset(data_name, cache_dir=dataset_dir, split=split)
        print("Assume only use one subset of the dataset")
        if len(dataset) > max_sample_num:
            dataset = dataset.shuffle()
    elif data_name == 'cnndm' or data_name == "cnn_dailymail":
        # dataset = load_dataset('cnn_dailymail', '3.0.0', cache_dir=dataset_dir, split=split)
        # import tensorflow_datasets as tfds
        # cnndm_dir = '/mnt/data0/user/data/better_cnndm/formal_data/test'
        dataset = yield_cnndm()
    else:
        raise NotImplementedError("Unkown dataset")

    if 'bart' in tokenizer_name:
        tokenizer = BartTokenizer.from_pretrained(tokenizer_name)
    elif 'gpt' in tokenizer_name:
        from transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif 'pegasus' in tokenizer_name:
        from transformers import PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(tokenizer_name)
        print("Load PEGASUS tokenizer...")
    else:
        raise NotImplementedError

    cur_src_txt, cur_tgt_txt = [], []
    cnt = 0
    for example in dataset:
        if data_name == 'xsum':
            doc = example[dataset_meta[data_name]['key_doc']]
            summary = example[dataset_meta[data_name]['key_sum']]
        elif data_name == 'cnn_dailymail' or data_name == 'cnndm':
            doc, summary = example
        else:
            raise NotImplementedError
        cur_src_txt.append(doc)
        cur_tgt_txt.append(summary)
        if len(cur_src_txt) == batch_size:
            assert len(cur_src_txt) == len(cur_tgt_txt)
            batch = tokenizer.prepare_seq2seq_batch(cur_src_txt, tgt_texts=cur_tgt_txt, max_length=max_length,
                                                    truncation=True, padding='longest', return_tensors='pt')

            yield batch
            cur_src_txt, cur_tgt_txt = [], []
        cnt += 1
        if cnt > max_sample_num:
            break
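A sketch of how the generator above might be consumed. load_dataset is from the datasets library, while dataset_meta and yield_cnndm are helpers defined elsewhere in the original file; the fully-qualified tokenizer name is an assumption (the bare 'bart-large-cnn' default would not resolve on the Hub):

for batch in load_data('./data', 'xsum', tokenizer_name='facebook/bart-large-cnn',
                       batch_size=4, max_sample_num=8):
    print(batch['input_ids'].shape)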
Example #18
def convert_pegasus_ckpt_to_pytorch(ckpt_path, save_dir):
    from pathlib import Path

    # save tokenizer first
    dataset = Path(ckpt_path).parent.name
    desired_max_model_length = max_model_length[dataset]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=desired_max_model_length)
    assert tok.model_max_length == desired_max_model_length
    tok.save_pretrained(save_dir)

    # convert model
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    cfg_updates = dict(max_length=max_gen_length[dataset],
                       length_penalty=expected_alpha.get(dataset, 0.8))
    torch_model = convert_pegasus_to_bart(tf_weights, cfg_updates)
    torch_model.save_pretrained(save_dir)
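Hypothetical invocation (the checkpoint path and output directory are placeholders; get_tf_weights_as_numpy and convert_pegasus_to_bart come from the original conversion script):

convert_pegasus_ckpt_to_pytorch('./ckpt/aeslc/model.ckpt-32000', './pegasus-aeslc-pt')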
Example #19
def execute_pegasus_augmentation(data, file_path) -> pd.DataFrame:
    MODEL_NAME = var.PARAPHRASING_MODEL
    tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
    model = PegasusForConditionalGeneration.from_pretrained(MODEL_NAME).to(torch_device)
    train = data.copy()
    train = train[['summary', 'sentiment']]
    number_sequences = 10
    train['paraphrased text'] = train['summary'].progress_apply(
        get_response,
        num_return_sequences=number_sequences,
        tokenizer=tokenizer,
        model=model)
    generated = train.explode('paraphrased text')
    generated = generated.dropna()
    generated.to_csv('{}-Processed-Summarized-Augmented.csv'.format(file_path), index=False)
    return generated
Example #20
def to_pytorch(ckpt_path, save_path):
    dataset = Path(ckpt_path).parent.name
    desired_max_model_length = task_params[f"sum_{dataset}"]["n_pos"]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=desired_max_model_length)
    assert tok.model_max_length == desired_max_model_length
    tok.save_pretrained(save_path)
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    cfg_updates = task_params[f"sum_{dataset}"]
    if dataset == "large":
        cfg_updates["task_params"] = task_params
    torch_model = convert_pegasus(tf_weights, cfg_updates)
    torch_model.save_pretrained(save_path)
    sd = torch_model.state_dict()
    sd.pop("model.decoder.embed_positions.weight")
    sd.pop("model.encoder.embed_positions.weight")
    torch.save(sd, Path(save_path) / "pytorch_model.bin")
Example #21
class ParaPhrasing:
    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    @staticmethod
    def paraphrases(input_text, num_return_sequences=10, num_beams=10):
        if isinstance(input_text, str):
            input_text = [input_text]
        batch = ParaPhrasing.tokenizer.prepare_seq2seq_batch(input_text, truncation=True, padding='longest',
                                                             max_length=60).to(
            ParaPhrasing.torch_device)
        translated = ParaPhrasing.model.generate(**batch, max_length=60, num_beams=num_beams,
                                                 num_return_sequences=num_return_sequences, temperature=1.5)
        tgt_text = ParaPhrasing.tokenizer.batch_decode(translated, skip_special_tokens=True)
        return tgt_text
Example #22
def generate_summary(text):

    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    # Create tokenizer
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    # load pretrained model
    model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum")

    # convert into tokens (number representation of text)
    tokens = tokenizer(text,
                       truncation=True,
                       padding="longest",
                       return_tensors="pt")
    summary = model.generate(**tokens)
    # decode the generated ids, skipping special tokens such as </s>
    summarized = tokenizer.decode(summary[0], skip_special_tokens=True)
    return summarized
Example #23
    def test_pegasus_xsum_summary(self):
        model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
        tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

        src_text = [
            """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""",
            """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a  re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """,
        ]

        tgt_text = [
            "California's largest electricity provider has turned off power to hundreds of thousands of customers.",
            "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.",
        ]

        inputs = tokenizer(src_text, return_tensors="np", truncation=True, max_length=512, padding=True)
        translated_tokens = model.generate(**inputs, num_beams=2).sequences
        decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        assert tgt_text == decoded
Example #24
def main():
    pagesus_pretrain_path = './page_arciv/'
    tokenizer = PegasusTokenizer.from_pretrained(pagesus_pretrain_path)
    config_path = os.path.join(pagesus_pretrain_path, 'config.json')
    psus_config = PegasusConfig.from_json_file(config_path)
    MAX_LEN = 1024
    decode_max_len = 256
    data = load_data('./final_test_data_list.json')
    model = build_model(pagesus_pretrain_path, psus_config, MAX_LEN,
                        decode_max_len)
    model.load_weights('./pagesus_section/best_model.hdf5')
    autotitle = AutoTitle(start_id=tokenizer.pad_token_id,
                          end_id=tokenizer.eos_token_id,
                          maxlen=256,
                          max_decode_len=decode_max_len,
                          model=model)

    result = just_predict(autotitle, tokenizer, MAX_LEN, data)
    with open('./pred_result.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(result, ensure_ascii=False, cls=NpEncoder))
Example #25
def convert_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str):
    # save tokenizer first
    dataset = Path(ckpt_path).parent.name
    desired_max_model_length = task_specific_params[
        f"summarization_{dataset}"]["max_position_embeddings"]
    tok = PegasusTokenizer.from_pretrained(
        "sshleifer/pegasus", model_max_length=desired_max_model_length)
    assert tok.model_max_length == desired_max_model_length
    tok.save_pretrained(save_dir)

    # convert model
    tf_weights = get_tf_weights_as_numpy(ckpt_path)
    cfg_updates = task_specific_params[f"summarization_{dataset}"]
    if dataset == "large":
        cfg_updates["task_specific_params"] = task_specific_params
    torch_model = convert_pegasus(tf_weights, cfg_updates)
    torch_model.save_pretrained(save_dir)
    sd = torch_model.state_dict()
    sd.pop("model.decoder.embed_positions.weight")
    sd.pop("model.encoder.embed_positions.weight")
    torch.save(sd, Path(save_dir) / "pytorch_model.bin")
Example #26
def index(request):
    if request.method == 'POST':
        form = textForm(request.POST, request.FILES)
        if form.is_valid():
            _type = form.cleaned_data['_type']
            text = form.cleaned_data['text']
            percent = form.cleaned_data['percent']
            if (text == ""):
                file = request.FILES['file']
                text = ''
                for line in file:
                    text += line.decode()
            tokenized_sentence = sent_tokenize(text)
            if (_type == 'Extractive'):
                summary = summarize(tokenized_sentence, percent)
                return render(request, 'summary/summary.html', {
                    'text': text,
                    'summary': summary,
                    'percent': percent
                })
            elif (_type == 'Abstractive'):
                model_name = 'google/pegasus-xsum'
                torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
                tokenizer = PegasusTokenizer.from_pretrained(model_name)
                model = PegasusForConditionalGeneration.from_pretrained(
                    model_name).to(torch_device)
                batch = tokenizer.prepare_seq2seq_batch(
                    [text], truncation=True,
                    padding='longest').to(torch_device)
                translated = model.generate(**batch)
                summary = tokenizer.batch_decode(translated,
                                                 skip_special_tokens=True)
                return render(
                    request, 'summary/summary.html', {
                        'text': text,
                        'summary': summary[0],
                        'percent': "Not Applicable"
                    })
    return render(request, 'summary/index.html', {'form': textForm()})
Example #27
def main():
    pagesus_pretrain_path = './page_arciv/'
    tokenizer = PegasusTokenizer.from_pretrained(pagesus_pretrain_path)
    config_path = os.path.join(pagesus_pretrain_path, 'config.json')
    psus_config = PegasusConfig.from_json_file(config_path)
    MAX_LEN = 1920
    decode_max_len = 600
    batch_size = 2
    data = load_data(
        '/home_zyz/abstract_generate/final_abdata/union_add_noabs_cleaned_1920.json'
    )
    random.shuffle(data)
    print(len(data))
    print(data[0][0])
    print(data[0][1])
    valid_data = data[:5]
    train_data = data[5:]
    train_generator = data_generator(train_data, batch_size, MAX_LEN,
                                     decode_max_len, tokenizer)

    K.clear_session()
    strategy = tf.distribute.MirroredStrategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
    with strategy.scope():
        model = build_model(pagesus_pretrain_path, psus_config, MAX_LEN,
                            decode_max_len)

    epochs = 50
    autotitle = AutoTitle(start_id=tokenizer.pad_token_id,
                          end_id=tokenizer.eos_token_id,
                          maxlen=599,
                          max_decode_len=decode_max_len,
                          model=model)
    evaluator = Evaluator(tokenizer, MAX_LEN, autotitle, valid_data)
    model.fit(train_generator.forfit(),
              steps_per_epoch=len(train_generator) - 1,
              epochs=epochs,
              callbacks=[evaluator])
Example #28
def run_one_fig(spec, args, num_samples=300):
    print(f"--{spec}--")
    CUR_DIR = os.path.join(args.prob_meta_dir, spec)
    args.cur_dir = CUR_DIR
    files = os.listdir(CUR_DIR)
    random.shuffle(files)
    files = files[:num_samples]

    BOS_TOKEN = 0
    print(args.spec_name)
    if 'pegasus' in args.model_name:
        from transformers import PegasusTokenizer

        bpe_tokenizer = PegasusTokenizer.from_pretrained(args.model_name)
        EOS_TOK_IDs = [106, bpe_tokenizer.eos_token_id, 2]  # <n>
    elif 'gpt' in args.model_name:
        from transformers import GPT2Tokenizer

        bpe_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        EOS_TOK_IDs = [bpe_tokenizer.eos_token_id]
    elif 'bart' in args.model_name:
        from transformers import BartTokenizer

        bpe_tokenizer = BartTokenizer.from_pretrained(args.model_name)
        EOS_TOK_IDs = [bpe_tokenizer.eos_token_id]
    else:
        raise NotImplementedError
    # process_data_single(args, files[0], eos_token_ids=EOS_TOK_IDs)
    len_samples = len(files)
    cpu_cnt = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=cpu_cnt) as pool:
        results = pool.starmap(process_data_single, zip([args] * len_samples, files, [EOS_TOK_IDs] * len_samples))
    output = list(itertools.chain.from_iterable(results))
    print(f"Samples: {len(output)}")
    output = proceed_data(10, output)
    return output
Example #29
    def remove_example_from_description(text):
        if '## Example' in text:
            text = re.sub(r'## Example(.*)', '', text)
            text = re.sub(r"\`\`\`.*?\`\`\`", '', text, flags=re.DOTALL)
        return text

    for i, doc in enumerate(docs):
        markdown_without_example = remove_example_from_description(doc['markdown_description'])
        docs[i]['markdown_without_example'] = markdown_without_example
        # LOGGER.debug(markdown_without_example)

    # Generate 1 sentence summaries for the models
    if not args.quick_run:
        from transformers import PegasusTokenizer, PegasusForConditionalGeneration
        mname = "google/pegasus-large"
        model = PegasusForConditionalGeneration.from_pretrained(mname)
        tok = PegasusTokenizer.from_pretrained(mname)

        def summarise(text):
            batch = tok.prepare_seq2seq_batch(src_texts=[text])  # don't need tgt_text for inference
            gen = model.generate(**batch)
            return tok.batch_decode(gen, skip_special_tokens=True)[0]

        for i, doc in enumerate(docs):
            if 'short_description' not in docs[i].keys():
                short_description = summarise(doc['description'])
                docs[i]['short_description'] = short_description
                # LOGGER.debug(short_description)

    vi_client = ViClient(os.environ['VH_USERNAME'], os.environ['VH_API_KEY'])
    ids = vi_client.get_field_across_documents('_id', docs)
    if args.reset_collection:
Example #30
                                    data['pred_distributions'],
                                    data['logits'],
                                    data['input_doc'],
                                    BOS_TOKEN=bos_token_id,
                                    layer_num=lay_num)
        results += result
    result_in_arry = np.asarray(results)
    return result_in_arry.T


if __name__ == '__main__':
    print("Looking at  attention")
    if 'pegasus' in MODEL_NAME:
        from transformers import PegasusTokenizer

        bpe_tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
        EOS_TOK_IDs = [106, bpe_tokenizer.eos_token_id]  # <n>
        bos_token_id = 0
    else:
        raise NotImplementedError
    # visualize_distribution(None,None)
    files = os.listdir(CUR_DIR)
    random.shuffle(files)
    files = files[:20]

    if True:
        all_outputs = []
        for layer_num in range(16):
            print(f"Layer :{layer_num}")
            output_array = run_trial(layer_num, files)
            all_outputs.append(output_array)