def compute(sm):
    # Load the Pegasus model
    model_name = 'google/pegasus-xsum'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    sm_len = len(sm)
    sen_list = splitText(sm, sm_len)  # Get sections to be summarized
    try:
        batches = []
        for s in sen_list:  # Tokenize each section
            batch = tokenizer.prepare_seq2seq_batch(
                [s], truncation=True,
                padding='longest').to(torch_device)
            batches.append(batch)
    except Exception:  # Narrowed from a bare except, which would also swallow KeyboardInterrupt
        return ""
    temp = []
    for b in batches:  # Summary generation
        translated = model.generate(**b)
        temp.append(translated)
    final_summary = []
    for t in temp:  # Put together the summaries from the different sections
        final_summary.append(
            tokenizer.batch_decode(t, skip_special_tokens=True)[0])
    return final_summary
class ParaPhrasing: """Class loads pegasus model for text augmentation""" model_name = 'tuner007/pegasus_paraphrase' torch_device = 'cuda' if torch.cuda.is_available() else 'cpu' tokenizer = PegasusTokenizer.from_pretrained(model_name) model = PegasusForConditionalGeneration.from_pretrained(model_name).to( torch_device) @staticmethod def paraphrases(input_text, num_return_sequences=10, num_beams=10): """ generates variations for a given sentence/text :param input_text: sentence or text :param num_return_sequences: Number of variations to be returned :param num_beams: Number of beams for beam search. 1 means no beam search :return: list of variations of the input text """ if isinstance(input_text, str): input_text = [input_text] batch = ParaPhrasing.tokenizer.prepare_seq2seq_batch( input_text, truncation=True, padding='longest', max_length=60).to(ParaPhrasing.torch_device) translated = ParaPhrasing.model.generate( **batch, max_length=60, num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5) tgt_text = ParaPhrasing.tokenizer.batch_decode( translated, skip_special_tokens=True) return tgt_text
def exec(self, text):
    src_text = [text]
    model_name = self.model
    # Alternative Pegasus checkpoints:
    # model_name = 'google/pegasus-xsum'
    # model_name = 'google/pegasus-large'
    # model_name = 'google/pegasus-cnn_dailymail'
    # model_name = 'google/pegasus-pubmed'
    # model_name = 'google/pegasus-wikihow'
    # model_name = 'google/pegasus-newsroom'
    # model_name = 'google/pegasus-multi_news'
    # model_name = 'google/pegasus-reddit_tifu'
    # model_name = 'google/pegasus-arxiv'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(
        src_text, truncation=True, padding='longest').to(torch_device)
    result = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(result, skip_special_tokens=True)
    if self.model == "google/pegasus-cnn_dailymail":
        # This checkpoint emits '<n>' as a newline marker; replace it with spaces
        tgt_text[0] = re.sub('<n>', ' ', tgt_text[0])
    return tgt_text[0]
def __init__(self, model: str = None):
    log.info(model)
    torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.info(torch_device)
    if model is None:
        model = "t5"
    self.modelName = model
    # Path to all the files that will be used for inference
    self.path = f"./app/api/{model}/"
    self.model_path = self.path + "pytorch_model.bin"
    self.config_path = self.path + "config.json"
    # Select the correct model based on the passed model input; default is t5
    if model == "t5":
        self.config = T5Config.from_json_file(self.config_path)
        self.model = T5ForConditionalGeneration(self.config)
        self.tokenizer = T5Tokenizer.from_pretrained(self.path)
        self.model.eval()
        self.model.load_state_dict(
            torch.load(self.model_path, map_location=torch_device))
    elif model == "google/pegasus-newsroom":
        self.config = PegasusConfig.from_json_file(self.config_path)
        # self.model = PegasusForConditionalGeneration(self.config)
        # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
        self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = PegasusTokenizer.from_pretrained(model)
    elif model == "facebook/bart-large-cnn":
        self.config = BartConfig.from_json_file(self.config_path)
        # self.model = PegasusForConditionalGeneration(self.config)
        # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
        self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = BartTokenizer.from_pretrained(model)
    else:
        raise Exception("This model is not supported")
    self.text = str()
def get_model_tokenizer(model_name):
    import torch
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if "pegasus" in model_name:
        # It's a Pegasus model
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
    elif "bart-large" in model_name:
        # It's a BART model
        from transformers import BartTokenizer, BartForConditionalGeneration
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
    elif "bart-custom-large" in model_name:
        from transformers import BartTokenizer, BartForConditionalGeneration
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
    else:
        # T5 or DistilBART
        from transformers import AutoTokenizer, AutoModelWithLMHead
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelWithLMHead.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
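# Usage sketch for get_model_tokenizer above; the input text is invented.
# The helper returns (model, tokenizer), so generation is a two-step call:
model, tokenizer = get_model_tokenizer("google/pegasus-xsum")
inputs = tokenizer(["Long article text goes here."], truncation=True,
                   padding="longest", return_tensors="pt").to(model.device)
summary_ids = model.generate(**inputs)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])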
def test_generate_fp16(self):
    config, input_dict = self.model_tester.prepare_config_and_inputs()
    input_ids = input_dict["input_ids"]
    attention_mask = input_ids.ne(1).to(torch_device)
    model = PegasusForConditionalGeneration(config).eval().to(torch_device)
    if torch_device == "cuda":
        model.half()
    model.generate(input_ids, attention_mask=attention_mask)
    model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
def __init__(self, config):
    self.model_name = 'google/pegasus-reddit_tifu'
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"using device: {self.device}")
    self.tokenizer = PegasusTokenizer.from_pretrained(self.model_name,
                                                      force_download=True)
    self.model = PegasusForConditionalGeneration.from_pretrained(
        self.model_name, force_download=True).to(self.device)
def load_model(self):
    model = PegasusForConditionalGeneration.from_pretrained(
        os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'model'))
    tokenizer = PegasusTokenizer.from_pretrained(
        os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'tokenizer'))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    return model, tokenizer, device
def __init__(self, args, device):
    super().__init__(args, device)
    assert args.pretrained_model_name in self.PRETRAINED_MODEL_NAMES
    self.pretrained_model_name = args.pretrained_model_name
    logging.info(f'Loading Pegasus ({self.pretrained_model_name})')
    self.model = PegasusForConditionalGeneration.from_pretrained(
        self.pretrained_model_name).to(self.device)
    self.tokenizer: PegasusTokenizer = PegasusTokenizer.from_pretrained(
        self.pretrained_model_name)
def generate_summary(text, model_name):
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(
        text, truncation=True, padding="longest",
        return_tensors="pt").to(torch_device)
    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
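# Usage sketch for generate_summary above; the article text is invented and
# any public Pegasus checkpoint should work as model_name:
article = ("The city council voted on Tuesday to expand the cycling network, "
           "citing a sharp rise in bike commuting over the past few years.")
print(generate_summary(article, "google/pegasus-xsum"))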
def generate_summary(context):
    model_name = 'google/pegasus-xsum'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
    # Pass the actual text (the original passed the literal string 'context')
    # and use the valid padding strategy name 'max_length', not 'max-length'
    batch = tokenizer.prepare_seq2seq_batch(src_texts=[context],
                                            truncation=True,
                                            padding='max_length',
                                            return_tensors="pt")
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
def single_document_summarization(self, src_text):
    tokenizer = PegasusTokenizer.from_pretrained(self.model_name)
    # Use self.torch_device consistently; the bare name torch_device is
    # undefined in this scope
    model = PegasusForConditionalGeneration.from_pretrained(
        self.model_name).to(self.torch_device)
    batch = tokenizer(src_text, truncation=True, padding=True,
                      return_tensors='pt').to(self.torch_device)
    translated = model.generate(**batch)
    generated_summary = tokenizer.batch_decode(translated,
                                               skip_special_tokens=True)
    return generated_summary
def get_summary(text):
    try:
        model_name = 'google/pegasus-xsum'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
        # The original [""""""+text+""""""] just concatenated empty strings
        # around the input, which simplifies to [text]
        src_text = [text]
        batch = tokenizer(src_text, truncation=True, padding='longest',
                          return_tensors="pt").to(device)
        translated = model.generate(**batch)
        target = tokenizer.batch_decode(translated, skip_special_tokens=True)
    except Exception:
        print("API error occurred")
        return -100
    return target[0]
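# Usage sketch for get_summary above; note the unusual error contract:
# failures are signalled by the sentinel value -100 rather than an exception,
# so callers must check the return value:
result = get_summary("Some long passage that should be condensed.")
if result == -100:
    print("summarization failed")
else:
    print(result)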
def load_BART_or_PEGASUS(mname):
    if 'bart' in mname.lower():
        from transformers import BartTokenizer, BartForConditionalGeneration
        model = BartForConditionalGeneration.from_pretrained(mname)
        tokenizer = BartTokenizer.from_pretrained(mname)
    elif 'pegasus' in mname.lower():
        from transformers import PegasusTokenizer, PegasusForConditionalGeneration
        model = PegasusForConditionalGeneration.from_pretrained(mname)
        tokenizer = PegasusTokenizer.from_pretrained(mname)
    else:
        raise NotImplementedError("Unknown model name.")
    return model, tokenizer
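# Usage sketch for load_BART_or_PEGASUS above; dispatch is purely on the
# lower-cased checkpoint name, so these two calls hit different branches:
bart_model, bart_tokenizer = load_BART_or_PEGASUS("facebook/bart-large-cnn")
pegasus_model, pegasus_tokenizer = load_BART_or_PEGASUS("google/pegasus-xsum")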
def summarizeP(src_text, variant="xsum", device=None):
    model_name = "google/pegasus-" + variant
    torch_device = ('cuda' if torch.cuda.is_available() else
                    'cpu') if device is None else device
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True,
                                            padding='longest').to(torch_device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
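# Usage sketch for summarizeP above; the variant string is appended to
# "google/pegasus-", so "xsum", "large", "cnn_dailymail", etc. all resolve
# to real checkpoints:
summaries = summarizeP(["First document to summarize.",
                        "Second document to summarize."], variant="xsum")
print(summaries)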
def execute_pegasus_augmentation(data, file_path) -> pd.DataFrame:
    MODEL_NAME = var.PARAPHRASING_MODEL
    tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
    model = PegasusForConditionalGeneration.from_pretrained(MODEL_NAME).to(torch_device)
    train = data.copy()
    train = train[['summary', 'sentiment']]
    number_sequences = 10
    train['paraphrased text'] = train['summary'].progress_apply(
        get_response, num_return_sequences=number_sequences,
        tokenizer=tokenizer, model=model)
    generated = train.explode('paraphrased text')
    generated = generated.dropna()
    generated.to_csv('{}-Processed-Summarized-Augmented.csv'.format(file_path),
                     index=False)
    return generated
def generate_summary(text):
    # Create tokenizer
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    # Load pretrained model
    model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum")
    # Convert into tokens (number representation of the text)
    tokens = tokenizer(text, truncation=True, padding="longest",
                       return_tensors="pt")
    summary = model.generate(**tokens)
    # Summarized = wrapper.fill(tokenizer.decode(summary[0])).strip()
    Summarized = tokenizer.decode(summary[0])
    return Summarized
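# Usage sketch for generate_summary above; because decode() is called without
# skip_special_tokens=True, the output may contain markers such as </s>:
sample = ("An invented paragraph describing some event in enough detail "
          "for the model to compress it into a single sentence.")
print(generate_summary(sample))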
def main(sequence):
    # Pretrained model from https://huggingface.co/google/pegasus-cnn_dailymail
    tokenizer = PegasusTokenizerFast.from_pretrained(
        'google/pegasus-cnn_dailymail')
    model = PegasusForConditionalGeneration.from_pretrained(
        'google/pegasus-cnn_dailymail').to(DEVICE)
    model.eval()
    inputs = tokenizer.encode(sequence, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(inputs)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Input:")
    print(sequence)
    print("--------------------------------")
    print("Output:")
    print(summary)
def convert_pegasus_to_bart(
        tf_weights: dict, cfg_updates: dict) -> PegasusForConditionalGeneration:
    cfg_kwargs = DEFAULTS.copy()
    cfg_kwargs.update(cfg_updates)
    # Build the config from the merged kwargs; the original passed cfg_updates
    # here, which silently discarded DEFAULTS
    cfg = PegasusConfig(**cfg_kwargs)
    bart = PegasusForConditionalGeneration(cfg)
    sd = bart.model.state_dict()
    mapping = {}
    for k, v in tf_weights.items():
        new_k = rename_state_dict_key(k)
        if new_k not in sd:
            raise ValueError(
                f"could not find new key {new_k} in state dict. (converted from {k})"
            )
        if "dense" in k or "proj" in new_k:
            v = v.T
        mapping[new_k] = torch.tensor(v, dtype=sd[new_k].dtype)
        assert v.shape == sd[new_k].shape, f"{new_k}, {k}, {v.shape}, {sd[new_k].shape}"
    # make sure embedding.padding_idx is respected
    mapping["shared.weight"][cfg.pad_token_id] = torch.zeros_like(
        mapping["shared.weight"][cfg.pad_token_id + 1])
    mapping["encoder.embed_tokens.weight"] = mapping["shared.weight"]
    mapping["decoder.embed_tokens.weight"] = mapping["shared.weight"]
    empty_biases = {
        k: torch.zeros_like(v)
        for k, v in sd.items() if k.endswith("bias") and k not in mapping
    }
    mapping.update(**empty_biases)
    missing, extra = bart.model.load_state_dict(mapping, strict=False)
    unexpected_missing = [
        k for k in missing
        if k not in ["encoder.embed_positions.weight", "decoder.embed_positions.weight"]
    ]
    assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}"
    assert extra == [], f"no matches found for the following tf keys {extra}"
    return bart
def index(request):
    if request.method == 'POST':
        form = textForm(request.POST, request.FILES)
        if form.is_valid():
            _type = form.cleaned_data['_type']
            text = form.cleaned_data['text']
            percent = form.cleaned_data['percent']
            if text == "":
                file = request.FILES['file']
                text = ''
                for line in file:
                    text += line.decode()
            tokenized_sentence = sent_tokenize(text)
            if _type == 'Extractive':
                summary = summarize(tokenized_sentence, percent)
                return render(request, 'summary/summary.html', {
                    'text': text,
                    'summary': summary,
                    'percent': percent
                })
            elif _type == 'Abstractive':
                model_name = 'google/pegasus-xsum'
                # Fall back to CPU instead of assuming a GPU is present
                torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
                tokenizer = PegasusTokenizer.from_pretrained(model_name)
                model = PegasusForConditionalGeneration.from_pretrained(
                    model_name).to(torch_device)
                batch = tokenizer.prepare_seq2seq_batch(
                    [text], truncation=True,
                    padding='longest').to(torch_device)
                translated = model.generate(**batch)
                summary = tokenizer.batch_decode(translated,
                                                 skip_special_tokens=True)
                return render(
                    request, 'summary/summary.html', {
                        'text': text,
                        'summary': summary[0],
                        'percent': "Not Applicable"
                    })
    return render(request, 'summary/index.html', {'form': textForm()})
def main(input_dir_path, output_dir_path, model_name_or_dir):
    tokenizer = PegasusTokenizerFast.from_pretrained(model_name_or_dir)
    model = PegasusForConditionalGeneration.from_pretrained(
        model_name_or_dir).to(DEVICE)
    model.eval()
    os.makedirs(output_dir_path, exist_ok=True)
    for file_name in os.listdir(input_dir_path):
        if file_name.endswith('.json'):
            count = 0
            input_file_path = os.path.join(input_dir_path, file_name)
            with open(input_file_path) as json_file:
                data = json.load(json_file)
            for session in data['sessions']:
                for speech in session['speeches']:
                    content = []
                    for text in speech['content']:
                        inputs = tokenizer.encode(
                            text, return_tensors="pt").to(DEVICE)
                        with torch.no_grad():
                            outputs = model.generate(inputs)
                        summary = tokenizer.decode(
                            outputs[0], skip_special_tokens=True)
                        content.append({'text': text, 'summary': summary})
                        count += 1
                    speech['content'] = content
            output_file_path = os.path.join(output_dir_path, file_name)
            with open(output_file_path, 'w') as json_file:
                json.dump(data, json_file)
            print("File: {}, Count: {}".format(file_name, count))
def remove_example_from_description(text):
    # Header reconstructed: the snippet began mid-function, and this helper is
    # called below on doc['markdown_description']
    text = re.sub(r'## Example(.*?)##', '##', text, flags=re.DOTALL)
    if '## Example' in text:
        text = re.sub(r'## Example(.*)', '', text)
    text = re.sub(r"```.*?```", '', text, flags=re.DOTALL)
    return text

for i, doc in enumerate(docs):
    markdown_without_example = remove_example_from_description(doc['markdown_description'])
    docs[i]['markdown_without_example'] = markdown_without_example
    # LOGGER.debug(markdown_without_example)

# Generate 1-sentence summaries for the models
if not args.quick_run:
    from transformers import PegasusTokenizer, PegasusForConditionalGeneration
    mname = "google/pegasus-large"
    model = PegasusForConditionalGeneration.from_pretrained(mname)
    tok = PegasusTokenizer.from_pretrained(mname)

    def summarise(text):
        batch = tok.prepare_seq2seq_batch(src_texts=[text])  # don't need tgt_text for inference
        gen = model.generate(**batch)
        return tok.batch_decode(gen, skip_special_tokens=True)[0]

    for i, doc in enumerate(docs):
        if 'short_description' not in docs[i].keys():
            short_description = summarise(doc['description'])
            docs[i]['short_description'] = short_description
            # LOGGER.debug(short_description)

vi_client = ViClient(os.environ['VH_USERNAME'], os.environ['VH_API_KEY'])
ids = vi_client.get_field_across_documents('_id', docs)
def load_pegasus_model():
    pegasus_model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum")
    return pegasus_model
""" # Commented out IPython magic to ensure Python compatibility. !git clone https://github.com/google-research/pegasus # %cd pegasus !export PYTHONPATH=. !pip3 install -r requirements.txt !pip install transformers==3.5.0 import torch from transformers import PegasusForConditionalGeneration, PegasusTokenizer model_name = 'tuner007/pegasus_paraphrase' torch_device = 'cuda' if torch.cuda.is_available() else 'cpu' tokenizer = PegasusTokenizer.from_pretrained(model_name) model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device) def get_response(input_text,num_return_sequences,num_beams): batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60).to(torch_device) translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5) tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) return tgt_text import pandas as pd import nltk nltk.download('cmudict') nltk.download('wordnet') """Next, we import the procrustean alliteration paraphraser""" class RuleBoundsInterface: """This interface is used to define different properties of a rhetorical figure generating algorithm.
def __init__(self):
    # Use pegasus-large on an actual PC and the lighter xsum checkpoint for cloud
    self.sum_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
    self.sum_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
def create_long_model(save_model_to, base_model, tokenizer_name_or_path,
                      attention_window, max_pos):
    model = PegasusForConditionalGeneration.from_pretrained(base_model)
    tokenizer = PegasusTokenizer.from_pretrained(tokenizer_name_or_path,
                                                 model_max_length=max_pos)
    config = LongformerPegasusConfig.from_pretrained(base_model)
    model.config = config

    # in Pegasus attention_probs_dropout_prob is attention_dropout, but LongformerSelfAttention
    # expects attention_probs_dropout_prob, so set it here
    config.attention_probs_dropout_prob = config.attention_dropout
    config.architectures = [
        "LongformerPegasusForConditionalGeneration",
    ]

    N = 0

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs["model_max_length"] = max_pos
    current_max_pos, embed_size = model.model.encoder.embed_positions.weight.shape
    print(max_pos, current_max_pos, embed_size, config.max_position_embeddings)
    assert current_max_pos == config.max_position_embeddings + N

    config.max_encoder_position_embeddings = max_pos
    config.max_decoder_position_embeddings = config.max_position_embeddings
    del config.max_position_embeddings
    max_pos += N  # NOTE: Pegasus has positions 0,1 reserved, so embedding size is max position + N
    assert max_pos >= current_max_pos

    # allocate a larger position embedding matrix for the encoder
    new_encoder_pos_embed = model.model.encoder.embed_positions.weight.new_empty(
        max_pos, embed_size)
    k = N
    step = current_max_pos - N
    while k < max_pos - 1:
        new_encoder_pos_embed[k:(
            k + step)] = model.model.encoder.embed_positions.weight[N:]
        k += step
    model.model.encoder.embed_positions.weight.data = new_encoder_pos_embed

    config.attention_window = [attention_window] * config.num_hidden_layers
    config.attention_dilation = [1] * config.num_hidden_layers

    # replace each encoder self-attention module with Longformer attention,
    # reusing the pretrained q/k/v projections for both local and global attention
    for i, layer in enumerate(model.model.encoder.layers):
        longformer_self_attn_for_pegasus = LongformerSelfAttentionForPegasus(
            config, layer_id=i)
        longformer_self_attn_for_pegasus.longformer_self_attn.query = (
            layer.self_attn.q_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.key = (
            layer.self_attn.k_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.value = (
            layer.self_attn.v_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.query_global = (
            layer.self_attn.q_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.key_global = (
            layer.self_attn.k_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.value_global = (
            layer.self_attn.v_proj)
        longformer_self_attn_for_pegasus.output = layer.self_attn.out_proj
        layer.self_attn = longformer_self_attn_for_pegasus

    print(f"saving model to {save_model_to}")
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
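# Usage sketch for create_long_model above; the window size and target length
# are illustrative values, and LongformerPegasusConfig /
# LongformerSelfAttentionForPegasus must be importable in this module:
long_model, long_tokenizer = create_long_model(
    save_model_to="./pegasus-large-4096",
    base_model="google/pegasus-large",
    tokenizer_name_or_path="google/pegasus-large",
    attention_window=512,
    max_pos=4096)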
import torch
from transformers import AutoConfig
# These imports are implied by the code below but were missing from the snippet
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from flask import Flask
from flask_ngrok import run_with_ngrok
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from string import punctuation
import re
from nltk.corpus import stopwords

# Load tokenizer and model
tok = PegasusTokenizer.from_pretrained(
    'C:/Users/Ramana/Desktop/pegasus/nirvana-test/SMA/models')
config = AutoConfig.from_pretrained(
    'C:/Users/Ramana/Desktop/pegasus/nirvana-test/SMA/models')
dummy_model = PegasusForConditionalGeneration(config)

# Rebuild the dynamically quantized model, then load its saved weights
quantized_model = torch.quantization.quantize_dynamic(dummy_model,
                                                      {torch.nn.Linear},
                                                      dtype=torch.qint8)
quantized_state_dict = torch.load(
    'C:/Users/Ramana/Desktop/pegasus/nirvana-test/SMA/models/pegasus-quantized.h5'
)
quantized_model.load_state_dict(quantized_state_dict)

app = Flask(__name__)
run_with_ngrok(app)


@app.route('/')
def __init__(self):
    self.model_name = 'google/pegasus-multi_news'
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.tokenizer = PegasusTokenizer.from_pretrained(self.model_name)
    self.model = PegasusForConditionalGeneration.from_pretrained(
        self.model_name).to(self.device)
def __init__(self):
    self.model_name = 'tuner007/pegasus_paraphrase'
    self.pegasus_tokenizer = PegasusTokenizer.from_pretrained(
        self.model_name)
    self.pegasus_model = PegasusForConditionalGeneration.from_pretrained(self.model_name) \
        .to(ReflectiveListening.torch_device)