def compute(sm):
    # Load the Pegasus model
    model_name = 'google/pegasus-xsum'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    sm_len = len(sm)
    sen_list = splitText(sm, sm_len)  # Get sections to be summarized
    try:
        batches = []
        for s in sen_list:  # Tokenize each section
            batch = tokenizer.prepare_seq2seq_batch(
                [s], truncation=True,
                padding='longest').to(torch_device)
            batches.append(batch)
    except Exception:  # Narrowed from a bare except, which would also swallow KeyboardInterrupt
        return ""
    temp = []
    for b in batches:  # Summary generation
        translated = model.generate(**b)
        temp.append(translated)
    final_summary = []
    for t in temp:  # Put together the summaries from the different sections
        final_summary.append(
            tokenizer.batch_decode(t, skip_special_tokens=True)[0])
    return final_summary
class ParaPhrasing: """Class loads pegasus model for text augmentation""" model_name = 'tuner007/pegasus_paraphrase' torch_device = 'cuda' if torch.cuda.is_available() else 'cpu' tokenizer = PegasusTokenizer.from_pretrained(model_name) model = PegasusForConditionalGeneration.from_pretrained(model_name).to( torch_device) @staticmethod def paraphrases(input_text, num_return_sequences=10, num_beams=10): """ generates variations for a given sentence/text :param input_text: sentence or text :param num_return_sequences: Number of variations to be returned :param num_beams: Number of beams for beam search. 1 means no beam search :return: list of variations of the input text """ if isinstance(input_text, str): input_text = [input_text] batch = ParaPhrasing.tokenizer.prepare_seq2seq_batch( input_text, truncation=True, padding='longest', max_length=60).to(ParaPhrasing.torch_device) translated = ParaPhrasing.model.generate( **batch, max_length=60, num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5) tgt_text = ParaPhrasing.tokenizer.batch_decode( translated, skip_special_tokens=True) return tgt_text
def exec(self, text):
    src_text = [text]
    model_name = self.model
    # Alternative Pegasus checkpoints:
    # model_name = 'google/pegasus-xsum'
    # model_name = 'google/pegasus-large'
    # model_name = 'google/pegasus-cnn_dailymail'
    # model_name = 'google/pegasus-pubmed'
    # model_name = 'google/pegasus-wikihow'
    # model_name = 'google/pegasus-newsroom'
    # model_name = 'google/pegasus-multi_news'
    # model_name = 'google/pegasus-reddit_tifu'
    # model_name = 'google/pegasus-arxiv'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(
        src_text, truncation=True, padding='longest').to(torch_device)
    result = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(result, skip_special_tokens=True)
    if self.model == "google/pegasus-cnn_dailymail":
        # This checkpoint emits '<n>' as a newline marker; replace it with spaces
        tgt_text[0] = re.sub('<n>', ' ', tgt_text[0])
    return tgt_text[0]
def __init__(self, model: str = None):
    log.info(model)
    torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.info(torch_device)
    if model is None:
        model = "t5"
    self.modelName = model
    # Path to all the files that will be used for inference
    self.path = f"./app/api/{model}/"
    self.model_path = self.path + "pytorch_model.bin"
    self.config_path = self.path + "config.json"
    # Select the correct model based on the passed model input; default is t5
    if model == "t5":
        self.config = T5Config.from_json_file(self.config_path)
        self.model = T5ForConditionalGeneration(self.config)
        self.tokenizer = T5Tokenizer.from_pretrained(self.path)
        self.model.eval()
        self.model.load_state_dict(
            torch.load(self.model_path, map_location=torch_device))
    elif model == "google/pegasus-newsroom":
        self.config = PegasusConfig.from_json_file(self.config_path)
        # self.model = PegasusForConditionalGeneration(self.config)
        # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
        self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = PegasusTokenizer.from_pretrained(model)
    elif model == "facebook/bart-large-cnn":
        self.config = BartConfig.from_json_file(self.config_path)
        # self.model = PegasusForConditionalGeneration(self.config)
        # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
        self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = BartTokenizer.from_pretrained(model)
    else:
        raise Exception("This model is not supported")
    self.text = str()
def get_model_tokenizer(model_name):
    import torch
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if "pegasus" in model_name:
        # It's a Pegasus model
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
    elif "bart-large" in model_name:
        # It's a BART model
        from transformers import BartTokenizer, BartForConditionalGeneration
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
    elif "bart-custom-large" in model_name:
        from transformers import BartTokenizer, BartForConditionalGeneration
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
    else:
        # T5 or DistilBART
        from transformers import AutoTokenizer, AutoModelWithLMHead
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelWithLMHead.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
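# Usage sketch for get_model_tokenizer above; the input text is invented.
# The helper returns (model, tokenizer), so generation is a two-step call:
model, tokenizer = get_model_tokenizer("google/pegasus-xsum")
inputs = tokenizer(["Long article text goes here."], truncation=True,
                   padding="longest", return_tensors="pt").to(model.device)
summary_ids = model.generate(**inputs)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])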
def test_generate_fp16(self):
    config, input_dict = self.model_tester.prepare_config_and_inputs()
    input_ids = input_dict["input_ids"]
    attention_mask = input_ids.ne(1).to(torch_device)
    model = PegasusForConditionalGeneration(config).eval().to(torch_device)
    if torch_device == "cuda":
        model.half()
    model.generate(input_ids, attention_mask=attention_mask)
    model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
def __init__(self, config):
    self.model_name = 'google/pegasus-reddit_tifu'
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"using device: {self.device}")
    self.tokenizer = PegasusTokenizer.from_pretrained(self.model_name,
                                                      force_download=True)
    self.model = PegasusForConditionalGeneration.from_pretrained(
        self.model_name, force_download=True).to(self.device)
def load_model(self):
    model = PegasusForConditionalGeneration.from_pretrained(
        os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'model'))
    tokenizer = PegasusTokenizer.from_pretrained(
        os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'tokenizer'))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    return model, tokenizer, device
def __init__(self, args, device):
    super().__init__(args, device)
    assert args.pretrained_model_name in self.PRETRAINED_MODEL_NAMES
    self.pretrained_model_name = args.pretrained_model_name
    logging.info(f'Loading Pegasus ({self.pretrained_model_name})')
    self.model = PegasusForConditionalGeneration.from_pretrained(
        self.pretrained_model_name).to(self.device)
    self.tokenizer: PegasusTokenizer = PegasusTokenizer.from_pretrained(
        self.pretrained_model_name)
def generate_summary(text, model_name):
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(
        text, truncation=True, padding="longest",
        return_tensors="pt").to(torch_device)
    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
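# Usage sketch for generate_summary above; the article text is invented and
# any public Pegasus checkpoint should work as model_name:
article = ("The city council voted on Tuesday to expand the cycling network, "
           "citing a sharp rise in bike commuting over the past few years.")
print(generate_summary(article, "google/pegasus-xsum"))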
def generate_summary(context):
    model_name = 'google/pegasus-xsum'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
    # Pass the actual text (the original passed the literal string 'context')
    # and use the valid padding strategy name 'max_length', not 'max-length'
    batch = tokenizer.prepare_seq2seq_batch(src_texts=[context],
                                            truncation=True,
                                            padding='max_length',
                                            return_tensors="pt")
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
def single_document_summarization(self, src_text):
    tokenizer = PegasusTokenizer.from_pretrained(self.model_name)
    # Use self.torch_device consistently; the bare name torch_device is
    # undefined in this scope
    model = PegasusForConditionalGeneration.from_pretrained(
        self.model_name).to(self.torch_device)
    batch = tokenizer(src_text, truncation=True, padding=True,
                      return_tensors='pt').to(self.torch_device)
    translated = model.generate(**batch)
    generated_summary = tokenizer.batch_decode(translated,
                                               skip_special_tokens=True)
    return generated_summary
def get_summary(text):
    try:
        model_name = 'google/pegasus-xsum'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
        # The original [""""""+text+""""""] just concatenated empty strings
        # around the input, which simplifies to [text]
        src_text = [text]
        batch = tokenizer(src_text, truncation=True, padding='longest',
                          return_tensors="pt").to(device)
        translated = model.generate(**batch)
        target = tokenizer.batch_decode(translated, skip_special_tokens=True)
    except Exception:
        print("API error occurred")
        return -100
    return target[0]
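# Usage sketch for get_summary above; note the unusual error contract:
# failures are signalled by the sentinel value -100 rather than an exception,
# so callers must check the return value:
result = get_summary("Some long passage that should be condensed.")
if result == -100:
    print("summarization failed")
else:
    print(result)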
def load_BART_or_PEGASUS(mname):
    if 'bart' in mname.lower():
        from transformers import BartTokenizer, BartForConditionalGeneration
        model = BartForConditionalGeneration.from_pretrained(mname)
        tokenizer = BartTokenizer.from_pretrained(mname)
    elif 'pegasus' in mname.lower():
        from transformers import PegasusTokenizer, PegasusForConditionalGeneration
        model = PegasusForConditionalGeneration.from_pretrained(mname)
        tokenizer = PegasusTokenizer.from_pretrained(mname)
    else:
        raise NotImplementedError("Unknown model name.")
    return model, tokenizer
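# Usage sketch for load_BART_or_PEGASUS above; dispatch is purely on the
# lower-cased checkpoint name, so these two calls hit different branches:
bart_model, bart_tokenizer = load_BART_or_PEGASUS("facebook/bart-large-cnn")
pegasus_model, pegasus_tokenizer = load_BART_or_PEGASUS("google/pegasus-xsum")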
def summarizeP(src_text, variant="xsum", device=None):
    model_name = "google/pegasus-" + variant
    torch_device = ('cuda' if torch.cuda.is_available() else
                    'cpu') if device is None else device
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True,
                                            padding='longest').to(torch_device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
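# Usage sketch for summarizeP above; the variant string is appended to
# "google/pegasus-", so "xsum", "large", "cnn_dailymail", etc. all resolve
# to real checkpoints:
summaries = summarizeP(["First document to summarize.",
                        "Second document to summarize."], variant="xsum")
print(summaries)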
def execute_pegasus_augmentation(data, file_path) -> pd.DataFrame:
    MODEL_NAME = var.PARAPHRASING_MODEL
    tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
    model = PegasusForConditionalGeneration.from_pretrained(MODEL_NAME).to(torch_device)
    train = data.copy()
    train = train[['summary', 'sentiment']]
    number_sequences = 10
    train['paraphrased text'] = train['summary'].progress_apply(
        get_response, num_return_sequences=number_sequences,
        tokenizer=tokenizer, model=model)
    generated = train.explode('paraphrased text')
    generated = generated.dropna()
    generated.to_csv('{}-Processed-Summarized-Augmented.csv'.format(file_path),
                     index=False)
    return generated
def generate_summary(text):
    # Create tokenizer
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    # Load pretrained model
    model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum")
    # Convert into tokens (number representation of the text)
    tokens = tokenizer(text, truncation=True, padding="longest",
                       return_tensors="pt")
    summary = model.generate(**tokens)
    # Summarized = wrapper.fill(tokenizer.decode(summary[0])).strip()
    Summarized = tokenizer.decode(summary[0])
    return Summarized
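# Usage sketch for generate_summary above; because decode() is called without
# skip_special_tokens=True, the output may contain markers such as </s>:
sample = ("An invented paragraph describing some event in enough detail "
          "for the model to compress it into a single sentence.")
print(generate_summary(sample))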
def main(sequence):
    # Pretrained model from https://huggingface.co/google/pegasus-cnn_dailymail
    tokenizer = PegasusTokenizerFast.from_pretrained(
        'google/pegasus-cnn_dailymail')
    model = PegasusForConditionalGeneration.from_pretrained(
        'google/pegasus-cnn_dailymail').to(DEVICE)
    model.eval()
    inputs = tokenizer.encode(sequence, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(inputs)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Input:")
    print(sequence)
    print("--------------------------------")
    print("Output:")
    print(summary)
def convert_pegasus_to_bart(
        tf_weights: dict, cfg_updates: dict) -> PegasusForConditionalGeneration:
    cfg_kwargs = DEFAULTS.copy()
    cfg_kwargs.update(cfg_updates)
    # Build the config from the merged kwargs; the original passed cfg_updates
    # here, which silently discarded DEFAULTS
    cfg = PegasusConfig(**cfg_kwargs)
    bart = PegasusForConditionalGeneration(cfg)
    sd = bart.model.state_dict()
    mapping = {}
    for k, v in tf_weights.items():
        new_k = rename_state_dict_key(k)
        if new_k not in sd:
            raise ValueError(
                f"could not find new key {new_k} in state dict. (converted from {k})"
            )
        if "dense" in k or "proj" in new_k:
            v = v.T
        mapping[new_k] = torch.tensor(v, dtype=sd[new_k].dtype)
        assert v.shape == sd[new_k].shape, f"{new_k}, {k}, {v.shape}, {sd[new_k].shape}"
    # make sure embedding.padding_idx is respected
    mapping["shared.weight"][cfg.pad_token_id] = torch.zeros_like(
        mapping["shared.weight"][cfg.pad_token_id + 1])
    mapping["encoder.embed_tokens.weight"] = mapping["shared.weight"]
    mapping["decoder.embed_tokens.weight"] = mapping["shared.weight"]
    empty_biases = {
        k: torch.zeros_like(v)
        for k, v in sd.items() if k.endswith("bias") and k not in mapping
    }
    mapping.update(**empty_biases)
    missing, extra = bart.model.load_state_dict(mapping, strict=False)
    unexpected_missing = [
        k for k in missing
        if k not in ["encoder.embed_positions.weight", "decoder.embed_positions.weight"]
    ]
    assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}"
    assert extra == [], f"no matches found for the following tf keys {extra}"
    return bart
def index(request):
    if request.method == 'POST':
        form = textForm(request.POST, request.FILES)
        if form.is_valid():
            _type = form.cleaned_data['_type']
            text = form.cleaned_data['text']
            percent = form.cleaned_data['percent']
            if text == "":
                file = request.FILES['file']
                text = ''
                for line in file:
                    text += line.decode()
            tokenized_sentence = sent_tokenize(text)
            if _type == 'Extractive':
                summary = summarize(tokenized_sentence, percent)
                return render(request, 'summary/summary.html', {
                    'text': text,
                    'summary': summary,
                    'percent': percent
                })
            elif _type == 'Abstractive':
                model_name = 'google/pegasus-xsum'
                # Fall back to CPU instead of assuming a GPU is present
                torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
                tokenizer = PegasusTokenizer.from_pretrained(model_name)
                model = PegasusForConditionalGeneration.from_pretrained(
                    model_name).to(torch_device)
                batch = tokenizer.prepare_seq2seq_batch(
                    [text], truncation=True,
                    padding='longest').to(torch_device)
                translated = model.generate(**batch)
                summary = tokenizer.batch_decode(translated,
                                                 skip_special_tokens=True)
                return render(
                    request, 'summary/summary.html', {
                        'text': text,
                        'summary': summary[0],
                        'percent': "Not Applicable"
                    })
    return render(request, 'summary/index.html', {'form': textForm()})
def main(input_dir_path, output_dir_path, model_name_or_dir):
    tokenizer = PegasusTokenizerFast.from_pretrained(model_name_or_dir)
    model = PegasusForConditionalGeneration.from_pretrained(
        model_name_or_dir).to(DEVICE)
    model.eval()
    os.makedirs(output_dir_path, exist_ok=True)
    for file_name in os.listdir(input_dir_path):
        if file_name.endswith('.json'):
            count = 0
            input_file_path = os.path.join(input_dir_path, file_name)
            with open(input_file_path) as json_file:
                data = json.load(json_file)
            for session in data['sessions']:
                for speech in session['speeches']:
                    content = []
                    for text in speech['content']:
                        inputs = tokenizer.encode(
                            text, return_tensors="pt").to(DEVICE)
                        with torch.no_grad():
                            outputs = model.generate(inputs)
                        summary = tokenizer.decode(
                            outputs[0], skip_special_tokens=True)
                        content.append({'text': text, 'summary': summary})
                        count += 1
                    speech['content'] = content
            output_file_path = os.path.join(output_dir_path, file_name)
            with open(output_file_path, 'w') as json_file:
                json.dump(data, json_file)
            print("File: {}, Count: {}".format(file_name, count))
def remove_example_from_description(text):
    # Header reconstructed: the snippet began mid-function, and this helper is
    # called below on doc['markdown_description']
    text = re.sub(r'## Example(.*?)##', '##', text, flags=re.DOTALL)
    if '## Example' in text:
        text = re.sub(r'## Example(.*)', '', text)
    text = re.sub(r"```.*?```", '', text, flags=re.DOTALL)
    return text

for i, doc in enumerate(docs):
    markdown_without_example = remove_example_from_description(doc['markdown_description'])
    docs[i]['markdown_without_example'] = markdown_without_example
    # LOGGER.debug(markdown_without_example)

# Generate 1-sentence summaries for the models
if not args.quick_run:
    from transformers import PegasusTokenizer, PegasusForConditionalGeneration
    mname = "google/pegasus-large"
    model = PegasusForConditionalGeneration.from_pretrained(mname)
    tok = PegasusTokenizer.from_pretrained(mname)

    def summarise(text):
        batch = tok.prepare_seq2seq_batch(src_texts=[text])  # don't need tgt_text for inference
        gen = model.generate(**batch)
        return tok.batch_decode(gen, skip_special_tokens=True)[0]

    for i, doc in enumerate(docs):
        if 'short_description' not in docs[i].keys():
            short_description = summarise(doc['description'])
            docs[i]['short_description'] = short_description
            # LOGGER.debug(short_description)

vi_client = ViClient(os.environ['VH_USERNAME'], os.environ['VH_API_KEY'])
ids = vi_client.get_field_across_documents('_id', docs)
def load_pegasus_model():
    pegasus_model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum")
    return pegasus_model
""" # Commented out IPython magic to ensure Python compatibility. !git clone https://github.com/google-research/pegasus # %cd pegasus !export PYTHONPATH=. !pip3 install -r requirements.txt !pip install transformers==3.5.0 import torch from transformers import PegasusForConditionalGeneration, PegasusTokenizer model_name = 'tuner007/pegasus_paraphrase' torch_device = 'cuda' if torch.cuda.is_available() else 'cpu' tokenizer = PegasusTokenizer.from_pretrained(model_name) model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device) def get_response(input_text,num_return_sequences,num_beams): batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60).to(torch_device) translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5) tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) return tgt_text import pandas as pd import nltk nltk.download('cmudict') nltk.download('wordnet') """Next, we import the procrustean alliteration paraphraser""" class RuleBoundsInterface: """This interface is used to define different properties of a rhetorical figure generating algorithm.
def __init__(self):
    # Use pegasus-large on an actual PC and the lighter xsum checkpoint for cloud
    self.sum_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
    self.sum_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
def create_long_model(save_model_to, base_model, tokenizer_name_or_path,
                      attention_window, max_pos):
    model = PegasusForConditionalGeneration.from_pretrained(base_model)
    tokenizer = PegasusTokenizer.from_pretrained(tokenizer_name_or_path,
                                                 model_max_length=max_pos)
    config = LongformerPegasusConfig.from_pretrained(base_model)
    model.config = config

    # in Pegasus attention_probs_dropout_prob is attention_dropout, but LongformerSelfAttention
    # expects attention_probs_dropout_prob, so set it here
    config.attention_probs_dropout_prob = config.attention_dropout
    config.architectures = [
        "LongformerPegasusForConditionalGeneration",
    ]

    N = 0

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs["model_max_length"] = max_pos
    current_max_pos, embed_size = model.model.encoder.embed_positions.weight.shape
    print(max_pos, current_max_pos, embed_size, config.max_position_embeddings)
    assert current_max_pos == config.max_position_embeddings + N

    config.max_encoder_position_embeddings = max_pos
    config.max_decoder_position_embeddings = config.max_position_embeddings
    del config.max_position_embeddings
    max_pos += N  # NOTE: Pegasus has positions 0,1 reserved, so embedding size is max position + N
    assert max_pos >= current_max_pos

    # allocate a larger position embedding matrix for the encoder
    new_encoder_pos_embed = model.model.encoder.embed_positions.weight.new_empty(
        max_pos, embed_size)
    k = N
    step = current_max_pos - N
    while k < max_pos - 1:
        new_encoder_pos_embed[k:(
            k + step)] = model.model.encoder.embed_positions.weight[N:]
        k += step
    model.model.encoder.embed_positions.weight.data = new_encoder_pos_embed

    config.attention_window = [attention_window] * config.num_hidden_layers
    config.attention_dilation = [1] * config.num_hidden_layers

    # replace each encoder self-attention module with Longformer attention,
    # reusing the pretrained q/k/v projections for both local and global attention
    for i, layer in enumerate(model.model.encoder.layers):
        longformer_self_attn_for_pegasus = LongformerSelfAttentionForPegasus(
            config, layer_id=i)
        longformer_self_attn_for_pegasus.longformer_self_attn.query = (
            layer.self_attn.q_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.key = (
            layer.self_attn.k_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.value = (
            layer.self_attn.v_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.query_global = (
            layer.self_attn.q_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.key_global = (
            layer.self_attn.k_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.value_global = (
            layer.self_attn.v_proj)
        longformer_self_attn_for_pegasus.output = layer.self_attn.out_proj
        layer.self_attn = longformer_self_attn_for_pegasus

    print(f"saving model to {save_model_to}")
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
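# Usage sketch for create_long_model above; the window size and target length
# are illustrative values, and LongformerPegasusConfig /
# LongformerSelfAttentionForPegasus must be importable in this module:
long_model, long_tokenizer = create_long_model(
    save_model_to="./pegasus-large-4096",
    base_model="google/pegasus-large",
    tokenizer_name_or_path="google/pegasus-large",
    attention_window=512,
    max_pos=4096)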
import torch
from transformers import AutoConfig
# These imports are implied by the code below but were missing from the snippet
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from flask import Flask
from flask_ngrok import run_with_ngrok
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from string import punctuation
import re
from nltk.corpus import stopwords

# Load tokenizer and model
tok = PegasusTokenizer.from_pretrained(
    'C:/Users/Ramana/Desktop/pegasus/nirvana-test/SMA/models')
config = AutoConfig.from_pretrained(
    'C:/Users/Ramana/Desktop/pegasus/nirvana-test/SMA/models')
dummy_model = PegasusForConditionalGeneration(config)

# Rebuild the dynamically quantized model, then load its saved weights
quantized_model = torch.quantization.quantize_dynamic(dummy_model,
                                                      {torch.nn.Linear},
                                                      dtype=torch.qint8)
quantized_state_dict = torch.load(
    'C:/Users/Ramana/Desktop/pegasus/nirvana-test/SMA/models/pegasus-quantized.h5'
)
quantized_model.load_state_dict(quantized_state_dict)

app = Flask(__name__)
run_with_ngrok(app)


@app.route('/')
def __init__(self):
    self.model_name = 'google/pegasus-multi_news'
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.tokenizer = PegasusTokenizer.from_pretrained(self.model_name)
    self.model = PegasusForConditionalGeneration.from_pretrained(
        self.model_name).to(self.device)
def __init__(self):
    self.model_name = 'tuner007/pegasus_paraphrase'
    self.pegasus_tokenizer = PegasusTokenizer.from_pretrained(
        self.model_name)
    self.pegasus_model = PegasusForConditionalGeneration.from_pretrained(self.model_name) \
        .to(ReflectiveListening.torch_device)