Example #1
class ParaPhrasing:
    """Class loads pegasus model for text augmentation"""
    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)

    @staticmethod
    def paraphrases(input_text, num_return_sequences=10, num_beams=10):
        """
        Generates paraphrased variations of
        a given sentence or text.

        :param input_text: sentence or text
        :param num_return_sequences: Number of variations to be returned
        :param num_beams: Number of beams for beam search. 1 means no beam search
        :return: list of variations of the input text
        """
        if isinstance(input_text, str):
            input_text = [input_text]
        batch = ParaPhrasing.tokenizer.prepare_seq2seq_batch(
            input_text, truncation=True, padding='longest',
            max_length=60).to(ParaPhrasing.torch_device)
        translated = ParaPhrasing.model.generate(
            **batch,
            max_length=60,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            temperature=1.5)
        tgt_text = ParaPhrasing.tokenizer.batch_decode(
            translated, skip_special_tokens=True)
        return tgt_text
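A minimal usage sketch for the class above; the sample sentence is illustrative. Note that prepare_seq2seq_batch was deprecated in later transformers releases, where calling the tokenizer directly with return_tensors='pt' is the usual replacement:

variants = ParaPhrasing.paraphrases("The weather is lovely today.",
                                    num_return_sequences=3, num_beams=5)
for v in variants:
    print(v)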
Example #2
    def exec(self, text):
        src_text = [text]
        model_name = self.model
        # Other published PEGASUS checkpoints that could be swapped in:
        #model_name = 'google/pegasus-xsum'
        #model_name = 'google/pegasus-large'
        #model_name = 'google/pegasus-cnn_dailymail'
        #model_name = 'google/pegasus-pubmed'
        #model_name = 'google/pegasus-wikihow'
        #model_name = 'google/pegasus-newsroom'
        #model_name = 'google/pegasus-multi_news'
        #model_name = 'google/pegasus-reddit_tifu'
        #model_name = 'google/pegasus-arxiv'

        torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        batch = tokenizer.prepare_seq2seq_batch(
            src_text, truncation=True, padding='longest').to(torch_device)
        result = model.generate(**batch)
        tgt_text = tokenizer.batch_decode(result, skip_special_tokens=True)
        if self.model == "google/pegasus-cnn_dailymail":
            tgt_text[0] = re.sub('<n>', ' ', tgt_text[0])

        return tgt_text[0]
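A hedged usage sketch for this method: the enclosing class is not shown, so Summarizer below is a hypothetical wrapper assumed to store the checkpoint name in self.model:

summarizer = Summarizer('google/pegasus-xsum')  # hypothetical class; stores the name in self.model
print(summarizer.exec("PEGASUS was pre-trained by masking whole sentences "
                      "and learning to regenerate them ..."))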
Example #3
    def __init__(self, model: str = None):
        log.info(model)
        torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        log.info(torch_device)
        if model is None:
            model = "t5"
        self.modelName = model
        # path to all the files that will be used for inference
        self.path = f"./app/api/{model}/"
        self.model_path = self.path + "pytorch_model.bin"
        self.config_path = self.path + "config.json"

        # Select the correct model based on the passed model name; defaults to t5
        if model == "t5":
            self.config = T5Config.from_json_file(self.config_path)
            self.model = T5ForConditionalGeneration(self.config)
            self.tokenizer = T5Tokenizer.from_pretrained(self.path)
            self.model.eval()
            self.model.load_state_dict(torch.load(self.model_path, map_location=torch_device))
        elif model == "google/pegasus-newsroom":
            self.config = PegasusConfig.from_json_file(self.config_path)
            # self.model = PegasusForConditionalGeneration(self.config)
            # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
            self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = PegasusTokenizer.from_pretrained(model)
        elif model == "facebook/bart-large-cnn":
            self.config = BartConfig.from_json_file(self.config_path)
            # self.model = PegasusForConditionalGeneration(self.config)
            # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
            self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = BartTokenizer.from_pretrained(model)
        else:
            raise Exception("This model is not supported")

        self.text = str()
Example #4
def get_model_tokenizer(model_name):
    import torch
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if "pegasus" in model_name:
        # it's a Pegasus model
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    elif "bart-large" in model_name:
        # it's a BART model
        from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    elif "bart-custom-large" in model_name:
        from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    else:
        # T5 or distilbart
        # Note: AutoModelWithLMHead has since been deprecated in transformers;
        # AutoModelForSeq2SeqLM is the usual replacement for seq2seq checkpoints
        from transformers import AutoTokenizer, AutoModelWithLMHead
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelWithLMHead.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
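A short usage sketch for the dispatcher above, with a checkpoint name as published on the Hugging Face Hub:

model, tokenizer = get_model_tokenizer('google/pegasus-xsum')
batch = tokenizer(["A long news article to condense ..."], truncation=True,
                  padding='longest', return_tensors='pt').to(model.device)
summary_ids = model.generate(**batch)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])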
Example #5
def compute(sm):
    # Load the PEGASUS model
    model_name = 'google/pegasus-xsum'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    sm_len = len(sm)

    sen_list = splitText(sm, sm_len)  # Get sections to be summarized

    try:
        batches = []
        for s in sen_list:  # Preparation
            batch = tokenizer.prepare_seq2seq_batch(
                [s], truncation=True, padding='longest').to(torch_device)
            batches.append(batch)
    except Exception:  # tokenization failed; return an empty summary
        return ""

    temp = []
    for b in batches:  # Summary generation
        translated = model.generate(**b)
        temp.append(translated)

    final_summary = []
    for t in temp:  # Put together the summaries from the different sections
        final_summary.append(
            tokenizer.batch_decode(t, skip_special_tokens=True)[0])

    return final_summary
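Because compute returns one summary per section (splitText is defined elsewhere in the project), a caller would typically join the pieces back together:

parts = compute(long_document)  # long_document: str, illustrative
print(' '.join(parts))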
Example #6
 def __init__(self, config):
     self.model_name = 'google/pegasus-reddit_tifu'
     self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(f"using device: {self.device}")
     self.tokenizer = PegasusTokenizer.from_pretrained(self.model_name,
                                                       force_download=True)
     self.model = PegasusForConditionalGeneration.from_pretrained(
         self.model_name, force_download=True).to(self.device)
Example #7
    def load_model(self):
        model = PegasusForConditionalGeneration.from_pretrained(
            os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'model'))
        tokenizer = PegasusTokenizer.from_pretrained(
            os.path.join(settings.BASE_DIR, 'paraphrase_utils', 'tokenizer'))
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        return model, tokenizer, device
Example #8
 def __init__(self, args, device):
     super().__init__(args, device)
     assert args.pretrained_model_name in self.PRETRAINED_MODEL_NAMES
     self.pretrained_model_name = args.pretrained_model_name
     logging.info(f'Loading Pegasus ({self.pretrained_model_name})')
     self.model = PegasusForConditionalGeneration.from_pretrained(
         self.pretrained_model_name).to(self.device)
     self.tokenizer: PegasusTokenizer = PegasusTokenizer.from_pretrained(
         self.pretrained_model_name)
Example #9
def generate_summary(text, model_name):
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(
        text, truncation=True, padding="longest",
        return_tensors="pt").to(torch_device)
    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
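A usage sketch for the helper above:

print(generate_summary(
    ["PEGASUS masks whole sentences during pre-training and learns to regenerate them."],
    'google/pegasus-xsum'))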
Example #10
def generate_summary(context):
    model_name = 'google/pegasus-xsum'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
    batch = tokenizer.prepare_seq2seq_batch(src_texts=[context],
                                            truncation=True,
                                            padding='max_length',
                                            return_tensors="pt")
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
Example #11
    def single_document_summarization(self, src_text):
        tokenizer = PegasusTokenizer.from_pretrained(self.model_name)
        model = PegasusForConditionalGeneration.from_pretrained(
            self.model_name).to(self.torch_device)
        batch = tokenizer(src_text,
                          truncation=True,
                          padding=True,
                          return_tensors='pt').to(self.torch_device)

        translated = model.generate(**batch)
        generated_summary = tokenizer.batch_decode(translated,
                                                   skip_special_tokens=True)
        return generated_summary
Example #12
def get_summary(text):
    try:
        model_name = 'google/pegasus-xsum'
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        # device is defined at module level in the source project
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
        src_text = [text]
        batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
        translated = model.generate(**batch)
        target = tokenizer.batch_decode(translated, skip_special_tokens=True)
    except Exception:
        print("API error occurred")
        return -100
    return target[0]
Example #13
def load_BART_or_PEGASUS(mname):
    if 'bart' in mname.lower():
        from transformers import BartTokenizer, BartForConditionalGeneration

        model = BartForConditionalGeneration.from_pretrained(mname)
        tokenizer = BartTokenizer.from_pretrained(mname)
    elif 'pegasus' in mname.lower():
        from transformers import PegasusTokenizer, PegasusForConditionalGeneration

        model = PegasusForConditionalGeneration.from_pretrained(mname)
        tokenizer = PegasusTokenizer.from_pretrained(mname)
    else:
        raise NotImplementedError("Unknown model name.")
    return model, tokenizer
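A usage sketch pairing the loader above with a single generate call:

model, tokenizer = load_BART_or_PEGASUS('facebook/bart-large-cnn')
inputs = tokenizer("Text to summarize ...", truncation=True, return_tensors='pt')
print(tokenizer.decode(model.generate(**inputs)[0], skip_special_tokens=True))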
Example #14
def summarizeP(src_text, variant="xsum", device=None):
    model_name = "google/pegasus-"
    model_name += variant
    torch_device = ('cuda' if torch.cuda.is_available() else
                    'cpu') if device is None else device
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
        torch_device)
    batch = tokenizer.prepare_seq2seq_batch(src_text,
                                            truncation=True,
                                            padding='longest').to(torch_device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
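A usage sketch: variant is appended to 'google/pegasus-', so 'cnn_dailymail' resolves to google/pegasus-cnn_dailymail:

summaries = summarizeP(["First article text ...", "Second article text ..."],
                       variant="cnn_dailymail")
print(summaries)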
Example #15
def execute_pegasus_augmentation(data, file_path) -> pd.DataFrame:
    # var, torch_device and get_response are defined at module level in the source project
    MODEL_NAME = var.PARAPHRASING_MODEL
    tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
    model = PegasusForConditionalGeneration.from_pretrained(MODEL_NAME).to(torch_device)
    train = data.copy()
    train = train[['summary', 'sentiment']]
    number_sequences = 10
    train['paraphrased text'] = train['summary'].progress_apply(get_response,
                                                                     num_return_sequences=number_sequences,
                                                                     tokenizer=tokenizer,
                                                                     model=model)
    generated = train.explode('paraphrased text')
    generated = generated.dropna()
    generated.to_csv('{}-Processed-Summarized-Augmented.csv'.format(file_path), index=False)
    return generated
Example #16
def generate_summary(text):

    # Create tokenizer
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    # load pretrained model
    model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum")

    # convert into tokens (number representation of text)
    tokens = tokenizer(text,
                       truncation=True,
                       padding="longest",
                       return_tensors="pt")
    summary = model.generate(**tokens)
    #Summarized = wrapper.fill(tokenizer.decode(summary[0])).strip()
    # skip_special_tokens drops markers such as </s> from the decoded text
    Summarized = tokenizer.decode(summary[0], skip_special_tokens=True)
    return Summarized
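A usage sketch for the function above:

print(generate_summary(
    "The tower is 324 metres tall, about the same height as an 81-storey "
    "building, and the tallest structure in Paris."))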
Example #17
class ParaPhrasing:
    model_name = 'tuner007/pegasus_paraphrase'
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    @staticmethod
    def paraphrases(input_text, num_return_sequences=10, num_beams=10):
        if isinstance(input_text, str):
            input_text = [input_text]
        batch = ParaPhrasing.tokenizer.prepare_seq2seq_batch(
            input_text, truncation=True, padding='longest',
            max_length=60).to(ParaPhrasing.torch_device)
        translated = ParaPhrasing.model.generate(
            **batch, max_length=60, num_beams=num_beams,
            num_return_sequences=num_return_sequences, temperature=1.5)
        tgt_text = ParaPhrasing.tokenizer.batch_decode(translated, skip_special_tokens=True)
        return tgt_text
Example #18
def main(sequence):
    # Pretrained model from https://huggingface.co/google/pegasus-cnn_dailymail
    tokenizer = PegasusTokenizerFast.from_pretrained(
        'google/pegasus-cnn_dailymail')
    model = PegasusForConditionalGeneration.from_pretrained(
        'google/pegasus-cnn_dailymail').to(DEVICE)
    model.eval()

    inputs = tokenizer.encode(sequence, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(inputs)

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("Input:")
    print(sequence)
    print("--------------------------------")
    print("Output:")
    print(summary)
Example #19
def index(request):
    if request.method == 'POST':
        form = textForm(request.POST, request.FILES)
        if form.is_valid():
            _type = form.cleaned_data['_type']
            text = form.cleaned_data['text']
            percent = form.cleaned_data['percent']
            if (text == ""):
                file = request.FILES['file']
                text = ''
                for line in file:
                    text += line.decode()
            tokenized_sentence = sent_tokenize(text)
            if (_type == 'Extractive'):
                summary = summarize(tokenized_sentence, percent)
                return render(request, 'summary/summary.html', {
                    'text': text,
                    'summary': summary,
                    'percent': percent
                })
            elif (_type == 'Abstractive'):
                model_name = 'google/pegasus-xsum'
                torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
                tokenizer = PegasusTokenizer.from_pretrained(model_name)
                model = PegasusForConditionalGeneration.from_pretrained(
                    model_name).to(torch_device)
                batch = tokenizer.prepare_seq2seq_batch(
                    [text], truncation=True,
                    padding='longest').to(torch_device)
                translated = model.generate(**batch)
                summary = tokenizer.batch_decode(translated,
                                                 skip_special_tokens=True)
                return render(
                    request, 'summary/summary.html', {
                        'text': text,
                        'summary': summary[0],
                        'percent': "Not Applicable"
                    })
    return render(request, 'summary/index.html', {'form': textForm()})
Example #20
def main(input_dir_path, output_dir_path, model_name_or_dir):
    tokenizer = PegasusTokenizerFast.from_pretrained(model_name_or_dir)
    model = PegasusForConditionalGeneration.from_pretrained(
        model_name_or_dir).to(DEVICE)
    model.eval()

    os.makedirs(output_dir_path, exist_ok=True)

    for file_name in os.listdir(input_dir_path):
        if file_name.endswith('.json'):
            count = 0
            input_file_path = os.path.join(input_dir_path, file_name)
            with open(input_file_path) as json_file:
                data = json.load(json_file)

                for session in data['sessions']:
                    for speech in session['speeches']:
                        content = []
                        for text in speech['content']:
                            inputs = tokenizer.encode(
                                text, return_tensors="pt").to(DEVICE)

                            with torch.no_grad():
                                outputs = model.generate(inputs)

                            summary = tokenizer.decode(
                                outputs[0], skip_special_tokens=True)

                            content.append({'text': text, 'summary': summary})
                            count += 1
                        speech['content'] = content

            output_file_path = os.path.join(output_dir_path, file_name)
            with open(output_file_path, 'w') as json_file:
                json.dump(data, json_file)

            print("File: {}, Count: {}".format(file_name, count))
Example #21
    def remove_example_from_description(text):
        text = re.sub(r'## Example(.*?)##', '##', text, flags=re.DOTALL)
        if '## Example' in text:
            text = re.sub(r'## Example(.*)', '', text)
            text = re.sub(r"\`\`\`.*?\`\`\`", '', text, flags=re.DOTALL)
        return text

    for i, doc in enumerate(docs):
        markdown_without_example = remove_example_from_description(doc['markdown_description'])
        docs[i]['markdown_without_example'] = markdown_without_example
        # LOGGER.debug(markdown_without_example)

    # Generate 1 sentence summaries for the models
    if not args.quick_run:
        from transformers import PegasusTokenizer, PegasusForConditionalGeneration
        mname = "google/pegasus-large"
        model = PegasusForConditionalGeneration.from_pretrained(mname)
        tok = PegasusTokenizer.from_pretrained(mname)

        def summarise(text):
            batch = tok.prepare_seq2seq_batch(src_texts=[text])  # don't need tgt_text for inference
            gen = model.generate(**batch)
            return tok.batch_decode(gen, skip_special_tokens=True)[0]

        for i, doc in enumerate(docs):
            if 'short_description' not in docs[i].keys():
                short_description = summarise(doc['description'])
                docs[i]['short_description'] = short_description
                # LOGGER.debug(short_description)

    vi_client = ViClient(os.environ['VH_USERNAME'], os.environ['VH_API_KEY'])
    ids = vi_client.get_field_across_documents('_id', docs)
Example #22
def load_pegasus_model():
    pegasus_model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum")
    return pegasus_model
Example #23
"""

# Commented out IPython magic to ensure Python compatibility.
!git clone https://github.com/google-research/pegasus
# %cd pegasus
!export PYTHONPATH=.
!pip3 install -r requirements.txt

!pip install transformers==3.5.0

import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
def get_response(input_text,num_return_sequences,num_beams):
    batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60).to(torch_device)
    translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text
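A usage sketch for get_response as defined above:

print(get_response("Machine learning makes paraphrasing easy.",
                   num_return_sequences=3, num_beams=5))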

import pandas as pd
import nltk
nltk.download('cmudict')
nltk.download('wordnet')

"""Next, we import the procrustean alliteration paraphraser"""

class RuleBoundsInterface:
    """This interface is used to define different properties of a rhetorical figure generating algorithm.
Example #24
 def __init__(self):
     self.sum_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')  # use pegasus-large on a local machine and pegasus-xsum in the cloud
     self.sum_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
Example #25
def create_long_model(save_model_to, base_model, tokenizer_name_or_path,
                      attention_window, max_pos):
    model = PegasusForConditionalGeneration.from_pretrained(base_model)
    tokenizer = PegasusTokenizer.from_pretrained(tokenizer_name_or_path,
                                                 model_max_length=max_pos)
    config = LongformerPegasusConfig.from_pretrained(base_model)
    model.config = config

    # in Pegasus attention_probs_dropout_prob is attention_dropout, but LongformerSelfAttention
    # expects attention_probs_dropout_prob, so set it here
    config.attention_probs_dropout_prob = config.attention_dropout
    config.architectures = [
        "LongformerPegasusForConditionalGeneration",
    ]

    N = 0

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs["model_max_length"] = max_pos
    current_max_pos, embed_size = model.model.encoder.embed_positions.weight.shape
    print(max_pos, current_max_pos, embed_size, config.max_position_embeddings)
    assert current_max_pos == config.max_position_embeddings + N

    config.max_encoder_position_embeddings = max_pos
    config.max_decoder_position_embeddings = config.max_position_embeddings
    del config.max_position_embeddings
    max_pos += N  # NOTE: Pegasus has positions 0,1 reserved, so embedding size is max position + N
    assert max_pos >= current_max_pos

    # allocate a larger position embedding matrix for the encoder
    new_encoder_pos_embed = model.model.encoder.embed_positions.weight.new_empty(
        max_pos, embed_size)
    k = N
    step = current_max_pos - N
    while k < max_pos - 1:
        new_encoder_pos_embed[k:(
            k + step)] = model.model.encoder.embed_positions.weight[N:]
        k += step
    model.model.encoder.embed_positions.weight.data = new_encoder_pos_embed

    config.attention_window = [attention_window] * config.num_hidden_layers
    config.attention_dilation = [1] * config.num_hidden_layers

    for i, layer in enumerate(model.model.encoder.layers):
        longformer_self_attn_for_pegasus = LongformerSelfAttentionForPegasus(
            config, layer_id=i)

        longformer_self_attn_for_pegasus.longformer_self_attn.query = (
            layer.self_attn.q_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.key = (
            layer.self_attn.k_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.value = (
            layer.self_attn.v_proj)

        longformer_self_attn_for_pegasus.longformer_self_attn.query_global = (
            layer.self_attn.q_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.key_global = (
            layer.self_attn.k_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.value_global = (
            layer.self_attn.v_proj)

        longformer_self_attn_for_pegasus.output = layer.self_attn.out_proj

        layer.self_attn = longformer_self_attn_for_pegasus
    print(f"saving model to {save_model_to}")
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
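A hedged usage sketch: LongformerPegasusConfig and LongformerSelfAttentionForPegasus come from the surrounding project rather than from the transformers library, and the window and position sizes below are illustrative:

model, tokenizer = create_long_model(
    save_model_to='./pegasus-large-4096',
    base_model='google/pegasus-large',
    tokenizer_name_or_path='google/pegasus-large',
    attention_window=512,
    max_pos=4096)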
Example #26
 def __init__(self):
     self.model_name = 'google/pegasus-multi_news'
     self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
     self.tokenizer = PegasusTokenizer.from_pretrained(self.model_name)
     self.model = PegasusForConditionalGeneration.from_pretrained(
         self.model_name).to(self.device)
Example #27
 def __init__(self):
     self.model_name = 'tuner007/pegasus_paraphrase'
     self.pegasus_tokenizer = PegasusTokenizer.from_pretrained(
         self.model_name)
     self.pegasus_model = PegasusForConditionalGeneration.from_pretrained(self.model_name) \
         .to(ReflectiveListening.torch_device)
Example #28
    if 'BART' in models:
        print('Model files downloading for BART')
        bart_model = BartForConditionalGeneration.from_pretrained(
            'facebook/bart-large-cnn')
        bart_tokenizer = BartTokenizer.from_pretrained(
            'facebook/bart-large-cnn')
        bart_model.to(device)
        bart_model.eval()
    if 'T5' in models:
        print('Model files downloading for T5')
        t5_model = T5ForConditionalGeneration.from_pretrained('t5-large')
        t5_tokenizer = T5Tokenizer.from_pretrained('t5-large')
        t5_model.to(device)
        t5_model.eval()
    if 'PEGASUS-CNN' in models:
        print('Model files downloading for PEGASUS-CNN')
        pegasus_cnn_model = PegasusForConditionalGeneration.from_pretrained(
            'google/pegasus-cnn_dailymail')
        pegasus_cnn_tokenizer = PegasusTokenizer.from_pretrained(
            'google/pegasus-cnn_dailymail')
        pegasus_cnn_model.to(device)
        pegasus_cnn_model.eval()
    if 'PEGASUS-MED' in models:
        print('Model files downloading for PEGASUS-MED')
        pegasus_med_model = PegasusForConditionalGeneration.from_pretrained(
            'google/pegasus-pubmed')
        pegasus_med_tokenizer = PegasusTokenizer.from_pretrained(
            'google/pegasus-pubmed')
        pegasus_med_model.to(device)
        pegasus_med_model.eval()
    # app.run(host='0.0.0.0', debug=True, port=8000, use_reloader=False)
    app.run()
Example #29
def main():

    # Parse arguments
    params = parser.parse_args()
    # Define Tesseract bin
    pytesseract.pytesseract.tesseract_cmd = params.tesseract_path

    # Create the working directory if it doesn't exist
    work_dir = './workingDir/'
    if not os.path.exists(work_dir):
        print('Creating Working Directory')
        os.makedirs(work_dir)

    # For previously cleaned text
    if params.cleaned_text is not None:
        working_dir = makeandmove(work_dir, params.cleaned_text)
        if os.path.exists(params.cleaned_text):
            f1 = open(params.cleaned_text, 'r')
            cleaned_text = f1.read()
            f1.close()
        else:
            print('Something went wrong.')

    # For previously recognized text
    elif params.text_from_image is not None:
        working_dir = makeandmove(work_dir, params.text_from_image)
        if os.path.exists(params.text_from_image):
            f1 = open(params.text_from_image, 'r')
            ocr_text = f1.read()
            f1.close()
        else:
            print('Something went wrong.')
        cleaned_text = clean_text(ocr_text, working_dir)

    # For previously generated images
    elif params.image_proc_list is not None:
        working_dir = makeandmove(work_dir, params.image_proc_list)
        ocr_text = perform_ocr(params.image_proc_list, working_dir)
        cleaned_text = clean_text(ocr_text, work_dir)

    # For PDFs
    else:
        working_dir = makeandmove(work_dir, params.pdf_path)
        image_proc_file = pdf_to_image(params.pdf_path, params.poppler_path,
                                       working_dir)
        ocr_text = perform_ocr(image_proc_file, working_dir)
        cleaned_text = clean_text(ocr_text, working_dir)

    print('--- Summarizing Text ---')
    # download model
    model = PegasusForConditionalGeneration.from_pretrained(params.model)
    # download tokenizer
    tok = PegasusTokenizer.from_pretrained(params.model)
    batch = tok.prepare_seq2seq_batch(src_texts=[cleaned_text])
    # Hyperparameter Tuning

    gen = model.generate(**batch,
                         max_length=params.max_length,
                         min_length=params.min_length,
                         do_sample=params.do_sample,
                         temperature=params.temperature,
                         top_k=params.top_k,
                         top_p=params.top_p,
                         repetition_penalty=params.repetition_penalty,
                         length_penalty=params.length_penalty,
                         num_return_sequences=params.num_return_sequences)

    summary = tok.batch_decode(gen, skip_special_tokens=True)

    summary_file = working_dir + 'summary.txt'
    # 'w' creates the file if it is missing and truncates it otherwise,
    # so no exists() check is needed
    f1 = open(summary_file, "w")
    print(summary, file=f1)
    f1.close()
    print(summary)
    return None
Example #30
 def prepare(self):
     self.model = PegasusForConditionalGeneration.from_pretrained(
         self._model_name
     ).to(self._torch_device)