Example 1
import torch
import typer
from datasets import load_dataset
from transformers import BertModel, PreTrainedTokenizerFast


def ingest():
    """
    Any model from Hugging Face can be used.
    TODO: put url here
    Corpus examples: squad | MedQA or FindZebra
    """
    typer.secho("Welcome to the ingest command", fg=typer.colors.WHITE, bold=True)

    model = BertModel.from_pretrained(Config['model'].get())
    fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(Config['tokenizer'].get())
    # fast_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    corpus = load_dataset(Config['corpus'].get(),
                          split='train[:100]')  # cache_dir=Config['cache_dir'].get() -- Cache directory override

    torch.set_grad_enabled(False)

    typer.secho("Embedding corpus as dense context vector representation using FAISS.")
    corpus_embeddings = corpus.map(
        lambda example: {
            'embeddings': model(**fast_tokenizer(example['line'], return_tensors='pt'))['pooler_output'][0].numpy()})
    # corpus_embeddings.save_to_disk(os.path.join(Config['cache_dir'].get(), "corpus/"))

    typer.secho("Adding FAISS index for efficient similarity search and clustering of dense vectors.")
    corpus_embeddings.add_faiss_index(column='embeddings')

    typer.secho("Saving the index")
    corpus_embeddings.save_faiss_index("embeddings", "corpus.faiss")  # os.path.join(Config['cache_dir'].get())
    return 0
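
A minimal sketch (not in the source) of querying the index that ingest() saves; it assumes the same model, fast_tokenizer, and Config objects are still in scope:

corpus = load_dataset(Config['corpus'].get(), split='train[:100]')
corpus.load_faiss_index('embeddings', 'corpus.faiss')
question = "How is cystic fibrosis treated?"
query = model(**fast_tokenizer(question, return_tensors='pt'))['pooler_output'][0].numpy()
scores, nearest = corpus.get_nearest_examples('embeddings', query, k=5)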
Example 2
    def setUp(self):
        self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
        super().setUp()
        self.test_rust_tokenizer = True

        self.tokenizers_list = [(PreTrainedTokenizerFast, "robot-test/dummy-tokenizer-fast", {})]

        tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast")
        tokenizer.save_pretrained(self.tmpdirname)
Example 3
    def __init__(self, bot):
        self.bot = bot
        self.model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
            "skt/kogpt2-base-v2",
            bos_token='</s>',
            eos_token='</s>',
            unk_token='<unk>',
            pad_token='<pad>',
            mask_token='<mask>')
Example 4
def get_kobart_tokenizer():
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")

    tokenizer.pad_token = "<pad>"
    tokenizer.bos_token = "<s>"
    tokenizer.eos_token = "</s>"
    tokenizer.unk_token = "<unk>"
    tokenizer.mask_token = "<mask>"

    return tokenizer
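
A brief usage sketch for the helper above (not part of the source):

tokenizer = get_kobart_tokenizer()
ids = tokenizer.encode("안녕하세요")  # text -> token ids
text = tokenizer.decode(ids)         # ids -> text round trip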
Example 5
    def __init__(self, model: str, device: str):
        config = BartConfig.from_pretrained("hyunwoongko/kobart")
        self.model = BartForConditionalGeneration(config).half().eval().to(
            device)
        # Load fine-tuned weights into the wrapped Bart base model.
        self.model.model.load_state_dict(torch.load(
            model,
            map_location=device,
        ))
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
            "hyunwoongko/kobart")
        self.device = device
Example 6
    def setUp(self):
        self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
        super().setUp()
        self.test_rust_tokenizer = True

        model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]

        # Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
        self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]

        tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
        tokenizer.save_pretrained(self.tmpdirname)
Example 7
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")

    model = BartForConditionalGeneration.from_pretrained(
        args.finetuned_model_path)
    model.eval()
    model.to(device)

    examples = [
        "배고프다", "너무너무 사랑해요", "나는 너를 좋아해", "저의 취미는 축구입니다", "어제 무슨 영화 봤어?",
        "짜장면 짬뽕 탕수육 먹었어"
    ]

    for example in examples:
        chosung_example = convert_text_to_chosung(example)

        input_ids = (torch.tensor(
            tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(chosung_example))).unsqueeze(0).to(device))

        if args.decoding_method == "top_p":
            outputs = model.generate(
                input_ids=input_ids,
                max_length=48,
                temperature=1.0,
                do_sample=True,
                top_p=0.8,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                decoder_start_token_id=tokenizer.bos_token_id,
                num_return_sequences=5,
            )
        elif args.decoding_method == "beam_search":
            outputs = model.generate(
                input_ids=input_ids,
                max_length=48,
                num_beams=10,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                decoder_start_token_id=tokenizer.bos_token_id,
                num_return_sequences=5,
            )
        else:
            raise ValueError(
                "Enter the right decoding method (top_p or beam_search)")

        for output in outputs.tolist():
            answer = tokenizer.decode(output)
            print(f"초성: {chosung_example} \t 예측 문장: {answer}")
Example 8
    def __init__(self, datapath, max_seq_len=128):
        self.datapath = datapath
        self.data = pd.read_csv(self.datapath, sep='\t')
        self.bos_token = '</s>'
        self.eos_token = '</s>'
        self.max_seq_len = max_seq_len
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
            "skt/kogpt2-base-v2",
            bos_token=self.bos_token,
            eos_token=self.eos_token,
            unk_token='<unk>',
            pad_token='<pad>',
            mask_token='<mask>')
Example 9
    def test_async_share_tokenizer(self):
        # See https://github.com/huggingface/transformers/pull/12550
        # and https://github.com/huggingface/tokenizers/issues/537
        tokenizer = PreTrainedTokenizerFast.from_pretrained(
            "robot-test/dummy-tokenizer-wordlevel")
        text = "The Matrix is a 1999 science fiction action film."

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.fetch, tokenizer, text) for i in range(10)
            ]
            return_value = [future.result() for future in futures]
            self.assertEqual(return_value, [[1, 10, 0, 8, 0, 18, 0, 0, 0, 2]
                                            for i in range(10)])
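
The test above calls a self.fetch helper that is not shown in the snippet; a plausible definition (an assumption, in the spirit of the linked transformers PR):

    def fetch(self, tokenizer, text):
        # Each thread encodes independently to exercise thread safety of the Rust tokenizer.
        return tokenizer.encode(text, truncation="longest_first", padding="longest")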
Example 10
    def __init__(self, type="normal", device="cpu"):
        """
        Constructor of Summarizers

        Args:
            type (str): type of article. (e.g. normal, paper, patent)
            device (str): device for inference (e.g. cpu, cuda)
        """

        type = type.lower()
        model_name_prefix = "hyunwoongko/ctrlsum"

        assert type in ['normal', 'paper', 'patent'], \
            "param `type` must be one of ['normal', 'paper', 'patent']"

        if type == "normal":
            model_name = f"{model_name_prefix}-cnndm"
        elif type == "paper":
            model_name = f"{model_name_prefix}-paper"
        elif type == "patent":
            model_name = f"{model_name_prefix}-patent"
        else:
            raise Exception(f"Unknown type: {type}")

        self.device = device
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(
            device)
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
        self._5w1h = [
            "what ",
            "what's "
            "when ",
            "why ",
            "who ",
            "who's ",
            "where ",
            "how ",
            "What ",
            "What's ",
            "When ",
            "Why ",
            "Who ",
            "Who's ",
            "Where ",
            "How ",
        ]
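
The _5w1h prefixes above suggest detecting question-style prompts for query-controlled summarization. A hedged construction example (class name taken from the docstring; everything else is an assumption):

summ = Summarizers(type="paper", device="cpu")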
Example 11
    def __init__(self, path, max_ids):
        self.model = load_model(path)
        self.max_ids = max_ids
        U_TKN = '<usr>'
        S_TKN = '<sys>'
        BOS = '</s>'
        EOS = '</s>'
        MASK = '<unused0>'
        SENT = '<unused1>'
        PAD = '<pad>'

        TOKENIZER = PreTrainedTokenizerFast.from_pretrained(
            "skt/kogpt2-base-v2",
            bos_token=BOS,
            eos_token=EOS,
            unk_token='<unk>',
            pad_token=PAD,
            mask_token=MASK)

        self.tok = TOKENIZER
Example 12
def fine_tuning(MODEL_TYPE, DATA_PATH, BATCH_SIZE, LEARNING_RATE, WARMUP_STEPS,
                OUTPUT_MODEL_PATH, EPOCHS):
    print("=" * 15, "LOAD MODEL", "=" * 15)
    model = GPT2LMHeadModel.from_pretrained(MODEL_TYPE)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_TYPE)

    print("=" * 15, "GET DATASET", "=" * 15)
    data_loader = get_data_loader(DATA_PATH, tokenizer, BATCH_SIZE, True)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, WARMUP_STEPS,
        len(data_loader) - WARMUP_STEPS, -1)

    if not os.path.exists(OUTPUT_MODEL_PATH):
        os.mkdir(OUTPUT_MODEL_PATH)

    fine_tuning_runner(model, optimizer, data_loader, scheduler, EPOCHS,
                       OUTPUT_MODEL_PATH)
    model.save_pretrained(OUTPUT_MODEL_PATH)
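
A hedged invocation sketch for fine_tuning(); the model id and paths are placeholders, not values from the source:

fine_tuning(MODEL_TYPE="skt/kogpt2-base-v2",   # placeholder model id
            DATA_PATH="data/train.txt",        # placeholder path
            BATCH_SIZE=8,
            LEARNING_RATE=5e-5,
            WARMUP_STEPS=100,
            OUTPUT_MODEL_PATH="./fine_tuned",  # placeholder path
            EPOCHS=3)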
Example 13
def summarizer(input: TextSummerizeInput) -> TextSummerizeOutput:
    """ Summarize texts """
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    inputs = tokenizer([
        tokenizer.bos_token + input.text_input + tokenizer.eos_token
    ])['input_ids'][0]

    model_url = 'https://train-mxysk1opgrzauh8ifw55-gpt2-train-teachable-ainize.endpoint.dev.ainize.ai/predictions/bart-ko-small-finetune'

    headers = {'Content-Type': 'application/json; charset=utf-8'}
    response = requests.post(url=model_url,
                             headers=headers,
                             json={"text": inputs})

    if response.status_code == 200:
        result = tokenizer.decode(response.json()[0], skip_special_tokens=True)
        return TextSummerizeOutput(output=result)
    else:
        print(f'Failed: {response.text}')
        return TextSummerizeOutput(output='Failed to summarize')
Example 14
def main():
    # Config
    config = TrainConfig()

    # Logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter("[%(asctime)s] %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Data Loading...
    raw_train_instances = load_data(config.train_file_path)
    raw_dev_instances = load_data(config.dev_file_path)
    logger.info(f"훈련용 예시 개수:{len(raw_train_instances)}\t 검증용 예시 개수:{len(raw_dev_instances)}")

    tokenizer = PreTrainedTokenizerFast.from_pretrained(config.pretrained_model_name)

    train_dataset = ChosungTranslatorDataset(raw_train_instances, tokenizer, config.max_seq_len)
    dev_dataset = ChosungTranslatorDataset(raw_dev_instances, tokenizer, config.max_seq_len)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
    )
    dev_dataloader = DataLoader(
        dev_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
    )

    model = BartForConditionalGeneration.from_pretrained(config.pretrained_model_name)

    # Train
    optimizer = Adam(model.parameters(), lr=config.learning_rate)
    train(config, model, train_dataloader, dev_dataloader, optimizer, logger, device)
Example 15
def get_kogpt2_tokenizer(model_path=None):
    if not model_path:
        model_path = 'taeminlee/kogpt2'
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)
    return tokenizer
Example 16
def get_kobart_tokenizer():
    return PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
Example 17
nlpbook.set_logger(args)

# %% download corpus
from Korpora import Korpora

Korpora.fetch(
    args.downstream_corpus_name,
    root_dir=args.downstream_corpus_root_dir,
    force_download=args.force_download,
    )

# %% prepare tokenizer
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained(
    args.pretrained_model_name,
    eos_token="</s>",
)

# %% create train dataset
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from ratsnlp.nlpbook.generation import GenerationDataset, NsmcCorpus

corpus = NsmcCorpus()

train_dataset = GenerationDataset(
    args=args,
    corpus=corpus,
    tokenizer=tokenizer,
    mode="train"
)
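
The imports above pull in DataLoader and RandomSampler, but the snippet stops before using them; a hedged continuation consistent with those imports (the batch_size attribute on args is an assumption):

train_dataloader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    sampler=RandomSampler(train_dataset),
)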
Example 18
# pip install opyrator transformers torch

from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
import torch
from pydantic import BaseModel, Field

import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
                                                    bos_token='</s>',
                                                    eos_token='</s>',
                                                    unk_token='<unk>',
                                                    pad_token='<pad>',
                                                    mask_token='<mask>')

model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')


class Input(BaseModel):
    text: str = Field(title='Please enter a sentence.', max_length=128)
    max_length: int = Field(128, ge=5, le=128)
    repetition_penalty: float = Field(2.0, ge=0.0, le=2.0)


class Output(BaseModel):
    generated_text: str


def generate_text(input: Input) -> Output:
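    # The snippet ends at the header. What follows is a hedged sketch of the
    # missing body (an assumption, not the source), wiring the Input fields
    # into the standard generate() API:
    input_ids = tokenizer.encode(input.text, return_tensors='pt')
    output_ids = model.generate(input_ids,
                                max_length=input.max_length,
                                repetition_penalty=input.repetition_penalty,
                                pad_token_id=tokenizer.pad_token_id,
                                do_sample=True)
    return Output(generated_text=tokenizer.decode(output_ids[0],
                                                  skip_special_tokens=True))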
Example 19
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig, PreTrainedTokenizerFast
from head import GlobalPointer, MutiHeadSelection, Biaffine, TxMutihead
import sys
import os

head_type = sys.argv[1]
os.environ["CUDA_VISIBLE_DEVICES"] = sys.argv[2]

device = torch.device('cuda') if torch.cuda.is_available() else torch.device(
    'cpu')
print("Using {} device".format(device))
model_path = "../model_set/bert-base-chinese"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

assert head_type in [
    'GlobalPointer', 'MutiHeadSelection', 'Biaffine', 'TxMutihead'
]

if head_type in ['MutiHeadSelection', 'Biaffine', 'TxMutihead']:
    batch_size = 4
    learning_rate = 1e-5
    abPosition = False
    rePosition = True
else:
    batch_size = 16
    learning_rate = 2e-5
Example 20
from flask import Flask, request, Response, render_template, jsonify
import requests
import time
import random
import json
import os
from transformers import PreTrainedTokenizerFast

eng_tokenizer = PreTrainedTokenizerFast.from_pretrained("gpt2-large")

# Server & Handling Setting
app = Flask(__name__, static_url_path='/static')

models = {
    "gpt2-large": "gpt2-large",
    "gpt2-cover-letter": "cover-letter-gpt2",
    "gpt2-story": "gpt2_story",
    "gpt2-reddit": "gpt2_reddit",
    "gpt2-trump": "gpt2_trump"
}

SERVER_URL = os.environ.get('GPT2_SERVER_URL')
AINIZE_STATUS_URL = os.environ.get('AINIZE_STATUS_URL')
API_DEV = os.environ.get('API_DEV')
API_STAGING = os.environ.get('API_STAGING')
API_PROD = os.environ.get('API_PROD')


@app.route("/status", methods=['GET'])
def ainize_status():
    try:
Example 21
import torch
from pathlib import Path

from transformers import PreTrainedTokenizerFast, RobertaForMaskedLM
from preprocessing.evaluator import Evaluator
# TextTokenizer (used below) is project-local; its import is not shown in the source.

# Check that PyTorch sees it
USE_GPU = torch.cuda.is_available()
# USE_GPU = False
print(f'USE_GPU={USE_GPU}')

run_path = Path('runs') / 'run_4'
model_path = run_path / 'model'

dataset_path = Path('data') / 'pan_tadeusz'
text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(dataset_path / 'vocab.json')

tokenizer2 = PreTrainedTokenizerFast.from_pretrained(
    dataset_path / 'my-pretrained-tokenizer-fast2', max_len=128)

# 4. Check that the LM actually trained


def to_gpu(x, *args, **kwargs):
    return x.cuda(*args, **kwargs) if USE_GPU else x


# load trained model

# os.system('tar xzvf PanTadeuszRoBERTa.tgz')

model = RobertaForMaskedLM.from_pretrained(str(model_path))
model = to_gpu(model)
print(model.device)
Example 22
import os
import sys
import json
import urllib.request

import torch
import torch.nn.functional as F
from tqdm import trange
from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel

MODEL_NAME = "skt/kogpt2-base-v2"
MODEL_PATH = "./models/"
SEQ_LEN = 50
TOKENS_DICT = {
    "additional_special_tokens": ["<unused0>", "<unused1>"],
}

tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens(TOKENS_DICT)
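# Note (not in the source): had add_special_tokens() grown the vocabulary,
# the embedding matrix would need resizing before the fine-tuned weights are
# loaded; the call below is a safe no-op when the tokens already exist.
model.resize_token_embeddings(len(tokenizer))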

device = torch.device('cpu')
model.load_state_dict(
    torch.load("smithy/models/processed_slogan_final_5epoch_model.pth",
               map_location=device))
model.eval()


def top_k_top_p_filtering(logits,
Example 23
import torch
from flask import Flask, make_response
from flask_restx import Api, Resource, reqparse
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

app = Flask(__name__)
api = Api(app)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("MODEL LOADING ...")
MODEL = GPT2LMHeadModel.from_pretrained("./models")
MODEL.to(DEVICE)
print("TOKENIZER LOADING ...")
TOKENIZER = PreTrainedTokenizerFast.from_pretrained("taeminlee/kogpt2")
MODEL.eval()


def generate(text=""):
    if text == "":
        return "error!! :("

    input_text = text + "</s>"
    tokens = TOKENIZER.encode(input_text, return_tensors='pt').to(DEVICE)
    min_length = tokens.shape[1]  # prompt length in tokens; len(tokens) would give the batch size
    output_ids = TOKENIZER.decode(MODEL.generate(tokens,
                                                 do_sample=True,
                                                 max_length=50,
                                                 min_length=min_length,
                                                 top_k=50,
Example 24
    def __init__(self, pretrained):
        # PTTF is presumably transformers.PreTrainedTokenizerFast imported under an alias.
        self.tokenizer = PTTF.from_pretrained(pretrained, mask_token='[MASK]')

        self.model = AutoModelForMaskedLM.from_pretrained(pretrained)
        self.model.eval()
Example 25
parser.add_argument('--bucket',
                    type=str,
                    default='NONE')

logger = logging.getLogger()
logger.setLevel(logging.INFO)

BOS = '<s>'
EOS = '</s>'
MASK = '<mask>'
NEWLINE = '<unused0>'
PAD = '<pad>'

TOKENIZER = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token=BOS,
    eos_token=EOS,
    unk_token='<unk>',
    pad_token=PAD,
    mask_token=MASK)

class CommentDataset(Dataset):
    def __init__(self, comments, max_len=32):
        self._data = comments
        self.bos = BOS
        self.eos = EOS
        self.mask = MASK
        self.pad = PAD
        self.max_len = max_len
        self.tokenizer = TOKENIZER

        temp = []

        for x in self._data: