Example #1
import os
import logging

import torch
from transformers import GPT2Config, GPT2LMHeadModel

from new_tokenizer import MyTokenizer

# Module-level logger and device as the serving script would define them
# (inferred from usage; not shown in the original excerpt).
logger = logging.getLogger(__name__)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def model_fn(model_dir):
    logger.info('Loading the model.')

    vocab_file_path = os.path.join(model_dir, 'vocab.json')
    merge_file_path = os.path.join(model_dir, 'merges.txt')
    model_file_path = os.path.join(model_dir, 'lyric_model.bin')

    tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
    bos = tokenizer.convert_tokens_to_ids('<s>')
    eos = tokenizer.convert_tokens_to_ids('</s>')
    pad = tokenizer.convert_tokens_to_ids('<pad>')
    unk = tokenizer.convert_tokens_to_ids('<unk>')

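    # vocab_size 52003 presumably covers the 52000-token base vocabulary
    # plus the '<song>'/'</song>' special tokens and one spare row
    # (cf. the "+ 1" used in add_special_tokens_ in the later examples).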
    config = GPT2Config(vocab_size=52003,
                        resid_pdrop=0,
                        embd_pdrop=0,
                        attn_pdrop=0,
                        summary_first_dropout=0)

    model = GPT2LMHeadModel(config)

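    # strict=False tolerates missing/unexpected keys, e.g. when the
    # checkpoint predates a resize of the token embeddings.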
    model.load_state_dict(torch.load(model_file_path, map_location=device),
                          strict=False)
    model.to(device)

    return model, tokenizer
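For context, model_fn above is the model-loading hook of the SageMaker PyTorch serving container, and the (model, tokenizer) tuple it returns is handed to the prediction hook. Below is a minimal companion predict_fn sketch; the encode/decode methods on MyTokenizer are assumptions for illustration (only convert_tokens_to_ids appears in these excerpts):

def predict_fn(input_data, model_artifacts):
    # model_artifacts is the (model, tokenizer) tuple returned by model_fn.
    model, tokenizer = model_artifacts
    # encode()/decode() are assumed MyTokenizer methods.
    input_ids = torch.tensor([tokenizer.encode(input_data)]).to(device)
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=128,
            eos_token_id=tokenizer.convert_tokens_to_ids('</s>'),
            pad_token_id=tokenizer.convert_tokens_to_ids('<pad>'))
    return tokenizer.decode(output_ids[0].tolist())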
Example #2
from transformers import GPT2LMHeadModel, GPT2Config, AdamW
from new_tokenizer import MyTokenizer
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import json

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'

tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
config = GPT2Config(vocab_size=52000)
model = GPT2LMHeadModel(config)

model_dir = '../model/pytorch_model.bin'

model.load_state_dict(torch.load(model_dir), strict=False)
model.to('cuda')

ATTR_TO_SPECIAL_TOKEN = ['<social>', '<economy>', '<world>', '<science>', '<sports>', '<politics>', '<entertainment>', '<it>', '<title>', '</title>']
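# Korean news categories (society, economy, world, IT/science, sports,
# politics, entertainment, IT) mapped to their control tokens.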
category_map = {'사회':'<social>', '경제':'<economy>', '세계':'<world>', 'IT/과학':'<science>', '스포츠':'<sports>', '정치':'<politics>', '연예':'<entertainment>', 'IT':'<it>'}

def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    num_added_tokens = len(ATTR_TO_SPECIAL_TOKEN)
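    # Grow the embedding matrix to cover the newly added special tokens;
    # the extra "+ 1" apparently reserves one additional spare row.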
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens + 1)

add_special_tokens_(model, tokenizer)
b_title = tokenizer.convert_tokens_to_ids('<title>')
e_title = tokenizer.convert_tokens_to_ids('</title>')
Example #3
from transformers import GPT2LMHeadModel, GPT2Config
from new_tokenizer import MyTokenizer
import torch
import kss

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'

answer_tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
question_tokenizer = MyTokenizer(vocab_file_path, merge_file_path)

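# 52004 = 52000 base tokens + 3 answer tokens ('<answer>', '<sep>',
# '</answer>') + 1 spare row; 52005 = 52000 + 4 question/answer tokens
# ('<answer>', '</answer>', '<question>', '</question>') + 1.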
answer_config = GPT2Config(vocab_size=52004)
question_config = GPT2Config(vocab_size=52005)

answer_model = GPT2LMHeadModel(answer_config)
question_model = GPT2LMHeadModel(question_config)

answer_model_dir = '../KorGPT-2SampleModel/answer_model.bin'
question_model_dir = '../KorGPT-2SampleModel/question_model.bin'

answer_model.load_state_dict(torch.load(answer_model_dir), strict=False)
question_model.load_state_dict(torch.load(question_model_dir), strict=False)

answer_model.to('cpu')
question_model.to('cpu')

def add_special_tokens_(model, tokenizer, added_tokens):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(added_tokens)

added_answer_tokens = ['<answer>', '<sep>', '</answer>']
Example #4
category_map = {
    '사회': '<social>',
    '경제': '<economy>',
    '세계': '<world>',
    'IT/과학': '<science>',
    '스포츠': '<sports>',
    '정치': '<politics>',
    '연예': '<entertainment>',
    'IT': '<it>'
}

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'

tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
bos = tokenizer.convert_tokens_to_ids('<s>')
eos = tokenizer.convert_tokens_to_ids('</s>')
pad = tokenizer.convert_tokens_to_ids('<pad>')
unk = tokenizer.convert_tokens_to_ids('<unk>')

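# 52011 presumably = 52000 base tokens + 8 category tokens + 2 title
# tokens + 1 spare row (cf. ATTR_TO_SPECIAL_TOKEN in Example #2).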
config = GPT2Config(vocab_size=52011,
                    resid_pdrop=0,
                    embd_pdrop=0,
                    attn_pdrop=0,
                    summary_first_dropout=0)
model = GPT2LMHeadModel(config)

# model_dir = '../KorGPT-2SampleModel/lyric_model.bin'
model_dir = '../model/summary_model.bin'
Example #5
import logging

import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler
from transformers import (AdamW, GPT2DoubleHeadsModel, GPT2Config,
                          WEIGHTS_NAME, CONFIG_NAME, cached_path)

from new_tokenizer import MyTokenizer
#import time

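# Unlike the other examples, this script loads a single SentencePiece
# vocabulary (from KoGPT-2) rather than a BPE vocab.json/merges.txt pair.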
vocab_file_path = 'model/kogpt2_news_wiki_ko_cased_818bfa919d.spiece'
tokenizer = MyTokenizer(vocab_file_path)

SPECIAL_TOKENS = ["<s>", "</s>", "<sent1>", "<sent2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {
    'bos_token': '<s>',
    'eos_token': '</s>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<sent1>', '<sent2>']
}
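# Tensor names consumed by GPT2DoubleHeadsModel: the LM head trains on
# input_ids/lm_labels, the multiple-choice head on mc_token_ids/mc_labels.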
MODEL_INPUTS = [
    "input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"
]
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]

logger = logging.getLogger(__file__)
Example #6
from transformers import GPT2LMHeadModel, GPT2Config, AdamW
from new_tokenizer import MyTokenizer
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import kss

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'

tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
config = GPT2Config(vocab_size=52000)
model = GPT2LMHeadModel(config)

model_dir = '../KorGPT-2SampleModel/pytorch_model.bin'

model.load_state_dict(torch.load(model_dir), strict=False)
model.to('cuda')

ATTR_TO_SPECIAL_TOKEN = ['<answer>', '</answer>', '<question>', '</question>']


def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    num_added_tokens = len(ATTR_TO_SPECIAL_TOKEN)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens +
                                  num_added_tokens + 1)


add_special_tokens_(model, tokenizer)
Example #7
from transformers import GPT2LMHeadModel, GPT2Config
from new_tokenizer import MyTokenizer
import torch

ATTR_TO_SPECIAL_TOKEN = ['<song>', '</song>']

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'

tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
bos = tokenizer.convert_tokens_to_ids('<s>')
eos = tokenizer.convert_tokens_to_ids('</s>')
pad = tokenizer.convert_tokens_to_ids('<pad>')
unk = tokenizer.convert_tokens_to_ids('<unk>')

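# 52003 = 52000 base tokens + '<song>'/'</song>' + 1 spare row.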
config = GPT2Config(vocab_size=52003,
                    resid_pdrop=0,
                    embd_pdrop=0,
                    attn_pdrop=0,
                    summary_first_dropout=0)
model = GPT2LMHeadModel(config)

model_dir = '../KorGPT-2SampleModel/lyric_model.bin'

model.load_state_dict(torch.load(model_dir), strict=False)
model.to('cpu')


def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
Example #8
from transformers import GPT2LMHeadModel, GPT2Config, AdamW
from new_tokenizer import MyTokenizer
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import kss

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'

tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
config = GPT2Config(vocab_size=52000)
model = GPT2LMHeadModel(config)

model_dir = '../KorGPT-2SampleModel/pytorch_model.bin'

model.load_state_dict(torch.load(model_dir), strict=False)
model.to('cuda')

ATTR_TO_SPECIAL_TOKEN = ['<answer>', '<sep>', '</answer>']


def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    num_added_tokens = len(ATTR_TO_SPECIAL_TOKEN)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens +
                                  num_added_tokens + 1)


add_special_tokens_(model, tokenizer)
Example #9
# The snippet starts mid-script; `parser` is an argparse.ArgumentParser
# created earlier in the file.
parser.add_argument('--num-hidden-layers', type=int, default=6)
parser.add_argument('--type-vocab-size', type=int, default=1)
parser.add_argument('--token-max-len', type=int, default=512)

# Data and model checkpoint/output directories from the container environment
parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAINING'])

args = parser.parse_args()

vocab_file_path = os.path.join(args.data_dir, 'tokenizer/vocab.json')
merge_file_path = os.path.join(args.data_dir, 'tokenizer/merges.txt')
model_file = os.path.join(args.data_dir, 'KorGPT-2SampleModel/pytorch_model.bin')

tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
tokenizer.save(args.model_dir) # Save it to model dir for generation

config = GPT2Config(vocab_size=52000)
model = GPT2LMHeadModel(config)
  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.load_state_dict(torch.load(model_file, map_location=device), strict=False)
model.to("cpu").eval()  # keep the model on CPU, in eval mode, to conserve GPU memory
get_gpu_memory()  # reporting helper defined elsewhere in the script

ATTR_TO_SPECIAL_TOKEN = ['<song>', '</song>']

def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()