Example #1
    def __init__(self, prior_config, rl_config, beam_size=5):
        self.prior_config = prior_config
        self.rl_config = rl_config
        self.rl_config.beam_size = beam_size

        print('Loading Vocabulary...')
        self.vocab = Vocab()
        self.vocab.load(prior_config.word2id_path, prior_config.id2word_path)
        self.prior_config.vocab_size = self.vocab.vocab_size
        self.rl_config.vocab_size = self.vocab.vocab_size
        print(f'Vocabulary size: {self.vocab.vocab_size}')
        
        self.eval_data = self.get_data_loader()
        self.build_models()
Example #2
    def __init__(self, config, val_config):
        self.config = config
        self.val_config = val_config

        vocab = Vocab()
        vocab.load(config.word2id_path, config.id2word_path)
        self.vocab = vocab
        self.config.vocab_size = vocab.vocab_size

        # To initialize simulated conversations
        self.start_sentences = self.load_sentences(self.config.dataset_dir)
        self.eval_data = self.get_data_loader(train=False)
        self.build_models()

        if self.config.load_rl_ckpt:
            self.load_models()

        self.set_up_optimizers()
        self.set_up_summary()
        self.set_up_logging()

        if self.config.rl_batch_size == self.config.beam_size:
            raise ValueError('Decoding breaks if batch_size == beam_size')
Example #3
# extracting morphs from sentences
list_of_tokens = tr["document"].apply(split_morphs).tolist()

# generating the vocab
token_counter = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=token_counter,
                      min_freq=10,
                      bos_token=None,
                      eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko")
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab = Vocab(
    tmp_vocab.idx_to_token,
    padding_token="<pad>",
    unknown_token="<unk>",
    bos_token=None,
    eos_token=None,
)
vocab.embedding = array

# saving vocab
with open(nsmc_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(nsmc_dir / "vocab.pkl")})
config.save("conf/dataset/nsmc.json")
Example #4
# korean vocab
split_ko = Stemmer(language='ko')
count_ko = Counter(
    itertools.chain.from_iterable(tr_dataset['ko'].apply(
        split_ko.extract_stem).tolist()))
list_of_token_ko = sorted(
    [token[0] for token in count_ko.items() if token[1] >= 15])
tmp_vocab = nlp.Vocab(Counter(list_of_token_ko),
                      bos_token=None,
                      eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_ko = Vocab(list_of_token_ko, bos_token=None, eos_token=None)
vocab_ko.embedding = array

with open(data_dir / 'vocab_ko.pkl', mode='wb') as io:
    pickle.dump(vocab_ko, io)

# english vocab
split_en = Stemmer(language='en')
count_en = Counter(
    itertools.chain.from_iterable(tr_dataset['en'].apply(
        split_en.extract_stem).tolist()))
list_of_token_en = [token[0] for token in count_en.items() if token[1] >= 15]
tmp_vocab = nlp.Vocab(Counter(list_of_token_en))
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()
Example #5
            urlretrieve('https://kobert.blob.core.windows.net/models/kobert/pytorch/pytorch_kobert_2439f391a6.params',
                        filename=ptr_bert_path)
            ptr_bert = torch.load(ptr_bert_path)
            ptr_bert = OrderedDict([(('bert.' + k), ptr_bert.get(k)) for k in ptr_bert.keys()])
            torch.save(ptr_bert, ptr_bert_path)
        else:
            print('You already have pytorch_model_skt.bin!')

        if not ptr_vocab_path.exists():
            urlretrieve('https://kobert.blob.core.windows.net/models/kobert/vocab/kobertvocab_f38b8a4d6d.json',
                        filename=ptr_vocab_path)
            ptr_bert_vocab = BERTVocab.from_json(ptr_vocab_path.open(mode='rt').read())
            vocab = Vocab(ptr_bert_vocab.idx_to_token,
                          padding_token="[PAD]",
                          unknown_token="[UNK]",
                          bos_token=None,
                          eos_token=None,
                          reserved_tokens=["[CLS]", "[SEP]", "[MASK]"],
                          token_to_idx=ptr_bert_vocab.token_to_idx)

            # save vocab
            with open(ptr_vocab_path.with_suffix('.pkl'), mode="wb") as io:
                pickle.dump(vocab, io)
        else:
            print('You already have pytorch_model_skt_vocab.json!')

        if not ptr_tokenizer_path.exists():
            urlretrieve('https://kobert.blob.core.windows.net/models/kobert/tokenizer/tokenizer_78b3253a26.model',
                        filename=ptr_tokenizer_path)
        else:
            print('You already have pytorch_model_skt_tokenizer.model')
Example #6
import json
import pickle
from model.utils import Vocab
from bert.tokenization import BertTokenizer

with open('experiment/config.json') as f:
    params = json.loads(f.read())

# loading BertTokenizer
ptr_tokenizer = BertTokenizer.from_pretrained('bert/vocab.korean.rawtext.list',
                                              do_lower_case=False)
idx_to_token = list(ptr_tokenizer.vocab.keys())

# generate vocab
token_vocab = Vocab(idx_to_token,
                    padding_token='[PAD]',
                    unknown_token='[UNK]',
                    bos_token=None,
                    eos_token=None,
                    reserved_tokens=['[CLS]', '[SEP]', '[MASK]'],
                    unknown_token_idx=1)

# save vocab
token_vocab_path = params['filepath'].get('token_vocab')
with open(token_vocab_path, 'wb') as f:
    pickle.dump(token_vocab, f)
Example #7
import pandas as pd
import itertools
import pickle
import gluonnlp as nlp
from pathlib import Path
from collections import Counter
from model.split import split_morphs
from model.utils import Vocab
from utils import Config

qpair_dir = Path("qpair")
config = Config("conf/dataset/qpair.json")
train = pd.read_csv(config.train, sep="\t")

list_of_tokens_qa = train["question1"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens_qb = train["question2"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens = list_of_tokens_qa + list_of_tokens_qb

count_tokens = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=count_tokens, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko", load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)

vocab = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab.embedding = tmp_vocab.embedding.idx_to_vec.asnumpy()

with open(qpair_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(qpair_dir / "vocab.pkl")})
config.save("conf/dataset/qpair.json")
Example #8
def main():
    """
    here is the plan: for each dialogue create a history sequence of sentences
    seperated by <s>. The sentences in the history must occur in a short time
    span from another so they are relevant. The last sentence becomes the response
    where the response must also be in the span
    :return:
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-dataset_dir",
        default="./datasets/personachat/raw",
        type=str,
        required=False,
        help="The input data dir. Should contain the xml for the task.")
    parser.add_argument("-output_dir",
                        default="./datasets/personachat/",
                        type=str,
                        required=False,
                        help="The output data dir.")
    parser.add_argument("-type",
                        default="none_original",
                        type=str,
                        required=False,
                        help="The genres you would like to use.")
    parser.add_argument("-max_sentence_tokens",
                        default=30,
                        type=int,
                        help="the maximum amout of sentence tokens")
    parser.add_argument(
        "-a_nice_note",
        default="only dialogues 1-10",
        type=str,
        required=False,
        help="leave a nice lil note for yourself in the future")

    parser.add_argument(
        '-train_split',
        default=0.9,
        type=float,
        help=
        'fraction of dataset to use for training, remainder is halved for val & test'
    )

    parser.add_argument('-vocab_size',
                        default=20000,
                        type=int,
                        help='maximum size of the vocabulary for training')

    args = parser.parse_args()

    filename = os.path.join(args.dataset_dir, "train_{}.txt".format(args.type))

    conversations = create_dialogues(filename, args.max_sentence_tokens)

    for conversation in conversations:
        for utterance in conversation:
            if len(utterance) != args.max_sentence_tokens:
                print('Length of utterance not equal max: %s' % len(utterance))
                exit()

    print(conversations[0])

    # shuffle dataset

    random.seed('seed')
    random.shuffle(conversations)

    print('Number of conversations: %s' % len(conversations))

    mean_n_convos = sum([len(conv)
                         for conv in conversations]) / len(conversations)
    print('Average utterances per conversation: %s' % mean_n_convos)

    # this is format needed to train dialogue models on this domain
    def format_for_dialogue(conversations):
        conversation_length = [len(conv) for conv in conversations]
        sentence_length = [[
            sum([1 for token in sent if token != '<pad>']) for sent in conv
        ] for conv in conversations]
        sentences = conversations
        return conversation_length, sentence_length, sentences

    val_idx = int(len(conversations) * args.train_split)
    test_idx = (len(conversations) + val_idx) // 2
    print(val_idx)

    train_convos = conversations[:val_idx]
    val_convos = conversations[val_idx:test_idx]
    test_convos = conversations[test_idx:]

    # construct vocab
    vocab = Vocab()
    vocab.add_dataframe(train_convos, tokenized=True)
    vocab.update(args.vocab_size)
    print('Vocab size: %s' % len(vocab))

    # make sure the output directory exists before pickling the vocab
    os.makedirs(args.output_dir, exist_ok=True)

    word2id_path = os.path.join(args.output_dir, 'word2id.pkl')
    id2word_path = os.path.join(args.output_dir, 'id2word.pkl')
    vocab.pickle(word2id_path, id2word_path)

    print('Split: train %s, val %s, test %s' %
          (len(train_convos), len(val_convos), len(test_convos)))

    train_convo_len, train_sent_len, train_sent = format_for_dialogue(
        train_convos)
    print('Example data')
    print(train_convo_len[0])
    print(train_sent_len[0])
    print(train_sent[0])
    print()

    os.makedirs(os.path.join(args.output_dir, 'train'), exist_ok=True)
    pickle.dump(
        train_convo_len,
        open(os.path.join(args.output_dir, 'train', 'conversation_length.pkl'),
             'wb'))
    pickle.dump(
        train_sent_len,
        open(os.path.join(args.output_dir, 'train', 'sentence_length.pkl'),
             'wb'))
    pickle.dump(
        train_sent,
        open(os.path.join(args.output_dir, 'train', 'sentences.pkl'), 'wb'))

    val_convo_len, val_sent_len, val_sent = format_for_dialogue(val_convos)
    os.makedirs(os.path.join(args.output_dir, 'valid'), exist_ok=True)
    pickle.dump(
        val_convo_len,
        open(os.path.join(args.output_dir, 'valid', 'conversation_length.pkl'),
             'wb'))
    pickle.dump(
        val_sent_len,
        open(os.path.join(args.output_dir, 'valid', 'sentence_length.pkl'),
             'wb'))
    pickle.dump(
        val_sent,
        open(os.path.join(args.output_dir, 'valid', 'sentences.pkl'), 'wb'))

    test_convo_len, test_sent_len, test_sent = format_for_dialogue(test_convos)
    os.makedirs(os.path.join(args.output_dir, 'test'), exist_ok=True)
    pickle.dump(
        test_convo_len,
        open(os.path.join(args.output_dir, 'test', 'conversation_length.pkl'),
             'wb'))
    pickle.dump(
        test_sent_len,
        open(os.path.join(args.output_dir, 'test', 'sentence_length.pkl'),
             'wb'))
    pickle.dump(
        test_sent,
        open(os.path.join(args.output_dir, 'test', 'sentences.pkl'), 'wb'))
Example #9
import json
import pickle
from model.utils import Vocab
from bert.tokenization import BertTokenizer


with open('experiment/config.json') as f:
    params = json.loads(f.read())

# loading BertTokenizer
ptr_tokenizer = BertTokenizer.from_pretrained('bert/vocab.korean.rawtext.list', do_lower_case=False)
idx_to_token = list(ptr_tokenizer.vocab.keys())

# generate vocab
token_vocab = Vocab(idx_to_token, padding_token='[PAD]', unknown_token='[UNK]', bos_token=None,
                    eos_token=None, reserved_tokens=['[CLS]', '[SEP]', '[MASK]'], unknown_token_idx=1)
label_vocab = Vocab(['<split>', '<non_split>'], unknown_token=None, bos_token=None, eos_token=None)


# save vocab
token_vocab_path = params['filepath'].get('token_vocab')
label_vocab_path = params['filepath'].get('label_vocab')
with open(token_vocab_path, 'wb') as f:
    pickle.dump(token_vocab, f)
with open(label_vocab_path, 'wb') as f:
    pickle.dump(label_vocab, f)
Example #10
    def __init__(self,
                 id,
                 name,
                 checkpoint_path,
                 max_conversation_length=5,
                 max_sentence_length=30,
                 is_test_bot=False,
                 rl=False,
                 safe_mode=True):
        """
        All chatbots should extend this class and be registered with the @registerbot decorator
        :param id: An id string, must be unique!
        :param name: A user-friendly string shown to the end user to identify the chatbot. Should be unique.
        :param checkpoint_path: Directory where the trained model checkpoint is saved.
        :param max_conversation_length: Maximum number of conversation turns to condition on.
        :param max_sentence_length: Maximum number of tokens per sentence.
        :param is_test_bot: If True, this bot can be chosen from the list of
            bots on the /dialogadmins screen, but will never be randomly
            assigned to users landing on the home page.
        """
        self.id = id
        self.name = name
        self.checkpoint_path = checkpoint_path
        self.max_conversation_length = max_conversation_length
        self.max_sentence_length = max_sentence_length
        self.is_test_bot = is_test_bot
        self.safe_mode = safe_mode

        print("\n\nCreating chatbot", name)

        self.config = get_config_from_dir(checkpoint_path,
                                          mode='test',
                                          load_rl_ckpt=rl)
        self.config.beam_size = 5

        print('Loading Vocabulary...')
        self.vocab = Vocab()
        self.vocab.load(self.config.word2id_path, self.config.id2word_path)
        print(f'Vocabulary size: {self.vocab.vocab_size}')

        self.config.vocab_size = self.vocab.vocab_size

        # If checkpoint is for an emotion model, load that pickle file
        emotion_sentences = None
        if self.config.emotion:
            emotion_sentences = load_pickle(self.config.emojis_path)

        # Load infersent embeddings if necessary
        infersent_sentences = None
        if self.config.infersent:
            print('Loading infersent sentence embeddings...')
            infersent_sentences = load_pickle(self.config.infersent_path)
            embedding_size = infersent_sentences[0][0].shape[0]
            self.config.infersent_output_size = embedding_size

        self.data_loader = get_loader(
            sentences=load_pickle(self.config.sentences_path),
            conversation_length=load_pickle(
                self.config.conversation_length_path),
            sentence_length=load_pickle(self.config.sentence_length_path),
            vocab=self.vocab,
            batch_size=self.config.batch_size,
            emojis=emotion_sentences)

        if self.config.model in VariationalModels:
            self.solver = VariationalSolver(self.config,
                                            None,
                                            self.data_loader,
                                            vocab=self.vocab,
                                            is_train=False)
        elif self.config.model == 'Transformer':
            self.solver = ParlAISolver(self.config)
        else:
            self.solver = Solver(self.config,
                                 None,
                                 self.data_loader,
                                 vocab=self.vocab,
                                 is_train=False)

        self.solver.build()
Example #11
list_of_tokens = [
    token_count[0] for token_count in token_counter.items()
    if token_count[1] >= min_freq
]
list_of_tokens = sorted(list_of_tokens)
list_of_tokens.insert(0, '<pad>')
list_of_tokens.insert(0, '<unk>')

tmp_vocab = nlp.Vocab(counter=Counter(list_of_tokens),
                      min_freq=1,
                      bos_token=None,
                      eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab = Vocab(list_of_tokens,
              padding_token='<pad>',
              unknown_token='<unk>',
              bos_token=None,
              eos_token=None)
vocab.embedding = array

# saving vocab
with open('data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)
data_config.vocab = 'data/vocab.pkl'
data_config.save('data/config.json')
Example #12
            line = line.strip()

            if line:
                data.append(line.split('\t')[1:])
            else:
                dataset.append([list(elm) for elm in zip(*data)])
                data = []
                continue

    except StopIteration:
        print('parsing is done')

label_counter = nlp.data.count_tokens(
    itertools.chain.from_iterable(map(lambda elm: elm[1], dataset)))
tmp_label_vocab = nlp.Vocab(label_counter, unknown_token=None)
label_vocab = Vocab(tmp_label_vocab.idx_to_token, unknown_token=None)

with open('./data/label_vocab.pkl', mode='wb') as io:
    pickle.dump(label_vocab, io)

tr, val = train_test_split(dataset, test_size=.1, random_state=777)
token_counter = nlp.data.count_tokens(
    itertools.chain.from_iterable(map(lambda elm: elm[0], tr)))
tmp_token_vocab = nlp.Vocab(token_counter, min_freq=10)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_token_vocab.set_embedding(ptr_embedding)
token_vocab = Vocab(tmp_token_vocab.idx_to_token)
token_vocab.embedding = tmp_token_vocab.embedding.idx_to_vec.asnumpy()

with open('./data/token_vocab.pkl', mode='wb') as io:
    pickle.dump(token_vocab, io)
Example #13
import pickle
from pathlib import Path
from model.utils import Vocab
from utils import Config

LIST_OF_CHOSUNG = [
    "ㄱ", "ㄲ", "ㄴ", "ㄷ", "ㄸ", "ㄹ", "ㅁ", "ㅂ", "ㅃ", "ㅅ", "ㅆ", "ㅇ", "ㅈ", "ㅉ", "ㅊ",
    "ㅋ", "ㅌ", "ㅍ", "ㅎ"
]
LIST_OF_JUNGSUNG = [
    "ㅏ", "ㅐ", "ㅑ", "ㅒ", "ㅓ", "ㅔ", "ㅕ", "ㅖ", "ㅗ", "ㅘ", "ㅙ", "ㅚ", "ㅛ", "ㅜ", "ㅝ",
    "ㅞ", "ㅟ", "ㅠ", "ㅡ", "ㅢ", "ㅣ"
]
LIST_OF_JONGSUNG = [
    " ", "ㄱ", "ㄲ", "ㄳ", "ㄴ", "ㄵ", "ㄶ", "ㄷ", "ㄹ", "ㄺ", "ㄻ", "ㄼ", "ㄽ", "ㄾ", "ㄿ",
    "ㅀ", "ㅁ", "ㅂ", "ㅄ", "ㅅ", "ㅆ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"
]

LIST_OF_JAMOS = sorted(
    set(LIST_OF_CHOSUNG + LIST_OF_JUNGSUNG + LIST_OF_JONGSUNG))
vocab = Vocab(list_of_tokens=LIST_OF_JAMOS, bos_token=None, eos_token=None)
nsmc_dir = Path("nsmc")

with open(nsmc_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config = Config("conf/dataset/nsmc.json")
config.update({"vocab": str(nsmc_dir / "vocab.pkl")})
config.save("conf/dataset/nsmc.json")
Example #14
import pickle
from model.utils import Vocab
from pretrained.tokenization import BertTokenizer

# loading BertTokenizer
ptr_tokenizer = BertTokenizer.from_pretrained(
    'pretrained/vocab.korean.rawtext.list', do_lower_case=False)
list_of_tokens = list(ptr_tokenizer.vocab.keys())

# generate vocab
vocab = Vocab(list_of_tokens,
              padding_token='[PAD]',
              unknown_token='[UNK]',
              bos_token=None,
              eos_token=None,
              reserved_tokens=['[CLS]', '[SEP]', '[MASK]'],
              token_to_idx={'[UNK]': 1})

# save vocab
with open('pretrained/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)
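A minimal usage sketch (an addition, not part of the original snippet): tokenize a sentence with the same BertTokenizer and map the resulting wordpieces to indices. A plain dict built from list_of_tokens stands in here for whatever lookup helpers model.utils.Vocab provides.

# a minimal sketch (not from the original script); reuses ptr_tokenizer and
# list_of_tokens defined above
token_to_idx = {token: idx for idx, token in enumerate(list_of_tokens)}

wordpieces = ptr_tokenizer.tokenize("잘 지내고 있니?")  # arbitrary example sentence
indices = [token_to_idx.get(piece, token_to_idx['[UNK]']) for piece in wordpieces]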
Example #15
        # [n_conversations, conversation_length (various)]

        conversation_length = [
            min(len(conversation), max_conv_len)
            for conversation in conversations
        ]

        sentences, sentence_length = preprocess_utils.pad_sentences(
            conversations,
            max_sentence_length=max_sent_len,
            max_conversation_length=max_conv_len)

        print('Saving preprocessed data at', split_data_dir)
        to_pickle(conversation_length,
                  split_data_dir.joinpath('conversation_length.pkl'))
        to_pickle(conversations, split_data_dir.joinpath('sentences.pkl'))
        to_pickle(sentence_length,
                  split_data_dir.joinpath('sentence_length.pkl'))

        if split_type == 'train':
            print('Saving Vocabulary...')
            vocab = Vocab(tokenizer)
            vocab.add_dataframe(conversations)
            vocab.update(max_size=max_vocab_size, min_freq=min_freq)

            print('Vocabulary size: ', len(vocab))
            vocab.pickle(ubuntu_dir.joinpath('word2id.pkl'),
                         ubuntu_dir.joinpath('id2word.pkl'))

        print('Done!')
Example #16
train = pd.read_csv(config.train, sep="\t")

list_of_tokens_qa = train["question1"].apply(
    lambda sen: split_morphs(sen)).tolist()
list_of_tokens_qb = train["question2"].apply(
    lambda sen: split_morphs(sen)).tolist()
list_of_tokens = list_of_tokens_qa + list_of_tokens_qb

count_tokens = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=count_tokens, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create("fasttext",
                                     source="wiki.ko",
                                     load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)

morph_vocab = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
morph_vocab.embedding = tmp_vocab.embedding.idx_to_vec.asnumpy()

with open(qpair_dir / "morph_vocab.pkl", mode="wb") as io:
    pickle.dump(morph_vocab, io)

config.update({"coarse_vocab": str(qpair_dir / "morph_vocab.pkl")})

# jamo
chosung_list = [
    'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ',
    'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]

jungsung_list = [
    'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ',
Example #17
import pickle
from model.utils import Vocab

chosung_list = [
    'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ',
    'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]
jungsung_list = [
    'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ',
    'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
]
jongsung_list = [
    ' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ',
    'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]

list_of_jamos = sorted(set(chosung_list + jungsung_list + jongsung_list))
vocab = Vocab(list_of_jamos, bos_token=None, eos_token=None)

with open('data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)
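As an illustration of how such a jamo vocabulary is typically used, the sketch below decomposes precomposed Hangul syllables into (chosung, jungsung, jongsung) characters with standard Unicode arithmetic; the decompose helper is an addition, not part of the original snippet.

# a minimal sketch (not from the original script): split Hangul syllables into
# the jamo lists defined above via Unicode arithmetic
HANGUL_BASE = 0xAC00  # code point of the first precomposed syllable, '가'

def decompose(char):
    code = ord(char) - HANGUL_BASE
    if not 0 <= code < 11172:  # not a precomposed Hangul syllable
        return [char]
    cho, rest = divmod(code, 21 * 28)
    jung, jong = divmod(rest, 28)
    # jongsung_list[0] is ' ', marking the absence of a final consonant
    return [chosung_list[cho], jungsung_list[jung], jongsung_list[jong]]

# decompose('한') -> ['ㅎ', 'ㅏ', 'ㄴ']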
Example #18
config = Config("conf/dataset/sample.json")
tr = pd.read_csv(config.train, sep='\t')

# korean vocab
split_ko = Stemmer(language='ko')
count_ko = Counter(
    itertools.chain.from_iterable(tr['ko'].apply(
        split_ko.extract_stem).tolist()))
tmp_vocab = nlp.Vocab(count_ko, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext',
                                     source='wiki.ko',
                                     load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_ko = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab_ko.embedding = array
vocab_ko_filepath = sample_dir / "vocab_ko.pkl"
config.update({"source_vocab": str(vocab_ko_filepath)})

with open(vocab_ko_filepath, mode='wb') as io:
    pickle.dump(vocab_ko, io)

# english vocab
split_en = Stemmer(language='en')
count_en = Counter(
    itertools.chain.from_iterable(tr['en'].apply(
        split_en.extract_stem).tolist()))
tmp_vocab = nlp.Vocab(count_en)
ptr_embedding = nlp.embedding.create('fasttext',
                                     source='wiki.simple',
Example #19
def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', type=str, default=None)
    parser.add_argument('--mode', type=str, default='test')  # or valid
    kwargs = parser.parse_args()

    config = get_config_from_dir(kwargs.checkpoint, mode=kwargs.mode)
    print(config)

    print('Loading Vocabulary...')
    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    print(f'Vocabulary size: {vocab.vocab_size}')

    config.vocab_size = vocab.vocab_size

    emotion_sentences = None
    if config.emotion:
        emotion_sentences = load_pickle(config.emojis_path)

    # Load infersent embeddings if necessary
    infersent_sentences = None
    if config.infersent:
        print('Loading infersent sentence embeddings...')
        infersent_sentences = load_pickle(config.infersent_path)
        embedding_size = infersent_sentences[0][0].shape[0]
Example #20
    else:
        print("Already you have {}".format(config_filename))
    print("Saving the config of {} is done.".format(args.type))

    # saving vocab of pretraining model
    ptr_tokenizer = BertTokenizer.from_pretrained(
        args.type, do_lower_case="uncased" in args.type
    )

    idx_to_token = list(ptr_tokenizer.vocab.keys())
    token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}
    vocab = Vocab(
        list_of_tokens=idx_to_token,
        unknown_token="[UNK]",
        padding_token="[PAD]",
        bos_token=None,
        eos_token=None,
        reserved_tokens=["[CLS]", "[SEP]", "[MASK]"],
        token_to_idx=token_to_idx
    )
    vocab_filename = "{}-vocab.pkl".format(args.type)
    vocab_filepath = ptr_dir / vocab_filename

    if not vocab_filepath.exists():
        with open(vocab_filepath, mode="wb") as io:
            pickle.dump(vocab, io)
    else:
        print("Already you have {}".format(vocab_filename))

    print("Saving the vocab of {} is done".format(args.type))