# assumed imports and setup: `tr_dataset` (a DataFrame of parallel ko/en
# sentences) and `data_dir` are defined earlier in the script (not shown);
# the Stemmer/Vocab module paths are assumed to follow the qpair example below
import itertools
import pickle
import gluonnlp as nlp
from collections import Counter
from model.split import Stemmer
from model.utils import Vocab

# korean vocab
split_ko = Stemmer(language='ko')
count_ko = Counter(
    itertools.chain.from_iterable(tr_dataset['ko'].apply(
        split_ko.extract_stem).tolist()))
list_of_token_ko = sorted(
    token for token, cnt in count_ko.items() if cnt >= 15)
tmp_vocab = nlp.Vocab(Counter(list_of_token_ko),
                      bos_token=None,
                      eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()
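
# sanity check: each row of `array` is the embedding of the token at the
# same index in tmp_vocab (fastText wiki vectors are 300-dimensional)
assert array.shape == (len(tmp_vocab), 300)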

vocab_ko = Vocab(list_of_token_ko, bos_token=None, eos_token=None)
vocab_ko.embedding = array

with open(data_dir / 'vocab_ko.pkl', mode='wb') as io:
    pickle.dump(vocab_ko, io)
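
# minimal usage sketch, assuming the custom Vocab exposes a dict-like
# token_to_idx mapping (as gluonnlp's does); not part of the build script
with open(data_dir / 'vocab_ko.pkl', mode='rb') as io:
    loaded_vocab = pickle.load(io)
tokens = split_ko.extract_stem('모델이 문장을 처리합니다')
unk_idx = loaded_vocab.token_to_idx['<unk>']
indices = [loaded_vocab.token_to_idx.get(token, unk_idx) for token in tokens]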

# english vocab
split_en = Stemmer(language='en')
count_en = Counter(
    itertools.chain.from_iterable(tr_dataset['en'].apply(
        split_en.extract_stem).tolist()))
list_of_token_en = sorted(
    token for token, cnt in count_en.items() if cnt >= 15)
tmp_vocab = nlp.Vocab(Counter(list_of_token_en),
                      bos_token=None,
                      eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

# build and save the english vocab, mirroring the korean branch above;
# disabling bos/eos in tmp_vocab keeps the embedding rows aligned with
# vocab_en's indices
vocab_en = Vocab(list_of_token_en, bos_token=None, eos_token=None)
vocab_en.embedding = array

with open(data_dir / 'vocab_en.pkl', mode='wb') as io:
    pickle.dump(vocab_en, io)

# nsmc vocab (imports mirror the qpair example below; `tr` is the NSMC
# training DataFrame and `nsmc_dir`/`config` are set up earlier, not shown)
# extracting morphs from the sentences
list_of_tokens = tr["document"].apply(split_morphs).tolist()

# generating the vocab
token_counter = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=token_counter,
                      min_freq=10,
                      bos_token=None,
                      eos_token=None)

# connecting the pretrained SISG (fastText) embedding with the vocab
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko")
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab = Vocab(
    tmp_vocab.idx_to_token,
    padding_token="<pad>",
    unknown_token="<unk>",
    bos_token=None,
    eos_token=None,
)
vocab.embedding = array

# saving vocab
with open(nsmc_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(nsmc_dir / "vocab.pkl")})
config.save("conf/dataset/nsmc.json")
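
# downstream sketch: a training script can recover the vocab through the
# saved config (attribute access on Config follows the qpair example below)
config = Config("conf/dataset/nsmc.json")
with open(config.vocab, mode="rb") as io:
    vocab = pickle.load(io)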

# qpair vocab
import pandas as pd
import itertools
import pickle
import gluonnlp as nlp
from pathlib import Path
from collections import Counter
from model.split import split_morphs
from model.utils import Vocab
from utils import Config

qpair_dir = Path("qpair")
config = Config("conf/dataset/qpair.json")
train = pd.read_csv(config.train, sep="\t")

list_of_tokens_qa = train["question1"].apply(split_morphs).tolist()
list_of_tokens_qb = train["question2"].apply(split_morphs).tolist()
list_of_tokens = list_of_tokens_qa + list_of_tokens_qb

count_tokens = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=count_tokens, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko", load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)

vocab = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab.embedding = tmp_vocab.embedding.idx_to_vec.asnumpy()

with open(qpair_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(qpair_dir / "vocab.pkl")})
config.save("conf/dataset/qpair.json")
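
# minimal downstream sketch: numericalizing one question with the saved
# vocab (dict-like token_to_idx assumed, as in the sketches above)
with open(qpair_dir / "vocab.pkl", mode="rb") as io:
    loaded_vocab = pickle.load(io)
unk_idx = loaded_vocab.token_to_idx["<unk>"]
q1_indices = [loaded_vocab.token_to_idx.get(tok, unk_idx)
              for tok in split_morphs(train.loc[0, "question1"])]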