Example 1
import pickle

# NOTE: Tokenizer and Vocab are assumed to come from this project's own
# modules; their import lines are not shown in the snippet.


def main(config):
    print(config)

    list_of_tokens = []
    if config.is_tokenized:
        # read tokens
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += line.strip().split()
    else:
        # select tokenizer
        if config.tokenizer == 'mecab':
            from konlpy.tag import Mecab
            tokenizer = Tokenizer(tokenization_fn=Mecab().morphs)
        else:
            # without this branch, `tokenizer` would be undefined below
            raise ValueError('unsupported tokenizer: {}'.format(config.tokenizer))

        # tokenization & read tokens
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += tokenizer.tokenize(line.strip())

    # build vocabulary
    vocab = Vocab(list_of_tokens=list_of_tokens,
                  unk_token=config.unk_token,
                  pad_token=config.pad_token,
                  bos_token=config.bos_token,
                  eos_token=config.eos_token,
                  min_freq=config.min_freq,
                  lower=config.lower)
    vocab.build()
    print('Vocabulary size: ', len(vocab))

    # save vocabulary
    with open(config.vocab, 'wb') as writer:
        pickle.dump(vocab, writer)
    print('Vocabulary saved to', config.vocab)
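
Both snippets lean on `Tokenizer` and `Vocab` classes defined elsewhere in the project. Below is a minimal sketch of the interface the call sites imply; the class bodies, the id ordering, and the frequency handling are assumptions, not the project's actual implementation (Example 2's extra `is_sentence`/`max_seq_length` arguments are omitted here):

from collections import Counter


class Tokenizer:
    """Thin wrapper that applies a tokenization function to a string."""

    def __init__(self, tokenization_fn):
        self.tokenization_fn = tokenization_fn

    def tokenize(self, text):
        # returns a list of tokens, e.g. Mecab().morphs(text)
        return self.tokenization_fn(text)


class Vocab:
    """Maps tokens to integer ids, with special tokens and a frequency cutoff."""

    def __init__(self, list_of_tokens, unk_token='<unk>', pad_token='<pad>',
                 bos_token='<bos>', eos_token='<eos>', min_freq=1, lower=True):
        self.list_of_tokens = list_of_tokens
        self.special_tokens = [pad_token, unk_token, bos_token, eos_token]
        self.min_freq = min_freq
        self.lower = lower
        self.token_to_idx = {}

    def build(self):
        tokens = ([t.lower() for t in self.list_of_tokens]
                  if self.lower else self.list_of_tokens)
        counts = Counter(tokens)
        # special tokens first, then corpus tokens by descending frequency
        for token in self.special_tokens:
            self.token_to_idx[token] = len(self.token_to_idx)
        for token, freq in counts.most_common():
            if freq >= self.min_freq and token not in self.token_to_idx:
                self.token_to_idx[token] = len(self.token_to_idx)

    def __len__(self):
        return len(self.token_to_idx)

An instance built this way is a plain Python object, so pickling it with `pickle.dump`, as the snippet does, works without extra machinery.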
Example 2
    # select tokenizer (TOKENIZER is a list of supported tokenizer names defined elsewhere)
    if config.tokenizer == TOKENIZER[0]:
        from nltk.tokenize import word_tokenize
        tokenization_fn = word_tokenize
    elif config.tokenizer == TOKENIZER[1]:
        from konlpy.tag import Mecab
        tokenization_fn = Mecab().morphs
    
    tokenizer = Tokenizer(tokenization_fn=tokenization_fn,
                          is_sentence=config.is_sentence,
                          max_seq_length=config.max_seq_length)

    # Tokenization & read tokens
    list_of_tokens = []
    with open(config.corpus, 'r', encoding='utf-8', errors='ignore') as reader:
        for li, line in enumerate(reader):
            # keep everything after the first tab-separated field
            text = ' '.join(line.split('\t')[1:]).strip()
            list_of_tokens += tokenizer.tokenize(text)

    # Build vocabulary
    vocab = Vocab(list_of_tokens=list_of_tokens,
                  unk_token=config.unk_token,
                  pad_token=config.pad_token,
                  bos_token=config.bos_token,
                  eos_token=config.eos_token,
                  min_freq=config.min_freq,
                  lower=config.lower)
    vocab.build()
    if config.pretrained_vectors:
        pretrained_vectors = load_pretrained(fname=config.pretrained_vectors)
        vocab.from_pretrained(pretrained_vectors=pretrained_vectors)
    print('Vocabulary size: ', len(vocab))
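
`load_pretrained` and `Vocab.from_pretrained` are likewise project-level helpers whose definitions are not shown. As a sketch of what the call site suggests, here is a loader for fastText-style whitespace-separated `.vec` text files; the return type (a {token: vector} dict), the header handling, and the dtype are assumptions:

import numpy as np


def load_pretrained(fname):
    """Read a word-vector text file into a {token: np.ndarray} dict.
    Skips a leading "<count> <dim>" header line if one is present."""
    pretrained_vectors = {}
    with open(fname, 'r', encoding='utf-8', errors='ignore') as reader:
        for li, line in enumerate(reader):
            fields = line.rstrip().split(' ')
            if li == 0 and len(fields) == 2:
                continue  # header line with vocab size and dimensionality
            token, values = fields[0], fields[1:]
            pretrained_vectors[token] = np.asarray(values, dtype=np.float32)
    return pretrained_vectors

With such a mapping, `from_pretrained` would typically copy each in-vocabulary token's vector into the embedding matrix and leave tokens without a pretrained vector at their random initialization.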