Example 1
def test_vocabulary_getitem():
    counter = nlp.data.utils.Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = nlp.Vocab(counter, max_size=None, min_freq=1, unknown_token='<unk>',
                      bos_token=None, eos_token=None, reserved_tokens=None)

    i1 = vocab['c']
    assert i1 == 2
    assert vocab.to_indices('c') == 2

    i2 = vocab[['c']]
    assert i2 == [2]
    assert vocab.to_indices(['c']) == [2]

    i3 = vocab[['<unk>', 'non-exist']]
    assert i3 == [0, 0]
    assert vocab.to_indices(['<unk>', 'non-exist']) == [0, 0]

    i4 = vocab[['a', 'non-exist', 'a', 'b']]
    assert i4 == [4, 0, 4, 3]
    assert vocab.to_indices(['a', 'non-exist', 'a', 'b']) == [4, 0, 4, 3]

    no_unk_vocab = nlp.Vocab(counter, max_size=None, min_freq=1, unknown_token=None,
                             bos_token=None, eos_token=None, reserved_tokens=None)
    assert no_unk_vocab['c'] == 1
    assert no_unk_vocab.to_indices('c') == 1

    assert no_unk_vocab[['c']] == [1]
    assert no_unk_vocab.to_indices(['c']) == [1]

    for words in [['<unk>', 'non-exist'], ['a', 'non-exist', 'a', 'b']]:
        with pytest.raises(KeyError):
            no_unk_vocab.to_indices(words)
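A minimal sketch of the index layout those assertions imply, assuming GluonNLP's default padding_token='<pad>' (which the test above does not override): special tokens come first, then the remaining tokens in order of decreasing frequency.

import gluonnlp as nlp

counter = nlp.data.utils.Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
vocab = nlp.Vocab(counter, unknown_token='<unk>', bos_token=None, eos_token=None)
# Expected layout given the asserts above:
# ['<unk>', '<pad>', 'c', 'b', 'a', 'some_word$']
print(vocab.idx_to_token)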
Example 2
def test_vocab_serialization():
    # Preserving unknown_token behaviour
    vocab = nlp.Vocab(unknown_token=None)
    with pytest.raises(KeyError):
        vocab['hello']
    loaded_vocab = nlp.Vocab.from_json(vocab.to_json())
    with pytest.raises(KeyError):
        loaded_vocab['hello']

    vocab = nlp.Vocab(unknown_token='abc')
    vocab['hello']
    loaded_vocab = nlp.Vocab.from_json(vocab.to_json())
    loaded_vocab['hello']
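A minimal round-trip sketch using only the APIs exercised above: the JSON serialization preserves the token-to-index mapping.

import gluonnlp as nlp

vocab = nlp.Vocab(nlp.data.count_tokens(['a', 'a', 'b']))
restored = nlp.Vocab.from_json(vocab.to_json())
# The restored vocabulary keeps the same token ordering and indices.
assert restored.idx_to_token == vocab.idx_to_token
assert restored['a'] == vocab['a']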
Example 3
def _build_vocab(data_name, train_dataset, test_dataset, dev_dataset,
                 model_name):
    all_token = []
    max_len = 0
    for dataset in (train_dataset, dev_dataset, test_dataset):
        for line in dataset:
            line = _clean_str(line[0], data_name).split()
            max_len = max_len if max_len > len(line) else len(line)
            all_token.extend(line)
    vocab = nlp.Vocab(nlp.data.count_tokens(all_token))
    if (model_name == 'rand'):
        emb = nlp.embedding.TokenEmbedding()
        emb[emb.unknown_token] = nd.zeros(300)
        vocab.set_embedding(emb)
    else:
        vocab.set_embedding(
            nlp.embedding.create('Word2Vec',
                                 source='GoogleNews-vectors-negative300'))
    for word in vocab.embedding._idx_to_token:
        if (vocab.embedding[word] == nd.zeros(300)).sum() == 300:
            vocab.embedding[word] = nd.random.uniform(0, 0.05, 300)
    vocab.embedding['<unk>'] = nd.random.uniform(0, 0.05, 300)
    vocab.embedding['<pad>'] = nd.zeros(300)
    vocab.embedding['<bos>'] = nd.zeros(300)
    vocab.embedding['<eos>'] = nd.zeros(300)
    print('maximum length (in tokens): ', max_len)
    return vocab, max_len
Example 4
def test_bptt_batchify_padding_token():
    vocab = nlp.Vocab(nlp.data.utils.Counter(['a', 'b', 'c']),
                      padding_token=None)
    seq_len = 35
    batch_size = 80

    # Padding token must always be specified for StreamBPTTBatchify
    with pytest.raises(ValueError):
        nlp.data.batchify.StreamBPTTBatchify(vocab,
                                             seq_len,
                                             batch_size,
                                             last_batch='discard')

    with pytest.raises(ValueError):
        nlp.data.batchify.StreamBPTTBatchify(vocab,
                                             seq_len,
                                             batch_size,
                                             last_batch='keep')

    # Padding token must be specified for last_batch='keep' for CorpusBPTTBatchify
    with pytest.raises(ValueError):
        nlp.data.batchify.CorpusBPTTBatchify(vocab,
                                             seq_len,
                                             batch_size,
                                             last_batch='keep')

    nlp.data.batchify.CorpusBPTTBatchify(vocab,
                                         seq_len,
                                         batch_size,
                                         last_batch='discard')
Example 5
def build_vocabulary(embeddings, tr_df, val_df=None, tst_df=None):
    """
    Inputs: arrays representing the training, and optionally validation and test data (transductive case)
    Outputs: vocabulary (Tokenized text as in-place modification of input arrays or returned as new arrays)
    """
    all_tokens = []

    # Append the other datasets if they are not None
    datasets = [tr_df]
    if val_df is not None:
        datasets.append(val_df)
    if tst_df is not None:
        datasets.append(tst_df)

    # For each dataset, take each tweet, tokenize it, and add its tokens to the list of tokens
    for dataset in datasets:
        for text_instance in dataset['text'].values:
            tokens = word_tokenize(text_instance)
            all_tokens.extend(tokens)

    # Count the tokens and create a vocab object
    counter = nlp.data.count_tokens(all_tokens)
    vocab = nlp.Vocab(counter)

    # Attach selected embeddings to the vocabulary
    vocab.set_embedding(nlp.embedding.create('glove', source=embeddings))

    return vocab
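A hypothetical usage sketch for the helper above, assuming the function and its module-level imports (gluonnlp as nlp, pandas, NLTK's word_tokenize) are in scope; 'glove.6B.50d' is just one source name that nlp.embedding.list_sources('glove') reports.

import pandas as pd

# Toy DataFrame with the 'text' column the helper expects.
train_df = pd.DataFrame({'text': ['a small example tweet', 'another example tweet']})
vocab = build_vocabulary('glove.6B.50d', train_df)
print(len(vocab), vocab.embedding.idx_to_vec.shape)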
Example 6
def get_train_data(args):
    """Helper function to get training data."""
    with print_time('load training dataset'):
        dataset = nlp.data.Text8(segment='train')

    with print_time('count tokens'):
        counter = nlp.data.count_tokens(itertools.chain.from_iterable(dataset))

    vocab = nlp.Vocab(counter,
                      unknown_token=None,
                      padding_token=None,
                      bos_token=None,
                      eos_token=None,
                      min_freq=5)

    idx_to_counts = np.array([counter[w] for w in vocab.idx_to_token])
    negatives_weights = idx_to_counts**0.75
    negatives_sampler = nlp.data.UnigramCandidateSampler(
        weights=mx.nd.array(negatives_weights))

    # Skip "unknown" tokens
    with print_time('code dataset'):
        coded_dataset = [[
            vocab[token] for token in sentence if token in vocab
        ] for sentence in dataset]
        coded_dataset = [
            sentence for sentence in coded_dataset if len(sentence)
        ]

    with print_time('prune frequent words from sentences'):
        f = idx_to_counts / np.sum(idx_to_counts)
        idx_to_pdiscard = 1 - np.sqrt(args.frequent_token_subsampling / f)

        prune_sentences_ = functools.partial(prune_sentences,
                                             idx_to_pdiscard=idx_to_pdiscard)
        coded_dataset = list(map(prune_sentences_, coded_dataset))

    if args.ngram_buckets:  # Fasttext model
        with print_time('prepare subwords'):
            subword_function = nlp.vocab.create_subword_function(
                'NGramHashes',
                ngrams=args.ngrams,
                num_subwords=args.ngram_buckets)

            # Store subword indices for all words in vocabulary
            idx_to_subwordidxs = list(subword_function(vocab.idx_to_token))
            get_subwords_masks = get_subwords_masks_factory(idx_to_subwordidxs)
            max_subwordidxs_len = max(len(s) for s in idx_to_subwordidxs)
            if max_subwordidxs_len > 500:
                warnings.warn(
                    'The word with largest number of subwords '
                    'has {} subwords, suggesting there are '
                    'some noisy words in your vocabulary. '
                    'You should filter out very long words '
                    'to avoid memory issues.'.format(max_subwordidxs_len))

        return (coded_dataset, negatives_sampler, vocab, subword_function,
                get_subwords_masks)
    else:
        return coded_dataset, negatives_sampler, vocab
Example 7
def build_vocab(hparams, types="fasttext", source="wiki.simple", min_freq=10):
    lyrics_train = load_lyrics('train')
    lyrics_valid = load_lyrics('valid')
    lyrics_test = load_lyrics('test')

    # Extract the tokens of each sentence
    total_vocab = lyrics_train + lyrics_valid + lyrics_test
    list_of_tokens = []
    for i in total_vocab:
        list_of_tokens.append(preprocessing(i))

    token_counter = Counter(itertools.chain.from_iterable(list_of_tokens))
    tmp_vocab = nlp.Vocab(counter=token_counter,
                          min_freq=min_freq,
                          bos_token=None,
                          eos_token=None)

    # Connect the pre-trained SISG (fastText) embedding to the vocab
    ptr_embedding = nlp.embedding.create(types, source=source)
    tmp_vocab.set_embedding(ptr_embedding)
    array = tmp_vocab.embedding.idx_to_vec.asnumpy()

    vocab = Vocab(
        tmp_vocab.idx_to_token,
        padding_token="<pad>",
        unknown_token="<unk>",
        bos_token=None,
        eos_token=None,
    )
    vocab.embedding = array

    # saving vocab
    with open(hparams.dataset_path + "/vocab.pkl", mode="wb") as io:
        pickle.dump(vocab, io)
Example 8
def corpus_process():
    # TODO: try to replace it with a torchtext vocab?
    print(gluonnlp.embedding.list_sources('glove'))
    glove = gluonnlp.embedding.create('glove', source='glove.6B.50d')
    vocab = gluonnlp.Vocab(gluonnlp.data.Counter(glove.idx_to_token))
    vocab.set_embedding(glove)
    # print(vocab['<pad>','<unk>'])
    # print(vocab.idx_to_token[3])
    embeddings = vocab.embedding.idx_to_vec
    # We use imdb5k first
    data_train = pd.read_csv(os.path.join(args.data_path, 'imdb5k_train.csv'))
    data_test = pd.read_csv(os.path.join(args.data_path, 'imdb5k_test.csv'))
    data_train.replace(to_replace='neg', value=0, inplace=True)
    data_train.replace(to_replace='pos', value=1, inplace=True)
    data_test.replace(to_replace='neg', value=0, inplace=True)
    data_test.replace(to_replace='pos', value=1, inplace=True)
    X_train, y_train = get_token_id(data_train['text'],
                                    vocab), np.asarray(data_train['label'])
    X_test, y_test = get_token_id(data_test['text'],
                                  vocab), np.asarray(data_test['label'])
    # print(len(y_train))
    X_train_new, y_train_new = X_train[:4000], y_train[:4000]
    X_valid, y_valid = X_train[4000:], y_train[4000:]
    # print(X_train_new,len(y_train_new),len(y_valid))
    train = (X_train, y_train)
    train_new = (X_train_new, y_train_new)
    valid = (X_valid, y_valid)
    test = (X_test, y_test)
    pickle.dump(train, open(args.data_path + '/train.pkl', 'wb'))
    pickle.dump(train_new, open(args.data_path + '/train_new.pkl', 'wb'))
    pickle.dump(valid, open(args.data_path + '/valid.pkl', 'wb'))
    pickle.dump(test, open(args.data_path + '/test.pkl', 'wb'))
    pickle.dump(embeddings, open(args.data_path + '/embedding_matrix', 'wb'))
Example 9
def test_corpus_bptt_batchify(batch_size, seq_len, wikitext2_test_and_counter):
    data, counter = wikitext2_test_and_counter
    vocab = nlp.Vocab(counter)

    # unsupported last_batch
    with pytest.raises(ValueError):
        bptt_keep = nlp.data.batchify.CorpusBPTTBatchify(
            vocab, seq_len, batch_size, last_batch='unsupported')

    # last_batch='keep'
    bptt_keep = nlp.data.batchify.CorpusBPTTBatchify(
        vocab, seq_len, batch_size, last_batch='keep')
    X, Y = zip(*(bptt_keep(data)))
    X, Y = mx.nd.concat(*X, dim=0), mx.nd.concat(*Y, dim=0)
    coded = mx.nd.concat(
        X, Y[-1].expand_dims(0), dim=0).T.reshape(-1).asnumpy().tolist()
    assert vocab[list(data)] == coded[:len(data)]
    assert all(pad == vocab[vocab.padding_token] for pad in coded[len(data):])

    # last_batch='discard'
    bptt_discard = nlp.data.batchify.CorpusBPTTBatchify(
        vocab, seq_len, batch_size, last_batch='discard')
    X, Y = zip(*(bptt_discard(data)))
    X, Y = mx.nd.concat(*X, dim=0), mx.nd.concat(*Y, dim=0)
    coded = mx.nd.concat(
        X, Y[-1].expand_dims(0), dim=0).T.reshape(-1).asnumpy().tolist()
    assert len(data) - len(coded) < batch_size * seq_len
Example 10
    def build_vocab(self):
        """
        """
        embedding_list = []

        for source_name in ['wiki.ko', 'cc.ko.300']:
            tmp_vocab = nlp.Vocab(counter=Counter(self.sp.tokens),
                                  unknown_token='<unk>',
                                  padding_token='<pad>',
                                  min_freq=1,
                                  bos_token=None,
                                  eos_token=None,
                                  token_to_idx={'<unk>': 1})
            embedding = nlp.embedding.create('fasttext', source=source_name)
            tmp_vocab.set_embedding(embedding)
            array = tmp_vocab.embedding.idx_to_vec.asnumpy()
            array[1] = array.mean(axis=0)
            embedding_list.append(array)
            OOV = int(((array == 0.).sum(axis=1) == array.shape[1]).sum())
            print(f"The number of OOV is {OOV} by {array.shape[0]}")
            self.index.update({"OOV": OOV})

        self.vocab.embedding = embedding_list

        self.index.update({'token2idx': self.vocab.token_to_idx})
        self.index.update(
            {'idx2token': {v: k
                           for k, v in self.vocab.token_to_idx.items()}})
Example 11
def get_vocabulary_embeddings(examples):
    glove_6b100d = nlp.embedding.create('glove', source='glove.6B.100d')
    vocab = nlp.Vocab(nlp.data.Counter(glove_6b100d.idx_to_token))
    vocab.set_embedding(glove_6b100d)

    prompt_text = ' '.join([
        ' '.join(
            [' '.join(preprocess_glove(turn[0])) for turn in conversation])
        for conversation in examples
    ])
    respon_text = ' '.join([
        ' '.join(
            [' '.join(preprocess_glove(turn[1])) for turn in conversation])
        for conversation in examples
    ])
    tokens = (prompt_text + ' ' + respon_text).split(' ')

    vocabulary = sorted(list(set(tokens)))
    print('Total vocabulary {}'.format(len(vocabulary)))

    for token in vocabulary:
        if token not in vocab:
            print(token)

    vocabulary = [token for token in vocabulary if token in vocab]
    print('Embeddable vocabulary {}'.format(len(vocabulary)))
Example 12
def get_train_data(args):
    """Helper function to get training data."""
    with print_time('load training dataset'):
        dataset = nlp.data.Text8(segment='train')

    with print_time('count tokens'):
        counter = nlp.data.count_tokens(itertools.chain.from_iterable(dataset))

    vocab = nlp.Vocab(counter,
                      unknown_token=None,
                      padding_token=None,
                      bos_token=None,
                      eos_token=None,
                      min_freq=5)

    idx_to_counts = mx.nd.array([counter[w] for w in vocab.idx_to_token])
    negatives_weights = idx_to_counts**0.75
    negatives_sampler = nlp.data.UnigramCandidateSampler(
        weights=negatives_weights)

    # Skip "unknown" tokens
    with print_time('code dataset'):
        coded_dataset = [[
            vocab[token] for token in sentence if token in vocab
        ] for sentence in dataset]

    with print_time('prune frequent words from sentences'):
        frequent_tokens_subsampling_constant = 1e-3
        f = idx_to_counts / mx.nd.sum(idx_to_counts)
        idx_to_pdiscard = (
            mx.nd.sqrt(frequent_tokens_subsampling_constant / f) +
            frequent_tokens_subsampling_constant / f).asnumpy()

        prune_sentences_ = functools.partial(prune_sentences,
                                             idx_to_pdiscard=idx_to_pdiscard)
        coded_dataset = list(map(prune_sentences_, coded_dataset))

    with print_time('prepare subwords'):
        subword_function = nlp.vocab.create_subword_function(
            'NGramHashes', ngrams=args.ngrams, num_subwords=args.ngram_buckets)

        # Precompute an idx-to-subwordidxs mapping to support fast lookup
        idx_to_subwordidxs = list(subword_function(vocab.idx_to_token))
        max_subwordidxs_len = max(len(s) for s in idx_to_subwordidxs)

        # Pad to max_subwordidxs_len + 1 so each row contains at least one -1
        # element, which can be found by np.argmax below.
        idx_to_subwordidxs = np.stack([
            np.pad(b.asnumpy(), (0, max_subwordidxs_len - len(b) + 1),
                   constant_values=-1, mode='constant')
            for b in idx_to_subwordidxs
        ]).astype(np.float32)
        idx_to_subwordidxs = mx.nd.array(idx_to_subwordidxs)

        logging.info(
            'Using %s to obtain subwords. '
            'The word with largest number of subwords '
            'has %s subwords.', subword_function, max_subwordidxs_len)

    return (coded_dataset, negatives_sampler, vocab, subword_function,
            idx_to_subwordidxs)
Example 13
    def __init__(self,
                 text_vocab,
                 sentences,
                 seq_len,
                 is_cased,
                 tag_list,
                 tag_vocab=None):
        self.text_vocab = text_vocab
        self.seq_len = seq_len
        self.tag_list = tag_list

        self.ernie_tokenizer = nlp.data.BERTTokenizer(self.text_vocab,
                                                      lower=not is_cased)

        predicted_sentences = [] if sentences is None else load_segment(
            sentences, self.ernie_tokenizer)

        if tag_vocab is None:
            logging.info('Indexing tags...')
            tag_counter = nlp.data.count_tokens([tag for tag in self.tag_list])
            self.tag_vocab = nlp.Vocab(tag_counter,
                                       padding_token=NULL_TAG,
                                       bos_token=None,
                                       eos_token=None,
                                       unknown_token=None)
        else:
            self.tag_vocab = tag_vocab

        self.null_tag_index = self.tag_vocab[NULL_TAG]

        self.predect_inputs = [
            self._encode_as_input(sentence) for sentence in predicted_sentences
        ]

        logging.info('tag_vocab: %s', self.tag_vocab)
Example 14
def preprocess_dataset_stream(stream,
                              logging,
                              min_freq=5,
                              max_vocab_size=None):
    counter = None
    i = 0
    for data in iter(stream):
        i += 1
        counter = nlp.data.count_tokens(itertools.chain.from_iterable(data),
                                        counter=counter)
        if i % 100 == 0:
            logging.info("{} Files pre-processed".format(i))
    counter = trim_counter_large_tokens(counter, 20)
    vocab = nlp.Vocab(counter,
                      unknown_token=None,
                      padding_token=None,
                      bos_token=None,
                      eos_token=None,
                      min_freq=min_freq,
                      max_size=max_vocab_size)
    idx_to_counts = [counter[w] for w in vocab.idx_to_token]

    def code(sentence):
        return [vocab[token] for token in sentence if token in vocab]

    def code_corpus(corpus):
        return corpus.transform(code)

    stream = stream.transform(code_corpus)
    return stream, vocab, idx_to_counts
Example 15
File: data.py  Project: BwRy/tmnt
def preprocess_dataset_stream(stream, logging, min_freq=5, max_vocab_size=None, pre_embedding=None):
    if pre_embedding:
        counter = nlp.data.Counter(pre_embedding.idx_to_token)
        ## increase counts so these terms aren't filtered out of the vocabulary
        for i in range(int(math.log2(min_freq * 2))):
            counter = counter + counter
    else:
        counter = None
    i = 0
    for data in iter(stream):
        i += 1
        counter = nlp.data.count_tokens(itertools.chain.from_iterable(data), counter=counter)
        if i % 100 == 0:
            logging.info("{} Files pre-processed".format(i))
    counter = trim_counter_large_tokens(counter, 30)
    vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
                      bos_token=None, eos_token=None, min_freq=min_freq,
                      max_size=max_vocab_size)
    idx_to_counts = [counter[w] for w in vocab.idx_to_token]

    def code(sentence):
        return [vocab[token] for token in sentence if token in vocab]

    def code_corpus(corpus):
        return corpus.transform(code)

    stream = stream.transform(code_corpus) 
    return stream, vocab, idx_to_counts
Example 16
    def make_vocab(self):
        # train path
        train_path = self.data_path + '/snli_1.0_train.txt'
        # Load the training data (tab-separated) and keep the sentence1/sentence2 columns
        tr = pd.read_csv(train_path, sep='\t').loc[:,
                                                   ['sentence1', 'sentence2']]
        # Define the Mecab tokenizer (currently unused)
        # tokenizer = MeCab()
        # Split each sentence into whitespace tokens and collect them as lists
        tokenized = tr['sentence1'].apply(
            lambda elm: str(elm).split()).tolist()
        tokenized += tr['sentence2'].apply(
            lambda elm: str(elm).split()).tolist()
        # Count each token in the tokenized sentences
        counter = nlp.data.count_tokens(
            itertools.chain.from_iterable(tokenized))

        # Keep tokens that occur at least 10 times in the vocab
        vocab = nlp.Vocab(counter=counter,
                          min_freq=10,
                          bos_token=None,
                          eos_token=None)

        nlp.embedding.list_sources()
        # Load a pre-trained embedding (here GloVe glove.6B.300d)
        embedding = nlp.embedding.create('Glove', source='glove.6B.300d')

        # Attach the embedding vectors to the vocab
        vocab.set_embedding(embedding)

        # Save vocab.pkl
        with open(self.data_path + '/vocab.pkl', mode='wb') as io:
            pickle.dump(vocab, io)
Example 17
 def get_vocab(self):
     if self.vocab is not None:
         return self.vocab
     else:
         tok_to_idx = self.vectorizer.vocabulary_
         cv_vocab = {v: 1 for v in tok_to_idx}
         cur_idx = len(tok_to_idx)
         if self.additional_feature_keys:
             if isinstance(self.additional_feature_keys, list):
                 for f in self.additional_feature_keys:
                     cv_vocab[f] = 1
                     tok_to_idx[f] = cur_idx
                     cur_idx += 1
             else:
                 ## assume it's a dictionary
                 for k in self.additional_feature_keys:
                     for v in self.additional_feature_keys[k]:
                         cv_vocab[k + ':' + v] = 1
                         tok_to_idx[k + ':' + v] = cur_idx
                         cur_idx += 1
         vocab = nlp.Vocab(cv_vocab,
                           token_to_idx=tok_to_idx,
                           unknown_token=None,
                           eos_token=None,
                           bos_token=None,
                           padding_token=None)
         self.vocab = vocab
     return vocab
Example 18
def test_text_models():
    val = nlp.data.WikiText2(segment='val', root='tests/data/wikitext-2')
    val_freq = get_frequencies(val)
    vocab = nlp.Vocab(val_freq)
    text_models = [
        'standard_lstm_lm_200', 'standard_lstm_lm_650',
        'standard_lstm_lm_1500', 'awd_lstm_lm_1150', 'awd_lstm_lm_600'
    ]
    pretrained_to_test = {
        'standard_lstm_lm_1500': 'wikitext-2',
        'standard_lstm_lm_650': 'wikitext-2',
        'standard_lstm_lm_200': 'wikitext-2',
        'awd_lstm_lm_1150': 'wikitext-2',
        'awd_lstm_lm_600': 'wikitext-2'
    }

    for model_name in text_models:
        eprint('testing forward for %s' % model_name)
        pretrained_dataset = pretrained_to_test.get(model_name)
        model, _ = get_text_model(model_name,
                                  vocab=vocab,
                                  dataset_name=pretrained_dataset,
                                  pretrained=pretrained_dataset is not None,
                                  root='tests/data/model/')

        print(model)
        if not pretrained_dataset:
            model.collect_params().initialize()
        output, state = model(mx.nd.arange(330).reshape(33, 10))
        output.wait_to_read()
Example 19
def test_corpus_batchify(batch_size):
    data = nlp.data.WikiText2(segment='test',
                              root=os.path.join('tests', 'data', 'wikitext-2'))
    vocab = nlp.Vocab(nlp.data.utils.Counter(data))
    batchify = nlp.data.batchify.CorpusBatchify(vocab, batch_size)
    batches = batchify(data)
    assert batches[:].shape == (len(data) // batch_size, batch_size)
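A toy illustration of the shape contract asserted above, assuming CorpusBatchify accepts any flat sequence of tokens as in the test: a corpus of N tokens batched with batch size B yields an array of shape (N // B, B), with any remainder dropped.

import gluonnlp as nlp

toy_corpus = ['a', 'b', 'c'] * 4  # 12 tokens
vocab = nlp.Vocab(nlp.data.count_tokens(toy_corpus))
batchify = nlp.data.batchify.CorpusBatchify(vocab, batch_size=4)
batches = batchify(toy_corpus)
print(batches[:].shape)  # expected (3, 4), i.e. (12 // 4, 4)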
Example 20
    def make_vocab(self):
        tr = pd.read_csv(self._train_path,
                         sep='\t').loc[:, ['sentence1', 'sentence2']]
        tokenized = tr['sentence1'].apply(
            lambda elm: str(elm).split()).tolist()
        tokenized += tr['sentence2'].apply(
            lambda elm: str(elm).split()).tolist()
        # Count each token in the tokenized sentences
        counter = nlp.data.count_tokens(
            itertools.chain.from_iterable(tokenized))

        # Keep tokens that occur at least 10 times in the vocab
        vocab = nlp.Vocab(counter=counter,
                          min_freq=10,
                          bos_token=None,
                          eos_token=None)

        nlp.embedding.list_sources()
        # Load a pre-trained embedding (here word2vec GoogleNews-vectors-negative300)
        embedding = nlp.embedding.create(
            'word2vec', source='GoogleNews-vectors-negative300')

        # Attach the embedding vectors to the vocab
        vocab.set_embedding(embedding)

        # Save vocab.pkl
        with open(Path.cwd() / 'data_in' / 'vocab.pkl', mode='wb') as io:
            pickle.dump(vocab, io)
Example 21
def build_vocabulary(train_array, test_array):
    """
        Inputs: arrays representing the training, validation and test data
        Outputs: vocabulary (Tokenized text as in-place modification of input arrays or returned as new arrays)
    """
    # List of all tokens in the dataset.
    all_tokens = []
    # Keep track of all types of labels.
    all_labels = set()

    for array in (train_array, test_array):
        for i, instance in enumerate(array):
            sent, label_string = instance
            tokens = [START_TOKEN, *sent.lower().split(' '), STOP_TOKEN]
            labels = label_string.split(',')

            # In-place modification of array.
            array[i] = (tokens, labels)

            # Update running count of all tokens and all label types.
            all_tokens.extend(tokens)
            all_labels.update(labels)

    counter = nlp.data.count_tokens(all_tokens)
    vocab = nlp.Vocab(counter)

    return vocab, all_labels
Example 22
def test_vocabulary_to_tokens():
    counter = nlp.data.utils.Counter(
        ['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = nlp.Vocab(counter,
                      max_size=None,
                      min_freq=1,
                      unknown_token='<unknown>',
                      bos_token=None,
                      eos_token=None,
                      reserved_tokens=None)
    i1 = vocab.to_tokens(2)
    assert i1 == 'c'

    i2 = vocab.to_tokens([2])
    assert i2 == ['c']

    i3 = vocab.to_tokens([0, 0])
    assert i3 == ['<unknown>', '<unknown>']

    i4 = vocab.to_tokens([4, 0, 4, 3])
    assert i4 == ['a', '<unknown>', 'a', 'b']

    for indices in [6, [6, 7]]:
        with pytest.raises(ValueError):
            vocab.to_tokens(indices)
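A small complementary sketch (assuming the same GluonNLP defaults as in Example 1): to_tokens inverts to_indices for in-vocabulary tokens.

import gluonnlp as nlp

counter = nlp.data.utils.Counter(['a', 'b', 'b', 'c', 'c', 'c'])
vocab = nlp.Vocab(counter, bos_token=None, eos_token=None)
indices = vocab.to_indices(['a', 'b', 'c'])
assert vocab.to_tokens(indices) == ['a', 'b', 'c']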
Example 23
def _build_vocab(data_name, train_dataset, test_dataset):
    all_token = []
    max_len = 0
    for i, line in enumerate(train_dataset):
        train_dataset[i][0] = _clean_str(line[0], data_name)
        line = train_dataset[i][0].split()
        max_len = max_len if max_len > len(line) else len(line)
        all_token.extend(line)
    for i, line in enumerate(test_dataset):
        test_dataset[i][0] = _clean_str(line[0], data_name)
        line = test_dataset[i][0].split()
        max_len = max_len if max_len > len(line) else len(line)
        all_token.extend(line)
    vocab = nlp.Vocab(nlp.data.count_tokens(all_token))
    vocab.set_embedding(
        nlp.embedding.create('Word2Vec',
                             source='GoogleNews-vectors-negative300'))
    for word in vocab.embedding._idx_to_token:
        if (vocab.embedding[word] == nd.zeros(300)).sum() == 300:
            vocab.embedding[word] = nd.random.normal(-1.0, 1.0, 300)
    vocab.embedding['<unk>'] = nd.zeros(300)
    vocab.embedding['<pad>'] = nd.zeros(300)
    vocab.embedding['<bos>'] = nd.zeros(300)
    vocab.embedding['<eos>'] = nd.zeros(300)
    print('maximum length (in tokens): ', max_len)
    return vocab, max_len
Example 24
def gluonnlp_main():
    """
    To be applied later.
    :return:
    """
    import gluonnlp as nlp

    cwd = Path.cwd()
    full_path = cwd / 'data_in/Chatbot_data-master/ChatbotData.csv'
    tr_input, val_input, tr_label, val_label = load_data(data_path=full_path)

    total_input = tr_input + val_input
    mecab_tokenizer = Mecab()

    # Extract morphemes from each sentence
    _list_of_tokens = [
        mecab_tokenizer.morphs(input_item) for input_item in total_input
    ]
    list_of_tokens = []
    for _ in _list_of_tokens:
        list_of_tokens += _

    # making the vocab
    counter = nlp.data.count_tokens(list_of_tokens)
    vocab = nlp.Vocab(counter=counter,
                      min_freq=5,
                      bos_token=None,
                      eos_token=None)
Example 25
 def __init__(self, dataset_token, embedding):
     self.dataset_token = dataset_token
     self.seqs = [sample[0] + sample[1] for sample in dataset_token]
     self.counter = nlp.data.count_tokens(
         list(itertools.chain.from_iterable(self.seqs)))
     self.vocab = nlp.Vocab(self.counter, max_size=40000)
     self.vocab.set_embedding(nlp.embedding.GloVe(source=embedding))
Example 26
def test_wikitext2():
    batch_size = 80
    seq_len = 35

    train = nlp.data.WikiText2(
        segment='train', root=os.path.join('tests', 'data', 'wikitext-2'))
    val = nlp.data.WikiText2(
        segment='val', root=os.path.join('tests', 'data', 'wikitext-2'))
    test = nlp.data.WikiText2(
        segment='test', root=os.path.join('tests', 'data', 'wikitext-2'))
    train_freq, val_freq, test_freq = [nlp.data.utils.Counter(x) for x in [train[0], val[0], test[0]]]
    assert len(train[0]) == 2075677, len(train[0])
    assert len(train_freq) == 33278, len(train_freq)
    assert len(val[0]) == 216347, len(val[0])
    assert len(val_freq) == 13777, len(val_freq)
    assert len(test[0]) == 244102, len(test[0])
    assert len(test_freq) == 14143, len(test_freq)
    assert test_freq['English'] == 32, test_freq['English']

    vocab = nlp.Vocab(train_freq)
    serialized_vocab = vocab.to_json()
    assert len(serialized_vocab) == 962190, len(serialized_vocab)
    assert json.loads(serialized_vocab)['idx_to_token'] == vocab._idx_to_token

    train_data = train.bptt_batchify(vocab, seq_len, batch_size, last_batch='discard')
    assert len(train_data) == 741, len(train_data)

    for i, (data, target) in enumerate(train_data):
        mx.test_utils.assert_almost_equal(data[1:].asnumpy(), target[:-1].asnumpy())
        assert data.shape == target.shape == (seq_len, batch_size)

    train_data = train.bptt_batchify(vocab, seq_len, batch_size, last_batch='keep')
    assert len(train_data) == 742, len(train_data)
    assert train_data[-1][0].shape[0] <= seq_len
    for i, (data, target) in enumerate(train_data):
        mx.test_utils.assert_almost_equal(data[1:].asnumpy(), target[:-1].asnumpy())
        assert data.shape == target.shape

    train = nlp.data.WikiText2(
        segment='train',
        skip_empty=False,
        root=os.path.join('tests', 'data', 'wikitext-2'))
    val = nlp.data.WikiText2(
        segment='val',
        skip_empty=False,
        root=os.path.join('tests', 'data', 'wikitext-2'))
    test = nlp.data.WikiText2(
        segment='test',
        skip_empty=False,
        root=os.path.join('tests', 'data', 'wikitext-2'))
    train_freq, val_freq, test_freq = [nlp.data.utils.Counter(x) for x in [train[0], val[0], test[0]]]
    assert len(train[0]) == 2088628, len(train[0])
    assert len(train_freq) == 33278, len(train_freq)
    assert len(val[0]) == 217646, len(val[0])
    assert len(val_freq) == 13777, len(val_freq)
    assert len(test[0]) == 245569, len(test[0])
    assert len(test_freq) == 14143, len(test_freq)
    assert test_freq['English'] == 32, test_freq['English']
    batched_data = train.batchify(vocab, batch_size)
    assert batched_data.shape == (26107, batch_size)
Example 27
def test_bptt_batchify(batch_size, seq_len):
    data = nlp.data.WikiText2(segment='test',
                              root=os.path.join('tests', 'data', 'wikitext-2'))
    vocab = nlp.Vocab(nlp.data.utils.Counter(data[0]))

    # unsupported last_batch
    with pytest.raises(ValueError):
        data.bptt_batchify(vocab,
                           seq_len,
                           batch_size,
                           last_batch='unsupported')

    # last_batch='keep'
    X, Y = zip(
        *(data.bptt_batchify(vocab, seq_len, batch_size, last_batch='keep')))
    X, Y = mx.nd.concat(*X, dim=0), mx.nd.concat(*Y, dim=0)
    coded = mx.nd.concat(X, Y[-1].expand_dims(0),
                         dim=0).T.reshape(-1).asnumpy().tolist()
    assert vocab[data[0]] == coded[:len(data[0])]
    assert all(pad == vocab[vocab.padding_token]
               for pad in coded[len(data[0]):])

    # last_batch='discard'
    X, Y = zip(*(
        data.bptt_batchify(vocab, seq_len, batch_size, last_batch='discard')))
    X, Y = mx.nd.concat(*X, dim=0), mx.nd.concat(*Y, dim=0)
    coded = mx.nd.concat(X, Y[-1].expand_dims(0),
                         dim=0).T.reshape(-1).asnumpy().tolist()
    assert len(data[0]) - len(coded) < batch_size * seq_len
Example 28
def get_train_data(args):
    """Helper function to get training data."""
    counter = dict()
    with io.open(args.vocab, 'r', encoding='utf-8') as f:
        for line in f:
            token, count = line.split('\t')
            counter[token] = int(count)
    vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
                      bos_token=None, eos_token=None, min_freq=1)

    npz = np.load(args.cooccurrences)
    row, col, counts = npz['row'], npz['col'], npz['data']

    rank_dtype = 'int32'
    if row.max() >= np.iinfo(np.int32).max:
        rank_dtype = 'int64'
        # MXNet has no support for uint32, so we must fall back to int64
        logging.info('More words than could be counted using int32. '
                     'Using int64 to represent word indices.')
    row = mx.nd.array(row, dtype=rank_dtype)
    col = mx.nd.array(col, dtype=rank_dtype)
    # row is always used as 'source' and col as 'context' word. Therefore
    # duplicate the entries.

    assert row.shape == col.shape
    row = mx.nd.concatenate([row, col])
    col = mx.nd.concatenate([col, row[:len(row) // 2]])

    counts = mx.nd.array(counts, dtype='float32')
    counts = mx.nd.concatenate([counts, counts])

    return vocab, row, col, counts
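The helper above expects args.vocab to point at a tab-separated token/count file, one token per line. A toy sketch of writing such a file; the path and counts are illustrative only.

import io

# Hypothetical counts; in practice they come from a corpus counting pipeline.
counts = {'the': 100, 'cat': 7, 'sat': 5}
with io.open('vocab.tsv', 'w', encoding='utf-8') as f:
    for token, count in counts.items():
        f.write('{}\t{}\n'.format(token, count))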
Example 29
    def make_vocab(self):
        jamo_list = sorted(
            set(self.chosung_list + self.jungsung_list + self.jongsung_list))
        counter = nlp.data.count_tokens(jamo_list)
        vocab = nlp.Vocab(counter=counter, bos_token=None, eos_token=None)

        with open(self.data_path + '/' + 'vocab_char.pkl', mode='wb') as io:
            pickle.dump(vocab, io)
Example 30
 def build_vocab(self, dataset, reserved_tokens=None):
     # get_input(ex): id_, ..., label
     sentences = itertools.chain.from_iterable([self.get_input(ex)[1:-1] for ex in dataset])
     tokens = [self.tokenizer.tokenize(s) for s in sentences]
     counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(tokens)))
     vocab = nlp.Vocab(counter, bos_token=None, eos_token=None, reserved_tokens=reserved_tokens)
     logger.info('built vocabulary of size {}'.format(len(vocab)))
     return vocab