def test_one_filters(data):
    """With one negative (disabled) and one positive threshold, the
    filter helper should be invoked exactly once."""
    keys, words, vocab = data
    min_f = {keys[0]: -1, keys[1]: 1}
    with patch('baseline.reader.filter') as mock_filter:
        mock_filter.return_value = [('a', 1)]
        _ = _filter_vocab(vocab, min_f)
    mock_filter.assert_called_once()
def test_filters_both(data):
    """A high min-frequency on every key should drop the rare words
    ('a', 'b') from both vocabs while keeping the frequent one ('c')."""
    keys, words, vocab = data
    min_f = dict.fromkeys(keys, 6)
    vocab = _filter_vocab(vocab, min_f)
    for key in (keys[0], keys[1]):
        assert 'a' not in vocab[key]
        assert 'b' not in vocab[key]
        assert 'c' in vocab[key]
def test_no_filters(data):
    """Negative thresholds on every key disable filtering entirely, so
    the filter helper must never run."""
    keys, words, vocab = data
    min_f = dict.fromkeys(keys, -1)
    with patch('baseline.reader.filter') as mock_filter:
        mock_filter.return_value = [('a', 1)]
        _ = _filter_vocab(vocab, min_f)
    mock_filter.assert_not_called()
def build_vocab(self, files, **kwargs):
    """Build per-vectorizer vocab counts from parallel data/label files.

    Each base name in ``files`` is expanded with the ``self.data`` and
    ``self.labels`` suffixes; corresponding lines are read in lockstep.
    Blank data lines and sentences that clean to empty are skipped.
    New labels are assigned indices in order of first appearance.

    :param files: a base file name or list of them (``None`` entries skipped)
    :param kwargs: may contain ``min_f``, a per-key minimum-frequency dict
        passed to ``_filter_vocab``
    :return: tuple of (filtered vocab dict keyed by vectorizer key, labels)
    """
    label_idx = len(self.label2index)
    vocab = {key: Counter() for key in self.vectorizers}
    for base in listify(files):
        if base is None:
            continue
        with codecs.open(base + self.data, encoding='utf-8', mode='r') as data_file, \
                codecs.open(base + self.labels, encoding='utf-8', mode='r') as label_file:
            for d, l in zip(data_file, label_file):
                if d.strip() == "":
                    continue
                label = l.rstrip()
                text = ParallelSeqLabelReader.get_sentence(d, self.clean_fn)
                if len(text) == 0:
                    continue
                for key, vectorizer in self.vectorizers.items():
                    vocab[key].update(vectorizer.count(text))
                if label not in self.label2index:
                    self.label2index[label] = label_idx
                    label_idx += 1
    return _filter_vocab(vocab, kwargs.get('min_f', {})), self.get_labels()
def test_one_filters(data):
    """Exactly one key carries a usable threshold, so filtering happens
    a single time."""
    keys, words, vocab = data
    min_f = {keys[0]: -1, keys[1]: 1}
    with patch('baseline.reader.filter') as filter_patch:
        filter_patch.return_value = [('a', 1)]
        _ = _filter_vocab(vocab, min_f)
    filter_patch.assert_called_once()
def test_no_filters(data):
    """All thresholds are negative (disabled), so the filter helper is
    never called."""
    keys, words, vocab = data
    min_f = dict.fromkeys(keys, -1)
    with patch('baseline.reader.filter') as filter_patch:
        filter_patch.return_value = [('a', 1)]
        _ = _filter_vocab(vocab, min_f)
    filter_patch.assert_not_called()
def test_filters_both(data):
    """A uniform high threshold removes the rare words from every key's
    vocab and leaves the common one."""
    keys, words, vocab = data
    min_f = dict.fromkeys(keys, 6)
    filtered = _filter_vocab(vocab, min_f)
    first, second = keys[0], keys[1]
    assert 'a' not in filtered[first] and 'a' not in filtered[second]
    assert 'b' not in filtered[first] and 'b' not in filtered[second]
    assert 'c' in filtered[first] and 'c' in filtered[second]
def build_vocab(self, files, **kwargs):
    """Count vocabulary over paired data/label files and collect labels.

    File base names are expanded with the ``self.data`` / ``self.labels``
    suffixes and iterated line-by-line in parallel.  Empty data lines and
    empty cleaned sentences are skipped; unseen labels get the next free
    index.  The resulting counts are pruned via ``_filter_vocab`` using
    the optional ``min_f`` keyword argument.

    :param files: one base file name or a list of them; ``None`` is skipped
    :param kwargs: optional ``min_f`` dict of per-key minimum frequencies
    :return: tuple of (filtered vocab dict, labels from ``self.get_labels()``)
    """
    next_label_idx = len(self.label2index)
    counts = {name: Counter() for name in self.vectorizers}
    for file_name in listify(files):
        if file_name is None:
            continue
        with codecs.open(file_name + self.data, encoding='utf-8', mode='r') as data_file:
            with codecs.open(file_name + self.labels, encoding='utf-8', mode='r') as label_file:
                for data_line, label_line in zip(data_file, label_file):
                    if data_line.strip() == "":
                        continue
                    label = label_line.rstrip()
                    text = ParallelSeqLabelReader.get_sentence(data_line, self.clean_fn)
                    if len(text) == 0:
                        continue
                    for name, vectorizer in self.vectorizers.items():
                        counts[name].update(vectorizer.count(text))
                    if label not in self.label2index:
                        self.label2index[label] = next_label_idx
                        next_label_idx += 1
    counts = _filter_vocab(counts, kwargs.get('min_f', {}))
    return counts, self.get_labels()
def test_filters_one(data):
    """Only the first key's threshold bites: its rare words are removed,
    while the second key's vocab is returned unchanged."""
    keys, words, vocab = data
    before = deepcopy(vocab)
    min_f = {keys[0]: 5, keys[1]: 2}
    filtered = _filter_vocab(vocab, min_f)
    assert 'a' not in filtered[keys[0]]
    assert 'b' not in filtered[keys[0]]
    assert 'c' in filtered[keys[0]]
    assert filtered[keys[1]] == before[keys[1]]
def test_filters_one(data):
    """A threshold that only exceeds counts under the first key prunes
    that vocab and leaves the other identical to the original."""
    keys, words, vocab = data
    gold = deepcopy(vocab)
    min_f = {keys[0]: 5, keys[1]: 2}
    vocab = _filter_vocab(vocab, min_f)
    pruned = vocab[keys[0]]
    assert 'a' not in pruned
    assert 'b' not in pruned
    assert 'c' in pruned
    assert vocab[keys[1]] == gold[keys[1]]