Code Example #1
File: test_readers.py  Project: wxiaopei/baseline
def test_one_filters(data):
    keys, words, vocab = data
    min_f = {keys[0]: -1, keys[1]: 1}
    with patch('baseline.reader.filter') as filt_mock:
        filt_mock.return_value = [('a', 1)]
        _ = _filter_vocab(vocab, min_f)
    filt_mock.assert_called_once()
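These tests rely on a pytest fixture named `data` that is not shown on this page. A minimal sketch of what it might provide, assuming two vectorizer keys and `Counter`-based counts (the key names and numbers are illustrative, not the project's actual fixture):

import pytest
from collections import Counter

@pytest.fixture
def data():
    # Hypothetical vectorizer keys; the real fixture's keys may differ.
    keys = ['word', 'char']
    words = ['a', 'b', 'c']
    # Illustrative counts chosen so 'a' and 'b' fall below the thresholds
    # used in these tests (5 and 6) while 'c' survives them.
    vocab = {k: Counter({'a': 3, 'b': 4, 'c': 8}) for k in keys}
    return keys, words, vocab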
Code Example #2
File: test_readers.py  Project: wxiaopei/baseline
def test_filters_both(data):
    keys, words, vocab = data
    min_f = dict.fromkeys(keys, 6)
    vocab = _filter_vocab(vocab, min_f)
    assert 'a' not in vocab[keys[0]] and 'a' not in vocab[keys[1]]
    assert 'b' not in vocab[keys[0]] and 'b' not in vocab[keys[1]]
    assert 'c' in vocab[keys[0]] and 'c' in vocab[keys[1]]
Code Example #3
File: test_readers.py  Project: wxiaopei/baseline
def test_no_filters(data):
    keys, words, vocab = data
    min_f = dict.fromkeys(keys, -1)
    with patch('baseline.reader.filter') as filt_mock:
        filt_mock.return_value = [('a', 1)]
        _ = _filter_vocab(vocab, min_f)
    filt_mock.assert_not_called()
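Read together, the three tests above pin down the contract of `_filter_vocab`: a key whose threshold is below 1 is passed through untouched (the built-in `filter` is never invoked for it), while a positive threshold drops every entry whose count falls short of it. A minimal sketch consistent with that contract; the actual implementation in `baseline.reader` may differ in detail:

from collections import Counter

def _filter_vocab(vocab, min_fs):
    """Drop entries below the per-key minimum frequency.

    Non-positive thresholds leave that key's vocab untouched, which is
    why the tests expect `filter` not to be called in those cases.
    """
    for k in vocab:
        min_f = min_fs.get(k, -1)
        if min_f <= 0:
            continue  # no filtering requested for this key
        vocab[k] = Counter(dict(filter(lambda kv: kv[1] >= min_f, vocab[k].items())))
    return vocab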
Code Example #4
    def build_vocab(self, files, **kwargs):
        label_idx = len(self.label2index)
        files = listify(files)
        # One Counter of token frequencies per vectorizer key.
        vocab = {k: Counter() for k in self.vectorizers.keys()}
        for file_name in files:
            if file_name is None: continue
            with codecs.open(file_name + self.data, encoding='utf-8',
                             mode='r') as data_file:
                with codecs.open(file_name + self.labels,
                                 encoding='utf-8',
                                 mode='r') as label_file:
                    for d, l in zip(data_file, label_file):
                        if d.strip() == "": continue
                        label = l.rstrip()
                        text = ParallelSeqLabelReader.get_sentence(
                            d, self.clean_fn)
                        if len(text) == 0: continue

                        for k, vectorizer in self.vectorizers.items():
                            vocab_file = vectorizer.count(text)
                            vocab[k].update(vocab_file)

                        if label not in self.label2index:
                            self.label2index[label] = label_idx
                            label_idx += 1

        # Apply any per-key minimum-frequency thresholds (default: no filtering).
        vocab = _filter_vocab(vocab, kwargs.get('min_f', {}))

        return vocab, self.get_labels()
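The `min_f` keyword at the end of `build_vocab` is the hook into `_filter_vocab`: whatever per-key thresholds the caller passes are applied to the freshly accumulated counters. A hedged usage sketch, where `reader`, the file-name prefixes, and the `'word'` key are placeholders for illustration (the real vectorizer keys depend on how the reader was configured):

# Placeholders: 'reader' is an already-constructed ParallelSeqLabelReader,
# 'train'/'valid' are file-name prefixes, and 'word' is an assumed key.
vocab, labels = reader.build_vocab(['train', 'valid'], min_f={'word': 2})
# Tokens counted fewer than 2 times by the 'word' vectorizer are dropped;
# keys missing from min_f are effectively unfiltered (the default is {}).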
Code Example #5
File: test_readers.py  Project: wxiaopei/baseline
def test_filters_one(data):
    keys, words, vocab = data
    gold = deepcopy(vocab)
    min_f = {keys[0]: 5, keys[1]: 2}
    vocab = _filter_vocab(vocab, min_f)
    assert 'a' not in vocab[keys[0]]
    assert 'b' not in vocab[keys[0]]
    assert 'c' in vocab[keys[0]]
    assert vocab[keys[1]] == gold[keys[1]]
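A quick worked run with the illustrative counts from the fixture sketch above makes the per-key asymmetry concrete (the numbers are not the project's real test data):

from collections import Counter

vocab = {'word': Counter({'a': 3, 'b': 4, 'c': 8}),
         'char': Counter({'a': 3, 'b': 4, 'c': 8})}
vocab = _filter_vocab(vocab, {'word': 5, 'char': 2})
# 'word' loses 'a' and 'b' (counts below 5) and keeps 'c';
# 'char' is unchanged, since every count meets its threshold of 2.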