def test_one_filters(data):
    """With one negative (disabled) and one positive threshold, the
    filter helper should be invoked exactly once."""
    keys, words, vocab = data
    min_f = {keys[0]: -1, keys[1]: 1}
    with patch('baseline.reader.filter') as mock_filter:
        mock_filter.return_value = [('a', 1)]
        _ = _filter_vocab(vocab, min_f)
    mock_filter.assert_called_once()
def test_filters_both(data):
    """A high min-frequency on every key should drop the rare words
    ('a', 'b') from both vocabs while keeping the frequent one ('c')."""
    keys, words, vocab = data
    min_f = dict.fromkeys(keys, 6)
    vocab = _filter_vocab(vocab, min_f)
    for key in (keys[0], keys[1]):
        assert 'a' not in vocab[key]
        assert 'b' not in vocab[key]
        assert 'c' in vocab[key]
def test_no_filters(data):
    """Negative thresholds on every key disable filtering entirely, so
    the filter helper must never run."""
    keys, words, vocab = data
    min_f = dict.fromkeys(keys, -1)
    with patch('baseline.reader.filter') as mock_filter:
        mock_filter.return_value = [('a', 1)]
        _ = _filter_vocab(vocab, min_f)
    mock_filter.assert_not_called()
def build_vocab(self, files, **kwargs):
    """Build per-vectorizer vocab counts from parallel data/label files.

    Each base name in ``files`` is expanded with the ``self.data`` and
    ``self.labels`` suffixes; corresponding lines are read in lockstep.
    Blank data lines and sentences that clean to empty are skipped.
    New labels are assigned indices in order of first appearance.

    :param files: a base file name or list of them (``None`` entries skipped)
    :param kwargs: may contain ``min_f``, a per-key minimum-frequency dict
        passed to ``_filter_vocab``
    :return: tuple of (filtered vocab dict keyed by vectorizer key, labels)
    """
    label_idx = len(self.label2index)
    vocab = {key: Counter() for key in self.vectorizers}
    for base in listify(files):
        if base is None:
            continue
        with codecs.open(base + self.data, encoding='utf-8', mode='r') as data_file, \
                codecs.open(base + self.labels, encoding='utf-8', mode='r') as label_file:
            for d, l in zip(data_file, label_file):
                if d.strip() == "":
                    continue
                label = l.rstrip()
                text = ParallelSeqLabelReader.get_sentence(d, self.clean_fn)
                if len(text) == 0:
                    continue
                for key, vectorizer in self.vectorizers.items():
                    vocab[key].update(vectorizer.count(text))
                if label not in self.label2index:
                    self.label2index[label] = label_idx
                    label_idx += 1
    return _filter_vocab(vocab, kwargs.get('min_f', {})), self.get_labels()
def test_one_filters(data):
    """Exactly one key carries a usable threshold, so filtering happens
    a single time."""
    keys, words, vocab = data
    min_f = {keys[0]: -1, keys[1]: 1}
    with patch('baseline.reader.filter') as filter_patch:
        filter_patch.return_value = [('a', 1)]
        _ = _filter_vocab(vocab, min_f)
    filter_patch.assert_called_once()
def test_no_filters(data):
    """All thresholds are negative (disabled), so the filter helper is
    never called."""
    keys, words, vocab = data
    min_f = dict.fromkeys(keys, -1)
    with patch('baseline.reader.filter') as filter_patch:
        filter_patch.return_value = [('a', 1)]
        _ = _filter_vocab(vocab, min_f)
    filter_patch.assert_not_called()
def test_filters_both(data):
    """A uniform high threshold removes the rare words from every key's
    vocab and leaves the common one."""
    keys, words, vocab = data
    min_f = dict.fromkeys(keys, 6)
    filtered = _filter_vocab(vocab, min_f)
    first, second = keys[0], keys[1]
    assert 'a' not in filtered[first] and 'a' not in filtered[second]
    assert 'b' not in filtered[first] and 'b' not in filtered[second]
    assert 'c' in filtered[first] and 'c' in filtered[second]
def build_vocab(self, files, **kwargs):
    """Count vocabulary over paired data/label files and collect labels.

    File base names are expanded with the ``self.data`` / ``self.labels``
    suffixes and iterated line-by-line in parallel.  Empty data lines and
    empty cleaned sentences are skipped; unseen labels get the next free
    index.  The resulting counts are pruned via ``_filter_vocab`` using
    the optional ``min_f`` keyword argument.

    :param files: one base file name or a list of them; ``None`` is skipped
    :param kwargs: optional ``min_f`` dict of per-key minimum frequencies
    :return: tuple of (filtered vocab dict, labels from ``self.get_labels()``)
    """
    next_label_idx = len(self.label2index)
    counts = {name: Counter() for name in self.vectorizers}
    for file_name in listify(files):
        if file_name is None:
            continue
        with codecs.open(file_name + self.data, encoding='utf-8', mode='r') as data_file:
            with codecs.open(file_name + self.labels, encoding='utf-8', mode='r') as label_file:
                for data_line, label_line in zip(data_file, label_file):
                    if data_line.strip() == "":
                        continue
                    label = label_line.rstrip()
                    text = ParallelSeqLabelReader.get_sentence(data_line, self.clean_fn)
                    if len(text) == 0:
                        continue
                    for name, vectorizer in self.vectorizers.items():
                        counts[name].update(vectorizer.count(text))
                    if label not in self.label2index:
                        self.label2index[label] = next_label_idx
                        next_label_idx += 1
    counts = _filter_vocab(counts, kwargs.get('min_f', {}))
    return counts, self.get_labels()
def test_filters_one(data):
    """Only the first key's threshold bites: its rare words are removed,
    while the second key's vocab is returned unchanged."""
    keys, words, vocab = data
    before = deepcopy(vocab)
    min_f = {keys[0]: 5, keys[1]: 2}
    filtered = _filter_vocab(vocab, min_f)
    assert 'a' not in filtered[keys[0]]
    assert 'b' not in filtered[keys[0]]
    assert 'c' in filtered[keys[0]]
    assert filtered[keys[1]] == before[keys[1]]
def test_filters_one(data):
    """A threshold that only exceeds counts under the first key prunes
    that vocab and leaves the other identical to the original."""
    keys, words, vocab = data
    gold = deepcopy(vocab)
    min_f = {keys[0]: 5, keys[1]: 2}
    vocab = _filter_vocab(vocab, min_f)
    pruned = vocab[keys[0]]
    assert 'a' not in pruned
    assert 'b' not in pruned
    assert 'c' in pruned
    assert vocab[keys[1]] == gold[keys[1]]