def _make_docs_getter(self, max_docs_per_category):
    """Build the documents-and-labels getter for ``self.term_doc_matrix``.

    :param max_docs_per_category: ``None`` to get a plain
        ``DocsAndLabelsFromCorpus``; any other value is passed as the second
        argument to ``DocsAndLabelsFromCorpusSample``.
    :return: the getter, replaced by the result of its
        ``use_non_text_features()`` call when
        ``self.scatterchartdata.use_non_text_features`` is truthy.
    """
    corpus = self.term_doc_matrix
    getter = (
        DocsAndLabelsFromCorpusSample(corpus, max_docs_per_category)
        if max_docs_per_category is not None
        else DocsAndLabelsFromCorpus(corpus)
    )
    if not self.scatterchartdata.use_non_text_features:
        return getter
    return getter.use_non_text_features()
def test_categories(self):
    """Sampled and unsampled getters both report the same category list.

    The list is checked on the output of ``get_labels_and_texts`` and on the
    output of ``get_labels_and_texts_and_meta``.
    """
    expected_categories = ['hamlet', 'jay-z/r. kelly', '???']
    getters = (
        DocsAndLabelsFromCorpusSample(self.corpus, 1),
        DocsAndLabelsFromCorpus(self.corpus),
    )
    for getter in getters:
        without_meta = getter.get_labels_and_texts()
        self.assertEqual(without_meta['categories'], expected_categories)

        # A fresh list on every pass, exactly as many entries as before.
        metadata = [
            'element 0 0', 'element 1 0', 'element 2 0', 'element 3 0',
            'element 4 1', 'element 5 1', 'element 6 1', 'element 7 1',
            'element 8 1', 'element 9 2',
        ]
        with_meta = getter.get_labels_and_texts_and_meta(metadata)
        self.assertEqual(with_meta['categories'], expected_categories)
def test_max_per_category(self):
    """Sampled output never holds more than two documents per label.

    Three configurations are exercised in turn: a sampler built with
    ``seed=0`` and queried with metadata, a sampler built without a seed and
    queried without metadata, and a sampler built without a seed after
    ``use_non_text_features()``.  Every result is checked for the expected
    keys, grouped by label, and passed through ``json.dumps``.
    """
    def verify(output, expected_keys, zipped_keys):
        # Key checks run in a fixed order: texts, labels, meta, extra.
        for key in ('texts', 'labels', 'meta', 'extra'):
            if key in expected_keys:
                self.assertTrue(key in output)
            else:
                self.assertTrue(key not in output)
        texts_by_label = {}
        # Row layout follows zipped_keys: texts first, labels second.
        for row in zip(*[output[key] for key in zipped_keys]):
            texts_by_label.setdefault(row[1], []).append(row[0])
        for documents in texts_by_label.values():
            self.assertLessEqual(len(documents), 2)
        # Raises if the grouped structure cannot be serialized to JSON.
        json.dumps(texts_by_label)

    seeded = DocsAndLabelsFromCorpusSample(
        self.parsed_corpus, max_per_category=2, seed=0)
    metadata = np.array([
        'element 0 0', 'element 1 0', 'element 2 0', 'element 3 0',
        'element 4 1', 'element 5 1', 'element 6 1', 'element 7 1',
        'element 8 1', 'element 9 2',
    ])
    verify(
        seeded.get_labels_and_texts_and_meta(metadata),
        expected_keys=('texts', 'labels', 'meta'),
        zipped_keys=('texts', 'labels', 'meta'),
    )

    unseeded = DocsAndLabelsFromCorpusSample(
        self.parsed_corpus, max_per_category=2)
    verify(
        unseeded.get_labels_and_texts(),
        expected_keys=('texts', 'labels'),
        zipped_keys=('texts', 'labels'),
    )

    non_text = DocsAndLabelsFromCorpusSample(
        self.parsed_corpus, max_per_category=2).use_non_text_features()
    verify(
        non_text.get_labels_and_texts(),
        expected_keys=('texts', 'labels', 'extra'),
        zipped_keys=('texts', 'labels'),
    )
def test_alternative_text_field(self):
    """Check how ``alternative_text_field`` affects the returned texts.

    Asserted behaviour:

    * both ``self.corpus`` and ``self.parsed_corpus`` are accepted when no
      alternative field is requested;
    * requesting ``alternative_text_field='orig'`` on ``self.corpus`` raises
      ``CorpusShouldBeParsedCorpusException``;
    * with ``alternative_text_field='orig'`` the first returned text equals
      its own upper-cased form, and without it the first text does not.

    Each comparison fetches the texts once and compares the first text with
    its own ``upper()``.  Calling ``get_labels_and_texts()`` twice per
    assertion could compare two different documents if the sampling getter
    draws a fresh sample on each call.
    NOTE(review): per-call sampling is an assumption; confirm against
    ``DocsAndLabelsFromCorpusSample``.
    """
    # Construction alone must succeed for either corpus type.
    DocsAndLabelsFromCorpus(self.corpus)
    DocsAndLabelsFromCorpus(self.parsed_corpus)
    with self.assertRaises(CorpusShouldBeParsedCorpusException):
        DocsAndLabelsFromCorpus(self.corpus, alternative_text_field='orig')

    d = DocsAndLabelsFromCorpus(self.parsed_corpus, alternative_text_field='orig')
    first_text = d.get_labels_and_texts()['texts'][0]
    self.assertEqual(first_text, first_text.upper())

    d = DocsAndLabelsFromCorpus(self.parsed_corpus)
    first_text = d.get_labels_and_texts()['texts'][0]
    self.assertNotEqual(first_text, first_text.upper())

    d = DocsAndLabelsFromCorpusSample(self.parsed_corpus, 2,
                                      alternative_text_field='orig', seed=0)
    first_text = d.get_labels_and_texts()['texts'][0]
    self.assertEqual(first_text, first_text.upper())

    # No seed here: a single fetch keeps both sides of the comparison on
    # the same document.
    d = DocsAndLabelsFromCorpusSample(self.parsed_corpus, 2)
    first_text = d.get_labels_and_texts()['texts'][0]
    self.assertNotEqual(first_text, first_text.upper())
def test_max_per_category(self):
    """With ``max_per_category=2`` no label ends up with more than two texts.

    Covers output fetched with metadata (sampler built with ``seed=0``),
    output fetched without metadata, and output fetched after
    ``use_non_text_features()``.  For each, the presence or absence of the
    ``texts``, ``labels``, ``meta`` and ``extra`` keys is asserted, and the
    label-to-texts grouping is passed through ``json.dumps``.
    """
    def group_by_label(text_label_pairs):
        grouped = {}
        for text, label in text_label_pairs:
            grouped.setdefault(label, []).append(text)
        return grouped

    def assert_capped(grouped):
        for documents in grouped.values():
            self.assertLessEqual(len(documents), 2)
        # Raises if the grouping cannot be serialized to JSON.
        json.dumps(grouped)

    # Sampler built with seed=0, output fetched together with metadata.
    sampler = DocsAndLabelsFromCorpusSample(
        self.parsed_corpus, max_per_category=2, seed=0)
    metadata = np.array([
        'element 0 0', 'element 1 0', 'element 2 0', 'element 3 0',
        'element 4 1', 'element 5 1', 'element 6 1', 'element 7 1',
        'element 8 1', 'element 9 2',
    ])
    result = sampler.get_labels_and_texts_and_meta(metadata)
    self.assertTrue('texts' in result)
    self.assertTrue('labels' in result)
    self.assertTrue('meta' in result)
    self.assertTrue('extra' not in result)
    # 'meta' stays in the zip so iteration stops at the shortest column.
    rows = zip(result['texts'], result['labels'], result['meta'])
    assert_capped(group_by_label((text, label) for text, label, _ in rows))

    # Sampler built without a seed, no metadata requested.
    sampler = DocsAndLabelsFromCorpusSample(
        self.parsed_corpus, max_per_category=2)
    result = sampler.get_labels_and_texts()
    self.assertTrue('texts' in result)
    self.assertTrue('labels' in result)
    self.assertTrue('meta' not in result)
    self.assertTrue('extra' not in result)
    assert_capped(group_by_label(zip(result['texts'], result['labels'])))

    # Sampler built without a seed, after use_non_text_features().
    sampler = DocsAndLabelsFromCorpusSample(
        self.parsed_corpus, max_per_category=2).use_non_text_features()
    result = sampler.get_labels_and_texts()
    self.assertTrue('texts' in result)
    self.assertTrue('labels' in result)
    self.assertTrue('meta' not in result)
    self.assertTrue('extra' in result)
    assert_capped(group_by_label(zip(result['texts'], result['labels'])))