def test_information_gain(self):
    # Assert information gain weights for a single binary feature ("wind").
    # Example from http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
    #
    # The corpus holds 7 documents: 4 of type=True and 3 of type=False.
    # - "wind": 0 occurs in 3 documents, all of type=True.
    # - "wind": 1 occurs in 4 documents, 1 of type=True and 3 of type=False.
    # Worked by hand:
    #   H(type)        ~ 0.985
    #   H(type | wind) ~ 4/7 * 0.811 + 3/7 * 0.0 ~ 0.464
    #   gain           ~ 0.985 - 0.464 ~ 0.52
    # NOTE(review): assumes Corpus.information_gain() partitions documents
    # by the feature's value as in the referenced example - confirm.
    v = vector.Corpus([
        vector.Document({"wind": 1}, type=False),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 1}, type=True),
        vector.Document({"wind": 1}, type=False),
        vector.Document({"wind": 1}, type=False),
    ])
    self.assertAlmostEqual(v.information_gain("wind"), 0.52, places=2)
    # Report the API that was exercised (package name is "pattern").
    # A parenthesised single string prints identically on Python 2 and 3.
    print("pattern.vector.Corpus.information_gain()")
def corpus(top=None):
    """ Builds a vector.Corpus from the e-mail messages in corpora/spam-apache.csv.
        Each (score, message) row becomes one Porter-stemmed vector.Document
        whose type is True for HAM (score > 0) and False for SPAM.
        The messages are mostly of a technical nature (developer forum posts).
        The given top value is handed through unchanged to every vector.Document.
    """
    path = os.path.join(PATH, "corpora", "spam-apache.csv")
    return vector.Corpus([
        vector.Document(text, stemmer="porter", top=top, type=int(label) > 0)
        for label, text in Datasheet.load(path)
    ])
def test_lsa_concepts(self):
    # numpy is imported purely as an availability probe:
    # when it cannot be imported the test returns without asserting anything.
    try:
        import numpy
    except ImportError:
        return
    # Assert LSA concept space.
    model = vector.Corpus((
        vector.Document("cats purr"),
        vector.Document("cats meow"),
        vector.Document("dogs howl"),
        vector.Document("dogs bark"),
    ))
    model.reduce(2)
    # With two dimensions, the intuition is one concept per animal:
    # - a "cat" concept grouping cats + purr + meow,
    # - a "dog" concept grouping dogs + howl + bark.
    cat_words = ("purr", "meow")
    dog_words = ("howl", "bark")
    # Positions of the cat / dog concepts; both stay 0 if never matched.
    cat_index, dog_index = 0, 0
    for index, concept in enumerate(model.lsa.concepts):
        self.assertTrue(isinstance(concept, dict))
        if concept["cats"] > 0.5:
            # Cat concept: cat words weigh in, dog words are absent.
            for word in cat_words:
                self.assertTrue(concept[word] > 0.5)
            for word in dog_words:
                self.assertTrue(concept[word] == 0.0)
            cat_index = index
        if concept["dogs"] > 0.5:
            # Dog concept: dog words weigh in, cat words are absent.
            for word in dog_words:
                self.assertTrue(concept[word] > 0.5)
            for word in cat_words:
                self.assertTrue(concept[word] == 0.0)
            dog_index = index
    # A "cat" document (index 0) should load on the cat concept only,
    # a "dog" document (index 2) on the dog concept only.
    cat_vector = model.lsa[model.documents[0].id]
    dog_vector = model.lsa[model.documents[2].id]
    self.assertTrue(cat_vector[cat_index] > 0.7)
    self.assertTrue(cat_vector[dog_index] == 0.0)
    self.assertTrue(dog_vector[cat_index] == 0.0)
    self.assertTrue(dog_vector[dog_index] > 0.7)
    # Assert LSA.transform() for a document that is not in the corpus:
    # "cats dogs" is expected to score equally on both concepts.
    unseen = model.lsa.transform(vector.Document("cats dogs"))
    self.assertAlmostEqual(unseen[0], 0.34, places=2)
    self.assertAlmostEqual(unseen[1], 0.34, places=2)
    print("pattern.vector.LSA.concepts")
    print("pattern.vector.LSA.transform()")