def test_author_classification_egypt_dataset(self):
    TestAuthor.drop_collection()
    ws = WarehouseServer()
    for author in ws.get_authors(type=Author):
        if len(author.tweets) > 200:
            t = TestAuthor()
            t.screen_name = author.screen_name
            t.tweets = author.tweets
            t.save()

    authors = ws.get_authors(type=TestAuthor)
    for author in authors:
        print '-----------------------'
        print author.screen_name
        vector = author.update_feature_vector()
        print vector

    classifier = TreeClassifier()
    attributes = ["retweets", "links", "retweeted", "replies",
                  "mentions", "ff-ratio", "class"]
    train_set = numpy.array([author.get_feature_vector_with_type()
                             for author in TrainingAuthor.objects])
    classifier.train(train_set, attributes)

    for author in authors:
        prediction = "No prediction"
        if len(author.feature_vector) > 0:
            prediction = classifier.classify(author.get_feature_vector_with_type())
        print author.screen_name
        print prediction
        print '----------------------'

    TestAuthor.drop_collection()
def test_author_classification_dummy_dataset(self):
    # Hand-crafted feature vectors; the last column is the class label.
    train_set = numpy.array([[0.2, 0.5, 0.2, 0.2, 0.1, 10., 0],
                             [0.2, 0.3, 0.12, 0.1, 0.1, 10., 0],
                             [0.2, 0.2, 0.08, 0.2, 0.01, 20., 0],
                             [0.2, 0.5, 0.1, 0.1, 0.2, 5., 0],
                             [0.2, 0.1, 0.2, 0.2, 0.3, 20., 0],
                             [0.7, 0.5, 0.2, 0.8, 0.3, 0.1, 1],
                             [0.6, 0.8, 5.2, 0.2, 0.6, 0.3, 1],
                             [0.2, 0.6, 8.2, 0.9, 0.9, 0.1, 1],
                             [0.5, 0.9, 1.2, 0.1, 0.1, 0.2, 1],
                             [0.9, 0.1, 0.9, 0.6, 0.3, 0.6, 1]])
    attributes = ["retweets", "links", "retweeted", "replies",
                  "mentions", "ff-ratio", "class"]
    classifier = TreeClassifier()
    classifier.train(train_set, attributes)

    example = [0.2, 0.5, 0.2, 0.2, 0.1, 100, 0]
    prediction = classifier.classify(example)
    self.assertEqual(0, prediction.value)
def test_author_classification_real_dataset(self):
    # Class labels:
    # 0 --> Celebrity
    # 1 --> Media
    # 2 --> Journalists
    # 3 --> Common people
    classifier = TreeClassifier()
    train_set = numpy.array([[0.18, 0.57, 0.01, 0.053, 0.0, 52872., 0],   # Bill Gates
                             [0.5, 0.1, 0.0, 0.09, 0.0, 151., 0],         # Justin Bieber
                             [0.096, 0.4, 0.06, 0.2, 0.0, 14052., 0],     # Ashton Kutcher
                             [0.06, 0.051, 0.04, 0.62, 0.0, 216342., 0],  # Oprah
                             [0.026, 0.72, 0.03, 0.07, 0.0, 79183., 0],   # Amy Winehouse
                             [0.006, 0.85, 0.52, 0.0, 0.0, 55025., 1],    # BBC
                             [0.17, 0.77, 0.86, 0.03, 0.0, 5540., 1],     # CNN
                             [0.25, 0.73, 2., 0.0, 0.0, 264., 1],         # HuffPost
                             [0.02, 0.99, 0.79, 0.03, 0.0, 5034., 1],     # AlJazeera
                             [0.13, 0.31, 2.2, 0.33, 0.0, 14., 2],        # Ali Velshi
                             [0.19, 0.19, 2.8, 0.17, 0.0, 4., 2],         # MujMash
                             [0.053, 0.23, 2.5, 0.3, 0.0, 12., 2],        # StevePoliti
                             [0.09, 0.16, 11.3, 0.36, 0.0, 8., 2],        # Rachel King
                             [0.4, 0.36, 8.6, 0.16, 0.0, 0.32, 3],        # George Eracleous
                             [0.2, 0.12, 7.1, 0.35, 0.0, 0.38, 3],        # Nik Adhia
                             [0.26, 0.15, 2.4, 0.31, 0.0, 0.9, 3],        # A person
                             [0.55, 0.13, 13.2, 0.25, 0.0, 0.7, 3]])      # Yet another person
    attributes = ["retweets", "links", "retweeted", "replies",
                  "mentions", "ff-ratio", "class"]
    classifier.train(train_set, attributes)

    celebrity_example = [0.096, 0.3, 0.03, 0.08, 0.0, 94258, 0]
    media_example = [0.0, 0.8, 0.89, 0.031, 0.0, 184., 0]
    common_example = [0.55, 0.13, 13.2, 0.25, 0.0, 0.7, 0]
    journalist_example = [0.24, 0.48, 14.7, 0.11, 0.0, 11.9, 0]

    prediction_celebrity = classifier.classify(celebrity_example)
    prediction_media = classifier.classify(media_example)
    prediction_journalist = classifier.classify(journalist_example)
    prediction_common = classifier.classify(common_example)

    # Compare the predicted class values, as in the dummy-dataset test above.
    calculated = [prediction_celebrity.value, prediction_media.value,
                  prediction_journalist.value, prediction_common.value]
    expected = [0, 1, 2, 3]
    self.assertEqual(expected, calculated)