Example #1
0
class TestDecisionTreeClassifier(unittest.TestCase):
    """Exercise the public API of ``DecisionTreeClassifier``.

    ``setUp`` builds a new classifier from the ``train_set`` fixture
    (defined outside this class) before every test, so a test that mutates
    the classifier, such as ``test_update``, cannot affect the others.

    Assertions use the ``unittest.TestCase`` methods so that a failure
    reports the offending values rather than a bare "False is not true".
    """

    def setUp(self):
        self.classifier = DecisionTreeClassifier(train_set)

    def test_classify(self):
        """A clearly positive sentence is labelled 'positive'."""
        res = self.classifier.classify("I feel happy this morning")
        self.assertEqual(res, 'positive')
        # Classifying must leave the stored training data the same size.
        self.assertEqual(len(self.classifier.train_set), len(train_set))

    def test_accuracy(self):
        """``accuracy`` returns a float for the ``test_set`` fixture."""
        acc = self.classifier.accuracy(test_set)
        self.assertIsInstance(acc, float)

    def test_update(self):
        """Updating with one example grows ``train_set`` by exactly one."""
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        self.assertEqual(original_length + 1, new_length)

    def test_custom_feature_extractor(self):
        """A user-supplied feature extractor is accepted by the classifier."""
        cl = DecisionTreeClassifier(train_set, custom_extractor)
        # The return value is deliberately ignored: the call only has to
        # complete without raising when the custom extractor is in use.
        cl.classify("Yay! I'm so happy it works.")
        self.assertEqual(cl.train_features[0][1], 'positive')

    def test_pseudocode(self):
        """The pseudocode rendering contains at least one ``if``."""
        code = self.classifier.pseudocode()
        self.assertIn("if", code)

    def test_pretty_format(self):
        """``pprint`` and ``pretty_format`` return the same text."""
        pp = self.classifier.pprint(width=60)
        pf = self.classifier.pretty_format(width=60)
        self.assertIsInstance(pp, unicode)
        self.assertEqual(pp, pf)

    def test_repr(self):
        """``repr`` reports the number of training instances."""
        expected = "<DecisionTreeClassifier trained on {0} instances>".format(
            len(train_set))
        self.assertEqual(repr(self.classifier), expected)
Example #2
0
class TestDecisionTreeClassifier(unittest.TestCase):
    """Tests for ``DecisionTreeClassifier``.

    Each test runs against a classifier created in ``setUp`` from the
    ``train_set`` fixture, which is defined outside this class.  The checks
    rely on the assertion methods inherited from ``unittest.TestCase``
    (``assertEqual``, ``assertIsInstance``, ``assertIn``) because they
    include the compared values in the failure message.
    """

    def setUp(self):
        self.classifier = DecisionTreeClassifier(train_set)

    def test_classify(self):
        """Classification yields 'positive' and keeps the training data size."""
        res = self.classifier.classify("I feel happy this morning")
        self.assertEqual(res, 'positive')
        self.assertEqual(len(self.classifier.train_set), len(train_set))

    def test_accuracy(self):
        """The accuracy on ``test_set`` is reported as a float."""
        acc = self.classifier.accuracy(test_set)
        self.assertIsInstance(acc, float)

    def test_update(self):
        """One extra labelled example adds one entry to ``train_set``."""
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        self.assertEqual(original_length + 1, new_length)

    def test_custom_feature_extractor(self):
        """The classifier can be built with ``custom_extractor``."""
        cl = DecisionTreeClassifier(train_set, custom_extractor)
        # Result unused on purpose; this call only checks that classifying
        # with the custom extractor does not raise.
        cl.classify("Yay! I'm so happy it works.")
        self.assertEqual(cl.train_features[0][1], 'positive')

    def test_pseudocode(self):
        """``pseudocode`` output mentions ``if``."""
        code = self.classifier.pseudocode()
        self.assertIn("if", code)

    def test_pretty_format(self):
        """``pprint`` output is text and equals ``pretty_format`` output."""
        pp = self.classifier.pprint(width=60)
        pf = self.classifier.pretty_format(width=60)
        self.assertIsInstance(pp, unicode)
        self.assertEqual(pp, pf)

    def test_repr(self):
        """``repr`` includes the size of the training set."""
        self.assertEqual(
            repr(self.classifier),
            "<DecisionTreeClassifier trained on {0} instances>".format(
                len(train_set)))
Example #3
0
    # example usage: classify a new item
    newitem = 'dsdsjdlaskdjkl'
    cl.classify(newitem)

    # top five contributing features
    cl.show_informative_features(5) 

    # get the label probability distribution with the prob_classify(text) method
    prob_dist = cl.prob_classify(newitem)
    prob_dist.max()
    relevant = round(prob_dist.prob("pos"), 2)
    irrelevant = round(prob_dist.prob("neg"), 2)
    ## 
    # method B - train with unigrams
    #
    cl.update(new_train) # can call it like this
    accuracy = cl.accuracy(test + new_test)


    ### can pass a custom feature-extractor function to the classifier
    ## maybe try one that removes key hashtag terms and see whether
    # the results improve or not
    # A feature extractor is simply a function with document 
    # (the text to extract features from)
    # as the first argument.
    # The function may include a second argument, 
    # train_set (the training dataset), if necessary.
    #
    #

Example #4
0
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'), ('He is my sworn enemy!', 'neg'),
         ('My boss is horrible.', 'neg')]
# Labelled sentences kept aside for scoring the classifier.
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg'),
]

# Build the initial classifier from the `train` examples defined above.
cl = DecisionTreeClassifier(train)

# Pair each movie review (as a list of words) with its category,
# then shuffle so the slices below mix the categories.
reviews = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
random.shuffle(reviews)

# First 100 shuffled reviews extend the training data; the next 50 are
# reserved for evaluation.
new_train = reviews[:100]
new_test = reviews[100:150]

# Feed the extra training examples to the existing classifier.
cl.update(new_train)

# Score on the hand-written test sentences plus the reserved reviews.
accuracy = cl.accuracy(test + new_test)
print("Accuracy: {0}".format(accuracy))

# Show 5 most informative features
#cl.show_informative_features(5)

#save_classifier = open("naivebayes.pickle","wb")
#p.dump(cl, save_classifier)
#save_classifier.close()
    key = ' '.join(blob.noun_phrases)
    value = row[1].upper()
    if count < 120:
        training_data.append((key, value))
    else:
        test_data.append((key, value))
    count += 1

# NOTE: this section uses Python 2 `print` statements.
# `count` is incremented once per processed row by the loop above, so this
# reports the total number of rows seen (training and test combined).
print "Training Set is Processed -> ", count

print "Learning in progress ..."
# Build the decision-tree classifier from the training portion only.
classifier = DecisionTreeClassifier(training_data)
print "Classifier Ready"

# Report accuracy on the held-out rows first ...
print classifier.accuracy(test_data)
# ... and only afterwards hand those rows to the classifier too
# (presumably so the prediction below benefits from every labelled row).
classifier.update(test_data)
# Classify one sample breach report. The unicode literal is encoded to ASCII
# with errors='ignore', which silently drops the non-ASCII characters in the
# text (the en dashes and the curly apostrophe) before classification.
print(
    classifier.classify(
        u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities."
        .encode('ascii', errors='ignore')))
"""
print "Writing Results to csv"
flag = True
with open('Human_resultsOnly - Copy.csv') as inp:
    with open('Taxonomy output.csv', 'wb') as oup:
        reader = csv.reader(inp)
        writer = csv.writer(oup)
        for row in reader:
            if flag:
                row.append("Taxonomy")
                writer.writerow(row)
   key = ' '.join(blob.noun_phrases)
   value = row[1].upper()
   if count < 120:
      training_data.append((key, value))
   else:
      test_data.append((key, value))
   count+=1

# NOTE: this section uses Python 2 `print` statements.
# `count` is incremented once per processed row by the loop above, so this
# reports the total number of rows seen (training and test combined).
print "Training Set is Processed -> ", count

print "Learning in progress ..."
# Build the decision-tree classifier from the training portion only.
classifier = DecisionTreeClassifier(training_data)
print "Classifier Ready"

# Report accuracy on the held-out rows first ...
print classifier.accuracy(test_data)
# ... and only afterwards hand those rows to the classifier too
# (presumably so the prediction below benefits from every labelled row).
classifier.update(test_data)
# Classify one sample breach report. The unicode literal is encoded to ASCII
# with errors='ignore', which silently drops the non-ASCII characters in the
# text (the en dashes and the curly apostrophe) before classification.
print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
"""
print "Writing Results to csv"
flag = True
with open('Human_resultsOnly - Copy.csv') as inp:
    with open('Taxonomy output.csv', 'wb') as oup:
        reader = csv.reader(inp)
        writer = csv.writer(oup)
        for row in reader:
            if flag:
                row.append("Taxonomy")
                writer.writerow(row)
                flag = False
            else:
                item = row[12]