Beispiel #1
0
class TestDecisionTreeClassifier(unittest.TestCase):
    """Checks for DecisionTreeClassifier: training, classifying, updating and formatting."""

    def setUp(self):
        # Build a new classifier for every test from the shared train_set fixture.
        self.classifier = DecisionTreeClassifier(train_set)

    def test_classify(self):
        """A positive sentence is labelled 'positive' and the training set size is unchanged."""
        label = self.classifier.classify("I feel happy this morning")
        assert_equal(label, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_accuracy(self):
        """accuracy() on the test fixture yields a float."""
        score = self.classifier.accuracy(test_set)
        assert_true(isinstance(score, float))

    def test_update(self):
        """update() with a single example grows train_set by exactly one entry."""
        size_before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        size_after = len(self.classifier.train_set)
        assert_equal(size_before + 1, size_after)

    def test_custom_feature_extractor(self):
        """A classifier built with a custom extractor keeps the label of its first training row."""
        custom_cl = DecisionTreeClassifier(train_set, custom_extractor)
        custom_cl.classify("Yay! I'm so happy it works.")
        first_label = custom_cl.train_features[0][1]
        assert_equal(first_label, 'positive')

    def test_pseudocode(self):
        """pseudocode() output contains at least one 'if'."""
        rendered = self.classifier.pseudocode()
        assert_true("if" in rendered)

    def test_pretty_format(self):
        """pprint() and pretty_format() return the same unicode text for the same width."""
        printed = self.classifier.pprint(width=60)
        formatted = self.classifier.pretty_format(width=60)
        assert_true(isinstance(printed, unicode))
        assert_equal(printed, formatted)

    def test_repr(self):
        """repr() reports how many instances the classifier was trained on."""
        actual = repr(self.classifier)
        expected = "<DecisionTreeClassifier trained on {0} instances>".format(
            len(train_set))
        assert_equal(actual, expected)
Beispiel #2
0
class TestDecisionTreeClassifier(unittest.TestCase):
    """Unit tests covering the public surface of DecisionTreeClassifier."""

    def setUp(self):
        # Each test starts from a classifier trained on the shared train_set fixture.
        self.classifier = DecisionTreeClassifier(train_set)

    def test_classify(self):
        """classify() labels a positive sentence 'positive' without altering train_set's size."""
        predicted = self.classifier.classify("I feel happy this morning")
        assert_equal(predicted, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_accuracy(self):
        """accuracy() returns a float when scored against test_set."""
        result = self.classifier.accuracy(test_set)
        assert_true(isinstance(result, float))

    def test_update(self):
        """Feeding one extra example through update() adds one entry to train_set."""
        count_before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        count_after = len(self.classifier.train_set)
        assert_equal(count_before + 1, count_after)

    def test_custom_feature_extractor(self):
        """With a custom extractor, the first training feature row is still labelled 'positive'."""
        classifier = DecisionTreeClassifier(train_set, custom_extractor)
        classifier.classify("Yay! I'm so happy it works.")
        label_of_first_row = classifier.train_features[0][1]
        assert_equal(label_of_first_row, 'positive')

    def test_pseudocode(self):
        """The pseudocode rendering mentions 'if'."""
        text = self.classifier.pseudocode()
        assert_true("if" in text)

    def test_pretty_format(self):
        """pprint() and pretty_format() agree and produce unicode."""
        via_pprint = self.classifier.pprint(width=60)
        via_pretty_format = self.classifier.pretty_format(width=60)
        assert_true(isinstance(via_pprint, unicode))
        assert_equal(via_pprint, via_pretty_format)

    def test_repr(self):
        """repr() embeds the number of training instances."""
        actual = repr(self.classifier)
        template = "<DecisionTreeClassifier trained on {0} instances>"
        assert_equal(actual, template.format(len(train_set)))
   key = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', key)
   value = value.upper()
   training_data.append((unicode(key, 'utf-8', 'ignore'), value))



# Train the decision tree on the (text, value) tuples accumulated in training_data.
classifier = DecisionTreeClassifier(training_data)

# print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
# print(classifier.classify(u"Providence Health & Services in Oregon is notifying about 5,400 current and former patients that a former employee may have improperly accessed their patient records.Providence said in a statement Friday that it learned of the breach in May during an internal audit and had since fired the Portland-based employee.The audit found the worker had accessed health records between July 2012 and April 2016. It says the worker viewed demographic and medical treatment information, and may also have seen insurance information and Social Security numbers.".encode(encoding='ascii', errors='ignore')))

# Load the rows to classify; the first line of the CSV is treated as the header.
test_data_raw = pd.read_csv('Test_set.csv', header=0)

# Compiled once up front: the same URL-stripping pattern is applied to every row.
url_pattern = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')

test_data=[]
test_set=[]
print("Step in 1")
for key, row in test_data_raw.iterrows():
   # Take the first column by position (presumably the incident text - verify against the CSV).
   # .iloc keeps the lookup positional without relying on integer-key fallback of Series[...].
   item = row.iloc[0]
   item = unicode(item, 'utf-8', 'ignore')
   # Remove URLs so they do not contribute features to the classification.
   item = url_pattern.sub('', item)
   classification = classifier.classify(item)
   #test_data.append({'Incident': item,  'Classification' : classification})
   test_set.append((item, classification))
"""
with open('Classifier_results.csv', 'wb') as f:
   w = csv.DictWriter(f, fieldnames=['Incident', 'Classification'])
   w.writeheader()
   for row in test_data:
       w.writerow(row)
"""
# NOTE(review): every label in test_set is the classifier's own prediction for that item, so
# this accuracy compares the classifier with itself instead of with ground-truth labels.
# Supply the true labels from the CSV if a real evaluation is intended - confirm.
print(classifier.accuracy(test_set))
Beispiel #4
0
 def test_custom_feature_extractor(self):
     """A classifier built with a custom extractor keeps 'positive' as its first training label."""
     custom_cl = DecisionTreeClassifier(train_set, custom_extractor)
     custom_cl.classify("Yay! I'm so happy it works.")
     first_label = custom_cl.train_features[0][1]
     assert_equal(first_label, 'positive')
print()
# Decision Tree Classifier: time the training run, then time and score classification of
# every entry in `sentences` against the expected label at the same index in `categories`.
print("Training Decision Tree Classifier...")
start_dtc = time.time()
# Read the training file with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open('training.json', 'r', encoding='utf-8') as training:
    dtc = DecisionTreeClassifier(training, format="json")
stop_dtc = time.time()
print("Training Decision Tree Classifier completed...")
elapsed = stop_dtc - start_dtc
print("Training time (in seconds): " + str(elapsed))
print("Testing Decision Tree Classifier...")
correct = 0
start_dtc = time.time()
for i, sentence in enumerate(sentences):
    # Compare case-insensitively; both sides are stringified before lowering.
    category = str(dtc.classify(sentence)).lower()
    # Indexing (not zip) keeps the IndexError if categories is shorter than sentences.
    expected = str(categories[i]).lower()
    if category == expected:
        correct += 1
stop_dtc = time.time()
elapsed = stop_dtc - start_dtc
total = len(sentences)
print("Number of tests: " + str(total))
print("Correct tests: " + str(correct))
# An empty test set would otherwise raise ZeroDivisionError; report 0.0 in that case.
accuracy = correct / total if total else 0.0
print("Decision Tree Classifier accuracy: " + str(accuracy))
print("Testing time (in seconds): " + str(elapsed))

print()
# SVM Classifier
print("Training SVM Classifier...")
start_svm = time.time()
Beispiel #6
0

    # item = item.decode('ascii', errors="replace")
    # NOTE(review): exit('') is called here, so the statements below are not reached while this
    # call is in place (assuming `exit` is the interpreter's exit helper) - confirm it is
    # a leftover debugging stop.
    exit('')
    ## use the blob method as it is more convenient
    # unicode issues?
    blob = TextBlob(item)
    # Print every noun phrase found in the item.
    # NOTE(review): the loop variable `np` would shadow a numpy alias if the file uses one.
    for np in blob.noun_phrases:
        print (np)


    # Return value is discarded here.
    cl.accuracy(test)

    # test a new item  usage:
    newitem = 'dsdsjdlaskdjkl'
    cl.classify(newitem)

    # top five contributing feats
    cl.show_informative_features(5) 

    # get the label probability distribution with the prob_classify(text) method
    prob_dist = cl.prob_classify(newitem)
    prob_dist.max()
    # Probabilities of the "pos" and "neg" labels, rounded to two decimals.
    relevant = round(prob_dist.prob("pos"), 2)
    irrelevant = round(prob_dist.prob("neg"), 2)
    ##
    # method B - train with unigrams
    #
    cl.update(new_train) # can call it like this
    # Score the updated classifier on the original and the new test items together.
    accuracy = cl.accuracy(test + new_test)
        '', key)
    value = value.upper()
    training_data.append((unicode(key, 'utf-8', 'ignore'), value))

# Train the decision tree on the (text, value) tuples accumulated in training_data.
classifier = DecisionTreeClassifier(training_data)

# print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
# print(classifier.classify(u"Providence Health & Services in Oregon is notifying about 5,400 current and former patients that a former employee may have improperly accessed their patient records.Providence said in a statement Friday that it learned of the breach in May during an internal audit and had since fired the Portland-based employee.The audit found the worker had accessed health records between July 2012 and April 2016. It says the worker viewed demographic and medical treatment information, and may also have seen insurance information and Social Security numbers.".encode(encoding='ascii', errors='ignore')))

# Load the rows to classify; the first line of the CSV is treated as the header.
test_data_raw = pd.read_csv('Test_set.csv', header=0)

# Compiled once up front: the same URL-stripping pattern is applied to every row.
url_pattern = re.compile(
    r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')

test_data = []
test_set = []
print("Step in 1")
for key, row in test_data_raw.iterrows():
    # Take the first column by position (presumably the incident text - verify against the CSV).
    # .iloc keeps the lookup positional without relying on integer-key fallback of Series[...].
    item = row.iloc[0]
    item = unicode(item, 'utf-8', 'ignore')
    # Remove URLs so they do not contribute features to the classification.
    item = url_pattern.sub('', item)
    classification = classifier.classify(item)
    #test_data.append({'Incident': item,  'Classification' : classification})
    test_set.append((item, classification))
"""
with open('Classifier_results.csv', 'wb') as f:
   w = csv.DictWriter(f, fieldnames=['Incident', 'Classification'])
   w.writeheader()
   for row in test_data:
       w.writerow(row)
"""
# NOTE(review): every label in test_set is the classifier's own prediction for that item, so
# this accuracy compares the classifier with itself instead of with ground-truth labels.
# Supply the true labels from the CSV if a real evaluation is intended - confirm.
print(classifier.accuracy(test_set))
        training_data.append((key, value))
    else:
        test_data.append((key, value))
    count += 1

# Report the running row counter maintained while rows were split into training_data/test_data.
print "Training Set is Processed -> ", count

print "Learning in progress ..."
# Train on the training split only.
classifier = DecisionTreeClassifier(training_data)
print "Classifier Ready"

# Accuracy is measured on test_data before that data is handed to update() below.
print classifier.accuracy(test_data)
# NOTE(review): update() feeds test_data to the classifier as additional training data, so
# test_data is no longer held out for any evaluation after this point - confirm intended.
classifier.update(test_data)
# Classify one sample incident report. Encoding to ASCII with errors='ignore' drops the
# non-ASCII characters in the literal (the dashes around the hacker's name and the curly
# apostrophe) before the text reaches the classifier.
print(
    classifier.classify(
        u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities."
        .encode('ascii', errors='ignore')))
"""
print "Writing Results to csv"
flag = True
with open('Human_resultsOnly - Copy.csv') as inp:
    with open('Taxonomy output.csv', 'wb') as oup:
        reader = csv.reader(inp)
        writer = csv.writer(oup)
        for row in reader:
            if flag:
                row.append("Taxonomy")
                writer.writerow(row)
                flag = False
            else:
                item = row[12]
   value = row[1].upper()
   if count < 120:
      training_data.append((key, value))
   else:
      test_data.append((key, value))
   count+=1

# Report the running row counter maintained while rows were split into training_data/test_data.
print "Training Set is Processed -> ", count

print "Learning in progress ..."
# Train on the training split only.
classifier = DecisionTreeClassifier(training_data)
print "Classifier Ready"

# Accuracy is measured on test_data before that data is handed to update() below.
print classifier.accuracy(test_data)
# NOTE(review): update() feeds test_data to the classifier as additional training data, so
# test_data is no longer held out for any evaluation after this point - confirm intended.
classifier.update(test_data)
# Classify one sample incident report. Encoding to ASCII with errors='ignore' drops the
# non-ASCII characters in the literal (the dashes around the hacker's name and the curly
# apostrophe) before the text reaches the classifier.
print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
"""
print "Writing Results to csv"
flag = True
with open('Human_resultsOnly - Copy.csv') as inp:
    with open('Taxonomy output.csv', 'wb') as oup:
        reader = csv.reader(inp)
        writer = csv.writer(oup)
        for row in reader:
            if flag:
                row.append("Taxonomy")
                writer.writerow(row)
                flag = False
            else:
                item = row[12]
                item = unicode(item, 'utf-8', 'ignore')