Example #1
0
def train(vine_data, data, rate=0.7):
    """Train a decision-tree classifier and report its hold-out accuracy.

    ``data`` is shuffled in place.  The training set is all of ``vine_data``
    plus as many leading items of the shuffled ``data`` as are needed to
    reach ``rate`` of the combined sample count; the remaining items of
    ``data`` are used to measure accuracy.

    Args:
        vine_data: list of samples that always go into the training set
            (presumably ``(text, label)`` pairs as accepted by
            ``DecisionTreeClassifier`` -- verify against the caller).
        data: list of samples in the same format; shuffled in place and
            then split into a training part and a hold-out part.
        rate: fraction of ``len(vine_data) + len(data)`` to train on.
            Defaults to 0.7, the value that used to be hard-coded.

    Returns:
        The trained ``DecisionTreeClassifier``.
    """
    shuffle(data)
    print(len(vine_data))
    print(len(data))
    boundary = int(rate * (len(vine_data) + len(data)))
    # Number of items taken from ``data`` to top the training set up to
    # ``boundary``.  Clamp at zero: when ``vine_data`` alone already exceeds
    # the boundary, a negative index would slice from the END of ``data`` and
    # silently push even more samples into the training set.
    split = max(0, boundary - len(vine_data))
    # NOTE(review): the value logged as "total size" is the training-set
    # boundary, not the total sample count; label kept for log compatibility.
    with open('../data/accurate.txt', 'a') as f:
        f.write('\ntotal size: ' + str(boundary) + '\n')
    # Training does not need the log file, so it runs after the file is closed.
    cl2 = DecisionTreeClassifier(vine_data + data[:split])
    accurate2 = cl2.accuracy(data[split:])
    print(accurate2)
    return cl2
Example #2
0
class TestDecisionTreeClassifier(unittest.TestCase):
    """Exercise DecisionTreeClassifier against the module-level fixtures."""

    def setUp(self):
        # A fresh classifier per test so test_update cannot leak state.
        self.classifier = DecisionTreeClassifier(train_set)

    def test_classify(self):
        # Classifying must return the expected label and leave the stored
        # training set untouched.
        assert_equal(
            self.classifier.classify("I feel happy this morning"),
            'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_accuracy(self):
        # Accuracy is reported as a float.
        assert_true(isinstance(self.classifier.accuracy(test_set), float))

    def test_update(self):
        # Updating with one sample grows the training set by exactly one.
        size_before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        size_after = len(self.classifier.train_set)
        assert_equal(size_before + 1, size_after)

    def test_custom_feature_extractor(self):
        # A user-supplied extractor is accepted and the labels survive
        # feature extraction.
        custom = DecisionTreeClassifier(train_set, custom_extractor)
        custom.classify("Yay! I'm so happy it works.")
        first_features = custom.train_features[0]
        assert_equal(first_features[1], 'positive')

    def test_pseudocode(self):
        # The pseudocode dump contains at least one branch.
        assert_true("if" in self.classifier.pseudocode())

    def test_pretty_format(self):
        # pprint() and pretty_format() yield the same unicode text.
        options = {'width': 60}
        printed = self.classifier.pprint(**options)
        formatted = self.classifier.pretty_format(**options)
        assert_true(isinstance(printed, unicode))
        assert_equal(printed, formatted)

    def test_repr(self):
        # repr() mentions how many instances the classifier was trained on.
        expected = "<DecisionTreeClassifier trained on {0} instances>".format(
            len(train_set))
        assert_equal(repr(self.classifier), expected)
Example #3
0
class TestDecisionTreeClassifier(unittest.TestCase):
    """Unit tests for DecisionTreeClassifier built on the shared fixtures."""

    def setUp(self):
        # Rebuild the classifier before every test so each starts clean.
        self.classifier = DecisionTreeClassifier(train_set)

    def test_classify(self):
        # The predicted label is checked first, then that classification did
        # not alter the size of the stored training set.
        predicted = self.classifier.classify("I feel happy this morning")
        stored = self.classifier.train_set
        assert_equal(predicted, 'positive')
        assert_equal(len(stored), len(train_set))

    def test_accuracy(self):
        # accuracy() must hand back a float.
        score = self.classifier.accuracy(test_set)
        assert_true(isinstance(score, float))

    def test_update(self):
        # One extra sample means one extra entry in train_set.
        before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        assert_equal(before + 1, len(self.classifier.train_set))

    def test_custom_feature_extractor(self):
        # Construction and classification work with a custom extractor, and
        # the first training sample keeps its label.
        classifier = DecisionTreeClassifier(train_set, custom_extractor)
        classifier.classify("Yay! I'm so happy it works.")
        _, label = classifier.train_features[0][0], classifier.train_features[0][1]
        assert_equal(label, 'positive')

    def test_pseudocode(self):
        # The generated pseudocode includes an "if".
        pseudocode = self.classifier.pseudocode()
        assert_true("if" in pseudocode)

    def test_pretty_format(self):
        # Both formatting entry points agree and return unicode.
        via_pprint = self.classifier.pprint(width=60)
        via_pretty_format = self.classifier.pretty_format(width=60)
        assert_true(isinstance(via_pprint, unicode))
        assert_equal(via_pprint, via_pretty_format)

    def test_repr(self):
        # repr() reports the number of training instances.
        template = "<DecisionTreeClassifier trained on {0} instances>"
        assert_equal(repr(self.classifier), template.format(len(train_set)))
   value = value.upper()
   training_data.append((unicode(key, 'utf-8', 'ignore'), value))



classifier = DecisionTreeClassifier(training_data)

# print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
# print(classifier.classify(u"Providence Health & Services in Oregon is notifying about 5,400 current and former patients that a former employee may have improperly accessed their patient records.Providence said in a statement Friday that it learned of the breach in May during an internal audit and had since fired the Portland-based employee.The audit found the worker had accessed health records between July 2012 and April 2016. It says the worker viewed demographic and medical treatment information, and may also have seen insurance information and Social Security numbers.".encode(encoding='ascii', errors='ignore')))

# Load the evaluation incidents; header=0 means the first line of the CSV
# supplies the column names.
test_data_raw = pd.read_csv('Test_set.csv', header=0)

# NOTE(review): test_data is only referenced by the commented-out append and
# the disabled CSV export below, so it stays empty.
test_data=[]
test_set=[]
print "Step in 1"
for key, row in test_data_raw.iterrows():
   # Only the first column of each row is read.
   item = row[0]
   # Python 2: decode the raw bytes as UTF-8, dropping undecodable bytes.
   item = unicode(item, 'utf-8', 'ignore')
   # Remove every substring that matches the URL pattern before classifying.
   item = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', item)
   classification = classifier.classify(item)
   #test_data.append({'Incident': item,  'Classification' : classification})
   # The label stored here is the classifier's own prediction.
   test_set.append((item, classification))
# Disabled CSV export, kept as a bare string literal so it never executes.
"""
with open('Classifier_results.csv', 'wb') as f:
   w = csv.DictWriter(f, fieldnames=['Incident', 'Classification'])
   w.writeheader()
   for row in test_data:
       w.writerow(row)
"""
# NOTE(review): every label in test_set was produced by this same classifier
# in the loop above, so this measures the classifier against its own
# predictions rather than against ground-truth labels -- TODO confirm that
# this is intended.
print classifier.accuracy(test_set)
Example #5
0
print(len(words), len(tags))

# Pair the first SAMPLE_LIMIT words with their tags; the first TRAIN_SIZE
# pairs are appended to ``train`` and the rest to ``test``.  Slicing (instead
# of indexing 0..999) keeps the script working when fewer samples exist.
SAMPLE_LIMIT = 1000
TRAIN_SIZE = 800
samples = list(zip(words[:SAMPLE_LIMIT], tags[:SAMPLE_LIMIT]))
train.extend(samples[:TRAIN_SIZE])
test.extend(samples[TRAIN_SIZE:])
print(train)
print(test)

# Train the three classifiers on the same data and compare their accuracy.
naive = NaiveBayesClassifier(train)
dtc = DecisionTreeClassifier(train)
mec = MaxEntClassifier(train)

print("NaiveBayesClassifier Accuracy: {0}".format(naive.accuracy(test)))
print("DecisionTreeClassifier Accuracy: {0}".format(dtc.accuracy(test)))
print("MaxEntClassifier Accuracy: {0}".format(mec.accuracy(test)))

# A second, independent Naive Bayes instance is used for the per-sample
# predictions below (kept separate from ``naive`` as in the original script).
cl = NaiveBayesClassifier(train)
print("NaiveBayesClassifier Accuracy: {0}".format(cl.accuracy(test)))
for i, (text, _label) in enumerate(test):
    # classify() expects the text only; passing the whole (text, label) pair
    # would feed the gold label to the feature extractor as if it were a token.
    tag = cl.classify(text)
    pred_tags.append(tag)
    if tag == test_tags[i]:
        count += 1
print(len(pred_tags), len(test_tags))
print(count)
Example #6
0
            if(param[1] not in ['bigrams','trigrams']):
                test.append( (doc[param[0]][param[1]], str(doc['class'])) )
            else:
                # join the ngrams together so we can use them
                ngrams = join_ngrams(doc[param[0]][param[1]])
                test.append( (ngrams, str(doc['class'])) )


        cl = DecisionTreeClassifier(train)
        type = 'DecisionTree'
        # cl = NaiveBayesClassifier(train)
        # type = 'NaiveBayes'

        # wraps NLTK simply: return nltk.classify.accuracy(self.classifier, 
        # test_features) 
        acc = cl.accuracy(test) * 100
        print('Classifier Type      | ', type, ' with ', '.'.join(param))
        print('Accuracy, train/test | ', '=',  str(acc), '% ,', len(train), \
            '/', len(test))
        #cl.show_informative_features(30)
        print ('\n')
        print ('\n')


    # item = item.decode('ascii', errors="replace")
    exit('')
    ## use the blob method as it is more convenient
    # unicode issues?
    blob = TextBlob(item)
    for np in blob.noun_phrases:
        print (np)
Example #7
0
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'), ('He is my sworn enemy!', 'neg'),
         ('My boss is horrible.', 'neg')]
# Hand-written evaluation sentences with their expected sentiment labels.
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg'),
]

# Initial model built from the hand-written training sentences.
cl = DecisionTreeClassifier(train)

# Collect every movie review as a (word list, category) pair.
reviews = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        reviews.append((list(movie_reviews.words(fileid)), category))
random.shuffle(reviews)

# After shuffling, the first 100 reviews extend the training data and the
# following 50 extend the evaluation data.
new_train = reviews[:100]
new_test = reviews[100:150]

# Feed the additional training data to the classifier.
cl.update(new_train)

# Score the classifier on the hand-written and the review samples together.
accuracy = cl.accuracy(test + new_test)
print("Accuracy: {0}".format(accuracy))

# Show 5 most informative features
#cl.show_informative_features(5)

#save_classifier = open("naivebayes.pickle","wb")
#p.dump(cl, save_classifier)
#save_classifier.close()
    value = value.upper()
    training_data.append((unicode(key, 'utf-8', 'ignore'), value))

classifier = DecisionTreeClassifier(training_data)

# print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
# print(classifier.classify(u"Providence Health & Services in Oregon is notifying about 5,400 current and former patients that a former employee may have improperly accessed their patient records.Providence said in a statement Friday that it learned of the breach in May during an internal audit and had since fired the Portland-based employee.The audit found the worker had accessed health records between July 2012 and April 2016. It says the worker viewed demographic and medical treatment information, and may also have seen insurance information and Social Security numbers.".encode(encoding='ascii', errors='ignore')))

# Load the evaluation incidents; header=0 means the first line of the CSV
# supplies the column names.
test_data_raw = pd.read_csv('Test_set.csv', header=0)

# NOTE(review): test_data is only referenced by the commented-out append and
# the disabled CSV export below, so it stays empty.
test_data = []
test_set = []
print "Step in 1"
for key, row in test_data_raw.iterrows():
    # Only the first column of each row is read.
    item = row[0]
    # Python 2: decode the raw bytes as UTF-8, dropping undecodable bytes.
    item = unicode(item, 'utf-8', 'ignore')
    # Remove every substring that matches the URL pattern before classifying.
    item = re.sub(
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        '', item)
    classification = classifier.classify(item)
    #test_data.append({'Incident': item,  'Classification' : classification})
    # The label stored here is the classifier's own prediction.
    test_set.append((item, classification))
# Disabled CSV export, kept as a bare string literal so it never executes.
"""
with open('Classifier_results.csv', 'wb') as f:
   w = csv.DictWriter(f, fieldnames=['Incident', 'Classification'])
   w.writeheader()
   for row in test_data:
       w.writerow(row)
"""
# NOTE(review): every label in test_set was produced by this same classifier
# in the loop above, so this measures the classifier against its own
# predictions rather than against ground-truth labels -- TODO confirm that
# this is intended.
print classifier.accuracy(test_set)
Example #9
0
def _load_labelled_lines(path, label, limit):
    """Return up to ``limit`` ``(text, label)`` samples read from ``path``.

    One sample is produced per line of the file.  The text is the line
    without its first five characters and its final character, cut off at
    the first occurrence of ``label`` and stripped of leading whitespace.
    """
    samples = []
    # ``with`` guarantees the handle is closed even if decoding fails.
    with open(path, 'r', encoding="utf8") as handle:
        for line in handle:
            if len(samples) == limit:
                break
            # NOTE(review): presumably the five-character prefix is a record
            # id and the label word starts the trailing columns -- a text that
            # itself contains the label word is truncated there; confirm
            # against the data files.
            text = line[5:-1].split(label)[0].lstrip()
            samples.append((text, label))
    return samples


def Train(samples_per_label=20):
    """Train a DecisionTreeClassifier on emotion-labelled text and pickle it.

    Reads up to ``samples_per_label`` lines per emotion (anger, fear, joy,
    sadness) from the training files and from the test files, trains the
    classifier on the former, prints its accuracy on the latter and
    serialises the model to ``H:\\Project_CDAC\\Models\\model_NB.pkl``.

    Args:
        samples_per_label: maximum number of lines taken from each file.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        None.  Results are printed and written to disk.
    """
    train_files = (
        ("H:\\EmotionDetection\\Saif mohammad\\taining set\\anger-ratings-0to1.txt", 'anger'),
        ("H:\\EmotionDetection\\Saif mohammad\\taining set\\fear-ratings-0to1.txt", 'fear'),
        ("H:\\EmotionDetection\\Saif mohammad\\taining set\\joy-ratings-0to1.txt", 'joy'),
        ("H:\\EmotionDetection\\Saif mohammad\\taining set\\sadness-ratings-0to1.txt", 'sadness'),
    )
    # NOTE(review): anger is read from "with intensity labels" while the other
    # three come from "without intensity labels" -- confirm this is intended.
    test_files = (
        ("H:\\EmotionDetection\\Saif mohammad\\test set\\with intensity labels\\anger.txt", 'anger'),
        ("H:\\EmotionDetection\\Saif mohammad\\test set\\without intensity labels\\fear.txt", 'fear'),
        ("H:\\EmotionDetection\\Saif mohammad\\test set\\without intensity labels\\joy.txt", 'joy'),
        ("H:\\EmotionDetection\\Saif mohammad\\test set\\without intensity labels\\sadness.txt", 'sadness'),
    )

    train = []
    for path, label in train_files:
        train.extend(_load_labelled_lines(path, label, samples_per_label))

    test = []
    for path, label in test_files:
        test.extend(_load_labelled_lines(path, label, samples_per_label))

    model = DecisionTreeClassifier(train)
    # NOTE(review): the message and the pickle file name say "Naive Bayes"/"NB"
    # although the model is a DecisionTreeClassifier; both strings are left
    # unchanged so existing log readers and model loaders keep working.
    print("accuracy label of Naive Bayes Classifier:{:.4f}".format(model.accuracy(test)))
    print("Training completed....")
    # Serialise the trained model; ``with`` closes the file even if dump fails.
    with open("H:\\Project_CDAC\\Models\\model_NB.pkl", "wb") as fp:
        pickle.dump(model, fp)
    print("Serialization of model completed")
Example #10
0
# for i in range(int(size)):
#     trains.append(train[i])
# for i in range(250, int(size)+250):
#     trains.append(train[i])

# Train on the complete training set; the sub-sampling above is disabled.
trains = train

# Build the classifier selected by ``choice`` and report its accuracy on
# ``test``.
if choice == "1":
    print("\n" + "#NaiveBayesClassifier")
    cl1 = NaiveBayesClassifier(trains)
    accuracy = cl1.accuracy(test)
    print("Classifier: Naive Bayes -- Accuracy: ", accuracy, "\n")

elif choice == "2":
    print("\n" + "#DecisionTreeClassifier")
    cl2 = DecisionTreeClassifier(trains)
    accuracy = cl2.accuracy(test)
    print("Classifier: Decision Tree -- Accuracy: ", accuracy, "\n")

elif choice == "3":
    print("\n" + "#MaxEntClassifier")
    cl3 = MaxEntClassifier(trains)
    accuracy = cl3.accuracy(test)
    print("Classifier: Maximum Entropy -- Accuracy: ", accuracy, "\n")

elif choice == "4":
    print("\n" + "#NLTKClassifier")
    cl4 = NLTKClassifier(trains)
    accuracy = cl4.accuracy(test)
    print("Classifier: NLTK -- Accuracy: ", accuracy, "\n")

else:
    print("Bad input!")
    blob = TextBlob(unicode(key, 'utf-8', 'ignore'))
    key = ' '.join(blob.noun_phrases)
    value = row[1].upper()
    if count < 120:
        training_data.append((key, value))
    else:
        test_data.append((key, value))
    count += 1

# Report how many rows the loop above distributed over the two data sets.
print "Training Set is Processed -> ", count

# Train on the rows collected in training_data.
print "Learning in progress ..."
classifier = DecisionTreeClassifier(training_data)
print "Classifier Ready"

# Accuracy is measured on the held-out rows first ...
print classifier.accuracy(test_data)
# ... and only afterwards are those rows added to the classifier's training
# data, so the reported accuracy is not affected by them.
classifier.update(test_data)
print(
    classifier.classify(
        u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities."
        .encode('ascii', errors='ignore')))
"""
print "Writing Results to csv"
flag = True
with open('Human_resultsOnly - Copy.csv') as inp:
    with open('Taxonomy output.csv', 'wb') as oup:
        reader = csv.reader(inp)
        writer = csv.writer(oup)
        for row in reader:
            if flag:
                row.append("Taxonomy")
Example #12
0
           ('Fantastic Mr Fox is an awesome film!', 'neg'),
           ('Dragonball Evolution is simply terrible!!', 'pos')]
"""
TextBlob provides a built-in classifiers module for creating custom classifiers.
So, let’s quickly import it and create a basic classifier.
"""

from textblob.classifiers import NaiveBayesClassifier

# Fit a Naive Bayes model on the labelled examples, then print how well it
# does on the held-out ones.
classifier = NaiveBayesClassifier(training)
nb_accuracy = classifier.accuracy(testing)
print(nb_accuracy)
"""
classifier.show_informative_features()
classifier.show_informative_features(3)

As we can see, if the text contains “is”,
then there is a high probability that the statement will be negative.
"""

# Run the trained classifier over a handful of unseen sentences.
# NOTE: the predicted labels are not stored or printed, so outside an
# interactive session these calls produce no visible output.
for sentence in (
    "the weather is terrible!",
    "I am very happy today",
    "This book was so helpful",
    "I'm excited to try my new classifier.",
    "This is an awesome library!",
):
    classifier.classify(sentence)

## decision tree classifier
from textblob.classifiers import DecisionTreeClassifier

# Same data, different model: train a decision tree and print its accuracy.
dt_classifier = DecisionTreeClassifier(training)
dt_accuracy = dt_classifier.accuracy(testing)
print(dt_accuracy)

###############################################################################
   blob = TextBlob(unicode(key, 'utf-8', 'ignore'))
   key = ' '.join(blob.noun_phrases)
   value = row[1].upper()
   if count < 120:
      training_data.append((key, value))
   else:
      test_data.append((key, value))
   count+=1

# Report how many rows the loop above distributed over the two data sets.
print "Training Set is Processed -> ", count

# Train on the rows collected in training_data.
print "Learning in progress ..."
classifier = DecisionTreeClassifier(training_data)
print "Classifier Ready"

# Accuracy is measured on the held-out rows first ...
print classifier.accuracy(test_data)
# ... and only afterwards are those rows added to the classifier's training
# data, so the reported accuracy is not affected by them.
classifier.update(test_data)
print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
"""
print "Writing Results to csv"
flag = True
with open('Human_resultsOnly - Copy.csv') as inp:
    with open('Taxonomy output.csv', 'wb') as oup:
        reader = csv.reader(inp)
        writer = csv.writer(oup)
        for row in reader:
            if flag:
                row.append("Taxonomy")
                writer.writerow(row)
                flag = False
            else: