Exemple #1
0
def get_classifier():
    """Return the decision-tree classifier, training it only when needed.

    If ``output/classifier-random-tree.pickle`` exists, the classifier is
    obtained from ``load_classifier()`` (presumably reading that pickle --
    verify against its definition). Otherwise a new
    ``DecisionTreeClassifier`` is trained from ``output/train-even.csv`` and
    handed to ``save_classifier()`` before being returned.

    Returns:
        The loaded or freshly trained classifier.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('getting classifier')
    if os.path.isfile('output/classifier-random-tree.pickle'):
        return load_classifier()

    print('creating classifier')
    with open('output/train-even.csv', 'r') as trainingFile:
        cl = DecisionTreeClassifier(trainingFile, format="csv")
    # The training file is no longer needed once the classifier is built,
    # so it is closed before the model is saved.
    save_classifier(cl)
    return cl
def classifier(agent_name):
    """Train a decision-tree intent classifier for one agent and persist it.

    Every intent document stored for ``agent_name`` is fetched, each user
    expression is paired with the name of its intent, a
    ``DecisionTreeClassifier`` is built from those pairs, and the model is
    dumped with joblib to ``/home/dev/Botzup/Botzup/<agent_name>.pkl``.
    """
    intent_docs = list(db.intents.find({'agent_name': agent_name}))

    # intent name -> its expressions; a later document carrying the same
    # intent name replaces the expressions of the earlier one.
    expressions_by_intent = {}
    for doc in intent_docs:
        expressions_by_intent[doc['intent_name']] = doc['user_expressions']

    # expression -> intent name; an expression listed under several intents
    # keeps the last intent visited.
    intent_by_expression = {}
    for intent_name, expressions in expressions_by_intent.items():
        for expression in expressions:
            intent_by_expression[expression] = intent_name

    samples = list(intent_by_expression.items())
    model = DecisionTreeClassifier(samples)
    joblib.dump(model, '/home/dev/Botzup/Botzup/' + agent_name + '.pkl')
Exemple #3
0
def train(vine_data, data):
    """Train a decision-tree classifier on a 70/30 split and report accuracy.

    ``data`` is shuffled in place (the caller's list is reordered). The
    training set is all of ``vine_data`` plus enough leading items of the
    shuffled ``data`` to reach 70% of the combined size; the remaining items
    of ``data`` are used to measure accuracy. The boundary size is appended
    to ``../data/accurate.txt`` and the accuracy is printed.

    Args:
        vine_data: Labelled samples that always go into the training set.
        data: Labelled samples split between training and evaluation.

    Returns:
        The trained ``DecisionTreeClassifier``.
    """
    shuffle(data)
    print(len(vine_data))
    print(len(data))
    rate = 0.7
    boundary = int(rate * (len(vine_data) + len(data)))
    # When vine_data alone exceeds the boundary the difference is negative;
    # an unclamped negative slice index would count from the END of data and
    # quietly train on most of it. Clamp so that case trains on vine_data
    # only and evaluates on all of data.
    split = max(0, boundary - len(vine_data))

    # Only the log write needs the file; do not hold it open while training.
    with open('../data/accurate.txt', 'a') as f:
        f.write('\ntotal size: ' + str(boundary) + '\n')

    cl2 = DecisionTreeClassifier(vine_data + data[:split])
    accurate2 = cl2.accuracy(data[split:])
    print(accurate2)
    return cl2
class TestTreeTextClasiffier(luigi.Task):
    """Luigi task that trains a decision-tree sentiment classifier and prints its accuracy.

    The task reads the labelled tweets produced by ``GenerateTextByLang``
    for ``lang``, keeps the ones tagged POS or NEG, trains a
    ``DecisionTreeClassifier`` on the first 1600 shuffled samples and prints
    the accuracy measured on the samples at positions 1600-1999.

    NOTE(review): ``run`` never writes the target returned by ``output`` --
    confirm whether the classifier was meant to be persisted there.
    """
    lang = luigi.Parameter()

    def output(self):
        """Return the local target ``<abs path>/Data/tree_<lang>.clasi``."""
        conf = Conf()
        path = conf.getAbsPath()
        return luigi.LocalTarget('%s/Data/tree_%s.clasi' % (path, self.lang))

    def requires(self):
        """Require the generated text for this task's language."""
        return [GenerateTextByLang(self.lang)]

    def run(self):
        """Build the training set from the input tweets, train, and print accuracy."""
        ficheroTweets = None
        for target in self.input():
            # Inputs whose path contains "check" are not the tweet file
            # (presumably Doc2Vec checkpoint markers -- verify against
            # GenerateTextByLang); nothing from them is used here.
            if "check" not in target.path:
                ficheroTweets = target.path

        lab = LabeledLineSentence(ficheroTweets, ides="String")
        all_train = []
        for tweet in lab:
            label = tweet.tags[0]
            if "POS" not in label and "NEG" not in label:
                continue

            phrase = ' '.join(str(x) for x in tweet.words)
            try:
                text = phrase.encode('ascii', 'ignore')
            except UnicodeError:
                # Best effort: a tweet that cannot be converted is skipped
                # instead of aborting the whole run.
                continue
            # The sentiment is the part of the tag before the first "_".
            all_train.append((text, label.split("_")[0]))

        # Fixed-size experiment: at most the first 2000 shuffled samples are
        # used, 80% of them for training and the rest for evaluation.
        leng = 2000
        train = int(leng * 0.80)
        shuffle(all_train)

        cl = DecisionTreeClassifier(all_train[:train])
        print(cl.accuracy(all_train[train:leng]))
Exemple #5
0
 def test_custom_feature_extractor(self):
     """A classifier built with ``custom_extractor`` still labels its first training feature 'positive'."""
     tree = DecisionTreeClassifier(train_set, custom_extractor)
     # Classify once before inspecting the stored training features.
     tree.classify("Yay! I'm so happy it works.")
     first_label = tree.train_features[0][1]
     assert_equal(first_label, 'positive')
Exemple #6
0
 def setUp(self):
     """Create the ``DecisionTreeClassifier`` under test from ``train_set``."""
     self.classifier = DecisionTreeClassifier(train_set)
Exemple #7
0

# Renumber the rows of every split from 0 so that positional indices line up
# with the index labels used below (y_train[i]).
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# NOTE(review): `train` is written out BEFORE it is rebuilt from
# X_train/y_train just below, so the CSV (and therefore the classifier)
# reflects whatever `train` held earlier -- confirm this order is intended.
# The context manager guarantees the file is closed even if the write fails.
with open("train.csv", "w") as var:
    train_csv = train.to_csv(index=False)
    var.write(train_csv)

train = pd.concat([X_train['text'], y_train], axis=1)
with open('train.csv', 'r') as fp2:
    cl2 = DecisionTreeClassifier(fp2, format="csv")


# Classify every training text with cl2 and keep the numeric prediction
# alongside the features computed by feats().
pred_train = []
feature_train = []
true_train = train.label
for instance in train.text:
    feature_train.append(feats(instance))
    blob = TextBlob(instance, classifier=cl2)
    pred_train.append(int(float(blob.classify())))


# Count the predictions that match the training labels.
count = 0
for i, predicted in enumerate(pred_train):
    if predicted == y_train[i]:
        count = count + 1
    test = list(zip(test_data, test_target))

    start_time = time.time()
    cl = NaiveBayesClassifier(train)
    # Compute accuracy
    print("NaiveBayes Accuracy: {0}".format(cl.accuracy(test)))

    # Show 10 most informative features
    cl.show_informative_features(10)
    print(cl.informative_features(10))
    elapsed_time = time.time() - start_time
    print(elapsed_time)

    if (not ignoreDT):
        start_time = time.time()
        cl = DecisionTreeClassifier(train)
        print("DecisionTree Accuracy: {0}".format(cl.accuracy(test)))
        print(cl.pseudocode())
        elapsed_time = time.time() - start_time
        print(elapsed_time)

    start_time = time.time()
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline

    class StemmedCountVectorizer(CountVectorizer):
        """``CountVectorizer`` whose analyzer stems every token it yields."""

        def build_analyzer(self):
            """Return the parent analyzer wrapped so each token goes through ``stemmer.stem``."""
            base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

            def stemmed_analyzer(doc):
                return [stemmer.stem(token) for token in base_analyzer(doc)]

            return stemmed_analyzer
# print training_data_raw
# Pattern matching URLs embedded in the text; matches are removed below.
URL_PATTERN = re.compile(
    r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')

# First column -> second column of the raw frame; a repeated first-column
# value keeps the entry from the last row that carries it.
training_data_set = {}
for _, record in training_data_raw.iterrows():
    training_data_set[record[0]] = record[1]

# Build (text, LABEL) pairs: URLs stripped, label upper-cased, text decoded
# from UTF-8 with undecodable bytes ignored.
training_data = []
for raw_text, raw_label in training_data_set.iteritems():
    stripped = URL_PATTERN.sub('', raw_text)
    label = raw_label.upper()
    training_data.append((unicode(stripped, 'utf-8', 'ignore'), label))

classifier = DecisionTreeClassifier(training_data)

# print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
# print(classifier.classify(u"Providence Health & Services in Oregon is notifying about 5,400 current and former patients that a former employee may have improperly accessed their patient records.Providence said in a statement Friday that it learned of the breach in May during an internal audit and had since fired the Portland-based employee.The audit found the worker had accessed health records between July 2012 and April 2016. It says the worker viewed demographic and medical treatment information, and may also have seen insurance information and Social Security numbers.".encode(encoding='ascii', errors='ignore')))

test_data_raw = pd.read_csv('Test_set.csv', header=0)

test_data = []
test_set = []
# Parenthesised single-argument print is valid on both Python 2 and 3.
print("Step in 1")

# Compile the URL pattern once instead of passing the literal to re.sub()
# for every row.
URL_PATTERN = re.compile(
    r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')

for key, row in test_data_raw.iterrows():
    # First column holds the text: decode it from UTF-8 (ignoring bad
    # bytes), then strip any URLs.
    # NOTE(review): `unicode` exists only on Python 2.
    item = row[0]
    item = unicode(item, 'utf-8', 'ignore')
    item = URL_PATTERN.sub('', item)
Exemple #10
0
def _read_labeled_samples(path, label, limit):
    """Read up to ``limit`` ``(text, label)`` samples from one emotion file.

    For each line the first five characters and the final character are
    dropped; the sample text is whatever precedes the first occurrence of
    ``label`` in the remainder, with leading whitespace stripped.
    """
    samples = []
    with open(path, 'r', encoding="utf8") as handle:
        for line in handle:
            if len(samples) == limit:
                break
            text = line[5:-1].split(label)[0].lstrip()
            samples.append((text, label))
    return samples


def Train(samples_per_label=20):
    """Train a decision-tree emotion classifier, report accuracy, and pickle it.

    Up to ``samples_per_label`` samples are read for each of the four
    emotions (anger, fear, joy, sadness) from the training files, and the
    same number from the test files. A ``DecisionTreeClassifier`` is trained
    on the former, its accuracy on the latter is printed, and the model is
    pickled to ``H:\\Project_CDAC\\Models\\model_NB.pkl``.

    Args:
        samples_per_label: Maximum number of samples taken from each file.
    """
    train_files = (
        ('anger', "H:\\EmotionDetection\\Saif mohammad\\taining set\\anger-ratings-0to1.txt"),
        ('fear', "H:\\EmotionDetection\\Saif mohammad\\taining set\\fear-ratings-0to1.txt"),
        ('joy', "H:\\EmotionDetection\\Saif mohammad\\taining set\\joy-ratings-0to1.txt"),
        ('sadness', "H:\\EmotionDetection\\Saif mohammad\\taining set\\sadness-ratings-0to1.txt"),
    )
    # NOTE(review): only the anger test file comes from the
    # "with intensity labels" directory -- confirm that is intended.
    test_files = (
        ('anger', "H:\\EmotionDetection\\Saif mohammad\\test set\\with intensity labels\\anger.txt"),
        ('fear', "H:\\EmotionDetection\\Saif mohammad\\test set\\without intensity labels\\fear.txt"),
        ('joy', "H:\\EmotionDetection\\Saif mohammad\\test set\\without intensity labels\\joy.txt"),
        ('sadness', "H:\\EmotionDetection\\Saif mohammad\\test set\\without intensity labels\\sadness.txt"),
    )

    train = []
    for label, path in train_files:
        train.extend(_read_labeled_samples(path, label, samples_per_label))

    test = []
    for label, path in test_files:
        test.extend(_read_labeled_samples(path, label, samples_per_label))

    model = DecisionTreeClassifier(train)
    # The model is a decision tree; the message previously said Naive Bayes.
    print("accuracy label of Decision Tree Classifier:{:.4f}".format(model.accuracy(test)))
    print("Training completed....")

    # The output file name is kept unchanged (it still says "NB") because
    # other code may load the model from this path.
    with open("H:\\Project_CDAC\\Models\\model_NB.pkl", "wb") as fp:
        pickle.dump(model, fp)
    print("Serialization of model completed")
# trains = []
# for i in range(int(size)):
#     trains.append(train[i])
# for i in range(250, int(size)+250):
#     trains.append(train[i])

# Train on the full training set.
trains = train

# `choice` selects which classifier is trained on `trains` and scored
# against `test`; anything other than "1".."4" is rejected.
if choice not in ("1", "2", "3", "4"):
    print("Bad input!")

elif choice == "1":
    print("\n" + "#NaiveBayesClassifier")
    cl1 = NaiveBayesClassifier(trains)
    _accuracy = cl1.accuracy(test)
    print("Classifier: Naive Bayes -- Accuracy: ", _accuracy, "\n")

elif choice == "2":
    print("\n" + "#DecisionTreeClassifier")
    cl2 = DecisionTreeClassifier(trains)
    _accuracy = cl2.accuracy(test)
    print("Classifier: Decision Tree -- Accuracy: ", _accuracy, "\n")

elif choice == "3":
    print("\n" + "#MaxEntClassifier")
    cl3 = MaxEntClassifier(trains)
    _accuracy = cl3.accuracy(test)
    print("Classifier: Maximum Entropy -- Accuracy: ", _accuracy, "\n")

else:
    print("\n" + "#NLTKClassifier")
    cl4 = NLTKClassifier(trains)
    _accuracy = cl4.accuracy(test)
    print("Classifier: NLTK -- Accuracy: ", _accuracy, "\n")
training_array_stemmed_without_sw = data.get_training_array_stemmed_without_sw(data)
# print('training_array_stemmed_without_sw')
# print(training_array_stemmed_without_sw)

# Fetch and echo the three variants of the test data.
test_array = data.get_test_array(data)
print('test_array')
print(test_array)
test_array_without_sw = data.get_test_array_without_sw(data)
print('test_array_without_sw')
print(test_array_without_sw)
test_array_stemmed_without_sw = data.get_test_array_stemmed_without_sw(data)
print('test_array_stemmed_without_sw')
print(test_array_stemmed_without_sw)

# Decision tree: train on each variant of the training data in turn and run
# classify_review() on the resulting classifier.
print('\n************ DecisionTreeClassifier ********************\n')
for heading, samples in (
        ('Before pre-processing \n', training_array),
        ('\n After removing stop-words \n', training_array_without_sw),
        ('\n After stemming \n', training_array_stemmed_without_sw),
):
    print(heading)
    cl = DecisionTreeClassifier(samples)
    classify_review(cl)

# Naive Bayes: same procedure for the first two variants.
print('\n ************ NaiveBayesClassifier ********************\n')
for heading, samples in (
        ('Before pre-processing\n', training_array),
        ('\n After removing stop-words \n', training_array_without_sw),
):
    print(heading)
    cl = NaiveBayesClassifier(samples)
    classify_review(cl)
print('\n After stemming \n')
Exemple #13
0
           ('Fantastic Mr Fox is an awesome film!', 'neg'),
           ('Dragonball Evolution is simply terrible!!', 'pos')]
"""
TextBlob provides a built-in classifiers module for creating custom classifiers.
So, let’s quickly import it and create a basic classifier.
"""

# Train a Naive Bayes classifier on `training` and print its accuracy on
# `testing`.
from textblob.classifiers import NaiveBayesClassifier
classifier = NaiveBayesClassifier(training)
print(classifier.accuracy(testing))
"""
classifier.show_informative_features()
classifier.show_informative_features(3)

As we can see, if the text contains “is”,
then there is a high probability that the statement will be negative.
"""

# Run the classifier over a handful of example sentences; the returned
# labels are not stored or printed.
for sentence in (
        "the weather is terrible!",
        "I am very happy today",
        "This book was so helpful",
        "I'm excited to try my new classifier.",
        "This is an awesome library!",
):
    classifier.classify(sentence)

# Decision tree classifier: train on the same `training` data and print its
# accuracy on `testing`.
from textblob.classifiers import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(training)
print(dt_classifier.accuracy(testing))

###############################################################################