def test_classifications(self):
    """Train on 70% of the corpus and print error tallies for the other 30%.

    Every ``model.Corpus`` row is read from ``sqlite:///data.db`` and turned
    into one document (``title + ' ' + text``) labelled with the row's
    ``category``.  The documents are split with ``test_size=0.3`` and a fixed
    ``random_state=0``, a ``main.Classifier`` is built from the training part,
    and each held-out document is classified.

    Four counters are printed, each followed by its share of the held-out set:

    * false positives -- the expected label is ``'good'`` but the classifier
      returned something else;
    * false negatives -- the expected label is not ``'good'`` but the
      classifier returned ``'good'``;
    * correct -- the labels match;
    * wrong -- both labels are not ``'good'`` and they differ.
    """
    engine = create_engine('sqlite:///data.db')
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        corpus = session.query(model.Corpus).all()
        # Pull the column values out while the session is still open.
        documents = [rec.title + ' ' + rec.text for rec in corpus]
        categories = [rec.category for rec in corpus]
    finally:
        # Always hand the connection back, even if the query raises.
        session.close()

    training_values, testing_values, training_targets, testing_targets = (
        cross_validation.train_test_split(
            documents, categories, test_size=0.3, random_state=0))
    classifier = main.Classifier(training_values, training_targets)

    false_positives = 0
    false_negatives = 0
    correct = 0
    wrong = 0
    for message_text, expected in zip(testing_values, testing_targets):
        # Only the first element of the result is compared with the label
        # (presumably the predicted category -- confirm against main.Classifier).
        classification = classifier.classify(message_text)[0]
        if expected == 'good' and classification != 'good':
            false_positives += 1
        elif expected != 'good' and classification == 'good':
            false_negatives += 1
        elif expected == classification:
            correct += 1
        else:
            wrong += 1

    # Float denominator keeps the ratios fractional under Python 2 as well.
    total = float(len(testing_values))
    print('{} false positives ({})'.format(false_positives, false_positives / total))
    print('{} false negatives ({})'.format(false_negatives, false_negatives / total))
    print('{} correct ({})'.format(correct, correct / total))
    print('{} wrong ({})'.format(wrong, wrong / total))
Example #2
0
 def test_classifications(self):
     """Train on 80% of the corpus and fail if held-out accuracy is too poor.

     Every ``model.Corpus`` row is read from ``sqlite:///data.db`` and turned
     into one document (``title + ' ' + text``) labelled with the row's
     ``category``.  The documents are split with ``test_size=0.2`` and a
     fixed ``random_state=0``, a ``main.Classifier`` is built from the
     training part, and each held-out document is classified.

     Counters (each printed with its share of the held-out set):

     * false positives -- expected ``'good'``, classified as something else
       (the message is printed for inspection);
     * false negatives -- expected something other than ``'good'``,
       classified as ``'good'``;
     * correct -- the labels match;
     * wrong -- both labels are not ``'good'`` and they differ (the message
       is printed for inspection).

     Raises:
         Exception: if the false-positive share exceeds 0.05, or otherwise
             if the correct share is below 0.6.
     """
     engine = create_engine('sqlite:///data.db')
     Session = sessionmaker(bind=engine)
     session = Session()
     try:
         corpus = session.query(model.Corpus).all()
         # Pull the column values out while the session is still open.
         documents = [rec.title + ' ' + rec.text for rec in corpus]
         categories = [rec.category for rec in corpus]
     finally:
         # Always hand the connection back, even if the query raises.
         session.close()

     training_values, testing_values, training_targets, testing_targets = (
         cross_validation.train_test_split(
             documents, categories, test_size=0.2, random_state=0))
     classifier = main.Classifier(training_values, training_targets)

     false_positives = 0
     false_negatives = 0
     correct = 0
     wrong = 0
     for message_text, expected in zip(testing_values, testing_targets):
         # Only the first element of the result is compared with the label
         # (presumably the predicted category -- confirm against
         # main.Classifier).
         classification = classifier.classify(message_text)[0]
         if expected == 'good' and classification != 'good':
             false_positives += 1
             print(message_text)
             print('[Suspected {}; actually good]'.format(classification))
             print('---')
         elif expected != 'good' and classification == 'good':
             false_negatives += 1
         elif expected == classification:
             correct += 1
         else:
             wrong += 1
             print(message_text)
             print('[Suspected {}; actually {}]'.format(
                 classification, expected))
             print('---')

     # Float denominator keeps the ratios fractional under Python 2 as well.
     total = float(len(testing_values))
     false_positive_rate = false_positives / total
     correct_rate = correct / total
     print('{} false positives ({})'.format(
         false_positives, false_positive_rate))
     print('{} false negatives ({})'.format(
         false_negatives, false_negatives / total))
     print('{} correct ({})'.format(correct, correct_rate))
     print('{} wrong ({})'.format(wrong, wrong / total))

     # The false-positive check comes first, so it wins when both fail.
     if false_positive_rate > 0.05:
         raise Exception('False positive rate too high!')
     elif correct_rate < 0.6:
         raise Exception('Correct identification rate too low!')
Example #3
0
    def makeDocx(self):
        """Classify every line, count the "N" ones and write the docx output.

        Each entry of ``self.lines`` is run through a fresh
        ``classifier.Classifier``; lines whose class is ``"N"`` are counted.
        The progress bar is advanced after each line and set to 100 at the
        end, ``createDocx`` is called with the last ``/``-separated component
        of ``self.inputFile``, and the count plus the total number of lines
        is shown in ``self.labelNumOfNature``.
        """
        self.progressBar.setProperty("value", 0)

        sentence_classifier = classifier.Classifier()
        nature_count = 0

        for index, sentence in enumerate(self.lines):
            prediction = sentence_classifier.predict(sentence)
            if sentence_classifier.getClass(prediction) == "N":
                nature_count += 1

            # Share of lines whose index precedes the current one, as a
            # percentage.
            percent_done = (index / len(self.lines)) * 100
            self.progressBar.setProperty("value", percent_done)

        self.progressBar.setProperty("value", 100)

        output_name = self.inputFile.split("/")[-1]
        sentence_classifier.createDocx(output_name)

        summary = ("Znaleziono " + str(nature_count) +
                   " zdań o przyrodzie\nz " +
                   str(len(self.lines)) + " wszytkich zdań")
        self.labelNumOfNature.setText(summary)
Example #4
0
import main
import model
import settings

# Hyper-parameter grid handed to GridSearchCV in the script entry point
# below: a single-element list whose one dict maps each parameter name to
# the list of candidate values to try.
param_grid = [
    dict(
        C=[1, 5, 10, 50],
        loss=['hinge', 'squared_hinge'],
        tol=[1e-6, 1e-4, 1e-2, 1e-1],
        multi_class=['ovr', 'crammer_singer'],
        class_weight=['balanced'],
    ),
]

if __name__ == '__main__':
    # Script entry point: load the whole corpus, vectorize it, grid-search
    # the classifier's hyper-parameters over ``param_grid`` and print the
    # best score together with the winning estimator's parameters.
    engine = create_engine(settings.DATABASE_URI)
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        data = session.query(model.Corpus).all()
        # Pull the column values out while the session is still open.
        data_values = [col.title + ' ' + col.text for col in data]
        data_targets = [col.category for col in data]
    finally:
        # Always hand the connection back, even if the query raises.
        session.close()

    classifier = main.Classifier()
    # Fit the vectorizer and keep its output instead of vectorizing the same
    # documents a second time with ``transform``.
    features = classifier.vectorizer.fit_transform(data_values)

    # n_jobs=-1 is passed through to GridSearchCV unchanged.
    grid_search = GridSearchCV(classifier.classifier, param_grid, n_jobs=-1)
    grid_search.fit(features, data_targets)

    print('Best score: {}'.format(grid_search.best_score_))
    parameters = grid_search.best_estimator_.get_params()
    for name, value in parameters.items():
        print("{} - {}".format(name, value))