def test_classifications(self):
    """Train on a 70/30 split of the stored corpus and print error rates.

    Loads every ``model.Corpus`` row from ``sqlite:///data.db``, joins each
    row's title and text into one document, holds out 30% of the documents
    (``random_state=0`` keeps the split reproducible), trains a
    ``main.Classifier`` on the remainder and classifies every held-out
    document.

    Each prediction lands in exactly one bucket:

    * false positive -- the document is labelled ``'good'`` but was
      classified as something else;
    * false negative -- the document is not ``'good'`` but was classified
      as ``'good'``;
    * correct -- the predicted category equals the label;
    * wrong -- neither side is ``'good'`` and the categories differ.

    The four counts are printed together with their share of the held-out
    set. Nothing is asserted, so this method only fails if one of the calls
    it makes raises.
    """
    engine = create_engine('sqlite:///data.db')
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        corpus = session.query(model.Corpus).all()
        # Copy plain values out of the rows while the session is still open,
        # so nothing below depends on a live database connection.
        documents = [rec.title + ' ' + rec.text for rec in corpus]
        labels = [rec.category for rec in corpus]
    finally:
        # Always release the connection, even if the query fails.
        session.close()

    training_values, testing_values, training_targets, testing_targets = (
        cross_validation.train_test_split(
            documents, labels, test_size=0.3, random_state=0))
    classifier = main.Classifier(training_values, training_targets)

    false_positives = 0
    false_negatives = 0
    correct = 0
    wrong = 0
    for message_text, expected in zip(testing_values, testing_targets):
        # Element 0 of classify()'s result is what gets compared to the label.
        classification = classifier.classify(message_text)[0]
        if expected == 'good' and classification != 'good':
            false_positives += 1
        elif expected != 'good' and classification == 'good':
            false_negatives += 1
        elif expected == classification:
            correct += 1
        else:
            wrong += 1

    total = len(testing_values)
    print('{} false positives ({})'.format(
        false_positives, float(false_positives) / total))
    print('{} false negatives ({})'.format(
        false_negatives, float(false_negatives) / total))
    print('{} correct ({})'.format(correct, float(correct) / total))
    print('{} wrong ({})'.format(wrong, float(wrong) / total))
def test_classifications(self):
    """Check classifier quality on a held-out 20% of the stored corpus.

    Loads every ``model.Corpus`` row from ``sqlite:///data.db``, joins each
    row's title and text into one document, holds out 20% of the documents
    (``random_state=0`` keeps the split reproducible), trains a
    ``main.Classifier`` on the remainder and classifies every held-out
    document.

    Each prediction lands in exactly one bucket:

    * false positive -- the document is labelled ``'good'`` but was
      classified as something else (the message is printed);
    * false negative -- the document is not ``'good'`` but was classified
      as ``'good'``;
    * correct -- the predicted category equals the label;
    * wrong -- neither side is ``'good'`` and the categories differ (the
      message is printed).

    The four counts are printed together with their share of the held-out
    set.

    Raises:
        Exception: if more than 5% of the held-out documents are false
            positives, or -- checked only when that rate is acceptable --
            if fewer than 60% are classified correctly.
    """
    engine = create_engine('sqlite:///data.db')
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        corpus = session.query(model.Corpus).all()
        # Copy plain values out of the rows while the session is still open,
        # so nothing below depends on a live database connection.
        documents = [rec.title + ' ' + rec.text for rec in corpus]
        labels = [rec.category for rec in corpus]
    finally:
        # Always release the connection, even if the query fails.
        session.close()

    training_values, testing_values, training_targets, testing_targets = (
        cross_validation.train_test_split(
            documents, labels, test_size=0.2, random_state=0))
    classifier = main.Classifier(training_values, training_targets)

    false_positives = 0
    false_negatives = 0
    correct = 0
    wrong = 0
    for message_text, expected in zip(testing_values, testing_targets):
        # Element 0 of classify()'s result is what gets compared to the label.
        classification = classifier.classify(message_text)[0]
        if expected == 'good' and classification != 'good':
            false_positives += 1
            print(message_text)
            print('[Suspected {}; actually good]'.format(classification))
            print('---')
        elif expected != 'good' and classification == 'good':
            false_negatives += 1
        elif expected == classification:
            correct += 1
        else:
            wrong += 1
            print(message_text)
            print('[Suspected {}; actually {}]'.format(
                classification, expected))
            print('---')

    # Compute each share once; the same numbers feed the report and the
    # pass/fail thresholds.
    total = len(testing_values)
    false_positive_rate = float(false_positives) / total
    false_negative_rate = float(false_negatives) / total
    correct_rate = float(correct) / total
    wrong_rate = float(wrong) / total

    print('{} false positives ({})'.format(
        false_positives, false_positive_rate))
    print('{} false negatives ({})'.format(
        false_negatives, false_negative_rate))
    print('{} correct ({})'.format(correct, correct_rate))
    print('{} wrong ({})'.format(wrong, wrong_rate))

    if false_positive_rate > 0.05:
        raise Exception('False positive rate too high!')
    elif correct_rate < 0.6:
        raise Exception('Correct identification rate too low!')
def makeDocx(self):
    """Classify every loaded line, then export a document and a summary.

    Creates a ``classifier.Classifier``, runs ``predict`` followed by
    ``getClass`` on each entry of ``self.lines`` and counts the entries
    whose class is ``"N"``. The progress bar's ``"value"`` property is reset
    to 0 first, updated after every line and set to 100 when the loop ends.
    Afterwards ``createDocx`` is called with the last ``/``-separated
    component of ``self.inputFile``, and the summary text (in Polish) with
    both counts is written to ``self.labelNumOfNature``.
    """
    progress = self.progressBar
    progress.setProperty("value", 0)

    sentence_classifier = classifier.Classifier()
    total = len(self.lines)
    matches = 0
    for index, sentence in enumerate(self.lines):
        label = sentence_classifier.getClass(
            sentence_classifier.predict(sentence))
        if label == "N":
            matches += 1
        # The index starts at 0, so the loop alone never reports 100.
        progress.setProperty("value", (index / total) * 100)
    progress.setProperty("value", 100)

    file_name = self.inputFile.split("/")[-1]
    sentence_classifier.createDocx(file_name)

    # NOTE(review): "wszytkich" looks like a typo for "wszystkich"; it is
    # left untouched here because it is user-facing text.
    summary = "".join([
        "Znaleziono ",
        str(matches),
        " zdań o przyrodzie\nz ",
        str(total),
        " wszytkich zdań",
    ])
    self.labelNumOfNature.setText(summary)
import main
import model
import settings

# Hyper-parameter combinations explored by the grid search below.
# NOTE(review): the keys look like scikit-learn LinearSVC parameters --
# confirm against the estimator that main.Classifier builds.
param_grid = [
    {
        'C': [1, 5, 10, 50],
        'loss': ['hinge', 'squared_hinge'],
        'tol': [1e-6, 1e-4, 1e-2, 1e-1],
        'multi_class': ['ovr', 'crammer_singer'],
        'class_weight': ['balanced'],
    }
]

# The script body stays inline under the guard instead of in a ``main()``
# function, because that name would shadow the imported ``main`` module.
if __name__ == '__main__':
    engine = create_engine(settings.DATABASE_URI)
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        data = session.query(model.Corpus).all()
        # Copy plain values out of the rows while the session is still open.
        data_values = [col.title + ' ' + col.text for col in data]
        data_targets = [col.category for col in data]
    finally:
        # Always release the connection, even if the query fails.
        session.close()

    classifier = main.Classifier()
    # NOTE(review): the result of fit_transform() is discarded and the same
    # data is transformed again below; if the vectorizer is a scikit-learn
    # vectorizer the first result could be reused -- confirm before changing.
    classifier.vectorizer.fit_transform(data_values)

    # n_jobs=-1 asks the search to use all available CPU cores.
    grid_search = GridSearchCV(classifier.classifier, param_grid, n_jobs=-1)
    grid_search.fit(classifier.vectorizer.transform(data_values),
                    data_targets)

    print('Best score: {}'.format(grid_search.best_score_))
    parameters = grid_search.best_estimator_.get_params()
    for name, value in parameters.items():
        print("{} - {}".format(name, value))