import timeit

# Assumed imports: these helpers use pattern.vector's classifiers and kfoldcv.
from pattern.vector import kfoldcv, SVM, SLP, NB, KNN
from pattern.vector import CLASSIFICATION, POLYNOMIAL, MULTINOMIAL, COSINE


def kfold_svm(data):
    start = timeit.default_timer()
    accuracy, precision, recall, f1, stdev = kfoldcv(
        SVM, data, folds=10, type=CLASSIFICATION, kernel=POLYNOMIAL)
    stop = timeit.default_timer()
    print '*SVM*'
    print 'Accuracy: ' + str(accuracy)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1-score: ' + str(f1)
    print 'STDev: ' + str(stdev)
    print 'Time: ' + str(stop - start)
    print

def kfold_slp(data, itr=3):
    start = timeit.default_timer()
    accuracy, precision, recall, f1, stdev = kfoldcv(SLP, data, folds=10, iterations=itr)
    stop = timeit.default_timer()
    print '*SLP%d*' % itr  # Label reflects the iteration count instead of a hardcoded 3.
    print 'Accuracy: ' + str(accuracy)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1-score: ' + str(f1)
    print 'STDev: ' + str(stdev)
    print 'Time: ' + str(stop - start)
    print

def kfold_nb(data):
    start = timeit.default_timer()
    accuracy, precision, recall, f1, stdev = kfoldcv(NB, data, folds=10, method=MULTINOMIAL)
    stop = timeit.default_timer()
    print '*NB*'
    print 'Accuracy: ' + str(accuracy)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1-score: ' + str(f1)
    print 'STDev: ' + str(stdev)
    print 'Time: ' + str(stop - start)
    print

def kfold_knn(data, kk=9):
    start = timeit.default_timer()
    accuracy, precision, recall, f1, stdev = kfoldcv(KNN, data, folds=10, k=kk, distance=COSINE)
    stop = timeit.default_timer()
    print '*KNN%d*' % kk  # Label reflects the chosen k instead of a hardcoded 9.
    print 'Accuracy: ' + str(accuracy)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1-score: ' + str(f1)
    print 'STDev: ' + str(stdev)
    print 'Time: ' + str(stop - start)
    print

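# The four helpers above differ only in the classifier they benchmark.
# A minimal driver sketch; the reviews.csv path and the Document setup are
# assumptions (mirroring the review snippets further down), not part of
# the original script.
from pattern.db import csv
from pattern.vector import Document

data = csv('data/input/reviews.csv')  # Assumed input file.
data = [Document(review, type=int(rating) >= 3, stopwords=True)
        for review, rating in data]

kfold_nb(data)
kfold_knn(data, kk=9)
kfold_slp(data, itr=3)
kfold_svm(data)
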
def validate(trainingSet):
    # Displays as (accuracy, precision, recall, F1, stdev).
    print '\n10-fold cross validation results on training set:'
    print kfoldcv(NB, trainingSet, folds=10)
    print ''

print accuracy, precision, recall, f1

# Confusion matrix.
print nb.distribution
print nb.confusion_matrix(data[500:])
print nb.confusion_matrix(data[500:])(True)  # (TP, TN, FP, FN)

# Precision and recall, per class and overall.
print nb.test(data[500:], target=True)
print nb.test(data[500:], target=False)
print nb.test(data[500:])

# K-fold cross validation.
data = csv('data/input/reviews.csv')
data = [(review, int(rating) >= 3) for review, rating in data]
data = [Document(review, type=rating, stopwords=True) for review, rating in data]
print kfoldcv(NB, data, folds=10)
print kfoldcv(KNN, data, folds=10, k=3, distance=EUCLIDEAN)

# Feature selection: keep only lemmatized adjectives, nouns, verbs and "!".
def v(review):
    tree = parsetree(review, lemmata=True)[0]
    lemmata = [w.lemma for w in tree if w.tag.startswith(('JJ', 'NN', 'VB', '!'))]
    return count(lemmata)

data = csv('data/input/reviews.csv')
data = [(v(review), int(rating) >= 3) for review, rating in data]
print kfoldcv(NB, data)

data = csv('data/input/reviews.csv')

def vector(self, name):
    """ Returns a dictionary with character bigrams and suffix.
        For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
    """
    v = chngrams(name, n=2)
    v = count(v)
    v[name[-2:] + "$"] = 1
    v[len(name)] = 1
    return v

data = csv(pd("given-names.csv"))

# Test average (accuracy, precision, recall, F-score, standard deviation).
print(kfoldcv(GenderByName, data, folds=3))  # (0.81, 0.79, 0.77, 0.78, 0.00)

# Train and save the classifier in the current folder.
# With final=True, discards the original training data (= smaller file).
g = GenderByName(train=data)
g.save(pd("gender-by-name.svm"), final=True)

# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.
g = GenderByName.load(pd("gender-by-name.svm"))

for name in ("Felix", "Felicia", "Rover", "Kitty", "Legolas", "Arwen", "Jabba"):
    print(name, g.classify(name))

URL = re.compile(r"https?://[^\s]+")            # http://www.emrg.be
REF = re.compile(r"@[a-z0-9_./]+", flags=re.I)  # @tom_de_smedt

from pattern.db import Datasheet, pd

train = []
for name, alignment, tweet in Datasheet.load(pd("good-evil.csv")):
    tweet = URL.sub("http://", tweet)  # Anonymize URLs.
    tweet = REF.sub("@friend", tweet)  # Anonymize usernames.
    train.append((ngram_vector(tweet, 5), alignment))

# ------------------------------------------------------------------------------------
# Let's look at the statistical accuracy of the classifier:

print kfoldcv(SVM, train, folds=3)
print

# This returns an (accuracy, precision, recall, F1-score, stdev)-tuple.
# The F1-score is the most important.
# An SVM trained on our data would be 94.6% accurate in knowing good from evil
# (this is a suspiciously high accuracy).

# ------------------------------------------------------------------------------------

classifier = SVM(train)
print classifier.distribution
print

# This reveals that there are 13,000 good tweets, and 5,000 evil tweets.
# This means the classifier is biased to predict "good",
# simply because "good" examples far outnumber "evil" ones in the training data.

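# One way to correct this bias is to balance the classes before training.
# A minimal sketch, assuming the train list built above and that the
# alignment labels are the strings "good" and "evil":
import random

good = [x for x in train if x[1] == "good"]
evil = [x for x in train if x[1] == "evil"]

# Undersample the majority class so both alignments appear equally often.
n = min(len(good), len(evil))
balanced = random.sample(good, n) + random.sample(evil, n)
random.shuffle(balanced)

# Re-check the cross-validation scores on the balanced set.
print kfoldcv(SVM, balanced, folds=3)
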
# How many features were there before filtering?
print len(refinedfeatures)

cleanFeatures = [i for i in refinedfeatures if i not in avoidList]

# How many features remain after filtering out the avoid list?
print len(cleanFeatures)

# Redefine the model so it only contains the cleaned features.
model = model.filter(features=cleanFeatures)

# K-fold cross validation results; instead of Naive Bayes you can try SVM, SLP, KNN, etc.
print kfoldcv(NB, model)

# Collect the features and their information gain weights for a CSV file.
listofFeatures = []
for i in cleanFeatures:
    innerList = [i, model.ig(i)]
    listofFeatures.append(innerList)

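# The loop above collects the rows but stops before the actual write.
# A minimal completion using pattern.db's Datasheet; the output filename
# "feature-weights.csv" is an assumption, not from the original script.
from pattern.db import Datasheet, pd

Datasheet(rows=listofFeatures).save(pd("feature-weights.csv"))
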
    #print(vectors.vectors)
else:
    vectors = documents

if options["train"]:
    if classifier_type == "SVM":
        classifier = SVM(train=vectors, type=svm_type, kernel=svm_kernel)
    else:
        classifier = getattr(pattern.vector, classifier_type)(train=vectors)
    print("Classes: " + repr(classifier.classes))
    #performance = kfoldcv(NB, vectors, folds=n_fold)
    performance = kfoldcv(type(classifier), vectors, folds=n_fold)
    print("Accuracy: %.3f\n"
          "Precision: %.3f\n"
          "Recall: %.3f\n"
          "F1: %.3f\n"
          "Stddev: %.3f" % performance)
    print()
    print("Confusion matrix:")
    print(classifier.confusion_matrix(vectors).table)
    classifier.save(trained_filename)
elif options["predict"]:
    classifier = Classifier.load(trained_filename)
    print("#Author\tURL\tPrediction\tActual")
    for v in vectors:

def vector(self, name):
    """ Returns a dictionary with character bigrams and suffix.
        For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
    """
    v = chngrams(name, n=2)
    v = count(v)
    v[name[-2:] + "$"] = 1
    v[len(name)] = 1
    return v

data = csv("given-names.csv")

# Test average (accuracy, precision, recall, F-score).
print kfoldcv(GenderByName, data, folds=3)  # (0.81, 0.79, 0.77, 0.78)

# Train and save the classifier.
# With final=True, discards the original training data (= smaller file).
g = GenderByName(train=data)
g.save("gender-by-name.svm", final=True)

# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.
g = GenderByName.load("gender-by-name.svm")

for name in ("Felix", "Felicia", "Rover", "Kitty", "Legolas", "Arwen", "Jabba"):
    print name, g.classify(name)

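# The load() caveat above, in practice: a second script can only call load()
# after re-declaring or importing the GenderByName class. A minimal sketch,
# where the module name "genderbyname" is an assumption:
from genderbyname import GenderByName

g = GenderByName.load("gender-by-name.svm")
print g.classify("Felix")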