def AdaBoostClassifier_predict(features, classes, unknown):
    """ Provides the most likely author for each unknown text """
    # Reduce/normalize both known and unknown texts with the same preprocessor
    # (fitted on the training features) so they live in the same space.
    preprocessor = Feature_Preprocessor(features, True, False, 30)
    train_vectors = preprocessor.batch_normalize(features)
    test_vectors = preprocessor.batch_normalize(unknown)
    classifier = AdaBoostClassifier(
        n_estimators=10,
        learning_rate=0.998,
        algorithm='SAMME.R',
        random_state=1,
    )
    classifier.fit(train_vectors, classes)
    return classifier.predict(test_vectors)
def DecisionTreeClassifier_predict(features, classes, unknown):
    """ Provides the most likely author for each unknown text """
    # Same preprocessor is applied to training and unknown texts so both
    # are projected into the same 30-component space.
    preprocessor = Feature_Preprocessor(features, True, True, 30)
    train_vectors = preprocessor.batch_normalize(features)
    test_vectors = preprocessor.batch_normalize(unknown)
    classifier = DecisionTreeClassifier(
        criterion='entropy',
        min_samples_split=2,
        splitter='best',
    )
    classifier.fit(train_vectors, classes)
    return classifier.predict(test_vectors)
def KNeighborsClassifier_predict(features, classes, unknown):
    """ Provides the most likely author for each unknown text """
    # Normalize known and unknown texts with one preprocessor fitted on the
    # training features, so neighbour distances are computed consistently.
    preprocessor = Feature_Preprocessor(features, True, False, 30)
    train_vectors = preprocessor.batch_normalize(features)
    test_vectors = preprocessor.batch_normalize(unknown)
    # p=1 with the Minkowski metric is Manhattan distance.
    classifier = KNeighborsClassifier(
        n_neighbors=4,
        weights='distance',
        algorithm='brute',
        metric='minkowski',
        p=1,
    )
    classifier.fit(train_vectors, classes)
    return classifier.predict(test_vectors)
def SVM_predict(features, classes, unknown):
    """ Provides the most likely author for each unknown text """
    preprocessor = Feature_Preprocessor(features, True, False, 30)
    train_vectors = preprocessor.batch_normalize(features)
    test_vectors = preprocessor.batch_normalize(unknown)
    # gamma is scaled by the *normalized* feature dimensionality
    # (i.e. the width of the vectors after preprocessing).
    classifier = SVC(
        kernel='rbf',
        C=2.4,
        degree=1,
        gamma=0.7 / len(train_vectors[0]),
    )
    classifier.fit(train_vectors, classes)
    return classifier.predict(test_vectors)
def AdaBoostClassifier_predict_texttype(features, classes, unknown):
    """ Predicts the type of a text (binary classification)
        Parameters optimized for natural vs obfuscated. """
    from sklearn.ensemble import AdaBoostClassifier
    preprocessor = Feature_Preprocessor(features, True, False, 30)
    train_vectors = preprocessor.batch_normalize(features)
    test_vectors = preprocessor.batch_normalize(unknown)
    # More estimators than the authorship variant: tuned for this binary task.
    classifier = AdaBoostClassifier(
        n_estimators=80,
        learning_rate=0.998,
        algorithm='SAMME.R',
        random_state=1,
    )
    classifier.fit(train_vectors, classes)
    return classifier.predict(test_vectors)
def SVM_predict_rank(features, classes, unknown, actual_classes):
    """ Provides a ranking of the different authors by likelihood of having
        authored each unknown text.

        For each unknown text, returns the position (0-based) of the true
        author in the list of candidate authors sorted by descending
        log-probability. actual_classes must be parallel to unknown.
    """
    FP = Feature_Preprocessor(features, True, False, 30)
    features = FP.batch_normalize(features)
    unknown = FP.batch_normalize(unknown)
    clf = SVC(probability=True, kernel='rbf', C=2.4, degree=1,
              gamma=0.7 / len(features[0]))
    clf.fit(features, classes)
    # For each unknown text: pair every candidate class with its log
    # probability, sort candidates best-first, and record where the true
    # author landed in that ordering.  (Rewritten from a nested map/zip/
    # lambda chain; this form also works on Python 3, where map/zip return
    # one-shot iterators.)
    rankings = []
    for log_probs, actual in zip(clf.predict_log_proba(unknown), actual_classes):
        scored = sorted(zip(clf.classes_, log_probs),
                        key=lambda pair: pair[1], reverse=True)
        ordered_classes = [label for label, _ in scored]
        rankings.append(ordered_classes.index(actual))
    return rankings
return set_f, set_c if __name__ == '__main__': from matplotlib import pyplot as plt print "Loading data.." from feature_extraction.Cached_Features import data print "Normalizing..." # Select features data = data_select_specific_features(data, ['bi_char_dist', 'legomena', 'word_length', 'tri_char_dist', 'mono_tag_dist', 'sentence_length', 'readability']) # Get the data separated in features and classes features, classes = get_feature_vectors_from_data(data) # Compres the features to two numbers (points) FP = Feature_Preprocessor(features, False, True, 2) features = FP.batch_normalize(features) print "Data processed, now plotting..." # Convert a list of points to two lists of x and y points (fortran style) x = [ p[0] for p in features ] y = [ p[1] for p in features ] # Split into points of interest and normal points (obfuscated texts ad natural texts) interest = [ p for p in zip(x,y,classes) if p[2] == 1 ] normal = [ p for p in zip(x,y,classes) if p[2] == 0 ] # scatterplot the points norml = plt.scatter(zip(*normal)[0], zip(*normal)[1], marker='x', c='b', s=40)