def classification(model, get_features = True, train_bow = True):
    """Fit ``model`` on bag-of-words features and return test-set scores.

    Args:
        model: Estimator exposing ``fit(X, y)`` (returning an object with
            ``predict_proba``), as used below.
        get_features: When truthy, ``transform_data`` is run for both the
            training and the test CSV before anything else.
        train_bow: When truthy, the vocabulary is fitted on the training CSV.

    Returns:
        Column 1 of ``predict_proba`` evaluated on the transformed test set.
    """
    root_path = '/home/mitya/Documents/CMake/mlschool/mlschool_01/'
    csv_train = root_path + 'final_train.csv'
    csv_test = root_path + 'final_test.csv'
    features_folder = './arrays/'

    csv_reader = pd.read_csv(csv_train, sep = ',')
    # NOTE(review): the meaning of the numeric arguments passed to bow.bow,
    # fit and transform (10, 100, 20, 10) is defined by the bow module -- confirm.
    vocabulary = bow.bow('bag_of_words', 'kmeans', 10)

    # Truthiness instead of `== True`, so any truthy flag enables the step.
    if get_features:
        vocabulary.transform_data(csv_train, root_path + 'train_dataset/', features_folder)
        vocabulary.transform_data(csv_test, root_path + 'test_dataset/', features_folder)
    if train_bow:
        vocabulary.fit(csv_train, features_folder, 100)

    X = vocabulary.transform(csv_train, features_folder, 20)
    # Labels are cut to the number of rows actually present in X.
    y = csv_reader['image_label'].values[:X.shape[0]].ravel()

    # Fit first, then transform the test set (same order as the chained call).
    fitted = model.fit(X, y)
    X_test = vocabulary.transform(csv_test, features_folder, 10)
    # presumably column 1 is the positive-class probability -- verify against model
    preds = fitted.predict_proba(X_test)[:, 1]

    #csv_reader = pd.read_csv(csv_test, sep = ',').drop('image_url', 1)
    #csv_reader['image_label'] = preds
    #csv_reader.to_csv(root_path + '/res.csv', index = False)
    return preds
def getBowNeighbors(img, ka=1000):
    """Score randomly sampled training images against ``img`` and rank them.

    ``ka`` random files from ``train/data/`` are read with ``cv.imread`` and
    compared to ``img`` through ``bow``; the file names are returned sorted by
    ascending score. Sampling is with replacement, so a file drawn twice keeps
    only its latest score and the result can hold fewer than ``ka`` names.
    """
    path = 'train/data/'
    candidates = os.listdir(path)
    scores = {}

    for step in range(1, ka + 1):
        # Progress report every 100 samples.
        if step % 100 == 0:
            print('Done {0:d}/{1:d}'.format(step, ka))

        # NOTE(review): the upper bound excludes the last two directory
        # entries from ever being sampled -- confirm this is intended.
        pick = np.random.randint(len(candidates) - 2)
        name = candidates[pick]
        neighbour = cv.imread(path + name)
        scores[name] = bow(img, neighbour)

    ranked = list(scores)
    ranked.sort(key=scores.__getitem__)
    return ranked
def extra(text,n,op):
    """Run the routine selected by ``op`` and return its result.

    Codes 1 (``bow``) and 9 (``tfidf``) receive the raw ``text``; codes 2-8
    (``lexs``, ``luhn``, ``lsa``, ``textrank``, ``sumbasic``, ``klsum``,
    ``reduction``) receive a ``PlaintextParser`` built from ``text``. ``n`` is
    forwarded unchanged. Any other ``op`` yields ``None``.
    """
    # The parser is always built, even for the text-only routines.
    parser = PlaintextParser(text, Tokenizer(language))

    # Ordered (code, thunk) pairs; thunks keep each routine lazily resolved.
    choices = (
        (1, lambda: bow(text, n)),
        (2, lambda: lexs(parser, n)),
        (3, lambda: luhn(parser, n)),
        (4, lambda: lsa(parser, n)),
        (5, lambda: textrank(parser, n)),
        (6, lambda: sumbasic(parser, n)),
        (7, lambda: klsum(parser, n)),
        (8, lambda: reduction(parser, n)),
        (9, lambda: tfidf(text, n)),
    )
    for code, run in choices:
        if op == code:
            return run()
    return None
def bow_proxy(tu):
    """Call ``bow.bow`` with the items of ``tu`` as positional arguments.

    presumably exists so a single-argument callable can be handed to a
    mapping API -- verify against the caller.
    """
    positional = tu
    result = bow.bow(*positional)
    return result
def bow_proxy(tu):
    """Call ``bow.bow`` with the items of ``tu`` as positional arguments."""
    positional = tu
    return bow.bow(*positional)


if __name__ == "__main__":
    cars_dir = "PNGImages/cars"
    cows_dir = "PNGImages/cows"
    bikes_dir = "PNGImages/bikes"
    image_dirs = [cars_dir, cows_dir, bikes_dir]

    # Generate descriptors: one directory per task (chunksize 1), and block
    # until every task has finished.
    p = Pool(3)
    results = p.map_async(gen_dir, image_dirs, 1)
    results.get()

    # Find means from a sample drawn across the three directories.
    files = descriptor.select_sample(cars_dir, cows_dir, bikes_dir)
    centers = bow.kmeans(files)
    wrapped_centers = []
    for i, c in enumerate(centers):
        wrapped_centers.append(CentroidWrapper(c.tolist(), i))

    # Construct the kd-tree over the wrapped centers, rebalancing if needed.
    tree = kdtree.create(point_list=wrapped_centers, dimensions=128)
    if not tree.is_balanced:
        tree = tree.rebalance()

    p.close()
    p.join()

    # Calculate bow for every image directory, in the original order.
    for image_dir in image_dirs:
        bow.bow(image_dir, tree)
import euclidean

path = 'F:/Kuliah/STKI/3-11-2017 kuis/text files/'

# Load and preprocess every .txt file found in the corpus directory.
articles = {}
for item in os.listdir(path):
    if item.endswith(".txt"):
        with open(path + "/" + item, 'r',encoding='utf-8') as file:
            articles[item] = preprocessing.preprotext(file.read())

# bag-of-words representation
list_of_bow = []
for key, value in articles.items():
    list_of_bow.append(bow.bow(value.split()))

# build the matrix
matrix_akhir = matrix.matrix(list_of_bow)

dokumen= ['bk.txt', 'ed.txt', 'ot.txt', 'en.txt', 'lf.txt', 'bl.txt', 'tk.txt']
dokumens = {}
#----------------------------------------------------------------------
# Iterate the fixed document list directly. Wrapping this in a loop over the
# directory listing (with the same loop variable) would only repeat the very
# same reads once per directory entry.
for item in dokumen:
    with open(path + "/" + item, 'r', encoding ='utf-8') as file:
        dokumens[item] = preprocessing.preprotext(file.read())

# bag-of-words representation
import longsword
import greatsword
import bow
import axe
import mace

# Create a bow and print its stats (the three values are passed as strings).
Quickshot = bow.bow('10', '25', '50')
Quickshot.print_stats()

# Create a mace and print its stats.
Bludgeoner = mace.mace('10', '10', '5')
Bludgeoner.print_stats()
def _fit_and_predict(model, X_tr_n, y_tr, X_te_n, round_predictions=False):
    """Fit ``model`` on the training data and predict the train and test sets.

    When ``round_predictions`` is true both prediction arrays are passed
    through ``np.round`` (used for the regression models).

    Returns:
        The tuple ``(y_tr_p, y_te_p)``.
    """
    model.fit(X_tr_n, y_tr)
    y_tr_p = model.predict(X_tr_n)
    y_te_p = model.predict(X_te_n)
    if round_predictions:
        y_tr_p = np.round(y_tr_p)
        y_te_p = np.round(y_te_p)
    return y_tr_p, y_te_p


def models(X_tr_n, y_tr, X_te_n, classifier):
    """Train the model named by ``classifier`` and return its test predictions.

    Args:
        X_tr_n: Training features (presumably normalised -- confirm with caller).
        y_tr: Training labels.
        X_te_n: Test features, same representation as ``X_tr_n``.
        classifier: One of ``"c_svm"``, ``"c_svm_l1"``, ``"log_reg"``,
            ``"c_svm_param"``, ``"knn"``, ``"naive_bayes"``, ``"ols"``,
            ``"ridge_reg"``, ``"lasso"``, ``"adaboost"``, ``"random_forest"``,
            ``"nn"`` or ``"bow"``.

    Returns:
        Predictions for the test set, or ``False`` (after printing a message)
        when ``classifier`` is not one of the names above. Before returning
        predictions, the training labels and training predictions are handed
        to ``print_accuracy``.
    """
    # Factories are lambdas so that an estimator is only constructed (and its
    # class name only looked up) for the classifier actually selected.
    classifier_factories = {
        # Original note: C SVM - Accuracy - 0.44503
        "c_svm": lambda: SVC(),
        # Original note: C SVM L1 - Accuracy - 0.44503
        "c_svm_l1": lambda: LinearSVC(penalty='l1', dual=False),
        "log_reg": lambda: linear_model.LogisticRegression(),
        "knn": lambda: KNeighborsClassifier(n_neighbors=20),
        "naive_bayes": lambda: GaussianNB(),
        "adaboost": lambda: AdaBoostClassifier(
            linear_model.SGDClassifier(n_iter=50),
            n_estimators=100, learning_rate=1, algorithm="SAMME"),
    }
    # Regression models: their continuous outputs are rounded afterwards.
    regressor_factories = {
        "ols": lambda: linear_model.LinearRegression(),
        "ridge_reg": lambda: linear_model.Ridge(alpha=0.001),
        "lasso": lambda: linear_model.Lasso(alpha=.15, max_iter=-1),
    }

    if classifier in classifier_factories:
        y_tr_p, y_te_p = _fit_and_predict(
            classifier_factories[classifier](), X_tr_n, y_tr, X_te_n)
    elif classifier in regressor_factories:
        y_tr_p, y_te_p = _fit_and_predict(
            regressor_factories[classifier](), X_tr_n, y_tr, X_te_n,
            round_predictions=True)
    elif classifier == "c_svm_param":
        # Original note: C SVM Param - Accuracy - 0.50164
        # grid_search hands back the model used for prediction; no separate
        # fit call is made here.
        # Previously hard-coded alternative: SVC(C=10, kernel='rbf', gamma=0.001)
        model = grid_search(X_tr_n, y_tr)
        print("Best params = ")
        print(model.best_params_)
        y_tr_p = model.predict(X_tr_n)
        y_te_p = model.predict(X_te_n)
    elif classifier == "random_forest":
        # Grid search over the forest's depth, feature count and size.
        clf = RandomForestClassifier(n_jobs=3)
        param_grid = {"max_depth": [10, 20, 30],
                      "max_features": [50, 100, 200],
                      "n_estimators": [10, 50, 100]}
        model = GridSearchCV(clf, param_grid=param_grid)
        model.fit(X_tr_n, y_tr)
        print(model.best_params_)
        y_tr_p = model.predict(X_tr_n)
        y_te_p = model.predict(X_te_n)
    elif classifier == "nn":
        # NOTE(review): X_tr and X_te are not parameters of this function;
        # they must exist as module-level names -- confirm.
        y_tr_p, y_te_p = keras_CNN(X_tr, y_tr, X_te)
    elif classifier == "bow":
        # Features are rebuilt from data/train and data/test instead of using
        # X_tr_n / X_te_n; the second value returned by read_X_full_res is
        # not used.
        X_tr_full_res, s = read_X_full_res('data/train')
        X_te_full_res, s = read_X_full_res('data/test')
        bow_obj = bow(kmeans_K = 100)
        X_bow_tr = bow_obj.fit_predict(X_tr_full_res)
        X_bow_te = bow_obj.predict(X_te_full_res)
        y_tr_p, y_te_p = _fit_and_predict(SVC(), X_bow_tr, y_tr, X_bow_te)
    else:
        print("No Classifier selected")
        return False

    print_accuracy(y_tr, y_tr_p, "Training")
    return y_te_p
#A = [0, 0, 5, 3, 5, 2, 0, 1, 0, 0, 0]
#B = [0, 2, 1, 0, 1, 0, 3, 0, 1, 0, 0]
#
#print (euclidean(A,B))

# Load and preprocess every .txt file found under `path`.
articles = {}
for item in os.listdir(path):
    if not item.endswith(".txt"):
        continue
    with open(path + "/" + item, 'r', encoding="utf-8") as file:
        raw_text = file.read()
    articles[item] = preprocessing.preprotext(raw_text)

# bag-of-words representation
list_of_bow = []
for key, value in articles.items():
    tokens = value.split()
    list_of_bow.append(bow.bow(tokens))

# build the matrix
matrix_akhir = matrix.matrix(list_of_bow)

#print (matrix_akhir)
#print (euclidean(matrix_akhir[0], matrix_akhir[3]))
#jarak = {}
#for key, vektor in zip(articles.keys(), matrix_akhir):
#    jarak[key] = euclidean.euclidean(matrix_akhir[0], vektor)
#
#print (jarak)
#jarak = []
#