def main(met_train, met_test, aqi_train, aqi_test, test):
    """Interactive menu: train the chosen classifier, then print predictions.

    met_* are the meteorological feature splits, aqi_* the AQI label splits,
    and `test` the sample(s) to predict on.  Loops until option 5 (exit).
    """
    # Menu choice -> classifier factory; each returns (model, accuracy).
    dispatch = {
        1: Classifiers.Random_Forest_Classifier,
        2: Classifiers.KNN,
        3: Classifiers.SVM,
        4: Classifiers.Decision_tree,
    }
    while True:
        try:
            ch = int(
                input("\n\nchose among the following classifier\n"
                      "1.Rnadom Forrest\n"
                      "2.K-NN\n"
                      "3.SVM\n"
                      "4.Decision Tree\n"
                      "5.exit\n"))
        except ValueError:
            # Non-numeric input previously crashed with ValueError;
            # just re-show the menu instead.
            continue
        if ch == 5:
            break
        if ch in dispatch:
            model, accuracy = dispatch[ch](met_train, met_test,
                                           aqi_train, aqi_test)
            print(model.predict(test))
            print(accuracy)
def clf_vote(dataVolume, no_rand_rate=0):
    """Majority-vote ensemble (SVM, RF, DT, MLP) on synthetic data.

    Returns (prediction_time_seconds, accuracy) on a 50% held-out split.
    """
    # Features: weekday(2) weather(2) time(6) heading(4) direction to BS(6) previous BS(6)
    data = createData.createData(dataVolume)
    x, y = data[:, :-1], data[:, -1]
    # Optionally inject label noise wherever the label is divisible by no_rand_rate.
    if no_rand_rate != 0:
        for idx in range(len(y)):
            if y[idx] % no_rand_rate == 0:
                y[idx] = random.randint(1, 6)
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.5, random_state=42)
    svm_clf = Classifiers.svm_estimator(X_train, y_train)
    RF_clf = Classifiers.RandomForest_best_estimator(X_train, y_train)
    DT_clf = Classifiers.DecisionTree_best_estimator(X_train, y_train)
    mlp_clf = Classifiers.MLPClassifier_estimator(X_train, y_train)

    # Time only the prediction phase of the four base models.
    time_start = time.time()
    predictions = [list(model.predict(X_test))
                   for model in (svm_clf, RF_clf, DT_clf, mlp_clf)]
    time_end = time.time()
    time_pre = time_end - time_start

    hits = 0.0
    for i in range(len(predictions[0])):
        votes = {label: 0 for label in (1, 2, 3, 4, 5, 6)}
        for pred in predictions:
            votes[pred[i]] += 1
        # Majority label; ties resolve to the lowest label (dict order), as before.
        if max(votes, key=votes.get) == y_test[i]:
            hits += 1
    acc = hits / len(y_test)
    return time_pre, acc
def Cluster():
    # Group Excel pattern matrices by structural keys and persist each group
    # to the "pattern"/"set1" DB collection.  (Python 2 code; relies on
    # module-level `path`, `group`, `db`, `ListObjects`, `pd`.)
    regex = re.compile("^.*.xlsx$")  # NOTE(review): the '.' before xlsx is unescaped, so it matches any character
    Lobj = ListObjects.ListObjects()
    #Group based on number of non-zero columns
    if os.path.exists(path):
        for Exfile in os.listdir(path):
            if regex.match(Exfile):
                read = pd.ExcelFile(path + "/" + Exfile)
                for sheet in read.sheet_names:
                    dframe = read.parse(sheet, header=None)
                    dframe = dframe.as_matrix()  # deprecated pandas API (pre-0.23 era)
                    key = group.GetNonZeroColumn(dframe, Exfile, sheet)
                    if key != None:
                        group.ClassifyPattern(key, dframe, Lobj)
    else:
        print "Path " + path + " Doesn't exist"
    #Group Based on weight
    group.SubClassify("weight", Lobj)
    #Not worth classifying patterns on rows, as there are cases which gets
    #are affected due to internal column rotation
    #group.SubClassify("row",Lobj)
    group.SubClassify("zeroweight", Lobj)
    keys = Lobj.PatternObjectList.keys()
    #group.ExcelWrite(Lobj)
    DB = db.Write2Db("pattern", "set1")
    for key in keys:
        # Key format appears to be "<column>_<weight>_<zeroweight>" — confirm
        # against group.GetNonZeroColumn / SubClassify.
        (column, weight, zeroweight) = key.split("_")
        matrices = Lobj.PatternObjectList[key]
        DB.Insert(key, column, weight, zeroweight, matrices)
    return
def real_vs_fake_flixster():
    """Fit and report a logistic regression separating real vs. fake Flixster profiles."""
    X, T = RFData.load_real_fake_data_flixster(file_index=4)
    Classifiers.log_reg(X, T)
def real_vs_fake_libimseti():
    """Fit and report a logistic regression separating real vs. fake LibimSeTi profiles.

    Loads the pre-built feature matrix X and target vector T for file
    index 11 and trains/evaluates on the full set.
    """
    # Removed unused local import `LibimSeTiData as LD` (never referenced).
    X, T = RFData.load_real_fake_data_libimseti(file_index=11)
    Classifiers.log_reg(X, T)
def real_vs_fake(): X, T = RFData.load_real_fake_data_ML_1m(file_index=49) #X, T = RFData.load_real_fake_data_ML_100k() #print(type(Y[0])) # Classifiers.log_reg(X, Y) X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))] X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):] Classifiers.log_reg(X_train, T_train) from sklearn.model_selection import StratifiedKFold from sklearn.linear_model import LogisticRegression cv = StratifiedKFold(n_splits=10) coefs = [] avg_coefs = np.zeros(shape=(len(X_train[1]),)) random_state = np.random.RandomState(0) for train, test in cv.split(X_train, T_train): x, t = X_train[train], T_train[train] model = LogisticRegression(penalty='l2', random_state=random_state) model.fit(x, t) # rank the coefs: ranks = ss.rankdata(model.coef_[0]) coefs.append(ranks) # print(len(model.coef_[0]),len(X_train[0])) avg_coefs += model.coef_[0] coefs = np.average(coefs, axis=0) coefs = [[coefs[i], i, avg_coefs[i]] for i in range(len(coefs))] coefs = np.asarray(list(sorted(coefs))) values = coefs[:, 2] index_zero = np.where(values == np.min(np.abs(values))) top_male = index_zero[0][0] top_female = index_zero[0][-1] L_m = coefs[:top_male, 1] R_m = 3952 - coefs[:top_male, 0] C_m = np.abs(coefs[:top_male, 2]) L_f = coefs[coefs.shape[0] - top_female:, 1] L_f = list(reversed(L_f)) R_f = coefs[coefs.shape[0] - top_female:, 0] R_f = list(reversed(R_f)) C_f = coefs[coefs.shape[0] - top_female:, 2] C_f = list(reversed(np.abs(C_f))) id_index, index_id = MD.load_movie_id_index_dict() movies = [] with open("ml-1m/movies.dat", 'r') as f: for line in f.readlines(): movies.append(line.replace("\n", "")) for index, val in enumerate(L_m[0:min(10,len(L_m))]): print(index, movies[id_index[int(val)+1]], C_m[index]) for index, val in enumerate(L_f[0:min(10,len(L_f))]): print(index, movies[id_index[int(val)+1]], C_f[index])
def ExtraTrees_LBGLCM(self):
    """Train an Extra-Trees classifier on LBGLCM features and record results."""
    global accuracies, all_classifiers, labels
    features = self.compute_LBGLCM()
    tree_count = int(self.NotreesXtra.text())
    feature_mode = self.FeaturesXtra.currentText()
    # Classifiers.Xtra returns (classifier, x_test, y_test, label mapping).
    clf, x_test, y_test, label_map = Classifiers.Xtra(features, tree_count,
                                                      feature_mode)
    predictions = Classifiers.pred(clf, x_test)
    test_accuracy = Classifiers.display_results(predictions, y_test)
    all_classifiers.append(clf)
    accuracies.append(test_accuracy)
    labels.append(label_map)
def RandomTrees_LBGLCM(self):
    """Train a Random-Forest classifier on LBGLCM features and record results.

    Appends the trained classifier, its test accuracy and its label mapping
    to the module-level result lists.
    """
    global accuracies, all_classifiers, labels
    lbglcm_feat = self.compute_LBGLCM()
    # Cast the widget text to int, matching the other training handlers —
    # sklearn's n_estimators expects an integer, not a string.
    n_trees = int(self.notreesRF.text())
    max_feats = self.FeaturesRF.currentText()
    clf, x_rf2, y_rf2, dict2 = Classifiers.RF_train(
        lbglcm_feat, n_trees, max_feats
    )  #Collecting the trained classifier, x_test, y_test and labels
    Y_pred_rf2 = Classifiers.pred(clf, x_rf2)
    acc_test = Classifiers.display_results(Y_pred_rf2, y_rf2)
    all_classifiers.append(clf)
    accuracies.append(acc_test)
    labels.append(dict2)
def GB_LBGLCM(self):
    """Train a Gradient-Boosting classifier on LBGLCM features and record results.

    Appends the trained classifier, its test accuracy and its label mapping
    to the module-level result lists.
    """
    # Declare `labels` too, matching the sibling handlers that append to it.
    global accuracies, all_classifiers, labels
    lbglcm_feat = self.compute_LBGLCM()
    n_est = int(self.Estimators_gb.text())
    max_feats = self.Features_gb.currentText()
    Lrate = float(self.lineEdit_4.text())
    clf, x_g2, y_g2, dict6 = Classifiers.GB(
        lbglcm_feat, n_est, max_feats, Lrate
    )  #Collecting the trained classifier, x_test, y_test and labels
    Y_pred_g2 = Classifiers.pred(clf, x_g2)
    acc_test = Classifiers.display_results(Y_pred_g2, y_g2)
    all_classifiers.append(clf)
    accuracies.append(acc_test)
    labels.append(dict6)
def RandomTrees_GLCM(self):
    """Train a Random-Forest classifier on GLCM features and record results.

    Appends the trained classifier, its test accuracy and its label mapping
    to the module-level result lists.
    """
    global accuracies, all_classifiers, labels
    glcm_feat = self.compute_GLCM()
    # Cast the widget text to int, matching the other training handlers —
    # sklearn's n_estimators expects an integer, not a string.
    n_trees = int(self.notreesRF.text())
    max_feats = self.FeaturesRF.currentText()
    clf, x_rf1, y_rf1, dict1 = Classifiers.RF_train(
        glcm_feat, n_trees, max_feats
    )  #Collecting the trained classifier, x_test, y_test and labels
    Y_pred_rf1 = Classifiers.pred(clf, x_rf1)  #Predicting the x_test labels
    acc_test = Classifiers.display_results(Y_pred_rf1, y_rf1)  #accuracy of prediction
    all_classifiers.append(clf)
    accuracies.append(acc_test)
    labels.append(dict1)
def __init__(self, num_filters):
    """Build the pre-trained CNN feature extractor and the Random-Forest head."""
    extractor = Classifiers.IMU_CNN_3D_FEATURE_EXTRACTOR(
        suffix="40Hz", num_filters=num_filters, patience=250,
        layers=3, kern_size=32, divide_kernel_size=True)
    extractor.loadBestWeights()
    self.featExtractor = extractor
    # Hyper-parameters taken from a prior grid search:
    # {'bootstrap': False, 'max_depth': 142, 'max_features': 'auto',
    #  'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 60}
    rf_params = dict(n_estimators=60, min_samples_split=2, min_samples_leaf=2,
                     max_features='auto', max_depth=142, bootstrap=False)
    self.clf_RF = RandomForestClassifier(**rf_params)
def Analyze_SubTopic_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    grid_search reveals little variation. Linear 100 or linear 10 seem best, but not a huge effect
    Number of cases doesn't matter for MNB because most are small sample sizes (largest is 6,000).
    '''
    # Cross-validated prediction from per-category topic scores, pooled over
    # all categories, z-standardized within category, written to `outfile`.
    # (Python 2 code: print statements, dict.iteritems.)
    print 'SUBTOPIC ANALYSIS'
    #num=1000
    #metadata=ImportMeta(-1)
    path = 'Twitter/Data/'
    PREDS = {}
    # One pass per category found in the metadata (column 1).
    for cat in set([line[1] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        if cat == 'Student' or cat == 'indUnk':
            # Large categories get fixed, lighter CV settings.
            args = {'n_iter': 20, 'test_size': .9, 'random_state': 0}
        else:
            args = CVargs.copy()
        print 'RUNNINING ', cat, ' SUBTOPIC SCORES'
        f = 'Twitter_' + cat + '_Topic_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        vec = np.array([[float(l) for l in line[1:]] for line in data])  #exclude cases where sex is unknown
        labels = np.array([metadata[line[0]][0] for line in data])  # if 'age' not in line])
        IDX = np.array([line[0] for line in data])
        vec, labels, IDX = balance(vec, labels, IDX, bidx)
        Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **args)
        print 'standardizing scores'
        preds = {}
        # Mean score per id; +/-inf entries are dropped, then the surviving
        # extreme is appended once so it still pulls the mean.
        for k, score in Preds.iteritems():
            if np.inf in score:
                original = len(score)
                x = list(np.array(score)[np.logical_not(np.isinf(score))])
                try:
                    max(x)
                except:
                    # all entries were inf: skip this id entirely
                    continue
                x.append(max(x))
                preds[k] = np.mean(x)
            elif -1 * np.inf in score:
                original = len(score)
                x = list(np.array(score)[np.logical_not(np.isinf(score))])
                try:
                    min(x)
                except:
                    continue
                x.append(min(x))
                preds[k] = np.mean(x)
            else:
                preds[k] = np.mean(score)
        # z-standardize within this category before pooling.
        m = np.mean(preds.values())
        sd = np.std(preds.values())
        for k, score in preds.iteritems():
            preds[k] = (score - m) / sd
        PREDS.update(preds)
    Write_Scores(PREDS, ['id', 'subtopic_score'], outfile)
    return
def CNN(self):
    """Train the CNN on the configured dataset and record its test accuracy."""
    global accuracies, all_classifiers
    n_epochs = int(self.epochs.text())
    data_dir = self.FileLocation.text()
    split_fraction = float(self.validation_split.text())
    # Classifiers.CNN returns (accuracy sequence, trained model, validation generator).
    accuracy, clf, val_datagen = Classifiers.CNN(data_dir, n_epochs,
                                                 split_fraction)
    accuracies.append(accuracy[0])
    all_classifiers.append(clf)
def Analyze_KBest_Scores(metadata, bidx, cl, outfile, **CVargs): filename = 'KBest' vec, ids, words = importArray(filename) labels = np.array([metadata[idx][0] for idx in ids]) # if 'age' not in line]) IDX = np.array(ids) # #filename='Twitter/Data/Twitter_KBest_Scores.csv' #data=ImportCSVFeatureData(filename,-1) #print 'drawing samples' #vec=np.array([[float(l) for l in line[1:]] for line in data]) #exclude cases where sex is unknown #labels=np.array([metadata[line[0]][0] for line in data])# if 'age' not in line]) #IDX=np.array([line[0] for line in data]) vec, labels, IDX = balance(vec, labels, IDX, bidx) print 'drawing samples' labels = np.array([metadata[idx][0] for idx in ids]) # if 'age' not in line]) IDX = np.array(ids) vec, labels, IDX = balance(vec, labels, IDX, bidx) Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs) print 'standardizing scores' preds = {} for k, score in Preds.iteritems(): if np.inf in score: original = len(score) x = list(np.array(score)[np.logical_not(np.isinf(score))]) try: max(x) except: continue x.append(max(x)) preds[k] = np.mean(x) elif -1 * np.inf in score: original = len(score) x = list(np.array(score)[np.logical_not(np.isinf(score))]) try: min(x) except: continue x.append(min(x)) preds[k] = np.mean(x) else: preds[k] = np.mean(score) m = np.mean(preds.values()) sd = np.std(preds.values()) for k, score in preds.iteritems(): preds[k] = (score - m) / sd #fname='Brown/Results/Raw_Preds.csv' Write_Scores(preds, ['id', 'kbest_score'], outfile) return
def main(): u_data, v_data, fg_imgs, original_imgs, abnormal_fg_imgs = load_data( ) #load data from ref_data weight = Weight_matrix().get_weight_matrix( ) #use normalization method for feature correction thisFeatureExtractor = Feature_extractor(original_imgs, fg_imgs, abnormal_fg_imgs, u_data, v_data, weight) train_data, train_labels = thisFeatureExtractor.get_features_and_labels( 80, 140) #training frames ########################## To see the train data features distribution, uncomment next line################################## #uvPlot(train_data[:,0],train_data[:,1],train_labels,False) ############################################################################################################################# classifiers = Classifiers(train_data, train_labels) test_data, test_labels = thisFeatureExtractor.get_features_and_labels( 140, 199) #testing frames for name, model in classifiers.models.items(): #get each classifier for ind, original_img in enumerate( original_imgs[:-1]): #get each frame pos, thisImg, _, _ = thisFeatureExtractor.getPosition( fg_imgs, ind) #get the position of each person in this frame features, _ = thisFeatureExtractor.get_features_and_labels( ind, ind + 1, False) #get the features for each person in the frame labels = classifiers.models[name].predict(features) #predict label plot(pos, labels, thisImg, name) #show classifiers.prediction_metrics( test_data, test_labels, name) #metrics for each classifier based on the test data
def quickDemo(trials=5000):
    """Benchmark SVM, k-NN and MLP training time on a random subsample.

    trials: number of rows to sample from the training data (default 5000,
    preserving the original hard-coded value).  Prints elapsed time / 5 for
    each classifier.
    """
    X, y = cl.makeData(train)
    # NOTE(review): 39739 is assumed to be the dataset size — confirm.
    index = rn.sample(range(0, 39739), trials)
    smallX = np.empty((trials, len(X[0])))
    smally = np.empty(trials, dtype='|S30')
    # enumerate replaces the original hand-rolled counter.
    for count, i in enumerate(index):
        smallX[count] = X[i]
        smally[count] = y[i]
    start = time.time()
    cl.SVM(smallX, smally)
    print((time.time() - start) / 5)
    start = time.time()
    cl.NearestNeighbor(smallX, smally)
    print((time.time() - start) / 5)
    start = time.time()
    cl.MLP(smallX, smally)
    print((time.time() - start) / 5)
def Analyze_Nonword_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    check rows 1535 and 15349 for inf data. Should no longer have to recode 8 and 12 (herndanV and LnM)
    '''
    # Cross-validated prediction from non-word stylometric features,
    # z-standardized and written to `outfile`.  (Python 2 code.)
    print 'NONWORD ANALYSIS'
    #metadata=ImportMeta(-1)
    filename = 'Twitter/Data/Twitter_Nonword_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]] for line in data])  #exclude cases where sex is unknown
    #vec[:,8]=vec[:,8]*-1 #herndanV is always neg (changed in Make_Twitter_Data now)
    #vec[:,12]=vec[:,12]*-1 #LnM is always neg (changed in Make_Twitter_Data now)
    labels = np.array([metadata[line[0]][0] for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    # Mean score per id; +/-inf entries are dropped, then the surviving
    # extreme is appended once so it still pulls the mean.
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                # all entries were inf: skip this id entirely
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    # z-standardize across ids.
    m = np.mean(preds.values())
    sd = np.std(preds.values())
    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Nonwords_Preds.csv'
    Write_Scores(preds, ['id', 'nonword_score'], outfile)
    return
def dataVolume_runtime(dataVolume, no_rand_rate=2):
    """Mean cross-validated accuracy of five classifiers plus the voting ensemble.

    Returns (DT, kNN, RF, SVM, MLP mean CV scores, vote accuracy, vote time).
    """
    # Features: weekday(2) weather(2) time(6) heading(4) direction to BS(6) previous BS(6)
    data = createData.createData(dataVolume)
    x, y = data[:, :-1], data[:, -1]
    # Inject label noise wherever the label is divisible by no_rand_rate.
    for idx in range(len(y)):
        if y[idx] % no_rand_rate == 0:
            y[idx] = random.randint(1, 6)
    print('data set done!')
    # Fit each tuned estimator and cross-validate it on the full data.
    DTScore = cross_val_score(clf.DecisionTree_best_estimator(x, y), x, y)
    k_Neighbors_score = cross_val_score(clf.k_neighbors_best_estimator(x, y), x, y)
    rf_score = cross_val_score(clf.RandomForest_best_estimator(x, y), x, y)
    svm_score = cross_val_score(clf.svm_estimator(x, y), x, y)
    nn_score = cross_val_score(clf.MLPClassifier_estimator(x, y), x, y, cv=6)
    # Voting ensemble on a doubled data volume (it holds out half internally).
    vote_time, vote_acc = vote.clf_vote(dataVolume * 2, no_rand_rate=no_rand_rate)

    def mean_of(scores):
        # Plain-float mean, matching the original float(sum)/len form.
        return float(sum(scores)) / len(scores)

    return (mean_of(DTScore), mean_of(k_Neighbors_score), mean_of(rf_score),
            mean_of(svm_score), mean_of(nn_score), vote_acc, vote_time)
def Analyze_Raw(metadata, bidx, cl, outfile, **CVargs):
    '''
    mnb max's out at 69/70% accurate at 3,000 (or 600 training) texts. Does not increase in accuracy after that.
    svm: grid search showed ideal is linear kernal with C=1,10, or 100; also max's out at 74% accurate for 3,000 (goes to 76 at 10,000)
    '''
    # Cross-validated prediction from the raw feature array, z-standardized
    # and written to `outfile`.  (Python 2 code.)
    print 'running Raw analysis'
    #metadata=ImportMeta(-1)
    filename = 'Raw'
    vec, ids, words = importArray(filename)
    print 'drawing samples'
    #vec=data[0:,1:] #grab all but zeroth column
    #labels=data[0:,0] #grab all of zeroth column
    labels = np.array([metadata[idx][0] for idx in ids])  # if 'age' not in line])
    IDX = np.array(ids)
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    # Mean score per id; +/-inf entries are dropped, then the surviving
    # extreme is appended once so it still pulls the mean.
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                # all entries were inf: skip this id entirely
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    # z-standardize across ids.
    m = np.mean(preds.values())
    sd = np.std(preds.values())
    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Raw_Preds.csv'
    Write_Scores(preds, ['id', 'raw_score'], outfile)
    return
def Analyze_Raw_Topic_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    grid_search shows C>=1 is ideal. remains 71% from 500 through 7000
    remains at 71% at sample sizes from 500 through 10000.
    '''
    # Cross-validated prediction from raw topic scores, z-standardized and
    # written to `outfile`.  (Python 2 code.)
    print 'RAW TOPIC ANALYSIS'
    #metadata=ImportMeta(-1)
    filename = 'Twitter/Data/Raw_Topic_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]] for line in data])  #exclude cases where sex is unknown
    labels = np.array([metadata[line[0]][0] for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    # Mean score per id; +/-inf entries are dropped, then the surviving
    # extreme is appended once so it still pulls the mean.
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                # all entries were inf: skip this id entirely
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    # z-standardize across ids.
    m = np.mean(preds.values())
    sd = np.std(preds.values())
    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Raw_Topic_Preds.csv'
    Write_Scores(preds, ['id', 'rawTopic_score'], outfile)
    return
def classify(selected_classifier, directory_of_image, trained_classifiers, labels):
    """Classify one image with the selected trained model.

    selected_classifier: display name of the pipeline to use.
    directory_of_image: path to the image file.
    trained_classifiers: list of trained models, indexed per pipeline.
    labels: per-pipeline mapping from predicted class index to class name.
    Returns the predicted class name, or None for an unknown selection.
    """
    # The six feature-based pipelines share identical logic; dispatch on the
    # index into trained_classifiers/labels instead of six copied branches.
    # (Keys reproduce the original strings exactly, including spacing.)
    feature_models = {
        'GLCM+Random Forest': 0,
        "LBGLCM + Random Forest": 1,
        "GLCM + Extra Trees Classifier": 2,
        "LBGLCM + Extra Trees Classifier": 3,
        "GLCM + Gradient Boosting Classifier": 4,
        "LBGLCM + Gradient Boosting Classifier": 5,
    }
    if selected_classifier in feature_models:
        idx = feature_models[selected_classifier]
        feat = extract(selected_classifier, directory_of_image)
        Ans = Classifiers.pred(trained_classifiers[idx], feat)
        return labels[idx][Ans[0]]
    if selected_classifier == 'Convolutional Neural Networks':
        # CNN path: load, scale to [0,1], add the batch dimension.
        test_image = image.load_img(directory_of_image, target_size=(64, 64))
        test_image = image.img_to_array(test_image)
        test_image = np.expand_dims(test_image, axis=0)
        test_image /= 255.
        Ans = Classifiers.pred(trained_classifiers[6], test_image)
        class_names = {0: 'Crazing', 1: 'Inclusion', 2: 'Patches',
                       3: 'Pitted Surface', 4: 'RS', 5: 'Scratch'}
        # Softmax output -> highest-probability class name.
        return class_names[np.argmax(Ans[0])]
def Analyze_Individual(metadata, bidx, cl, outfile, **CVargs):
    '''
    grid search shows C>=1 is optimal
    accuracy is unrelated to sample size (remains 84-89% throughout)
    '''
    # Cross-validated prediction from individual-level features, excluding
    # rows whose second column equals 1.0, z-standardized and written to
    # `outfile`.  (Python 2 code.)
    print 'INDIVIDUAL ANALYSIS'
    #metadata=ImportMeta(-1)
    filename = 'Twitter/Data/Twitter_Individual_Scores.txt'
    data = ImportFeatureData(filename, -1)
    vec = np.array([line[2:] for line in data if line[1] != 1.0])  #exclude cases where sex is never mentioned
    labels = np.array([metadata[line[0]][0] for line in data if line[1] != 1.0])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data if line[1] != 1.0])
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    # Mean score per id; +/-inf entries are dropped, then the surviving
    # extreme is appended once so it still pulls the mean.
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                # all entries were inf: skip this id entirely
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    # z-standardize across ids.
    m = np.mean(preds.values())
    sd = np.std(preds.values())
    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Individual_Preds.csv'
    Write_Scores(preds, ['id', 'indiv_score'], outfile)
    return
def worker(positives, negatives, classifiersToUse, feats, outFile, i, return_dict):
    """thread worker function"""
    # runClassifiers signature: positives, negatives, featuresToUse,
    # whereToPrint, verbose, classifiersToUse
    results = Classifiers.runClassifiers(positives, negatives, feats,
                                         "output.txt", False, classifiersToUse)
    print("done ", i)
    metric_keys = ("accuracy", "pos_precision", "pos_recall", "pos_f1",
                   "neg_precision", "neg_recall", "neg_f1")
    for result in results:
        # Metrics occupy positions 1..7 of each result row.
        for key, value in zip(metric_keys, result[1:8]):
            return_dict[key] += value
    return
def Analyze_LIWC(metadata, bidx, cl, outfile, **CVargs):
    # Cross-validated prediction from LIWC category scores, z-standardized
    # and written to `outfile`.  (Python 2 code.)
    filename = 'Twitter/Data/Twitter_LIWC_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]] for line in data])  #exclude cases where sex is unknown
    labels = np.array([metadata[line[0]][0] for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    preds = {}
    # Mean score per id; +/-inf entries are dropped, then the surviving
    # extreme is appended once so it still pulls the mean.
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                # all entries were inf: skip this id entirely
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    # z-standardize across ids.
    m = np.mean(preds.values())
    sd = np.std(preds.values())
    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='LIWC_Preds.csv'
    Write_Scores(preds, ['id', 'liwc_score'], outfile)
    return
def main(self):
    # Run preprocessing, then the configured classification model.
    # NOTE(review): nesting reconstructed from a collapsed source line —
    # train/test are assumed to run whenever the model is logisticRegression,
    # with the date split applied first only if splitByDate is set; confirm.
    pre = Preprocessing.Preprocessing(self.features_to_consider)
    if self.normalise:
        pre.scale()  # TODO different scaling options
    if self.classification_model == 'logisticRegression':
        logReg_predict = Classifiers.Classifiers(pre.features_dataframe, self.features_to_consider)
        if self.splitByDate:
            logReg_predict.splitDataset(self.trainingDateStart, self.trainingDateEnd, self.testDateStart, self.testDateEnd)
        logReg_predict.LogRegTrainModel()
        logReg_predict.LogRegTestModel()
    else:
        print("other model")
def elim(df):
    """Drop rows whose (lat, long) pair reappears later in the frame.

    Keeps only the LAST occurrence of each duplicate location (matching the
    original behavior, which marked earlier duplicates for removal).
    Returns the reduced DataFrame selected positionally via .iloc.
    """
    latlong, _ = Classifiers.cut_loca_price(df)
    n = len(latlong)
    # keep[j] becomes False when some later row i (> j) shares j's coordinates.
    # (The original built ones, inverted them, then re-inverted at the end —
    # this is the same logic without the double negation.)
    keep = np.ones(n, dtype=bool)
    for i in range(n):
        for j in range(i):
            # Short-circuit on already-eliminated rows to avoid re-comparison.
            if keep[j] and np.array_equal(latlong[i], latlong[j]):
                keep[j] = False
    return df.iloc[np.where(keep)[0]]
def evaluate(self):
    """Evaluate the configured classifier with stratified k-fold cross-validation.

    Returns an EvaluationMetrics computed over the pooled out-of-fold
    predictions from all folds.
    """
    n_folds = 5
    splitter = StratifiedKFold(n_folds)
    print("Performing %s-fold crossvalidation" % n_folds)
    real_labels = []
    predicted_labels = []
    for train_idx, test_idx in splitter.split(self.feature_vector,
                                              self.target_vector):
        train_X = self.feature_vector[train_idx]
        test_X = self.feature_vector[test_idx]
        if self.type_dimensionality_reduction:
            # Fit the reduction on the training fold only, apply to both.
            mapping = DimensionalityReduction.reduce_dimensionality(
                self.feature_vector[train_idx], self.target_vector[train_idx],
                self.type_dimensionality_reduction)
            train_X = mapping.transform(train_X)
            test_X = mapping.transform(test_X)
        model = Classifiers.train(self.classifier_name, train_X,
                                  self.target_vector[train_idx],
                                  self.classifier_variables)
        real_labels.extend(self.target_vector[test_idx])
        predicted_labels.extend(model.predict(test_X))
    resulting_metrics = EvaluationMetrics(real_labels, predicted_labels,
                                          self.labels_order)
    print("\t\tDone!!")
    return resulting_metrics
def main():
    """Run the saved models over the '200' feature set and dump per-model test times."""
    directory = '/xxxx/features'
    # Only the directory literally named '200' survives this filter.
    feature_names = sorted(
        (name for name in os.listdir(directory) if name == '200'),
        reverse=True)
    for feature_name in feature_names:
        # K=dictKvalue[feature_name]
        K = 40
        data_dir = os.path.join(directory, feature_name)
        dataset = load_dataset('sigactcuboid', data_dir)
        classifiers = Classifiers(dataset.get_data(), dataset.feature_name,
                                  dataset.model_dir, K)
        for name in classifiers.models:  # one pass per classifier
            model_directory = os.path.join(dataset.model_dir)
            model_path = os.path.join(
                model_directory,
                name + '_' + 'train' + '_' + feature_name + '.pkl')
            saved_model = load_model(model_path)
            labels, scores, test_time = saved_model.predict()
            timing_path = os.path.join(
                model_directory,
                name + '_' + 'test' + '_' + feature_name + '.json')
            times2json(test_time, timing_path)
# Load the covertype dataset once, binarize labels against the largest class,
# and benchmark several classifiers on a 5000-sample training split.
# (Bug fix: the original opened and read 'covtype.data' twice and never
# closed either handle.)
with open('covtype.data') as _covtype_file:
    _rows = _covtype_file.read().splitlines()[:SIZE_DATA]
X = [list(map(int, row.split(',')[:-1])) for row in _rows]
_Y = [row.split(',')[-1] for row in _rows]
larg = largestClass(_Y)
# treat the largest class as positive, the rest as negative
Y = [1 if x == larg else -1 for x in _Y]
xTrain, xTest, yTrain, yTest = cv.train_test_split(X, Y, train_size=5000 / len(X))

# In[2]:

import Classifiers as clfs
clfs.KNN(xTrain, xTest, yTrain, yTest)
clfs.RandomForest(xTrain, xTest, yTrain, yTest)
clfs.BoostedDecisionTree(xTrain, xTest, yTrain, yTest)
clfs.NeuralNets(xTrain, xTest, yTrain, yTest)
#clfs.SVM(xTrain, xTest, yTrain, yTest)
clfs.linearSVC(xTrain, xTest, yTrain, yTest)

import Classifiers as clfs
clfs.XGBoost(xTrain, xTest, yTrain, yTest)

# In[ ]:
def parse_doc(self, doc, doc_id, connClassifier, argPosClassifier, senseClassifier, argClassifier, implicitsenseClassifier, PSarg1Classifier):
    # Builds CoNLL-style discourse relations for one parsed document: first
    # explicit relations (connective detection + argument extraction), then
    # implicit relations between adjacent sentence pairs not covered by an
    # explicit relation.  (Python 2 code.)
    #
    # NOTE(review): `3_ArgExtractor` and `kong-finalPSArg2Extractor` are not
    # valid Python identifiers — the module names were mangled somewhere, and
    # `PSarg2classifier` / `model` are undefined in this scope.  This function
    # cannot run as written; recover the real names from the original project.
    # NOTE(review): nesting reconstructed from a collapsed source line —
    # placement of `covered.add(i)`, `uncovered` computation and the
    # `token_id += len_sentence1` update is a best-effort guess; confirm.
    store = 0
    output = []
    num_sentences = len(doc['sentences'])
    token_id = 0            # running token offset over the whole document
    token_id_sentence = 0   # token offset at the start of the current sentence
    for i in range(num_sentences):
        # NOTE(review): total/covered/uncovered reset every iteration, so only
        # the final sentence's `covered` set survives the loop — looks buggy.
        total = set(range(num_sentences))
        covered = set()     # sentence indices heading an explicit relation
        uncovered = set()
        sentence1 = doc['sentences'][i]
        len_sentence1 = len(sentence1['words'])
        j = 0
        while j < len_sentence1:
            # Is the token span starting at j a discourse connective?
            wordString, connLabel, skip = Classifiers.classifyConnective(sentence1, j, connClassifier)
            if connLabel == 'N' or connLabel == 'False':
                token_id += skip + 1
                j += skip + 1
                continue
            # Argument position (SS = same sentence, PS = previous sentence),
            # sense, and in-sentence argument token spans.
            argPosLabel, senseLabel, arg1List, arg2List = Classifiers.classifyOther(sentence1, wordString, j, skip, argPosClassifier, senseClassifier, argClassifier)
            #print doc_id
            if (argPosLabel == 'PS' and i == 0):
                # Previous-sentence argument but no previous sentence: skip.
                token_id += skip + 1
                j += skip + 1
                continue
            try:
                sentence2 = doc['sentences'][i - 1]
                len_sentence2 = len(sentence2['words'])
                words = sentence2['words']
            except IndexError:
                store = i
            covered.add(i)
            relation = {}
            relation['DocID'] = doc_id
            relation['Connective'] = {}
            relation['Arg1'] = {}
            relation['Arg2'] = {}
            relation['Connective']['TokenList'] = range(token_id, token_id + skip + 1)
            relation['Type'] = 'Explicit'
            if argPosLabel == 'PS':
                #relation['Arg1']['TokenList'] = range((token_id_sentence - len_sentence2), token_id_sentence - 1)
                #relation['Arg2']['TokenList'] = range(token_id_sentence, (token_id_sentence + len_sentence1) - 1)
                arg1List = 3_ArgExtractor.arg(doc['sentences'][i - 1]['parsetree'], wordString, PSarg1Classifier, doc['sentences'][i - 1]['words'])
                relation['Arg1']['TokenList'] = [i + token_id - j for i in arg1List]
                #l = list(set(range(token_id-j, token_id -j + len_sentence1-1))-set([token_id]))
                #l.sort()
                #relation['Arg2']['TokenList'] =l
                relation['Arg2']['TokenList'] = kong - finalPSArg2Extractor.argsExtract(PSarg2classifier, doc['sentences'][i - 1]['parsetree'], relation['Connective']['TokenList'])
            elif argPosLabel == 'SS':
                covered.add(i)
                relation['Arg1']['TokenList'] = [i + token_id - j for i in arg1List]
                relation['Arg2']['TokenList'] = [i + token_id - j for i in arg2List]
            relation['Sense'] = [senseLabel]
            output.append(relation)
            token_id += skip
            token_id += 1
            j += skip + 1
        token_id_sentence += len_sentence1
    # Sentences with no explicit relation get implicit relations with their
    # following sentence.
    uncovered = list(total - covered)
    uncovered.sort()
    token_id = 0
    featureSet = []
    for i in range(num_sentences - 1):
        if i in uncovered:
            sentence1 = doc['sentences'][i]
            len_sentence1 = len(sentence1['words'])
            token_id += len_sentence1
            sentence2 = doc['sentences'][i + 1]
            len_sentence2 = len(sentence2['words'])
            relation = {}
            relation['Type'] = 'Implicit'
            relation['DocID'] = doc_id
            relation['Arg1'] = {}
            relation['Arg1']['TokenList'] = range((token_id - len_sentence1), token_id - 1)
            relation['Arg2'] = {}
            relation['Arg2']['TokenList'] = range(token_id, (token_id + len_sentence2) - 1)
            print sentence1, sentence2, doc_id
            feature = ImplicitClassifier2.extractFeatures(sentence1, sentence2, doc_id, model)
            featureSet.append(feature)
            senseType = implicitsenseClassifier.classify(feature)
            print senseType
            senseType = unicode(senseType)
            relation['Sense'] = [senseType]
            relation['Connective'] = {}
            relation['Connective']['TokenList'] = []
            output.append(relation)
    # NOTE(review): pickle file handle is never closed here.
    f = pickle.dump(featureSet, open('implicitFeatureSet.p', 'wb'))
    return output
# Evaluate the pickled strong (boosted) classifier on held-out face /
# non-face test images.
im = []       # raw images
iiList = []   # integral images fed to the classifier
label = []    # ground truth: 0 = non-face, 1 = face
numNot = 200  # Number of non-face images to use
numYes = 200  # Number of face images to use

# Close the pickle file deterministically (the original leaked the handle).
with open("strongClassifier.pkl", 'rb') as _pkl:
    strong = pickle.load(_pkl)

filesNOT = glob.glob(
    '../faces/test/non-face/*.pgm')  # List of filenames of non-faces
filesYES = glob.glob(
    '../faces/test/face/*.pgm')  # List of filenames of faces


def _load_images(files, count, tag):
    """Read `count` images from `files`; append raw image, integral image and label `tag`."""
    for idx in range(count):
        img = cv2.imread(files[idx], -1)  # Read image (flags unchanged)
        im.append(img)
        iiList.append(Classifiers.getIntegralImage(img))  # integral image
        label.append(tag)


_load_images(filesNOT, numNot, 0)
print('Loaded not images')
_load_images(filesYES, numYes, 1)
print('Loaded is images')

label = np.array(label)  # Convert list to 1D array
print(strong.getPerformance(iiList, label))
predict = strong.predict(iiList)
# Split the tab-separated training file into an 80% train / 20% test
# partition, tokenize both halves, and binarize the sentiment labels.
# (Relies on outer-scope names: TRAINING_SET_FOLDER, filename, data, c.)
with open(os.path.join(TRAINING_SET_FOLDER, filename), 'r') as f:
    for line in f:
        line2 = line.strip().split('\t')
        if len(line2) == 2:  # keep only well-formed "<label>\t<text>" rows
            data.append(line2)
#print(len(data))
#data = data[:500]
N = len(data)
limit = (4 * N) // 5  # boundary between training and test indices
labels_training = [line[0] for line in data[:limit]]
X_training = [line[1] for line in data[:limit]]
X_token_tr = c.tokenize(X_training)
# Binary target: 1 for 'Positive', 0 for anything else.
labels_bin_tr = [
    1 if label in ['Positive'] else 0 for label in labels_training
]
labels_bin_tr = np.array(labels_bin_tr)
# NOTE(review): despite the "_learning" names, this tail slice is the
# held-out test split (the *_te suffixes below confirm).
labels_learning = [line[0] for line in data[limit:]]
X_learning = [line[1] for line in data[limit:]]
X_token_te = c.tokenize(X_learning)
labels_bin_te = [
    1 if label in ['Positive'] else 0 for label in labels_learning
]
labels_bin_te = np.array(labels_bin_te)
print("Proportion de revues positives:", np.mean(labels_bin_te))