max_features="auto", random_state=0, n_jobs=4), "AdaBoost": AdaBoostClassifier(n_estimators=500, random_state=0), "GradientBoost": GradientBoostingClassifier(n_estimators=500, learning_rate=1.0, max_depth=None, random_state=0), "NaiveBayes": GaussianNB(), "LDA": LDA(), "QDA": QDA() } ############################# Main: Run Different Classifiers ################################ data_dir = '/fraud_model/Data/Model_Data_Signal_Tmx_v3wd/' result_dir = '/fraud_model/Results/Model_Results_Signal_Tmx_v3wd_tmxrc_ind/' good_downsample_rate = 0.3 #used to scale back hit rate for job in joblist: print job result_summary = [] result_summary.append( ['Case', 'KS'] + ['HitRate@' + str(i) + '%CatchRate' for i in range(5, 105, 5)] + [
_SUPPORTED_CLASSIFIERS = ("lda", "qda", "tree", "svm")


def _as_float_column(values, n_rows):
    """Return the first *n_rows* entries of *values* as an (n_rows, 1) float array."""
    return numpy.asarray(values, dtype=float).reshape(-1, 1)[:n_rows]


def _fit_predict(classifier, train_x, train_y, test_x):
    """Fit a fresh model of the requested kind and return its predictions for *test_x*."""
    if classifier == "lda":
        model = LDA(solver='eigen', priors=numpy.array((.5, .5)),
                    shrinkage=1.00)
    elif classifier == 'qda':
        model = QDA()
    elif classifier == 'tree':
        # NOTE(review): `tree` must be a callable returning an estimator;
        # if it is the `sklearn.tree` module this call fails -- confirm.
        model = tree()
    elif classifier == 'svm':
        model = SVC()
    else:
        raise ValueError("unsupported classifier: %r" % (classifier,))
    model.fit(train_x, train_y)
    return model.predict(test_x)


def classifierTrainTest(score, diagn, real_art, cvPartition, classifier,
                        subjIndex, preAccMatrix, preInstOrder):
    """Train and test one classifier over every fold of a cross-validation split.

    For each ``(train_indices, test_indices)`` pair yielded by *cvPartition*
    the rows listed in ``test_indices`` form the test set and every other row
    forms the training set (``train_indices`` itself is not consulted).  A new
    model is fitted per fold, and the fold's true labels, predicted labels and
    ``real_art`` values are written into *preAccMatrix* (three columns) while
    the matching ``subjIndex`` values are written into *preInstOrder*, both
    in place, at consecutive row offsets.

    Args:
        score: 2-D numeric array, one row per instance.
        diagn: per-instance class labels (length ``len(score)``).
        real_art: per-instance values copied through to the third column of
            the accuracy matrix.
        cvPartition: iterable of ``(train_indices, test_indices)`` pairs;
            ``test_indices`` are integer row positions.
        classifier: one of ``"lda"``, ``"qda"``, ``"tree"``, ``"svm"``.
        subjIndex: per-instance identifiers recorded in *preInstOrder*.
        preAccMatrix: pre-allocated array receiving the stacked fold results.
        preInstOrder: pre-allocated array receiving the instance identifiers.

    Returns:
        dict holding the arrays of the LAST fold processed (``cvTEST``,
        ``diagnTEST``, ``real_artTEST``, ``instIndexTEST``, ``cvTRAIN``,
        ``diagnTRAIN``, ``real_artTRAIN``, ``trueClassLabel``,
        ``predictedClassLabel``, ``idx``, ``subAccMatrix``) together with the
        filled ``preAccMatrix`` and ``preInstOrder``.

    Raises:
        ValueError: if *classifier* is not supported or *cvPartition* yields
            no folds.
    """
    # Validate up front: previously an unknown name left `label` unbound
    # (or silently reused the previous fold's predictions).
    if classifier not in _SUPPORTED_CLASSIFIERS:
        raise ValueError(
            "unsupported classifier %r; expected one of %s"
            % (classifier, ", ".join(_SUPPORTED_CLASSIFIERS)))

    n_samples = len(score)
    scores = numpy.asarray(score, dtype=float)
    diagn_col = _as_float_column(diagn, n_samples)
    real_art_col = _as_float_column(real_art, n_samples)
    subj_col = _as_float_column(subjIndex, n_samples)

    x = 0  # next free row in preAccMatrix / preInstOrder
    result = None
    for idx_train, idx_test in cvPartition:
        # Column-shaped boolean membership array, True for test rows.
        idx = numpy.zeros((n_samples, 1), dtype=bool)
        idx[numpy.asarray(idx_test, dtype=int)] = True
        is_test = idx[:, 0]
        is_train = ~is_test

        # Boolean-mask indexing copies the selected rows in ascending order.
        cvTEST = scores[is_test]
        diagnTEST = diagn_col[is_test]
        real_artTEST = real_art_col[is_test]
        instIndexTEST = subj_col[is_test]
        cvTRAIN = scores[is_train]
        diagnTRAIN = diagn_col[is_train]
        real_artTRAIN = real_art_col[is_train]

        label = _fit_predict(classifier, cvTRAIN, diagnTRAIN, cvTEST)

        trueClassLabel = diagnTEST
        predictedClassLabel = label
        subAccMatrix = numpy.column_stack(
            (trueClassLabel, predictedClassLabel, real_artTEST))

        n_test = len(subAccMatrix)
        preAccMatrix[x:x + n_test, :] = subAccMatrix
        preInstOrder[x:x + n_test] = instIndexTEST
        x += n_test

        result = {
            'cvTEST': cvTEST,
            'diagnTEST': diagnTEST,
            'real_artTEST': real_artTEST,
            'instIndexTEST': instIndexTEST,
            'cvTRAIN': cvTRAIN,
            'diagnTRAIN': diagnTRAIN,
            'real_artTRAIN': real_artTRAIN,
            'trueClassLabel': trueClassLabel,
            'predictedClassLabel': predictedClassLabel,
            'idx': idx,
            'subAccMatrix': subAccMatrix,
        }

    if result is None:
        raise ValueError("cvPartition yielded no folds")

    result['preAccMatrix'] = preAccMatrix
    result['preInstOrder'] = preInstOrder
    return result
def find_best(X_train, y_train, X_validation, y_validation):
    """Fit a fixed panel of classifiers and report the most accurate one.

    Every candidate is fitted on the training data and scored by plain
    accuracy on the validation data.  All scores are printed, followed by
    the winning estimator.  When several candidates tie, the one listed
    first wins.

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        X_validation: validation feature matrix.
        y_validation: validation labels.

    Returns:
        The fitted estimator with the highest validation accuracy.  The same
        object is also stored in the module-level global ``best_one`` so
        existing callers that read the global keep working.
    """
    global best_one

    classifiers = [
        LogisticRegression(),
        KNeighborsClassifier(3),
        KNeighborsClassifier(n_neighbors=7, weights="uniform"),
        KNeighborsClassifier(n_neighbors=10, weights="uniform"),
        KNeighborsClassifier(n_neighbors=3, weights="uniform"),
        KNeighborsClassifier(n_neighbors=7, weights="distance"),
        KNeighborsClassifier(n_neighbors=10, weights="distance"),
        KNeighborsClassifier(n_neighbors=3, weights="uniform"),
        SVC(kernel="linear", C=0.025, probability=True),
        SVC(kernel="rbf", C=10, gamma=0.01, probability=True),
        SVC(kernel="rbf", C=1, gamma=0.01, probability=True),
        SVC(gamma=2, C=1, probability=True),
        DecisionTreeClassifier(max_depth=5),
        DecisionTreeClassifier(max_depth=1, criterion='entropy'),
        DecisionTreeClassifier(max_depth=5, criterion='entropy'),
        DecisionTreeClassifier(max_depth=10, criterion='entropy'),
        DecisionTreeClassifier(max_depth=5, criterion='entropy'),
        DecisionTreeClassifier(max_depth=10, criterion='gini'),
        DecisionTreeClassifier(max_depth=5, criterion='gini'),
        DecisionTreeClassifier(max_depth=1, criterion='gini'),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        RandomForestClassifier(max_depth=5, n_estimators=30, max_features=5,
                               criterion='gini'),
        RandomForestClassifier(max_depth=5, n_estimators=20, max_features=10,
                               criterion='entropy'),
        RandomForestClassifier(max_depth=5, n_estimators=30, max_features=10,
                               criterion='gini'),
        RandomForestClassifier(max_depth=5, n_estimators=20, max_features=15,
                               criterion='entropy'),
        RandomForestClassifier(max_depth=5, n_estimators=20, max_features=10,
                               criterion='gini'),
        RandomForestClassifier(max_depth=5, n_estimators=30, max_features=15,
                               criterion='entropy'),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA(),
        QDA(reg_param=0.001),
        QDA(reg_param=0.1),
        QDA(reg_param=0.01),
        SVC(C=10, cache_size=200, class_weight=None, coef0=0.0, degree=3,
            gamma=0.0, kernel='rbf', max_iter=-1, probability=True,
            random_state=None, shrinking=True, tol=0.001, verbose=False),
    ]

    # (estimator, accuracy) pairs kept in candidate order so that both the
    # printed report and tie-breaking are deterministic.
    scored = []
    for clf in classifiers:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_validation)
        scored.append((clf, metrics.accuracy_score(y_validation, y_pred)))

    # max() returns the first maximal element, i.e. the earliest candidate
    # among ties.
    best_one = max(scored, key=lambda pair: pair[1])[0]

    print("{" + "\n".join("{}: {}".format(k, v) for k, v in scored) + "}")
    print("\n\n********THE BEST CLASSIFIER IS********\n")
    print(best_one)
    return best_one
from sklearn.qda import QDA

h = .05  # step size in the mesh

# Display labels for the estimators; listed in the same order as the
# `classifiers` list below (both have nine entries).
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]

# One estimator instance per label in `names`.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()]

# The string literal below is a disabled alternative that would generate a
# synthetic data set instead of reading the CSV files.
''' X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1) '''

# X is 1084 x 2
# y is 1084 x 1
# Both handles are left open here; x_file is not read in this part of the
# script -- presumably consumed (and closed) further down, TODO confirm.
x_file = open("X2.csv", "r")
y_file = open("y2.csv", "r")

X = []
y = []
obs = []

# Each line of y2.csv is parsed as a single float target value.
for line in y_file:
    y.append(float(line))
testFeatures = features[round(0.6 * Nsamples):, :] testLabels = labels[round(0.6 * Nsamples):, :] print(np.shape(trainFeatures), np.shape(testFeatures), np.shape(trainLabels), np.shape(testLabels)) K = np.unique(labels).size plt.clf() lineStyle = ['ob', '*g', '+c', 'xr', '>y'] for cls in range(K): idx = (labels == cls + 1) plt.plot(features[np.nonzero(idx)[0], 0], features[np.nonzero(idx)[0], 1], lineStyle[cls]) print('Discriminant analysis') model = QDA() y_pred = model.fit(trainFeatures, trainLabels[:, 0]).predict(testFeatures) y_pred = y_pred[:, np.newaxis] aux = (y_pred != testLabels) aux = np.sum(aux.astype(float), 0) misclassificationRate = aux / testLabels.size print(misclassificationRate) print('Logistic Regression') model = LogisticRegression(multi_class='multinomial', solver='newton-cg', C=100) #create extended features xtrainFeatures = np.concatenate( (trainFeatures, trainFeatures[:, 0:1] * trainFeatures[:, 1:2]), 1) xtestFeatures = np.concatenate(
def _build_feature_rows(reader, date_col, species_col, lat_col, lon_col,
                        label_col, weatherdict, spraydict):
    """Convert CSV records into feature rows plus one label per row.

    Each feature row holds, in order: the numeric species code, every value
    from index 2 onward of the weather record selected by date and nearest
    location, and a 0/1 flag telling whether ``nearspray`` reports a spray
    near the position on that date.  The first CSV line (the header) is
    skipped.

    Args:
        reader: a ``csv.reader`` over the input file.
        date_col, species_col, lat_col, lon_col: column positions of the
            date, species name, latitude and longitude fields.
        label_col: column position of the label, or ``None`` to emit the
            placeholder label ``0`` for every row.
        weatherdict: mapping ``date -> location id -> weather record``.
        spraydict: mapping ``date -> spray data`` passed to ``nearspray``.

    Returns:
        ``(rows, labels)`` -- two parallel lists.
    """
    rows = []
    labels = []
    for record in reader:
        if reader.line_num == 1:
            continue  # header line

        date = record[date_col]
        # Parsed only to fail fast on a malformed date string.
        datetime.datetime.strptime(date, "%Y-%m-%d")
        latitude = record[lat_col]
        longitude = record[lon_col]
        species = speciesdict[record[species_col]]
        labels.append(0 if label_col is None else record[label_col])

        locid = nearloc(latitude, longitude)
        weatherlist = weatherdict[date][locid]

        features = [float(species)]
        features.extend(float(w) for w in weatherlist[2:])

        sprayed = date in spraydict and nearspray(
            spraydict[date], latitude, longitude)
        features.append(1 if sprayed else 0)
        rows.append(features)
    return rows, labels


def main():
    """Build features, fit a one-class SVM, print accuracies, write a submission.

    Reads ``../input/{train,test,weather,spray}.csv``, builds one feature
    row per train/test record, drops the feature columns reported by
    ``trimfrq``, normalizes both sets with the training statistics, fits a
    ``OneClassSVM``, prints the ``get_Accs`` metrics for both sets and writes
    the test predictions to ``sampleSubmissionbyKW.csv``.
    """
    # All inputs are opened together so a missing file is reported before
    # any work is done, and every handle is closed when reading finishes.
    with open(r'../input/train.csv') as train_file, \
            open(r'../input/test.csv') as test_file, \
            open(r'../input/weather.csv') as weather_file, \
            open(r'../input/spray.csv') as spray_file:
        weatherdict = readweather(csv.reader(weather_file))
        spraydict = readspray(csv.reader(spray_file))

        # generate train and test data
        print("generate train and test data")
        trout, train_y = _build_feature_rows(
            csv.reader(train_file), date_col=0, species_col=2, lat_col=7,
            lon_col=8, label_col=11, weatherdict=weatherdict,
            spraydict=spraydict)
        # The test file has no label column; test_y is filled with zeros.
        teout, test_y = _build_feature_rows(
            csv.reader(test_file), date_col=1, species_col=3, lat_col=8,
            lon_col=9, label_col=None, weatherdict=weatherdict,
            spraydict=spraydict)

    # remove feature with no distinction and less important
    dropped = set(trimfrq(trout))
    indices = [i for i in range(len(trout[0])) if i not in dropped]
    train_x = indexTodata(trout, indices)
    test_x = indexTodata(teout, indices)

    # modeling
    print("modeling")
    train_x_nor, mean, std = normalize(train_x)
    # Test data is scaled with the statistics returned for the training set.
    test_x_nor, _, _ = normalize(test_x, mean, std)

    clf = OneClassSVM(nu=0.2, kernel="rbf", gamma=65.7933224658)
    clf.fit(train_x_nor, train_y)

    train_pdt = clf.predict(train_x_nor)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print(":")
    print("MCC, Acc_p , Acc_n, Acc_all(train): ")
    print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))

    test_pdt = clf.predict(test_x_nor)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    print("MCC, Acc_p , Acc_n, Acc_all(test): ")
    print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))
    print("")

    # predict test data (the predictions computed just above are reused)
    print("predict test data")
    # NOTE(review): the raw predictions are written as WnvPresent without
    # any mapping -- confirm this is the value range the consumer expects.
    with open('sampleSubmissionbyKW.csv', 'w') as fprt:
        fprt.write("ID,WnvPresent\n")
        for row_id, eachy in enumerate(test_pdt, 1):
            fprt.write("%s,%s\n" % (str(row_id), str(eachy)))
# Use the prior two days of returns as predictor values, with direction as
# the response
X = snpret[["Lag1", "Lag2"]]
y = snpret["Direction"]

# Train/test split
# NOTE(review): test_size=0.8 holds out 80% of the rows and trains on only
# 20% -- confirm this is intended rather than an 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=42
)

# Create the (parametrized) models as (label, estimator) pairs
print("Hit Rates/Confusion Matrices:\n")
models = [
    ("LR", LogisticRegression()),
    ("LDA", LDA()),
    ("QDA", QDA()),
    ("LSVC", LinearSVC()),
    # SVC keyword names corrected: cache_size, coef0, kernel.
    ("RSVM", SVC(
        C=1000000.0, cache_size=200, class_weight=None,
        coef0=0.0, degree=3, gamma=0.001, kernel='rbf',
        max_iter=-1, probability=False, random_state=None,
        shrinking=True, tol=0.001, verbose=False)
    ),
    # The comma above was missing, which made Python try to call the
    # "RSVM" tuple with the "RF" tuple's contents as arguments.
    ("RF", RandomForestClassifier(
        n_estimators=100, criterion='gini',
        max_depth=None, min_samples_leaf=1,
        max_features='auto', bootstrap=True,
        oob_score=False, n_jobs=1,
        random_state=None, verbose=0)
    ),
]

# Iterate through the models
for m in models:
    # Train each of the models on the training set
    m[1].fit(X_train, y_train)