def model_pred(trainX, trainY, testX, model_type):
    """Train the requested model on the training data and predict ``testX``.

    Supported ``model_type`` values:

    * ``"rf"``     -- one random forest with 500 trees.
    * ``"gbdt"``   -- gradient boosting (6 estimators, learning rate 0.9).
    * ``"fusion"`` -- five random forests (100..500 trees); their predictions
      are summed and a sample is positive when the sum is at least 3.
      This assumes the labels are 0/1 -- TODO confirm against callers.

    Prints the fraction of positive predictions and returns the predictions
    (a numpy array for "rf"/"gbdt", a list of booleans for "fusion").

    Raises:
        ValueError: if ``model_type`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError on ``pred``).
    """
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators=500, n_jobs=20)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    elif model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6, learning_rate=0.9,
                                         random_state=0)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    elif model_type == "fusion":
        votes = np.zeros(len(testX))
        for n_trees in (100, 200, 300, 400, 500):
            clf = RandomForestClassifier(n_estimators=n_trees, n_jobs=20,
                                         bootstrap=True)
            clf.fit(trainX, trainY)
            votes += clf.predict(testX)
        # Majority vote: at least 3 of the 5 forests predicted the positive class.
        pred = list(votes >= 3)
    else:
        raise ValueError("unknown model_type: %r" % (model_type,))

    # Single-argument print call emits the same text on Python 2 and 3.
    print("the pos rate is: %s" % (float(sum(pred)) / len(pred)))
    return pred
def model_pred(trainX, trainY, testX, model_type):
    """Train the requested model on the training data and predict ``testX``.

    Supported ``model_type`` values:

    * ``"rf"``     -- one random forest with 500 trees.
    * ``"gbdt"``   -- gradient boosting (6 estimators, learning rate 0.9).
    * ``"fusion"`` -- five random forests (100..500 trees); their predictions
      are summed and a sample is positive when the sum is at least 3.
      This assumes the labels are 0/1 -- TODO confirm against callers.

    Prints the fraction of positive predictions and returns the predictions
    (a numpy array for "rf"/"gbdt", a list of booleans for "fusion").

    Raises:
        ValueError: if ``model_type`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError on ``pred``).
    """
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators=500, n_jobs=20)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    elif model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6, learning_rate=0.9,
                                         random_state=0)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    elif model_type == "fusion":
        votes = np.zeros(len(testX))
        for n_trees in (100, 200, 300, 400, 500):
            clf = RandomForestClassifier(n_estimators=n_trees, n_jobs=20,
                                         bootstrap=True)
            clf.fit(trainX, trainY)
            votes += clf.predict(testX)
        # Majority vote: at least 3 of the 5 forests predicted the positive class.
        pred = list(votes >= 3)
    else:
        raise ValueError("unknown model_type: %r" % (model_type,))

    # Single-argument print call emits the same text on Python 2 and 3.
    print("the pos rate is: %s" % (float(sum(pred)) / len(pred)))
    return pred
class LexicaseForestClassifier(BaseEstimator, ClassifierMixin):
    """Random forest whose final trees are picked by epsilon-lexicase selection.

    An oversized forest of ``n_estimators * initial_forrest_factor`` trees is
    trained first. ``fit`` then calls ``epsilon_lexicase_selection`` on that
    pool ``n_estimators`` times and keeps only the selected trees.
    """

    def __init__(self, initial_forrest_factor=5, n_estimators=10, **kwargs):
        """Configure the underlying ``RandomForestClassifier``.

        :param initial_forrest_factor: multiplier applied to ``n_estimators``
            to size the initial (oversized) forest.
        :param n_estimators: number of trees kept after selection.
        :param kwargs: forwarded to ``RandomForestClassifier``; any
            ``n_estimators`` entry in it is overwritten with the initial size.
        """
        self._initial_forrest_size = n_estimators * initial_forrest_factor
        self._final_forrest_size = n_estimators
        rf_fit_args = copy(kwargs)
        rf_fit_args.update({'n_estimators': self._initial_forrest_size})
        self._rf = RandomForestClassifier(**rf_fit_args)

    def fit(self, X, y):
        """Fit the oversized forest, then reduce it by lexicase selection.

        Returns ``self`` so calls can be chained, as scikit-learn estimators
        are expected to do.
        """
        self._rf.fit(X, y)
        # Each tree carries its per-sample error on the training data; the
        # selection routine is presumably driven by this attribute -- confirm
        # against epsilon_lexicase_selection.
        for tree in self._rf.estimators_:
            tree._error_vector = squared_error_vector(y, tree.predict(X))
        # The whole pool is offered to every draw, so the selection routine
        # decides whether a tree may be chosen more than once.
        self._rf.estimators_ = [
            epsilon_lexicase_selection(self._rf.estimators_)
            for _ in range(self._final_forrest_size)
        ]
        self._rf.n_estimators = self._final_forrest_size
        # TODO: Set other self._rf parameters to match correct size so that predict works.
        return self

    def predict(self, X, y=None):
        """Return the reduced forest's predictions for ``X`` (``y`` is ignored)."""
        return self._rf.predict(X)
def stkFoldCrossValidation():
    """Run 10-fold stratified cross-validation of a default random forest.

    Features and labels are read from the pickle files ``X.p`` and ``Y.p`` in
    the working directory. For every fold the 1-based fold number and the
    classification report on the held-out part are printed. Returns nothing.
    """
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    with open('X.p', 'rb') as fh:
        X = np.array(pickle.load(fh))
    with open('Y.p', 'rb') as fh:
        Y = np.array(pickle.load(fh))

    skf = StratifiedKFold(n_splits=10)
    for k, (train_index, test_index) in enumerate(skf.split(X, Y), start=1):
        print(k)
        rf = RandomForestClassifier()
        rf.fit(X[train_index], Y[train_index])
        yp = rf.predict(X[test_index])
        print(classification_report(Y[test_index], yp, digits=6))
class RandomForestClassifierImpl():
    """Thin wrapper that stores hyperparameters and builds ``SKLModel`` on fit."""

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=True, oob_score=False,
                 n_jobs=None, random_state=None, verbose=0, warm_start=False,
                 class_weight='balanced'):
        """Record every constructor argument under its own name."""
        self._hyperparams = dict(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
        )

    def fit(self, X, y=None):
        """Create a fresh ``SKLModel`` from the stored settings and fit it.

        ``y`` is passed through only when it is not ``None``. Returns ``self``.
        """
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is None:
            self._sklearn_model.fit(X)
        else:
            self._sklearn_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the fitted model's ``predict``."""
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's ``predict_proba``."""
        return self._sklearn_model.predict_proba(X)
def forest(X, y, model_path):
    """Fit a default random forest on (X, y), report on it, and persist it.

    The classification report and confusion matrix are computed on the same
    data the model was trained on, so they describe training fit only. The
    fitted model is written to ``model_path`` with joblib.
    """
    classifier = RandomForestClassifier()
    classifier.fit(X, y)

    training_predictions = classifier.predict(X)
    print(metrics.classification_report(y, training_predictions))
    print(metrics.confusion_matrix(y, training_predictions))

    joblib.dump(classifier, model_path)
def calcRandomForestClassifier(channels_training, channels_testing,
                               target_training, target_testing):
    """Fit a 500-tree random forest and predict the test channels.

    ``max_features`` is the integer part of the square root of the number of
    entries in the first training row.

    Returns ``(fitted_classifier, [predictions, target_testing,
    channels_testing])``.
    """
    feature_count = len(channels_training[0])
    forest = RandomForestClassifier(n_estimators=500,
                                    max_features=int(sqrt(feature_count)))
    fitted = forest.fit(channels_training, target_training)
    comparison = [fitted.predict(channels_testing), target_testing,
                  channels_testing]
    return fitted, comparison
def calc_score(test, train):
    """Return the accuracy of a default random forest trained on ``train``.

    Both arguments are split into (features, labels) by ``split_data_label``;
    the forest is fitted on the training part and scored on the test part.
    """
    test_features, test_labels = split_data_label(test)
    train_features, train_labels = split_data_label(train)

    # Train the model, then measure the accuracy rate.
    model = RandomForestClassifier()
    model.fit(train_features, train_labels)
    predicted = model.predict(test_features)
    return metrics.accuracy_score(test_labels, predicted)
def _drawfeature_words(texts, total):
    """Run ``words_to_features`` over every text, printing progress every 1000."""
    words = []
    for line_no, text in enumerate(texts, start=1):
        if line_no % 1000 == 0:
            print('Drawfeatures line %d of %d' % (line_no, total))
        words.append(words_to_features(text))
    return words


def drawfeature(train_data_path, train_file_name, test_data_path, test_file_name):
    """Train a random forest and a multinomial NB on TF-IDF text features.

    Both CSV files must have a ``text`` column; the training file also needs
    a ``label`` column. Predictions for the test file are written to
    ``SENTI_RF.CSV`` (random forest) and ``SENTI_MNB`` (naive Bayes) in the
    working directory.

    Fixes relative to the earlier version:

    * The count vectorizer and the TF-IDF transformer are fitted on the
      training texts only and merely *applied* to the test texts. Before, the
      vectorizer was re-fitted on the test set, producing a different
      vocabulary, and TF-IDF was never applied to the test features.
    * ``print``/iteration work on both Python 2 and Python 3.
    """
    train_file = os.path.join(train_data_path, train_file_name)
    train_data = pd.read_csv(train_file)
    n_data_train = train_data['text'].size
    print('n_data_train is %s' % n_data_train)
    print(type(n_data_train))

    test_file = os.path.join(test_data_path, test_file_name)
    test_data = pd.read_csv(test_file)
    n_data_test = test_data['text'].size
    print('n_data_test is %s' % n_data_test)
    print(type(n_data_test))

    vectorizer = CountVectorizer(analyzer='word', tokenizer=None,
                                 preprocessor=None, stop_words=None,
                                 max_features=5000)
    transformer = TfidfTransformer()

    print('start with words in train data set')
    # Iterates the column in row order; equals the former positional lookup
    # for the default RangeIndex that read_csv produces.
    train_data_words = _drawfeature_words(train_data['text'], n_data_train)

    print('start bag of words in train data....')
    train_data_features = vectorizer.fit_transform(train_data_words)
    train_data_features = train_data_features.toarray()

    print('start tfidf in train data....')
    train_data_features = transformer.fit_transform(train_data_features)
    train_data_features = train_data_features.toarray()

    # test-data processing: reuse the vocabulary and IDF weights learned above
    test_data_words = _drawfeature_words(test_data['text'], n_data_test)
    test_data_features = vectorizer.transform(test_data_words).toarray()
    test_data_features = transformer.transform(test_data_features).toarray()

    print('randome forest go...')
    forest = RandomForestClassifier(n_estimators=13)
    forest = forest.fit(train_data_features, train_data['label'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred, name='Target')
    pred.to_csv('SENTI_RF.CSV', index=None, header=None)

    print('naive baby go...')
    mnb = MultinomialNB(alpha=0.01)
    mnb = mnb.fit(train_data_features, train_data['label'])
    pred = mnb.predict(test_data_features)
    pred = pd.Series(pred, name='Target')
    pred.to_csv('SENTI_MNB', index=None, header=True)
def RandomForestClassifer(self):
    '''
    Fit a 100-tree random forest on the training frame and predict the test frame.

    Column 0 of ``titanic_train_frame`` is used as the label and the remaining
    columns as features. Stores the test matrix in ``self.test_Array`` and
    the predictions in ``self.predicted_probability`` (plus a list copy in
    ``self.predicted_probability_list``). Despite the attribute name, the
    values come from ``predict``, i.e. they are class labels.
    '''
    training_matrix = self.titanic_train_frame.values
    self.test_Array = self.titanic_test_frame.values

    labels = training_matrix[:, 0]
    features = training_matrix[:, 1:]

    model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    model.fit(features, labels)

    self.predicted_probability = model.predict(self.test_Array[:, :])
    self.predicted_probability_list = self.predicted_probability.tolist()
class RFClassifier(super.abstract_classifier):
    """Random-forest classifier that keeps its own training data.

    NOTE(review): the base class is looked up on a name ``super``; presumably
    a project module imported under that name shadows the builtin -- confirm.
    """

    def __init__(self, train_features, train_labels, num_of_trees):
        """Store the training set and create a forest of ``num_of_trees`` trees."""
        self.train_features = train_features
        self.train_labels = train_labels
        self.rf_member = RandomForestClassifier(n_estimators=num_of_trees)

    def train(self):
        """Fit the forest on the stored training features and labels."""
        features, labels = self.train_features, self.train_labels
        self.rf_member.fit(features, labels)

    def classify(self, newVector):
        """Return the forest's prediction for ``newVector``."""
        prediction = self.rf_member.predict(newVector)
        return prediction
def build_and_test_model(classifier, X, Y, Z, param):
    """Evaluate a classifier with leave-one-out cross-validation.

    :param classifier: one of ``"KNN"``, ``"RF"``, ``"SVM"``, ``"NAIVE"`` or
        ``"RANDOM"`` (random choice among the training labels).
    :param X: feature array, indexed with the fold index arrays.
    :param Y: labels scored with accuracy.
    :param Z: labels scored with the adjusted Rand index.
    :param param: ``n_neighbors`` for KNN, ``n_estimators`` for RF; unused by
        the other classifiers.
    :return: ``(mean accuracy, std accuracy, mean ARI, std ARI)`` over folds.
    :raises ValueError: if ``classifier`` is not a supported name. Previously
        an unknown name left the prediction as ``None`` and failed inside the
        metric call.
    """
    supported = ("KNN", "RF", "SVM", "NAIVE", "RANDOM")
    if classifier not in supported:
        raise ValueError("unknown classifier %r; expected one of %s"
                         % (classifier, ", ".join(supported)))

    accuracies = []
    ari = []
    for train, test in LeaveOneOut().split(X):
        X_train, Y_train = X[train], Y[train]
        X_test, Y_test, Z_test = X[test], Y[test], Z[test]

        if classifier == "KNN":
            neigh = KNeighborsClassifier(n_neighbors=param).fit(
                X_train, Y_train)
            predicted = neigh.predict(X_test)
        elif classifier == "RF":
            clf = RandomForestClassifier(n_estimators=param, random_state=0)
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test)
        elif classifier == "SVM":
            clf = svm.SVC(gamma='scale')
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)
        elif classifier == "NAIVE":
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)
        else:  # "RANDOM": baseline that guesses among the training labels
            options = list(set(Y_train))
            predicted = [random.choice(options) for _ in range(len(Y_test))]

        accuracies.append(metrics.accuracy_score(Y_test, predicted))
        ari.append(metrics.adjusted_rand_score(Z_test, predicted))

    return np.mean(accuracies), np.std(accuracies), np.mean(ari), np.std(ari)
def evalOne(enabledColumns):
    """Score one feature subset with 5-fold station-group cross-validation.

    :param enabledColumns: sequence of truthy/falsy flags, one per entry of
        the module-level ``all_features``; truthy entries are used.
    :return: ``(f1, accuracy)`` computed over the pooled test predictions of
        all five folds.

    In each fold one entry of the module-level ``groups`` is the test set and
    the other four are the training set. The target returned by the split
    helper is discarded; instead a sample is labelled 0 when its location is
    in the first half of its station list and 1 otherwise.
    """
    features = [name for i, name in enumerate(all_features)
                if enabledColumns[i]]
    Y = []
    P = []
    for group in range(5):
        trainStationList = []
        testStationList = []
        for i in range(5):
            if i == group:
                testStationList.extend(groups[i])
            else:
                trainStationList.extend(groups[i])

        trainStations = set(float(station) for station in trainStationList)
        # reorder train stations so they follow the order of all_stations
        trainStationList = [s for s in all_stations
                            if float(s) in trainStations]
        testStations = set(float(station) for station in testStationList)

        trainX, testX, trainY, testY, trainLocation, testLocation = \
            splitDataForXValidationWithLocation(
                trainStations, testStations, "location", data, features,
                "target")

        # "Lower" half = indices i with i < len / 2.0, i.e. the first
        # ceil(len / 2) stations. Sets make the per-sample lookup O(1).
        train_lower = set(
            float(s) for s in trainStationList[:(len(trainStationList) + 1) // 2])
        test_lower = set(
            float(s) for s in testStationList[:(len(testStationList) + 1) // 2])

        trainY = [0 if loc in train_lower else 1 for loc in trainLocation]
        testY = [0 if loc in test_lower else 1 for loc in testLocation]

        model = RandomForestClassifier(random_state=42, n_estimators=20,
                                       max_depth=9, n_jobs=-1)
        model.fit(trainX, trainY)
        predY = model.predict(testX)

        Y.extend(testY)
        P.extend(predY)

    f1 = f1_score(Y, P)
    accuracy = accuracy_score(Y, P)
    return f1, accuracy
class Model(BaseModel):
    """Antares implementation of scikit learn random forest classifier
    """

    def __init__(self, categorical_features=None, n_estimators=50, n_jobs=-1,
                 max_depth=10):
        '''
        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        '''
        Encode X with ``hot_encode_training`` and fit the underlying forest.
        '''
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Encode X with ``hot_encode_predict`` and pass down the prediction
        from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get confidence of every prediction

        The confidence is the largest class probability per sample.
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        Applies the model to the given dataset and returns the underlying
        model's ``score`` against the response variable y. Usually the
        original dataset should be split into training and testing subsets
        to cross-validate the model.

        X is encoded with ``hot_encode_predict`` first, exactly as in
        ``predict``; previously the raw X was scored against a model that
        had been fitted on encoded data.
        '''
        X = self.hot_encode_predict(X)
        return self.model.score(X, y)
def classic_model(image_dir, image_lists, method):
    """Train a 1000-tree random forest and return its test accuracy.

    Training data is what ``get_X_y`` returns for the ``'training'`` and
    ``'validation'`` categories; evaluation uses the ``'testing'`` category.
    A cross-tabulation of actual vs. predicted classes is printed.
    """
    X, y = get_X_y(image_dir, image_lists, ['training', 'validation'], method)
    classifier = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    classifier.fit(X, y)

    X_test, y_test = get_X_y(image_dir, image_lists, ['testing'], method)
    predictions = classifier.predict(X_test)

    confusion = pandas.crosstab(y_test, predictions,
                                rownames=['Actual Class'],
                                colnames=['Predicted Class'])
    # Function-call form prints identically on Python 2 and Python 3.
    print(confusion)
    return accuracy_score(y_test, predictions)
def just_pred(x, y):
    """Fit a default random forest on a random 70/30 split of (x, y).

    :param x: pandas DataFrame of features.
    :param y: pandas Series of labels, row-aligned with ``x``.
    :return: ``(ytest, ypred)`` -- the held-out labels and the predictions.

    Fixes relative to the earlier version: the shuffled order is now actually
    used (it was computed and then ignored), the train and test parts no
    longer share a row (label-inclusive ``.ix`` slicing put row ``trainlen``
    in both), and the removed ``.ix`` indexer is replaced by ``.iloc``.
    """
    xlen = len(x)
    trainlen = int(0.7 * xlen)

    order = np.random.permutation(xlen)
    train_rows, test_rows = order[:trainlen], order[trainlen:]

    xtrain, ytrain = x.iloc[train_rows], y.iloc[train_rows]
    xtest, ytest = x.iloc[test_rows], y.iloc[test_rows]

    rf = RandomForestClassifier()
    rf.fit(xtrain, ytrain)
    ypred = rf.predict(xtest)
    return ytest, ypred
def just_pred(x, y):
    """Fit a default random forest on a random 70/30 split of (x, y).

    :param x: pandas DataFrame of features.
    :param y: pandas Series of labels, row-aligned with ``x``.
    :return: ``(ytest, ypred)`` -- the held-out labels and the predictions.

    Fixes relative to the earlier version: the shuffled order is now actually
    used (it was computed and then ignored), the train and test parts no
    longer share a row (label-inclusive ``.ix`` slicing put row ``trainlen``
    in both), and the removed ``.ix`` indexer is replaced by ``.iloc``.
    """
    xlen = len(x)
    trainlen = int(0.7 * xlen)

    order = np.random.permutation(xlen)
    train_rows, test_rows = order[:trainlen], order[trainlen:]

    xtrain, ytrain = x.iloc[train_rows], y.iloc[train_rows]
    xtest, ytest = x.iloc[test_rows], y.iloc[test_rows]

    rf = RandomForestClassifier()
    rf.fit(xtrain, ytrain)
    ypred = rf.predict(xtest)
    return ytest, ypred
def random_forest(profile, group, n_tree, search_number, avg_acc):
    '''
    Model the abundance table with random forests over random seeds.

    For each of ``search_number`` randomly drawn seeds, a forest is fitted 10
    times on 30 randomly sampled rows of ``profile`` and scored on the
    remaining rows; the mean accuracy per seed is written to ``avg_acc``.

    :param profile: abundance table (rows are samples)
    :param group: grouping table; its first column holds the class names.
        A ``label`` column with integer codes is added to it in place.
    :param n_tree: number of trees in the model
    :param search_number: number of random seeds to try
    :param avg_acc: output file for the accuracy of each random seed
    :return: ``group`` with the ``label`` column added
    '''
    # Integer code per distinct class name (order follows set iteration).
    label_dict = {}
    for code, name in enumerate(set(group.iloc[:, 0])):
        label_dict[name] = code
    group['label'] = [label_dict[name] for name in group.iloc[:, 0]]

    # Loop-invariant: the full sample set used to derive each validation part.
    all_samples = set(profile.index)

    n = 0
    with open(avg_acc, 'w') as f:
        f.write('random_state\tavgAcc\n')
        while n < search_number:
            print('现在循环次数为{0}'.format(n+1))
            # int() keeps the seed an integer on Python 2, where round()
            # returns a float that scikit-learn rejects as random_state.
            random_state = int(round(random() * 10000))
            rf = RandomForestClassifier(n_estimators=n_tree,
                                        max_leaf_nodes=3,
                                        random_state=random_state)
            acc = []
            for _ in range(10):
                sample_train = list(profile.sample(n=30).index)
                sample_val = list(all_samples.difference(sample_train))
                train = profile.loc[sample_train]
                val = profile.loc[sample_val]
                rf.fit(train, group['label'][sample_train])
                pre = rf.predict(val)
                acc.append(metrics.accuracy_score(
                    y_true=group['label'][sample_val], y_pred=pre))
            f.write('{0}\t{1}\n'.format(random_state, sum(acc) / 10))
            n += 1
    return group
def decision_frist():
    """Compare a small decision tree with a default random forest on iris.

    The data is split 75/25; for each model the test predictions and the
    test score are printed, decision tree first.
    """
    iris = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris["data"], iris["target"], test_size=0.25)

    candidates = (
        DecisionTreeClassifier(max_leaf_nodes=3),
        RandomForestClassifier(),
    )
    for estimator in candidates:
        estimator.fit(X_train, y_train)
        print(estimator.predict(X_test))
        print(estimator.score(X_test, y_test))
def crossval(x, y, k=5):
    """Repeat a random 70/30 train/test evaluation ``k`` times.

    :param x: pandas DataFrame of features.
    :param y: pandas Series of labels, row-aligned with ``x``.
    :param k: number of random repetitions.

    Each round fits a default random forest on a fresh random 70% of the rows
    and prints its predictions for the remaining rows. Returns nothing.

    Fixes relative to the earlier version: it referenced an undefined name
    ``X``; the shuffled index was never used, so every round saw the same
    split; label-inclusive ``.ix`` slicing shared one row between train and
    test; ``.ix`` and the ``print`` statement no longer exist.
    """
    xlen = len(x)
    trainlen = int(0.7 * xlen)

    for _ in range(k):
        order = np.random.permutation(xlen)
        train_rows, test_rows = order[:trainlen], order[trainlen:]

        rf = RandomForestClassifier()
        rf.fit(x.iloc[train_rows], y.iloc[train_rows])
        ypred = rf.predict(x.iloc[test_rows])
        print(ypred)
def crossval(x, y, k=5):
    """Repeat a random 70/30 train/test evaluation ``k`` times.

    :param x: pandas DataFrame of features.
    :param y: pandas Series of labels, row-aligned with ``x``.
    :param k: number of random repetitions.

    Each round fits a default random forest on a fresh random 70% of the rows
    and prints its predictions for the remaining rows. Returns nothing.

    Fixes relative to the earlier version: it referenced an undefined name
    ``X``; the shuffled index was never used, so every round saw the same
    split; label-inclusive ``.ix`` slicing shared one row between train and
    test; ``.ix`` and the ``print`` statement no longer exist.
    """
    xlen = len(x)
    trainlen = int(0.7 * xlen)

    for _ in range(k):
        order = np.random.permutation(xlen)
        train_rows, test_rows = order[:trainlen], order[trainlen:]

        rf = RandomForestClassifier()
        rf.fit(x.iloc[train_rows], y.iloc[train_rows])
        ypred = rf.predict(x.iloc[test_rows])
        print(ypred)
def predict2(text):
    """Predict the complaint category of ``text``.

    The labelled complaints in ``<BASE_DIR>/dataset/complain.json`` are read,
    a bag-of-words model (at most 500 features) is fitted on their cleaned
    title + body, and a 100-tree random forest is trained on it. The forest's
    prediction for the cleaned ``text`` is returned as a one-element array.

    NOTE: the whole model is rebuilt on every call.
    """
    # Read the dataset
    dataset_file = os.path.join(BASE_DIR, 'dataset', 'complain.json')
    with open(dataset_file) as handle:
        complaints = json.load(handle)

    # Cleaned training texts and their category labels, in file order.
    clean_train_complain = []
    target_problem_type = []
    for complain in complaints:
        clean_train_complain.append(
            clean_data('%s %s' % (complain['title'], complain['complain'])))
        target_problem_type.append(complain['category'])

    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=500)
    train_data_feature = vectorizer.fit_transform(
        clean_train_complain).toarray()

    # Random forest with 100 trees, fitted with the bag of words as features
    # and the complaint categories as the response variable.
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_data_feature, target_problem_type)

    # Encode the query with the vocabulary learned from the training texts.
    test_data_features = vectorizer.transform([clean_data(text)]).toarray()
    result = forest.predict(test_data_features)
    return result
def forestPredict(columName, features, trees):
    """Predict match results round by round and save them to CSV.

    :param columName: suffix of the output file ``pred_<columName>.csv``.
    :param features: column names used as model inputs.
    :param trees: number of trees in each random forest.

    Reads ``features2.csv`` from ``path.NOTEBOOKS_DATA``. For every
    championship id 1..90 except 11..20, and for every round from 2 up to the
    championship's last round, a forest is trained on all earlier rounds and
    its predictions for the current round are stored in the ``pred`` column.

    Changes relative to the earlier version: ``DataFrame.set_value`` (removed
    from pandas) is replaced by ``.at``; rounds without training or test rows
    are skipped instead of crashing inside scikit-learn.
    """
    # Global pandas option: silences chained-assignment warnings process-wide.
    pd.options.mode.chained_assignment = None
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features2.csv')
    df['pred'] = ""
    df = df.set_index([df.m_match_id])
    target = 'm_column_result'

    for champId in range(1, 91):
        champ = df[(df.m_championship_id == champId)]
        print(champId)
        if 11 <= champId <= 20:
            continue

        # Number of rounds is derived from the number of matches.
        if len(champ) == 380:
            rd = 38
        elif len(champ) == 306:
            rd = 34
        else:
            rd = 30

        for mid in range(2, rd + 1):
            train = champ[champ.m_match_group_num < mid]
            test = champ[champ.m_match_group_num == mid]
            if train.empty or test.empty:
                continue

            X = train[features]
            y = train[target]
            Z = test[features]
            clf = RandomForestClassifier(n_estimators=trees,
                                         max_features=None)
            clf.fit(X, y)
            pred = clf.predict(Z)
            for i, p in zip(Z.index, pred):
                df.at[i, 'pred'] = p

    nameFile = 'pred_' + columName + ".csv"
    df.to_csv(path.NOTEBOOKS_DATA + nameFile, index=False)
def predict(text, dataset_file_path):
    """Predict the label of ``text`` with a forest trained on a CSV dataset.

    :param text: raw text to classify.
    :param dataset_file_path: ``;``-separated file without quoting; column 0
        is the training text and column 1 its label.
    :return: the forest's prediction as a one-element array.

    The dataset file is now closed deterministically, and blank rows are
    skipped instead of raising IndexError.

    NOTE: the whole model is rebuilt on every call.
    """
    clean_train_data = []
    target_data = []
    with open(dataset_file_path, 'r') as data_file:
        reader = csv.reader(data_file, delimiter=';', quoting=csv.QUOTE_NONE)
        for line in reader:
            if not line:
                continue
            clean_train_data.append(clean_data(line[0]))
            target_data.append(line[1])

    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=500)
    train_data_feature = vectorizer.fit_transform(clean_train_data).toarray()

    # Random forest with 100 trees, fitted with the bag of words as features
    # and the dataset's second column as the response variable.
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_data_feature, target_data)

    # Encode the query with the vocabulary learned from the training texts.
    test_data_features = vectorizer.transform([clean_data(text)]).toarray()
    result = forest.predict(test_data_features)
    return result
def plot_rf(self):
    """Plot the data points and the random forest's decision regions.

    Rows of ``self.dataset`` are read as (A, B, label). A forest with
    ``self.bestScoreN`` trees is fitted on ``self.X``/``self.Y`` and
    evaluated on a grid from -0.2 up to (but excluding) 4.4 in steps of 0.2
    on both axes; the predicted regions are drawn as filled contours.
    """
    tree_count = self.bestScoreN

    label0_rows = [row for row in self.dataset if row[2] == 0]
    label1_rows = [row for row in self.dataset if row[2] == 1]
    A0 = [row[0] for row in label0_rows]
    A1 = [row[0] for row in label1_rows]
    B0 = [row[1] for row in label0_rows]
    B1 = [row[1] for row in label1_rows]

    axis_ticks = np.arange(-0.2, 4.4, 0.2)
    Xplot, Yplot = np.meshgrid(axis_ticks, axis_ticks)

    model = RandomForestClassifier(n_estimators=tree_count).fit(self.X, self.Y)
    grid_points = np.c_[Xplot.ravel(), Yplot.ravel()]
    predicted = model.predict(grid_points).reshape(Xplot.shape)

    plot0 = plt.scatter(A0, B0, marker='+', color='red')
    plot1 = plt.scatter(A1, B1, marker='o', color='green')
    plt.legend((plot0, plot1), ('label 0', 'label 1'), scatterpoints=1)
    plt.xlabel('A')
    plt.ylabel('B')
    plt.title("RF Classifier")
    plt.contourf(Xplot, Yplot, predicted, alpha=0.5)
    plt.show()
def forestPredict4():
    """Walk-forward prediction in blocks of 10 matches, written back to CSV.

    Reads ``features3.csv`` from ``path.NOTEBOOKS_DATA``. Championships are
    processed in windows of ten ids (1-10, 11-20, ... 71-80); inside a window
    the matches are sorted by date, and for every block of 10 matches a
    1000-tree forest is trained on all earlier matches of the window. The
    predictions go into the ``rf1000_fs4`` column and the file is overwritten.

    ``DataFrame.set_value`` (removed from pandas) is replaced by ``.at``.
    """
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features3.csv')
    df.index = df.m_match_id
    df['rf1000_fs4'] = ""
    # Global pandas option: silences chained-assignment warnings process-wide.
    pd.options.mode.chained_assignment = None

    # Loop-invariant model inputs and target.
    features = ['m_odd_home', 'm_odd_away', 'm_odd_underdog',
                'm_odd_favorite', 'm_odd_draw', 'm_odd_medium',
                'a_goals_for_mean', 'h_goals_for_mean']
    target = 'm_column_result'

    for t1 in range(11, 91, 10):
        print(t1)
        champ = df[(df.m_championship_id < t1) &
                   (df.m_championship_id >= t1 - 10)].sort_values(
                       ['m_match_date'])
        for t2 in range(10, len(champ), 10):
            train = champ[0:t2]
            test = champ[t2:t2 + 10]
            X = train[features]
            y = train[target]
            Z = test[features]
            clf = RandomForestClassifier(n_estimators=1000)
            clf.fit(X, y)
            pred = clf.predict(Z)
            for t3, p in zip(Z.index, pred):
                df.at[t3, 'rf1000_fs4'] = p

    df.to_csv(path.NOTEBOOKS_DATA + 'features3.csv', index=False)
def RandomForestIndependent():
    """Train a 10-tree random forest on all data and pickle it to ``rf.p``.

    Features and labels are read from ``X.p`` and ``Y.p``. The classification
    report on the training data itself is printed.

    The earlier version routed the data through
    ``train_test_split(..., test_size=0.0)``. That only shuffled the rows
    (everything ended up in the training part) and is rejected with a
    ValueError by current scikit-learn, so the model is now fitted on the
    loaded data directly. There is therefore no held-out test set here.
    """
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    with open('X.p', 'rb') as fh:
        X = pickle.load(fh)
    with open('Y.p', 'rb') as fh:
        Y = pickle.load(fh)

    print('**** *****')
    rf = RandomForestClassifier(n_estimators=10)
    rf.fit(X, Y)

    yp = rf.predict(X)
    print('**** Training *****')
    print(classification_report(Y, yp))

    with open('rf.p', 'wb') as fh:
        pickle.dump(rf, fh)
class Model(BaseModel):
    '''
    Random forest model (150 trees, 8 jobs) with joblib persistence.
    '''

    def __init__(self, path):
        '''
        Remember ``path`` and create the unfitted classifier.
        '''
        self.path = path
        self.model = RandomForestClassifier(n_estimators=150, n_jobs=8)
        self.model_name = 'rf'

    def fit(self, X, y):
        '''
        Fit the underlying classifier on X and y.
        '''
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        prediction = self.model.predict(X)
        return prediction

    def save(self, filepath):
        '''
        Persists the trained model to a file named ``<model_name>.pkl``
        at the location built by ``create_filename`` from ``filepath``.
        '''
        file_name = '%s.pkl' % self.model_name
        joblib.dump(self.model, create_filename(filepath, file_name))

    def load(self, filepath):
        '''
        Loads an already trained model from ``<model_name>.pkl`` at the
        location built by ``create_filename`` from ``filepath``.
        '''
        file_name = '%s.pkl' % self.model_name
        self.model = joblib.load(create_filename(filepath, file_name))

    def score(self, X, y):
        '''
        Returns the underlying model's score for X against y.
        '''
        result = self.model.score(X, y)
        return result
def forestPredict7030(columName, features, trees):
    """Predict the last championships of each ten-id window and save to CSV.

    :param columName: suffix of the output file ``pred_<columName>.csv``.
    :param features: column names used as model inputs.
    :param trees: number of trees in each random forest.

    Reads ``features2.csv`` from ``path.NOTEBOOKS_DATA``. Championship ids are
    processed in windows of ten (1-10, 21-30, ... 71-80; the window 11-20 is
    skipped). Within a window, ids up to ``window_end - 4`` train the forest
    and the later ids are predicted into the ``pred`` column.

    Fixes relative to the earlier version: the window frame ``champ`` is now
    filtered with its own column (it was indexed with a boolean Series built
    from the full ``df``, which relies on index re-alignment), and the removed
    ``DataFrame.set_value`` is replaced by ``.at``.
    """
    # Global pandas option: silences chained-assignment warnings process-wide.
    pd.options.mode.chained_assignment = None
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features2.csv')
    df['pred'] = ""
    df = df.set_index([df.m_match_id])
    target = 'm_column_result'

    for champId in range(11, 91, 10):
        champ = df[(df.m_championship_id < champId) &
                   (df.m_championship_id >= champId - 10)]
        print(champId)
        if champId != 21:
            train = champ[champ.m_championship_id <= champId - 4]
            test = champ[champ.m_championship_id > champId - 4]
            X = train[features]
            y = train[target]
            Z = test[features]
            clf = RandomForestClassifier(n_estimators=trees,
                                         max_features=None)
            clf.fit(X, y)
            pred = clf.predict(Z)
            for i, p in zip(Z.index, pred):
                df.at[i, 'pred'] = p

    nameFile = 'pred_' + columName + ".csv"
    df.to_csv(path.NOTEBOOKS_DATA + nameFile, index=False)
def main():
    """Rank the features of ``cora.content`` with a random forest.

    The tab-separated file is read as: first column an id, last column the
    class, everything in between features named ``feat_<i>``. A 200-tree
    forest is trained on 70% of the rows; macro precision/recall/F-score and
    accuracy on the remaining 30% are printed, followed by the names of the
    20 most important features.

    Fix: the file is read with ``header=None``. The column names are assigned
    by this function, so letting pandas infer a header silently discarded the
    first data row.
    """
    df = pd.read_csv("cora.content", sep="\t", header=None)

    feat_names = ["feat_" + str(i) for i in range(df.shape[1] - 2)]
    df.columns = ["id"] + feat_names + ["class"]
    feats = np.array(feat_names)

    x_train, x_test, y_train, y_test = train_test_split(
        df[feats], df["class"], test_size=0.3
    )

    clf = RandomForestClassifier(n_estimators=200)
    clf.fit(x_train, y_train)

    # Pair every feature with its importance, most important first.
    importances = clf.feature_importances_
    sorted_idx = np.argsort(importances)
    ranked = sorted(zip(feats[sorted_idx], importances[sorted_idx]),
                    key=lambda pair: -pair[1])

    # Statistics
    y_pred = clf.predict(x_test)
    precision, recall, fscore, _ = score(y_test, y_pred, average="macro")
    print("Precision:", round(precision, 3))
    print("Recall:   ", round(recall, 3))
    print("F-Score:  ", round(fscore, 3))
    print("Accuracy: ", round((y_pred == y_test).sum() / len(y_pred), 3))

    selected_feats = [key for key, val in ranked[:20]]
    print(selected_feats)
class StackingFusion(FusionStrategy):
    '''
    The StackingFusion learns a fusion strategy from training data.
    A classifier is trained that uses the posterior probabilities from
    all microphones in the sensor network as input features.
    '''

    def __init__(self, channel_sort=ChannelSortNone()):
        '''
        Constructor
        @param channel_sort: An object of type ChannelSortStrategy.
        '''
        self.stacked_classifier = None
        self.channel_sort = channel_sort

    def train(self, log_probs, labels):
        '''
        Train the stacked classifier
        @param log_probs: list of probability matrices (channels, label)
        @param labels: label for each feature-vector
        '''
        # Function-call form prints identically on Python 2 and Python 3.
        print('Train stacked classifier with %d windows' % labels.shape[0])
        log_probs = [self.channel_sort.sort(f) for f in log_probs]
        log_probs = np.vstack(log_probs)
        # TODO: classifier as Parameter
        self.stacked_classifier = RandomForestClassifier(n_estimators=10)
        self.stacked_classifier.fit(log_probs, labels)

    def apply(self, log_probs):
        '''
        Apply fusion strategy to classifier probabilities
        @param log_probs: log probabilities for each channel and class
                          in shape (channel, class)
        @return: Class index for the predicted class

        train() must have been called first; until then the stacked
        classifier is None.
        '''
        log_probs = self.channel_sort.sort(log_probs)
        # return the classindex as a scalar not as an array
        return self.stacked_classifier.predict(log_probs)[0]
for train, test in kf: y_train = [] x_train = [] for i in train: y_train.append(features[i][6]) tmp = [features[i][0], features[i][1], features[i][2], features[i][3], features[i][4], features[i][5]] x_train.append(tmp) y_test = [] x_test = [] for i in test: y_test.append(features[i][6]) tmp = [features[i][0], features[i][1], features[i][2], features[i][3], features[i][4], features[i][5]] x_test.append(tmp) rf.fit(x_train, y_train) rfPredTest = rf.predict(x_test) rfPrecisionTest = precision_score(y_test, rfPredTest) rfRecallTest = recall_score(y_test, rfPredTest) rfF1Test = f1_score(y_test, rfPredTest) rfAvgPrecision += rfPrecisionTest rfAvgRecall += rfRecallTest rfAvgF1 += rfF1Test print "RF completed in ", time.time() - start, " s" print "rf:\n Precision {}\n Recall {}\n F1 {}\n".format(rfAvgPrecision / 5, rfAvgRecall / 5, rfAvgF1 / 5)
# treino, teste e avaliacao print('Iniciando o k-Fold...') for train_index, test_index in k_fold.split(tf_idf): x_train, x_test = tf_idf[train_index], tf_idf[test_index] y_train, y_test = classes[train_index], classes[test_index] # treino do modelo print(f'Gerando o Modelo {i}...') classifier = RandomForestClassifier(n_estimators=10, criterion='gini', random_state=iteracao).fit( x_train, y_train) # classificando o conjunto de teste y_pred = classifier.predict(x_test) # metricas de desempenho aux_accuracy += accuracy_score(y_test, y_pred) aux_f1_score += f1_score(y_test, y_pred) aux_precision += precision_score(y_test, y_pred) aux_recall += recall_score(y_test, y_pred) conf_matrices += np.asarray(confusion_matrix(y_test, y_pred)) print(f'Modelo {i} finalizado e avaliado.') i += 1 # resultados print(f'\nITERATION #{iteracao} -----------------------') print(f'Accuracy = {aux_accuracy / k_fold.n_splits}') print(f'F1 Score = {aux_f1_score / k_fold.n_splits}')
# Hold out 40% of the authorship data for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    authorship[features], authorship.Author_num.values,
    test_size=0.4, random_state=123)

# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20, max_depth=10, verbose=1)
etclf.fit(x_train, y_train)

# Print Confusion Matrix
# confusion_matrix takes (y_true, y_pred); the arguments used to be swapped,
# which transposed the matrix (rows were predictions instead of true labels).
metrics.confusion_matrix(y_test, etclf.predict(x_test))

# Import from the public package: the private module sklearn.ensemble.forest
# no longer exists in current scikit-learn releases.
from sklearn.ensemble import RandomForestClassifier

rdclf = RandomForestClassifier(n_estimators=20, max_depth=10)
rdclf.fit(x_train, y_train)
metrics.confusion_matrix(y_test, rdclf.predict(x_test))

# Same for sklearn.ensemble.weight_boosting.
from sklearn.ensemble import AdaBoostClassifier

adaclf = AdaBoostClassifier(n_estimators=20)
adaclf.fit(x_train, y_train)
metrics.confusion_matrix(y_test, adaclf.predict(x_test))

# Side-by-side recap of the three confusion matrices (only shown when run
# interactively, since the results are not printed).
metrics.confusion_matrix(y_test, etclf.predict(x_test))
metrics.confusion_matrix(y_test, rdclf.predict(x_test))
metrics.confusion_matrix(y_test, adaclf.predict(x_test))
y_predict = m.predict(X_test) fpr, tpr, thresh = roc_curve(y_test, y_predict, pos_label=1) auc = roc_auc_score(y_test, y_predict) print 'AUC: ', auc print 'Percentage of players that will have TJ in 2014: ',np.mean(y_predict) return fpr, tpr, auc rf_fpr, rf_tpr, rf_auc = evaluate_model(RandomForestClassifier) svc_fpr, svc_tpr, svc_auc = evaluate_model(SVC) RFC2 = RandomForestClassifier(n_estimators = 10) RFC2.fit(X, y) predict_players['predictions']=RFC2.predict(predict_players[X_cols]) predict_players.to_csv('testing.csv') print 'Players that RF thinks will have TJ in 2014', predict_players['m1_name'][predict_players['predictions']==1] the_doomed = predict_players['m1_name'][predict_players['predictions']==1] injuries2014 = pd.read_csv('.\\intermediate data\\injuries2014.csv') for each_doomed_person in the_doomed.values: if each_doomed_person in injuries2014.values: print each_doomed_person, 'has in fact undergone TJ in 2014!' else: print each_doomed_person, "did not end up having TJ in 2014..." for each_injured_person in injuries2014[injuries2014.columns[1]].values:
#download the file raw_data=urllib.urlopen(url) #get data, add column names and index feature_names=["times pregnant", "plasma glucose conc.", "distolic blood pressure (mm Hg)", "triceps skin fold thickness (mm)", "2-hour serum insulin (mu U/ml)", "body mass index (kg/m^2)", "diabetes pedigree function", "age (years)", "target"] dataset=pd.DataFrame.from_csv(raw_data) dataset=dataset.reset_index() dataset.columns=feature_names #split into train and test set train, test=train_test_split(dataset, test_size=0.3) #normalize data df_scaled_train=pd.DataFrame(preprocessing.scale(train), columns=feature_names) df_scaled_test=pd.DataFrame(preprocessing.scale(test), columns=feature_names) model=RandomForestClassifier(n_estimators = 100, oob_score = True, random_state =10, max_features = "auto", min_samples_leaf = 20) #train model #if getting this error, it is because a matrix with 1 column #is being passed in when a 1d array is expected. ravel() will work. #DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). if name == 'main': #To resolve this error, convert label values to int or str as float is not a valid label-type #raise ValueError("Unknown label type: %r" % y) ValueError: Unknown label type: array model.fit(df_scaled_train.ix[:,'times pregnant':'age (years)'], np.asarray(df_scaled_train.ix[:,'target'].astype(int))) print "Accuracy:", model.score(df_scaled_test.ix[:,'times pregnant':'age (years)'], np.asarray(df_scaled_test.ix[:,'target'].astype(int))) #predict output predicted=model.predict(df_scaled_test.ix[:,'times pregnant':'age (years)']) print predicted
# Public import path; the private module sklearn.ensemble.forest no longer
# exists in current scikit-learn releases.
from sklearn.ensemble import RandomForestClassifier


def read(fname):
    """Read a whitespace-separated data file into (labels, data).

    Every non-blank line yields one sample: the last field is the integer
    label and all fields except the last two are float features.

    NOTE(review): the second-to-last field is dropped as well as the label;
    confirm that this column is meant to be excluded.
    """
    labels, data = [], []
    with open(fname) as f:
        for s in f:
            ss = s.split()
            if not ss:
                # Blank line (e.g. trailing newline at end of file).
                continue
            labels.append(int(ss[-1]))
            # A real list: map() is lazy on Python 3 and cannot be fitted on.
            data.append([float(v) for v in ss[:-2]])
    return labels, data


trainset = read('./trainset')
testset = read('./testset')

clf = RandomForestClassifier(n_estimators=10)
clf.fit(trainset[1], trainset[0])

# Predictions first, then the true labels for comparison.
print(clf.predict(testset[1]))
print(testset[0])
columns=X_train.columns) #:# model params = {'max_depth': 3, 'n_estimators': 75} classifier = RandomForestClassifier(**params) classifier.fit(X_train, y_train) #:# hash #:# 5475503c9e4b64dc0dcc4960399cf72c md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest() print(f'md5: {md5}') #:# audit y_pred = classifier.predict(transform_pipeline.transform(X_test)) y_pred_proba = classifier.predict_proba( transform_pipeline.transform(X_test))[:, 1] tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel() print(f'acc: {accuracy_score(y_test, y_pred)}') print(f'auc: {roc_auc_score(y_test, y_pred_proba)}') print(f'precision: {precision_score(y_test, y_pred)}') print(f'recall: {recall_score(y_test, y_pred)}') print(f'specificity: {tn/(tn+fp)}') print(f'f1: {f1_score(y_test, y_pred)}') #:# session info # Dodaj wersję pythona w session info
def runns(resp_var, size_of_test_data, dataset, positive_class, predictor_var,
          n_estimators, important_features, dealing_with_nulls):
    """Train a random forest on the top-ranked predictors and report metrics.

    Pipeline: drop rows with a null response, apply ``deal_with_nulls``,
    one-hot encode the predictors, oversample to balance the classes, rank
    predictors by random-forest importance, keep the top
    ``important_features`` of them, then fit and evaluate a second forest on
    a train/test split.

    Args:
        resp_var: name of the response column.
        size_of_test_data: test fraction; converted with ``float()``.
        dataset: a pandas DataFrame. Anything else is handed to
            ``pd.read_csv``; ``None`` falls back to the legacy
            ``'raw_data.csv'`` file.
        positive_class: label treated as the positive class by the binary
            metrics.
        predictor_var: column name(s) selected from ``dataset`` as predictors.
        n_estimators: number of trees for both forests.
        important_features: how many of the top-ranked predictors to keep.
        dealing_with_nulls: strategy forwarded to ``deal_with_nulls``.

    Returns:
        A JSON string with the keys accuracy, precision, "average precision",
        recall, fscore, fbeta, hamming, jaccard, logloss, zero_one,
        area_under_roc, cohen and mathews.

    Raises:
        ValueError: if ``positive_class`` is not among the fitted classes.
    """
    # Honour the caller's data; previously the argument was always replaced
    # by a hard-coded test file.
    if not isinstance(dataset, pd.DataFrame):
        source = 'raw_data.csv' if dataset is None else dataset
        dataset = pd.read_csv(source, low_memory=False)

    # ---- DATA PREPROCESSING
    # Rows without a response cannot be used for training or evaluation.
    dataset = dataset.dropna(subset=[resp_var])
    dataset = deal_with_nulls(dealing_with_nulls, dataset)

    # ---- FEATURE SELECTION
    # Categorical predictors become dummy variables.
    predictors = pd.get_dummies(dataset[predictor_var])
    predictor_names = list(predictors.columns)

    # Balance the classes in the response.
    # NOTE(review): oversampling happens before the train/test split, so
    # copies of one row can presumably land on both sides - confirm whether
    # that is acceptable for the reported scores.
    ros = RandomOverSampler(random_state=0)
    # Newer imbalanced-learn releases name this fit_resample; older ones
    # only have fit_sample.
    resample = getattr(ros, 'fit_resample', None) or ros.fit_sample
    prds, resp = resample(predictors, dataset[resp_var])
    prds = np.asarray(prds, dtype=float)
    labels = np.asarray(resp)

    # Rank the predictors with a first forest.
    rf_clf = RandomForestClassifier(n_estimators=n_estimators)
    rf_clf.fit(prds, labels)
    feature_imp = pd.Series(rf_clf.feature_importances_,
                            index=predictor_names).sort_values(ascending=False)
    important_predictor_names = list(feature_imp.index[0:important_features])

    # Keep only the important predictors (this selection used to be computed
    # and then ignored, leaving `important_features` without effect).
    selected = pd.DataFrame(prds, columns=predictor_names)
    selected = selected[important_predictor_names]

    # ---- MODEL TRAINING
    train_features, test_features, train_labels, test_labels = train_test_split(
        selected.values, labels,
        test_size=float(size_of_test_data), random_state=402)

    clf = RandomForestClassifier(n_jobs=1, n_estimators=n_estimators,
                                 random_state=142)
    clf.fit(train_features, train_labels)

    # ---- EVALUATION
    predicted = clf.predict(test_features)
    pred_prob = clf.predict_proba(test_features)

    # Probability column of the positive class, looked up by label instead of
    # assuming it is column 1, plus the matching binary ground truth.
    pos_index = list(clf.classes_).index(positive_class)
    pos_prob = pred_prob[:, pos_index]
    is_positive = test_labels == positive_class

    output = {
        "accuracy": accuracy_score(test_labels, predicted),
        "precision": precision_score(test_labels, predicted,
                                     pos_label=positive_class),
        "average precision": average_precision_score(is_positive, pos_prob),
        "recall": recall_score(test_labels, predicted,
                               pos_label=positive_class),
        "fscore": f1_score(test_labels, predicted, pos_label=positive_class),
        # Same positive class as precision/recall/f1.
        "fbeta": fbeta_score(test_labels, predicted, beta=0.5,
                             pos_label=positive_class),
        "hamming": hamming_loss(test_labels, predicted),
        # NOTE(review): jaccard_similarity_score is absent from recent
        # scikit-learn releases; switching to jaccard_score needs a change to
        # the file's imports.
        "jaccard": jaccard_similarity_score(test_labels, predicted),
        # Log loss is defined on predicted probabilities, not hard labels.
        "logloss": log_loss(test_labels, pred_prob, labels=clf.classes_),
        "zero_one": zero_one_loss(test_labels, predicted),
        "area_under_roc": roc_auc_score(is_positive, pos_prob),
        "cohen": cohen_kappa_score(test_labels, predicted),
        "mathews": matthews_corrcoef(test_labels, predicted),
    }
    return json.dumps(output)
# Public import paths: the private sklearn.ensemble.forest and
# sklearn.metrics.classification modules no longer exist in current releases.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd

__author__ = 'semyon'

# Grid search over tree depth and leaf size for a random forest, printing a
# classification report on a held-out slice for every combination.

print("reading")
csv = pd.read_csv("data/train.csv")

print("slicing")
# Label-based column slice from 'x23' through 'x61' (inclusive), missing
# values replaced by 0. .loc/.values replace the removed .ix/.as_matrix().
train_features = csv.loc[:, 'x23':'x61'].fillna(0).values
train_true = csv['y'].tolist()

# First 35000 rows train the model; the remainder is used for evaluation.
trtrfe = train_features[:35000, :]
trtrtrue = train_true[:35000]
trtefe = train_features[35000:, :]
trtetrue = train_true[35000:]

print("learning")
for depth in [7, 10, 12, 15, 20, 30, 50, 70]:
    for leaf_samples in [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 20, 40, 60, 150]:
        # model = GradientBoostingClassifier(n_estimators=10, max_depth=depth, min_samples_leaf=leaf_samples, verbose=1)
        model = RandomForestClassifier(n_estimators=50,
                                       max_depth=depth,
                                       min_samples_leaf=leaf_samples,
                                       verbose=0,
                                       n_jobs=4)
        model.fit(trtrfe, trtrtrue)
        # mean accuracy on the given test data and labels
        # print depth, '\t', leaf_samples, '\t', model.score(trtefe, trtetrue)
        predicted = model.predict(trtefe)
        print(classification_report(trtetrue, predicted))
# Report on the full data set. Single-argument print(...) calls produce the
# same output on Python 2 and Python 3.
print("Confusion matrix:")
print(metrics.confusion_matrix(dat_clean.genre, predicted))

#####################
# Refit the classifier on a fixed subset of columns so the tree can be drawn.
data_tree = dat_clean.iloc[:, [3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15]]
clf = clf.fit(data_tree, dat_clean.genre)

# Visualize tree
# NOTE(review): the StringIO module only exists on Python 2; on Python 3 the
# file's imports would need io.StringIO instead.
dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data,
                     feature_names=list(data_tree.columns.values))
graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Some pydot releases return a list of graphs rather than a single graph.
if isinstance(graph, (list, tuple)):
    graph = graph[0]
graph.write_pdf('dectree.pdf')

# Repeat on test set
# NOTE(review): clf was just refit on the data_tree columns; X_test must have
# the same columns for this prediction to be valid - confirm.
y_test_pred = clf.predict(X_test)
print("Accuracy Test: {0:.3f}".format(metrics.accuracy_score(y_test, y_test_pred)))
print("")
print("Classification report:")
print(metrics.classification_report(y_test, y_test_pred))
print("")
print("Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_test_pred))

# Measure performance
y_pred = clf.predict_proba(X_train)

# Repeat on test set (class probabilities replace the hard predictions above)
y_test_pred = clf.predict_proba(X_test)

# .values replaces DataFrame.as_matrix(), which current pandas has removed.
tt = g_test.values
# Stratified k-fold evaluation of a random forest on flattened images.
# Entries whose classification is -1 are excluded.
mask = classifications != -1
print(mask.sum())

# Select the usable images once; the loop below re-uses this array instead of
# re-applying the boolean mask on every fold.
selected_images = images[mask]
X = selected_images.reshape(mask.sum(), np.prod(images.shape[1:]))
print(X.shape)
Y = classifications[mask]

acc = []              # per-fold accuracy
acc_correct = []      # max class probability of correctly classified samples
acc_incorrect = []    # max class probability of misclassified samples
acc_x_incorrect = []  # [images, true labels, predicted labels] of the misses
k_fold = 8

# NOTE(review): StratifiedKFold(Y, k) and compute_importances=True are the
# call forms of old scikit-learn releases; current ones expect
# StratifiedKFold(n_splits=k).split(X, Y) and reject compute_importances.
for train_inx, valid_inx in StratifiedKFold(Y, k_fold):
    rf = RandomForestClassifier(n_estimators=100, verbose=0, oob_score=True,
                                compute_importances=True)
    rf.fit(X[train_inx], Y[train_inx])

    Yp = rf.predict(X[valid_inx])
    correct = Yp == Y[valid_inx]

    # Confidence of each prediction = highest class probability. This is
    # computed once; a second, discarded predict_proba call was dropped.
    p_correct = rf.predict_proba(X[valid_inx]).max(axis=1)
    acc_correct.append(p_correct[correct])
    acc_incorrect.append(p_correct[~correct])

    score = correct.mean()
    print(score)
    acc.append(score)

    missed = valid_inx[~correct]
    acc_x_incorrect.append([selected_images[missed], Y[missed], Yp[~correct]])

print('score %s' % np.mean(acc))
class TAERandomForestClassifier(object):
    """Random forest over mixed categorical and numeric columns.

    Categorical columns are label-encoded and then one-hot encoded; the
    result is concatenated with the numeric columns and fed to a
    ``RandomForestClassifier``. The encoders fitted in ``fit`` are re-used
    by ``predict``.
    """

    # Hyper-parameters forwarded to RandomForestClassifier in fit();
    # override on the class or on an instance before calling fit().
    n_estimators = 100
    max_features = 7
    max_depth = 16

    # Set by fit(); None until then.
    dummy_encoder = None
    rfc_model = None

    def __init__(self):
        # Per-instance state. The encoder dict used to be a class attribute,
        # so every instance shared (and overwrote) the same encoders.
        self.lab_encoders = {}
        self.dummy_encoder = None
        self.rfc_model = None

    def encode_fit(self, cat_data):
        """Fit the label and one-hot encoders on ``cat_data`` and encode it.

        Args:
            cat_data: DataFrame holding only the categorical columns.

        Returns:
            DataFrame of one-hot columns, named by the encoder's
            ``get_feature_names()``.
        """
        # Start from a clean, instance-owned dict so encoders from an earlier
        # fit (or from another instance) never leak in.
        self.lab_encoders = {}

        # Encode strings to numeric labels, column by column.
        tdc_set_encoded = cat_data.copy(deep=True)
        for cn in cat_data.columns:
            encoder = preprocessing.LabelEncoder()
            tdc_set_encoded[cn] = encoder.fit_transform(cat_data[cn])
            self.lab_encoders[cn] = encoder

        # Encode the numeric labels to a dummy (one-hot) data set.
        self.dummy_encoder = preprocessing.OneHotEncoder(categories="auto")
        self.dummy_encoder.fit(tdc_set_encoded[cat_data.columns])
        return pd.DataFrame(
            data=self.dummy_encoder.transform(tdc_set_encoded).todense(),
            columns=self.dummy_encoder.get_feature_names())

    def encode(self, cat_data):
        """Encode ``cat_data`` with the encoders fitted by ``encode_fit``.

        The input frame is left untouched; encoding works on a copy.

        Args:
            cat_data: DataFrame holding the same categorical columns that
                were passed to ``encode_fit``.

        Returns:
            DataFrame of one-hot columns, named by the encoder's
            ``get_feature_names()``.

        Raises:
            KeyError: if a column has no fitted label encoder.
        """
        encoded = cat_data.copy(deep=True)
        for cn in cat_data.columns:
            encoded[cn] = self.lab_encoders[cn].transform(cat_data[cn])

        # Encode the numeric labels to a dummy (one-hot) data set.
        return pd.DataFrame(
            data=self.dummy_encoder.transform(encoded).todense(),
            columns=self.dummy_encoder.get_feature_names())

    def fit(self, x_train, y_train, cat_cols, num_cols):
        """Fit the encoders and the random forest.

        Args:
            x_train: DataFrame with the training features.
            y_train: training labels.
            cat_cols: names of the categorical columns of ``x_train``.
            num_cols: names of the numeric columns of ``x_train``.
        """
        # Separate the data set into categorical and numeric parts.
        x_train_num = x_train[num_cols].copy(deep=True)
        x_train_cat = self.encode_fit(x_train[cat_cols].copy(deep=True))

        # Both parts need the same positional index, otherwise concat aligns
        # on index labels and misplaces rows.
        x_train_num.reset_index(drop=True, inplace=True)
        x_train_cat.reset_index(drop=True, inplace=True)
        f_x_train = pd.concat([x_train_num, x_train_cat], axis=1)

        self.rfc_model = RandomForestClassifier(
            n_estimators=self.n_estimators,
            criterion="entropy",
            max_features=self.max_features,
            max_depth=self.max_depth)
        self.rfc_model = self.rfc_model.fit(f_x_train, y_train)

    def predict(self, x_predict, cat_cols, num_cols):
        """Predict labels for ``x_predict`` with the fitted model.

        Args:
            x_predict: DataFrame with the features to predict on.
            cat_cols: names of the categorical columns (as used in ``fit``).
            num_cols: names of the numeric columns (as used in ``fit``).

        Returns:
            The predictions of the underlying random forest, one per row of
            ``x_predict`` in row order.
        """
        # Separate the data set into categorical and numeric parts.
        x_predict_num = x_predict[num_cols].copy(deep=True)
        x_predict_cat = self.encode(x_predict[cat_cols].copy(deep=True))

        # Same index handling as fit(): without it, a frame whose index is
        # not 0..n-1 is misaligned against the freshly built one-hot frame.
        x_predict_num.reset_index(drop=True, inplace=True)
        x_predict_cat.reset_index(drop=True, inplace=True)
        f_x_predict = pd.concat([x_predict_num, x_predict_cat], axis=1)

        y_pred = self.rfc_model.predict(f_x_predict)
        return y_pred

    def cal_conf_matrix(self, x_test, y_test, catego_columns, numeric_cols):
        """Print the confusion matrix and the accuracy on a test set.

        Args:
            x_test: DataFrame with the test features.
            y_test: true labels for ``x_test``.
            catego_columns: names of the categorical columns.
            numeric_cols: names of the numeric columns.
        """
        y_pred = self.predict(x_test, catego_columns, numeric_cols)

        # Original note: [[TP, FP], [FN, TN]].
        # NOTE(review): scikit-learn documents rows as true labels and columns
        # as predicted labels, i.e. [[TN, FP], [FN, TP]] for a binary problem
        # with labels in sorted order - confirm which reading is intended.
        print("Matriz de confusión:")
        print(metrics.confusion_matrix(y_test, y_pred))

        # Run several times and see how it varies. Based on the Jaccard index.
        # The value printed under this label is metrics.accuracy_score.
        print("Precisión:", metrics.accuracy_score(y_test, y_pred))
import autopath
from datasets import training_set, test_set
from util import convert_gray_scale, flatten

# Hold-out evaluation of a random forest on gray-scaled, flattened images,
# followed by a histogram of prediction confidence.
Xr, Yr = training_set
Xe, Ye = test_set

Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))

# NOTE(review): compute_importances=True is only accepted by old scikit-learn
# releases; current ones reject the argument.
rf = RandomForestClassifier(n_estimators=100, verbose=3, oob_score=True,
                            compute_importances=True)
rf.fit(Xr, Yr)
Yp = rf.predict(Xe)

# Mask of correctly classified test samples, computed once and re-used below.
correct = Yp == Ye

# Test accuracy. A single-argument print(...) call behaves the same on
# Python 2 and Python 3.
print(np.mean(correct))

# Confidence of each prediction = highest class probability.
Ypp = rf.predict_proba(Xe).max(axis=1)

# Overlay the confidence distributions of hits and misses.
# NOTE(review): `normed` is the spelling of older matplotlib releases; newer
# ones call this parameter `density`.
plt.figure(1)
plt.clf()
plt.hist(Ypp[correct], 50, color='b', normed=True, alpha=0.4,
         label='classified')
plt.hist(Ypp[~correct], 50, color='r', normed=True, alpha=0.4,
         label='misclassified')
plt.legend(loc='upper left')
plt.draw()
plt.show()

plt.figure(3)