def random_forest(train_vec, train_label, n_estimators=10, min_samples_split=2,
                  min_samples_leaf=1, criterion="entropy"):
    """Fit a random forest classifier and print its accuracy on the training data.

    The hyper-parameter arguments are forwarded to ``RandomForestClassifier``.
    (Previously they were accepted but ignored in favour of hard-coded values.)

    :param train_vec: training feature matrix
    :param train_label: training labels, one per row of ``train_vec``
    :param n_estimators: number of trees in the forest
    :param min_samples_split: minimum samples required to split a node
    :param min_samples_leaf: minimum samples required at a leaf
    :param criterion: split quality measure, e.g. ``"entropy"`` or ``"gini"``
    :return: the fitted classifier
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        criterion=criterion,
    )
    # ``fit_transform`` was removed from forest estimators; only the fitted
    # model is needed here, so plain ``fit`` is the right call.
    model.fit(train_vec, train_label)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('Random Forest Classification Accu: ' + str(model.score(train_vec, train_label)))
    return model
def predict_on_test_set(label_group):
    """Train a random forest on the sample training CSV and evaluate it on the test CSV.

    Reads ``sample_train.csv`` and ``sample_test.csv`` from ``SAMPLES_FILE_PATH``,
    preprocesses both with ``Preprocess`` for the given label group, fits the
    forest, benchmarks the predictions and plots the feature importances.

    :param label_group: passed to ``Preprocess`` as ``which_labels`` and to the plot helper
    :return: the first value returned by ``benchmark`` (the error rate)
    """
    sample = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_train.csv"))
    test = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_test.csv"))

    preprocessed = Preprocess(sample, which_labels=label_group)
    rf = RandomForestClassifier(
        n_estimators=80,
        criterion="entropy",
        bootstrap=True,
        max_features='sqrt',
        max_depth=40,
    )
    # ``fit_transform`` was removed from forest estimators and its output was
    # never used here; ``fit`` is sufficient.
    rf.fit(X=preprocessed.features, y=preprocessed.labels.values.ravel())

    test_preprocessed = Preprocess(test, which_labels=label_group)
    predicted_labels = rf.predict(test_preprocessed.features)
    error_rate, _ = benchmark(predicted_labels.ravel(),
                              test_preprocessed.labels.values)

    plot_feature_importances(preprocessed.features.columns.values,
                             rf.feature_importances_,
                             label_group)
    # Previously computed and discarded; returning it is backward-compatible.
    return error_rate
class RandomForestModel(FreshnessModel):
    """Freshness model backed by a 10-tree ``RandomForestClassifier``."""

    def __init__(self, trainfile, testfile, extra_features_file=None):
        """Initialise the base model and create the (unfitted) classifier."""
        super(RandomForestModel, self).__init__(trainfile, testfile, extra_features_file)
        self.clf = RandomForestClassifier(n_estimators=10, max_depth=None)

    def train(self, data=None, target=None):
        """Fit the classifier.

        :param data: feature matrix; falls back to ``self.data`` when ``None``
        :param target: labels; falls back to ``self.target`` when ``None``
        """
        # presumably self.data / self.target are populated by FreshnessModel
        # -- TODO confirm against the base class.
        if data is None:
            data = self.data
        if target is None:
            target = self.target
        # ``fit_transform`` was removed from forest estimators; the transformed
        # output was never used, so ``fit`` is the correct call.
        self.clf.fit(data, target)

    def pred(self, X):
        """Return the predicted labels for ``X``."""
        return self.clf.predict(X)
def rf_selection(x, y, n, max_depth, k):
    '''
    Select the ``k`` most important features according to a random forest.

    :param x: features (DataFrame; column names are used as feature names)
    :param y: target (object exposing ``.values``)
    :param n: number of estimators of the random forest
    :param max_depth: max_depth of the random forest
    :param k: number of features to retain
    :return: list of the retained feature names, most important first
    '''
    rf = RandomForestClassifier(n_estimators=n, max_depth=max_depth)
    # ``fit_transform`` was removed from forest estimators; only the fitted
    # importances are needed here.
    rf.fit(x.values, y.values)

    ranking = pd.DataFrame({
        'feature': list(x.columns),
        'importance': list(rf.feature_importances_),
    })
    ranking = ranking.sort_values(by='importance', ascending=False)
    return list(ranking['feature'].iloc[:k])
def main():
    """Train a random forest on the training CSV and score the test CSV.

    Writes the predicted log-probabilities for the test rows to a text file
    and prints the test rows reduced to the features the forest considers
    important.
    """
    # Loading the training set and test set.
    # Raw strings keep the Windows backslashes literal without relying on
    # unrecognised escape sequences.
    path1 = r"C:\Python32\A2PW1.csv"
    path2 = r"C:\Python32\A2PW3.csv"
    train = read_csv(path1, has_header=True)
    # First column is the label, the rest are features.
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = read_csv(path2, has_header=True)
    test = [x[1:] for x in test]
    print('The training set is:')
    print(train)
    print('The test set is:')
    print(test)

    # create the model
    rf = RandomForestClassifier(n_estimators=100)
    # throw the data into model
    rf.fit(train, target)

    predicted_probs = rf.predict_log_proba(test)
    print(predicted_probs)
    output_file_path = r"C:\Python32\pythontoday.txt"
    numpy.savetxt(output_file_path, predicted_probs, delimiter=',', fmt='%1.4e')

    # The original called rf.fit_transform(test, target), which refits the
    # forest on the *test* rows using the *training* labels. Reuse the model
    # trained above and only apply importance-based feature selection to test.
    from sklearn.feature_selection import SelectFromModel
    newArr = SelectFromModel(rf, prefit=True).transform(test)
    print('newArr becomes: ', newArr)
def cross_validate_number_of_trees(label_group):
    """Report and plot the out-of-bag error for several forest sizes.

    Fits one forest per tree count on the preprocessed sample training set
    and records ``1 - oob_score_`` for each.

    :param label_group: passed to ``Preprocess`` as ``which_labels`` and to the plot helper
    """
    sample = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_train.csv"))
    # The test CSV was loaded here but never used; out-of-bag scoring only
    # needs the training sample, so the read has been dropped.
    preprocessed = Preprocess(sample, which_labels=label_group)
    features = preprocessed.features
    labels = preprocessed.labels.values.ravel()

    n_trees = (5, 10, 30, 60, 80)
    oob_scores = []
    for n_tree in n_trees:
        rf = RandomForestClassifier(
            n_estimators=n_tree,
            criterion="entropy",
            oob_score=True,
            bootstrap=True,
            max_features='sqrt',
            max_depth=40,
        )
        # ``fit_transform`` was removed from forest estimators; ``fit`` is enough.
        rf.fit(X=features, y=labels)
        score = 1.0 - rf.oob_score_
        oob_scores.append(score)
        # Single-argument print() works on both Python 2 and Python 3.
        print("Out-of-Bag Error for Number of Trees %s: %s" % (n_tree, score))

    plot_oob_error_n_tress(n_trees, oob_scores, label_group)
def RFClassify(trainData, trainLabel, testData):
    """Train a 10-tree random forest and return its predictions for ``testData``.

    :param trainData: training feature matrix
    :param trainLabel: training labels, one per row of ``trainData``
    :param testData: feature matrix to classify
    :return: predicted labels for ``testData``
    """
    rfClf = RandomForestClassifier(
        n_estimators=10,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        # 'auto' meant sqrt(n_features) for classifiers and has since been
        # removed; 'sqrt' is the equivalent spelling.
        max_features='sqrt',
        max_leaf_nodes=None,
        bootstrap=True,
        oob_score=False,
        n_jobs=1,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
    )
    # ``fit_transform`` was removed from forest estimators; its output was unused.
    rfClf.fit(trainData, trainLabel)
    testlabel = rfClf.predict(testData)
    return testlabel
def cross_validate_depth(label_group):
    """Report and plot the out-of-bag error for several maximum tree depths.

    Fits one 80-tree forest per depth on the preprocessed sample training set
    and records ``1 - oob_score_`` for each.

    :param label_group: passed to ``Preprocess`` as ``which_labels`` and to the plot helper
    """
    sample = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_train.csv"))
    # The test CSV was loaded here but never used; out-of-bag scoring only
    # needs the training sample, so the read has been dropped.
    preprocessed = Preprocess(sample, which_labels=label_group)
    features = preprocessed.features
    labels = preprocessed.labels.values.ravel()

    depths = (2, 40, 60, 80)
    oob_scores = []
    for depth in depths:
        rf = RandomForestClassifier(
            n_estimators=80,
            criterion="entropy",
            oob_score=True,
            bootstrap=True,
            max_features='sqrt',
            max_depth=depth,
        )
        # ``fit_transform`` was removed from forest estimators; ``fit`` is enough.
        rf.fit(X=features, y=labels)
        score = 1.0 - rf.oob_score_
        oob_scores.append(score)
        # Single-argument print() works on both Python 2 and Python 3.
        print("Out-of-Bag Error for Depth %s: %s" % (depth, score))

    plot_oob_error_depth(depths, oob_scores, label_group)
def _resample_with_smote_tomek(X, ycl, neighbour_counts=(5, 2)):
    """Resample with SMOTETomek, trying each neighbour count in order.

    Returns the first successful resampling, or the inputs unchanged when
    every attempt fails.
    """
    for k in neighbour_counts:
        try:
            sm = SMOTETomek(k=k)  # k is number of nearest neighbours
            X_smt, y_smt = sm.fit_sample(X, ycl)
        except Exception:
            # Best effort: fall through to the next (smaller) k.
            continue
        return X_smt, y_smt
    return X, ycl


def rfc_with_smote(X, ycl, n_estimators=32, min_samples_leaf=5, max_depth=3):
    """Train a random forest on SMOTE-resampled, feature-selected data.

    The data is resampled with SMOTETomek (k=5, falling back to k=2, then to
    the raw data), split 80/20, binarised (``label > 0``), reduced with
    ``SelectFromModel`` and finally used to fit and evaluate the classifier.

    Previously the k=5 result was always overwritten by the k=2 attempt, so
    the first resampling was wasted work; it is now kept when it succeeds.

    :param X: feature matrix
    :param ycl: label vector
    :param n_estimators: number of trees (selector and final model)
    :param min_samples_leaf: minimum samples per leaf (selector and final model)
    :param max_depth: maximum tree depth (selector and final model)
    :return: ``(rfc, precision, recall, f1, support)``
    """
    X_smt, y_smt = _resample_with_smote_tomek(X, ycl)

    X_train, X_test, y_train, y_test = train_test_split(
        X_smt, y_smt, test_size=0.20, random_state=42)
    ycl_train = (y_train > 0).astype(int)
    ycl_test = (y_test > 0).astype(int)

    # select features
    sfm = SelectFromModel(RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
    ))
    sfm.fit(X_train, ycl_train)  # fit the selector using the training set
    X_train = sfm.transform(X_train)  # reduce the matrix to the selected features
    X_test = sfm.transform(X_test)

    rfc = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
    )
    # ``fit_transform`` was removed from forest estimators; selection is
    # already handled by ``sfm`` above.
    rfc.fit(X_train, ycl_train)

    # return some performance statistics of the model
    precision, recall, f1, support = evaluate_model(rfc, X_test, ycl_test, threshold=0.5)
    return rfc, precision, recall, f1, support
class RandomForest(predictor.IBayesDBForeignPredictor):
    """A Random Forest foreign predictor.

    The `targets` must be a single categorical stattype.

    The `conditions` may be arbitrary numerical or categorical columns.
    """

    @classmethod
    def create(cls, bdb, table, targets, conditions):
        """Build a predictor trained on the given columns of `table`."""
        cols = [c for c, _ in targets + conditions]
        df = bdbcontrib.bql_utils.table_to_df(bdb, table, cols)
        rf = cls()
        rf.train(df, targets, conditions)
        rf.prng = bdb.np_prng
        return rf

    @classmethod
    def serialize(cls, _bdb, pred):
        """Return a pickled dict with the state needed to rebuild `pred`."""
        state = {
            'targets': pred.targets,
            'conditions_numerical': pred.conditions_numerical,
            'conditions_categorical': pred.conditions_categorical,
            'rf_full': pred.rf_full,
            'rf_partial': pred.rf_partial,
            'categories_to_val_map': pred.categories_to_val_map
        }
        return pickle.dumps(state)

    @classmethod
    def deserialize(cls, bdb, binary):
        """Rebuild a predictor from the output of `serialize`."""
        # NOTE(review): pickle.loads executes arbitrary code; `binary` must
        # only ever come from a trusted source.
        state = pickle.loads(binary)
        rf = cls(targets=state['targets'],
                 conditions_numerical=state['conditions_numerical'],
                 conditions_categorical=state['conditions_categorical'],
                 rf_full=state['rf_full'],
                 rf_partial=state['rf_partial'],
                 categories_to_val_map=state['categories_to_val_map'])
        rf.prng = bdb.np_prng
        return rf

    @classmethod
    def name(cls):
        """Return the registered name of this predictor."""
        return 'random_forest'

    def __init__(self, targets=None, conditions_numerical=None,
                 conditions_categorical=None, rf_full=None, rf_partial=None,
                 categories_to_val_map=None):
        self.targets = targets
        self.conditions_numerical = conditions_numerical
        self.conditions_categorical = conditions_categorical
        # `conditions` is only defined once both column lists are known
        # (i.e. after `train` or `deserialize`).
        if (conditions_numerical is not None
                and conditions_categorical is not None):
            self.conditions = self.conditions_numerical + \
                self.conditions_categorical
        self.rf_full = rf_full
        self.rf_partial = rf_partial
        self.categories_to_val_map = categories_to_val_map

    def train(self, df, targets, conditions):
        """Validate the column specification and train both forests on `df`.

        `targets` and `conditions` are sequences of (column name, stattype)
        pairs. Raises BLE(ValueError) unless there is exactly one categorical
        target and at least one condition.
        """
        # Obtain the targets column.
        if len(targets) != 1:
            raise BLE(
                ValueError('RandomForest requires exactly one column in '
                           'targets. Received {}'.format(targets)))
        if targets[0][1].lower() != 'categorical':
            raise BLE(
                ValueError('RandomForest can only classify CATEGORICAL '
                           'columns. Received {}'.format(targets)))
        self.targets = [targets[0][0]]
        # Obtain the condition columns.
        if len(conditions) < 1:
            raise BLE(
                ValueError('RandomForest requires at least one column in '
                           'conditions. Received {}'.format(conditions)))
        self.conditions_categorical = []
        self.conditions_numerical = []
        for c in conditions:
            if c[1].lower() == 'categorical':
                self.conditions_categorical.append(c[0])
            else:
                self.conditions_numerical.append(c[0])
        self.conditions = self.conditions_numerical + \
            self.conditions_categorical
        # The dataset.
        self.dataset = pd.DataFrame()
        # Lookup for categoricals to code.
        self.categories_to_val_map = dict()
        # Training set (regressors and labels)
        self.X_numerical = np.ndarray(0)
        self.X_categorical = np.ndarray(0)
        self.Y = np.ndarray(0)
        # Random Forests.
        self.rf_partial = RandomForestClassifier(n_estimators=100)
        self.rf_full = RandomForestClassifier(n_estimators=100)
        # Preprocess the data.
        self.dataset = utils.extract_sklearn_dataset(self.conditions,
                                                     self.targets, df)
        self.categories_to_val_map = utils.build_categorical_to_value_map(
            self.conditions_categorical, self.dataset)
        self.X_categorical = utils.extract_sklearn_features_categorical(
            self.conditions_categorical, self.categories_to_val_map,
            self.dataset)
        self.X_numerical = utils.extract_sklearn_features_numerical(
            self.conditions_numerical, self.dataset)
        self.Y = utils.extract_sklearn_univariate_target(
            self.targets, self.dataset)
        # Train the random forest.
        self._train_rf()

    def _train_rf(self):
        """Trains the random forests classifiers.

        We train two classifiers, `partial` which is just trained on
        `conditions_numerical`, and `full` which is trained on
        `conditions_numerical+conditions_categorical`.

        This safe-guard feature is critical for querying; otherwise sklearn
        would crash whenever a categorical value unseen in training due to
        filtering (but existent in df nevertheless) was passed in.
        """
        # `fit_transform` was removed from forest estimators and its output
        # was never used; `fit` trains the models identically.
        self.rf_partial.fit(self.X_numerical, self.Y)
        self.rf_full.fit(
            np.hstack((self.X_numerical, self.X_categorical)), self.Y)

    def _compute_targets_distribution(self, conditions):
        """Given conditions dict {feature_col:val}, returns the distribution
        and (class mapping for lookup) of the random label
        self.targets|conditions.
        """
        if not set(self.conditions).issubset(set(conditions.keys())):
            raise BLE(
                ValueError('Must specify values for all the conditionals.\n'
                           'Received: {}\n'
                           'Expected: {}'.format(
                               conditions, self.conditions_numerical +
                               self.conditions_categorical)))
        # Are there any category values in conditions which never appeared
        # during training? If yes, we need to run the partial RF.
        unseen = any(
            conditions[cat] not in self.categories_to_val_map[cat]
            for cat in self.conditions_categorical)
        X_numerical = [conditions[col] for col in self.conditions_numerical]
        if unseen:
            forest = self.rf_partial
            row = np.asarray(X_numerical)
        else:
            forest = self.rf_full
            X_categorical = [
                conditions[col] for col in self.conditions_categorical
            ]
            X_categorical = utils.binarize_categorical_row(
                self.conditions_categorical, self.categories_to_val_map,
                X_categorical)
            row = np.hstack((X_numerical, X_categorical))
        # sklearn expects a 2-D (n_samples, n_features) array, so present the
        # single query as one row.
        distribution = forest.predict_proba(row.reshape(1, -1))
        # Read the class order from the forest that produced the distribution
        # (the full branch previously read it from rf_partial).
        classes = forest.classes_
        return distribution[0], classes

    def simulate(self, n_samples, conditions):
        """Draw `n_samples` target values given `conditions`."""
        distribution, classes = self._compute_targets_distribution(conditions)
        draws = self.prng.multinomial(1, distribution, size=n_samples)
        # Each draw is a one-hot vector; map its hot index back to a class.
        return [classes[np.where(d == 1)[0][0]] for d in draws]

    def logpdf(self, value, conditions):
        """Return the log probability of `value` given `conditions`
        (-inf for a value that is not one of the trained classes).
        """
        distribution, classes = self._compute_targets_distribution(conditions)
        if value not in classes:
            return -float('inf')
        return np.log(distribution[np.where(classes == value)[0][0]])
def RFClassify(trainData, trainLabel, testData):
    """Train a 10-tree random forest and return its predictions for ``testData``.

    :param trainData: training feature matrix
    :param trainLabel: training labels, one per row of ``trainData``
    :param testData: feature matrix to classify
    :return: predicted labels for ``testData``
    """
    rfClf = RandomForestClassifier(
        n_estimators=10,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        # 'auto' meant sqrt(n_features) for classifiers and has since been
        # removed; 'sqrt' is the equivalent spelling.
        max_features='sqrt',
        max_leaf_nodes=None,
        bootstrap=True,
        oob_score=False,
        n_jobs=1,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
    )
    # ``fit_transform`` was removed from forest estimators; its output was unused.
    rfClf.fit(trainData, trainLabel)
    testlabel = rfClf.predict(testData)
    return testlabel
def make_test(train_source, test_source, light_type=None, validation=False,
              v_size=0.5, estimators=85):
    """Train a random forest on a CSV and report its accuracy.

    Column 0 of each CSV is the label, columns ``1..feature_count-1`` are the
    features, where ``feature_count`` is the number of comma-separated fields
    on the first line of ``train_source``.

    :param train_source: path of the training CSV
    :param test_source: path of the test CSV (used when ``validation`` is false)
    :param light_type: label used in the printed report
    :param validation: when true, hold out part of the training data instead
        of reading ``test_source`` and print the per-class breakdown
    :param v_size: hold-out fraction used when ``validation`` is true
    :param estimators: number of trees in the forest
    :return: ``clf.score`` on the evaluation data, or ``0`` when fewer than
        100 evaluation rows are available
    """
    train = read_csv(train_source)

    # Count the columns from the first line; the context manager closes the
    # handle that was previously leaked.
    with open(train_source) as handle:
        first_line = next(handle, None)
    feature_count = None if first_line is None else len(first_line.split(","))
    feature_columns = list(range(1, feature_count))

    trainX = np.asarray(train[feature_columns])
    trainY = np.asarray(train[[0]]).ravel()
    # print "All Data size: " + str(len(trainX))

    if validation:
        # --- CROSS VALIDATION ---
        trainX, testX, trainY, testY = cross_validation.train_test_split(
            trainX, trainY, test_size=v_size, random_state=0)
    else:
        # --- TEST DATA ---
        test = read_csv(test_source)
        testX = np.asarray(test[feature_columns])
        testY = np.asarray(test[[0]]).ravel()

    # NOTE(review): the flattened source does not show whether this guard was
    # nested under the test-data branch only; it is applied to both modes
    # here -- confirm against the original layout.
    if len(testX) < 100:
        return 0

    # Single-argument print() works on both Python 2 and Python 3.
    print("Train size: " + str(len(trainX)))
    print("Test size: " + str(len(testX)))

    # --- KNN ---
    # clf = KNeighborsClassifier(metric='minkowski', n_neighbors=1, p=2)
    # --- SVM ---
    # clf = svm.SVC()
    # --- Random Forest ---
    clf = RandomForestClassifier(n_estimators=estimators)
    # ``fit_transform`` was removed from forest estimators; ``fit`` is enough.
    clf.fit(trainX, trainY)

    # Counter names are <predicted>_<actual>.
    true_false = 0
    true_true = 0
    false_true = 0
    false_false = 0
    true = 0
    false = 0
    # Predict the whole evaluation set at once instead of one 1-D row at a
    # time (1-D input is rejected by current scikit-learn and is much slower).
    predictions = clf.predict(testX)
    for predicted, actual in zip(predictions, testY):
        # Explicit ``== True`` / ``== False`` comparisons are kept on purpose:
        # labels may be bools or 0/1 numbers, and any other value must be
        # counted exactly as before.
        if actual == True:  # noqa: E712
            true += 1
        else:
            false += 1
        # print str(predicted) + " " + str(actual)
        if predicted == True and actual == False:  # noqa: E712
            true_false += 1
        if predicted == True and actual == True:  # noqa: E712
            true_true += 1
        if predicted == False and actual == False:  # noqa: E712
            false_false += 1
        if predicted == False and actual == True:  # noqa: E712
            false_true += 1

    # %-formatting keeps the report text identical for string labels and no
    # longer raises TypeError when light_type is left at its default of None.
    if validation:
        if true > 0:
            print("%s true_true (precision): %s"
                  % (light_type, float(true_true) / float(true)))
            print("%s false_true: %s"
                  % (light_type, float(false_true) / float(true)))
        if false > 0:
            print("%s true_false: %s"
                  % (light_type, float(true_false) / float(false)))
            print("%s false_false (precision): %s"
                  % (light_type, float(false_false) / float(false)))

    result = clf.score(testX, testY)
    print("Main precision for %s: %s" % (light_type, result))
    return result
#binning model matrix,binned matrix==stat stat, bin_edges, binnum = stats.binned_statistic(range(X.shape[1]), X, 'median', bins=int(bin_num)) #MODULO3 #APPLY THE MODEL AND PRINT THE RESULT if model == 'svm.LinearSVC()': clf=svm.LinearSVC(C=parameter) if model == 'RandomForestClassifier()': clf=RandomForestClassifier(n_estimators=parameters,n_jobs=-1) if model == 'LinearDiscriminantAnlysis()': clf=LinearDiscriminantAnalysis() out = clf.fit(stat,y) output = clf.fit_transform(stat, y) ''' # Plot SVM contour # Can't plot it because it is 160 dimensions h = .02 # step size in the mesh x_min, x_max = output[:, 0].min() - 1, output[:, 0].max() + 1 y_min, y_max = output[:, 1].min() - 1, output[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h)) arr_conc = np.c_[xx.ravel(), yy.ravel()] # concatenate two arrays together #print (arr_conc) #print (arr_conc.shape) Z = clf.predict(arr_conc) Z = Z.reshape(xx.shape) plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
# Apply the statistics learned on the training split. Refitting the scaler on
# the test split would scale the two sets differently and leak test-set
# statistics into the evaluation.
X_test = scale.transform(X_test)

# _________________________________________________
# Metrics
from sklearn.metrics import accuracy_score, log_loss, classification_report, confusion_matrix, roc_curve, auc
from pandas_ml import ConfusionMatrix  # I'm using 'pandas_ml' for better confusion matrix than 'scikit-learn'.

# ______________________________________________________________________________________________________________
# Machine Learning Classifiers
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
# ``fit_transform`` was removed from forest estimators; only the fitted model
# is used below.
model.fit(X_train, y_train)
y_artificial = model.predict(X_test)  # Predicted

# Evaluation
# Unpacking into four values assumes a binary problem (2x2 confusion matrix).
TN, FP, FN, TP = confusion_matrix(y_true=y_test, y_pred=y_artificial).ravel()
print('_' * 43)
# The report previously named GradientBoostingClassifier, which is not the
# model being evaluated.
print('Classifier : RandomForestClassifier')
print('Accuracy : {0:.3f} %'.format(
    accuracy_score(y_true=y_test, y_pred=y_artificial) * 100.0))
# print('_'*40)
print()
print('Confusion Matrix :')
CM = ConfusionMatrix(y_true=y_test, y_pred=y_artificial)
print(CM)
import numpy as np
from feature_set import get_all_feature_sets

# Train and score one 100-tree forest per feature set, then write the
# accuracies to a text file.
setts = get_all_feature_sets()
keys = []
scores = []
for key in setts:
    keys.append(key)
    print("Loading %sand associated labels..." % key)
    examples = setts[key]['train_X']
    labels = setts[key]['train_Y']
    test_examples = setts[key]['test_X']
    test_labels = setts[key]['test_Y']

    Forest = RandomForestClassifier(100)
    # ``fit_transform`` was removed from forest estimators; ``fit`` is enough.
    Forest.fit(examples, labels)
    # Score once and reuse the value for both the log line and the results.
    score = Forest.score(test_examples, test_labels)
    print(score)
    scores.append(score)

# Text mode: the file receives str data, which "wb" rejects on Python 3.
with open("../data/random_forest_all_feature_combinations.txt", "w") as f:
    for key, score in zip(keys, scores):
        # Accuracy is a float in [0, 1]; "%d" truncated every score to 0 or 1.
        f.write("%s, %f\n" % (key, score))
X = scale_data(X) print("Features Data scaled") # SGD = SGDClassifier(penalty='elasticnet',class_weight='auto',n_jobs=-1,n_iter=35,l1_ratio =0.2) svc = LinearSVC(class_weight='auto') model_rf = RandomForestClassifier(n_jobs=-1, bootstrap=True, n_estimators=180, min_samples_leaf=3, min_samples_split =3, criterion='gini',compute_importances=True, max_depth=6) SVC_RBF= SVC(kernel="rbf", class_weight="auto", cache_size=2600, shrinking=True) SVC_linear= SVC(kernel="poly", cache_size=2700, shrinking=True) # model_rf.fit(X,y) # X_SGD = model_rf.transform(X, threshold='1.5*mean') # forests! X_SGD = model_rf.fit_transform(X,y) print('X Reduced (by RF) features amount:') print(X_SGD.shape) def ReducedFeaturesDF(X,y): ''' Returns a dataframe with only a subset of features/columns retained ''' from sklearn.feature_selection import RFE est = LinearSVC( penalty='l1', loss='l2', dual=False, class_weight='auto') # selectK = SelectKBest(score_func = f_classif, k=45) selectRFE = RFE(estimator=est, n_features_to_select=22, step=0.15) selectK=selectRFE selectK.fit(X,y) selectK_mask=selectK.get_support()
class RandomForest(predictor.IBayesDBForeignPredictor): """A Random Forest foreign predictor. The `targets` must be a single categorical stattype. The `conditions` may be arbitrary numerical or categorical columns. """ @classmethod def create(cls, bdb, table, targets, conditions): cols = [c for c,_ in targets+conditions] df = bdbcontrib.table_to_df(bdb, table, cols) rf = cls() rf.train(df, targets, conditions) rf.prng = bdb.np_prng return rf @classmethod def serialize(cls, _bdb, pred): state = { 'targets': pred.targets, 'conditions_numerical': pred.conditions_numerical, 'conditions_categorical': pred.conditions_categorical, 'rf_full': pred.rf_full, 'rf_partial': pred.rf_partial, 'categories_to_val_map': pred.categories_to_val_map } return pickle.dumps(state) @classmethod def deserialize(cls, bdb, binary): state = pickle.loads(binary) rf = cls(targets=state['targets'], conditions_numerical=state['conditions_numerical'], conditions_categorical=state['conditions_categorical'], rf_full=state['rf_full'], rf_partial=state['rf_partial'], categories_to_val_map=state['categories_to_val_map']) rf.prng = bdb.np_prng return rf @classmethod def name(cls): return 'random_forest' def __init__(self, targets=None, conditions_numerical=None, conditions_categorical=None, rf_full=None, rf_partial=None, categories_to_val_map=None): self.targets = targets self.conditions_numerical = conditions_numerical self.conditions_categorical = conditions_categorical if (conditions_numerical is not None and conditions_categorical is not None): self.conditions = self.conditions_numerical + \ self.conditions_categorical self.rf_full = rf_full self.rf_partial = rf_partial self.categories_to_val_map = categories_to_val_map def train(self, df, targets, conditions): # Obtain the targets column. if len(targets) != 1: raise BLE(ValueError('RandomForest requires exactly one column in ' 'targets. 
Received {}'.format(targets))) if targets[0][1].lower() != 'categorical': raise BLE(ValueError('RandomForest can only classify CATEGORICAL ' 'columns. Received {}'.format(targets))) self.targets = [targets[0][0]] # Obtain the condition columns. if len(conditions) < 1: raise BLE(ValueError('RandomForest requires at least one column in ' 'conditions. Received {}'.format(conditions))) self.conditions_categorical = [] self.conditions_numerical = [] for c in conditions: if c[1].lower() == 'categorical': self.conditions_categorical.append(c[0]) else: self.conditions_numerical.append(c[0]) self.conditions = self.conditions_numerical + \ self.conditions_categorical # The dataset. self.dataset = pd.DataFrame() # Lookup for categoricals to code. self.categories_to_val_map = dict() # Training set (regressors and labels) self.X_numerical = np.ndarray(0) self.X_categorical = np.ndarray(0) self.Y = np.ndarray(0) # Random Forests. self.rf_partial = RandomForestClassifier(n_estimators=100) self.rf_full = RandomForestClassifier(n_estimators=100) # Preprocess the data. self.dataset = utils.extract_sklearn_dataset(self.conditions, self.targets, df) self.categories_to_val_map = utils.build_categorical_to_value_map( self.conditions_categorical, self.dataset) self.X_categorical = utils.extract_sklearn_features_categorical( self.conditions_categorical, self.categories_to_val_map, self.dataset) self.X_numerical = utils.extract_sklearn_features_numerical( self.conditions_numerical, self.dataset) self.Y = utils.extract_sklearn_univariate_target(self.targets, self.dataset) # Train the random forest. self._train_rf() def _train_rf(self): """Trains the random forests classifiers. We train two classifiers, `partial` which is just trained on `conditions_numerical`, and `full` which is trained on `conditions_numerical+conditions_categorical`. 
This safe-guard feature is critical for querying; otherwise sklearn would crash whenever a categorical value unseen in training due to filtering (but existant in df nevertheless) was passed in. """ # pylint: disable=no-member self.rf_partial.fit_transform(self.X_numerical, self.Y) self.rf_full.fit_transform( np.hstack((self.X_numerical, self.X_categorical)), self.Y) def _compute_targets_distribution(self, conditions): """Given conditions dict {feature_col:val}, returns the distribution and (class mapping for lookup) of the random label self.targets|conditions. """ if not set(self.conditions).issubset(set(conditions.keys())): raise BLE(ValueError( 'Must specify values for all the conditionals.\n' 'Received: {}\n' 'Expected: {}'.format(conditions, self.conditions_numerical + self.conditions_categorical))) # Are there any category values in conditions which never appeared during # training? If yes, we need to run the partial RF. unseen = any([conditions[cat] not in self.categories_to_val_map[cat] for cat in self.conditions_categorical]) X_numerical = [conditions[col] for col in self.conditions_numerical] if unseen: distribution = self.rf_partial.predict_proba(X_numerical) classes = self.rf_partial.classes_ else: X_categorical = [conditions[col] for col in self.conditions_categorical] X_categorical = utils.binarize_categorical_row( self.conditions_categorical, self.categories_to_val_map, X_categorical) distribution = self.rf_full.predict_proba( np.hstack((X_numerical, X_categorical))) classes = self.rf_partial.classes_ return distribution[0], classes def simulate(self, n_samples, conditions): distribution, classes = self._compute_targets_distribution(conditions) draws = self.prng.multinomial(1, distribution, size=n_samples) return [classes[np.where(d==1)[0][0]] for d in draws] def logpdf(self, value, conditions): distribution, classes = self._compute_targets_distribution(conditions) if value not in classes: return -float('inf') return 
np.log(distribution[np.where(classes==value)[0][0]])
#
# Import metrics for scoring
#==============================================================================
from sklearn.metrics import classification_report

# Reports for the naive Bayes predictions computed earlier in the script.
print(classification_report(test_y, bnb_preds))
print(classification_report(test_y, gnb_preds))
print(classification_report(test_y, mnb_preds))

#%% Random Forest
from sklearn.ensemble import RandomForestClassifier

# Initialize a random forest
rc = RandomForestClassifier(n_estimators=100,
                            # 'auto' meant sqrt(n_features) for classifiers
                            # and has since been removed; 'sqrt' is equivalent.
                            max_features='sqrt',
                            max_depth=None,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            verbose=1,
                            warm_start=False,
                            class_weight=None)

# Train the forest (``fit_transform`` was removed from forest estimators and
# its output was unused).
rc.fit(train_X, train_y)

# Score the forest
rc_preds = rc.predict(test_X)

# Assess
print(classification_report(test_y, rc_preds))
StartRow = 1
StartTest = 12000
EndTest = 13000

# Training rows are restricted to the 'PreK-12 Operating' category; the test
# rows are taken unfiltered. ``range`` works on both Python 2 and Python 3.
train_rows = [x for x in range(StartRow, StartTest)
              if operating[x] == 'PreK-12 Operating']
train_file = [fin[x] for x in train_rows]
train_targets = [targets[x] for x in train_rows]
test_file = [fin[x] for x in range(StartTest, EndTest)]

#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# ``charset_error`` was renamed to ``decode_error``.
vectorizer = CountVectorizer(min_df=1, decode_error='ignore')
X_train = vectorizer.fit_transform(train_file)
# ``toarray()`` yields a plain ndarray; ``todense()`` returns np.matrix, which
# current scikit-learn rejects.
X_train = X_train.toarray()

#from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
#clf = MultinomialNB()
#clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
# ``compute_importances`` was removed; feature importances are always available.
clf = RandomForestClassifier(n_estimators=500)
#MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
# ``fit_transform`` was removed from forest estimators; its output was unused.
clf.fit(X_train, train_targets)

testdata = vectorizer.transform(test_file)
# NOTE(review): output_predictions now receives an ndarray instead of an
# np.matrix -- confirm it does not rely on matrix row semantics.
testdata = testdata.toarray()
output_predictions('predictions.csv', testdata, clf)
from sklearn.metrics import classification_report

# Reports for the naive Bayes predictions computed earlier in the script.
print(classification_report(test_y, bnb_preds))
print(classification_report(test_y, gnb_preds))
print(classification_report(test_y, mnb_preds))

#%% Random Forest
from sklearn.ensemble import RandomForestClassifier

# Initialize a random forest
rc = RandomForestClassifier(n_estimators=100,
                            # 'auto' meant sqrt(n_features) for classifiers
                            # and has since been removed; 'sqrt' is equivalent.
                            max_features='sqrt',
                            max_depth=None,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            verbose=1,
                            warm_start=False,
                            class_weight=None)

# Train the forest (``fit_transform`` was removed from forest estimators and
# its output was unused).
rc.fit(train_X, train_y)

# Score the forest
rc_preds = rc.predict(test_X)

# Assess
print(classification_report(test_y, rc_preds))
test_columns = ['id', 'k1k2', 'locks_signal', 'emergency_signal', 'access_signal', 'THDV_M', 'THDI_M']

# Load the data
train = pd.read_csv("/home/zhoujifa/competition/baidu_elecrticity/data_train.csv", names=train_columns)
test = pd.read_csv("/home/zhoujifa/competition/baidu_elecrticity/data_test.csv", names=test_columns)

train = shuffle(train)

# Split the training data with train_test_split; the train/validation ratio
# here is 7:3 and can be adjusted as needed.
train_xy, val = train_test_split(train, test_size=0.3, random_state=1)

y = train_xy.label
X = train_xy.drop(['label', 'id', 'THDV_M', 'THDI_M'], axis=1)
val_y = val.label
val_X = val.drop(['label', 'id', 'THDV_M', "THDI_M"], axis=1)

classifier = RandomForestClassifier()
# ``fit_transform`` was removed from forest estimators; its output was unused.
classifier.fit(X, y)

predict_y = classifier.predict(val_X)
# Stored under its own name: assigning the result to ``accuracy_score``
# rebinds the imported metric function and breaks any later call to it.
val_accuracy = accuracy_score(val_y, predict_y)
# Mean accuracy on the training split (not a ranking score usable for AUC).
y_score = classifier.score(X, y)
# auc_score = roc_auc_score(val_y, y_score)
print(val_accuracy)

test = test.drop(['id', 'THDV_M', "THDI_M"], axis=1)
preds = classifier.predict(test)
# Submission file: 1-based row number and predicted label.
np.savetxt('/home/zhoujifa/competition/baidu_elecrticity/rf_submission.csv',
           np.c_[range(1, len(test) + 1), preds],
           delimiter=',', header='ImageId,Label', comments='', fmt='%d')

print(classifier.feature_importances_)
n_estimators=180, min_samples_leaf=3, min_samples_split=3, criterion='gini', compute_importances=True, max_depth=6) SVC_RBF = SVC(kernel="rbf", class_weight="auto", cache_size=2600, shrinking=True) SVC_linear = SVC(kernel="poly", cache_size=2700, shrinking=True) # model_rf.fit(X,y) # X_SGD = model_rf.transform(X, threshold='1.5*mean') # forests! X_SGD = model_rf.fit_transform(X, y) print('X Reduced (by RF) features amount:') print(X_SGD.shape) def ReducedFeaturesDF(X, y): ''' Returns a dataframe with only a subset of features/columns retained ''' from sklearn.feature_selection import RFE est = LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto') # selectK = SelectKBest(score_func = f_classif, k=45) selectRFE = RFE(estimator=est, n_features_to_select=22, step=0.15) selectK = selectRFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import pickle
import sys
import numpy as np


def _load_array(path):
    """Load a pickled object from ``path`` as a numpy array, closing the file."""
    # NOTE(review): pickle executes arbitrary code on load; these files must
    # be trusted, locally produced artifacts.
    with open(path, 'rb') as handle:
        return np.array(pickle.load(handle))


def _dump(obj, path):
    """Pickle ``obj`` to ``path``, closing the file."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


X1 = _load_array('X2g_train.p')
X2 = _load_array('X3g_train.p')
X3 = _load_array('X4g_train.p')
X4 = _load_array('Xhead_train.p')
X = np.hstack((X2, X1, X3, X4))
y = _load_array('y.p')

rf = RandomForestClassifier(n_estimators=200)
rf.fit(X, y)
# Forest estimators no longer provide transform()/fit_transform();
# SelectFromModel on the fitted forest performs the same importance-based
# selection (features at or above the mean importance are kept).
selector = SelectFromModel(rf, prefit=True)
Xr = selector.transform(X)
_dump(Xr, 'X33_train_reproduce.p')
print(Xr.shape)

# Free the training matrices before loading the test set.
del X, X1, X2, X3, X4, Xr

X1 = _load_array('X2g_test.p')
X2 = _load_array('X3g_test.p')
X3 = _load_array('X4g_test.p')
X4 = _load_array('Xhead_test.p')
X = np.hstack((X2, X1, X3, X4))
# Same selector, so train and test keep exactly the same columns.
Xr = selector.transform(X)
_dump(Xr, 'X33_test_reproduce.p')
print(Xr.shape)
# Pairwise distance matrix of the original points.
D = pairwise_distances(X)
D.shape

#visualize
plt.imshow(D, zorder=2, cmap='Blues', interpolation='nearest')
plt.colorbar()

#distance matrix for rotated and translated data
D2 = pairwise_distances(X2)
np.allclose(D, D2)
#This distance matrix gives us a representation of our data that is invariant to rotations and translations

#transform back into x and y coordinates not always intuitive - using only distance matrix...
from sklearn.manifold import MDS
# dissimilarity='precomputed' tells MDS that D already holds distances rather
# than raw coordinates.
model = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
out = model.fit_transform(D)
plt.scatter(out[:, 0], out[:, 1], **colorize)
plt.axis('equal')

#The usefulness of this becomes more apparent when we consider the fact that distance matrices can be computed from data in any dimension
def random_projection(X, dimension=3, rseed=42):
    """Project ``X`` into ``dimension`` dimensions using a seeded random basis.

    A random ``dimension x dimension`` matrix ``C`` is drawn, the
    eigenvectors ``V`` of the symmetric matrix ``C C^T`` are computed, and
    ``X`` is multiplied by the first ``X.shape[1]`` rows of ``V``.

    :param X: 2-D array of points, one per row
    :param dimension: output dimensionality; must be at least ``X.shape[1]``
    :param rseed: seed for the random number generator
    :return: array with ``X.shape[0]`` rows and ``dimension`` columns
    """
    # Only projections into an equal or higher dimension are supported.
    assert dimension >= X.shape[1]
    rng = np.random.RandomState(rseed)
    C = rng.randn(dimension, dimension)
    # C C^T is symmetric, so eigh applies; the eigenvalues ``e`` are unused.
    e, V = np.linalg.eigh(np.dot(C, C.T))
    return np.dot(X, V[:X.shape[1]])

X3 = random_projection(X, 3)
X3.shape
title_cluster = 'Chess KM Clusters' file_cluster = 'plots/chess_km_cluster.png' km = KMeans(n_clusters=clusters, random_state=0).fit(dr_chess_trgX) # Plot histogram of cluster purities plot.plot_tsne(tx_data, km.predict(dr_chess_trgX), title_cluster, file_cluster) plot.plot_tsne(tx_data, chess_trgY, "Chess KM Real Labels", "plots/chess_km_cluster_real_labels.png") if run_em: clusters = data_funcs.best_cluster_count("chess", "em") if best_dr: comp_count = data_funcs.best_comp_count("chess", "PCA") title_cluster = 'Chess EM Clusters with PCA DR' file_cluster = 'plots/chess_em_cluster_w_PCA_DR.png' dr = PCA(n_components=comp_count, random_state=0) dr_chess_trgX = dr.fit_transform(chess_trgX) else: dr_chess_trgX = chess_trgX title_cluster = 'Chess EM Clusters' file_cluster = 'plots/chess_em_cluster.png' em = GaussianMixture(n_components=clusters, random_state=0).fit(dr_chess_trgX) plot.plot_tsne(tx_data, em.predict(dr_chess_trgX), title_cluster, file_cluster) plot.plot_tsne(tx_data, chess_trgY, "Chess EM Real Labels", "plots/chess_em_cluster_real_labels.png") if run_fmnist: tx_data = TSNE(random_state=0).fit_transform(fmnist_trgX) if run_kmeans: clusters = data_funcs.best_cluster_count("fmnist", "km") if best_dr:
del labels

from sklearn.feature_selection import SelectFromModel

# Parameters for the tree ensembles
random_state = 5342
n_jobs = 8
verbose = 2
clf1 = ExtraTreesClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf2 = ExtraTreesClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf3 = RandomForestClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf4 = RandomForestClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)

# Start training
print('training started')
# The last column of ``train`` is the label; the rest are features.
# Tree ensembles no longer provide transform()/fit_transform(); SelectFromModel
# on the fitted estimator performs the same importance-based selection
# (features at or above the mean importance are kept).
clf1.fit(train[:, :-1], train[:, -1])
selector1 = SelectFromModel(clf1, prefit=True)
X_new1 = selector1.transform(train[:, :-1])
clf3.fit(train[:, :-1], train[:, -1])
selector3 = SelectFromModel(clf3, prefit=True)
X_new2 = selector3.transform(train[:, :-1])
# NOTE(review): any later clf1.transform(...) / clf3.transform(...) on the
# test set must use selector1 / selector3 instead.
# print('importances', clf1.feature_importances_)
# Second stage: retrain on the reduced feature sets.
clf2.fit(X_new1, train[:, -1])
clf4.fit(X_new2, train[:, -1])
print('training completed')
print('n_components = ', len(X_new1[0]), len(X_new2[0]))

# We don't need training set now
del train

# Dimensions for test set
ntest = 10873
nfeature = 16 ** 2 + 1  # For two_byte_codes, no_que_marks
test = np.zeros((ntest, nfeature), dtype=int)
Ids = []  # Required test set ids
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import pickle
import sys
import numpy as np


def _load_array(path):
    """Load a pickled object from ``path`` as a numpy array, closing the file."""
    # Pickles are binary data: open in 'rb' (text mode corrupts them on some
    # platforms and fails on Python 3).
    # NOTE(review): pickle executes arbitrary code on load; these files must
    # be trusted, locally produced artifacts.
    with open(path, 'rb') as handle:
        return np.array(pickle.load(handle))


def _dump(obj, path):
    """Pickle ``obj`` to ``path`` in binary mode, closing the file."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


X1 = _load_array('X2g_train.p')
X2 = _load_array('X3g_train.p')
X3 = _load_array('X4g_train.p')
X4 = _load_array('Xhead_train.p')
X = np.hstack((X2, X1, X3, X4))
y = _load_array('y.p')

rf = RandomForestClassifier(n_estimators=200)
rf.fit(X, y)
# Forest estimators no longer provide transform()/fit_transform();
# SelectFromModel on the fitted forest performs the same importance-based
# selection (features at or above the mean importance are kept).
selector = SelectFromModel(rf, prefit=True)
Xr = selector.transform(X)
_dump(Xr, 'X33_train_reproduce.p')
# Single-argument print() works on both Python 2 and Python 3.
print(Xr.shape)

# Free the training matrices before loading the test set.
del X, X1, X2, X3, X4, Xr

X1 = _load_array('X2g_test.p')
X2 = _load_array('X3g_test.p')
X3 = _load_array('X4g_test.p')
X4 = _load_array('Xhead_test.p')
X = np.hstack((X2, X1, X3, X4))
# Same selector, so train and test keep exactly the same columns.
Xr = selector.transform(X)
_dump(Xr, 'X33_test_reproduce.p')
print(Xr.shape)