def ReprodIndividualsFromRF(list_indiv, max_id, options):
    """Breed a new individual from a population of tree-based individuals.

    The classifiers of every individual in *list_indiv* are assembled into a
    temporary random forest, which is then collapsed into a single decision
    tree via ``eqtree_rec_rf``; that tree becomes the new individual.

    Parameters
    ----------
    list_indiv : iterable of individuals, each exposing a fitted ``.clf``
    max_id     : highest id currently in use; the child gets ``max_id + 1``
    options    : dict with keys 'max_depth', 'alpha', 'on_data', 'X', 'y'

    Returns
    -------
    A new ``genetic.individual`` wrapping the merged tree.
    """
    members = list(list_indiv)

    # Assemble the parents' trees into one forest; the fitted attributes are
    # stitched on manually because the forest itself is never fit().
    forest = RandomForestClassifier(n_estimators=len(members))
    forest.estimators_ = [member.clf for member in members]
    forest.n_classes_ = forest.estimators_[0].n_classes_
    forest.classes_ = forest.estimators_[0].classes_

    merged_tree = eqtree_rec_rf(forest, 0,
                                max_depth=options['max_depth'],
                                smallest_tree=False)
    return genetic.individual(merged_tree, max_id + 1,
                              type_rf=False,
                              alpha=options['alpha'],
                              evaluate_on_data=options['on_data'],
                              X=options['X'],
                              y=options['y'])
def deserialize_random_forest(model_dict):
    """Rebuild a ``RandomForestClassifier`` from its serialized dict form.

    The constructor parameters come from ``model_dict['params']``; fitted
    attributes and hyper-parameters are then copied across one by one.
    Out-of-bag attributes are restored only when present in the dict.
    """
    model = RandomForestClassifier(**model_dict['params'])

    model.estimators_ = np.array(
        [deserialize_decision_tree(tree_dict)
         for tree_dict in model_dict['estimators_']]
    )
    model.classes_ = np.array(model_dict['classes_'])

    # Plain scalar attributes copied straight from the dict.
    for attr in ('n_features_', 'n_outputs_', 'max_depth',
                 'min_samples_split', 'min_samples_leaf',
                 'min_weight_fraction_leaf', 'max_features',
                 'max_leaf_nodes', 'min_impurity_decrease',
                 'min_impurity_split'):
        setattr(model, attr, model_dict[attr])

    # OOB attributes exist only when the forest was trained with oob_score.
    if 'oob_score_' in model_dict:
        model.oob_score_ = model_dict['oob_score_']
    if 'oob_decision_function_' in model_dict:
        model.oob_decision_function_ = model_dict['oob_decision_function_']

    # Multi-output forests serialize n_classes_ as a list; single-output as a scalar.
    n_classes = model_dict['n_classes_']
    model.n_classes_ = np.array(n_classes) if isinstance(n_classes, list) else n_classes

    return model
def build_classifier(trees):
    """Wrap raw fitted tree objects in a ready-to-predict sklearn classifier.

    A single tree is wrapped in a ``DecisionTreeClassifier``; several trees
    become the estimators of a ``RandomForestClassifier``.  The fitted
    attributes are stitched on manually — no estimator is ever fit().
    """
    def wrap_tree(tree):
        # Attach the fitted attributes an sklearn tree classifier expects.
        estimator = DecisionTreeClassifier(random_state=0)
        estimator.n_features_ = tree.n_features
        estimator.n_outputs_ = tree.n_outputs
        estimator.n_classes_ = tree.n_classes[0]
        estimator.classes_ = np.array([label for label in range(estimator.n_classes_)])
        estimator.tree_ = tree
        return estimator

    if len(trees) <= 1:
        return wrap_tree(trees[0])

    forest = RandomForestClassifier(random_state=0, n_estimators=len(trees))
    forest.estimators_ = [wrap_tree(tree) for tree in trees]
    forest.n_features_ = trees[0].n_features
    forest.n_outputs_ = trees[0].n_outputs
    forest.n_classes_ = trees[0].n_classes[0]
    forest.classes_ = np.array([label for label in range(forest.n_classes_)])
    return forest
def train_random_forest_classifier(training_data, n_est=100, use_bag_of_words=False):
    """Fit a class-balanced random forest and report 5-fold ROC-AUC.

    Parameters
    ----------
    training_data    : 2-D array; last column is the label, second-to-last
                       the raw text used for bag-of-words features
    n_est            : number of trees in the forest
    use_bag_of_words : train on the (module-cached) bag-of-words matrix
                       instead of ``selected_features`` columns

    Returns
    -------
    (fitted forest, mean cross-validated ROC-AUC)
    """
    global Training_bag_of_words_features

    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=nltk.word_tokenize,
                                 preprocessor=None,
                                 stop_words=stopwords.words('english'),
                                 max_features=10000)
    forest = RandomForestClassifier(n_estimators=n_est, class_weight="balanced")
    forest.classes_ = [0, 1]

    labels = training_data[:, -1].astype('float')
    if use_bag_of_words:
        # Build the shared bag-of-words matrix lazily, once per process.
        if Training_bag_of_words_features == []:
            Training_bag_of_words_features = generate_bag_of_word_features(
                training_data[:, -2])
        features = Training_bag_of_words_features
    else:
        features = training_data[:, selected_features].astype('float')

    forest = forest.fit(features, labels)
    scores = cross_val_score(forest, features, labels, cv=5, scoring='roc_auc')

    print("Scores gotten using Random Forest (# of estimators=" + str(n_est) + ")")
    print(scores)
    print(np.mean(scores))
    return forest, np.mean(scores)
def run_on_feature_union():
    """Train a weighted feature-union pipeline and print 10-fold ROC-AUC.

    Loads word embeddings and the filtered feature file, builds a pipeline of
    numeric / bag-of-words / word2vec extractors feeding a balanced random
    forest, fits it, and prints the cross-validated scores.
    """
    load_word_embeddings()
    training_data, _ = read_lines_from_file('data/filtered_features.csv')
    training_data = np.array(training_data)
    #training_data = generate_normalized_data(training_data)

    clf = RandomForestClassifier(n_estimators=100, class_weight="balanced")
    clf.classes_ = [0, 1]
    adaboost = AdaBoostClassifier(n_estimators=100)
    adaboost.classes_ = [0, 1]
    svm_clf = svm.SVC(probability=True)
    svm_clf.classes_ = [0, 1]
    randomized = ExtraTreesClassifier(n_estimators=45, max_depth=None,
                                      min_samples_split=2,
                                      class_weight="balanced")
    randomized.classes_ = [0, 1]

    # Weighted union of the three feature extractors; w2v counts the most.
    feature_union = FeatureUnion(
        [('numeric_features', NumericFeaturesExtractor()),
         ('bag_of_words_features', BagOfWordsExtractor()),
         ('w2v_features', Word2VecExtractor())],
        transformer_weights={
            'numeric_features': 0.5,
            'bag_of_words_features': 0.9,
            'w2v_features': 1.0,
        })
    pipeline = Pipeline([('features', feature_union), ('clf', clf)])

    labels = training_data[:, -1].astype('float')
    pipeline.fit(training_data, labels)
    scores = cross_val_score(pipeline, training_data, labels,
                             cv=10, scoring='roc_auc')
    print(scores)
    print(np.mean(scores))
# Build the bag-of-words count matrix: one row per tweet, one column per
# vocabulary word (column index taken from the bagOfWords mapping).
# NOTE: Python 2 syntax (bare `print` statements).
index = 0
for tweet in tweets:
    words = tweet.split(' ')
    for word in words:
        tweetX[index][bagOfWords[word]] += 1
    index += 1

#75-25 Train-Test split results
X_train, X_test, y_train, y_test = train_test_split(tweetX, labels, test_size=0.25, random_state=42)
rf = RandomForestClassifier(n_estimators=50)
# classes_/n_classes_ set by hand here are overwritten by fit() below —
# NOTE(review): presumably a leftover; confirm against the original intent.
rf.classes_ = [0, 1, -1]
rf.n_classes_ = 3
rf.fit(X_train, y_train)
a = rf.predict(X_test)
print 'Train-Test Split Results'
print np.sum(a == y_test) * 100.0 / len(y_test)

#Cross Validation Results
'''
rf = RandomForestClassifier(n_estimators=50)
rf.classes_ = [0,1,-1]
rf.n_classes_ = 3
rf.fit(tweetX,labels)
def Random_Forest_Classifier(X_train, Y_train):
    """Fit and return a 100-tree random forest classifier.

    Parameters
    ----------
    X_train : training feature matrix
    Y_train : training labels

    Returns
    -------
    The fitted ``RandomForestClassifier``.
    """
    clf_rfc = RandomForestClassifier(n_estimators=100)
    # BUG FIX: the original set `clf_rfc.classes_ = 2`, which is wrong twice
    # over — classes_ must be an array of class labels, not a count, and
    # fit() overwrites it anyway (it derives classes_ from Y_train).  The
    # assignment has been removed; post-fit behavior is unchanged.
    clf_rfc = clf_rfc.fit(X_train, Y_train)
    return clf_rfc
feature_importance = []
# Five rounds of training on a re-balanced subsample of the training set.
for k in range(5):
    # AdaBoost over shallow, class-weighted stumps is the primary model.
    model = AdaBoostClassifier(n_estimators=100, learning_rate=1, base_estimator=DecisionTreeClassifier(max_depth=1, class_weight={True: 0.8, False: 0.2}))
    # model1 = RandomForestClassifier(n_estimators=100, max_depth=1, class_weight={True: 0.8, False: 0.2})
    # model2 = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=1, class_weight={True: 0.8, False: 0.2}),
    #                            n_estimators=100, max_samples=1.0, max_features=1.0)
    # model1 = LinearSVM(C=1, class_weight={True: 0.8, False: 0.2})
    # model1 = SGDClassifier(shuffle=True, loss="log", class_weight={True: 0.8, False: 0.2})
    model1 = SVC(kernel="rbf", probability=True, class_weight={True: 0.8, False: 0.2})
    # model2 = SVC(kernel="poly", probability=True, class_weight={True: 0.8, False: 0.2}, degree=2)
    # model1 = LinearDiscriminantAnalysis()
    # svm.classes_ = [True, False]
    # pre_model = BernoulliRBM(learning_rate=0.1, n_components=10, n_iter=20)
    # model3 = Pipeline(steps=[("rbm", pre_model), ("svm", svm)])
    # classes_ set by hand here is overwritten by fit() below.
    model.classes_ = [True, False]
    model1.classes_ = [True, False]
    # model2.classes_ = [True, False]
    # Indices of the positive examples, then an 80% random subsample of them.
    positive_train_labels = [i for i in range(len(train_1_labels)) if train_1_labels[i]]
    positive_train_labels = list(np.random.choice(positive_train_labels, size=int(len(positive_train_labels)*0.8), replace=False))
    # positive_test_labels = [i for i in range(len(test_1_labels)) if test_1_labels[i]]
    # Equal-sized "negative" draw from everything NOT in the sampled positives.
    # NOTE(review): because positive_train_labels was just overwritten with the
    # 80% subsample, the held-out 20% of positives are eligible to be drawn
    # here as "negatives" — confirm whether that is intended.
    negative_train_labels = list(np.random.choice([i for i in range(len(train_1_labels)) if i not in positive_train_labels], replace=False, size=len(positive_train_labels)))
    # negative_test_labels = list(np.random.choice([i for i in range(len(test_1_labels)) if i not in positive_test_labels], replace=False, size=len(positive_test_labels)))
    # test_data = [test_data[i] for i in positive_test_labels + negative_test_labels]
    train_data1 = [train_data[i] for i in positive_train_labels + negative_train_labels]
    # test_1_labels = [test_1_labels[i] for i in positive_test_labels + negative_test_labels]
    train_1_labels1 = [train_1_labels[i] for i in positive_train_labels + negative_train_labels]
    model.fit(train_data1, train_1_labels1)
# 原始森林 RF = RandomForestClassifier(n_estimators=rfSize) RF.fit(train_x, train_y) RF_path = model_path + '/RF.m' joblib.dump(RF, RF_path) # BRAF rf3 = RandomForestClassifier(n_estimators=rf2_size) rf3.fit(training_c_x, training_c_y) rf3_path = model_path + '/rf3.m' joblib.dump(rf3, rf3_path) RF1 = RandomForestClassifier(n_estimators=rfSize) Gobaltree = rf1.estimators_ + rf3.estimators_ RF1.estimators_ = Gobaltree RF1.classes_ = rf1.classes_ RF1.n_classes_ = rf1.n_classes_ RF1.n_outputs_ = rf1.n_outputs_ RF1_path = model_path + '/braf.m' joblib.dump(RF1, RF1_path) # DBRF RF2 = RandomForestClassifier(n_estimators=rfSize) mod_Gobaltree = rf1.estimators_ + rf2.estimators_ RF2.estimators_ = mod_Gobaltree RF2.classes_ = rf2.classes_ RF2.n_classes_ = rf2.n_classes_ RF2.n_outputs_ = rf2.n_outputs_ RF2_path = model_path + '/borderlindbscan.m' joblib.dump(RF2, RF2_path)
class_weight='balanced', criterion='entropy', max_depth=None, max_features='log2', max_leaf_nodes=None, min_impurity_split=1e-07, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=None, verbose=0, warm_start=False) clf.classes_ = classes skf = StratifiedKFold(n_splits=2) """ for train_index, test_index in skf.split(X, y): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] clf.fit(X_train, y_train) proba = clf.predict_proba(X_test) print(proba[:20]) print(clf.score(X_test, y_test)) print(log_loss(y_test, proba)) """ clf.fit(X, y)
def main():
    """Train and/or evaluate a RandomForest engagement model.

    Reads configuration from the module-level ``args``: sets up logging,
    loads the token document-frequency vocabulary, then either trains with
    warm-started incremental fitting (``args.mode == 'train_eval'``) or
    evaluates a pickled pretrained model (``args.mode == 'eval'``).
    """
    # Set the logger
    logging_dir = args.log_dir
    engage_mode = args.engage_mode
    try:
        os.makedirs(logging_dir)
    except OSError:
        # Directory already exists — proceed and log into it.
        print("loggiing directory exists")
    log_file = 'train_RandomForest_round3_' + str(engage_mode) + '.log'
    utils.set_logger(os.path.join(logging_dir, log_file))
    logging.info(
        'model = sklearn.ensemble.RandomForestClassifier, engagement mode: Retweet'
    )
    logging.info('Using tf_idf values of tweet tokens and other features')
    logging.info('Does not use tweet token embeddings and user ids')
    # BUG FIX: the original string had no '{}' placeholder, so
    # ''.format(args.train_file) silently dropped the file name from the log.
    logging.info('training set: {}'.format(args.train_file))

    batch_size = args.batch_size
    data_dir = args.data_dir
    vocab_file = args.vocab_file
    logging.info('batch size = {}'.format(batch_size))

    # Load the vocab file with document count frequency for each token.
    vocab_file_path = os.path.join(data_dir, vocab_file)
    vocab_handle = bz2.open(vocab_file_path, 'r')
    term_document_count = pickle.load(vocab_handle)

    #train_file = "train_first_1000.tsv"
    train_file = args.train_file
    trainfile_path = os.path.join(data_dir, train_file)
    val_file = args.eval_file
    valfile_path = os.path.join(data_dir, val_file)

    TF_IDF = tf_idf.tf_idf(term_document_count)
    print("engagement mode: {}, value: {}".format(
        engage_mode, engagement_dict[engage_mode]))
    rng = np.random.RandomState(2020)

    #train(model, data_gen, TF_IDF, engage_mode)
    if args.mode == 'train_eval':
        # warm_start + n_estimators=1 lets the forest grow incrementally
        # across batches inside train_eval.
        model = RandomForestClassifier(warm_start=True, n_estimators=1)
        model.classes_ = [0, 1]
        TR_train = tweetrecords.TweetRecords(trainfile_path, batch_size,
                                             engage_mode)
        data_gen_train = TR_train.tokens_target_gen()
        logging.info('Starting Training')
        train_eval(model, data_gen_train, valfile_path, TF_IDF,
                   engagement_dict[engage_mode], args, train=True)
        TR_train.close_file()
        logging.info('Finished Training')
    elif args.mode == 'eval':
        pretrained_file_path = os.path.join(args.saved_model_dir,
                                            args.pretrained_model)
        model = pickle.load(open(pretrained_file_path, 'rb'))
        eval(model, valfile_path, TF_IDF,
             engagement_dict[engage_mode], args)
        logging.info('Finished Evaluation')