Example #1
# Assumed imports used by this example; "selected_features" is expected to be
# a module-level sequence of feature column indices defined elsewhere.
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score


def train_extra_Randomized_forest_classifer(training_data, n_est=10):
    # Extremely Randomized Trees ensemble with balanced class weights
    randomized = ExtraTreesClassifier(n_estimators=n_est,
                                      max_depth=None,
                                      min_samples_split=2,
                                      class_weight="balanced")
    # Note: classes_ is overwritten during fitting, and cross_val_score clones
    # the estimator, so this assignment has no effect on the scores below.
    randomized.classes_ = [0, 1]
    # 10-fold cross-validated ROC AUC on the selected feature columns;
    # the label is assumed to live in the last column of training_data.
    scores = cross_val_score(randomized,
                             training_data[:, selected_features].astype('float'),
                             training_data[:, -1].astype('float'),
                             cv=10,
                             scoring='roc_auc')
    print("Scores obtained using Extra Randomized Forests (# of estimators=" +
          str(n_est) + ")")
    print(scores)
    print(np.mean(scores))
    return randomized, np.mean(scores)
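
A minimal usage sketch for the function above. The synthetic data, the binary
labels, and the selected_features list here are illustrative assumptions, not
part of the original example; in the real project they come from the
surrounding module.

import numpy as np

# Hypothetical setup: 100 rows with 5 feature columns plus a binary label column.
selected_features = [0, 1, 2, 3, 4]
rng = np.random.RandomState(0)
features = rng.rand(100, 5)
labels = rng.randint(0, 2, size=(100, 1))
training_data = np.hstack([features, labels])

# Returns the (unfitted) estimator and the mean cross-validated ROC AUC.
model, mean_auc = train_extra_Randomized_forest_classifer(training_data,
                                                          n_est=25)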
Example #2
# Assumed imports: scikit-learn estimators plus the project's own helpers
# (load_word_embeddings, read_lines_from_file, and the three feature
# extractors), which are defined elsewhere in this project.
import numpy as np
from sklearn import svm
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              RandomForestClassifier)
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion, Pipeline


def run_on_feature_union():
    load_word_embeddings()
    (training_data, _) = read_lines_from_file('data/filtered_features.csv')
    training_data = np.array(training_data)

    # training_data = generate_normalized_data(training_data)

    # Candidate classifiers; only clf (the random forest) is plugged into the
    # pipeline below. classes_ is overwritten whenever an estimator is fitted,
    # so these assignments have no effect on the cross-validation results.
    clf = RandomForestClassifier(n_estimators=100, class_weight="balanced")
    clf.classes_ = [0, 1]
    adaboost = AdaBoostClassifier(n_estimators=100)
    adaboost.classes_ = [0, 1]
    svm_clf = svm.SVC(probability=True)
    svm_clf.classes_ = [0, 1]
    randomized = ExtraTreesClassifier(n_estimators=45,
                                      max_depth=None,
                                      min_samples_split=2,
                                      class_weight="balanced")
    randomized.classes_ = [0, 1]

    # Combine the numeric, bag-of-words and word2vec features, weighting each
    # extractor's output before it is stacked and passed to the classifier.
    pipeline = Pipeline([
        ('features',
         FeatureUnion(
             [('numeric_features', NumericFeaturesExtractor()),
              ('bag_of_words_features', BagOfWordsExtractor()),
              ('w2v_features', Word2VecExtractor())],
             transformer_weights={
                 'numeric_features': 0.5,
                 'bag_of_words_features': 0.9,
                 'w2v_features': 1.0,
             })),
        ('clf', clf),
    ])

    # cross_val_score refits clones of the pipeline for every fold, so this
    # explicit fit does not influence the reported scores.
    pipeline.fit(training_data, training_data[:, -1].astype('float'))
    scores = cross_val_score(pipeline,
                             training_data,
                             training_data[:, -1].astype('float'),
                             cv=10,
                             scoring='roc_auc')
    print(scores)
    print(np.mean(scores))
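
The three extractors used in the FeatureUnion above are project-specific and
not shown here. FeatureUnion only requires objects with fit/transform whose
transform returns a 2-D array aligned row-for-row with the input, and
transformer_weights simply scales each transformer's output before the blocks
are stacked side by side. A minimal sketch of that shape follows; the class
name and column layout are assumptions for illustration, not the project's
actual NumericFeaturesExtractor.

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class NumericFeaturesExtractorSketch(BaseEstimator, TransformerMixin):
    """Illustrative stand-in for a FeatureUnion-compatible extractor."""

    def __init__(self, numeric_columns=(0, 1, 2)):
        # Assumption: the numeric features live in these columns of the raw rows.
        self.numeric_columns = numeric_columns

    def fit(self, X, y=None):
        # Stateless extractor; nothing to learn from the data.
        return self

    def transform(self, X):
        # Return a float matrix with one row per input row.
        X = np.asarray(X)
        return X[:, list(self.numeric_columns)].astype('float')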