Example #1
0
    # Train a single random forest on a fixed train/test split, then persist
    # the fitted model (pickle) and its prediction table (CSV).
    feature_filter = args.feature_filter

    train_data = pd.read_csv(training_set_path, index_col=0)
    train_X, train_y = utils.filter_data(train_data, feature_filter=feature_filter)

    test_data = pd.read_csv(test_set_path, index_col=0)
    test_X, test_y = utils.filter_data(test_data, feature_filter=feature_filter)

    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth, min_samples_split=min_samples_split,
                                 n_jobs=n_processes)

    clf.fit(train_X, train_y)
    result = metrics.predict_table(clf, test_X, test_y)

    # Re-index the prediction table by the test-set object ids.
    # BUGFIX: DataFrame.set_index returns a NEW frame; the original discarded
    # the return value, so the index was never actually replaced (the helper
    # column was just added and dropped again).
    result['indice'] = test_X.index.tolist()
    result = result.set_index('indice')
    result.index.name = catalog + '_id'

    # 'with' guarantees the pickle file is closed even if dump() raises.
    with open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+') as output:
        pickle.dump(clf, output)

    result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')

    print(metrics.weighted_f_score(metrics.confusion_matrix(result)))
Example #2
0
    # Stratified k-fold cross-validation: train one forest per fold,
    # concatenate the per-fold prediction tables, and write the confusion
    # matrix plus per-class F-scores to disk.
    results = []
    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    for train_index, test_index in skf:
        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                     max_depth=max_depth, min_samples_split=min_samples_split,
                                     n_jobs=n_processes)

        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))

    result = pd.concat(results)

    # NOTE: only the classifier from the LAST fold is persisted, as in the
    # original code. 'with' guarantees the handle is closed.
    with open(result_path + 'Arboles/Arbol.pkl', 'wb+') as output:
        pickle.dump(clf, output)

    result.to_csv(result_path + 'Predicciones/result.csv')

    matrix = metrics.confusion_matrix(result)
    matrix.to_csv(result_path + 'Metricas/soft_matrix_.csv')

    clases = matrix.columns.tolist()
    f_score = [metrics.f_score(matrix, c) for c in clases]

    # BUGFIX: the file must be opened for WRITING (the original used the
    # default mode 'r', so f.write raised), and the class list must be
    # stringified before concatenating '\n' (list + str is a TypeError).
    with open(result_path + 'Metricas/results.txt', 'w') as f:
        f.write(str(clases) + '\n')
        f.write(str(f_score) + '\n')
Example #3
0
        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = None
        clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                     max_depth=max_depth, min_samples_split=min_samples_split,
                                     n_jobs=n_processes)

        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))
        ids.extend(test_X.index.tolist())

        if validation == 'holdout':
            aux = metrics.predict_table(clf, test_X, test_y)
            aux.to_csv(result_path + 'Predicciones/hold_' + str(count) + '.csv')
            print 'hold ' + str(count) + ' ' + str(metrics.weighted_f_score(metrics.confusion_matrix(aux)))
            count += 1

    result = pd.concat(results)
    result['indice'] = ids
    result.set_index('indice')
    result.index.name = catalog + '_id'
    result = result.drop('indice', axis=1)

    output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
    pickle.dump(clf, output)
    output.close()

    result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')

    print metrics.weighted_f_score(metrics.confusion_matrix(result))