Code example #1
def fit_rf(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
           inverse=False, lc_filter=None):
    """

    path: Dirección del dataset a ocupar para entrenar
    index_filter: Pandas index para filtrar las filas del dataset que se quieren utilizar
    class_filter: Lista de clases que se quiere utilizar
    feature_filter: Lista de features que se quiere utilizar

    """
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    
    results = []
    for train_index, test_index in skf:
        if inverse:
            train_index, test_index = test_index, train_index
            
        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=14,
                                     min_samples_split=5)

        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))

    return pd.concat(results)
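
None of these excerpts include their imports. Judging from the calls they make, a likely preamble is the following; utils and metrics are pyRF's own modules, so this is an educated guess rather than the project's actual header, and cross_validation is scikit-learn's pre-0.18 API:

import pickle

import pandas as pd
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

import metrics  # project-local module (npcastro/pyRF)
import utils    # project-local module (npcastro/pyRF)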
Code example #2
def fit_sktree(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
               inverse=False, max_depth=10, min_samples_split=20, lc_filter=None):

    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    
    results = []
    for train_index, test_index in skf:
        if inverse:
            train_index, test_index = test_index, train_index

        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth,
                                     min_samples_split=min_samples_split)

        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))

    return pd.concat(results)
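
The cross_validation module used throughout these examples was removed from scikit-learn in favor of model_selection. For readers on a current release, a minimal equivalent of the stratified split (toy data, not part of the project):

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.random.rand(20, 3)
y = np.array([0, 1] * 10)

# n_splits replaces n_folds, and the labels move to the split() call.
skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X, y):
    train_X, test_X = X[train_index], X[test_index]
    train_y, test_y = y[train_index], y[test_index]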
Code example #3
File: simple_rf.py  Project: npcastro/pyRF
    y = data['class']
    data = data.drop('class', axis=1)

    results = []
    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    for train_index, test_index in skf:
        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                     max_depth=max_depth, min_samples_split=min_samples_split,
                                     n_jobs=n_processes)

        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))

    result = pd.concat(results)

    with open(result_path + 'Arboles/Arbol.pkl', 'wb') as output:
        pickle.dump(clf, output)

    result.to_csv(result_path + 'Predicciones/result.csv')

    matrix = metrics.confusion_matrix(result)
    matrix.to_csv(result_path + 'Metricas/soft_matrix_.csv')

    clases = matrix.columns.tolist()
    f_score = [metrics.f_score(matrix, c) for c in clases]
Code example #4
def fit_means_rf(train_path, test_path, index_filter=None, class_filter=None, feature_filter=None, folds=10):
    """

    train_path: Dirección del dataset a ocupar para entrenar
    test_path: Dirección del dataset a ocupar para testear
    index_filter: Pandas index para filtrar las filas del dataset que se quieren utilizar
    class_filter: Lista de clases que se quiere utilizar
    feature_filter: Lista de features que se quiere utilizar

    """
    train_data = pd.read_csv(train_path, index_col=0)
    test_data = pd.read_csv(test_path, index_col=0)

    # Drop duplicated curves
    test_data = utils.remove_duplicate_index(test_data)
    train_data = utils.remove_duplicate_index(train_data)

    if index_filter:
        train_data = train_data.loc[index_filter]
        test_data = test_data.loc[index_filter]
    
    if class_filter:
        train_data = train_data[train_data['class'].isin(class_filter)]
        test_data = test_data[test_data['class'].isin(class_filter)]

    train_data = train_data.dropna(axis=0, how='any')
    test_data = test_data.dropna(axis=0, how='any')

    # Make sure both datasets cover the same objects
    common_index = list(set(test_data.index.tolist()) & set(train_data.index.tolist()))
    test_data = test_data.loc[common_index]
    train_data = train_data.loc[common_index]
    train_data = train_data.sort_index()
    test_data = test_data.sort_index()

    # Separate the features from the class labels
    train_y = train_data['class']
    train_X = train_data.drop('class', axis=1)

    test_y = test_data['class']
    test_X = test_data.drop('class', axis=1)

    if feature_filter:
        train_X = train_X[feature_filter]
        test_X = test_X[feature_filter]

    skf = cross_validation.StratifiedKFold(train_y, n_folds=folds)
    
    results = []
    for train_index, test_index in skf:
        fold_train_X = train_X.iloc[train_index]
        fold_train_y = train_y.iloc[train_index]

        fold_test_X = test_X.iloc[test_index]
        fold_test_y = test_y.iloc[test_index]

        clf = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=14,
                                     min_samples_split=5)

        clf.fit(fold_train_X, fold_train_y)
        results.append(metrics.predict_table(clf, fold_test_X, fold_test_y))

    return pd.concat(results)
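
A hypothetical call for illustration; the paths and class names below are placeholders, and both CSVs need a 'class' column plus overlapping indices, or the intersection step above keeps no rows:

result = fit_means_rf('sets/train_means.csv', 'sets/test_means.csv',
                      class_filter=['RRL', 'EB'], folds=10)
result.to_csv('results/means_rf.csv')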
Code example #5
File: rf_balanced.py  Project: npcastro/pyRF
    feature_filter = args.feature_filter

    train_data = pd.read_csv(training_set_path, index_col=0)
    train_X, train_y = utils.filter_data(train_data, feature_filter=feature_filter)

    test_data = pd.read_csv(test_set_path, index_col=0)
    test_X, test_y = utils.filter_data(test_data, feature_filter=feature_filter)


    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth, min_samples_split=min_samples_split,
                                 n_jobs=n_processes)

    clf.fit(train_X, train_y)
    result = metrics.predict_table(clf, test_X, test_y)

    result['indice'] = test_X.index.tolist()
    result = result.set_index('indice')
    result.index.name = catalog + '_id'

    with open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb') as output:
        pickle.dump(clf, output)

    result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')

    print metrics.weighted_f_score(metrics.confusion_matrix(result))
Code example #6
        train_X, train_y = data.iloc[train_index], y.iloc[train_index]

        clf = RandomForestClassifier(n_estimators=n_estimators,
                                     criterion=criterion,
                                     max_depth=max_depth,
                                     min_samples_split=min_samples_split,
                                     n_jobs=n_processes)
        clf.fit(train_X, train_y)

        aux = []
        for path in paths:
            test_data = pd.read_csv(path, index_col=0)
            test_data = test_data.loc[data.index].sort_index()
            test_data, test_y = utils.filter_data(test_data, feature_filter=feature_filter)
            test_X, test_y = test_data.iloc[train_index], test_y.iloc[train_index]

            aux.append(metrics.predict_table(clf, test_X, test_y))

        results.append(metrics.aggregate_predictions(aux))
        ids.extend(test_X.index.tolist())

    result = pd.concat(results)

    with open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb') as output:
        pickle.dump(clf, output)

    result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')
Code example #7
    for train_index, test_index in skf:
        if inverse:
            train_index, test_index = test_index, train_index

        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                     max_depth=max_depth, min_samples_split=min_samples_split,
                                     n_jobs=n_processes)

        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))
        ids.extend(test_X.index.tolist())

        if validation == 'holdout':
            aux = metrics.predict_table(clf, test_X, test_y)
            aux.to_csv(result_path + 'Predicciones/hold_' + str(count) + '.csv')
            print 'hold ' + str(count) + ' ' + str(metrics.weighted_f_score(metrics.confusion_matrix(aux)))
            count += 1

    result = pd.concat(results)
    result['indice'] = ids
    result = result.set_index('indice')
    result.index.name = catalog + '_id'

    output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
Code example #8
        test_index_filter = pd.read_csv(test_index_filter, index_col=0).index

    paths = [
        sets_path + catalog + '_sampled_' + str(i) + '.csv'
        for i in xrange(n_samples)
    ]

    resultados = []
    for p in paths:
        data = pd.read_csv(p, index_col=0)

        train_X, train_y = utils.filter_data(data,
                                             index_filter=train_index_filter,
                                             feature_filter=feature_filter)
        test_X, test_y = utils.filter_data(data,
                                           index_filter=test_index_filter,
                                           feature_filter=feature_filter)

        clf = DecisionTreeClassifier(criterion='entropy',
                                     max_depth=max_depth,
                                     min_samples_split=min_samples_split)

        clf.fit(train_X, train_y)
        resultados.append(metrics.predict_table(clf, test_X, test_y))

    result = metrics.aggregate_predictions(resultados)
    result.to_csv(result_path + 'result_' + percentage + '.csv')

    print metrics.weighted_f_score(metrics.confusion_matrix(result))
Code example #9
for train_index, test_index in skf:
    fold_train_X = train_X.iloc[train_index]
    fold_train_y = train_y.iloc[train_index]

    fold_test_X = test_X.iloc[test_index]
    fold_test_y = test_y.iloc[test_index]

    clf = RandomForestClassifier(n_estimators=100,
                                 criterion='entropy',
                                 max_depth=14,
                                 min_samples_split=5,
                                 n_jobs=-1)

    clf.fit(fold_train_X, fold_train_y)
    results.append(metrics.predict_table(clf, fold_test_X, fold_test_y))
    ids.extend(fold_test_X.index.tolist())

result = pd.concat(results)
result['indice'] = ids
result = result.set_index('indice')
result.index.name = catalog + '_id'

with open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb') as output:
    pickle.dump(clf, output)

result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')
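
The 'indice' bookkeeping that closes several of these examples simply re-keys the prediction table by object id. A standalone pandas sketch of the pattern, with toy data and a hypothetical catalog name:

import pandas as pd

# Toy prediction table and the ids collected across folds.
result = pd.DataFrame({'original': ['A', 'B'], 'predicted': ['A', 'A']})
ids = [101, 102]

result['indice'] = ids
result = result.set_index('indice')   # the assignment is the important part
result.index.name = 'macho_id'        # hypothetical catalog name
print(result)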
Code example #10
    train_data = pd.read_csv(training_set_path, index_col=0)
    train_X, train_y = utils.filter_data(train_data,
                                         feature_filter=feature_filter)

    test_data = pd.read_csv(test_set_path, index_col=0)
    test_X, test_y = utils.filter_data(test_data,
                                       feature_filter=feature_filter)

    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 criterion=criterion,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 n_jobs=n_processes)

    clf.fit(train_X, train_y)
    result = metrics.predict_table(clf, test_X, test_y)

    result['indice'] = test_X.index.tolist()
    result = result.set_index('indice')
    result.index.name = catalog + '_id'

    with open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb') as output:
        pickle.dump(clf, output)

    result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')

    print metrics.weighted_f_score(metrics.confusion_matrix(result))