Example #1
    index_filter = args.index_filter
    feature_filter = args.feature_filter

    if index_filter is not None:
        index_filter = pd.read_csv(index_filter, index_col=0).index

    paths = [sets_path + catalog + '_sampled_' + str(i) + '.csv' for i in range(n_samples)]

    if model == 'tree':
        partial_fit = partial(parallel.fit_tree, feature_filter=feature_filter, folds=folds,
                              inverse=inverse, max_depth=max_depth,
                              min_samples_split=min_samples_split, lc_filter=lc_filter)
    elif model == 'rf':
        partial_fit = partial(parallel.fit_rf, feature_filter=feature_filter, folds=folds,
                              inverse=inverse, lc_filter=lc_filter)
    elif model == 'sktree':
        partial_fit = partial(parallel.fit_sktree, feature_filter=feature_filter, folds=folds,
                              inverse=inverse, max_depth=max_depth,
                              min_samples_split=min_samples_split, lc_filter=lc_filter)

    pool = Pool(processes=n_processes, maxtasksperchild=2)
    
    resultados = pool.map(partial_fit, paths)
    pool.close()
    pool.join()

    result = metrics.aggregate_predictions(resultados)
    result.to_csv(result_path)

    print(metrics.weighted_f_score(metrics.confusion_matrix(result)))
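
The pattern in Example #1 freezes the model hyperparameters with functools.partial so that Pool.map can fan single-argument calls out over the sampled CSV paths. A minimal, self-contained sketch of that pattern, with a hypothetical fit_tree standing in for parallel.fit_tree:

from functools import partial
from multiprocessing import Pool

def fit_tree(path, max_depth=None, min_samples_split=2):
    # Hypothetical stand-in for parallel.fit_tree: load the sampled
    # catalog at `path` and return a fitted result for it.
    return (path, max_depth, min_samples_split)

if __name__ == '__main__':
    paths = ['catalog_sampled_' + str(i) + '.csv' for i in range(4)]
    # Freeze the hyperparameters so each mapped call takes only a path.
    fit = partial(fit_tree, max_depth=10, min_samples_split=5)
    with Pool(processes=2, maxtasksperchild=2) as pool:
        results = pool.map(fit, paths)
    print(results)

Because partial wraps a top-level function, the frozen callable stays picklable, which multiprocessing requires when shipping work to child processes.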
Example #2
        train_X, train_y = data.iloc[train_index], y.iloc[train_index]

        clf = None  # drop the previous fold's forest before fitting a new one
        clf = RandomForestClassifier(n_estimators=n_estimators,
                                     criterion=criterion,
                                     max_depth=max_depth,
                                     min_samples_split=min_samples_split,
                                     n_jobs=n_processes)
        clf.fit(train_X, train_y)

        aux = []
        for path in paths:
            test_data = pd.read_csv(path, index_col=0)
            test_data = test_data.loc[data.index].sort_index()
            test_data, test_y = utils.filter_data(test_data, feature_filter=feature_filter)
            test_X, test_y = test_data.iloc[train_index], test_y.iloc[train_index]

            aux.append(metrics.predict_table(clf, test_X, test_y))

        results.append(metrics.aggregate_predictions(aux))
        ids.extend(test_X.index.tolist())

    result = pd.concat(results)

    with open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb') as output:
        pickle.dump(clf, output)

    result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')
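
Note that Examples #2 and #3 pickle only the classifier from the last fold; the per-fold prediction tables are what get aggregated and written to CSV. A minimal sketch of the pickle round-trip used for the saved forest, on toy data and with a placeholder path:

import pickle
from sklearn.ensemble import RandomForestClassifier

# Toy data standing in for the training folds above.
X = [[0, 0], [1, 1], [0, 1], [1, 0]]
y = [0, 1, 0, 1]
clf = RandomForestClassifier(n_estimators=10).fit(X, y)

# Round-trip the fitted forest; 'Arbol.pkl' is a placeholder path.
with open('Arbol.pkl', 'wb') as output:
    pickle.dump(clf, output)
with open('Arbol.pkl', 'rb') as source:
    restored = pickle.load(source)

print(restored.predict([[1, 1]]))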
Example #3
    ids = []

    for train_index, test_index in skf:

        train_X, train_y = data.iloc[train_index], y.iloc[train_index]

        clf = None
        clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                     max_depth=max_depth, min_samples_split=min_samples_split,
                                     n_jobs=n_processes)
        clf.fit(train_X, train_y)

        aux = []
        for path in paths:
            test_data = pd.read_csv(path, index_col=0)
            test_data = test_data.loc[data.index].sort_index()
            test_data, test_y = utils.filter_data(test_data, feature_filter=feature_filter)
            test_X, test_y = test_data.iloc[train_index], test_y.iloc[train_index]

            aux.append(metrics.predict_table(clf, test_X, test_y))

        results.append(metrics.aggregate_predictions(aux))
        ids.extend(test_X.index.tolist())

    result = pd.concat(results)

    with open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb') as output:
        pickle.dump(clf, output)

    result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')
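
Example #3 iterates directly over skf, which worked when StratifiedKFold was constructed with the labels (pre-0.18 scikit-learn). Under the current API the splitter is built without y and iterated through .split(); a sketch, assuming a current scikit-learn install:

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(20).reshape(10, 2)
y = np.array([0, 1] * 5)

# Current API: build the splitter without the labels, then iterate
# over .split(X, y) to get the per-fold index arrays.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in skf.split(X, y):
    print(train_index, test_index)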