Ejemplo n.º 1
0
    feature_filter = args.feature_filter

    train_data = pd.read_csv(training_set_path, index_col=0)
    train_X, train_y = utils.filter_data(train_data, feature_filter=feature_filter)

    test_data = pd.read_csv(test_set_path, index_col=0)
    test_X, test_y = utils.filter_data(test_data, feature_filter=feature_filter)


    clf = None
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth, min_samples_split=min_samples_split,
                                 n_jobs=n_processes)

    clf.fit(train_X, train_y)
    result = metrics.predict_table(clf, test_X, test_y)

    result['indice'] = test_X.index.tolist()
    result.set_index('indice')
    result.index.name = catalog + '_id'
    result = result.drop('indice', axis=1)

    output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
    pickle.dump(clf, output)
    output.close()

    result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')

    print metrics.weighted_f_score(metrics.confusion_matrix(result))
Ejemplo n.º 2
0
    index_filter = args.index_filter
    feature_filter = args.feature_filter

    if index_filter is not None:
        index_filter = pd.read_csv(index_filter, index_col=0).index

    paths = [sets_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(n_samples)]

    if model == 'tree':
        partial_fit = partial(parallel.fit_tree, feature_filter=feature_filter, folds=folds,
                              inverse=inverse, max_depth=max_depth,
                              min_samples_split=min_samples_split, lc_filter=lc_filter)
    elif model == 'rf':
        partial_fit = partial(parallel.fit_rf, feature_filter=feature_filter, folds=folds,
                              inverse=inverse, lc_filter=lc_filter)
    elif model == 'sktree':
        partial_fit = partial(parallel.fit_sktree, feature_filter=feature_filter, folds=folds,
                              inverse=inverse, max_depth=max_depth,
                              min_samples_split=min_samples_split, lc_filter=lc_filter)

    pool = Pool(processes=n_processes, maxtasksperchild=2)
    
    resultados = pool.map(partial_fit, paths)
    pool.close()
    pool.join()

    result = metrics.aggregate_predictions(resultados)
    result.to_csv(result_path)

    print metrics.weighted_f_score(metrics.confusion_matrix(result))
Ejemplo n.º 3
0
        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = None
        clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                     max_depth=max_depth, min_samples_split=min_samples_split,
                                     n_jobs=n_processes)

        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))
        ids.extend(test_X.index.tolist())

        if validation == 'holdout':
            aux = metrics.predict_table(clf, test_X, test_y)
            aux.to_csv(result_path + 'Predicciones/hold_' + str(count) + '.csv')
            print 'hold ' + str(count) + ' ' + str(metrics.weighted_f_score(metrics.confusion_matrix(aux)))
            count += 1

    result = pd.concat(results)
    result['indice'] = ids
    result.set_index('indice')
    result.index.name = catalog + '_id'
    result = result.drop('indice', axis=1)

    output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
    pickle.dump(clf, output)
    output.close()

    result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')

    print metrics.weighted_f_score(metrics.confusion_matrix(result))
Ejemplo n.º 4
0

# For each per-percentage result file, build the requested confusion matrix
# and accumulate precision/recall/f-score per class plus the weighted f-score.
# The regex is loop-invariant, so compile it once outside the loop.
pattern = re.compile('[0-9]+')
files = [f for f in os.listdir(result_dir) if '.csv' in f]
for f in files:
    # First number in the file name is the sampling percentage.
    percentage = int(pattern.search(f).group())

    result = pd.read_csv(result_dir + 'result_' + str(percentage) + '.csv', index_col=0)
    if how == 'soft':
        matrix = metrics.confusion_matrix(result)
    elif how == 'hard':
        matrix = metrics.hard_matrix(result)
    else:
        # BUG FIX: an unexpected 'how' used to fall through and crash later
        # with a NameError on 'matrix'; fail early with a clear message.
        raise ValueError('Unknown matrix type: ' + str(how))

    matrix.to_csv(path + 'Metricas/' + how + '_matrix_' + str(percentage) + '.csv')

    w_dict[percentage] = metrics.weighted_f_score(matrix)

    clases = matrix.columns.tolist()
    p_dict[percentage] = [metrics.precision(matrix, c) for c in clases]
    r_dict[percentage] = [metrics.recall(matrix, c) for c in clases]
    f_dict[percentage] = [metrics.f_score(matrix, c) for c in clases]

save_dir = path + 'Metricas/'

# Weighted f-score table, one row per percentage, sorted by percentage.
w_df = pd.DataFrame.from_dict(w_dict, orient='index')
w_df.columns = ['f_score']
w_df = w_df.sort_index(ascending=True)
Ejemplo n.º 5
0
                                         max_depth=14, min_samples_split=20,
                                         n_jobs=2)
            
            clf.fit(train_X, train_y)
            results.append(metrics.predict_table(clf, test_X, test_y))
            

        result = pd.concat(results)

        matrix = metrics.confusion_matrix(result)

        # Per-class metrics derived from the confusion matrix.
        clases = matrix.columns.tolist()
        precisions = [metrics.precision(matrix, c) for c in clases]
        recalls = [metrics.recall(matrix, c) for c in clases]
        f_scores = [metrics.f_score(matrix, c) for c in clases]

        w_score = metrics.weighted_f_score(matrix)

        # Write a small text report; 'with' guarantees the file is closed
        # even if a write raises.
        with open(result_dir + str(p) + '.txt', 'w') as f:
            f.write('F_score by class')
            f.write('\n')
            f.write(str(f_scores))
            f.write('\n')
            f.write('\n')
            f.write('Weighted average: ')
            f.write(str(w_score))
Ejemplo n.º 6
0
    # Train and classify with decision trees (one sampled set per path),
    # fanning the fits out over a process pool.
    partial_fit = partial(parallel.fit_tree,
                          feature_filter=feature_filter,
                          folds=folds)
    # maxtasksperchild=2 recycles worker processes to bound memory growth.
    pool = Pool(processes=n_processes, maxtasksperchild=2)
    resultados_tree = pool.map(partial_fit, paths)
    pool.close()
    pool.join()

    # Print and save the obtained results.
    for i, r in enumerate(resultados_tree):
        r.to_csv(result_path + 'result_tree_' + str(i) + '.csv')
        matrix = metrics.hard_matrix(r)
        print 'Tree ' + str(i) + ' f_score: ' + str(
            metrics.weighted_f_score(matrix))

    # Train and classify with a random forest, same fan-out pattern.
    partial_fit = partial(parallel.fit_rf,
                          feature_filter=feature_filter,
                          folds=folds)
    pool = Pool(processes=n_processes, maxtasksperchild=2)
    resultados_rf = pool.map(partial_fit, paths)
    pool.close()
    pool.join()

    # Print and save the obtained results.
    # NOTE(review): the final print statement is truncated in this source.
    for i, r in enumerate(resultados_rf):
        r.to_csv(result_path + 'result_rf_' + str(i) + '.csv')
        matrix = metrics.hard_matrix(r)
        print 'RF ' + str(i) + ' f_score: ' + str(
Ejemplo n.º 7
0
    feature_filter = args.feature_filter

    paths = [sets_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(100)]
    paths = paths[0:10]

    # Entreno y clasifico con árboles
    partial_fit = partial(parallel.fit_tree, feature_filter=feature_filter, folds=folds)
    pool = Pool(processes=n_processes, maxtasksperchild=2)
    resultados_tree = pool.map(partial_fit, paths)
    pool.close()
    pool.join()

    # Imprimo y guardo resultados obtenidos
    for i, r in enumerate(resultados_tree):
        r.to_csv(result_path + 'result_tree_' + str(i) + '.csv')
        matrix = metrics.hard_matrix(r)
        print 'Tree ' + str(i) + ' f_score: ' + str(metrics.weighted_f_score(matrix))

    # Entreno y clasifico con rf
    partial_fit = partial(parallel.fit_rf, feature_filter=feature_filter, folds=folds)
    pool = Pool(processes=n_processes, maxtasksperchild=2)
    resultados_rf = pool.map(partial_fit, paths)
    pool.close()
    pool.join()

    # Imprimo y guardo resultados obtenidos
    for i, r in enumerate(resultados_rf):
        r.to_csv(result_path + 'result_rf_' + str(i) + '.csv')
        matrix = metrics.hard_matrix(r)
        print 'RF ' + str(i) + ' f_score: ' + str(metrics.weighted_f_score(matrix))
Ejemplo n.º 8
0
        '/n/seasfs03/IACS/TSC/ncastro/Resultados/MACHO/RF/Small/test.csv',
        index_col=0).index
    test_X, test_y = utils.filter_data(test_data,
                                       index_filter=test_index_filter,
                                       feature_filter=feature_filter)

    results = []
    ids = []

    clf = None
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 criterion=criterion,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 n_jobs=n_processes)

    clf.fit(train_X, train_y)
    results.append(metrics.predict_table(clf, test_X, test_y))
    ids.extend(test_X.index.tolist())

    result = pd.concat(results)
    result['indice'] = ids
    result.set_index('indice')
    result.index.name = None
    result = result.drop('indice', axis=1)

    result.to_csv(result_path)

    m = metrics.confusion_matrix(result)
    print metrics.weighted_f_score(m)
Ejemplo n.º 9
0
# For each per-percentage result file, build the requested confusion matrix
# and accumulate precision/recall/f-score per class plus the weighted f-score.
# The regex is loop-invariant, so compile it once outside the loop.
pattern = re.compile('[0-9]+')
files = [f for f in os.listdir(result_dir) if '.csv' in f]
for f in files:
    # First number in the file name is the sampling percentage.
    percentage = int(pattern.search(f).group())

    result = pd.read_csv(result_dir + 'result_' + str(percentage) + '.csv',
                         index_col=0)
    if how == 'soft':
        matrix = metrics.confusion_matrix(result)
    elif how == 'hard':
        matrix = metrics.hard_matrix(result)
    else:
        # BUG FIX: an unexpected 'how' used to fall through and crash later
        # with a NameError on 'matrix'; fail early with a clear message.
        raise ValueError('Unknown matrix type: ' + str(how))

    matrix.to_csv(path + 'Metricas/' + how + '_matrix_' + str(percentage) +
                  '.csv')

    w_dict[percentage] = metrics.weighted_f_score(matrix)

    clases = matrix.columns.tolist()
    p_dict[percentage] = [metrics.precision(matrix, c) for c in clases]
    r_dict[percentage] = [metrics.recall(matrix, c) for c in clases]
    f_dict[percentage] = [metrics.f_score(matrix, c) for c in clases]

save_dir = path + 'Metricas/'

# Weighted f-score table, one row per percentage, sorted by percentage.
w_df = pd.DataFrame.from_dict(w_dict, orient='index')
w_df.columns = ['f_score']
w_df = w_df.sort_index(ascending=True)
Ejemplo n.º 10
0
    train_data = pd.read_csv(train_path, index_col=0)
    train_index_filter = pd.read_csv('/n/seasfs03/IACS/TSC/ncastro/Resultados/MACHO/RF/Small/train.csv', index_col=0).index
    train_X, train_y = utils.filter_data(train_data, index_filter=train_index_filter, feature_filter=feature_filter)

    test_data = pd.read_csv(test_path, index_col=0)
    test_index_filter = pd.read_csv('/n/seasfs03/IACS/TSC/ncastro/Resultados/MACHO/RF/Small/test.csv', index_col=0).index
    test_X, test_y = utils.filter_data(test_data, index_filter=test_index_filter, feature_filter=feature_filter)

    results = []
    ids = []

    clf = None
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth, min_samples_split=min_samples_split,
                                 n_jobs=n_processes)

    clf.fit(train_X, train_y)
    results.append(metrics.predict_table(clf, test_X, test_y))
    ids.extend(test_X.index.tolist())

    result = pd.concat(results)
    result['indice'] = ids
    result.set_index('indice')
    result.index.name = None
    result = result.drop('indice', axis=1)

    result.to_csv(result_path)

    m = metrics.confusion_matrix(result)
    print metrics.weighted_f_score(m)