feature_filter = args.feature_filter

train_data = pd.read_csv(training_set_path, index_col=0)
train_X, train_y = utils.filter_data(train_data, feature_filter=feature_filter)

test_data = pd.read_csv(test_set_path, index_col=0)
test_X, test_y = utils.filter_data(test_data, feature_filter=feature_filter)

clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                             max_depth=max_depth,
                             min_samples_split=min_samples_split,
                             n_jobs=n_processes)
clf.fit(train_X, train_y)

result = metrics.predict_table(clf, test_X, test_y)

# set_index returns a new frame; the original discarded it and then dropped
# the helper column by hand, so assign the result back instead
result['indice'] = test_X.index.tolist()
result = result.set_index('indice')
result.index.name = catalog + '_id'

# Persist the trained forest
output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
pickle.dump(clf, output)
output.close()

result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')
print metrics.weighted_f_score(metrics.confusion_matrix(result))
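# A minimal sketch of the score printed above, assuming (not confirmed by
# this fragment) that metrics.weighted_f_score averages the per-class
# F-scores weighted by class support, with confusion-matrix rows as true
# classes and columns as predictions. weighted_f_score_sketch is a
# hypothetical helper, not the repo's implementation.
def weighted_f_score_sketch(matrix):
    total = float(matrix.values.sum())
    score = 0.0
    for c in matrix.columns:
        tp = float(matrix.loc[c, c])
        col_sum = float(matrix[c].sum())   # everything predicted as c
        row_sum = float(matrix.loc[c].sum())  # everything truly c
        precision = tp / col_sum if col_sum else 0.0
        recall = tp / row_sum if row_sum else 0.0
        f = (2 * precision * recall / (precision + recall)
             if precision + recall else 0.0)
        score += (row_sum / total) * f
    return score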
index_filter = args.index_filter
feature_filter = args.feature_filter

if index_filter is not None:
    index_filter = pd.read_csv(index_filter, index_col=0).index

paths = [sets_path + catalog + '_sampled_' + str(i) + '.csv'
         for i in xrange(n_samples)]

if model == 'tree':
    partial_fit = partial(parallel.fit_tree, feature_filter=feature_filter,
                          folds=folds, inverse=inverse, max_depth=max_depth,
                          min_samples_split=min_samples_split,
                          lc_filter=lc_filter)
elif model == 'rf':
    partial_fit = partial(parallel.fit_rf, feature_filter=feature_filter,
                          folds=folds, inverse=inverse, lc_filter=lc_filter)
elif model == 'sktree':
    partial_fit = partial(parallel.fit_sktree, feature_filter=feature_filter,
                          folds=folds, inverse=inverse, max_depth=max_depth,
                          min_samples_split=min_samples_split,
                          lc_filter=lc_filter)

pool = Pool(processes=n_processes, maxtasksperchild=2)
resultados = pool.map(partial_fit, paths)
pool.close()
pool.join()

result = metrics.aggregate_predictions(resultados)
result.to_csv(result_path)
print metrics.weighted_f_score(metrics.confusion_matrix(result))
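# The partial/Pool pattern above freezes the keyword arguments so that
# pool.map only has to supply the one varying argument, the path. A
# self-contained sketch of the same pattern with a placeholder worker
# (fit_one is hypothetical, standing in for parallel.fit_tree / fit_rf /
# fit_sktree):
from functools import partial
from multiprocessing import Pool

def fit_one(path, folds=10):
    return path, folds  # a real worker would train and return predictions

if __name__ == '__main__':
    fit = partial(fit_one, folds=5)
    pool = Pool(processes=2, maxtasksperchild=2)
    print pool.map(fit, ['a.csv', 'b.csv'])
    pool.close()
    pool.join()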
train_X, test_X = data.iloc[train_index], data.iloc[test_index]
train_y, test_y = y.iloc[train_index], y.iloc[test_index]

clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                             max_depth=max_depth,
                             min_samples_split=min_samples_split,
                             n_jobs=n_processes)
clf.fit(train_X, train_y)

results.append(metrics.predict_table(clf, test_X, test_y))
ids.extend(test_X.index.tolist())

if validation == 'holdout':
    # reuse the table just computed instead of predicting a second time
    aux = results[-1]
    aux.to_csv(result_path + 'Predicciones/hold_' + str(count) + '.csv')
    print 'hold ' + str(count) + ' ' + str(
        metrics.weighted_f_score(metrics.confusion_matrix(aux)))
    count += 1

# After all folds: assemble one table indexed by object id
result = pd.concat(results)
result['indice'] = ids
result = result.set_index('indice')  # assign back; set_index is not in place
result.index.name = catalog + '_id'

output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
pickle.dump(clf, output)
output.close()

result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')
print metrics.weighted_f_score(metrics.confusion_matrix(result))
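# The train_index/test_index pairs consumed above come from a
# cross-validation splitter that is not shown in this fragment. A sketch
# with scikit-learn's pre-0.18 API, consistent with the Python 2 style of
# these scripts (the choice of StratifiedKFold is an assumption):
from sklearn.cross_validation import StratifiedKFold

for train_index, test_index in StratifiedKFold(y, n_folds=folds):
    train_X, test_X = data.iloc[train_index], data.iloc[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[test_index]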
files = [f for f in os.listdir(result_dir) if '.csv' in f]

pattern = re.compile('[0-9]+')  # compile once, not on every iteration

for fname in files:  # renamed: the old loop variable f was shadowed below
    percentage = int(pattern.search(fname).group())

    result = pd.read_csv(result_dir + 'result_' + str(percentage) + '.csv',
                         index_col=0)

    if how == 'soft':
        matrix = metrics.confusion_matrix(result)
    elif how == 'hard':
        matrix = metrics.hard_matrix(result)

    matrix.to_csv(path + 'Metricas/' + how + '_matrix_' + str(percentage) +
                  '.csv')

    w_dict[percentage] = metrics.weighted_f_score(matrix)

    clases = matrix.columns.tolist()
    p = [metrics.precision(matrix, c) for c in clases]
    r = [metrics.recall(matrix, c) for c in clases]
    f = [metrics.f_score(matrix, c) for c in clases]

    p_dict[percentage] = p
    r_dict[percentage] = r
    f_dict[percentage] = f

save_dir = path + 'Metricas/'

w_df = pd.DataFrame.from_dict(w_dict, orient='index')
w_df.columns = ['f_score']
w_df = w_df.sort_index(ascending=True)
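# The per-class dictionaries can be written out the same way as w_df; a
# sketch, assuming clases holds the same class list for every percentage
# ('precision.csv' is an illustrative filename, not one from the repo):
p_df = pd.DataFrame.from_dict(p_dict, orient='index')
p_df.columns = clases
p_df.sort_index(ascending=True).to_csv(save_dir + 'precision.csv')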
                             max_depth=14, min_samples_split=20, n_jobs=2)
clf.fit(train_X, train_y)
results.append(metrics.predict_table(clf, test_X, test_y))

result = pd.concat(results)
matrix = metrics.confusion_matrix(result)

clases = matrix.columns.tolist()
precisions = [metrics.precision(matrix, c) for c in clases]
recalls = [metrics.recall(matrix, c) for c in clases]
f_scores = [metrics.f_score(matrix, c) for c in clases]
w_score = metrics.weighted_f_score(matrix)

# f = open(result_dir + str(max_depth) + ' ' + str(min_samples_split) + '.txt', 'w')
f = open(result_dir + str(p) + '.txt', 'w')
f.write('F_score by class\n')
f.write(str(f_scores))
f.write('\n\n')
f.write('Weighted average: ')
f.write(str(w_score))
f.close()
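# The commented-out filename above suggests these runs were also swept over
# a (max_depth, min_samples_split) grid; a sketch of such a sweep with
# itertools (the grid values are illustrative, not the ones actually used):
from itertools import product

for max_depth, min_samples_split in product([8, 10, 12, 14], [2, 10, 20]):
    print max_depth, min_samples_split  # train and score one forest here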
feature_filter = args.feature_filter

paths = [sets_path + catalog + '_sampled_' + str(i) + '.csv'
         for i in xrange(100)]
paths = paths[0:10]  # quick run: only the first 10 sampled sets

# Train and classify with trees
partial_fit = partial(parallel.fit_tree, feature_filter=feature_filter,
                      folds=folds)

pool = Pool(processes=n_processes, maxtasksperchild=2)
resultados_tree = pool.map(partial_fit, paths)
pool.close()
pool.join()

# Print and save the results
for i, r in enumerate(resultados_tree):
    r.to_csv(result_path + 'result_tree_' + str(i) + '.csv')
    matrix = metrics.hard_matrix(r)
    print 'Tree ' + str(i) + ' f_score: ' + str(metrics.weighted_f_score(matrix))

# Train and classify with random forests
partial_fit = partial(parallel.fit_rf, feature_filter=feature_filter,
                      folds=folds)

pool = Pool(processes=n_processes, maxtasksperchild=2)
resultados_rf = pool.map(partial_fit, paths)
pool.close()
pool.join()

# Print and save the results
for i, r in enumerate(resultados_rf):
    r.to_csv(result_path + 'result_rf_' + str(i) + '.csv')
    matrix = metrics.hard_matrix(r)
    print 'RF ' + str(i) + ' f_score: ' + str(metrics.weighted_f_score(matrix))
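# parallel.fit_tree / fit_rf themselves are not shown in this section.
# Pieced together from the other fragments here, each worker plausibly
# reads one sampled set, cross-validates a model, and returns a prediction
# table; the sketch below is hypothetical, not the repo's implementation:
from sklearn.cross_validation import StratifiedKFold

def fit_rf_sketch(path, feature_filter=None, folds=10):
    data = pd.read_csv(path, index_col=0)
    X, y = utils.filter_data(data, feature_filter=feature_filter)
    results = []
    for train_index, test_index in StratifiedKFold(y, n_folds=folds):
        clf = RandomForestClassifier(n_estimators=100)
        clf.fit(X.iloc[train_index], y.iloc[train_index])
        results.append(metrics.predict_table(clf, X.iloc[test_index],
                                             y.iloc[test_index]))
    return pd.concat(results)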
train_data = pd.read_csv(train_path, index_col=0)
train_index_filter = pd.read_csv(
    '/n/seasfs03/IACS/TSC/ncastro/Resultados/MACHO/RF/Small/train.csv',
    index_col=0).index
train_X, train_y = utils.filter_data(train_data,
                                     index_filter=train_index_filter,
                                     feature_filter=feature_filter)

test_data = pd.read_csv(test_path, index_col=0)
test_index_filter = pd.read_csv(
    '/n/seasfs03/IACS/TSC/ncastro/Resultados/MACHO/RF/Small/test.csv',
    index_col=0).index
test_X, test_y = utils.filter_data(test_data,
                                   index_filter=test_index_filter,
                                   feature_filter=feature_filter)

clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                             max_depth=max_depth,
                             min_samples_split=min_samples_split,
                             n_jobs=n_processes)
clf.fit(train_X, train_y)

# Single train/test split, so the fold lists of the CV variant are not
# needed here
result = metrics.predict_table(clf, test_X, test_y)
result['indice'] = test_X.index.tolist()
result = result.set_index('indice')  # assign back; set_index is not in place
result.index.name = None

result.to_csv(result_path)

m = metrics.confusion_matrix(result)
print metrics.weighted_f_score(m)
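# utils.filter_data is called throughout these scripts with the
# index_filter/feature_filter keywords; a hypothetical sketch of the
# behavior those call sites assume (the 'class' label column name is a
# guess, and filter_data_sketch is not the repo's implementation):
def filter_data_sketch(data, index_filter=None, feature_filter=None,
                       label_col='class'):
    if index_filter is not None:
        data = data.loc[data.index.intersection(index_filter)]
    y = data[label_col]
    X = data.drop(label_col, axis=1)
    if feature_filter is not None:
        X = X[feature_filter]
    return X, y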