def fit_montecarlo_tree(path_index, paths=None, index_filter=None, class_filter=None,
                        feature_filter=None, folds=10):
    """Unlike fit_tree, this method receives all the paths. It trains with only one
    of them, indicated by path_index, and then opens every set in order and
    classifies it with the trained tree.
    """
    data = pd.read_csv(paths[path_index], index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    for train_index, test_index in skf:
        train_X = data.iloc[train_index]
        train_y = y.iloc[train_index]

        clf = None
        clf = tree.Tree('gain', max_depth=10, min_samples_split=20)
        clf.fit(train_X, train_y)

        # Now classify every dataset with this tree, taking the test rows by
        # position as in the consolidation scripts
        for path in paths:
            data = pd.read_csv(path, index_col=0)
            data, y = utils.filter_data(data, index_filter, class_filter, feature_filter)
            test_X = data.iloc[test_index]
            test_y = y.iloc[test_index]
            results.append(clf.predict_table(test_X, test_y))

    return pd.concat(results)
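# A minimal usage sketch for fit_montecarlo_tree above; the sampled-set file names are
# illustrative placeholders, not taken from the original experiments.
sample_paths = ['MACHO_sampled_' + str(i) + '.csv' for i in xrange(10)]  # hypothetical files
montecarlo_result = fit_montecarlo_tree(0, paths=sample_paths, folds=10)
montecarlo_result.to_csv('montecarlo_result.csv')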
def __getitem__(self, index):
    'Generates one sample of data'
    if self.train:
        scan = get_3d_scan(self.list_of_ids[index])
        misc, fvc, percent, weeks, ranger = filter_data(self.data,
                                                        self.list_of_ids[index])
        scan = torch.tensor(scan).unsqueeze(0)
        return (scan.float(), misc.float(), fvc.float(), percent.float(),
                weeks.float(), ranger.int())
    else:
        try:
            scan = process_3d_scan(self.list_of_ids[index], False)
        except Exception:
            # Fall back to a known scan file if preprocessing fails
            print("Error caught in scan creation. Returning zeros")
            scan = np.load(
                "../input/localosic/OSICPulmonFibrosis-master/data/scans/ID00421637202311550012437.npy"
            )
        misc, fvc, percent, weeks = filter_data(self.data,
                                                self.list_of_ids[index],
                                                train=False)
        scan = torch.tensor(scan).unsqueeze(0)
        return (scan.float(), misc.float(), fvc.float(), percent.float(),
                weeks.float())
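# A minimal sketch of how the __getitem__ above is typically consumed, assuming it
# belongs to a torch.utils.data.Dataset subclass; the class name ScanDataset and its
# constructor arguments are hypothetical placeholders.
from torch.utils.data import DataLoader

train_set = ScanDataset(data, list_of_ids, train=True)  # hypothetical constructor
loader = DataLoader(train_set, batch_size=2, shuffle=True)
for scan, misc, fvc, percent, weeks, ranger in loader:
    # scans are batched as (batch, 1, depth, height, width) because of unsqueeze(0)
    print(scan.shape)
    break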
def fit_tree(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
             inverse=False, max_depth=10, min_samples_split=20, lc_filter=None):
    """
    path: path of the dataset used for training
    index_filter: pandas index used to filter the rows of the dataset to be used
    class_filter: list of classes to be used
    feature_filter: list of features to be used
    """
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    for train_index, test_index in skf:
        if inverse:
            train_index, test_index = test_index, train_index

        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = None
        clf = tree.Tree('gain', max_depth=max_depth, min_samples_split=min_samples_split)

        clf.fit(train_X, train_y)
        results.append(clf.predict_table(test_X, test_y))

    return pd.concat(results)
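# A hedged usage example for fit_tree; the dataset path and class names are illustrative
# only. It writes the cross-validated prediction table produced by predict_table to disk.
result = fit_tree('MACHO_features.csv', class_filter=['RR Lyrae', 'Quasar'],
                  folds=10, max_depth=10, min_samples_split=20)
result.to_csv('fit_tree_result.csv')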
def fit_sktree(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
               inverse=False, max_depth=10, min_samples_split=20, lc_filter=None):
    # Same cross-validation scheme as fit_tree, but with scikit-learn's DecisionTreeClassifier
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    for train_index, test_index in skf:
        if inverse:
            train_index, test_index = test_index, train_index

        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = None
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth,
                                     min_samples_split=min_samples_split)

        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))

    return pd.concat(results)
def __init__(self, code_path, ast_path, nl_path):
    # get lines
    codes = utils.load_dataset(code_path)
    asts = utils.load_dataset(ast_path)
    nls = utils.load_dataset(nl_path)

    if len(codes) != len(asts) or len(codes) != len(nls) or len(asts) != len(nls):
        raise Exception('The lengths of the three datasets do not match.')

    self.codes, self.asts, self.nls = utils.filter_data(codes, asts, nls)
def read(self, cell_hash, frame_name, data_format=None, nrow=None):
    """
    Tell the selected backend to read the file, and filter if required.
    """
    data = self.store.read(cell_hash, frame_name)
    if data_format == "application/json":
        data = convert_to_json(data)
    elif data_format == "application/octet-stream":
        data = convert_to_arrow(data)
    if nrow:
        data = filter_data(data, nrow)
    return data
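# A small illustration of calling the read method above on an instance of its class
# (named `reader` here); the cell hash and frame name are placeholder values, and nrow
# is passed to filter_data to limit the rows returned.
frame = reader.read("a1b2c3d4", "my_frame", data_format="application/json", nrow=100)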
def run(args):
    # Create and open the pipeline
    options = PipelineOptions()
    with beam.Pipeline(options=options) as p:
        filtered_data = utils.filter_data(p, args.input_file)

        _ = (filtered_data
             | 'Get all Items' >> beam.Map(lambda event: (event[1][4], 0))
             | 'Group by Item' >> beam.GroupByKey()
             | 'Count number of Items' >> beam.combiners.Count.Globally()
             | 'Write to output file' >> beam.io.WriteToText(
                 '%s/itemsCount.txt' % args.work_dir, shard_name_template=''))
def train_tree(path, feature_filter=None, train_index=None):
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, feature_filter=feature_filter)

    train_X = data.iloc[train_index]
    train_y = y.iloc[train_index]

    clf = None
    clf = tree.Tree('gain', max_depth=10, min_samples_split=20)

    clf.fit(train_X, train_y)
    return clf
max_depth = args.max_depth
min_samples_split = args.min_samples_split
result_path = args.result_path
feature_filter = args.feature_filter

data = pd.read_csv(training_set_path, index_col=0)

paths = [test_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(100)]

# I need to make sure the curves are the same in train and test
test_data = pd.read_csv(paths[0], index_col=0)
data, test_data = utils.equalize_indexes(data, test_data)

data, y = utils.filter_data(data, feature_filter=feature_filter)

skf = cross_validation.StratifiedKFold(y, n_folds=folds)

results = []
ids = []
for train_index, test_index in skf:
    train_X, train_y = data.iloc[train_index], y.iloc[train_index]

    clf = None
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth, min_samples_split=min_samples_split,
                                 n_jobs=n_processes)
    clf.fit(train_X, train_y)
parser.add_argument('--result_path', required=True, type=str)
parser.add_argument('--class_filter', nargs='*', type=str)
parser.add_argument('--feature_filter', nargs='*', type=str)

args = parser.parse_args(sys.argv[1:])

percentage = args.percentage
folds = args.folds
training_set_path = args.training_set_path
result_path = args.result_path
class_filter = args.class_filter
feature_filter = args.feature_filter

data = pd.read_csv(training_set_path, index_col=0)
data, y = utils.filter_data(data, index_filter=None, class_filter=class_filter,
                            feature_filter=feature_filter)

skf = cross_validation.StratifiedKFold(y, n_folds=folds)

results = []
ids = []
for train_index, test_index in skf:
    train_X, test_X = data.iloc[train_index], data.iloc[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[test_index]

    clf = None
    clf = tree.Tree('gain', max_depth=10, min_samples_split=20)

    clf.fit(train_X, train_y)
    results.append(clf.predict_table(test_X, test_y))
    ids.extend(test_X.index.tolist())
n_processes = args.n_processes
catalog = args.catalog
train_path = args.train_path
test_path = args.test_path
result_path = args.result_path
n_estimators = args.n_estimators
criterion = args.criterion
max_depth = args.max_depth
min_samples_split = args.min_samples_split
feature_filter = args.feature_filter

train_data = pd.read_csv(train_path, index_col=0)
train_index_filter = pd.read_csv('/n/seasfs03/IACS/TSC/ncastro/Resultados/MACHO/RF/Small/train.csv',
                                 index_col=0).index
train_X, train_y = utils.filter_data(train_data, index_filter=train_index_filter,
                                     feature_filter=feature_filter)

test_data = pd.read_csv(test_path, index_col=0)
test_index_filter = pd.read_csv('/n/seasfs03/IACS/TSC/ncastro/Resultados/MACHO/RF/Small/test.csv',
                                index_col=0).index
test_X, test_y = utils.filter_data(test_data, index_filter=test_index_filter,
                                   feature_filter=feature_filter)

results = []
ids = []

clf = None
clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                             max_depth=max_depth, min_samples_split=min_samples_split,
                             n_jobs=n_processes)

clf.fit(train_X, train_y)
results.append(metrics.predict_table(clf, test_X, test_y))
max_depth = args.max_depth
min_samples_split = args.min_samples_split
feature_filter = args.feature_filter

index_filter = args.index_filter
if index_filter is not None:
    index_filter = pd.read_csv(index_filter, index_col=0).index

train_data = pd.read_csv(train_path, index_col=0)
test_data = pd.read_csv(test_path, index_col=0)

train_data, test_data = utils.equalize_indexes(train_data, test_data)

train_X, train_y = utils.filter_data(train_data, index_filter=index_filter,
                                     feature_filter=feature_filter)
test_X, test_y = utils.filter_data(test_data, index_filter=index_filter,
                                   feature_filter=feature_filter)

# I use only the test data for the k-fold, because those rows are not repeated,
# so it is valid to take them by position only
skf = cross_validation.StratifiedKFold(test_y, n_folds=folds)

results = []
ids = []
for train_index, test_index in skf:
    if inverse:
        aux = train_index
        train_index = test_index
n_processes = args.n_processes
catalog = args.catalog
training_set_path = args.training_set_path
test_set_path = args.test_set_path
result_path = args.result_path
n_estimators = args.n_estimators
criterion = args.criterion
max_depth = args.max_depth
min_samples_split = args.min_samples_split
feature_filter = args.feature_filter

train_data = pd.read_csv(training_set_path, index_col=0)
train_X, train_y = utils.filter_data(train_data, feature_filter=feature_filter)

test_data = pd.read_csv(test_set_path, index_col=0)
test_X, test_y = utils.filter_data(test_data, feature_filter=feature_filter)

clf = None
clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                             max_depth=max_depth, min_samples_split=min_samples_split,
                             n_jobs=n_processes)

clf.fit(train_X, train_y)
result = metrics.predict_table(clf, test_X, test_y)

result['indice'] = test_X.index.tolist()
result = result.set_index('indice')  # set_index returns a new frame, so reassign it
min_samples_split = args.min_samples_split
result_path = args.result_path
feature_filter = args.feature_filter

data = pd.read_csv(training_set_path, index_col=0)

paths = [test_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(100)]

# I need to make sure the curves are the same in train and test
test_data = pd.read_csv(paths[0], index_col=0)
data, test_data = utils.equalize_indexes(data, test_data)

data, y = utils.filter_data(data, feature_filter=feature_filter)

skf = cross_validation.StratifiedKFold(y, n_folds=folds)

results = []
ids = []
for train_index, test_index in skf:
    train_X, train_y = data.iloc[train_index], y.iloc[train_index]

    clf = None
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth, min_samples_split=min_samples_split,
                                 n_jobs=n_processes)
lc_filter = args.lc_filter
index_filter = args.index_filter
feature_filter = args.feature_filter

data = pd.read_csv(training_set_path, index_col=0)

if index_filter is not None:
    index_filter = pd.read_csv(index_filter, index_col=0).index
elif lc_filter is not None:
    # Filter a percentage of the curves and save them to compare later
    data = utils.stratified_filter(data, data['class'], lc_filter)
    data.to_csv(result_path + 'data.csv')

data, y = utils.filter_data(data, feature_filter=feature_filter, index_filter=index_filter)

if validation == 'kfold':
    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
elif validation == 'holdout':
    skf = cross_validation.StratifiedShuffleSplit(y, n_iter=folds, test_size=test_size)

results = []
ids = []
count = 1
for train_index, test_index in skf:
    if inverse:
        aux = train_index
        train_index = test_index
        test_index = aux
percentage = args.percentage
catalog = args.catalog
n_processes = args.n_processes
training_set_path = args.training_set_path
result_path = args.result_path
n_estimators = args.n_estimators
criterion = args.criterion
max_depth = args.max_depth
min_samples_split = args.min_samples_split
feature_filter = args.feature_filter

folds = 10

data = pd.read_csv(training_set_path, index_col=0)
data, y = utils.filter_data(data, feature_filter=feature_filter)

skf = cross_validation.StratifiedKFold(y, n_folds=folds)

results = []
ids = []
for train_index, test_index in skf:
    # Invert the order of the k-fold
    train_X, test_X = data.iloc[test_index], data.iloc[train_index]
    train_y, test_y = y.iloc[test_index], y.iloc[train_index]

    clf = None
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth,
train_index_filter = pd.read_csv(train_index_filter, index_col=0).index

if test_index_filter is not None:
    test_index_filter = pd.read_csv(test_index_filter, index_col=0).index

paths = [sets_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(n_samples)]

resultados = []
for p in paths:
    data = pd.read_csv(p, index_col=0)

    train_X, train_y = utils.filter_data(data, index_filter=train_index_filter,
                                         feature_filter=feature_filter)
    test_X, test_y = utils.filter_data(data, index_filter=test_index_filter,
                                       feature_filter=feature_filter)

    clf = None
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth,
                                 min_samples_split=min_samples_split)

    clf.fit(train_X, train_y)
    resultados.append(metrics.predict_table(clf, test_X, test_y))

result = metrics.aggregate_predictions(resultados)
result.to_csv(result_path + 'result_' + percentage + '.csv')
def transform_csv(data_path=None, train_path=None, test_path=None, train_output_path=None,
                  test_output_path=None, header="infer", train_frac=0.8,
                  implicit_threshold=0, sep=",", label_col=0, cat_cols=None,
                  num_cols=None, normalize=False, num_neg=None, ffm=True, seed=2020):
    neg_sample = True if num_neg is not None and num_neg > 0 else False
    cat_cols = (list(map(int, cat_cols.split(','))) if cat_cols is not None else list())
    num_cols = (list(map(int, num_cols.split(','))) if num_cols is not None else list())

    train_data, test_data = read_data(data_path, train_path, test_path, sep, header,
                                      label_col, train_frac, seed, implicit_threshold,
                                      neg_sample)

    if normalize and num_cols:
        train_data, test_data = normalize_data(train_data, test_data, num_cols)

    train_data, test_data = filter_data(train_data, test_data, cat_cols)
    cat_unique_vals, num_unique_vals = index_data(train_data, cat_cols, num_cols)

    if not neg_sample:
        transformed_train_data = convert_normal(train_data, label_col, cat_cols, num_cols,
                                                cat_unique_vals, num_unique_vals, ffm)
        transformed_test_data = convert_normal(test_data, label_col, cat_cols, num_cols,
                                               cat_unique_vals, num_unique_vals, ffm)
    else:
        transformed_train_data = convert_neg(train_data, label_col, cat_cols, num_cols,
                                             cat_unique_vals, num_unique_vals, num_neg,
                                             ffm, train=True)
        transformed_test_data = convert_neg(test_data, label_col, cat_cols, num_cols,
                                            cat_unique_vals, num_unique_vals, num_neg,
                                            ffm, train=False)

    pd.Series(transformed_train_data).to_csv(train_output_path, index=False, header=False)
    pd.Series(transformed_test_data).to_csv(test_output_path, index=False, header=False)
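# A hedged usage sketch for transform_csv; the file names and column indices are
# illustrative only. A single CSV is split according to train_frac, columns 1-3 are
# treated as categorical, columns 4-5 as numeric (and normalized), and ffm-style
# text files are written.
transform_csv(data_path="ratings.csv",
              train_output_path="train.ffm",
              test_output_path="test.ffm",
              train_frac=0.8,
              label_col=0,
              cat_cols="1,2,3",
              num_cols="4,5",
              normalize=True,
              ffm=True)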
# print 'Saving trees'
# count = 0
# for clf in arboles:
#     output = open(result_path + "Arboles/arbol_" + str(count) + '.pkl', 'wb+')
#     pickle.dump(clf, output)
#     output.close()
#     count += 1

print 'Consolidating results'

# Store the classification votes for each dataset
sample_set_result = []
for path in paths:
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, feature_filter=feature_filter)

    test_X = data.iloc[test_index]
    test_y = y.iloc[test_index]

    # Store each tree's classification for the current dataset
    aux = []
    for clf in arboles:
        result = clf.predict_table(test_X, test_y)
        aux.append(result)

    # Consolidate the trees' votes into a single frame
    consolidated_frame = reduce(lambda a, b: a + b, map(metrics.result_to_frame, aux))
    sample_set_result.append(consolidated_frame)

print 'Length of the list for each sample: ' + str(len(sample_set_result))
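# A toy illustration (with made-up frames) of the vote-consolidation step above: if
# metrics.result_to_frame turns each tree's prediction table into a per-class vote
# frame, summing those frames with reduce yields vote counts per row and class.
import pandas as pd
votes_a = pd.DataFrame({'ClassA': [1, 0], 'ClassB': [0, 1]})
votes_b = pd.DataFrame({'ClassA': [1, 1], 'ClassB': [0, 0]})
total_votes = reduce(lambda a, b: a + b, [votes_a, votes_b])  # ClassA: [2, 1], ClassB: [0, 1]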
import json
import pickle
import unicodedata
from sys import argv, exit

from bs4 import BeautifulSoup
import requests

from utils import filter_data

if __name__ == '__main__':
    if len(argv) < 2:
        exit("Usage: 'python3 get_degree_requirments.py [degree]'")

    degrees = argv[1:]
    urls = json.load(open('./degree_requirement_urls.json'))
    filtered_data = {}

    for degree in degrees:
        if degree not in urls:
            exit(f'Invalid degree. Valid choices are {list(urls.keys())}')

        url = urls[degree]
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'html.parser')
        courses = list(
            map(lambda x: unicodedata.normalize('NFKD', x['title']),
                soup.select('a.code.bubblelink')))
        filtered_data.update(filter_data(courses, degree))

    output_path = f'../course_data/{degree}_courses.obj'
    with open(output_path, 'wb') as f:
        pickle.dump(filtered_data, f)

    print(f"result: {len(filtered_data)} nodes")
    print(f'wrote file to {output_path}')
if len(sys.argv) != 2:
    print('\033[91m' + '✘ Error: ' + '\033[0m' +
          'CSV file is missing, please add its path as an argument')
    sys.exit()

df = utils.get_valuable_dataframe(sys.argv[1])
house = utils.get_house()

row_list = [[
    'House', 'Feature1', 'Feature2', 'Theta1', 'Theta2', 'Theta3',
    'Mean1', 'Mean2', 'Std1', 'Std2', 'Accuracy'
]]

for i in range(0, len(house)):
    if verb_print:
        print('\n\033[93m' + house[i] + '\033[0m')
    for feature_1 in range(1, len(df.columns)):
        for feature_2 in range(feature_1 + 1, len(df.columns)):
            x, y = utils.filter_data(df, house[i], feature_1, feature_2)
            x, mean, std = standardize(x)
            col, row = x.shape[0], x.shape[1]
            x = np.insert(x, 0, 1, axis=1)
            y = y.reshape(col, 1)
            theta = np.zeros((row + 1, 1))
            theta, history_err = get_theta(x, y, theta, learning_rate, iteration, verb_cost)
            ac = get_accuracy(x, y, theta)
            if verb_print:
                print('\033[94m' + df.columns[feature_1] + '\033[0m', end='')
                print(' vs ', end='')
                print('\033[96m' + df.columns[feature_2] + '\033[0m')
                print('Accuracy: ', end='')
# -*- coding: utf-8 -*-
"""Preprocessing of the Train dataset, display of the content of one patient"""
import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import get_data, filter_data

PREPROC = get_data()
ID_PATIENT = "ID00026637202179561894768"
OTHER, FVC, PERCENT, WEEKS = filter_data(PREPROC, ID_PATIENT)

print(FVC)
print(OTHER)
print(PERCENT)
print(WEEKS)