Example #1
def fit_montecarlo_tree(path_index, paths = None, index_filter=None, class_filter=None,
                        feature_filter=None, folds=10):
    """A diferencia de fit tree, este metodo recibe todos los paths. Entrena solo con uno, indicado
    por path index. Pero luego por orden, voy abriendo todos los sets para clasificar.
    """
    data = pd.read_csv(paths[path_index], index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    for train_index, test_index in skf:
        train_X = data.iloc[train_index]
        train_y = y.iloc[train_index]

        clf = None
        clf = tree.Tree('gain', max_depth=10, min_samples_split=20)

        clf.fit(train_X, train_y)

    # The tree trained on the last fold is used to classify every dataset
    for path in paths:
        data = pd.read_csv(path, index_col=0)
        data, y = utils.filter_data(data, index_filter, class_filter, feature_filter)
        results.append(clf.predict_table(data, y))

    return pd.concat(results)
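A hedged call sketch for fit_montecarlo_tree as defined above; the sampled-set paths and feature names are placeholders, following the '<catalog>_sampled_<i>.csv' naming pattern that appears in later examples.

# Placeholder paths and feature names
paths = ['MACHO_sampled_' + str(i) + '.csv' for i in range(10)]

# Train on the first sampled set, then classify every set with that tree
predictions = fit_montecarlo_tree(0, paths=paths,
                                  feature_filter=['Amplitude', 'Period'])
predictions.to_csv('montecarlo_result.csv')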
Example #2
    def __getitem__(self, index):
        'Generates one sample of data'
        if self.train:
            scan = get_3d_scan(self.list_of_ids[index])
            misc, fvc, percent, weeks, ranger = filter_data(
                self.data, self.list_of_ids[index])
            scan = torch.tensor(scan).unsqueeze(0)
            return (scan.float(), misc.float(), fvc.float(), percent.float(),
                    weeks.float(), ranger.int())

        else:
            try:
                scan = process_3d_scan(self.list_of_ids[index], False)
            except Exception:
                print("Error caught in scan creation. Returning a fallback scan")
                # np.load returns the array directly for .npy files, so no context manager is needed
                scan = np.load(
                    "../input/localosic/OSICPulmonFibrosis-master/data/scans/ID00421637202311550012437.npy"
                )

            misc, fvc, percent, weeks = filter_data(self.data,
                                                    self.list_of_ids[index],
                                                    train=False)
            scan = torch.tensor(scan).unsqueeze(0)
            return (scan.float(), misc.float(), fvc.float(), percent.float(),
                    weeks.float())
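A minimal sketch of how a Dataset with this __getitem__ might be consumed; the class name ScanDataset and its constructor arguments are assumptions, since only the method itself is shown above.

from torch.utils.data import DataLoader

# Hypothetical constructor: the real class name and arguments are not shown above
train_set = ScanDataset(data=train_df, list_of_ids=train_ids, train=True)
loader = DataLoader(train_set, batch_size=4, shuffle=True)

for scan, misc, fvc, percent, weeks, ranger in loader:
    # every returned tensor is batched along the first dimension
    print(scan.shape, fvc.shape)
    break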
Example #3
def fit_tree(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
             inverse=False, max_depth=10, min_samples_split=20, lc_filter=None):
    """

    path: Dirección del dataset a ocupar para entrenar
    index_filter: Pandas index para filtrar las filas del dataset que se quieren utilizar
    class_filter: Lista de clases que se quiere utilizar
    feature_filter: Lista de features que se quiere utilizar

    """
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    
    results = []
    for train_index, test_index in skf:
        if inverse:
            aux = train_index
            train_index = test_index
            test_index = aux

        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = None
        clf = tree.Tree('gain', max_depth=max_depth, min_samples_split=min_samples_split)

        clf.fit(train_X, train_y)
        results.append(clf.predict_table(test_X, test_y))

    return pd.concat(results)
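A hedged usage sketch for fit_tree; the dataset path, class names, and feature names are placeholders.

# Placeholder path, class names and feature names
results = fit_tree('macho_features.csv',
                   class_filter=['RRL', 'EB'],
                   feature_filter=['Amplitude', 'Period'],
                   folds=10, max_depth=10, min_samples_split=20)
# results concatenates one prediction table per fold
results.to_csv('fit_tree_result.csv')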
Example #4
def fit_sktree(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
             inverse=False, max_depth=10, min_samples_split=20, lc_filter=None):

    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    
    results = []
    for train_index, test_index in skf:
        if inverse:
            aux = train_index
            train_index = test_index
            test_index = aux

        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = None
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth,
                                     min_samples_split=min_samples_split)

        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))

    return pd.concat(results)
Example #5
    def __init__(self, code_path, ast_path, nl_path):
        # get lines
        codes = utils.load_dataset(code_path)
        asts = utils.load_dataset(ast_path)
        nls = utils.load_dataset(nl_path)

        if len(codes) != len(asts) or len(codes) != len(nls) or len(asts) != len(nls):
            raise Exception('The lengths of the three datasets do not match.')

        self.codes, self.asts, self.nls = utils.filter_data(codes, asts, nls)
Example #6
    def read(self, cell_hash, frame_name, data_format=None, nrow=None):
        """
        Tell the selected backend to read the file, and filter if required.
        """
        data = self.store.read(cell_hash, frame_name)
        if data_format == "application/json":
            data = convert_to_json(data)
        elif data_format == "application/octet-stream":
            data = convert_to_arrow(data)
        if nrow:
            data = filter_data(data, nrow)
        return data
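A hedged sketch of calling this read method; data_store stands in for whatever object exposes it, and the cell hash and frame name are placeholders.

# Hypothetical call: data_store is an instance of the class that defines read()
df = data_store.read("a1b2c3d4", "results_frame",
                     data_format="application/json",  # convert before filtering
                     nrow=100)                          # nrow triggers the filter_data call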
Example #7
def run(args):
    # Creating and opening the pipeline
    options = PipelineOptions()
    with beam.Pipeline(options=options) as p:
        filtered_data = utils.filter_data(p, args.input_file)

        _ = (filtered_data
             | 'Get all Items' >> beam.Map(lambda event: (event[1][4], 0))
             | 'Group by Item' >> beam.GroupByKey()
             | 'Count number of Items' >> beam.combiners.Count.Globally()
             | 'Write to output file' >> beam.io.WriteToText(
                 '%s/itemsCount.txt' % args.work_dir, shard_name_template=''))
Example #8
def train_tree(path, feature_filter=None, train_index=None):
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, feature_filter=feature_filter)
    
    train_X = data.iloc[train_index]
    train_y = y.iloc[train_index]

    clf = None
    clf = tree.Tree('gain', max_depth=10, min_samples_split=20)

    clf.fit(train_X, train_y)
    
    return clf
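A hedged example of calling train_tree; the path, feature names, and the positional train index are placeholders.

import numpy as np

# Placeholder inputs: train_index is positional because the function uses .iloc
train_positions = np.arange(500)
clf = train_tree('macho_features.csv',
                 feature_filter=['Amplitude', 'Period'],
                 train_index=train_positions)
# clf is a fitted tree.Tree ready to classify new feature tables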
Example #9
    max_depth = args.max_depth
    min_samples_split = args.min_samples_split
    result_path = args.result_path
    feature_filter = args.feature_filter



    data = pd.read_csv(training_set_path, index_col=0)
    
    paths = [test_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(100)]

    # I need to make sure the light curves are the same in train and test
    test_data = pd.read_csv(paths[0], index_col=0)
    data, test_data = utils.equalize_indexes(data, test_data)

    data, y = utils.filter_data(data, feature_filter=feature_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    results = []
    ids = []

    for train_index, test_index in skf:

        train_X, train_y  = data.iloc[train_index], y.iloc[train_index]

        clf = None
        clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                     max_depth=max_depth, min_samples_split=min_samples_split,
                                     n_jobs=n_processes)
        clf.fit(train_X, train_y)
Example #10
    parser.add_argument('--result_path',  required=True, type=str)
    parser.add_argument('--class_filter',  nargs='*', type=str)
    parser.add_argument('--feature_filter',  nargs='*', type=str)

    args = parser.parse_args(sys.argv[1:])

    percentage = args.percentage
    folds = args.folds
    training_set_path = args.training_set_path
    result_path = args.result_path
    class_filter = args.class_filter
    feature_filter = args.feature_filter


    data = pd.read_csv(training_set_path, index_col=0)
    data, y = utils.filter_data(data, index_filter=None, class_filter=class_filter,
                                feature_filter=feature_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    ids = []
    for train_index, test_index in skf:
        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = None
        clf = tree.Tree('gain', max_depth=10, min_samples_split=20)

        clf.fit(train_X, train_y)
        results.append(clf.predict_table(test_X, test_y))
        ids.extend(test_X.index.tolist())
Example #11
    n_processes = args.n_processes
    catalog = args.catalog

    train_path = args.train_path
    test_path = args.test_path
    result_path = args.result_path
    n_estimators = args.n_estimators
    criterion = args.criterion
    max_depth = args.max_depth
    min_samples_split = args.min_samples_split
    feature_filter = args.feature_filter

    train_data = pd.read_csv(train_path, index_col=0)
    train_index_filter = pd.read_csv('/n/seasfs03/IACS/TSC/ncastro/Resultados/MACHO/RF/Small/train.csv', index_col=0).index
    train_X, train_y = utils.filter_data(train_data, index_filter=train_index_filter, feature_filter=feature_filter)

    test_data = pd.read_csv(test_path, index_col=0)
    test_index_filter = pd.read_csv('/n/seasfs03/IACS/TSC/ncastro/Resultados/MACHO/RF/Small/test.csv', index_col=0).index
    test_X, test_y = utils.filter_data(test_data, index_filter=test_index_filter, feature_filter=feature_filter)

    results = []
    ids = []

    clf = None
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth, min_samples_split=min_samples_split,
                                 n_jobs=n_processes)

    clf.fit(train_X, train_y)
    results.append(metrics.predict_table(clf, test_X, test_y))
Example #12
    max_depth = args.max_depth
    min_samples_split = args.min_samples_split

    feature_filter = args.feature_filter
    index_filter = args.index_filter

    if index_filter is not None:
        index_filter = pd.read_csv(index_filter, index_col=0).index

    train_data = pd.read_csv(train_path, index_col=0)
    test_data = pd.read_csv(test_path, index_col=0)

    train_data, test_data = utils.equalize_indexes(train_data, test_data)

    train_X, train_y = utils.filter_data(train_data,
                                         index_filter=index_filter,
                                         feature_filter=feature_filter)
    test_X, test_y = utils.filter_data(test_data,
                                       index_filter=index_filter,
                                       feature_filter=feature_filter)

    # I only use the test data for the k-fold, since these are not repeated
    # and it is valid to use them by position alone
    skf = cross_validation.StratifiedKFold(test_y, n_folds=folds)
    results = []
    ids = []

    for train_index, test_index in skf:
        if inverse:
            aux = train_index
            train_index = test_index
Example #13
    n_processes = args.n_processes
    catalog = args.catalog

    training_set_path = args.training_set_path
    test_set_path = args.test_set_path
    result_path = args.result_path

    n_estimators = args.n_estimators
    criterion = args.criterion
    max_depth = args.max_depth
    min_samples_split = args.min_samples_split

    feature_filter = args.feature_filter

    train_data = pd.read_csv(training_set_path, index_col=0)
    train_X, train_y = utils.filter_data(train_data, feature_filter=feature_filter)

    test_data = pd.read_csv(test_set_path, index_col=0)
    test_X, test_y = utils.filter_data(test_data, feature_filter=feature_filter)


    clf = None
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth, min_samples_split=min_samples_split,
                                 n_jobs=n_processes)

    clf.fit(train_X, train_y)
    result = metrics.predict_table(clf, test_X, test_y)

    result['indice'] = test_X.index.tolist()
    result = result.set_index('indice')
Example #14
    min_samples_split = args.min_samples_split
    result_path = args.result_path
    feature_filter = args.feature_filter

    data = pd.read_csv(training_set_path, index_col=0)

    paths = [
        test_path + catalog + '_sampled_' + str(i) + '.csv'
        for i in xrange(100)
    ]

    # I need to make sure the light curves are the same in train and test
    test_data = pd.read_csv(paths[0], index_col=0)
    data, test_data = utils.equalize_indexes(data, test_data)

    data, y = utils.filter_data(data, feature_filter=feature_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    results = []
    ids = []

    for train_index, test_index in skf:

        train_X, train_y = data.iloc[train_index], y.iloc[train_index]

        clf = None
        clf = RandomForestClassifier(n_estimators=n_estimators,
                                     criterion=criterion,
                                     max_depth=max_depth,
                                     min_samples_split=min_samples_split,
                                     n_jobs=n_processes)
Example #15
    lc_filter = args.lc_filter
    index_filter = args.index_filter
    feature_filter = args.feature_filter

    data = pd.read_csv(training_set_path, index_col=0)

    if index_filter is not None:
        index_filter = pd.read_csv(index_filter, index_col=0).index

    elif lc_filter is not None:
        # Filter a percentage of the light curves and save them to compare later
        data = utils.stratified_filter(data, data['class'], lc_filter)
        data.to_csv(result_path + 'data.csv')

    data, y = utils.filter_data(data, feature_filter=feature_filter, index_filter=index_filter)

    if validation == 'kfold':
        skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    elif validation == 'holdout':
        skf = cross_validation.StratifiedShuffleSplit(y, n_iter=folds, test_size=test_size)

    results = []
    ids = []
    count = 1
    for train_index, test_index in skf:
        if inverse:
            aux = train_index
            train_index = test_index
            test_index = aux
Example #16
    percentage = args.percentage
    catalog = args.catalog
    n_processes = args.n_processes
    training_set_path = args.training_set_path
    result_path = args.result_path
    n_estimators = args.n_estimators
    criterion = args.criterion
    max_depth = args.max_depth
    min_samples_split = args.min_samples_split
    feature_filter = args.feature_filter

    folds = 10

    data = pd.read_csv(training_set_path, index_col=0)
    data, y = utils.filter_data(data, feature_filter=feature_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    ids = []

    for train_index, test_index in skf:
        # Invert the k-fold order (swap the roles of train and test)
        train_X, test_X = data.iloc[test_index], data.iloc[train_index]
        train_y, test_y = y.iloc[test_index], y.iloc[train_index]

        clf = None
        clf = RandomForestClassifier(n_estimators=n_estimators,
                                     criterion=criterion,
                                     max_depth=max_depth,
Example #17
        train_index_filter = pd.read_csv(train_index_filter, index_col=0).index

    if test_index_filter is not None:
        test_index_filter = pd.read_csv(test_index_filter, index_col=0).index

    paths = [
        sets_path + catalog + '_sampled_' + str(i) + '.csv'
        for i in xrange(n_samples)
    ]

    resultados = []
    for p in paths:
        data = pd.read_csv(p, index_col=0)

        train_X, train_y = utils.filter_data(data,
                                             index_filter=train_index_filter,
                                             feature_filter=feature_filter)
        test_X, test_y = utils.filter_data(data,
                                           index_filter=test_index_filter,
                                           feature_filter=feature_filter)

        clf = None
        clf = DecisionTreeClassifier(criterion='entropy',
                                     max_depth=max_depth,
                                     min_samples_split=min_samples_split)

        clf.fit(train_X, train_y)
        resultados.append(metrics.predict_table(clf, test_X, test_y))

    result = metrics.aggregate_predictions(resultados)
    result.to_csv(result_path + 'result_' + percentage + '.csv')
Example #18
def transform_csv(data_path=None,
                  train_path=None,
                  test_path=None,
                  train_output_path=None,
                  test_output_path=None,
                  header="infer",
                  train_frac=0.8,
                  implicit_threshold=0,
                  sep=",",
                  label_col=0,
                  cat_cols=None,
                  num_cols=None,
                  normalize=False,
                  num_neg=None,
                  ffm=True,
                  seed=2020):
    neg_sample = num_neg is not None and num_neg > 0
    cat_cols = (list(map(int, cat_cols.split(',')))
                if cat_cols is not None else list())
    num_cols = (list(map(int, num_cols.split(',')))
                if num_cols is not None else list())

    train_data, test_data = read_data(data_path, train_path, test_path, sep,
                                      header, label_col, train_frac, seed,
                                      implicit_threshold, neg_sample)

    if normalize and num_cols:
        train_data, test_data = normalize_data(train_data, test_data, num_cols)

    train_data, test_data = filter_data(train_data, test_data, cat_cols)
    cat_unique_vals, num_unique_vals = index_data(train_data, cat_cols,
                                                  num_cols)

    if not neg_sample:
        transformed_train_data = convert_normal(train_data, label_col,
                                                cat_cols, num_cols,
                                                cat_unique_vals,
                                                num_unique_vals, ffm)
        transformed_test_data = convert_normal(test_data, label_col, cat_cols,
                                               num_cols, cat_unique_vals,
                                               num_unique_vals, ffm)
    else:
        transformed_train_data = convert_neg(train_data,
                                             label_col,
                                             cat_cols,
                                             num_cols,
                                             cat_unique_vals,
                                             num_unique_vals,
                                             num_neg,
                                             ffm,
                                             train=True)
        transformed_test_data = convert_neg(test_data,
                                            label_col,
                                            cat_cols,
                                            num_cols,
                                            cat_unique_vals,
                                            num_unique_vals,
                                            num_neg,
                                            ffm,
                                            train=False)

    pd.Series(transformed_train_data).to_csv(train_output_path,
                                             index=False,
                                             header=False)
    pd.Series(transformed_test_data).to_csv(test_output_path,
                                            index=False,
                                            header=False)
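A hedged invocation sketch for transform_csv; the paths and column indices are placeholders, and it assumes read_data splits data_path by train_frac when no explicit train/test paths are given.

# Placeholder paths and column indices; num_neg > 0 switches on the negative-sampling branch
transform_csv(data_path='ratings.csv',
              train_output_path='train_ffm.txt',
              test_output_path='test_ffm.txt',
              train_frac=0.8,
              label_col=0,
              cat_cols='1,2,5',   # comma-separated categorical column indices
              num_cols='3,4',     # comma-separated numerical column indices
              normalize=True,
              num_neg=4,
              ffm=True)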
Example #19
    #     print 'Saving the trees'

    #     count = 0
    #     for clf in arboles:
    #         output = open(result_path + "Arboles/arbol_" + str(count) + '.pkl', 'wb+')
    #         pickle.dump(clf, output)
    #         output.close()
    #         count += 1

        print 'Consolidating results'

        # Save the classification votes for each dataset
        sample_set_result = []
        for path in paths:
            data = pd.read_csv(path, index_col=0)
            data, y = utils.filter_data(data, feature_filter=feature_filter)

            test_X = data.iloc[test_index]
            test_y = y.iloc[test_index]
            
            # Save each tree's classification for the current dataset
            aux = []
            for clf in arboles:
                result = clf.predict_table(test_X, test_y)
                aux.append(result)

            # Consolidate the trees' votes into a single frame
            consolidated_frame = reduce(lambda a, b: a+b, map(metrics.result_to_frame, aux))
            sample_set_result.append(consolidated_frame)
            print 'List length per sample set: ' + str(len(sample_set_result))
Example #20
import json
import pickle
import unicodedata
from sys import argv

from bs4 import BeautifulSoup
import requests

from utils import filter_data

if __name__ == '__main__':
    if len(argv) < 2:
        exit("Usage: 'python3 get_degree_requirments.py [degree]'")
    degrees = argv[1:]
    urls = json.load(open('./degree_requirement_urls.json'))
    filtered_data = {}
    for degree in degrees:
        if degree not in urls:
            exit(f'Invalid degree. Valid choices are {list(urls.keys())}')
        url = urls[degree]
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'html.parser')
        courses = list(
            map(lambda x: unicodedata.normalize('NFKD', x['title']),
                soup.select('a.code.bubblelink')))
        filtered_data.update(filter_data(courses, degree))

    output_path = f'../course_data/{degree}_courses.obj'
    with open(output_path, 'wb') as f:
        pickle.dump(filtered_data, f)

    print(f"result: {len(filtered_data)} nodes")
    print(f'wrote file to {output_path}')
Example #21
    if len(sys.argv) != 2:
        print('\033[91m' + '✘ Error: ' + '\033[0m' +
              'CSV file is missing, please pass its path as an argument')
        sys.exit()
    df = utils.get_valuable_dataframe(sys.argv[1])
    house = utils.get_house()
    row_list = [[
        'House', 'Feature1', 'Feature2', 'Theta1', 'Theta2', 'Theta3', 'Mean1',
        'Mean2', 'Std1', 'Std2', 'Accuracy'
    ]]
    for i in range(0, len(house)):
        if verb_print:
            print('\n\033[93m' + house[i] + '\033[0m')
        for feature_1 in range(1, len(df.columns)):
            for feature_2 in range(feature_1 + 1, len(df.columns)):
                x, y = utils.filter_data(df, house[i], feature_1, feature_2)
                x, mean, std = standardize(x)
                col, row = x.shape[0], x.shape[1]
                x = np.insert(x, 0, 1, axis=1)
                y = y.reshape(col, 1)
                theta = np.zeros((row + 1, 1))
                theta, history_err = get_theta(x, y, theta, learning_rate,
                                               iteration, verb_cost)
                ac = get_accuracy(x, y, theta)
                if verb_print:
                    print('\033[94m' + df.columns[feature_1] + '\033[0m',
                          end='')
                    print(' vs ', end='')
                    print('\033[96m' + df.columns[feature_2] + '\033[0m')

                    print('Accuracy: ', end='')
Example #22
# -*- coding: utf-8 -*-
"""Preprocessing of the Train dataset, display of content of one patient"""

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import get_data, filter_data

PREPROC = get_data()
ID_PATIENT = "ID00026637202179561894768"
OTHER, FVC, PERCENT, WEEKS = filter_data(PREPROC, ID_PATIENT)

print(FVC)
print(OTHER)
print(PERCENT)
print(WEEKS)