Python DatasetReader Exemples, metalazy.utils.dataset_reader.DatasetReader Python Exemples

Exemple #1

0

Afficher le fichier

def main():
    """Compare MetaLazyClassifier macro-F1 for 25 and 200 neighbours.

    Command-line arguments:
        -p: path to the directory with the libsvm files; handed to
            ``DatasetReader``.
        -l: directory that receives the ``logtimes{N}_{fold}.json`` files.
            Defaults to the directory that used to be hard-coded, so
            invocations without ``-l`` write to the same place as before.

    For every fold served by the reader, one classifier per neighbour count
    is fitted on the training split and scored on the test split. The
    macro-F1 scores are collected per neighbour count; at the end the raw
    scores, their mean per neighbour count and the elapsed wall-clock time
    in seconds are printed.
    """
    # Former hard-coded location, kept as the default for compatibility.
    default_log_dir = '/home/lfmendes/data/mestrado/metalazy/results/tempos2'

    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    parser.add_argument(
        '-l',
        default=default_log_dir,
        help='directory for the log time json files - Default: ' +
        default_log_dir)
    args = parser.parse_args()

    dataset_reader = DatasetReader(args.p)

    neighbour_counts = [25, 200]
    # Scores are keyed by the neighbour count rendered as a string.
    result = {str(count): [] for count in neighbour_counts}

    fold = 0
    start = time.time()

    while dataset_reader.has_next():
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        for n_neighbors in neighbour_counts:
            print('{} Neighbours'.format(n_neighbors))
            clf = MetaLazyClassifier(
                n_neighbors=n_neighbors,
                select_features=False,
                weight_function='inverse',
                log_time_file='{}/logtimes{}_{}.json'.format(
                    args.l, n_neighbors, fold))
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            result[str(n_neighbors)].append(
                f1_score(y_true=y_test, y_pred=y_pred, average='macro'))

            clf.flush_log_time_file()
        fold += 1

    print('\n\n ---------\n EXPERIMENT RESULT \n ---------')
    print(result)
    for count in neighbour_counts:
        key = str(count)
        print('{}: {}'.format(key, np.mean(np.array(result[key]))))

    end = time.time()
    print(end - start)

Exemple #2

0

Afficher le fichier

def main():
    """Grid-search an ExtraTreesClassifier on every fold and save the scores.

    Command-line arguments:
        -p: path to the directory with the libsvm files; handed to
            ``DatasetReader``.
        -o: output directory (required; created when it does not exist).
        -j: number of parallel jobs for the forest, -1 for all. Default -1.
        -g: still accepted so existing command lines keep working, but this
            experiment does not use it.

    For every fold a 3-fold ``GridSearchCV`` (scoring ``f1_macro``) picks
    the best forest configuration. The winner is passed to the module-level
    ``fit`` and ``predict`` helpers together with ``time_dic``
    (NOTE(review): presumably they record timings in it - confirm), and the
    macro/micro F1 on the test split is stored. After the last fold the
    collected rows are written to ``result.csv`` and ``times.csv`` inside
    the output directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with  libsvm files')
    # Without -o the script used to die with a TypeError in os.path.exists.
    parser.add_argument('-o', required=True,
                        help='path to the output directory')
    parser.add_argument(
        '-j',
        type=int,
        default=-1,
        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument(
        '-g',
        help='Size of the sample to the hyperparameter search - Default-5000')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    n_jobs = args.j

    dataset_reader = DatasetReader(args.p)

    # Only this grid was ever effective: the earlier MetaLazy grid was
    # overwritten before use. None (not the string 'None') is the valid way
    # to request unweighted classes.
    tuned_parameters = [{
        'criterion': ['gini', 'entropy'],
        'max_features': ['log2', 'sqrt'],
        'class_weight': ['balanced', None],
        'n_estimators': [100, 200]
    }]

    fold = 0
    result = []
    times = []

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))
        start_fold = time.time()

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # The forest now honours -j; the default of -1 matches the value
        # that used to be hard-coded.
        clf = ExtraTreesClassifier(n_jobs=n_jobs)

        print('GENERAL STARTING')
        start_grid = time.time()
        grid = GridSearchCV(clf,
                            tuned_parameters,
                            cv=3,
                            scoring='f1_macro',
                            n_jobs=1)
        grid.fit(X_train, y_train)
        end_grid = time.time()
        print('GENERAL - Total grid time: {}'.format((end_grid - start_grid)))
        print('GENERAL - Best score was {} with \n {}'.format(
            grid.best_score_, grid.best_estimator_))

        best_estimator = grid.best_estimator_

        # Fit the train data
        fit(best_estimator, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(best_estimator, X_test, time_dic)

        # An ExtraTreesClassifier has no ``weaker`` attribute, so reading it
        # unconditionally raised AttributeError; fall back to None.
        weaker = getattr(best_estimator, 'weaker', None)

        print(str(best_estimator))
        print(str(weaker))

        macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
        micro = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

        # Save the result
        result.append({
            'macro': macro,
            'micro': micro,
            'config': str(best_estimator),
            'best_clf': str(weaker),
        })

        print('Macro: {}'.format(macro))
        print('Micro: {}'.format(micro))
        times.append(time_dic)
        fold = fold + 1
        end_fold = time.time()
        print('Total fold time: {}'.format((end_fold - start_fold)))
        print('train size {}'.format(X_train.shape))
        print('test size {}'.format(X_test.shape))
        print()

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    result_dataframe = pd.DataFrame(data=result)
    print(result_dataframe.head(10))
    result_dataframe.to_csv(os.path.join(output_path, 'result.csv'),
                            index=False)

    times_dataframe = pd.DataFrame(data=times)
    print(times_dataframe.head(10))
    times_dataframe.to_csv(os.path.join(output_path, 'times.csv'),
                           index=False)

Exemple #3

0

Afficher le fichier

def main():
    """Run the 2x2x2 factorial MetaLazy experiment on every fold.

    Command-line arguments:
        -p: path to the directory with the libsvm files; handed to
            ``DatasetReader``.
        -o: output directory (required; created when it does not exist).
        -j: number of parallel jobs, -1 for all. Default -1.
        -g: size of the sample for the hyperparameter search. Default 5000.

    For each fold, every combination of the 0/1 switches
    ``specific_classifier``, ``weight`` and ``cooccurrence`` is turned into
    a parameter grid by ``choose_tunning_parameters``. A 3-fold
    ``GridSearchCV`` (scoring ``f1_macro``) picks the best estimator, which
    is then passed to the module-level ``fit`` and ``predict`` helpers.
    One result row is stored per combination. ``result_factorial.csv`` and
    ``times.csv`` are rewritten after every fold and once more at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with  libsvm files')
    # Without -o the script used to die with a TypeError in os.path.exists.
    parser.add_argument('-o', required=True, help='path to the output directory')
    parser.add_argument('-j', type=int, default=-1,
                        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument('-g', type=int, default=5000,
                        help='Size of the sample to the hyperparameter search - Default-5000')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    n_jobs = args.j
    grid_size = args.g

    dataset_reader = DatasetReader(args.p)

    fold = 0
    result = []
    times = []

    def dump_tables():
        """Print the head of both tables and write them to the output directory."""
        for rows, file_name in ((result, 'result_factorial.csv'), (times, 'times.csv')):
            frame = pd.DataFrame(data=rows)
            print(frame.head(10))
            frame.to_csv(os.path.join(output_path, file_name), index=False)

    configurations = {'specific_classifier': [0, 1],
                      'weight': [0, 1],
                      'cooccurrence': [0, 1]}

    start = time.time()
    while dataset_reader.has_next():
        # NOTE(review): one dict is shared by all eight configurations of a
        # fold; whether fit/predict overwrite earlier entries depends on
        # those helpers - confirm.
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # Create the classifier
        clf = MetaLazyClassifier(select_features=False,
                                 n_jobs=n_jobs,
                                 grid_size=grid_size)

        # for each fold we vary weight function, number of co occurrences and the choosing of the classifier
        for specific in configurations['specific_classifier']:
            for weight in configurations['weight']:
                for cooccurrence in configurations['cooccurrence']:
                    print(
                        'Running for specific {}, weight {} and cooccurrence {}'.format(specific, weight, cooccurrence))

                    # 'coccurrence' is the keyword used by the existing call.
                    tuned_parameters = choose_tunning_parameters(specific=specific, weight=weight,
                                                                 coccurrence=cooccurrence)

                    print('GENERAL STARTING')
                    start_grid = time.time()
                    grid = GridSearchCV(clf, tuned_parameters, cv=3, scoring='f1_macro', n_jobs=1)
                    grid.fit(X_train, y_train)
                    end_grid = time.time()
                    print('GENERAL - Total grid time: {}'.format((end_grid - start_grid)))
                    print('GENERAL - Best score was {} with \n {}'.format(grid.best_score_, grid.best_estimator_))

                    best_estimator = grid.best_estimator_

                    # Fit the train data
                    fit(best_estimator, X_train, y_train, time_dic)

                    # Predict
                    y_pred = predict(best_estimator, X_test, time_dic)

                    print(str(best_estimator))
                    print(str(best_estimator.weaker))

                    macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
                    micro = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

                    # Save the result together with the switches that produced it
                    result.append({
                        'macro': macro,
                        'micro': micro,
                        'config': str(best_estimator),
                        'best_clf': str(best_estimator.weaker),
                        'weight': weight,
                        'specific': specific,
                        'cooc': cooccurrence,
                    })

        # These are the scores of the last configuration of the fold only.
        print('Macro: {}'.format(macro))
        print('Micro: {}'.format(micro))
        times.append(time_dic)
        fold = fold + 1

        # Persist after every fold so partial results survive an interruption.
        dump_tables()

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    dump_tables()

Exemple #4

0

Afficher le fichier

Fichier : experiment.py Projet : lfomendes/metalazy

def main():
    """Evaluate one fixed MetaLazyClassifier configuration on every fold.

    Command-line arguments:
        -p: path to the directory with the libsvm files; handed to
            ``DatasetReader``.
        -o: output directory (required; created when it does not exist).
        -c: classifier name, passed as ``specific_classifier``.
        -k: number of neighbours (required integer).
        -w: weight function name, passed as ``weight_function``.
        -f: number of cooccurrence features. Default 10.
        -j: number of parallel jobs, -1 for all. Default -1.
        -g: size of the sample for the hyperparameter search. Default 5000.

    Each fold is fitted and predicted through the module-level ``fit`` and
    ``predict`` helpers. Macro and micro F1 are stored per fold, and after
    the last fold the rows are written to ``result.csv`` and ``times.csv``
    inside the output directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with  libsvm files')
    # -o and -k used to crash with a TypeError when omitted (os.path.exists
    # and int() were called on None); argparse now reports them as missing.
    parser.add_argument('-o', required=True, help='path to the output directory')
    parser.add_argument('-c', help='classifier')
    parser.add_argument('-k', required=True, type=int, help='number of neighbours')
    parser.add_argument('-w', help='weight function')
    parser.add_argument('-f', type=int, default=10, help='number of cooc features - default 10')
    parser.add_argument('-j', type=int, default=-1,
                        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument('-g', type=int, default=5000,
                        help='Size of the sample to the hyperparameter search - Default-5000')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    dataset_reader = DatasetReader(args.p)

    fold = 0
    result = []
    times = []

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # Create the classifier
        clf = MetaLazyClassifier(specific_classifier=args.c, n_neighbors=args.k, select_features=False,
                                 weight_function=args.w, n_jobs=args.j,
                                 grid_size=args.g, number_of_cooccurrences=args.f)

        # Fit the train data
        fit(clf, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(clf, X_test, time_dic)

        print(str(clf))
        print(str(clf.weaker))

        macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
        micro = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

        # Save the result
        result.append({
            'macro': macro,
            'micro': micro,
            'config': str(clf),
            'best_clf': str(clf.weaker),
        })

        print('Macro: {}'.format(macro))
        print('Micro: {}'.format(micro))
        times.append(time_dic)
        fold = fold + 1

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    result_dataframe = pd.DataFrame(data=result)
    print(result_dataframe.head(10))
    result_dataframe.to_csv(os.path.join(output_path, 'result.csv'), index=False)

    times_dataframe = pd.DataFrame(data=times)
    print(times_dataframe.head(10))
    times_dataframe.to_csv(os.path.join(output_path, 'times.csv'), index=False)

Exemple #5

0

Afficher le fichier

def main():
    """Tune (or load a preset) MetaLazyClassifier per fold and record scores and times.

    Command-line arguments:
        -p: path to the directory with the libsvm files; handed to
            ``DatasetReader``.
        -o: output directory (required; created when it does not exist).
        -j: number of parallel jobs, -1 for all. Default -1.
        -g: size of the sample for the hyperparameter search. Default 5000.
        -d: dataset name; when given, the estimator comes from
            ``get_best_version_for_each_dataset`` and no grid search runs.
        -t: keep only the first ``t`` test instances of each fold.

    Without ``-d`` a 3-fold ``GridSearchCV`` (scoring ``f1_macro``) over the
    grid returned by ``choose_tunning_parameters(1, 1, 1)`` selects the
    estimator, and the search time is stored under ``time_dic['grid']``.
    The chosen estimator is then passed to the module-level ``fit`` and
    ``predict`` helpers. ``result_tunning_time.csv`` and ``times.csv`` are
    rewritten after every fold and once more at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with  libsvm files')
    # Without -o the script used to die with a TypeError in os.path.exists.
    parser.add_argument('-o', required=True, help='path to the output directory')
    parser.add_argument('-j', type=int, default=-1,
                        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument('-g', type=int, default=5000,
                        help='Size of the sample to the hyperparameter search - Default-5000')
    parser.add_argument('-d',
                        help='Use the dataset default parameters.Dont use this parameter if you want to do grid search')
    parser.add_argument('-t', type=int,
                        help='Limit test size, for each fold only use this number of instances')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    n_jobs = args.j
    grid_size = args.g
    # An empty -d value behaves like an absent one, as before.
    dataset = args.d if args.d else None
    test_size_limit = args.t

    dataset_reader = DatasetReader(args.p)

    fold = 0
    result = []
    times = []

    def dump_tables():
        """Print the head of both tables and write them to the output directory."""
        for rows, file_name in ((result, 'result_tunning_time.csv'), (times, 'times.csv')):
            frame = pd.DataFrame(data=rows)
            print(frame.head(10))
            frame.to_csv(os.path.join(output_path, file_name), index=False)

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # A limit of 0 (or no limit) keeps the full test split.
        if test_size_limit:
            X_test = X_test[0:test_size_limit]
            y_test = y_test[0:test_size_limit]

        if dataset is None:
            # Create the classifier
            estimator = MetaLazyClassifier(select_features=False,
                                           n_jobs=n_jobs,
                                           grid_size=grid_size)

            tuned_parameters = choose_tunning_parameters(specific=1, weight=1, coccurrence=1)

            print(tuned_parameters)

            # first we find the best configuration in general
            print('GRID SEARCH FOR FOLD {}'.format(fold))
            start_grid = time.time()
            grid = GridSearchCV(estimator, tuned_parameters, cv=3, scoring='f1_macro', n_jobs=1)
            grid.fit(X_train, y_train)
            grid_seconds = time.time() - start_grid
            time_dic['grid'] = grid_seconds
            print('GENERAL - Total grid time: {}'.format(grid_seconds))
            print('GENERAL - Best score was {} with \n {}'.format(grid.best_score_, grid.best_estimator_))

            estimator = grid.best_estimator_
            print('GENERAL - Best param was {}\n'.format(grid.best_params_))
        else:
            print('Using default dataset parameters')
            estimator = get_best_version_for_each_dataset(dataset=dataset, n_jobs=n_jobs, grid_size=grid_size)
        print(estimator)

        estimator.log_time_file = output_path + '/log_times_{}.json'.format(fold)

        # Fit the train data
        fit(estimator, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(estimator, X_test, time_dic)

        print('\nWeaker Classifier used:')
        print(str(estimator.weaker))

        macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
        micro = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

        # Save the result
        result.append({
            'macro': macro,
            'micro': micro,
            'config': str(estimator),
            'best_clf': str(estimator.weaker),
            'fold': str(fold),
        })

        print('Macro: {}'.format(macro))
        print('Micro: {}'.format(micro))
        times.append(time_dic)
        fold = fold + 1

        # Persist after every fold so partial results survive an interruption.
        dump_tables()

        estimator.flush_log_time_file()

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    dump_tables()

Exemple #6

0

Afficher le fichier

def _baseline_classifier_and_grid(name):
    """Return a fresh ``(estimator, parameter grid)`` pair for ``'svc'`` or ``'nb'``.

    Raises:
        ValueError: if ``name`` is neither ``'svc'`` nor ``'nb'``.
    """
    if name == 'svc':
        # The settings must be passed as keywords; handing the dict over as
        # a single positional argument does not configure the estimator.
        # 'decision_function_shape': None was dropped because current
        # scikit-learn only accepts 'ovo' or 'ovr' there.
        estimator = SVC(**{
            'kernel': 'linear',
            'C': 1,
            'verbose': False,
            'probability': False,
            'degree': 3,
            'shrinking': True,
            'tol': 0.001,
            'cache_size': 25000,
            'coef0': 0.0,
            'gamma': 'auto',
            'class_weight': None,
            'random_state': 42
        })
        # Set the parameters by cross-validation
        tuned_parameters = [{
            'kernel': ['linear'],
            'C': np.append(2.0**np.arange(-5, 15, 2), 1),
            'class_weight': ['balanced', None],
            'probability': [False]
        }]
        print(tuned_parameters)
        return estimator, tuned_parameters
    if name == 'nb':
        return MultinomialNB(), [{'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}]
    raise ValueError('unknown classifier: {!r}'.format(name))


def main():
    """Grid-search a baseline classifier (linear SVC or multinomial NB) and score it.

    Command-line arguments:
        -p: path to the directory with the libsvm files; handed to
            ``DatasetReader``.
        -o: output directory (required; created when it does not exist).
        -c: classifier to tune, ``svc`` or ``nb`` (required).
        -j, -g: still accepted so existing command lines keep working, but
            this experiment does not use them.

    A 3-fold ``GridSearchCV`` (scoring ``f1_macro``) selects the best
    estimator, and the search time is stored under ``time_dic['grid']``.
    The winner is passed to the module-level ``fit`` and ``predict``
    helpers, and macro/micro F1 on the test split is stored.
    ``result_tunning_time.csv`` and ``times.csv`` are written after the
    fold and once more at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with  libsvm files')
    # Without -o the script used to die with a TypeError in os.path.exists.
    parser.add_argument('-o', required=True,
                        help='path to the output directory')
    parser.add_argument(
        '-j',
        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument(
        '-g',
        help='Size of the sample to the hyperparameter search - Default-5000')
    # Any other value used to end in a NameError on tuned_parameters.
    parser.add_argument('-c', required=True, choices=['svc', 'nb'],
                        help='Which classifier to use (svc, nb)')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    classifier_name = args.c

    dataset_reader = DatasetReader(args.p)

    fold = 0
    result = []
    times = []

    def dump_tables():
        """Print the head of both tables and write them to the output directory."""
        for rows, file_name in ((result, 'result_tunning_time.csv'),
                                (times, 'times.csv')):
            frame = pd.DataFrame(data=rows)
            print(frame.head(10))
            frame.to_csv(os.path.join(output_path, file_name), index=False)

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # Create the classifier; the CLI name lives in its own variable so
        # it is never replaced by the estimator object.
        clf, tuned_parameters = _baseline_classifier_and_grid(classifier_name)

        # first we find the best configuration in general
        print('GRID SEARCH FOR FOLD {}'.format(fold))
        start_grid = time.time()
        grid = GridSearchCV(clf,
                            tuned_parameters,
                            cv=3,
                            scoring='f1_macro',
                            n_jobs=1)
        grid.fit(X_train, y_train)
        grid_seconds = time.time() - start_grid
        time_dic['grid'] = grid_seconds
        print('GENERAL - Total grid time: {}'.format(grid_seconds))
        print('GENERAL - Best score was {} with \n {}'.format(
            grid.best_score_, grid.best_estimator_))

        estimator = grid.best_estimator_

        print('GENERAL - Best param was {}\n'.format(grid.best_params_))

        # Fit the train data
        fit(estimator, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(estimator, X_test, time_dic)

        print(str(estimator))

        macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
        micro = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

        # Save the result
        result.append({
            'macro': macro,
            'micro': micro,
            'config': str(estimator),
            'fold': str(fold),
        })

        print('Macro: {}'.format(macro))
        print('Micro: {}'.format(micro))
        times.append(time_dic)
        fold = fold + 1

        dump_tables()

        # NOTE(review): only the first fold is evaluated because of this
        # break; it looks like a debugging leftover - confirm before
        # removing it.
        break

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    dump_tables()

Exemple #7

0

Afficher le fichier

Fichier : experiment_oracle_nocooc_nodistance.py Projet : lfomendes/metalazy

def main():
    """Store per-fold predictions of each specific classifier with weighting and cooccurrences off.

    Command-line arguments:
        -p: path to the directory with the libsvm files; handed to
            ``DatasetReader``.
        -o: output directory (required; created when it does not exist).
        -j: number of parallel jobs, -1 for all. Default -1.
        -g: size of the sample for the hyperparameter search. Default 5000.

    For every fold, a ``MetaLazyClassifier`` is built for each of ``nb``,
    ``logistic`` and ``extrarf`` with ``number_of_cooccurrences=0`` and
    ``weight_function='none'``, then passed to the module-level ``fit`` and
    ``predict`` helpers. The predictions become one column per classifier,
    next to a ``y_test`` column, in ``result_oracle_off_fold_{k}.csv``.
    ``times.csv`` is rewritten after every fold and once more at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with  libsvm files')
    # Without -o the script used to die with a TypeError in os.path.exists.
    parser.add_argument('-o', required=True,
                        help='path to the output directory')
    parser.add_argument(
        '-j',
        type=int,
        default=-1,
        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument(
        '-g',
        type=int,
        default=5000,
        help='Size of the sample to the hyperparameter search - Default-5000')

    args = parser.parse_args()

    output_path = args.o
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    n_jobs = args.j
    grid_size = args.g

    dataset_reader = DatasetReader(args.p)

    fold = 0
    times = []

    def dump_times():
        """Print the head of the timing table and write it to ``times.csv``."""
        times_dataframe = pd.DataFrame(data=times)
        print(times_dataframe.head(10))
        times_dataframe.to_csv(os.path.join(output_path, 'times.csv'),
                               index=False)

    specific_classifier = ['nb', 'logistic', 'extrarf']

    start = time.time()
    while dataset_reader.has_next():
        # NOTE(review): one dict is shared by the three classifiers of a
        # fold; whether fit/predict overwrite earlier entries depends on
        # those helpers - confirm.
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        result_df = pd.DataFrame()

        # for each fold we vary the specific classifier
        for specific in specific_classifier:

            print('Running for specific {}'.format(specific))

            # setting the 0 configurations (turn off cooc or weight)
            estimator = MetaLazyClassifier(specific_classifier=specific,
                                           select_features=False,
                                           n_jobs=n_jobs,
                                           number_of_cooccurrences=0,
                                           weight_function='none',
                                           grid_size=grid_size)

            print(estimator)

            # Fit the train data
            fit(estimator, X_train, y_train, time_dic)

            # Predict
            y_pred = predict(estimator, X_test, time_dic)

            # Save the result
            result_df[specific] = y_pred

        result_df['y_test'] = y_test

        times.append(time_dic)
        fold = fold + 1

        print(result_df.head(10))
        # The counter was incremented above, so the file index is 1-based
        # while the 'FOLD n' log line is 0-based. Kept unchanged so the
        # output file names stay the same.
        result_df.to_csv(output_path +
                         '/result_oracle_off_fold_{}.csv'.format(fold),
                         index=False)

        dump_times()

    end = time.time()
    print('Total time: {}'.format((end - start)))

    dump_times()