def verify_extraction_test_dataset():
    """Flask view: verify the notebook's extracted test dataset.

    Reads the notebook path from the request's query string, loads the
    (single) Extraction row, materializes the test dataset, and returns
    its verification statistics as JSON.
    """
    notebook_path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(notebook_path) as session:
        extraction = session.query(models.Extraction).first()
        test_features, test_labels = extraction.return_test_dataset()
        return jsonify(functions.verify_dataset(test_features, test_labels))
def verify_extraction_meta_feature_generation():
    """Flask view: verify the holdout dataset used for meta-feature generation.

    When the configured method is cross-validation there is no separate
    holdout set to verify, so a UserError is raised instead.
    """
    notebook_path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(notebook_path) as session:
        extraction = session.query(models.Extraction).first()
        if extraction.meta_feature_generation['method'] == 'cv':
            # Holdout verification is meaningless under CV — meta-features
            # are generated fold-by-fold from the main dataset.
            raise exceptions.UserError('Xcessiv will use cross-validation to'
                                       ' generate meta-features')
        holdout_features, holdout_labels = extraction.return_holdout_dataset()
        return jsonify(functions.verify_dataset(holdout_features, holdout_labels))
def extraction_data_statistics(path):
    """ Generates data statistics for the given data extraction setup stored
    in Xcessiv notebook.

    Dry-runs the whole extraction pipeline: loads and verifies the main
    dataset, builds (or skips) the test set, then exercises the user-supplied
    base-learner CV and stacked-ensemble CV split generators to count their
    splits. The resulting statistics dict is persisted on the Extraction row.

    This is in rqtasks.py but not as a job yet. Temporarily call this directly
    while I'm figuring out Javascript lel.

    Args:
        path (str, unicode): Path to xcessiv notebook
    """
    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()
        X, y = extraction.return_main_dataset()
        functions.verify_dataset(X, y)

        # Build the test set according to the configured method; 'source'
        # compiles and calls user code, anything unrecognized means "no test set".
        if extraction.test_dataset['method'] == 'split_from_main':
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=extraction.test_dataset['split_ratio'],
                random_state=extraction.test_dataset['split_seed'],
                stratify=y
            )
        elif extraction.test_dataset['method'] == 'source':
            if 'source' not in extraction.test_dataset or not extraction.test_dataset['source']:
                raise exceptions.UserError('Source is empty')
            extraction_code = extraction.test_dataset["source"]
            # NOTE: executes user-provided source code (trusted notebook content).
            extraction_function = functions.\
                import_object_from_string_code(extraction_code, "extract_test_dataset")
            X_test, y_test = extraction_function()
        else:
            X_test, y_test = None, None

        # test base learner cross-validation
        extraction_code = extraction.meta_feature_generation['source']
        return_splits_iterable = functions.import_object_from_string_code(
            extraction_code,
            'return_splits_iterable'
        )
        number_of_splits = 0
        test_indices = []
        try:
            for train_idx, test_idx in return_splits_iterable(X, y):
                number_of_splits += 1
                test_indices.append(test_idx)
        except Exception as e:
            # Surface failures from the user's split generator as a UserError.
            raise exceptions.UserError('User code exception', exception_message=str(e))

        # preparation before testing stacked ensemble cross-validation:
        # the stacked ensemble sees only the rows that appeared as test
        # indices in the base-learner CV, so restrict X/y to those rows.
        test_indices = np.concatenate(test_indices)
        X, y = X[test_indices], y[test_indices]

        # test stacked ensemble cross-validation
        extraction_code = extraction.stacked_ensemble_cv['source']
        return_splits_iterable = functions.import_object_from_string_code(
            extraction_code,
            'return_splits_iterable'
        )
        number_of_splits_stacked_cv = 0
        try:
            for train_idx, test_idx in return_splits_iterable(X, y):
                number_of_splits_stacked_cv += 1
        except Exception as e:
            raise exceptions.UserError('User code exception', exception_message=str(e))

        # Assemble the statistics payload and persist it on the row.
        data_stats = dict()
        data_stats['train_data_stats'] = functions.verify_dataset(X, y)
        if X_test is not None:
            data_stats['test_data_stats'] = functions.verify_dataset(X_test, y_test)
        else:
            data_stats['test_data_stats'] = None
        # Under CV there is no holdout set; only the split count is recorded.
        data_stats['holdout_data_stats'] = {'number_of_splits': number_of_splits}
        data_stats['stacked_ensemble_cv_stats'] = {'number_of_splits': number_of_splits_stacked_cv}

        extraction.data_statistics = data_stats
        session.add(extraction)
        session.commit()
def extraction_data_statistics(path):
    """ Generates data statistics for the given data extraction setup stored
    in Xcessiv notebook.

    Verifies the main dataset, then builds the optional test and holdout
    datasets (either by splitting the main set or by running user-supplied
    extraction code), and persists the verification statistics of each on
    the Extraction row.

    This is in rqtasks.py but not as a job yet. Temporarily call this directly
    while I'm figuring out Javascript lel.

    Args:
        path (str, unicode): Path to xcessiv notebook
    """
    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()
        features, labels = extraction.return_main_dataset()
        functions.verify_dataset(features, labels)

        # Optional test set: split off from the main set, extracted by
        # user code, or absent.
        test_config = extraction.test_dataset
        if test_config['method'] == 'split_from_main':
            features, test_features, labels, test_labels = train_test_split(
                features,
                labels,
                test_size=test_config['split_ratio'],
                random_state=test_config['split_seed'],
                stratify=labels
            )
        elif test_config['method'] == 'source':
            if not test_config.get('source'):
                raise exceptions.UserError('Source is empty')
            # NOTE: compiles and runs user-provided source code.
            test_extractor = functions.import_object_from_string_code(
                test_config["source"], "extract_test_dataset"
            )
            test_features, test_labels = test_extractor()
        else:
            test_features, test_labels = None, None

        # Optional holdout set, built the same three ways.
        holdout_config = extraction.meta_feature_generation
        if holdout_config['method'] == 'holdout_split':
            features, holdout_features, labels, holdout_labels = train_test_split(
                features,
                labels,
                test_size=holdout_config['split_ratio'],
                random_state=holdout_config['seed'],
                stratify=labels
            )
        elif holdout_config['method'] == 'holdout_source':
            if not holdout_config.get('source'):
                raise exceptions.UserError('Source is empty')
            holdout_extractor = functions.import_object_from_string_code(
                holdout_config["source"], "extract_holdout_dataset"
            )
            holdout_features, holdout_labels = holdout_extractor()
        else:
            holdout_features, holdout_labels = None, None

        # Persist per-dataset statistics; absent datasets are recorded as None.
        data_stats = {
            'train_data_stats': functions.verify_dataset(features, labels),
            'test_data_stats': (
                functions.verify_dataset(test_features, test_labels)
                if test_features is not None else None
            ),
            'holdout_data_stats': (
                functions.verify_dataset(holdout_features, holdout_labels)
                if holdout_features is not None else None
            ),
        }

        extraction.data_statistics = data_stats
        session.add(extraction)
        session.commit()
def test_correct_dataset(self):
    """verify_dataset should report the known shapes of the digits dataset."""
    features, labels = load_digits(return_X_y=True)
    stats = functions.verify_dataset(features, labels)
    assert stats['features_shape'] == (1797, 64)
    assert stats['labels_shape'] == (1797,)