def verify_extraction_test_dataset():
    """Flask view: verify the notebook's extracted test dataset.

    Reads the notebook path from the request's query string, loads the
    (single) Extraction row, materializes the test dataset, and returns
    its verification statistics as JSON.
    """
    notebook_path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(notebook_path) as session:
        extraction = session.query(models.Extraction).first()
        test_features, test_labels = extraction.return_test_dataset()
        return jsonify(functions.verify_dataset(test_features, test_labels))
def verify_extraction_meta_feature_generation():
    """Flask view: verify the holdout dataset used for meta-feature generation.

    When the configured method is cross-validation there is no separate
    holdout set to verify, so a UserError is raised instead.
    """
    notebook_path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(notebook_path) as session:
        extraction = session.query(models.Extraction).first()
        if extraction.meta_feature_generation['method'] == 'cv':
            # Holdout verification is meaningless under CV — meta-features
            # are generated fold-by-fold from the main dataset.
            raise exceptions.UserError('Xcessiv will use cross-validation to'
                                       ' generate meta-features')
        holdout_features, holdout_labels = extraction.return_holdout_dataset()
        return jsonify(functions.verify_dataset(holdout_features, holdout_labels))
def extraction_data_statistics(path):
    """ Generates data statistics for the given data extraction setup stored
    in Xcessiv notebook.

    Dry-runs the whole extraction pipeline: loads and verifies the main
    dataset, builds (or skips) the test set, then exercises the user-supplied
    base-learner CV and stacked-ensemble CV split generators to count their
    splits. The resulting statistics dict is persisted on the Extraction row.

    This is in rqtasks.py but not as a job yet. Temporarily call this directly
    while I'm figuring out Javascript lel.

    Args:
        path (str, unicode): Path to xcessiv notebook
    """
    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()
        X, y = extraction.return_main_dataset()
        functions.verify_dataset(X, y)

        # Build the test set according to the configured method; 'source'
        # compiles and calls user code, anything unrecognized means "no test set".
        if extraction.test_dataset['method'] == 'split_from_main':
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=extraction.test_dataset['split_ratio'],
                random_state=extraction.test_dataset['split_seed'],
                stratify=y
            )
        elif extraction.test_dataset['method'] == 'source':
            if 'source' not in extraction.test_dataset or not extraction.test_dataset['source']:
                raise exceptions.UserError('Source is empty')
            extraction_code = extraction.test_dataset["source"]
            # NOTE: executes user-provided source code (trusted notebook content).
            extraction_function = functions.\
                import_object_from_string_code(extraction_code, "extract_test_dataset")
            X_test, y_test = extraction_function()
        else:
            X_test, y_test = None, None

        # test base learner cross-validation
        extraction_code = extraction.meta_feature_generation['source']
        return_splits_iterable = functions.import_object_from_string_code(
            extraction_code,
            'return_splits_iterable'
        )
        number_of_splits = 0
        test_indices = []
        try:
            for train_idx, test_idx in return_splits_iterable(X, y):
                number_of_splits += 1
                test_indices.append(test_idx)
        except Exception as e:
            # Surface failures from the user's split generator as a UserError.
            raise exceptions.UserError('User code exception', exception_message=str(e))

        # preparation before testing stacked ensemble cross-validation:
        # the stacked ensemble sees only the rows that appeared as test
        # indices in the base-learner CV, so restrict X/y to those rows.
        test_indices = np.concatenate(test_indices)
        X, y = X[test_indices], y[test_indices]

        # test stacked ensemble cross-validation
        extraction_code = extraction.stacked_ensemble_cv['source']
        return_splits_iterable = functions.import_object_from_string_code(
            extraction_code,
            'return_splits_iterable'
        )
        number_of_splits_stacked_cv = 0
        try:
            for train_idx, test_idx in return_splits_iterable(X, y):
                number_of_splits_stacked_cv += 1
        except Exception as e:
            raise exceptions.UserError('User code exception', exception_message=str(e))

        # Assemble the statistics payload and persist it on the row.
        data_stats = dict()
        data_stats['train_data_stats'] = functions.verify_dataset(X, y)
        if X_test is not None:
            data_stats['test_data_stats'] = functions.verify_dataset(X_test, y_test)
        else:
            data_stats['test_data_stats'] = None
        # Under CV there is no holdout set; only the split count is recorded.
        data_stats['holdout_data_stats'] = {'number_of_splits': number_of_splits}
        data_stats['stacked_ensemble_cv_stats'] = {'number_of_splits': number_of_splits_stacked_cv}

        extraction.data_statistics = data_stats
        session.add(extraction)
        session.commit()
def extraction_data_statistics(path):
    """ Generates data statistics for the given data extraction setup stored
    in Xcessiv notebook.

    Verifies the main dataset, then builds the optional test and holdout
    datasets (either by splitting the main set or by running user-supplied
    extraction code), and persists the verification statistics of each on
    the Extraction row.

    This is in rqtasks.py but not as a job yet. Temporarily call this directly
    while I'm figuring out Javascript lel.

    Args:
        path (str, unicode): Path to xcessiv notebook
    """
    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()
        features, labels = extraction.return_main_dataset()
        functions.verify_dataset(features, labels)

        # Optional test set: split off from the main set, extracted by
        # user code, or absent.
        test_config = extraction.test_dataset
        if test_config['method'] == 'split_from_main':
            features, test_features, labels, test_labels = train_test_split(
                features,
                labels,
                test_size=test_config['split_ratio'],
                random_state=test_config['split_seed'],
                stratify=labels
            )
        elif test_config['method'] == 'source':
            if not test_config.get('source'):
                raise exceptions.UserError('Source is empty')
            # NOTE: compiles and runs user-provided source code.
            test_extractor = functions.import_object_from_string_code(
                test_config["source"], "extract_test_dataset"
            )
            test_features, test_labels = test_extractor()
        else:
            test_features, test_labels = None, None

        # Optional holdout set, built the same three ways.
        holdout_config = extraction.meta_feature_generation
        if holdout_config['method'] == 'holdout_split':
            features, holdout_features, labels, holdout_labels = train_test_split(
                features,
                labels,
                test_size=holdout_config['split_ratio'],
                random_state=holdout_config['seed'],
                stratify=labels
            )
        elif holdout_config['method'] == 'holdout_source':
            if not holdout_config.get('source'):
                raise exceptions.UserError('Source is empty')
            holdout_extractor = functions.import_object_from_string_code(
                holdout_config["source"], "extract_holdout_dataset"
            )
            holdout_features, holdout_labels = holdout_extractor()
        else:
            holdout_features, holdout_labels = None, None

        # Persist per-dataset statistics; absent datasets are recorded as None.
        data_stats = {
            'train_data_stats': functions.verify_dataset(features, labels),
            'test_data_stats': (
                functions.verify_dataset(test_features, test_labels)
                if test_features is not None else None
            ),
            'holdout_data_stats': (
                functions.verify_dataset(holdout_features, holdout_labels)
                if holdout_features is not None else None
            ),
        }

        extraction.data_statistics = data_stats
        session.add(extraction)
        session.commit()
def test_correct_dataset(self):
    """verify_dataset should report the known shapes of the digits dataset."""
    features, labels = load_digits(return_X_y=True)
    stats = functions.verify_dataset(features, labels)
    assert stats['features_shape'] == (1797, 64)
    assert stats['labels_shape'] == (1797,)