i.e. remove the features that have the same value in all samples. Examples -------- >>> X = [[0, 2, 0, 3], ... [0, 1, 4, 3], ... [0, 1, 1, 3]] >>> selector = VarianceThreshold(0.0) >>> selector.fit_transform(X) array([[2, 0], [1, 4], [1, 1]]) """ def __init__(self, threshold=0.0): self.threshold = threshold def _check_X(self, X): if not isinstance(X, np.ndarray): X = np.asarray(X) return X def fit_transform(self, X): _X = self._check_X(X) return _X[:, (np.std(_X, axis=0) > self.threshold + 1e-8)] if __name__ == '__main__': # run corresponding tests from utils.testing import run_tests run_tests(__file__)
self.best_model_ = self.model if self.save_models: self.best_model_.save(filepath=os.path.join( self.dirpath, self._best_model_name()), **self.save_params) # verbosing if self.verbose: print_inline( " - best acc.: {0:.4f} +/- 2 * {1:.3f} at {2}\n" .format(self.best_score_, self.best_std_, self.best_params_)) # convert lists to np.ndarray for key in (['mean_score', 'std_score', 'params'] + [ 'split{0}_{1}'.format(k, s) for k in xrange(self.n_splits) for s in ('score', 'train_time', 'test_time') ]): self.cv_results_[key] = np.asarray(self.cv_results_[key]) return self def to_df(self): import pandas as pd return pd.DataFrame.from_dict(self.cv_results_).fillna('') if __name__ == '__main__': # run corresponding tests import tests.test_model_selection as t from utils.testing import run_tests run_tests(__file__, t)
MULTI_THREADED_SOLUTIONS, 1, args.max_thds + 1, CombingRunner, build_logger, SOLUTIONS_FOLDER, CXX_COMPILER_PATH) test_cases = [] for name in listdir(args.tests): with open(os.path.join(args.tests, name), 'r') as f: _, size = f.readline(), int(f.readline()) test_cases.append((size, os.path.join(args.tests, name))) if args.real_data: tests_pairs = [(74437, 91721), (124884, 134226)] else: tests_pairs = [(100000, 100000), (50000, 50000)] tests = [] for i in range(len(test_cases)): for j in range(len(test_cases)): x, y = test_cases[i], test_cases[j] if x[1] != y[1] and (x[0], y[0]) in tests_pairs: tests.append((x[0], CombingTest(x[1], y[1]))) tests.sort(key=lambda x: x[0]) _, tests = zip(*tests) tests = list(tests) run_tests(runners, tests, args.repeats, args.resultcsvfile, default_logger, True)
SINGLE_THREADED_SOLUTIONS = [
    'braid_multiplication_sequential_non_optimized',
    'braid_multiplication_sequential_memory',
    'braid_multiplication_sequential_precompute',
]

MULTI_THREADED_SOLUTIONS = ['braid_multiplication_parallel']

# where we put our ready to run implementations
SOLUTIONS_FOLDER = 'braid_mults_solutions'
CSV_FILE = 'braid_mul.csv'
REPEATS = 10

# Created at import time, so both log files are opened as soon as the
# module is loaded.
default_logger = Logger('log.txt')
build_logger = Logger('build_log.txt')

if __name__ == '__main__':
    # Single positional integer argument: ``depth``.
    parser = ArgumentParser()
    parser.add_argument('depth', type=int, help='Depth for level paralellism')
    args = parser.parse_args()

    # NOTE(review): ``0`` and ``args.depth + 1`` look like the bounds of the
    # depth range the parallel solutions are built for -- confirm against
    # ``compile_programs``.
    runners = compile_programs(
        SINGLE_THREADED_SOLUTIONS, MULTI_THREADED_SOLUTIONS,
        0, args.depth + 1,
        BraidMulRunner, build_logger, SOLUTIONS_FOLDER, CXX_COMPILER_PATH)

    # Input sizes: three small ones, 20k..100k in steps of 10k,
    # 150k..1M in steps of 50k, then four large ones.
    sizes = (
        [1000, 5000, 10000]
        + list(range(20000, 100001, 10000))
        + list(range(150000, 1000001, 50000))
        + [2000000, 5000000, 7500000, 10000000]
    )

    # Every test receives its size as a string plus the constant '42'
    # (presumably a seed -- verify against ``BraidMulTest``).
    tests = [BraidMulTest(str(size), '42') for size in sizes]

    run_tests(runners, tests, REPEATS, CSV_FILE, default_logger, False)