def benchmark(func, repeat, name=None, profile=False, duration=1.0, burnin=True): if name is None: name = func.__name__ if profile: import yep yep.start("profiles/%s.prof" % name) times = [] elapsed = 0 n = 0 if burnin: func(1) while elapsed < duration: t0 = get_wtime() func(repeat) t1 = get_wtime() elapsed += t1 - t0 times.append(t1 - t0) n += 1 print '%s, %d x %d: %s' % (name, n, repeat, ftime(min(times) / repeat)) if profile: yep.stop() return min(times) / repeat
def profiler(use='cprofile', filename='out.prof'): if use == 'yep': # pragma: no cover import yep yep.start(filename) if use == 'cprofile': # pragma: no cover #import cProfile prof = cProfile.Profile() prof.enable() try: yield finally: if use == 'yep': # pragma: no cover yep.stop() print yellow % 'wrote: %s' % filename, '(use google-pprof to view)' # google-pprof --text /bin/ls imitation.prof # google-pprof --evince /bin/ls imitation.prof # google-pprof --web /bin/ls --web imitation.prof if use == 'cprofile': # pragma: no cover #import pstats prof.disable() prof.dump_stats(filename) pstats.Stats(filename).strip_dirs().sort_stats('time').print_stats() print yellow % 'wrote: %s' % filename
def profiler(use='cprofile', filename='out.prof'):
    """Generator that profiles the code block it wraps (Python 3 variant).

    NOTE(review): meant to be used via ``contextlib.contextmanager`` --
    the decorator is presumably applied at the (unseen) definition site.

    use      -- 'yep' (gperftools) or 'cprofile' (stdlib cProfile)
    filename -- output path for the profile data
    """
    if use == 'yep':   # pragma: no cover
        import yep
        yep.start(filename)
    if use == 'cprofile':   # pragma: no cover
        # Import locally so the function is self-contained; the original
        # left this commented out and relied on a module-level import,
        # risking a NameError.
        import cProfile
        prof = cProfile.Profile()
        prof.enable()
    try:
        yield
    finally:
        if use == 'yep':   # pragma: no cover
            yep.stop()
            print(yellow % 'wrote: %s' % filename, '(use `google-pprof` to view)')
            # google-pprof --text /bin/ls imitation.prof
            # google-pprof --evince /bin/ls imitation.prof
            # google-pprof --web /bin/ls --web imitation.prof
        if use == 'cprofile':   # pragma: no cover
            #import pstats
            prof.disable()
            prof.dump_stats(filename)
            #pstats.Stats(filename).strip_dirs().sort_stats('time').print_stats()
            print(yellow % 'wrote: %s' % filename, '(use `gprof-viz` to view)')
def profile_yep():
    """Profile 1000 passes of 100k trie lookups with yep/gperftools."""
    import yep
    trie = create_trie()
    words = words100k()
    yep.start(b'output.prof')
    for _ in range(1000):
        for word in words:
            trie[word]  # lookup only; the value is deliberately discarded
    yep.stop()
def profile_yep():
    """Profile 100 full key enumerations of the trie with yep/gperftools."""
    import yep
    trie = create_trie()
    yep.start(b'output.prof')
    for _ in range(100):
        trie.keys()
    yep.stop()
def vr_l1_persistence_performance_test(n_points=20):
    """Time one CUDA Vietoris-Rips (L1) persistence computation under yep.

    n_points -- number of random 3-D points in the cloud (default 20,
                matching the original hard-coded value).

    Prints the elapsed wall-clock time and the 0-dimensional diagram.
    """
    # .to() is the supported way to move/convert an existing tensor;
    # torch.tensor(tensor, ...) issues a UserWarning and forces a copy.
    pc = torch.randn(n_points, 3).to(device='cuda', dtype=torch.float)
    max_dimension = 2
    max_ball_radius = 0  # 0 presumably means "no radius cap" -- confirm in backend
    yep.start('profiling_pershom/profile.google-pprof')
    time_start = time()
    res = pershom_backend.__C.VRCompCuda__vr_persistence(
        pc, max_dimension, max_ball_radius, 'l1')
    print(time() - time_start)
    yep.stop()
    print(res[0][0])
def profile_run(examples, grammar, maxlength, minlength, aggressive, seed): # TODO: localize the seed to just Distribution D = Distribution(examples=examples, grammar=grammar, maxlength=maxlength, minlength=minlength, aggressive=aggressive, seed=seed) import yep yep.start() for i, (example, m) in enumerate(D.examples): print 'Example: %s, length: %s' % (i, example.N) p = CPParser(a1, 'changeprop', D.grammar) p.initial_rollout(example, m) for [I, K] in example.nodes: p.change(I, K, 1 - m[I, K]) yep.stop()
import blb, yep


class ProfBLB(blb.BLB):
    """BLB subclass preconfigured for profiling runs.

    Estimator is the standard deviation; bootstrap reduction and final
    averaging both use the mean.
    """

    def __init__(self, **kwargs):
        self.compute_estimate = 'stdev'
        self.reduce_bootstraps = 'mean'
        self.average = 'mean'
        blb.BLB.__init__(self, **kwargs)


if __name__ == '__main__':
    data1 = range(10000)
    data2 = range(50000)
    data3 = range(100000)
    tester = ProfBLB()
    # warm-up run outside the profiled window
    tester.run(data3)
    yep.start('cilk.prof')
    for _ in xrange(500):
        tester.run(data3)
    yep.stop()
def test():
    """End-to-end check of the CUDA persistence backend.

    Builds (or loads from a pickle cache) a random simplicial complex,
    computes persistence diagrams both with the reference toplex
    implementation and with ``pershom_backend.calculate_persistence``
    (timed 10x under a yep profile), and asserts the multisets of
    (birth, death) pairs agree per dimension.
    """
    c = None
    # flip to True to cache the random complex on disk between runs
    use_cache = False
    if use_cache:
        random_simplicial_complex_path = './random_simplicial_complex.pickle'
        if pth.exists(random_simplicial_complex_path):
            with open(random_simplicial_complex_path, 'br') as f:
                c = pickle.load(f)
        else:
            c = random_simplicial_complex(100, 100, 100, 100, 100, 100)
            with open(random_simplicial_complex_path, 'bw') as f:
                pickle.dump(c, f)
    else:
        c = random_simplicial_complex(100, 100, 100, 100, 100)
    print('|C| = ', len(c))
    # -1 presumably means "no cap on reductions per iteration" -- only used
    # by the commented-out CPU reference below
    max_red_by_iteration = -1
    # cpu_impl = SortedListBoundaryMatrix(c)
    # cpu_impl.max_pairs = max_red_by_iteration
    bm, col_dim = descending_sorted_boundary_array_from_filtrated_sp(c)
    print(bm[-1])
    bm, col_dim = bm.to('cuda'), col_dim.to('cuda')
    # ground truth: filtration value of simplex i is simply i
    barcodes_true = toplex_persistence_diagrams(c, list(range(len(c))))
    dgm_true = [
        Counter(((float(b), float(d)) for b, d in dgm)) for dgm in barcodes_true
    ]

    def my_output_to_dgms(input):
        # NOTE(review): `input` shadows the builtin; kept as-is here.
        # Converts backend output (per-dim finite pairs b and essential
        # births b_e) into one Counter of (birth, death) pairs per dim,
        # padding essential classes with death = +inf.
        ret = []
        b, b_e = input
        for dim, (b_dim, b_dim_e) in enumerate(zip(b, b_e)):
            b_dim, b_dim_e = b_dim.float(), b_dim_e.float()
            tmp = torch.empty_like(b_dim_e)
            tmp.fill_(float('inf'))
            b_dim_e = torch.cat([b_dim_e, tmp], dim=1)
            dgm = torch.cat([b_dim, b_dim_e], dim=0)
            dgm = dgm.tolist()
            dgm = Counter(((float(b), float(d)) for b, d in dgm))
            ret.append(dgm)
        return ret

    # pr = cProfile.Profile()
    # pr.enable()
    # drop columns that are already reduced (bm[:, 0] < 0) before handing
    # the matrix to the backend; keep the surviving row indices
    ind_not_reduced = torch.tensor(list(range(
        col_dim.size(0)))).to('cuda').detach()
    ind_not_reduced = ind_not_reduced.masked_select(
        bm[:, 0] >= 0).long().detach()
    bm = bm.index_select(0, ind_not_reduced).detach()
    yep.start('profiling_pershom/profile.google-pprof')
    for i in range(10):
        time_start = time()
        # clone() so each timed iteration reduces a fresh copy
        output = pershom_backend.calculate_persistence(bm.clone(),
                                                       ind_not_reduced.clone(),
                                                       col_dim.clone(),
                                                       max(col_dim))
        print(time() - time_start)
    yep.stop()
    # pr.disable()
    # pr.dump_stats('high_level_profile.cProfile')
    print([[len(x) for x in y] for y in output])
    dgm_test = my_output_to_dgms(output)
    print('dgm_true lengths:', [len(dgm) for dgm in dgm_true])
    print('dgm_test lengths:', [len(dgm) for dgm in dgm_test])
    for dgm_test, dgm_true in zip(dgm_test, dgm_true):
        assert (dgm_test == dgm_true)
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate a sklearn random forest on the landsat data set.

    dkey       -- data set key; only "landsat" is supported here
    train_size -- number of training lines to read
    param      -- dict with n_estimators, max_features, n_jobs, bootstrap,
                  tree_type ("randomized" or "standard")
    seed       -- RNG seed for data generation and the forest
    profile    -- if True, profile model.fit with yep (requires n_jobs == 1)

    Timings and evaluation results are dumped as JSON under params.odir.
    """
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))
    if dkey == "landsat":
        # TODO: Download file manually if needed (9,7GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=20000000)
    else:
        raise Exception("Unknown data set!")
    Xtrain, ytrain = traingen.get_all()
    Xtest, ytest = testgen.get_all()
    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])
    # choose the forest flavor from the parameter dict
    if param['tree_type'] == "randomized":
        from sklearn.ensemble import ExtraTreesClassifier as RF
    elif param['tree_type'] == "standard":
        from sklearn.ensemble import RandomForestClassifier as RF
    model = RF(n_estimators=param['n_estimators'], criterion="gini",
               max_features=param['max_features'], min_samples_split=2,
               n_jobs=param['n_jobs'], random_state=seed,
               bootstrap=param['bootstrap'], min_samples_leaf=1,
               max_depth=None, verbose=0)
    if profile == True:
        import yep
        # profiling only makes sense single-threaded
        assert param['n_jobs'] == 1
        yep.start("train.prof")
    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()
    if profile == True:
        yep.stop()
    ypreds_train = model.predict(Xtrain)
    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()
    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time: %f" % results['training_time'])
    print("Testing time: %f" % results['testing_time'])
    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")
    # one JSON file per parameter combination
    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
# Script fragment: builds the TF graph head, snapshots the layer weights
# into an nnMap, profiles one batchAdd with yep, then keeps training.
# NOTE(review): X, batchSize, outDim, layer0..layer2, mnist, nnMap and
# errorRate are defined earlier, outside this fragment -- confirm there.
Y = tf.placeholder(tf.int32, [batchSize, outDim], name="output")
output = layer2.eval(layer1.eval(layer0.eval(X)))
# NOTE(review): rebinds the name `loss` from the function to the tensor it
# returns; works once but shadows the original function afterwards.
loss = loss(output, Y)
optimizer = tf.train.GradientDescentOptimizer(0.05)
global_step = tf.Variable(0, name='global_step', trainable=False)
train_op = optimizer.minimize(loss, global_step=global_step)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# pull the current (initial) weights out of the session as numpy arrays
weights0, bias0, weights1, bias1 = sess.run(
    [layer0.weights, layer0.bias, layer1.weights, layer1.bias])
map1 = nnMap(np.copy(weights0), np.copy(bias0), np.copy(weights1),
             np.copy(bias1), 2, 0.5)
tr_x, tr_y = mnist.train.next_batch(batchSize)
errorMargins = sess.run(errorRate(output, Y), feed_dict={X: tr_x, Y: tr_y})
print errorMargins
# profile only the nnMap insertion, not the TF training loop
yep.start('mapper.out')
map1.batchAdd(tr_x, np.copy(errorMargins), 1)
print map1.location(0)
yep.stop()
for i in range(1, 100):
    tr_x, tr_y = mnist.train.next_batch(batchSize)
    sess.run(train_op, feed_dict={X: tr_x, Y: tr_y})
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate a SubsetWoodClassifier on an in-memory data set.

    dkey       -- "covtype", "higgs" or "susy"
    train_size -- training set size passed to the data generators
    param      -- dict with n_estimators, max_features, n_jobs, bootstrap,
                  tree_type
    seed       -- RNG seed
    profile    -- if True, profile model.fit with yep (requires n_jobs == 1)

    Results (timings + train/test evaluation) are dumped as JSON under
    params.odir.
    """
    print(
        "Processing data set %s with train_size %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(seed), str(param)))
    # per-data-set generators and subset size used for the top-level fit
    if dkey == "covtype":
        traingen, testgen = covtype_generators(train_size=train_size,
                                               store="mem", seed=seed)
        n_subset = 50000
    elif dkey == "higgs":
        traingen, testgen = higgs_generators(train_size=train_size,
                                             store="mem", seed=seed)
        n_subset = 500000
    elif dkey == "susy":
        traingen, testgen = susy_generators(train_size=train_size,
                                            store="mem", seed=seed)
        n_subset = 500000
    else:
        raise Exception("Unknown data set!")
    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])
    model = SubsetWoodClassifier(n_estimators=param['n_estimators'],
                                 criterion="gini",
                                 max_features=param['max_features'],
                                 min_samples_split=2,
                                 n_jobs=param['n_jobs'],
                                 seed=seed,
                                 bootstrap=param['bootstrap'],
                                 tree_traversal_mode="dfs",
                                 tree_type=param['tree_type'],
                                 min_samples_leaf=1,
                                 float_type="double",
                                 max_depth=None,
                                 verbose=1,
                                 store=MemoryStore())
    # training
    if profile == True:
        import yep
        # profiling only makes sense single-threaded
        assert param['n_jobs'] == 1
        yep.start("train.prof")
    fit_start_time = time.time()
    model.fit(traingen, n_subset=n_subset)
    fit_end_time = time.time()
    if profile == True:
        yep.stop()
    ypreds_train = model.predict(generator=traingen)
    # testing
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()
    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])
    evaluate(ypreds_train, traingen.get_all_target(), results, "training")
    evaluate(ypred_test, testgen.get_all_target(), results, "testing")
    # one JSON file per parameter combination
    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "subsetwood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
    # release the generators and any model temp state before the next run
    del (testgen)
    del (traingen)
    model.cleanup()
    time.sleep(1)
def main():
    """CLI driver: benchmark/profile cdifflib.CSequenceMatcher.

    Compares against stdlib difflib (-c), optionally profiles with
    cProfile (-p) or yep (-y). Input is either two files given on the
    command line or auto-generated random similar streams.
    """
    from optparse import OptionParser
    import time
    parser = OptionParser(description="Test the C version of difflib. Either "
                          "specify files, or leave empty for auto-generated "
                          "random lines",
                          usage="Usage: %prog [options] [file1 file2]")
    parser.add_option("-n", "--niter", dest="niter", type="int",
                      help="num of iterations (default=%default)", default=1)
    parser.add_option(
        "-l", "--lines", dest="lines", type="int",
        help="num of lines to generate if no files specified (default=%default)",
        default=20000)
    parser.add_option(
        "-d", "--diffs", dest="diffs", type="int",
        help="num of random lines to change if no files specified (default=%default)",
        default=200)
    parser.add_option("-p", "--profile", dest="profile", default=False,
                      action="store_true",
                      help="run in the python profiler and print results")
    parser.add_option("-c", "--compare", dest="compare", default=False,
                      action="store_true",
                      help="also run the non-c difflib to compare outputs")
    parser.add_option("-y", "--yep", dest="yep", default=False,
                      action="store_true",
                      help="use yep to profile the c code")
    (opts, args) = parser.parse_args()
    start = int(time.time())
    if opts.niter < 1:
        parser.error("Need to do at least 1 iteration..")
    if args:
        if len(args) != 2:
            parser.error("Need exactly 2 files to compare.")
        try:
            print("Reading input files...")
            # use context managers so the file handles are closed
            # deterministically (the original leaked them via
            # open(...).readlines())
            with open(args[0]) as f1:
                s1 = f1.readlines()
            with open(args[1]) as f2:
                s2 = f2.readlines()
        except (IOError, OSError):
            parser.error("Couldn't load input files %s and %s" %
                         (args[0], args[1]))
    else:
        print("Generating random similar streams...")
        s1, s2 = generate_similar_streams(opts.lines, opts.diffs)
    # shonky, but saves time..
    sys.path.append('build/lib.linux-x86_64-2.7/')
    sys.path.append('build/lib.linux-x86_64-2.7-pydebug/')
    sys.path.append('build/lib.macosx-10.6-intel-2.7')
    if opts.yep:
        import yep
        yep.start("cdifflib.prof")
    if opts.profile:
        import cProfile
        import pstats
        fn = "cdifflib_%d.prof" % start
        print("Profiling cdifflib.CSequenceMatcher...")
        cProfile.runctx("p(sm,a,b,n)", dict(p=profile_sequence_matcher),
                        dict(a=s1, b=s2, n=opts.niter, sm=CSequenceMatcher),
                        fn)
        print_stats(pstats.Stats(fn))
        if opts.compare:
            fn = "difflib_%d.prof" % start
            print("Profiling difflib.SequenceMatcher...")
            cProfile.runctx("p(sm,a,b,n)", dict(p=profile_sequence_matcher),
                            dict(a=s1, b=s2, n=opts.niter, sm=SequenceMatcher),
                            fn)
            print_stats(pstats.Stats(fn))
    else:
        print("Running cdifflib.CSequenceMatcher %d times..." % opts.niter)
        profile_sequence_matcher(CSequenceMatcher, s1, s2, opts.niter)
        if opts.compare:
            print("Running difflib.SequenceMatcher %d times..." % opts.niter)
            profile_sequence_matcher(SequenceMatcher, s1, s2, opts.niter)
    if opts.yep:
        yep.stop()
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate a HugeWoodClassifier on the (huge) landsat set.

    dkey       -- data set key; only "landsat" is supported here
    train_size -- number of training lines to read
    param      -- dict with n_estimators, n_estimators_bottom and a
                  nested 'param_wood' dict for the wrapped WoodClassifier
    seed       -- RNG seed for the data generators (the models use
                  params.seed)
    profile    -- if True, profile model.fit with yep (requires n_jobs == 1)

    Timings (including the hugewood phase breakdown) and test error are
    dumped as JSON under params.odir.
    """
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))
    tmp_dir = "tmp/hugewood"
    if dkey == "landsat":
        # TODO: Download file manually if needed (255GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        fname_train = "data/landsat_train.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=10000000)
    else:
        raise Exception("Unknown data set!")
    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])
    param_wood = param['param_wood']
    # single-tree template instance that the HugeWood ensemble wraps
    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=params.seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)
    model = HugeWoodClassifier(
        n_estimators=param['n_estimators'],
        n_estimators_bottom=param['n_estimators_bottom'],
        n_top="auto",
        n_patterns_leaf="auto",
        balanced_top_tree=True,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=params.seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        odir=tmp_dir,
        store=DiskStore(),
    )
    # training
    if profile == True:
        import yep
        # profiling only makes sense single-threaded
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")
    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()
    if profile == True:
        yep.stop()
    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()
    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    # phase breakdown of the training time as reported by the model
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['top'] = model.get_training_times()['top']
    results['distribute'] = model.get_training_times()['distribute']
    results['bottom'] = model.get_training_times()['bottom']
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])
    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    evaluate(ypred_test, ytest, results, "testing")
    # one JSON file per parameter combination (note: no seed component here)
    fname = '%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "hugewood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
    # release the generators and any model temp state before the next run
    del (testgen)
    del (traingen)
    model.cleanup()
    time.sleep(1)
def main():
    """CLI driver: benchmark/profile cdifflib.CSequenceMatcher.

    Compares against stdlib difflib (-c), optionally profiles with
    cProfile (-p) or yep (-y). Input is either two files given on the
    command line or auto-generated random similar streams.
    """
    from optparse import OptionParser
    import time
    parser = OptionParser(description="Test the C version of difflib. Either "
                          "specify files, or leave empty for auto-generated "
                          "random lines",
                          usage="Usage: %prog [options] [file1 file2]")
    parser.add_option("-n", "--niter", dest="niter", type="int",
                      help="num of iterations (default=%default)", default=1)
    parser.add_option("-l", "--lines", dest="lines", type="int",
                      help="num of lines to generate if no files specified (default=%default)",
                      default=20000)
    parser.add_option("-d", "--diffs", dest="diffs", type="int",
                      help="num of random lines to change if no files specified (default=%default)",
                      default=200)
    parser.add_option("-p", "--profile", dest="profile", default=False,
                      action="store_true",
                      help="run in the python profiler and print results")
    parser.add_option("-c", "--compare", dest="compare", default=False,
                      action="store_true",
                      help="also run the non-c difflib to compare outputs")
    parser.add_option("-y", "--yep", dest="yep", default=False,
                      action="store_true",
                      help="use yep to profile the c code")
    (opts, args) = parser.parse_args()
    start = int(time.time())
    if opts.niter < 1:
        parser.error("Need to do at least 1 iteration..")
    if args:
        if len(args) != 2:
            parser.error("Need exactly 2 files to compare.")
        try:
            print("Reading input files...")
            # use context managers so the file handles are closed
            # deterministically (the original leaked them via
            # open(...).readlines())
            with open(args[0]) as f1:
                s1 = f1.readlines()
            with open(args[1]) as f2:
                s2 = f2.readlines()
        except (IOError, OSError):
            parser.error("Couldn't load input files %s and %s" %
                         (args[0], args[1]))
    else:
        print("Generating random similar streams...")
        s1, s2 = generate_similar_streams(opts.lines, opts.diffs)
    # shonky, but saves time..
    sys.path.append('build/lib.linux-x86_64-2.7/')
    sys.path.append('build/lib.linux-x86_64-2.7-pydebug/')
    sys.path.append('build/lib.macosx-10.6-intel-2.7')
    if opts.yep:
        import yep
        yep.start("cdifflib.prof")
    if opts.profile:
        import cProfile
        import pstats
        fn = "cdifflib_%d.prof" % start
        print("Profiling cdifflib.CSequenceMatcher...")
        cProfile.runctx("p(sm,a,b,n)", dict(p=profile_sequence_matcher),
                        dict(a=s1, b=s2, n=opts.niter, sm=CSequenceMatcher),
                        fn)
        print_stats(pstats.Stats(fn))
        if opts.compare:
            fn = "difflib_%d.prof" % start
            print("Profiling difflib.SequenceMatcher...")
            cProfile.runctx("p(sm,a,b,n)", dict(p=profile_sequence_matcher),
                            dict(a=s1, b=s2, n=opts.niter, sm=SequenceMatcher),
                            fn)
            print_stats(pstats.Stats(fn))
    else:
        print("Running cdifflib.CSequenceMatcher %d times..." % opts.niter)
        profile_sequence_matcher(CSequenceMatcher, s1, s2, opts.niter)
        if opts.compare:
            print("Running difflib.SequenceMatcher %d times..." % opts.niter)
            profile_sequence_matcher(SequenceMatcher, s1, s2, opts.niter)
    if opts.yep:
        yep.stop()
def run(self):
    """Profile 100 method+function call pairs (takes about 5 seconds)."""
    yep.start('yep.prof')
    for _ in xrange(100):
        self.test_method()
        test_function()
    yep.stop()
def single_run(dkey, train_size, n_bottom, param, seed, profile=False):
    """Train/evaluate a HugeWoodClassifier (75K leaf patterns) in memory.

    dkey       -- "covtype", "higgs" or "susy"
    train_size -- training set size passed to the data generators
    n_bottom   -- bottom-tree count; top count is 24 / n_bottom so the
                  total ensemble size stays 24
    param      -- dict with a nested 'param_wood' dict for the wrapped
                  WoodClassifier
    seed       -- RNG seed
    profile    -- if True, profile model.fit with yep (requires n_jobs == 1)

    Results are dumped as JSON under params.odir/.../hugewood_75K.
    """
    print(
        "Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(n_bottom), str(seed), str(param)))
    if dkey == "covtype":
        traingen, testgen = covtype_generators(train_size=train_size,
                                               store="mem", seed=seed)
    elif dkey == "higgs":
        traingen, testgen = higgs_generators(train_size=train_size,
                                             store="mem", seed=seed)
    elif dkey == "susy":
        traingen, testgen = susy_generators(train_size=train_size,
                                            store="mem", seed=seed)
    else:
        raise Exception("Unknown data set!")
    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])
    param_wood = param['param_wood']
    # single-tree template instance that the HugeWood ensemble wraps
    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)
    model = HugeWoodClassifier(
        n_estimators=int(24 / n_bottom),
        n_estimators_bottom=int(n_bottom),
        n_top="auto",
        n_patterns_leaf=75000,
        balanced_top_tree=True,
        top_tree_lambda=1.0,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        store=MemoryStore(),
    )
    # training
    if profile == True:
        import yep
        # profiling only makes sense single-threaded
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")
    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()
    if profile == True:
        yep.stop()
    ypreds_train = model.predict(generator=traingen)
    # testing
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()
    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])
    evaluate(ypreds_train, traingen.get_all_target(), results, "training")
    evaluate(ypred_test, testgen.get_all_target(), results, "testing")
    # one JSON file per parameter combination
    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         str(n_bottom), "hugewood_75K", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
    # release the generators and any model temp state before the next run
    del (testgen)
    del (traingen)
    model.cleanup()
    time.sleep(1)
def single_run(dkey, train_size, param, seed, profile=False):
    """Train/evaluate a disk-backed SubsetWoodClassifier on landsat.

    dkey       -- data set key; only "landsat" is supported here
    train_size -- number of training lines to read
    param      -- dict with n_estimators, max_features, n_jobs, bootstrap,
                  tree_type
    seed       -- RNG seed
    profile    -- if True, profile model.fit with yep (requires n_jobs == 1)

    Also prints the class distributions of train/test/predictions.
    Results are dumped as JSON under params.odir/.../subsetwood_<n_subset>.
    """
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))
    tmp_dir = "tmp/subsetwood"
    if dkey == "landsat":
        # TODO: Download file manually if needed (9,7GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=20000000)
    else:
        raise Exception("Unknown data set!")
    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])
    # set to top trees size
    n_subset = 500000
    model = SubsetWoodClassifier(
        n_estimators=param['n_estimators'],
        criterion="gini",
        max_features=param['max_features'],
        min_samples_split=2,
        n_jobs=param['n_jobs'],
        seed=seed,
        bootstrap=param['bootstrap'],
        tree_traversal_mode="dfs",
        tree_type=param['tree_type'],
        min_samples_leaf=1,
        float_type="double",
        max_depth=None,
        verbose=1,
        odir=tmp_dir,
        store=DiskStore())
    # training
    if profile == True:
        import yep
        # profiling only makes sense single-threaded
        assert param['n_jobs'] == 1
        yep.start("train.prof")
    fit_start_time = time.time()
    model.fit(traingen, n_subset=n_subset)
    fit_end_time = time.time()
    if profile == True:
        yep.stop()
    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()
    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    # phase breakdown of the training time as reported by the model
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['subset'] = model.get_training_times()['subset']
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])
    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    ytrain = traingen.get_all_target()
    # normalize label dtypes so bincount/evaluate see integer classes
    ytrain = ytrain.astype(numpy.int64)
    ytest = ytest.astype(numpy.int64)
    ypred_test = ypred_test.astype(numpy.int64)
    evaluate(ypred_test, ytest, results, "testing")
    print("Training distribution")
    print(numpy.bincount(ytrain))
    print("Test distribution")
    print(numpy.bincount(ytest))
    print("Predict distribution")
    print(numpy.bincount(ypred_test))
    # one JSON file per parameter combination
    fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']),
                                        str(param['max_features']),
                                        str(param['n_jobs']),
                                        str(param['bootstrap']),
                                        str(param['tree_type']),
                                        str(seed),
                                        )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         "subsetwood_" + str(n_subset), fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
    # release the generators and any model temp state before the next run
    del(testgen)
    del(traingen)
    model.cleanup()
    time.sleep(1)
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate a sklearn random forest on an in-memory data set.

    dkey       -- "covtype", "higgs" or "susy"
    train_size -- training set size passed to the data loaders
    param      -- dict with n_estimators, max_features, n_jobs, bootstrap,
                  tree_type ("randomized" or "standard")
    seed       -- RNG seed for data loading and the forest
    profile    -- if True, profile model.fit with yep (requires n_jobs == 1)

    Timings and evaluation results are dumped as JSON under params.odir.
    """
    print(
        "Processing data set %s with train_size %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(seed), str(param)))
    if dkey == "covtype":
        Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size,
                                               seed=seed)
    elif dkey == "higgs":
        Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed)
    elif dkey == "susy":
        Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed)
    else:
        raise Exception("Unknown data set!")
    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])
    # choose the forest flavor from the parameter dict
    if param['tree_type'] == "randomized":
        from sklearn.ensemble import ExtraTreesClassifier as RF
    elif param['tree_type'] == "standard":
        from sklearn.ensemble import RandomForestClassifier as RF
    model = RF(n_estimators=param['n_estimators'], criterion="gini",
               max_features=param['max_features'], min_samples_split=2,
               n_jobs=param['n_jobs'], random_state=seed,
               bootstrap=param['bootstrap'], min_samples_leaf=1,
               max_depth=None, verbose=0)
    if profile == True:
        import yep
        # profiling only makes sense single-threaded
        assert param['n_jobs'] == 1
        yep.start("train.prof")
    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()
    if profile == True:
        yep.stop()
    ypreds_train = model.predict(Xtrain)
    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()
    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time: %f" % results['training_time'])
    print("Testing time: %f" % results['testing_time'])
    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")
    # one JSON file per parameter combination
    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
import yep
import hamiltonian_exponentiation as h

# Profile a single matrix exponential of a random 2-qubit Hamiltonian.
num_qubits = 2
t = 1
ham = h.random_hamiltonian(num_qubits)

yep.start('yep_output.prof')
h.exp_ham(ham, t)
yep.stop()