Example #1
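# Times repeated calls of func(repeat) until `duration` seconds have elapsed
# and reports the best per-call time; with profile=True the run is traced by
# yep. get_wtime() and ftime() are timing helpers from the host project.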
def benchmark(func,
              repeat,
              name=None,
              profile=False,
              duration=1.0,
              burnin=True):
    if name is None:
        name = func.__name__
    if profile:
        import yep
        yep.start("profiles/%s.prof" % name)
    times = []
    elapsed = 0
    n = 0
    if burnin:
        func(1)
    while elapsed < duration:
        t0 = get_wtime()
        func(repeat)
        t1 = get_wtime()
        elapsed += t1 - t0
        times.append(t1 - t0)
        n += 1
    print('%s, %d x %d: %s' % (name, n, repeat, ftime(min(times) / repeat)))
    if profile:
        yep.stop()
    return min(times) / repeat
Example #2
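# Context manager that profiles the enclosed block with either yep (native
# code) or cProfile. The bare yield implies a @contextmanager decorator, added
# below along with the imports it needs; `yellow` is a color-format string
# from the host project.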
from contextlib import contextmanager


@contextmanager
def profiler(use='cprofile', filename='out.prof'):

    if use == 'yep':   # pragma: no cover
        import yep
        yep.start(filename)

    if use == 'cprofile':  # pragma: no cover
        import cProfile
        prof = cProfile.Profile()
        prof.enable()

    try:

        yield

    finally:
        if use == 'yep':  # pragma: no cover
            yep.stop()
            print(yellow % 'wrote: %s' % filename, '(use google-pprof to view)')
            # google-pprof --text /bin/ls imitation.prof
            # google-pprof --evince /bin/ls imitation.prof
            # google-pprof --web /bin/ls --web imitation.prof

        if use == 'cprofile':  # pragma: no cover
            import pstats
            prof.disable()
            prof.dump_stats(filename)
            pstats.Stats(filename).strip_dirs().sort_stats('time').print_stats()
            print(yellow % 'wrote: %s' % filename)
Example #3
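# Variant of the context manager above that keeps the pstats summary commented
# out; decorator and cProfile import added as in Example #2.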
from contextlib import contextmanager


@contextmanager
def profiler(use='cprofile', filename='out.prof'):

    if use == 'yep':  # pragma: no cover
        import yep
        yep.start(filename)

    if use == 'cprofile':  # pragma: no cover
        import cProfile
        prof = cProfile.Profile()
        prof.enable()

    try:

        yield

    finally:
        if use == 'yep':  # pragma: no cover
            yep.stop()
            print(yellow % 'wrote: %s' % filename,
                  '(use `google-pprof` to view)')
            # google-pprof --text /bin/ls imitation.prof
            # google-pprof --evince /bin/ls imitation.prof
            # google-pprof --web /bin/ls --web imitation.prof

        if use == 'cprofile':  # pragma: no cover
            #import pstats
            prof.disable()
            prof.dump_stats(filename)
            #pstats.Stats(filename).strip_dirs().sort_stats('time').print_stats()
            print(yellow % 'wrote: %s' % filename, '(use `gprof-viz` to view)')
Example #4
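# Profiles 1000 lookup passes over a word trie with yep; create_trie() and
# words100k() are helpers from the host project's benchmark suite.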
def profile_yep():
    import yep

    trie = create_trie()
    WORDS = words100k()

    yep.start(b'output.prof')
    for x in range(1000):
        for word in WORDS:
            trie[word]
    yep.stop()
Example #5
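# Same harness as above, but profiles trie.keys() instead of the lookups.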
def profile_yep():
    import yep

    trie = create_trie()
    #WORDS = words100k()

    yep.start(b'output.prof')
    for x in range(100):
        trie.keys()
    #for x in range(1000):
    #    for word in WORDS:
    #        trie[word]
    yep.stop()
Example #6
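# Profiles Vietoris-Rips persistence on the GPU under the l1 metric; assumes
# torch, yep, time() and the project's pershom_backend are imported at module
# level.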
def vr_l1_persistence_performance_test():
    pc = torch.randn(20, 3)
    pc = pc.to(device='cuda', dtype=torch.float)  # avoids torch.tensor() copy warning

    max_dimension = 2
    max_ball_radius = 0

    yep.start('profiling_pershom/profile.google-pprof')
    time_start = time()
    res = pershom_backend.__C.VRCompCuda__vr_persistence(
        pc, max_dimension, max_ball_radius, 'l1')
    print(time() - time_start)
    yep.stop()
    print(res[0][0])
Example #7
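# Profiles repeated change-propagation parses under yep; Distribution,
# CPParser and a1 come from the host project.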
def profile_run(examples, grammar, maxlength, minlength, aggressive, seed):

    # TODO: localize the seed to just Distribution

    D = Distribution(examples=examples,
                     grammar=grammar,
                     maxlength=maxlength,
                     minlength=minlength,
                     aggressive=aggressive,
                     seed=seed)

    import yep
    yep.start()

    for i, (example, m) in enumerate(D.examples):
        print('Example: %s, length: %s' % (i, example.N))
        p = CPParser(a1, 'changeprop', D.grammar)
        p.initial_rollout(example, m)
        for [I, K] in example.nodes:
            p.change(I, K, 1 - m[I, K])

    yep.stop()
Example #8
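# Trains and evaluates a scikit-learn forest on the landsat data set,
# optionally profiling the (single-job) fit with yep.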
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    if dkey == "landsat":

        # TODO: Download file manually if needed (9,7GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=20000000)

    else:
        raise Exception("Unknown data set!")

    Xtrain, ytrain = traingen.get_all()
    Xtest, ytest = testgen.get_all()

    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])

    if param['tree_type'] == "randomized":
        from sklearn.ensemble import ExtraTreesClassifier as RF
    elif param['tree_type'] == "standard":
        from sklearn.ensemble import RandomForestClassifier as RF

    model = RF(n_estimators=param['n_estimators'],
               criterion="gini",
               max_features=param['max_features'],
               min_samples_split=2,
               n_jobs=param['n_jobs'],
               random_state=seed,
               bootstrap=param['bootstrap'],
               min_samples_leaf=1,
               max_depth=None,
               verbose=0)

    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    ypreds_train = model.predict(Xtrain)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:     %f" % results['training_time'])
    print("Testing time:      %f" % results['testing_time'])

    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
Example #9
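# End-to-end check of the CUDA persistence reduction against a toplex
# reference, with the reduction loop profiled by yep; assumes torch, yep,
# pickle, Counter, time() and the project's pershom_backend are imported at
# module level.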
def test():
    c = None

    use_cache = False

    if use_cache:
        random_simplicial_complex_path = './random_simplicial_complex.pickle'
        if pth.exists(random_simplicial_complex_path):
            with open(random_simplicial_complex_path, 'br') as f:
                c = pickle.load(f)
        else:
            c = random_simplicial_complex(100, 100, 100, 100, 100, 100)
            with open(random_simplicial_complex_path, 'bw') as f:
                pickle.dump(c, f)
    else:
        c = random_simplicial_complex(100, 100, 100, 100, 100)

    print('|C| = ', len(c))
    max_red_by_iteration = -1

    # cpu_impl = SortedListBoundaryMatrix(c)
    # cpu_impl.max_pairs = max_red_by_iteration
    bm, col_dim = descending_sorted_boundary_array_from_filtrated_sp(c)

    print(bm[-1])

    bm, col_dim = bm.to('cuda'), col_dim.to('cuda')

    barcodes_true = toplex_persistence_diagrams(c, list(range(len(c))))
    dgm_true = [
        Counter(((float(b), float(d)) for b, d in dgm))
        for dgm in barcodes_true
    ]

    def my_output_to_dgms(input):
        ret = []
        b, b_e = input

        for dim, (b_dim, b_dim_e) in enumerate(zip(b, b_e)):
            b_dim, b_dim_e = b_dim.float(), b_dim_e.float()

            tmp = torch.empty_like(b_dim_e)
            tmp.fill_(float('inf'))
            b_dim_e = torch.cat([b_dim_e, tmp], dim=1)

            dgm = torch.cat([b_dim, b_dim_e], dim=0)
            dgm = dgm.tolist()
            dgm = Counter(((float(b), float(d)) for b, d in dgm))

            ret.append(dgm)

        return ret

    # pr = cProfile.Profile()
    # pr.enable()

    ind_not_reduced = torch.tensor(list(range(
        col_dim.size(0)))).to('cuda').detach()
    ind_not_reduced = ind_not_reduced.masked_select(
        bm[:, 0] >= 0).long().detach()
    bm = bm.index_select(0, ind_not_reduced).detach()

    yep.start('profiling_pershom/profile.google-pprof')

    for i in range(10):
        time_start = time()
        output = pershom_backend.calculate_persistence(bm.clone(),
                                                       ind_not_reduced.clone(),
                                                       col_dim.clone(),
                                                       max(col_dim))
        print(time() - time_start)
    yep.stop()

    # pr.disable()
    # pr.dump_stats('high_level_profile.cProfile')

    print([[len(x) for x in y] for y in output])

    dgm_test = my_output_to_dgms(output)

    print('dgm_true lengths:', [len(dgm) for dgm in dgm_true])
    print('dgm_test lengths:', [len(dgm) for dgm in dgm_test])

    for dgm_test, dgm_true in zip(dgm_test, dgm_true):
        assert dgm_test == dgm_true
Example #10
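# Benchmarks cdifflib against the stdlib difflib on real or generated input;
# the -y flag wraps the run in yep to profile the C extension.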
def main():
    from optparse import OptionParser
    import time

    parser = OptionParser(description="Test the C version of difflib. Either "
                          "specify files, or leave empty for auto-generated "
                          "random lines",
                          usage="Usage: %prog [options] [file1 file2]")
    parser.add_option("-n",
                      "--niter",
                      dest="niter",
                      type="int",
                      help="num of iterations (default=%default)",
                      default=1)
    parser.add_option(
        "-l",
        "--lines",
        dest="lines",
        type="int",
        help="num of lines to generate if no files specified (default=%default)",
        default=20000)
    parser.add_option(
        "-d",
        "--diffs",
        dest="diffs",
        type="int",
        help="num of random lines to change if no files specified (default=%default)",
        default=200)
    parser.add_option("-p",
                      "--profile",
                      dest="profile",
                      default=False,
                      action="store_true",
                      help="run in the python profiler and print results")
    parser.add_option("-c",
                      "--compare",
                      dest="compare",
                      default=False,
                      action="store_true",
                      help="also run the non-c difflib to compare outputs")
    parser.add_option("-y",
                      "--yep",
                      dest="yep",
                      default=False,
                      action="store_true",
                      help="use yep to profile the c code")

    (opts, args) = parser.parse_args()

    start = int(time.time())

    if opts.niter < 1:
        parser.error("Need to do at least 1 iteration..")

    if args:
        if len(args) != 2:
            parser.error("Need exactly 2 files to compare.")
        try:
            print("Reading input files...")
            s1 = open(args[0]).readlines()
            s2 = open(args[1]).readlines()
        except (IOError, OSError):
            parser.error("Couldn't load input files %s and %s" %
                         (args[0], args[1]))
    else:
        print("Generating random similar streams...")
        s1, s2 = generate_similar_streams(opts.lines, opts.diffs)

    # shonky, but saves time..
    sys.path.append('build/lib.linux-x86_64-2.7/')
    sys.path.append('build/lib.linux-x86_64-2.7-pydebug/')
    sys.path.append('build/lib.macosx-10.6-intel-2.7')

    if opts.yep:
        import yep
        yep.start("cdifflib.prof")

    if opts.profile:
        import cProfile
        import pstats
        fn = "cdifflib_%d.prof" % start
        print("Profiling cdifflib.CSequenceMatcher...")
        cProfile.runctx("p(sm,a,b,n)", dict(p=profile_sequence_matcher),
                        dict(a=s1, b=s2, n=opts.niter, sm=CSequenceMatcher),
                        fn)
        print_stats(pstats.Stats(fn))

        if opts.compare:
            fn = "difflib_%d.prof" % start
            print("Profiling difflib.SequenceMatcher...")
            cProfile.runctx("p(sm,a,b,n)", dict(p=profile_sequence_matcher),
                            dict(a=s1, b=s2, n=opts.niter, sm=SequenceMatcher),
                            fn)
            print_stats(pstats.Stats(fn))

    else:
        print("Running cdifflib.CSequenceMatcher %d times..." % opts.niter)
        profile_sequence_matcher(CSequenceMatcher, s1, s2, opts.niter)
        if opts.compare:
            print("Running difflib.SequenceMatcher %d times..." % opts.niter)
            profile_sequence_matcher(SequenceMatcher, s1, s2, opts.niter)

    if opts.yep:
        yep.stop()
Example #11
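# Benchmarks HugeWoodClassifier on in-memory data generators; with
# profile=True the (single-job) fit runs under yep.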
def single_run(dkey, train_size, n_bottom, param, seed, profile=False):

    print(
        "Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(n_bottom), str(seed), str(param)))

    if dkey == "covtype":
        traingen, testgen = covtype_generators(train_size=train_size,
                                               store="mem",
                                               seed=seed)
    elif dkey == "higgs":
        traingen, testgen = higgs_generators(train_size=train_size,
                                             store="mem",
                                             seed=seed)
    elif dkey == "susy":
        traingen, testgen = susy_generators(train_size=train_size,
                                            store="mem",
                                            seed=seed)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        n_estimators=int(24 / n_bottom),
        n_estimators_bottom=int(n_bottom),
        n_top="auto",
        n_patterns_leaf=75000,
        balanced_top_tree=True,
        top_tree_lambda=1.0,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        store=MemoryStore(),
    )

    # training
    if profile:
        import yep
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    ypreds_train = model.predict(generator=traingen)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    evaluate(ypreds_train, traingen.get_all_target(), results, "training")
    evaluate(ypred_test, testgen.get_all_target(), results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         str(n_bottom), "hugewood_75K", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()

    time.sleep(1)
Example #12
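# Same harness for SubsetWoodClassifier, which trains on a per-data-set
# subset size before evaluating on the full generators.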
def single_run(dkey, train_size, param, seed, profile=False):

    print(
        "Processing data set %s with train_size %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(seed), str(param)))

    if dkey == "covtype":
        traingen, testgen = covtype_generators(train_size=train_size,
                                               store="mem",
                                               seed=seed)
        n_subset = 50000
    elif dkey == "higgs":
        traingen, testgen = higgs_generators(train_size=train_size,
                                             store="mem",
                                             seed=seed)
        n_subset = 500000
    elif dkey == "susy":
        traingen, testgen = susy_generators(train_size=train_size,
                                            store="mem",
                                            seed=seed)
        n_subset = 500000
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    model = SubsetWoodClassifier(n_estimators=param['n_estimators'],
                                 criterion="gini",
                                 max_features=param['max_features'],
                                 min_samples_split=2,
                                 n_jobs=param['n_jobs'],
                                 seed=seed,
                                 bootstrap=param['bootstrap'],
                                 tree_traversal_mode="dfs",
                                 tree_type=param['tree_type'],
                                 min_samples_leaf=1,
                                 float_type="double",
                                 max_depth=None,
                                 verbose=1,
                                 store=MemoryStore())

    # training
    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen, n_subset=n_subset)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    ypreds_train = model.predict(generator=traingen)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    evaluate(ypreds_train, traingen.get_all_target(), results, "training")
    evaluate(ypred_test, testgen.get_all_target(), results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "subsetwood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()

    time.sleep(1)
Example #13
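# Profiles 500 bag-of-little-bootstraps runs under yep after one warm-up run
# (ported from Python 2: range()/xrange() adjusted below).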
import blb, yep

class ProfBLB(blb.BLB):
    def __init__(self, **kwargs):
        self.compute_estimate = 'stdev'
        self.reduce_bootstraps = 'mean'
        self.average = 'mean'
        blb.BLB.__init__(self, **kwargs)

if __name__ == '__main__':
    data1 = list(range(10000))
    data2 = list(range(50000))
    data3 = list(range(100000))

    tester = ProfBLB()
    tester.run(data3)

    yep.start('cilk.prof')
    for i in range(500):
        tester.run(data3)
    yep.stop()
    
    
Example #14
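# SubsetWoodClassifier run on the landsat data set with a disk-backed store;
# also reports the label distributions of training, test and predicted data.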
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/subsetwood"

    if dkey == "landsat":

        # TODO: Download file manually if needed (9,7GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=20000000)
    
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])
    
    # set to top trees size
    n_subset = 500000

    model = SubsetWoodClassifier(
                n_estimators=param['n_estimators'],
                criterion="gini",
                max_features=param['max_features'],
                min_samples_split=2,
                n_jobs=param['n_jobs'],
                seed=seed,
                bootstrap=param['bootstrap'],
                tree_traversal_mode="dfs",
                tree_type=param['tree_type'],
                min_samples_leaf=1,
                float_type="double",
                max_depth=None,
                verbose=1,
                odir=tmp_dir,
                store=DiskStore())

    # training
    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen, n_subset=n_subset)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    
    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()
    
    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['subset'] = model.get_training_times()['subset']
    
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])
    
    print("Evaluating test error ...")

    ytest = testgen.get_all_target()            
    ytrain = traingen.get_all_target()            
    ytrain = ytrain.astype(numpy.int64)
    ytest = ytest.astype(numpy.int64)
    ypred_test = ypred_test.astype(numpy.int64)
    evaluate(ypred_test, ytest, results, "testing")

    print("Training distribution")
    print(numpy.bincount(ytrain))

    print("Test distribution")
    print(numpy.bincount(ytest))

    print("Predict distribution")
    print(numpy.bincount(ypred_test))
    
    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "subsetwood_" + str(n_subset), fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    
    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
Example #15
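# HugeWoodClassifier run on the large landsat training file with a
# disk-backed store; per-stage training times are collected from the model.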
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/hugewood"

    if dkey == "landsat":

        # TODO: Download file manually if needed (255GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        fname_train = "data/landsat_train.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=10000000)

    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=params.seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        n_estimators=param['n_estimators'],
        n_estimators_bottom=param['n_estimators_bottom'],
        n_top="auto",
        n_patterns_leaf="auto",
        balanced_top_tree=True,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=params.seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        odir=tmp_dir,
        store=DiskStore(),
    )

    # training
    if profile:
        import yep
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()
    if profile:
        yep.stop()

    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['top'] = model.get_training_times()['top']
    results['distribute'] = model.get_training_times()['distribute']
    results['bottom'] = model.get_training_times()['bottom']

    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "hugewood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
Example #16
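# scikit-learn forest baseline on the covtype/higgs/susy data sets; with
# profile=True the (single-job) fit runs under yep.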
def single_run(dkey, train_size, param, seed, profile=False):

    print(
        "Processing data set %s with train_size %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(seed), str(param)))

    if dkey == "covtype":
        Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size,
                                               seed=seed)
    elif dkey == "higgs":
        Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed)
    elif dkey == "susy":
        Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])

    if param['tree_type'] == "randomized":
        from sklearn.ensemble import ExtraTreesClassifier as RF
    elif param['tree_type'] == "standard":
        from sklearn.ensemble import RandomForestClassifier as RF

    model = RF(n_estimators=param['n_estimators'],
               criterion="gini",
               max_features=param['max_features'],
               min_samples_split=2,
               n_jobs=param['n_jobs'],
               random_state=seed,
               bootstrap=param['bootstrap'],
               min_samples_leaf=1,
               max_depth=None,
               verbose=0)

    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    ypreds_train = model.predict(Xtrain)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:     %f" % results['training_time'])
    print("Testing time:      %f" % results['testing_time'])

    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
Example #17
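    # Method fragment: profiles 100 iterations of a test method and a free
    # function under yep; assumes `import yep` at module level.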
    def run(self):  # takes about 5 seconds
        yep.start('yep.prof')
        for i in range(100):
            self.test_method()
            test_function()
        yep.stop()