Beispiel #1
0
 def calc_old_dist(cls, from_node):
     """Return the running total of ``common.score`` over the solution tail.

     The tail is ``cls.solution[from_node:]``; ``from_node % 10`` is forwarded
     to ``common.score`` as its ``offset`` keyword argument.
     """
     phase = from_node % 10
     step_scores = common.score(cls.nodes,
                                cls.solution[from_node:],
                                cls.prime,
                                offset=phase)
     return np.cumsum(step_scores)
Beispiel #2
0
def get_cluster_performance(labels, predictions, n_clusters, fold, seedval):
    """Build the performance record for one clustering configuration.

    Returns a dict with the keys 'fold', 'seed', 'score' (the value of
    ``common.score(labels, predictions)``) and 'n_clusters'.
    """
    record = dict(fold=fold, seed=seedval)
    record['score'] = common.score(labels, predictions)
    record['n_clusters'] = n_clusters
    return record
Beispiel #3
0
def select_candidate_enhanced(train_df, train_labels, best_classifiers, ensemble, i):
    """Choose the next classifier to add to ``ensemble``.

    While the ensemble holds fewer than the module-level
    ``initial_ensemble_size`` members, the ``i``-th entry of
    ``best_classifiers.index`` is returned. Afterwards a random sample of at
    most ``max_candidates`` classifiers is drawn without replacement, each is
    scored by averaging it with the current ensemble columns of ``train_df``,
    and the one selected by ``common.argbest`` is returned.
    """
    if len(ensemble) < initial_ensemble_size:
        return best_classifiers.index.values[i]

    sample_size = min(max_candidates, len(best_classifiers))
    candidates = choice(best_classifiers.index.values, sample_size, replace = False)
    candidate_scores = []
    for candidate in candidates:
        blended = train_df[ensemble + [candidate]].mean(axis = 1)
        candidate_scores.append(common.score(train_labels, blended))
    return candidates[common.argbest(candidate_scores)]
Beispiel #4
0
def select_candidate_enhanced(train_df, train_labels, best_classifiers, ensemble, i):
    """Choose the next classifier to add to ``ensemble``.

    The first two picks simply follow the order of ``best_classifiers.index``.
    After that, up to 50 classifiers are sampled without replacement, each is
    scored together with the current ensemble (row-wise mean of the selected
    ``train_df`` columns against ``train_labels``), and the entry chosen by
    ``common.argbest`` is returned.
    """
    warmup_size = 2
    pool_limit = 50
    if len(ensemble) < warmup_size:
        return best_classifiers.index.values[i]

    def ensemble_score(candidate):
        # Score of the current ensemble extended by this one candidate.
        return common.score(train_labels, train_df[ensemble + [candidate]].mean(axis = 1))

    pool = choice(best_classifiers.index.values, min(pool_limit, len(best_classifiers)), replace = False)
    pool_scores = list(map(ensemble_score, pool))
    return pool[common.argbest(pool_scores)]
Beispiel #5
0
 def calc_reverse(cls, left):
     """Return one row of reversed cumulative scores per offset 0..9.

     The segment ``cls.solution[left:-1]`` is reversed and scored with
     ``common.score`` for every offset in ``range(10)``. Each score vector is
     reversed, cumulatively summed and reversed again; the ten results are
     stacked into a single array.
     """
     flipped = cls.solution[left:-1][::-1]

     def reversed_running_total(offset):
         step_scores = common.score(cls.nodes, flipped, cls.prime, offset)
         return np.cumsum(step_scores[::-1])[::-1]

     return np.array([reversed_running_total(offset) for offset in range(10)])
Beispiel #6
0
def get_performance(df, ensemble, fold, seedval):
    """Score the row-wise mean of the ``ensemble`` columns of ``df``.

    Labels are read from the 'label' level of ``df.index``. The returned dict
    has the keys 'fold', 'seed', 'score', 'ensemble' (the most recently added
    member, ``ensemble[-1]``) and 'ensemble_size'.
    """
    truth = df.index.get_level_values('label').values
    mean_prediction = df[ensemble].mean(axis=1)
    record = {'fold': fold, 'seed': seedval}
    record['score'] = common.score(truth, mean_prediction)
    record['ensemble'] = ensemble[-1]
    record['ensemble_size'] = len(ensemble)
    return record
Beispiel #7
0
def select_candidate_drep(train_df, train_labels, best_classifiers, ensemble, i):
    """Choose the next ensemble member: diversity pre-filter, then accuracy.

    While ``ensemble`` holds fewer than the module-level
    ``initial_ensemble_size`` members, the ``i``-th entry of
    ``best_classifiers.index`` is returned.

    Otherwise:
      1. sample at most ``max_candidates`` classifiers without replacement;
      2. rank them by ``abs(common.diversity_score(...))`` of the ensemble
         extended by the candidate (ascending, via ``argsort``);
      3. keep the first ``max_diversity_candidates`` of that ranking;
      4. score each kept candidate with ``common.score`` on the row-wise mean
         of the extended ensemble and return the one ``common.argbest`` picks.
    """
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values, min(max_candidates, len(best_classifiers)), replace = False)
        candidate_diversity_scores = [abs(common.diversity_score(train_df[ensemble + [candidate]].values)) for candidate in candidates]
        candidate_diversity_ranks = array(candidate_diversity_scores).argsort()
        diversity_candidates = candidates[candidate_diversity_ranks[:max_diversity_candidates]]
        candidate_accuracy_scores = [common.score(train_labels, train_df[ensemble + [candidate]].mean(axis = 1)) for candidate in diversity_candidates]
        # The accuracy scores are aligned with diversity_candidates, so the
        # winning position must be looked up there. Indexing the unsorted
        # `candidates` sample returned a classifier that was never scored.
        best_candidate = diversity_candidates[common.argbest(candidate_accuracy_scores)]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
Beispiel #8
0
def solve(enc):
    """Recover a per-position single-byte XOR key for the ciphertexts in ``enc``.

    All ciphertexts are truncated to the length of the shortest one. For each
    position, the characters at that position are joined into one string, all
    256 byte values are tried as key via ``xor_str`` and rated with ``score``;
    the first byte value with the highest rating becomes that key character.
    Returns the key as a string.
    """
    width = min(len(row) for row in enc)
    rows = [row[:width] for row in enc]

    key_chars = []
    for pos in range(width):
        column = ''.join(row[pos] for row in rows)
        ratings = [score(xor_str(column, cycle(chr(k)))) for k in range(256)]
        best_byte = max(range(256), key=ratings.__getitem__)
        key_chars.append(chr(best_byte))

    return ''.join(key_chars)
Beispiel #9
0
def selection(fold):
    """Grow an ensemble on one fold and return its test predictions.

    Calls ``seed(seedval)``, reads the fold via ``common.read_fold``, ranks
    the columns of the training frame by ``common.score`` and then adds one
    classifier per round (chosen by ``select_candidate``) for at most
    ``max_ensemble_size`` rounds, recording train and test performance after
    every addition.

    Returns a tuple ``(predictions, test_performance_frame)``.

    NOTE(review): the final slice ends at ``best_ensemble_size + 1`` while
    'ensemble_size' is recorded as ``len(ensemble)``; this may include one
    classifier more than the best size - confirm it is intended.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    classifier_scores = train_df.apply(lambda column: common.score(train_labels, column))
    best_classifiers = classifier_scores.order(ascending = not common.greater_is_better)

    ensemble = []
    train_performance, test_performance = [], []
    rounds = min(max_ensemble_size, len(best_classifiers))
    for i in range(rounds):
        ensemble.append(select_candidate(train_df, train_labels, best_classifiers, ensemble, i))
        train_performance.append(get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(get_performance(test_df, ensemble, fold, seedval))

    train_performance_df = DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size + 1]
    predictions = get_predictions(test_df, best_ensemble, fold, seedval)
    return predictions, DataFrame.from_records(test_performance)
Beispiel #10
0
def selection(fold):
    """Run ensemble selection on ``fold``.

    After ``seed(seedval)`` the fold is loaded with ``common.read_fold`` and
    the training columns are ordered by their ``common.score``. One classifier
    returned by ``select_candidate`` is appended per round (bounded by
    ``max_ensemble_size`` and the number of classifiers), and
    ``get_performance`` is recorded for the train and test frames each round.

    Returns ``(get_predictions(...), DataFrame of test performance records)``.

    NOTE(review): ``best_ensemble`` is sliced up to ``best_ensemble_size + 1``
    although 'ensemble_size' stores ``len(ensemble)`` - verify this is not an
    off-by-one.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)

    def score_against_labels(column):
        return common.score(train_labels, column)

    best_classifiers = train_df.apply(score_against_labels).order(
        ascending=not common.greater_is_better)

    ensemble = []
    history = {'train': [], 'test': []}
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        chosen = select_candidate(train_df, train_labels, best_classifiers,
                                  ensemble, i)
        ensemble.append(chosen)
        for split_name, split_df in (('train', train_df), ('test', test_df)):
            history[split_name].append(
                get_performance(split_df, ensemble, fold, seedval))

    train_performance_df = DataFrame.from_records(history['train'])
    best_ensemble_size = common.get_best_performer(
        train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size + 1]
    test_predictions = get_predictions(test_df, best_ensemble, fold, seedval)
    return test_predictions, DataFrame.from_records(history['test'])
Beispiel #11
0
def select_candidate_drep(train_df, train_labels, best_classifiers, ensemble,
                          i):
    """Choose the next ensemble member: diversity pre-filter, then accuracy.

    While ``ensemble`` holds fewer than the module-level
    ``initial_ensemble_size`` members, the ``i``-th entry of
    ``best_classifiers.index`` is returned.

    Otherwise at most ``max_candidates`` classifiers are sampled without
    replacement and ranked (ascending ``argsort``) by the absolute
    ``common.diversity_score`` of the ensemble extended by each candidate.
    The first ``max_diversity_candidates`` of that ranking are then scored
    with ``common.score`` and the one picked by ``common.argbest`` is
    returned.
    """
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values,
                            min(max_candidates, len(best_classifiers)),
                            replace=False)
        candidate_diversity_scores = [
            abs(common.diversity_score(train_df[ensemble +
                                                [candidate]].values))
            for candidate in candidates
        ]
        candidate_diversity_ranks = array(candidate_diversity_scores).argsort()
        diversity_candidates = candidates[
            candidate_diversity_ranks[:max_diversity_candidates]]
        candidate_accuracy_scores = [
            common.score(train_labels,
                         train_df[ensemble + [candidate]].mean(axis=1))
            for candidate in diversity_candidates
        ]
        # The accuracy scores line up with diversity_candidates, so the
        # winning position has to be resolved against that array. Indexing
        # the unsorted `candidates` sample picked an unrelated classifier.
        best_candidate = diversity_candidates[common.argbest(
            candidate_accuracy_scores)]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
Beispiel #12
0

def norm_dist(keysize):
    """Return the normalised Hamming distance between adjacent blocks of ``data``.

    The module-level ``data`` is cut into consecutive blocks of ``keysize``
    items; ``hamming`` is summed over every pair of neighbouring blocks and
    the total is divided by the block count and by ``keysize``.

    Raises ZeroDivisionError when ``data`` is shorter than ``keysize``.

    NOTE(review): ``numblocks - 1`` pairs are summed but the total is divided
    by ``numblocks``; kept as-is so existing keysize rankings do not change.
    """
    # Floor division keeps the block count an integer even when `/` is true
    # division (Python 3 or `from __future__ import division`); range() below
    # would otherwise receive a float.
    numblocks = len(data) // keysize
    blocksum = 0
    for i in range(numblocks - 1):
        a = data[i * keysize:(i + 1) * keysize]
        b = data[(i + 1) * keysize:(i + 2) * keysize]
        blocksum += hamming(a, b)
    blocksum /= float(numblocks)
    blocksum /= float(keysize)
    return blocksum


# Pick the key length in 2..39 with the smallest normalised block distance.
keysize = min(range(2, 40), key=norm_dist)
print "Decided keysize = ", keysize

key = [None] * keysize

# Solve each key byte independently: every keysize-th item of `data`, starting
# at position i, is rated with `score` after applying `xor` with each byte
# value 0..255.
for i in range(keysize):
    d = data[i::keysize]
    key[i] = max(range(256), key=lambda k: score(xor(d, k)))

# Turn the list of byte values into a string key.
key = ''.join(map(chr, key))

print "Decided key = ", repr(key)

print "Decoded data below"
print
# XOR the data with the key repeated cyclically.
print ''.join(chr(ord(a) ^ ord(b)) for a, b in zip(data, cycle(key)))
Beispiel #13
0
    # Upper bound passed to the generator as its second argument.
    MAXV = 10**9

    # Recreate ./tmp from scratch.
    # NOTE(review): the bare except hides every rmtree failure (including a
    # missing directory); if the directory survives, os.mkdir below raises.
    try:
        shutil.rmtree('./tmp', ignore_errors=False)
    except:
        pass
    os.mkdir('./tmp')

    for test in range(1, TESTS + 1):
        print("Case #%d: " % test, end="")

        # Generate a random input; the generator's stdout becomes the test input.
        # NOTE(review): files are addressed by TEST_IDX, not by the loop
        # variable `test`, and the opened file object is never closed explicitly.
        N = random.randint(2, MAXN)
        subprocess.check_call(
            ["./" + gen_name(), str(N), str(MAXV)],
            stdout=open(temp_in(TEST_IDX), "w"))

        # Run every source on the input; abort on the first failing run.
        for source in Sources:
            if run_source(source, TEST_IDX, True) == False:
                print("Runtime error on %s" % source)
                exit(1)

        # The output of the first source is taken as the expected answer.
        subprocess.check_call(
            ["cp", temp_out(Sources[0], TEST_IDX),
             temp_ok(TEST_IDX)])

        # Every source must get exactly 5 from score(); otherwise stop.
        for source in Sources:
            if score(source, TEST_IDX) != 5:
                print("Wrong answer: %s" % source)
                exit(1)
        print("OK")
Beispiel #14
0
# Default stacker is a shallow random forest; 'linear' as the third
# command-line argument switches to an SGD classifier with log loss.
stacker = RandomForestClassifier(n_estimators=200,
                                 max_depth=2,
                                 bootstrap=False,
                                 random_state=0)
if len(argv) > 3 and argv[3] == 'linear':
    stacker = SGDClassifier(loss='log', n_iter=50, random_state=0)

predictions_dfs = []
performance_dfs = []
# A single seed for the 'greedy' method, ten seeds (0..9) for any other.
seeds = [0] if method == 'greedy' else range(10)
for seedval in seeds:
    # Run the selected method on every fold in parallel; each call yields a
    # (predictions frame, performance frame) pair.
    results = Parallel(n_jobs=-1, verbose=1)(delayed(method_function)(fold)
                                             for fold in range(fold_count))
    for predictions_df, performance_df in results:
        predictions_dfs.append(predictions_df)
        performance_dfs.append(performance_df)
# Per-iteration performance of all folds and seeds goes to one CSV file.
performance_df = concat(performance_dfs)
performance_df.to_csv('%s/analysis/selection-%s-%s-iterations.csv' %
                      (path, method, common.score.__name__),
                      index=False)
# Final predictions, tagged with the method and the metric name.
predictions_df = concat(predictions_dfs)
predictions_df['method'] = method
predictions_df['metric'] = common.score.__name__
predictions_df.to_csv('%s/analysis/selection-%s-%s.csv' %
                      (path, method, common.score.__name__),
                      index=False)
# Print the mean per-(fold, seed) score and the mean ensemble size.
print '%.3f %i' % (predictions_df.groupby([
    'fold', 'seed'
]).apply(lambda x: common.score(x.label, x.prediction)).mean(),
                   predictions_df.ensemble_size.mean())
Beispiel #15
0
    # Limits for the generated tests are read interactively.
    MAXPART = int(input("MAXPART = "))
    MAXV = int(input("MAXV = "))
    # Recreate ./tmp from scratch.
    # NOTE(review): the bare except hides every rmtree failure; if the
    # directory survives, os.mkdir below raises.
    try:
        shutil.rmtree('./tmp', ignore_errors=False)
    except:
        pass
    os.mkdir('./tmp')

    for test in range(1, TESTS + 1):
        print("Case #%d: " % test, end="")

        # Draw random parameters; K, X and Y are capped by N, PART by X and Y.
        N = random.randint(1, MAXN)
        K = random.randint(1, min(N, MAXK))
        X = random.randint(1, min(MAXX, N))
        Y = random.randint(1, min(MAXY, N))
        PART = random.randint(1, min(X, Y, MAXPART))
        # The generator's stdout becomes the test input; its last argument is a
        # random 32-bit value.
        # NOTE(review): the opened file object is never closed explicitly.
        subprocess.check_call(["./gen", str(N), str(K), str(X), str(Y), str(PART), str(MAXV), str(random.randint(0, 2**32 - 1))], stdout=open("./tmp/"+problem_name()+".in", "w"))

        # Run every source; abort on the first failing run.
        for source in Sources:
            if run_source(source) == False:
                print("Runtime error on %s" % source)
                exit(1)

        # The output of the first source is taken as the expected answer.
        subprocess.check_call(["cp", "./tmp/%s.out" % Sources[0], "./tmp/"+problem_name()+".ok"])

        # Every source must get exactly 10 from score(); otherwise stop.
        for source in Sources:
            if score(source) != 10:
                print("Wrong answer: %s" % source)
                exit(1)
        print("OK")
Beispiel #16
0
    def solve(cls,
              nodes,
              solution,
              prime,
              prime_set,
              run_through=None,
              max_iter=5,
              start_at=1):
        """Repeatedly scan the tour and apply every swap with a negative saving.

        Args:
            nodes: node data; indexed with ``solution`` and passed to
                ``common.score``.
            solution: visiting order; copied first, the caller's object is
                not rebound.
            prime, prime_set: stored on ``cls`` for the helper methods.
            run_through: exclusive upper bound for the scan position
                (defaults to ``len(nodes) - 1``).
            max_iter: maximum number of full scans.
            start_at: forwarded to ``cls.calc_basis`` at the start of each
                scan. The scan itself always starts at position 1.

        Returns:
            The working copy of ``solution``.

        Side effects: stores its inputs on ``cls``, writes ``asd.csv`` in the
        working directory and prints progress.

        NOTE(review): presumably ``cls.excute_swap`` mutates ``cls.solution``
        in place, which is the same object that is returned - confirm.
        """
        solution = solution.copy()

        i = 1
        # Identity check: `== None` would be evaluated element-wise if an
        # array were passed.
        if run_through is None:
            run_through = len(nodes) - 1

        # NOTE(review): the solution-ordered nodes are round-tripped through
        # asd.csv; the purpose is not evident here - confirm before removing.
        asd = nodes[solution]
        df = pd.DataFrame(asd)
        df.to_csv('asd.csv')
        solution_nodes = pd.read_csv('asd.csv').iloc[:, 1:].values

        cls.nodes = nodes
        cls.solution = solution
        cls.prime = prime
        cls.prime_set = prime_set
        cls.solution_nodes = solution_nodes

        while i <= max_iter:
            print('Start iteration %s' % i)

            iter_start = time.time()
            best = []
            did = 0

            reverse_dist, old_dist, roller = cls.calc_basis(start_at)
            left = 1
            while left >= 1 and left < run_through:
                node_before_a, node_a = solution[left - 1:left + 1]

                middle, reverse_dist = cls.calc_middle(reverse_dist, left)
                to_d, from_a, roller = cls.calc_connector(
                    roller, left, node_before_a, node_a)
                place, saving, old_dist = cls.calc_save(
                    from_a, to_d, middle, old_dist)

                if saving < 0:
                    best.append(saving)
                    did += saving

                    # execute swap
                    cls.excute_swap(left, place)
                    print('a: %s | saving: %.3f | cumsave: %.3f' %
                          (left, saving, did))

                    # Rebuild the cached distances from the current position
                    # and examine the same position again after the swap.
                    reverse_dist, old_dist, roller = cls.calc_basis(left)
                    continue
                left += 1
            print('iter %s done, total swap %s, saves %.3f, time %.1f' %
                  (i, len(best), did, time.time() - iter_start))
            print(common.score(nodes, solution, prime).sum())
            print('####################################')
            i += 1
            # A scan without any swap means nothing is left to improve.
            if len(best) == 0:
                print('zero')
                break
        return solution
Beispiel #17
0
    # Limit for K2 is read interactively.
    MAXK2 = int(input("MAXK2 = "))

    # Recreate ./tmp from scratch.
    # NOTE(review): the bare except hides every rmtree failure; if the
    # directory survives, os.mkdir below raises.
    try:
        shutil.rmtree('./tmp', ignore_errors=False)
    except:
        pass
    os.mkdir('./tmp')

    for test in range(1, TESTS + 1):
        print("Case #%d" % test, end="")

        # Random parameters, with K1 and K2 capped by N.
        N = random.randint(1, MAXN)
        K1 = random.randint(1, min(MAXK1, N))
        K2 = random.randint(1, min(MAXK2, N))
        # The last test always uses the maximum values.
        if test == TESTS:
            N, K1, K2 = MAXN, MAXK1, MAXK2
        # The generator's stdout becomes the test input.
        # NOTE(review): the opened file object is never closed explicitly.
        subprocess.check_call(["./gen", str(N), str(K1), str(K2)], stdout=open("./tmp/"+problem_name()+".in", "w"))
        print("(N = %d, K1 = %d, K2 = %d): " % (N, K1, K2), end="")
        # Run every source; abort on the first failing run.
        for source in Sources:
            if run_source(source) == False:
                print("Runtime error on %s" % source)
                exit(1)

        # The output of the first source is taken as the expected answer.
        subprocess.check_call(["cp", "./tmp/%s.out" % Sources[0], "./tmp/"+problem_name()+".ok"])

        # Every source must get exactly 10 from score(); otherwise stop.
        for source in Sources:
            if score(source) != 10:
                print("Wrong answer: %s" % source)
                exit(1)
        print("OK")
Beispiel #18
0
from common import score


def decode(enc, k):
    """Return ``enc`` with every character XOR-ed with the integer ``k``."""
    decoded_chars = []
    for ch in enc:
        decoded_chars.append(chr(ord(ch) ^ k))
    return ''.join(decoded_chars)

# Hex-encoded ciphertext, converted to a raw byte string (Python 2 str.decode).
enc = '1b37373331363f78151b7f2b783431333d78397828372d363c78373e783a393b3736'.decode('hex')
# Try all 256 single-byte keys and keep the one whose decoding `score` rates highest.
key = max(range(256), key=lambda k: score(decode(enc, k)))

print "Key: ", key
print "Decoded: ", decode(enc, key)
Beispiel #19
0
from common import score
from tqdm import tqdm  # For progress bar


def decode(enc, k):
    """XOR each character code of ``enc`` with ``k`` and join the results."""
    return ''.join(map(lambda ch: chr(ord(ch) ^ k), enc))

# Read whitespace-separated hex strings and convert each to a raw byte string
# (Python 2 str.decode).
with open('data/4.txt', 'r') as f:
    data = f.read().split()
    data = [d.decode('hex') for d in data]

# For every ciphertext, the single-byte key whose decoding `score` rates highest.
keys = [max(range(256), key=lambda k: score(decode(e, k))) for e in tqdm(data)]
decs = [decode(e, k) for e, k in zip(data, keys)]

# Among the per-ciphertext best decodings, print the highest-rated one.
best = max(decs, key=score)

print best
Beispiel #20
0
        test_df = common.unbag(test_df, bag_count)
    test_predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    return DataFrame({'fold': fold, 'id': test_df.index.get_level_values('id'), 'label': test_labels, 'prediction': test_predictions, 'diversity': common.diversity_score(test_df.values)})


# Command line: argv[1] = project directory, argv[2] = method,
# optional argv[3] = 'linear'.
path = abspath(argv[1])
assert exists(path)
# Results are written below <path>/analysis, created on first use.
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
method = argv[2]
assert method in ['aggregate', 'standard']
p = common.load_properties(path)
fold_count = int(p['foldCount'])
bag_count = int(p['bagCount'])

# use non-negative least squares for regression
if 'predictClassValue' not in p:
    stacker = NNLS()
else:
    # use linear stacker if requested, else use shallow non-linear stacker
    if len(argv) > 3 and argv[3] == 'linear':
        stacker = SGDClassifier(loss = 'log', n_iter = 50, random_state = 0)
    else:
        stacker = RandomForestClassifier(n_estimators = 200, max_depth = 2, bootstrap = False, random_state = 0)

# Run stacked_generalization on every fold in parallel and merge the results.
predictions_dfs = Parallel(n_jobs = -1, verbose = 1)(delayed(stacked_generalization)(fold) for fold in range(fold_count))
predictions_df = concat(predictions_dfs)
predictions_df['method'] = method
predictions_df.to_csv('%s/analysis/stacking-%s.csv' % (path, method), index = False)
# Print the mean of the per-fold scores.
print '%.3f' % predictions_df.groupby('fold').apply(lambda x: common.score(x.label, x.prediction)).mean()
Beispiel #21
0
    method_function = selection
p = common.load_properties(path)
fold_count = int(p['foldCount'])
# Module-level settings.
# NOTE(review): presumably read as globals by the selection functions - confirm.
initial_ensemble_size = 2
max_ensemble_size = 50
max_candidates = 50
max_diversity_candidates = 5
accuracy_weight = 0.5
max_clusters = 20

# use shallow non-linear stacker by default
stacker = RandomForestClassifier(n_estimators = 200, max_depth = 2, bootstrap = False, random_state = 0)
if len(argv) > 3 and argv[3] == 'linear':
    stacker = SGDClassifier(loss = 'log', n_iter = 50, random_state = 0)

predictions_dfs = []
performance_dfs = []
# A single seed for the 'greedy' method, ten seeds (0..9) for any other.
seeds = [0] if method == 'greedy' else range(10)
for seedval in seeds:
    # Run the selected method on every fold in parallel; each call yields a
    # (predictions frame, performance frame) pair.
    results = Parallel(n_jobs = -1, verbose = 1)(delayed(method_function)(fold) for fold in range(fold_count))
    for predictions_df, performance_df in results:
        predictions_dfs.append(predictions_df)
        performance_dfs.append(performance_df)
# Per-iteration performance of all folds and seeds goes to one CSV file.
performance_df = concat(performance_dfs)
performance_df.to_csv('%s/analysis/selection-%s-%s-iterations.csv' % (path, method, common.score.__name__), index = False)
# Final predictions, tagged with the method and the metric name.
predictions_df = concat(predictions_dfs)
predictions_df['method'] = method
predictions_df['metric'] = common.score.__name__
predictions_df.to_csv('%s/analysis/selection-%s-%s.csv' % (path, method, common.score.__name__), index = False)
# Print the mean per-(fold, seed) score and the mean ensemble size.
print '%.3f %i' % (predictions_df.groupby(['fold', 'seed']).apply(lambda x: common.score(x.label, x.prediction)).mean(), predictions_df.ensemble_size.mean())
Beispiel #22
0
def get_performance(df, ensemble, fold, seedval):
    """Return the performance record of ``ensemble`` on ``df``.

    The prediction is the row-wise mean of the ``ensemble`` columns and is
    scored against the 'label' level of ``df.index``. The record also holds
    the fold, the seed, the last member added and the ensemble size.
    """
    truth = df.index.get_level_values('label').values
    averaged = df[ensemble].mean(axis = 1)
    return dict(
        fold = fold,
        seed = seedval,
        score = common.score(truth, averaged),
        ensemble = ensemble[-1],
        ensemble_size = len(ensemble),
    )
Beispiel #23
0
def get_cluster_performance(labels, predictions, n_clusters, fold, seedval):
    """Return a dict with fold, seed, ``common.score(labels, predictions)`` and n_clusters."""
    cluster_score = common.score(labels, predictions)
    return dict(fold = fold, seed = seedval, score = cluster_score, n_clusters = n_clusters)
Beispiel #24
0
    # Recreate ./tmp from scratch.
    # NOTE(review): the bare except hides every rmtree failure; if the
    # directory survives, os.mkdir below raises.
    try:
        shutil.rmtree('./tmp', ignore_errors=False)
    except:
        pass
    os.mkdir('./tmp')

    for test in range(1, TESTS + 1):
        print("Case #%d: " % test, end="")
        

        N = random.randint(3, MAXN)


        # The generator's stdout becomes the test input; its last argument is a
        # random 32-bit value.
        # NOTE(review): the opened file object is never closed explicitly.
        subprocess.check_call(["./gen", str(N), str(MAXV), str(REALMAXV), str(random.randint(0, 2**32 - 1))], stdout=open("./tmp/"+problem_name()+".in", "w"))



        # Run every source; abort on the first failing run.
        for source in Sources:
            if run_source(source) == False:
                print("Runtime error on %s" % source)
                exit(1)

        # The output of the first source is taken as the expected answer.
        subprocess.check_call(["cp", "./tmp/%s.out" % Sources[0], "./tmp/"+problem_name()+".ok"])

        # Every source must get exactly 5 from score(); otherwise stop.
        for source in Sources:
            if score(source) != 5:
                print("Wrong answer: %s" % source)
                exit(1)
        print("OK")
Beispiel #25
0
        # Progress marker: one dot per test, flushed immediately.
        print(".", end="")
        stdout.flush()
        # Copy this test's input and expected output from ./teste into ./tmp.
        subprocess.check_call([
            "cp", "./teste/" + str(test) + "-" + problem_name() + ".in",
            "./tmp/" + problem_name() + ".in"
        ])
        subprocess.check_call([
            "cp", "./teste/" + str(test) + "-" + problem_name() + ".ok",
            "./tmp/" + problem_name() + ".ok"
        ])

        for source in SOURCES:
            # A failed run is recorded as 'X' and adds nothing to the total.
            if run_source(source) == False:
                RESULTS[source] += ['X']
            else:
                # Put the source's output under the problem's output name
                # before scoring it.
                subprocess.check_call([
                    "cp", "./tmp/" + source + ".out",
                    "./tmp/" + problem_name() + ".out"
                ])
                s = score(source)
                RESULTS[source] += [str(s)]
                SCORES[source] += s
    print()
    # Convert both dicts into row lists for tabulate.
    SCORES = [[name, score] for name, score in SCORES.items()]
    RESULTS = [[name] + results for name, results in RESULTS.items()]

    # First table: total per source; second table: one column per test.
    print(tabulate.tabulate(SCORES, ["Nume", "Scor"], tablefmt="grid"))
    print(
        tabulate.tabulate(RESULTS, ["Nume"] + list(range(len(TESTS))),
                          tablefmt="grid"))
Beispiel #26
0
    ]
    # Per-source running total and per-source list of per-test results.
    SCORES = {source: 0 for source in SOURCES}
    RESULTS = {source: [] for source in SOURCES}
    for test in range(len(TESTS)):
        # Progress marker: one dot per test, flushed immediately.
        print(".", end="")
        stdout.flush()
        # Copy this test's input and expected output from ./teste to the
        # paths given by temp_in / temp_ok.
        subprocess.check_call([
            "cp", "./teste/" + str(test) + "-" + problem_name() + ".in",
            temp_in(test)
        ])
        subprocess.check_call([
            "cp", "./teste/" + str(test) + "-" + problem_name() + ".ok",
            temp_ok(test)
        ])

        for source in SOURCES:
            # A failed run is recorded as 'X' and adds nothing to the total.
            if run_source(source, test) == False:
                RESULTS[source] += ['X']
            else:
                s = score(source, test)
                RESULTS[source] += [str(s)]
                SCORES[source] += s
    print()
    # Convert both dicts into row lists for tabulate.
    SCORES = [[name, score] for name, score in SCORES.items()]
    RESULTS = [[name] + results for name, results in RESULTS.items()]

    # First table: total per source; second table: one column per test.
    print(tabulate.tabulate(SCORES, ["Nume", "Scor"], tablefmt="grid"))
    print(
        tabulate.tabulate(RESULTS, ["Nume"] + list(range(len(TESTS))),
                          tablefmt="grid"))
Beispiel #27
0
from glob import glob
from os.path import abspath, exists
from sys import argv

from pandas import DataFrame, concat, read_csv
import common

from warnings import filterwarnings

# Silence deprecation warnings for the whole script.
filterwarnings('ignore', category=DeprecationWarning)

# argv[1] is the project directory; it must exist.
path = abspath(argv[1])
assert exists(path)

scores = []
# One directory per base classifier, each holding gzipped prediction files.
for dirname in glob('%s/weka.classifiers.*' % path):
    filenames = glob('%s/predictions-*.csv.gz' % dirname)
    # Merge all prediction files of this classifier; the first line of each
    # file is skipped and the first two columns form the index.
    predictions_df = concat([
        read_csv(filename, index_col=[0, 1], skiprows=1, compression='gzip')
        for filename in filenames
    ])
    # Score the predictions against the 'label' index level.
    score = common.score(
        predictions_df.index.get_level_values('label').values,
        predictions_df.prediction)
    # The last path component serves as the classifier name.
    scores.append([dirname.split('/')[-1], score])
# Print the table of classifiers sorted by score; the sort direction follows
# common.greater_is_better.
print DataFrame(scores, columns=['classifier',
                                 'score']).set_index('classifier').sort(
                                     'score',
                                     ascending=not common.greater_is_better)