def calc_old_dist(cls, from_node):
    offset = from_node % 10
    return np.cumsum(
        common.score(cls.nodes, cls.solution[from_node:], cls.prime, offset=offset))
def get_cluster_performance(labels, predictions, n_clusters, fold, seedval):
    return {
        'fold': fold,
        'seed': seedval,
        'score': common.score(labels, predictions),
        'n_clusters': n_clusters
    }
def select_candidate_enhanced(train_df, train_labels, best_classifiers, ensemble, i):
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values,
                            min(max_candidates, len(best_classifiers)),
                            replace=False)
        candidate_scores = [
            common.score(train_labels, train_df[ensemble + [candidate]].mean(axis=1))
            for candidate in candidates
        ]
        best_candidate = candidates[common.argbest(candidate_scores)]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
def select_candidate_enhanced(train_df, train_labels, best_classifiers, ensemble, i):
    initial_ensemble_size = 2
    max_candidates = 50
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values,
                            min(max_candidates, len(best_classifiers)),
                            replace=False)
        candidate_scores = [
            common.score(train_labels, train_df[ensemble + [candidate]].mean(axis=1))
            for candidate in candidates
        ]
        best_candidate = candidates[common.argbest(candidate_scores)]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
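# `common.argbest` and `common.greater_is_better` are used throughout the selection
# snippets but are not shown here. A minimal hypothetical sketch, assuming
# `greater_is_better` simply flags whether the configured metric is maximized
# (e.g. AUC) or minimized (e.g. RMSE):
import numpy as np

greater_is_better = True  # assumption; depends on the metric configured in common

def argbest(scores):
    # Index of the best score under the configured metric direction.
    return int(np.argmax(scores)) if greater_is_better else int(np.argmin(scores))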
def calc_reverse(cls, left):
    r_solution = cls.solution[left:-1][::-1]
    reverse_dist = []
    for offset in range(10):
        reverse_dist.append(
            np.cumsum(
                common.score(cls.nodes, r_solution, cls.prime, offset)[::-1])[::-1])
    reverse_dist = np.array(reverse_dist)
    return reverse_dist
def get_performance(df, ensemble, fold, seedval):
    labels = df.index.get_level_values('label').values
    predictions = df[ensemble].mean(axis=1)
    return {
        'fold': fold,
        'seed': seedval,
        'score': common.score(labels, predictions),
        'ensemble': ensemble[-1],
        'ensemble_size': len(ensemble)
    }
def select_candidate_drep(train_df, train_labels, best_classifiers, ensemble, i):
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values,
                            min(max_candidates, len(best_classifiers)),
                            replace=False)
        candidate_diversity_scores = [
            abs(common.diversity_score(train_df[ensemble + [candidate]].values))
            for candidate in candidates
        ]
        candidate_diversity_ranks = array(candidate_diversity_scores).argsort()
        diversity_candidates = candidates[candidate_diversity_ranks[:max_diversity_candidates]]
        candidate_accuracy_scores = [
            common.score(train_labels, train_df[ensemble + [candidate]].mean(axis=1))
            for candidate in diversity_candidates
        ]
        # pick the most accurate of the diversity-filtered candidates
        best_candidate = diversity_candidates[common.argbest(candidate_accuracy_scores)]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
def solve(enc):
    # Truncate all ciphertexts to the shortest length so every position exists
    # in every ciphertext.
    mlen = min(len(a) for a in enc)
    enc = [a[:mlen] for a in enc]
    key = ''
    for i in range(mlen):
        # Column i: the i-th byte of every ciphertext, all XORed with the same
        # key byte, so it can be attacked as a single-byte XOR.
        e = ''.join(a[i] for a in enc)
        scorer = lambda k: score(xor_str(e, cycle(chr(k))))
        scores = [scorer(k) for k in range(256)]
        key += chr(max(range(256), key=lambda k: scores[k]))
    return key
def selection(fold):
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    best_classifiers = train_df.apply(lambda x: common.score(train_labels, x)).order(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size + 1]
    return get_predictions(test_df, best_ensemble, fold, seedval), DataFrame.from_records(test_performance)
def selection(fold):
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    best_classifiers = train_df.apply(lambda x: common.score(train_labels, x)).order(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size + 1]
    return get_predictions(test_df, best_ensemble, fold, seedval), DataFrame.from_records(test_performance)
def select_candidate_drep(train_df, train_labels, best_classifiers, ensemble, i):
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values,
                            min(max_candidates, len(best_classifiers)),
                            replace=False)
        candidate_diversity_scores = [
            abs(common.diversity_score(train_df[ensemble + [candidate]].values))
            for candidate in candidates
        ]
        candidate_diversity_ranks = array(candidate_diversity_scores).argsort()
        diversity_candidates = candidates[candidate_diversity_ranks[:max_diversity_candidates]]
        candidate_accuracy_scores = [
            common.score(train_labels, train_df[ensemble + [candidate]].mean(axis=1))
            for candidate in diversity_candidates
        ]
        # pick the most accurate of the diversity-filtered candidates
        best_candidate = diversity_candidates[common.argbest(candidate_accuracy_scores)]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
def norm_dist(keysize):
    # Average normalized Hamming distance between consecutive keysize-sized
    # blocks; the true key size tends to minimize this value.
    numblocks = (len(data) / keysize)
    blocksum = 0
    for i in range(numblocks - 1):
        a = data[i * keysize:(i + 1) * keysize]
        b = data[(i + 1) * keysize:(i + 2) * keysize]
        blocksum += hamming(a, b)
    blocksum /= float(numblocks)
    blocksum /= float(keysize)
    return blocksum

keysize = min(range(2, 40), key=norm_dist)
print "Decided keysize = ", keysize

# Recover each key byte independently: the bytes at positions i, i + keysize, ...
# were all XORed with the same key byte.
key = [None] * keysize
for i in range(keysize):
    d = data[i::keysize]
    key[i] = max(range(256), key=lambda k: score(xor(d, k)))
key = ''.join(map(chr, key))
print "Decided key = ", repr(key)
print "Decoded data below"
print
print ''.join(chr(ord(a) ^ ord(b)) for a, b in zip(data, cycle(key)))
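# The `hamming` and `xor` helpers used above are not part of this snippet. A
# minimal sketch of what they are assumed to do (bit-level Hamming distance
# between equal-length strings, and XOR of a string with a single byte):
def hamming(a, b):
    # Number of differing bits between two equal-length byte strings.
    return sum(bin(ord(x) ^ ord(y)).count('1') for x, y in zip(a, b))

def xor(s, k):
    # XOR every character of s with the single byte k.
    return ''.join(chr(ord(c) ^ k) for c in s)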
MAXV = 10**9

try:
    shutil.rmtree('./tmp', ignore_errors=False)
except:
    pass
os.mkdir('./tmp')

for test in range(1, TESTS + 1):
    print("Case #%d: " % test, end="")
    N = random.randint(2, MAXN)
    subprocess.check_call(["./" + gen_name(), str(N), str(MAXV)],
                          stdout=open(temp_in(TEST_IDX), "w"))
    for source in Sources:
        if run_source(source, TEST_IDX, True) == False:
            print("Runtime error on %s" % source)
            exit(1)
    subprocess.check_call(["cp", temp_out(Sources[0], TEST_IDX), temp_ok(TEST_IDX)])
    for source in Sources:
        if score(source, TEST_IDX) != 5:
            print("Wrong answer: %s" % source)
            exit(1)
    print("OK")
stacker = RandomForestClassifier(n_estimators=200, max_depth=2, bootstrap=False, random_state=0)
if len(argv) > 3 and argv[3] == 'linear':
    stacker = SGDClassifier(loss='log', n_iter=50, random_state=0)

predictions_dfs = []
performance_dfs = []
seeds = [0] if method == 'greedy' else range(10)
for seedval in seeds:
    results = Parallel(n_jobs=-1, verbose=1)(delayed(method_function)(fold) for fold in range(fold_count))
    for predictions_df, performance_df in results:
        predictions_dfs.append(predictions_df)
        performance_dfs.append(performance_df)

performance_df = concat(performance_dfs)
performance_df.to_csv('%s/analysis/selection-%s-%s-iterations.csv' % (path, method, common.score.__name__), index=False)
predictions_df = concat(predictions_dfs)
predictions_df['method'] = method
predictions_df['metric'] = common.score.__name__
predictions_df.to_csv('%s/analysis/selection-%s-%s.csv' % (path, method, common.score.__name__), index=False)
print '%.3f %i' % (predictions_df.groupby(['fold', 'seed']).apply(lambda x: common.score(x.label, x.prediction)).mean(), predictions_df.ensemble_size.mean())
MAXPART = int(input("MAXPART = "))
MAXV = int(input("MAXV = "))

try:
    shutil.rmtree('./tmp', ignore_errors=False)
except:
    pass
os.mkdir('./tmp')

for test in range(1, TESTS + 1):
    print("Case #%d: " % test, end="")
    N = random.randint(1, MAXN)
    K = random.randint(1, min(N, MAXK))
    X = random.randint(1, min(MAXX, N))
    Y = random.randint(1, min(MAXY, N))
    PART = random.randint(1, min(X, Y, MAXPART))
    subprocess.check_call(["./gen", str(N), str(K), str(X), str(Y), str(PART),
                           str(MAXV), str(random.randint(0, 2**32 - 1))],
                          stdout=open("./tmp/" + problem_name() + ".in", "w"))
    for source in Sources:
        if run_source(source) == False:
            print("Runtime error on %s" % source)
            exit(1)
    subprocess.check_call(["cp", "./tmp/%s.out" % Sources[0],
                           "./tmp/" + problem_name() + ".ok"])
    for source in Sources:
        if score(source) != 10:
            print("Wrong answer: %s" % source)
            exit(1)
    print("OK")
def solve(cls, nodes, solution, prime, prime_set, run_through=None, max_iter=5, start_at=1):
    solution = solution.copy()
    start = time.time()
    i = 1
    if run_through is None:
        run_through = len(nodes) - 1
    asd = nodes[solution]
    df = pd.DataFrame(asd)
    df.to_csv('asd.csv')
    solution_nodes = pd.read_csv('asd.csv').iloc[:, 1:].values
    cls.nodes = nodes
    cls.solution = solution
    cls.prime = prime
    cls.prime_set = prime_set
    cls.solution_nodes = solution_nodes
    while i <= max_iter:
        print('Start iteration %s' % i)
        iter_start = time.time()
        best = []
        did = 0
        reverse_dist, old_dist, roller = cls.calc_basis(start_at)
        left = 1
        # for left in range(start_at, run_through):
        while left >= 1 and left < run_through:
            node_before_a, node_a = solution[left - 1:left + 1]
            middle, reverse_dist = cls.calc_middle(reverse_dist, left)
            to_d, from_a, roller = cls.calc_connector(roller, left, node_before_a, node_a)
            place, saving, old_dist = cls.calc_save(from_a, to_d, middle, old_dist)
            if saving < 0:
                best.append(saving)
                did += saving
                target = place + 1 + left
                # execute swap
                cls.excute_swap(left, place)
                print('a: %s | saving: %.3f | cumsave: %.3f' % (left, saving, did))
                # reset infos
                # reverse_dist, old_dist, roller = cls.calc_basis(left+1)
                reverse_dist, old_dist, roller = cls.calc_basis(left)
                left -= 1
            left += 1
        print('iter %s done, total swap %s, saves %.3f, time %.1f' % (i, len(best), did, time.time() - iter_start))
        print(common.score(nodes, solution, prime).sum())
        print('####################################')
        i += 1
        if len(best) == 0:
            print('zero')
            break
    return solution
MAXK2 = int(input("MAXK2 = "))

try:
    shutil.rmtree('./tmp', ignore_errors=False)
except:
    pass
os.mkdir('./tmp')

for test in range(1, TESTS + 1):
    print("Case #%d" % test, end="")
    N = random.randint(1, MAXN)
    K1 = random.randint(1, min(MAXK1, N))
    K2 = random.randint(1, min(MAXK2, N))
    if test == TESTS:
        N, K1, K2 = MAXN, MAXK1, MAXK2
    subprocess.check_call(["./gen", str(N), str(K1), str(K2)],
                          stdout=open("./tmp/" + problem_name() + ".in", "w"))
    print("(N = %d, K1 = %d, K2 = %d): " % (N, K1, K2), end="")
    for source in Sources:
        if run_source(source) == False:
            print("Runtime error on %s" % source)
            exit(1)
    subprocess.check_call(["cp", "./tmp/%s.out" % Sources[0],
                           "./tmp/" + problem_name() + ".ok"])
    for source in Sources:
        if score(source) != 10:
            print("Wrong answer: %s" % source)
            exit(1)
    print("OK")
from common import score


def decode(enc, k):
    return ''.join(chr(ord(x) ^ k) for x in enc)


enc = '1b37373331363f78151b7f2b783431333d78397828372d363c78373e783a393b3736'.decode('hex')
key = max(range(256), key=lambda k: score(decode(enc, k)))
print "Key: ", key
print "Decoded: ", decode(enc, key)
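# `common.score` is not shown in these XOR snippets; it is assumed to rate how
# English-like a candidate plaintext is (higher is better). A hypothetical
# minimal frequency-based sketch under that assumption:
def score(text):
    # Reward characters that are common in English prose.
    frequent = 'etaoinshrdlu ETAOINSHRDLU'
    return sum(1 for c in text if c in frequent)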
from common import score
from tqdm import tqdm  # for progress bar


def decode(enc, k):
    return ''.join(chr(ord(x) ^ k) for x in enc)


with open('data/4.txt', 'r') as f:
    data = f.read().split()
data = [d.decode('hex') for d in data]

# For each candidate ciphertext, find the best single-byte key, then keep the
# decryption that scores highest overall.
keys = [max(range(256), key=lambda k: score(decode(e, k))) for e in tqdm(data)]
decs = [decode(e, k) for e, k in zip(data, keys)]
best = max(decs, key=score)
print best
    test_df = common.unbag(test_df, bag_count)
    test_predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    return DataFrame({'fold': fold,
                      'id': test_df.index.get_level_values('id'),
                      'label': test_labels,
                      'prediction': test_predictions,
                      'diversity': common.diversity_score(test_df.values)})


path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
method = argv[2]
assert method in ['aggregate', 'standard']
p = common.load_properties(path)
fold_count = int(p['foldCount'])
bag_count = int(p['bagCount'])

# use non-negative least squares for regression
if 'predictClassValue' not in p:
    stacker = NNLS()
else:
    # use linear stacker if requested, else use shallow non-linear stacker
    if len(argv) > 3 and argv[3] == 'linear':
        stacker = SGDClassifier(loss='log', n_iter=50, random_state=0)
    else:
        stacker = RandomForestClassifier(n_estimators=200, max_depth=2, bootstrap=False, random_state=0)

predictions_dfs = Parallel(n_jobs=-1, verbose=1)(delayed(stacked_generalization)(fold) for fold in range(fold_count))
predictions_df = concat(predictions_dfs)
predictions_df['method'] = method
predictions_df.to_csv('%s/analysis/stacking-%s.csv' % (path, method), index=False)
print '%.3f' % predictions_df.groupby('fold').apply(lambda x: common.score(x.label, x.prediction)).mean()
method_function = selection

p = common.load_properties(path)
fold_count = int(p['foldCount'])
initial_ensemble_size = 2
max_ensemble_size = 50
max_candidates = 50
max_diversity_candidates = 5
accuracy_weight = 0.5
max_clusters = 20

# use shallow non-linear stacker by default
stacker = RandomForestClassifier(n_estimators=200, max_depth=2, bootstrap=False, random_state=0)
if len(argv) > 3 and argv[3] == 'linear':
    stacker = SGDClassifier(loss='log', n_iter=50, random_state=0)

predictions_dfs = []
performance_dfs = []
seeds = [0] if method == 'greedy' else range(10)
for seedval in seeds:
    results = Parallel(n_jobs=-1, verbose=1)(delayed(method_function)(fold) for fold in range(fold_count))
    for predictions_df, performance_df in results:
        predictions_dfs.append(predictions_df)
        performance_dfs.append(performance_df)

performance_df = concat(performance_dfs)
performance_df.to_csv('%s/analysis/selection-%s-%s-iterations.csv' % (path, method, common.score.__name__), index=False)
predictions_df = concat(predictions_dfs)
predictions_df['method'] = method
predictions_df['metric'] = common.score.__name__
predictions_df.to_csv('%s/analysis/selection-%s-%s.csv' % (path, method, common.score.__name__), index=False)
print '%.3f %i' % (predictions_df.groupby(['fold', 'seed']).apply(lambda x: common.score(x.label, x.prediction)).mean(), predictions_df.ensemble_size.mean())
def get_performance(df, ensemble, fold, seedval):
    labels = df.index.get_level_values('label').values
    predictions = df[ensemble].mean(axis=1)
    return {'fold': fold,
            'seed': seedval,
            'score': common.score(labels, predictions),
            'ensemble': ensemble[-1],
            'ensemble_size': len(ensemble)}
def get_cluster_performance(labels, predictions, n_clusters, fold, seedval):
    return {'fold': fold,
            'seed': seedval,
            'score': common.score(labels, predictions),
            'n_clusters': n_clusters}
try:
    shutil.rmtree('./tmp', ignore_errors=False)
except:
    pass
os.mkdir('./tmp')

for test in range(1, TESTS + 1):
    print("Case #%d: " % test, end="")
    N = random.randint(3, MAXN)
    subprocess.check_call(["./gen", str(N), str(MAXV), str(REALMAXV),
                           str(random.randint(0, 2**32 - 1))],
                          stdout=open("./tmp/" + problem_name() + ".in", "w"))
    for source in Sources:
        if run_source(source) == False:
            print("Runtime error on %s" % source)
            exit(1)
    subprocess.check_call(["cp", "./tmp/%s.out" % Sources[0],
                           "./tmp/" + problem_name() + ".ok"])
    for source in Sources:
        if score(source) != 5:
            print("Wrong answer: %s" % source)
            exit(1)
    print("OK")
print(".", end="") stdout.flush() subprocess.check_call([ "cp", "./teste/" + str(test) + "-" + problem_name() + ".in", "./tmp/" + problem_name() + ".in" ]) subprocess.check_call([ "cp", "./teste/" + str(test) + "-" + problem_name() + ".ok", "./tmp/" + problem_name() + ".ok" ]) for source in SOURCES: if run_source(source) == False: RESULTS[source] += ['X'] else: subprocess.check_call([ "cp", "./tmp/" + source + ".out", "./tmp/" + problem_name() + ".out" ]) s = score(source) RESULTS[source] += [str(s)] SCORES[source] += s print() SCORES = [[name, score] for name, score in SCORES.items()] RESULTS = [[name] + results for name, results in RESULTS.items()] print(tabulate.tabulate(SCORES, ["Nume", "Scor"], tablefmt="grid")) print( tabulate.tabulate(RESULTS, ["Nume"] + list(range(len(TESTS))), tablefmt="grid"))
]

SCORES = {source: 0 for source in SOURCES}
RESULTS = {source: [] for source in SOURCES}

for test in range(len(TESTS)):
    print(".", end="")
    stdout.flush()
    subprocess.check_call(["cp",
                           "./teste/" + str(test) + "-" + problem_name() + ".in",
                           temp_in(test)])
    subprocess.check_call(["cp",
                           "./teste/" + str(test) + "-" + problem_name() + ".ok",
                           temp_ok(test)])
    for source in SOURCES:
        if run_source(source, test) == False:
            RESULTS[source] += ['X']
        else:
            s = score(source, test)
            RESULTS[source] += [str(s)]
            SCORES[source] += s

print()
SCORES = [[name, score] for name, score in SCORES.items()]
RESULTS = [[name] + results for name, results in RESULTS.items()]
print(tabulate.tabulate(SCORES, ["Nume", "Scor"], tablefmt="grid"))
print(tabulate.tabulate(RESULTS, ["Nume"] + list(range(len(TESTS))), tablefmt="grid"))
from glob import glob
from os.path import abspath, exists
from sys import argv
from warnings import filterwarnings

from pandas import DataFrame, concat, read_csv

import common

filterwarnings('ignore', category=DeprecationWarning)

path = abspath(argv[1])
assert exists(path)

scores = []
for dirname in glob('%s/weka.classifiers.*' % path):
    filenames = glob('%s/predictions-*.csv.gz' % dirname)
    predictions_df = concat([
        read_csv(filename, index_col=[0, 1], skiprows=1, compression='gzip')
        for filename in filenames
    ])
    score = common.score(predictions_df.index.get_level_values('label').values,
                         predictions_df.prediction)
    scores.append([dirname.split('/')[-1], score])

print DataFrame(scores, columns=['classifier', 'score']).set_index('classifier').sort('score', ascending=not common.greater_is_better)