def main(sival_dir, outputfile):
    """Convert a directory of SIVAL C4.5 files into one MATLAB .mat file.

    Collects the feature matrix once, then accumulates per-class instance
    labels so that y holds (class index) * (instance label) contributions.
    """
    names_files = glob.glob(os.path.join(sival_dir, '*.names'))
    # Drop the trailing '.names' (6 chars) to recover the class name.
    classes = sorted(os.path.basename(nf[:-6]) for nf in names_files)

    mat = {'class_names': np.array(classes)}
    data = None
    reverse_index = {}
    progress = ProgressMonitor(total=len(classes), msg='Getting class labels')
    for i, clazz in enumerate(classes, 1):
        exset = parse_c45(clazz, sival_dir)
        if data is None:
            # First class seen: cache features and build the
            # (bag id, instance id) -> row index mapping once.
            # NOTE(review): assumes every class file lists the same
            # instances -- confirm against the data set layout.
            data = np.array(exset.to_float())[:, 2:-1]
            inst_classes = np.zeros(len(exset))
            index = [(ex[0], ex[1]) for ex in exset]
            for j, key in enumerate(index):
                reverse_index[key] = j
        # An instance positive for class i contributes i to its label.
        for ex in exset:
            inst_classes[reverse_index[(ex[0], ex[1])]] += i * ex[-1]
        progress.increment()

    mat['instance_ids'] = np.array(index)
    mat['X'] = data
    mat['y'] = inst_classes
    savemat(outputfile, mat)
def _prog(plist):
    """Yield the items of `plist`, reporting progress after each one."""
    monitor = ProgressMonitor(total=len(plist), print_interval=1,
                              msg='Constructing Kernel')
    for item in plist:
        yield item
        # Count the item only once the consumer has processed it.
        monitor.increment()
def make_weights(X):
    """Return the median-based weight of every example in X.

    NOTE(review): `k` is a free (module-level) name here -- presumably the
    kernel being weighted; confirm it is defined before this is called.
    """
    monitor = ProgressMonitor(total=len(X), print_interval=1,
                              msg='Constructing Kernel')
    weights = []
    for example in X:
        # Progress is reported before the weight is computed, matching the
        # original ordering.
        monitor.increment()
        weights.append(median_weight(k, example))
    return weights
def compute_andor(configuration_file, kerneldir):
    """Combine finished 'and'/'or' kernel tasks into 'andor' results.

    For every experiment in the YAML configuration and every (i, j) pair in
    the upper triangle, when both the 'and' and 'or' tasks are finished the
    combined result is stored as a mantissa ratio, exponent difference, and
    summed runtime. Pairs with an unfinished component are skipped and the
    experiment is reported as unfinished.
    """
    print('Loading configuration...')
    with open(configuration_file, 'r') as f:
        # NOTE(review): yaml.load on an untrusted file can execute arbitrary
        # code; prefer yaml.safe_load if the config needs no custom tags.
        configuration = yaml.load(f)

    for experiment in configuration['experiments']:
        dataset = experiment['dataset']
        epsilon = experiment['epsilon']
        delta = experiment['delta']
        seed = experiment['seed']
        n = get_dset_size(dataset)
        # Upper triangle including the diagonal: n*(n+1)/2 pairs.
        # (// keeps the count an int on both Python 2 and 3.)
        prog = ProgressMonitor(total=(n * (n + 1) // 2),
                               msg='%s,andor,%f,%f,%d'
                                   % (dataset, epsilon, delta, seed))
        alldone = True
        for i in range(n):
            for j in range(i, n):
                prog.increment()
                andorkey = (dataset, 'andor', epsilon, delta, seed, i, j)
                andortask = Task(*andorkey)
                andortask.ground(kerneldir)
                if andortask.finished:
                    continue  # already combined
                andkey = (dataset, 'and', epsilon, delta, seed, i, j)
                andtask = Task(*andkey)
                andtask.ground(kerneldir)
                if not andtask.finished:
                    alldone = False
                    continue
                orkey = (dataset, 'or', epsilon, delta, seed, i, j)
                ortask = Task(*orkey)
                ortask.ground(kerneldir)
                if not ortask.finished:
                    alldone = False
                    continue
                andtime = andtask.runtime()
                ortime = ortask.runtime()
                # Values are (mantissa, exponent) pairs, so the and/or
                # quotient is a mantissa ratio and an exponent difference.
                andman, andexp = andtask.value()
                orman, orexp = ortask.value()
                submission = {
                    'mantissa': (andman / orman),
                    'exponent': (andexp - orexp),
                    'time': (andtime + ortime),
                }
                andortask.store_results(submission)
        if not alldone:
            print('Unfinished: %s, %f, %f, %d'
                  % (dataset, epsilon, delta, seed))
def main(outputfile):
    """Write one CSV row of statistics per data set to `outputfile`."""
    progress = ProgressMonitor(total=len(DATASETS),
                               msg='Extracting statistics')
    with open(outputfile, 'w+') as out:
        # Header line: '#' followed by the statistic names.
        header = ','.join(name for name, _ in STATISTICS)
        out.write('#%s\n' % header)
        for dataset in DATASETS:
            dset = get_dataset(dataset)
            dset.name = dataset
            row = ','.join(map(str, (func(dset) for _, func in STATISTICS)))
            out.write('%s\n' % row)
            progress.increment()
def learn(self, X_labeled, y_labeled, X_pool, y_pool, X_test):
    """Run active learning for `self.queries` rounds.

    After an initial fit, each round moves the selected pool example into
    the labeled set, refits, and records the decision function on X_test.
    Returns self.queries + 1 prediction arrays (initial fit included).
    """
    # Initial Predictions
    self.classifier.fit(X_labeled, y_labeled)
    predictions = [self.classifier.decision_function(X_test)]

    if self.verbose:
        progress = ProgressMonitor(total=self.queries, msg='Active Learning')
    for q in range(self.queries):
        if len(X_pool) <= 0:
            # Pool exhausted: carry the last prediction forward.
            if self.verbose:
                print('Warning: skipping query %d...' % q)
            predictions.append(predictions[-1])
        else:
            chosen = self.select(X_pool)
            X_labeled.append(X_pool.pop(chosen))
            y_labeled.append(y_pool.pop(chosen))
            self.classifier.fit(X_labeled, y_labeled)
            predictions.append(self.classifier.decision_function(X_test))
        if self.verbose:
            progress.increment()
    return predictions
def main(dataset, folddir, outputdir, reps=0):
    """Generate bootstrap replicate files for each fold of `dataset`.

    Existing replicate files are left untouched, so the script can be
    re-run to fill in missing replicates.
    """
    data_dict = data.get_dataset(dataset)
    folds = data.get_folds(folddir, dataset)
    all_bag_ids = set(data_dict.keys())
    progress = ProgressMonitor(total=reps * len(folds),
                               msg='Generating Replicates')
    for f in range(len(folds)):
        test = data.get_fold(folddir, dataset, f)
        # Only bags outside the test fold are eligible for sampling.
        bag_ids = np.array(list(all_bag_ids - set(test)))
        n = len(bag_ids)
        for r in range(1, reps + 1):
            rep_path = os.path.join(
                outputdir, '%s_%04d_%06d.rep' % (dataset, f, r))
            if not os.path.exists(rep_path):
                # Bootstrap: draw n bag ids with replacement.
                sample = np.random.randint(n, size=n)
                sampled_bags = bag_ids[sample]
                with open(rep_path, 'w+') as ofile:
                    ofile.write('\n'.join(bid for bid in sampled_bags.flat))
            progress.increment()
def main(configfile, folddir, resultsdir):
    """Print the noise rate of shuffled 'positive' labels.

    Walks every experiment/fold/rep/initial/shuffled combination from the
    configuration, counts how many shuffled-positive instances are truly
    positive, and prints 1 - actual/total.
    """
    with open(configfile, 'r') as f:
        # NOTE(review): prefer yaml.safe_load unless custom tags are needed.
        configuration = yaml.load(f)

    total = 0
    actual = 0
    prog = ProgressMonitor(total=len(configuration['experiments']),
                           msg='Computing noise')
    for experiment in configuration['experiments']:
        technique = experiment['technique']
        dataset = experiment['dataset']
        ids, _, y = data.get_dataset(dataset)
        # Map (bag id, instance id) -> true label for O(1) lookup.
        y_dict = {}
        for (bid, iid), yi in zip(ids, y):
            y_dict[bid, iid] = yi
        folds = data.get_folds(folddir, dataset)
        for f in range(len(folds)):
            for r in range(experiment['reps']):
                for i in experiment['initial']:
                    for s in experiment['shuffled']:
                        labeled = setup_rep(technique, experiment['noise'],
                                            dataset, f, r, i, s,
                                            folddir, resultsdir)
                        pos_shuffled = get_positive_shuffled(labeled, i, s)
                        total += len(pos_shuffled)
                        actual += count_actual_positive(pos_shuffled, y_dict)
        prog.increment()

    # The original printed this twice (copy/paste duplicate); once suffices.
    if total > 0:
        print(1 - (float(actual) / total))
def main(configfile, folddir, resultsdir):
    """Serve shuffled-bag experiment tasks via an ExperimentServer."""
    with open(configfile, 'r') as f:
        configuration = yaml.load(f)

    # Count total experiments for progress monitor
    exps = 0
    for experiment in configuration['experiments']:
        dataset = experiment['dataset']
        folds = get_folds(folddir, dataset)
        for f in range(len(folds)):
            for r in range(experiment['reps']):
                for n in experiment['noise']:
                    for s in experiment['shuffled']:
                        exps += 1
    prog = ProgressMonitor(total=exps, msg='Generating Shuffled Bags')

    # Generate tasks from experiment list
    tasks = {}
    for experiment in configuration['experiments']:
        technique = experiment['technique']
        classifier = experiment['classifier']
        dataset = experiment['dataset']
        folds = get_folds(folddir, dataset)
        for f in range(len(folds)):
            for r in range(experiment['reps']):
                for n in experiment['noise']:
                    for s in experiment['shuffled']:
                        key = (technique, classifier, dataset,
                               experiment['kernel'], f, r, n, s)
                        kwargs = {
                            'params': experiment['params'],
                            'shuffled_bags': setup_rep(
                                technique, dataset, f, r, n, s,
                                folddir, resultsdir),
                        }
                        tasks[key] = Task(*key, **kwargs)
                        prog.increment()

    # Mark finished tasks
    for task in tasks.values():
        predfile = os.path.join(resultsdir, task.filebase('preds'))
        if os.path.exists(predfile):
            task.finish()

    def handle(key, task, submission):
        # Persist optional statistics, then predictions, as YAML files.
        if 'stats' in submission:
            sfile = os.path.join(resultsdir, task.filebase('stats'))
            with open(sfile, 'w+') as f:
                f.write(yaml.dump(submission['stats'],
                                  default_flow_style=False))
        pfile = os.path.join(resultsdir, task.filebase('preds'))
        with open(pfile, 'w+') as f:
            f.write(yaml.dump(submission['preds'],
                              default_flow_style=False))

    server = ExperimentServer(tasks, render, handle)
    cherrypy.config.update({'server.socket_port': PORT,
                            'server.socket_host': '0.0.0.0'})
    cherrypy.quickstart(server)
def load_config(configuration_file, results_root_dir):
    """Parse the YAML experiment configuration into task/parameter maps.

    Returns (tasks, parameter_dict): `tasks` maps experiment setting keys
    to grounded Task objects; `parameter_dict` maps experiment ids to
    their ParameterConfiguration.
    """
    tasks = {}
    parameter_dict = {}

    print('Loading configuration...')
    with open(configuration_file, 'r') as f:
        configuration = yaml.load(f)

    experiment_key = configuration['experiment_key']
    experiment_name = configuration['experiment_name']
    if experiment_name == 'mi_kernels':
        from resampling import NullResamplingConfiguration

        def constructor_from_experiment(experiment):
            # mi_kernels experiments perform no resampling.
            return lambda dset: NullResamplingConfiguration(dset)
    else:
        raise ValueError('Unknown experiment name "%s"' % experiment_name)

    for experiment in configuration['experiments']:
        try:
            experiment_id = tuple(experiment[k] for k in experiment_key)
        except KeyError:
            raise KeyError('Experiment missing identifier "%s"'
                           % experiment_key)

        def _missing(pretty_name):
            raise KeyError('%s not specified for experiment "%s"'
                           % (pretty_name, str(experiment_id)))

        def _resolve(field_name, pretty_name):
            # Experiment-level settings override file-level defaults.
            field = experiment.get(field_name,
                                   configuration.get(field_name, None))
            if field is None:
                _missing(pretty_name)
            return field

        print('Setting up experiment "%s"...' % str(experiment_id))
        try:
            dataset = experiment['dataset']
        except KeyError:
            _missing('Dataset')

        experiment_format = _resolve('experiment_key_format',
                                     'Experiment key format')
        parameter_key = _resolve('parameter_key', 'Parameter key')
        parameter_format = _resolve('parameter_key_format',
                                    'Parameter key format')
        parameters = _resolve('parameters', 'Parameters')
        param_config = ParameterConfiguration(
            results_root_dir, experiment_name, experiment_id,
            experiment_format, parameter_key, parameter_format, parameters)
        parameter_dict[experiment_id] = param_config

        folds = _resolve('folds', 'Folds')
        fold_config = FoldConfiguration(dataset, *folds)

        resampling_constructor = constructor_from_experiment(experiment)
        priority = experiment.get('priority', 0)
        experiment_config = ExperimentConfiguration(
            experiment_name, experiment_id, fold_config,
            param_config, resampling_constructor)

        settings = experiment_config.get_settings()
        prog = ProgressMonitor(total=len(settings), print_interval=10,
                               msg='\tGetting tasks')
        for setting in settings:
            key = experiment_config.get_key(**setting)
            task = experiment_config.get_task(**setting)
            task.priority_adjustment = priority
            task.ground(results_root_dir, experiment_format,
                        parameter_format)
            tasks[key] = task
            prog.increment()

    return tasks, parameter_dict
def copy_kernel(srck, dstk, dst, t):
    """Copy every row of the source 'kernel' table into the destination.

    Each row is re-keyed with the destination's dataset and kernel-type
    ids before insertion.

    NOTE(review): the original `def` line was lost in formatting; the
    signature is reconstructed from the call site in the __main__ block
    below -- confirm against the original script.
    """
    dst_did = dstk.get_dataset_id(dst)
    dst_tid = dstk.get_ktype_id(t)
    srccon = srck.get_connection()
    cursor = srccon.cursor()
    cursor.execute('SELECT * FROM kernel')
    # Replace the first two columns (dataset id, type id) of each row.
    new_entries = [
        (dst_did, dst_tid, epsilon, delta, seed, i, j,
         mantissa, exponent, time)
        for _, _, epsilon, delta, seed, i, j, mantissa, exponent, time
        in cursor.fetchall()
    ]
    dstcon = dstk.get_connection()
    # The connection context manager commits on success, rolls back on error.
    with dstcon:
        dstcon.executemany(
            'INSERT INTO kernel '
            'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
            new_entries
        )


if __name__ == '__main__':
    prog = ProgressMonitor(total=len(DST) * len(TYPES), msg='Copying kernels')
    for dst in DST:
        for t in TYPES:
            srcfile = filename(SRC, t)
            dstfile = filename(dst, t)
            srck = KernelManager(srcfile)
            dstk = KernelManager(dstfile)
            copy_kernel(srck, dstk, dst, t)
            prog.increment()
def load_config(configuration_file, results_root_dir):
    """Parse the YAML experiment configuration into task/parameter maps.

    Returns (tasks, parameter_dict): `tasks` maps experiment setting keys
    to grounded Task objects; `parameter_dict` maps experiment ids to
    their ParameterConfiguration.
    """
    # The original left a commented-out `import pdb;pdb.set_trace()` before
    # the ProgressMonitor; that leftover debugging line has been removed.
    tasks = {}
    parameter_dict = {}

    print('Loading configuration...')
    with open(configuration_file, 'r') as f:
        # NOTE(review): prefer yaml.safe_load unless custom tags are needed.
        configuration = yaml.load(f)

    experiment_key = configuration['experiment_key']
    experiment_name = configuration['experiment_name']
    if experiment_name == 'mi_kernels':
        from resampling import NullResamplingConfiguration

        def constructor_from_experiment(experiment):
            # mi_kernels experiments perform no resampling.
            return lambda dset: NullResamplingConfiguration(dset)
    else:
        raise ValueError('Unknown experiment name "%s"' % experiment_name)

    for experiment in configuration['experiments']:
        try:
            experiment_id = tuple(experiment[k] for k in experiment_key)
        except KeyError:
            raise KeyError('Experiment missing identifier "%s"'
                           % experiment_key)

        def _missing(pretty_name):
            raise KeyError('%s not specified for experiment "%s"'
                           % (pretty_name, str(experiment_id)))

        def _resolve(field_name, pretty_name):
            # Experiment-level settings override file-level defaults.
            field = experiment.get(field_name,
                                   configuration.get(field_name, None))
            if field is None:
                _missing(pretty_name)
            return field

        print('Setting up experiment "%s"...' % str(experiment_id))
        try:
            dataset = experiment['dataset']
        except KeyError:
            _missing('Dataset')

        experiment_format = _resolve('experiment_key_format',
                                     'Experiment key format')
        parameter_key = _resolve('parameter_key', 'Parameter key')
        parameter_format = _resolve('parameter_key_format',
                                    'Parameter key format')
        parameters = _resolve('parameters', 'Parameters')
        param_config = ParameterConfiguration(
            results_root_dir, experiment_name, experiment_id,
            experiment_format, parameter_key, parameter_format, parameters)
        parameter_dict[experiment_id] = param_config

        folds = _resolve('folds', 'Folds')
        fold_config = FoldConfiguration(dataset, *folds)

        resampling_constructor = constructor_from_experiment(experiment)
        priority = experiment.get('priority', 0)
        experiment_config = ExperimentConfiguration(
            experiment_name, experiment_id, fold_config,
            param_config, resampling_constructor)

        settings = experiment_config.get_settings()
        prog = ProgressMonitor(total=len(settings), print_interval=10,
                               msg='\tGetting tasks')
        for setting in settings:
            key = experiment_config.get_key(**setting)
            task = experiment_config.get_task(**setting)
            task.priority_adjustment = priority
            task.ground(results_root_dir, experiment_format,
                        parameter_format)
            tasks[key] = task
            prog.increment()

    return tasks, parameter_dict