def start(self):
    """Run every cross-validation trial sequentially and report the results.

    Reads the options from ``self.config``, loads the dataset, and then, for
    each (train, test) index pair produced by ``cross_validation_data``:
    samples the data, builds a learner and an expert, fits the expert on the
    training split and runs ``main_loop``. The per-trial results are
    collected in a list and handed to ``report_results``.
    """
    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print(self.get_name())
    trial = []
    self._setup_options(self.config)
    t0 = time()
    self.data = datautil.load_dataset(self.dataname, self.data_path,
                                      categories=self.data_cat, rnd=self.seed,
                                      shuffle=True, percent=self.split,
                                      keep_subject=True)
    self.print_lap("Loaded", t0)
    # NOTE: the data is used as loaded; no vectorize step is applied here.
    cv = self.cross_validation_data(self.data, folds=self.folds,
                                    trials=self.trials, split=self.split)
    for t, (train_index, test_index) in enumerate(cv):
        # get the data of this cv iteration
        train, test = self._sample_data(self.data, train_index, test_index)
        self.print_lap("\nSampled", t0)

        # get the expert and student; the learner seed depends on the trial
        # number (10, 20, 30, ...)
        learner = exputil.get_learner(
            cfgutil.get_section_options(self.config, 'learner'),
            vct=self.vct, sent_tk=self.sent_tokenizer,
            seed=(t * 10 + 10), cost_model=self.cost_model)
        expert = exputil.get_expert(
            cfgutil.get_section_options(self.config, 'expert'),
            size=len(train.data))
        expert.fit(train.data, y=train.target, vct=self.vct)

        # do active learning
        results = self.main_loop(learner, expert, self.budget,
                                 self.bootstrap_size, train, test)
        self.print_lap("\nTrial %s" % t, t0)

        # save the results
        trial.append(results)
    self.report_results(trial)
def _setup_options(self, config_obj):
    """Copy the experiment, data and expert options onto the instance.

    :param config_obj: configuration object with 'experiment', 'data' and
        'expert' sections, read through ``cfgutil.get_section_options``.
    """
    # experiment related config
    config = cfgutil.get_section_options(config_obj, 'experiment')
    self.trials = config['trials']
    self.folds = config['folds']
    self.max_iteration = config['maxiter']
    self.step = config['stepsize']
    self.budget = config['budget']
    self.prefix = config['fileprefix']
    self.output = config['outputdir']
    self.seed = config['seed']
    self.bootstrap_size, self.bootstrap_method = exputil.get_bootstrap(config)
    self.costfn = exputil.get_costfn(config['costfunction'])
    # Cost-model settings are optional; when absent the existing attribute
    # values are left untouched.
    # NOTE(review): 'cost_base' is assumed to be read only together with
    # 'cost_model' -- confirm against the original indentation.
    if 'cost_model' in config:
        self.cost_model = config['cost_model']
        self.cost_base = config['cost_base']

    # data related config
    config = cfgutil.get_section_options(config_obj, 'data')
    self.split = config['split']
    self.data_cat = config['categories']
    self.limit = config['limit']
    self.data_path = config['path']

    # expert related config
    config = cfgutil.get_section_options(config_obj, 'expert')
    args = {}
    if 'snip_size' in config:
        # only forwarded to the tokenizer factory when configured
        args['snip_size'] = config['snip_size']
    self.sent_tokenizer = exputil.get_tokenizer(config['sent_tokenizer'], **args)
def _setup_options(self, config_obj):
    """Load the parent-class options, then the ``validation_set`` option.

    :param config_obj: configuration object with an 'experiment' section
        that contains a 'validation_set' entry.
    """
    super(ExperimentJobs, self)._setup_options(config_obj)

    # option read only by this subclass
    experiment_opts = cfgutil.get_section_options(config_obj, 'experiment')
    self.validation_set = experiment_opts['validation_set']
def __init__(self, dataname, config, verbose=False, debug=False):
    """Create an experiment with all settings unset.

    Most attributes start as ``None`` placeholders; only the vectorizer is
    built here, from the 'data' section of *config*.

    :param dataname: name of the dataset, stored as ``self.dataname``.
    :param config: configuration object, stored as ``self.config``.
    :param verbose: stored as ``self.verbose``.
    :param debug: stored as ``self.debug``.
    """
    super(Experiment, self).__init__()
    self.verbose, self.debug = verbose, debug
    self.config, self.dataname = config, dataname

    # dataset placeholders
    self.data_cat = self.data = self.data_path = None

    # cross-validation placeholders
    self.trials = self.folds = self.split = None

    # cost settings; cost_base is the only one with a non-None default
    self.costfn = self.cost_model = None
    self.cost_base = 25

    # active-learning loop placeholders
    self.budget = self.max_iteration = self.step = None
    self.bootstrap_size = self.seed = self.output = None

    # random state with a fixed seed
    self.rnd_state = np.random.RandomState(32564)
    self.remaining = None

    data_opts = cfgutil.get_section_options(config, 'data')
    self.vct = exputil.get_vectorizer(data_opts)
    self.sent_tokenizer = None
def start(self, n_jobs=1, pre_dispatch='2*n_jobs'):
    """Run all cross-validation trials through joblib and report the scores.

    The dataset is loaded and vectorized once, a single expert is fitted on
    ``self.data.train``, and one learner is created per trial (seeds 10, 20,
    30, ...). Each trial is then dispatched to ``main_loop_jobs`` through
    ``Parallel``, and the collected scores go to ``report_results``.

    :param n_jobs: forwarded to ``Parallel`` as ``n_jobs``.
    :param pre_dispatch: forwarded to ``Parallel`` as ``pre_dispatch``.
    """
    self._setup_options(self.config)
    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print(self.get_name())
    t0 = time()
    self.data = datautil.load_dataset(self.dataname, self.data_path,
                                      categories=self.data_cat, rnd=self.seed,
                                      shuffle=True, percent=self.split,
                                      keep_subject=True)
    self.print_lap("Loaded", t0)
    self.data = self.vectorize(self.data)
    cv = self.cross_validation_data(self.data, folds=self.folds,
                                    trials=self.trials, split=self.split)

    # one seed per trial: 10, 20, 30, ...
    seeds = np.arange(len(cv)) * 10 + 10

    # a single expert, fitted once, is shared by every trial
    expert = exputil.get_expert(
        cfgutil.get_section_options(self.config, 'expert'),
        size=(len(self.data.train.target), self.data.train.sizes.max()))
    expert.fit(self.data.train.bow, y=self.data.train.target, vct=self.vct)

    lrnr_setup = {'vct': self.vct,
                  "sent_tk": self.sent_tokenizer,
                  "cost_model": self.cost_model,
                  'validation_set': self.validation_set}
    lrnr_type = cfgutil.get_section_option(self.config, 'learner', 'type')
    neu_threshold = cfgutil.get_section_option(self.config, 'expert', 'threshold')
    # these learner types additionally receive the expert's oracle and the
    # expert threshold
    if lrnr_type in ('utility-cheat', 'const-cheat', 'const-cheat-noisy'):
        lrnr_setup.update({'snip_model': expert.oracle,
                           'threshold': neu_threshold})

    learners = [exputil.get_learner(cfgutil.get_section_options(self.config, 'learner'),
                                    seed=s, **lrnr_setup)
                for s in seeds]
    self.print_lap("\nPreprocessed", t0)

    # ===================================
    parallel = Parallel(n_jobs=n_jobs, verbose=True, pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(self.main_loop_jobs, check_pickle=False)(
            learners[t], expert, self.budget, self.bootstrap_size,
            self.data, tr[0], tr[1], t)
        for t, tr in enumerate(cv))
    # ===================================

    self.print_lap("\nDone trials", t0)

    # save the results
    self.report_results(scores)
def set_options(self, config_obj):
    """Populate the instance from the 'data', 'expert' and 'experiment' sections.

    Also resets ``self.rnd_state`` to a ``RandomState`` with a fixed seed and
    builds the vectorizer and the sentence tokenizer.

    :param config_obj: configuration object read through
        ``cfgutil.get_section_options``.
    """
    self.rnd_state = np.random.RandomState(32564)

    # data section: dataset location and vectorizer
    data_opts = cfgutil.get_section_options(config_obj, 'data')
    self.data_cat = data_opts['categories']
    self.data_path = data_opts['path']
    self.split = data_opts['split']
    self.vct = exputil.get_vectorizer(data_opts)

    # expert section: sentence tokenizer
    expert_opts = cfgutil.get_section_options(config_obj, 'expert')
    self.sent_tokenizer = exputil.get_tokenizer(expert_opts['sent_tokenizer'])

    # experiment section: run parameters
    run_opts = cfgutil.get_section_options(config_obj, 'experiment')
    self.seed = run_opts['seed']
    self.budget = run_opts['budget']
    self.step = run_opts['stepsize']
    self.output = run_opts['outputdir']
    self.bootstrap_size = run_opts['bootstrap']
def get_expert(self, config, target_names):
    """Build the expert configured in the 'expert' section of *config*.

    Only the ``'human'`` expert type is handled. It is created as a
    ``HumanExpert`` whose prompt lists every target name with its index,
    followed by an extra ``neutral`` choice, e.g. ``"0=a, 1=b, 2=neutral ? > "``.

    :param config: configuration object holding an 'expert' section with a
        'type' entry.
    :param target_names: iterable of target names (list, tuple, array, ...).
    :return: the ``HumanExpert`` instance.
    :raises ValueError: if the configured expert type is not ``'human'``.
    """
    expert_opts = cfgutil.get_section_options(config, 'expert')
    expert_type = expert_opts['type']
    if expert_type != 'human':
        # Report the offending type itself, not the whole options mapping.
        raise ValueError("Oops, cannot handle an %s expert" % (expert_type,))

    from expert.human_expert import HumanExpert

    # list() lets target_names be any iterable, not only a list
    choices = list(target_names) + ['neutral']
    prompt = ", ".join("{}={}".format(idx, label)
                       for idx, label in enumerate(choices)) + " ? > "
    return HumanExpert(None, prompt)
def get_student(self, config, pool, sequence):
    """Create the two learners and attach each one to its own pool.

    The first learner (section 'learner1') consumes ``pool[0]`` in the order
    given by *sequence*. The second (section 'learner2') consumes ``pool[1]``
    in a copy of that order shuffled with a fixed-seed ``RandomState``. Both
    are stored on the instance as ``self.learner1`` / ``self.learner2``.

    :param config: configuration object with 'learner1' and 'learner2' sections.
    :param pool: indexable holding two pool objects; their ``remaining``
        attribute is overwritten.
    :param sequence: iterable giving the order for the first learner.
    :return: tuple ``(self.learner1, self.learner2)``.
    """
    from collections import deque

    # --- first learner: original ordering ---
    first_opts = cfgutil.get_section_options(config, 'learner1')
    pool[0].remaining = deque(sequence)
    first_student = exputil.get_learner(first_opts, vct=self.vct,
                                        sent_tk=self.sent_tokenizer,
                                        seed=self.seed)
    first_name = "{}-{}".format(first_opts['utility'], first_opts['snippet'])
    self.learner1 = bunch.Bunch(student=first_student, name=first_name,
                                pool=pool[0], train=[], budget=0,
                                sequence=sequence)

    # --- second learner: reshuffled ordering ---
    second_opts = cfgutil.get_section_options(config, 'learner2')
    second_student = exputil.get_learner(second_opts, vct=self.vct,
                                         sent_tk=self.sent_tokenizer,
                                         seed=self.seed)

    # reshuffle a copy of the sequence with a fixed seed
    shuffler = np.random.RandomState(9187465)
    shuffled = list(sequence)
    shuffler.shuffle(shuffled)

    # update the pool
    pool[1].remaining = deque(shuffled)
    second_name = "{}-{}".format(second_opts['utility'], second_opts['snippet'])
    self.learner2 = bunch.Bunch(student=second_student, name=second_name,
                                pool=pool[1], train=[], budget=0,
                                sequence=shuffled)

    return self.learner1, self.learner2
def get_name(self):
    """Return the name string identifying this experiment.

    The name combines the dataset name, the learner's 'type', 'utility',
    'snippet' and 'calibration' options, and the experiment's 'fileprefix'
    option (appended last, without a separator).
    """
    learner_opts = cfgutil.get_section_options(self.config, 'learner')
    suffix = cfgutil.get_section_option(self.config, 'experiment', 'fileprefix')
    fields = (
        self.dataname,
        learner_opts['type'],
        learner_opts['utility'],
        learner_opts['snippet'],
        learner_opts['calibration'],
        suffix,
    )
    return "data-{}-lrn-{}-ut-{}-snip-{}-cal-{}{}".format(*fields)
def get_name(self):
    """Return the name string identifying this experiment.

    The name combines the dataset name, the learner's 'type' and
    'loss_function' options, and the experiment's 'fileprefix' option.
    """
    learner_opts = cfgutil.get_section_options(self.config, 'learner')
    suffix = cfgutil.get_section_option(self.config, 'experiment', 'fileprefix')
    fields = (
        self.dataname,
        learner_opts['type'],
        learner_opts['loss_function'],
        suffix,
    )
    return "data-{}-lrn-{}-ut-{}-{}".format(*fields)