def start(self):
    """Run the experiment sequentially: load the dataset, build the
    cross-validation folds, run one active-learning trial per fold, and
    report the aggregated results.

    Side effects: prints progress to stdout, sets ``self.data``, and hands
    the collected per-trial results to ``self.report_results``.
    """
    print self.get_name()
    trial = []  # accumulates one result object per cv trial
    self._setup_options(self.config)
    t0 = time()  # wall-clock reference used by every print_lap call below
    self.data = datautil.load_dataset(self.dataname, self.data_path, categories=self.data_cat,
                                      rnd=self.seed, shuffle=True, percent=self.split,
                                      keep_subject=True)
    self.print_lap("Loaded", t0)
    # self.data = self.vectorize(self.data)
    cv = self.cross_validation_data(self.data, folds=self.folds, trials=self.trials,
                                    split=self.split)
    t = 0  # trial counter; also drives the per-trial learner seed below
    for train_index, test_index in cv:
        # get the data of this cv iteration
        # train, test = exputil.sample_data(self.data, train_index, test_index)
        train, test = self._sample_data(self.data, train_index, test_index)
        self.print_lap("\nSampled", t0)
        # get the expert and student; seed (t * 10 + 10) gives each trial a
        # deterministic but distinct learner seed (10, 20, 30, ...)
        learner = exputil.get_learner(cfgutil.get_section_options(self.config, 'learner'),
                                      vct=self.vct, sent_tk=self.sent_tokenizer,
                                      seed=(t * 10 + 10), cost_model=self.cost_model)
        expert = exputil.get_expert(cfgutil.get_section_options(self.config, 'expert'),
                                    size=len(train.data))
        # the expert is (re)fit on each trial's training split
        expert.fit(train.data, y=train.target, vct=self.vct)
        # do active learning
        results = self.main_loop(learner, expert, self.budget, self.bootstrap_size, train, test)
        self.print_lap("\nTrial %s" % t, t0)
        # save the results
        trial.append(results)
        t += 1
    self.report_results(trial)
def start(self, n_jobs=1, pre_dispatch='2*n_jobs'):
    """Run the experiment with the cross-validation trials executed in
    parallel through joblib.

    :param n_jobs: number of parallel workers handed to ``Parallel``.
    :param pre_dispatch: joblib pre-dispatch policy controlling how many
        jobs are queued ahead of the workers.

    Side effects: prints progress to stdout, sets ``self.data`` (vectorized),
    and hands the collected per-trial scores to ``self.report_results``.
    """
    trial = []  # NOTE(review): unused in this variant; kept for parity with the sequential start()
    self._setup_options(self.config)
    print self.get_name()
    t0 = time()  # wall-clock reference used by print_lap
    self.data = datautil.load_dataset(self.dataname, self.data_path, categories=self.data_cat,
                                      rnd=self.seed, shuffle=True, percent=self.split,
                                      keep_subject=True)
    self.print_lap("Loaded", t0)
    self.data = self.vectorize(self.data)
    cv = self.cross_validation_data(self.data, folds=self.folds, trials=self.trials,
                                    split=self.split)
    # one deterministic seed per trial: 10, 20, 30, ...
    seeds = np.arange(len(cv)) * 10 + 10
    # unlike the sequential variant, a single expert is built and fit once on
    # the full training bag-of-words, then shared by all parallel trials
    expert = exputil.get_expert(cfgutil.get_section_options(self.config, 'expert'),
                                size=(len(self.data.train.target), self.data.train.sizes.max()))
    expert.fit(self.data.train.bow, y=self.data.train.target, vct=self.vct)
    lrnr_setup = {'vct': self.vct, "sent_tk": self.sent_tokenizer,
                  "cost_model": self.cost_model, 'validation_set': self.validation_set}
    lrnr_type = cfgutil.get_section_option(self.config, 'learner', 'type')
    neu_threshold = cfgutil.get_section_option(self.config, 'expert', 'threshold')
    # "cheat" learner types get direct access to the expert's oracle model
    # and the neutrality threshold
    if lrnr_type in ['utility-cheat', 'const-cheat', 'const-cheat-noisy']:
        lrnr_setup.update({'snip_model': expert.oracle, 'threshold': neu_threshold})
    # one learner per trial, each with its own seed
    learners = [exputil.get_learner(cfgutil.get_section_options(self.config, 'learner'),
                                    seed=s, **lrnr_setup)
                for s in seeds]
    self.print_lap("\nPreprocessed", t0)
    # ===================================
    parallel = Parallel(n_jobs=n_jobs, verbose=True, pre_dispatch=pre_dispatch)
    # check_pickle=False: presumably because the bound method would fail
    # joblib's pickling check — TODO confirm against the joblib version in use
    scores = parallel(delayed(self.main_loop_jobs, check_pickle=False)
                      (learners[t], expert, self.budget, self.bootstrap_size,
                       self.data, tr[0], tr[1], t)
                      for t, tr in enumerate(cv))
    # ===================================
    self.print_lap("\nDone trials", t0)
    # save the results
    self.report_results(scores)
def start(self):
    """Interactive two-student active-learning session against an expert.

    Two students are built over copies of the same pool: learner1 consumes
    the sampled instances in their original order, learner2 in a shuffled
    order. Both are bootstrapped with an identical seed set, then a coin
    flip alternates queries between them until the combined spent budget
    reaches ``2 * self.budget``. Finally the session is timed, all collected
    labels are saved, and both students are evaluated.

    Side effects: blocks on ``raw_input`` before starting, prints progress,
    writes results via ``self.save_results``.
    """
    import copy
    from collections import deque
    from time import time
    self.set_options(self.config)
    self.data = datautil.load_dataset(self.dataname, self.data_path, categories=self.data_cat,
                                      rnd=self.seed, shuffle=True, percent=self.split,
                                      keep_subject=True)
    # fixed instance sequence large enough for bootstrap + full budget
    sequence = self.get_sequence(len(self.data.train.target), self.budget + self.bootstrap_size)
    pool, test = self._sample_data(self.data, sequence, [])
    # pool2, _ = self._sample_data(self.data, sequence, [])
    # pool2 = copy.deepcopy(pool)
    pool2 = self.copy_pool(pool)  # independent copy so each student tracks its own remaining set
    # pool2.remaining = []
    student1, student2 = self.get_student(self.config, [pool, pool2], sequence)
    expert = self.get_expert(self.config, self.data.train.target_names)
    combined_budget = 0
    # dedicated RNG for the student-selection coin, independent of self.rnd_state
    coin = np.random.RandomState(9187465)
    i = 0  # iteration counter; iteration 0 is the bootstrap round
    # expert_labels = self.start_record()
    student = {'learner1': student1, 'learner2': student2}
    expert_times = {'learner1': [], 'learner2': []}
    expert_labels = {'learner1': self.start_record(), 'learner2': self.start_record()}
    original_sequence = []
    raw_input("\n*** Press <return> to start ***")
    t0 = time()
    while combined_budget < (2 * self.budget):
        if i == 0:
            ## Bootstrap
            # bootstrap: label an initial seed set and train both students on it
            train = self.bootstrap(student['learner1'].pool, self.bootstrap_size,
                                   bunch.Bunch(index=[], target=[]))
            student['learner1'].train = train
            # learner2 gets copies so later updates do not alias learner1's train set
            student['learner2'].train = bunch.Bunch(index=copy.copy(train.index),
                                                    target=copy.copy(train.target))
            student['learner1'].student = self.retrain(student['learner1'].student,
                                                       student['learner1'].pool,
                                                       student['learner1'].train)
            student['learner2'].student = self.retrain(student['learner2'].student,
                                                       student['learner2'].pool,
                                                       student['learner2'].train)
            # bootstrapped instances are no longer queryable for either student
            for t in train.index:
                student['learner1'].pool.remaining.remove(t)
                student['learner2'].pool.remaining.remove(t)
            tmp_list = list(student['learner1'].pool.remaining)
            # sample the query candidates without replacement
            pool_sample = self.rnd_state.choice(tmp_list, self.budget, False)
            student['learner1'].pool.remaining = deque(pool_sample)
            # remember learner1's presentation order BEFORE shuffling for learner2
            original_sequence = copy.copy(train.index) + list(pool_sample)
            self.rnd_state.shuffle(pool_sample)
            student['learner2'].pool.remaining = deque(pool_sample)
        else:
            # select student: fair coin decides who queries this round
            next_turn = coin.random_sample()
            if next_turn < .5:
                curr_student = 'learner1'
            else:
                # first1 student
                curr_student = 'learner2'
            query, labels = self.al_cycle(student[curr_student], expert)
            # print len(student['learner1'].pool.remaining), len(student['learner2'].pool.remaining)
            if query is not None and labels is not None:
                # progress
                print "\n%.1f %% completed" % (100. * combined_budget / (2 * self.budget))
                # re-train the learner
                student[curr_student].student = self.retrain(student[curr_student].student,
                                                             student[curr_student].pool,
                                                             student[curr_student].train)
                # We can evaluate later
                step_oracle = self.evaluate_oracle(query, labels,
                                                   labels=np.unique(student[curr_student].pool.target))
                # record labels
                expert_labels[curr_student] = self.record_labels(expert_labels[curr_student],
                                                                 query, labels,
                                                                 time=expert.get_annotation_time(),
                                                                 pause=expert.get_pause())
                if self.debug:
                    self._debug(student[curr_student], expert, query, step_oracle)
        # per-student budgets are presumably advanced inside al_cycle — TODO confirm
        combined_budget = student['learner1'].budget + student['learner2'].budget
        i += 1
    t1 = time()
    print "\nTotal annotation time: %.3f secs (%.3f mins)" % ((t1-t0), (t1-t0)/60)
    self.save_results(student, expert_times, expert_labels)
    ##TODO evaluate the students after getting labels
    # self.evaluate_student(student['learner1'], student['learner1'].train.index, pool, test, order=False)
    # self.evaluate_student(student['learner2'], student['learner1'].train.index, pool, test, order=True)
    t = bunch.Bunch(index=expert_labels['learner1']['index'],
                    target=expert_labels['learner1']['labels'])
    self.evaluate_student(student['learner1'].student.model, t, original_sequence, pool, test,
                          name=student['learner1'].name, order=False)
    t = bunch.Bunch(index=expert_labels['learner2']['index'],
                    target=expert_labels['learner2']['labels'])
    self.evaluate_student(student['learner2'].student.model, t, original_sequence, pool, test,
                          name=student['learner2'].name, order=True)
def start(self):
    """Two-student active-learning loop (non-interactive variant).

    Same structure as the interactive session: two students over copies of
    one pool (learner2 in shuffled order), bootstrapped identically, then
    alternating expert queries by coin flip until the combined budget
    reaches ``2 * self.budget``. Unlike the interactive variant, this one
    records labels only — no timing, pause tracking, result saving, or
    final evaluation is performed here.
    """
    import copy
    from collections import deque
    self.set_options(self.config)
    self.data = datautil.load_dataset(self.dataname, self.data_path, categories=self.data_cat,
                                      rnd=self.seed, shuffle=True, percent=self.split,
                                      keep_subject=True)
    # fixed instance sequence large enough for bootstrap + full budget
    sequence = self.get_sequence(len(self.data.train.target), self.budget + self.bootstrap_size)
    pool, test = self._sample_data(self.data, sequence, [])
    # pool2, _ = self._sample_data(self.data, sequence, [])
    # pool2 = copy.deepcopy(pool)
    pool2 = self.copy_pool(pool)  # independent copy so each student tracks its own remaining set
    # pool2.remaining = []
    student1, student2 = self.get_student(self.config, [pool, pool2], sequence)
    expert = self.get_expert(self.config, self.data.train.target_names)
    combined_budget = 0
    # dedicated RNG for the student-selection coin, independent of self.rnd_state
    coin = np.random.RandomState(9187465)
    i = 0  # iteration counter; iteration 0 is the bootstrap round
    # expert_labels = self.start_record()
    student = {'learner1': student1, 'learner2': student2}
    expert_times = {'learner1': [], 'learner2': []}
    expert_labels = {'learner1': self.start_record(), 'learner2': self.start_record()}
    original_sequence = []  # NOTE(review): assigned but never used in this variant
    while combined_budget < (2 * self.budget):
        if i == 0:
            ## Bootstrap
            # bootstrap: label an initial seed set and train both students on it
            train = self.bootstrap(student['learner1'].pool, self.bootstrap_size,
                                   bunch.Bunch(index=[], target=[]))
            student['learner1'].train = train
            # learner2 gets copies so later updates do not alias learner1's train set
            student['learner2'].train = bunch.Bunch(index=copy.copy(train.index),
                                                    target=copy.copy(train.target))
            student['learner1'].student = self.retrain(student['learner1'].student,
                                                       student['learner1'].pool,
                                                       student['learner1'].train)
            student['learner2'].student = self.retrain(student['learner2'].student,
                                                       student['learner2'].pool,
                                                       student['learner2'].train)
            # bootstrapped instances are no longer queryable for either student
            for t in train.index:
                student['learner1'].pool.remaining.remove(t)
                student['learner2'].pool.remaining.remove(t)
            tmp_list = list(student['learner1'].pool.remaining)
            # sample the query candidates without replacement; learner2 sees
            # the same candidates in shuffled order
            pool_sample = self.rnd_state.choice(tmp_list, self.budget, False)
            student['learner1'].pool.remaining = deque(pool_sample)
            self.rnd_state.shuffle(pool_sample)
            student['learner2'].pool.remaining = deque(pool_sample)
        else:
            # select student: fair coin decides who queries this round
            next_turn = coin.random_sample()
            if next_turn < .5:
                curr_student = 'learner1'
            else:
                # first1 student
                curr_student = 'learner2'
            query, labels = self.al_cycle(student[curr_student], expert)
            # print len(student['learner1'].pool.remaining), len(student['learner2'].pool.remaining)
            if query is not None and labels is not None:
                # re-train the learner
                student[curr_student].student = self.retrain(student[curr_student].student,
                                                             student[curr_student].pool,
                                                             student[curr_student].train)
                # We can evaluate later
                step_oracle = self.evaluate_oracle(query, labels,
                                                   labels=np.unique(student[curr_student].pool.target))
                # record labels
                expert_labels[curr_student] = self.record_labels(expert_labels[curr_student],
                                                                 query, labels,
                                                                 time=expert.get_annotation_time())
                if self.debug:
                    self._debug(student[curr_student], expert, query, step_oracle,
                                names=self.data.train.target_names)
        # per-student budgets are presumably advanced inside al_cycle — TODO confirm
        combined_budget = student['learner1'].budget + student['learner2'].budget
        i += 1
with open("imdb-vocab-annotated.txt") as f: lines = f.readlines() vocab = [l.strip() for l in lines] print "Dictionary size: %d" %len(vocab) #print vocab # Loading Data print "Loading the data" vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, vocabulary=vocab) if os.path.isfile("imdb-data.pkl"): data = pickle.load(open("imdb-data.pkl", 'rb')) else: data = datautil.load_dataset("imdb", "C:\\Users\\mbilgic\\Desktop\\aclIMDB", categories=None, rnd=5463, shuffle=True) data.train.data = np.array(data.train.data, dtype=object) data.test.data = np.array(data.test.data, dtype=object) pickle.dump(data, open("imdb-data.pkl", 'wb')) print "Fitting the vectorizer" data.test.bow = vct.fit_transform(data.test.data) # Fit the expert print "Training the expert" expert = LogisticRegression('l2', C=1) expert.fit(data.test.bow, data.test.target) terms = np.array(vct.get_feature_names()) coefs = expert.coef_[0]
# In[2]: ## Get the data ready import re vct = CountVectorizer(min_df=2, token_pattern=re.compile(r'(?u)\b\w+\b')) # vct_doc = CountVectorizer(encoding='ISO-8859-1', min_df=2, max_df=1.0, binary=True, token_pattern='\\b\\w+\\b') vct_doc = exputil.get_vectorizer({'vectorizer':'bow', 'limit':None, 'min_size':2}) sent_tk = nltk.data.load('tokenizers/punkt/english.pickle') imdb = load_dataset("imdb",IMDB_DATA, keep_subject=True) imdb.train.bow = vct_doc.fit_transform(imdb.train.data) imdb.test.bow = vct_doc.transform(imdb.test.data) # In[3]: class Document(object): def __init__(self, raw_text, lbl, sent_tk, vct_gral, sent_lbl=None): self.sentences = sent_tk.tokenize_sents([raw_text])[0] self.doc_label = lbl self.sent_bow = vct_gral.transform(self.sentences) # counts per sentence if sent_lbl is not None: self.sent_labels = [lbl] * len(self.sentences) else:
import utilities.experimentutils as exputil import learner import utilities.datautils as datautil import numpy as np import experiment.base as exp import nltk if __name__ == '__main__': ## Get the data ready imdb_path = 'C:/Users/mbilgic/Desktop/aclImdb/' rnd = np.random.RandomState(2345) clf = exputil.get_classifier('lrl2',parameter=1) expert = exputil.get_classifier('lrl2',parameter=1) vct = exputil.get_vectorizer({'vectorizer':"tfidf", 'limit':None, 'min_size':None}) data = datautil.load_dataset('imdb', imdb_path, categories=None, rnd=5463, shuffle=True) data.train.bow = vct.fit_transform(data.train.data) expert = exputil.get_classifier('lrl2',parameter=1) ## Set the learner options and expert sent_tk = nltk.data.load('tokenizers/punkt/english.pickle') student = learner.strategy.StructuredLearner(clf) student.set_sent_tokenizer(sent_tk) student.set_vct(vct) student.set_snippet_utility('sr') student.set_calibration(True) expert.fit(data.train.bow, data.train.target) ## Get the boostrap and train data.train.remaining = rnd.permutation(len(data.train.target)) ## balanced bootstrap