def getTrainingTestSetSplit(self, trainingSetPercentageSplit=0.8, randomStateSeed=12345):
    """Split ``self.dataset`` into a training set and a test set by article id.

    The first field of every data element (``elem[0]``) is used as the
    grouping key. The unique keys are shuffled with ``randomStateSeed`` and a
    prefix of the shuffled list is reserved for the test set, so elements
    sharing a key always land on the same side of the split.

    Args:
        trainingSetPercentageSplit: Fraction of the unique keys kept for
            training.
        randomStateSeed: Seed forwarded to ``shuffle``.

    Returns:
        Tuple ``(trainingSet, testSet)`` of ``Bunch`` objects, each carrying
        ``data``, ``target`` and ``target_names``.
    """
    print('starting getTrainingTestSetSplit')
    dataset = self.dataset

    trainingSet = Bunch()
    testSet = Bunch()

    articleIdsSet = shuffle(
        list(set([elem[0] for elem in dataset.data])),
        random_state=randomStateSeed)

    # The historical formula reserves one id fewer than the nominal share.
    # For small id sets it becomes negative; without the clamp a negative
    # slice end would move almost every id into the test set.
    numberOfTestIds = int(math.floor(
        (1.0 - trainingSetPercentageSplit) * len(articleIdsSet) - 1))
    numberOfTestIds = max(numberOfTestIds, 0)

    # A set gives O(1) membership tests in the four filters below.
    articlesIdsTestSet = set(articleIdsSet[:numberOfTestIds])

    trainingSet.data = [
        elem for elem in dataset.data
        if elem[0] not in articlesIdsTestSet
    ]
    trainingSet.target = [
        label for elemIndex, label in enumerate(dataset.target)
        if dataset.data[elemIndex][0] not in articlesIdsTestSet
    ]
    trainingSet.target_names = dataset.target_names

    testSet.data = [
        elem for elem in dataset.data
        if elem[0] in articlesIdsTestSet
    ]
    testSet.target = [
        label for elemIndex, label in enumerate(dataset.target)
        if dataset.data[elemIndex][0] in articlesIdsTestSet
    ]
    testSet.target_names = dataset.target_names

    print('ended getTrainingTestSetSplit')
    return (trainingSet, testSet)
def getTainingTestSetSplit(self, trainingSetPercentageSplit= 0.6, randomStateSeed= 12345):
    """Randomly split ``self.dataset`` into training and test ``Bunch`` objects.

    The split is delegated to ``train_test_split`` with a test share of
    ``1 - trainingSetPercentageSplit`` and ``randomStateSeed`` as the random
    state. Both returned bunches share the dataset's ``target_names``.

    Returns:
        Tuple ``(trainingSet, testSet)``.
    """
    trainingSet = Bunch()
    testSet = Bunch()

    heldOutShare = 1 - trainingSetPercentageSplit
    splitParts = train_test_split(
        self.dataset.data,
        self.dataset.target,
        test_size=heldOutShare,
        random_state=randomStateSeed)
    trainData, testData, trainTarget, testTarget = splitParts

    trainingSet.data = trainData
    trainingSet.target = trainTarget
    trainingSet.target_names = self.dataset.target_names

    testSet.data = testData
    testSet.target = testTarget
    testSet.target_names = self.dataset.target_names

    return (trainingSet, testSet)
def randomUndersamplingForBinaryClassification(self, dataset, randomSeed=12345):
    """Balance a binary dataset by randomly dropping majority-class examples.

    The data and targets are shuffled together with ``randomSeed``. Examples
    are then taken in shuffled order until each class has contributed as many
    examples as the smaller class contains. A target equal to ``1`` counts as
    positive; any other value counts as negative and is stored as ``0``.

    Args:
        dataset: Object exposing ``data``, ``target`` and ``target_names``.
        randomSeed: Seed forwarded to ``shuffle``.

    Returns:
        A new ``Bunch`` with balanced ``data`` / ``target`` lists and the
        original ``target_names``.
    """
    shuffledData, shuffledTarget = shuffle(
        dataset.data, dataset.target, random_state=randomSeed)

    numberOfPositiveExamples = sum(1 for label in shuffledTarget if label == 1)
    numberOfNegativeExamples = len(shuffledTarget) - numberOfPositiveExamples

    # Both classes are capped at the size of the smaller one.
    examplesToKeepPerClass = min(numberOfPositiveExamples,
                                 numberOfNegativeExamples)
    numberOfPositiveExamplesToKeep = examplesToKeepPerClass
    numberOfNegativeExamplesToKeep = examplesToKeepPerClass

    b2 = Bunch()
    b2.data = []
    b2.target = []
    b2.target_names = dataset.target_names

    for trainingExampleIndex, label in enumerate(shuffledTarget):
        if label == 1:
            if numberOfPositiveExamplesToKeep > 0:
                b2.data.append(shuffledData[trainingExampleIndex])
                b2.target.append(1)
                numberOfPositiveExamplesToKeep -= 1
        elif numberOfNegativeExamplesToKeep > 0:
            b2.data.append(shuffledData[trainingExampleIndex])
            b2.target.append(0)
            numberOfNegativeExamplesToKeep -= 1

    return b2
def train(self, items):
    """Fit a gradient-boosting classifier on topic-word presence features.

    Each item contributes one binary feature per topic word (1 when the word
    occurs in ``item["q"]``) and is labelled with the index of ``item["cat"]``
    in the sorted list of categories. The fitted pipeline is scored with
    3-fold cross-validation, and a classification report plus confusion
    matrix are printed.

    NOTE: the report and confusion matrix are computed on the same data the
    model was fitted on, so they describe training fit, not generalisation.

    Args:
        items: Iterable of mappings with at least the keys ``"cat"`` and
            ``"q"``.

    Returns:
        The fitted pipeline.
    """
    topic_words = self.topic(items)

    # NOTE(review): these three results are not used below. The calls are
    # kept in case the helpers have side effects on ``self`` -- confirm and
    # remove if they are pure.
    sentences = self.items2sentences(items)
    texts = self.sentences2texts(sentences)
    id2word = self.sentences2dict(sentences)

    cats = sorted(set(item["cat"] for item in items))
    cat_index = {cat: position for position, cat in enumerate(cats)}

    test1 = Bunch()
    test1.target = [cat_index[item['cat']] for item in items]
    test1.target_names = cats
    test1.data = []
    for item in items:
        question = item["q"]
        test1.data.append(
            [1 if topic_word in question else 0 for topic_word in topic_words])

    clf = Pipeline([
        ('classification', GradientBoostingClassifier())
    ])
    text_clf = clf.fit(test1.data, test1.target)

    # 3-fold cross-validation
    scores = cross_validation.cross_val_score(text_clf, test1.data,
                                              test1.target, cv=3)
    # Single-argument print() calls behave the same on Python 2 and 3.
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

    # Report and confusion matrix on the training data.
    predicted = text_clf.predict(test1.data)
    print(
        metrics.classification_report(test1.target,
                                      predicted,
                                      target_names=test1.target_names))
    print(metrics.confusion_matrix(test1.target, predicted))

    return text_clf
def combine_sessions(sessions, **kwargs):
    """Merge session data sets into a single data set.

    The result starts as a shallow copy of the first session; ``data``,
    ``meta`` and ``tmask`` of all sessions are then concatenated in order
    (``data`` with a fresh index, ``meta`` and ``tmask`` keeping their own).

    Args:
        sessions: Iterable of session bunches exposing ``data``, ``meta`` and
            ``tmask``. May be a one-shot iterator.
        **kwargs: When ``clean_meta`` is truthy, ``meta`` is passed through
            ``clean_meta(meta, **kwargs)`` and its index is reset.

    Returns:
        A ``Bunch`` with the merged frames plus ``X`` and ``y`` (2-D value
        arrays of ``data`` and ``meta``) and ``sessions`` (list of inputs).

    Raises:
        ValueError: If ``sessions`` is empty.
    """
    # Local import keeps this block self-contained; the inputs are already
    # pandas objects, so pandas is necessarily installed.
    import pandas as pd

    # Materialise once: ``sessions`` may be an iterator, and it is needed
    # again at the end for ``dataset_.sessions``.
    sessions_ = list(sessions)
    if not sessions_:
        raise ValueError("combine_sessions() needs at least one session")

    for i, session_ in enumerate(sessions_):
        print("[+] session: {}, file: {}".format(
            i, session_.meta.reset_index(drop=False).session[0])
        )

    # define dataset based on first session
    dataset_ = Bunch(**dict(sessions_[0]))

    # One concat per field replaces the repeated DataFrame.append calls.
    if len(sessions_) > 1:
        dataset_.data = pd.concat(
            [session_.data for session_ in sessions_],
            ignore_index=True, sort=False)
        dataset_.meta = pd.concat(
            [session_.meta for session_ in sessions_],
            ignore_index=False, sort=False)
        dataset_.tmask = pd.concat(
            [session_.tmask for session_ in sessions_],
            ignore_index=False, sort=False)

    # clean
    if kwargs.get('clean_meta'):
        dataset_.meta = clean_meta(dataset_.meta, **kwargs).reset_index(drop=False)

    # set X, y
    dataset_.X = dataset_.data.values.reshape(-1, dataset_.data.shape[-1])
    dataset_.y = dataset_.meta.values.reshape(-1, dataset_.meta.shape[-1])

    # cache sessions (from the materialised copy, not the consumed input)
    dataset_.sessions = list(sessions_)
    return dataset_
def createWikiDast(path_to_corpus, max_lines=364269):
    """Load the first lines of a corpus file as a dataset labelled ``1``.

    Args:
        path_to_corpus: Path of the text file to read, one example per line.
        max_lines: Maximum number of lines to load. The default matches the
            cut-off that used to be hard-coded; pass ``None`` to read the
            whole file.

    Returns:
        ``Bunch`` with ``data`` (the raw lines, newline included) and
        ``target`` (a list of ``1`` of the same length).
    """
    data = []

    with open(path_to_corpus) as raw_c:
        parent_dir = os.path.abspath(
            os.path.join(path_to_corpus, os.pardir))
        # TODO check that this is the parent directory of the file.
        print(parent_dir)

        # Iterating the file stops cleanly at EOF, so the empty end-of-file
        # sentinel is no longer stored as a bogus example.
        for lineNum, line in enumerate(raw_c, start=1):
            if lineNum % 1000000 == 0:
                print(lineNum)  # progress marker every million lines
            if max_lines is not None and lineNum > max_lines:
                break
            data.append(line)

    dast = Bunch()
    dast.data = data
    dast.target = [1] * len(data)
    return dast
def shuffleData(self, res):
    """Shuffle ``res`` and wrap it in a training ``Bunch``.

    Args:
        res: Sequence of ``(target, data)`` pairs. It is passed to
            ``shuffle`` before being unpacked.
            NOTE(review): this only reorders ``res`` if ``shuffle`` works in
            place (e.g. ``random.shuffle``); its return value is ignored.

    Returns:
        ``Bunch`` with ``data``, ``target`` and ``target_names`` (taken from
        ``self.names``).
    """
    shuffle(res)
    train = Bunch()
    # Real lists rather than map(): on Python 3 map() is a one-shot iterator
    # with no len(), which downstream estimators cannot consume.
    train.data = [pair[1] for pair in res]
    train.target = [pair[0] for pair in res]
    train.target_names = self.names
    return train
def shuffleData(self, res):
    """Shuffle ``res`` and wrap it in a training ``Bunch``.

    Args:
        res: Sequence of ``(target, data)`` pairs. It is passed to
            ``shuffle`` before being unpacked.
            NOTE(review): this only reorders ``res`` if ``shuffle`` works in
            place (e.g. ``random.shuffle``); its return value is ignored.

    Returns:
        ``Bunch`` with ``data``, ``target`` and ``target_names`` (taken from
        ``self.names``).
    """
    shuffle(res)
    train = Bunch()
    # Real lists rather than map(): on Python 3 map() is a one-shot iterator
    # with no len(), which downstream estimators cannot consume.
    train.data = [pair[1] for pair in res]
    train.target = [pair[0] for pair in res]
    train.target_names = self.names
    return train
def createFullCategoryESWiki(enPathList, simPathList):
    """Build one dataset from several pairs of corpus paths.

    For every position, ``createESWiki`` is called with the path from
    ``enPathList`` and the path at the same position in ``simPathList``; the
    returned data and targets are accumulated in order.

    Returns:
        ``Bunch`` with the combined ``data`` and ``target`` lists.
    """
    combined_data = []
    combined_target = []

    for position, en_path in enumerate(enPathList):
        sim_path = simPathList[position]
        part_data, part_target = createESWiki(en_path, sim_path)
        combined_data += part_data
        combined_target += part_target

    docs = Bunch()
    docs.data = combined_data
    docs.target = combined_target
    return docs
def get_data(whichData='train'):
    """Load the raw text documents and targets of the train or test split.

    Args:
        whichData: ``'train'`` or ``'test'``.

    Returns:
        ``Bunch`` whose ``data`` is an array holding the content of every
        file listed in the split's ``raw_text_file`` column (resolved against
        ``textPath`` and decoded as UTF-8) and whose ``target`` is the
        flattened target array.

    Raises:
        RuntimeError: If ``whichData`` is neither ``'train'`` nor ``'test'``.
    """
    if whichData == 'train':
        data = d3mds.get_train_data()
        targets = d3mds.get_train_targets()
    elif whichData == 'test':
        data = d3mds.get_test_data()
        targets = d3mds.get_test_targets()
    else:
        raise RuntimeError('get_data should be passed either train or test, but got%s'%whichData)

    # Collect into a list and convert once; np.append in a loop copies the
    # whole array on every iteration.
    texts = []
    for rf in data['raw_text_file']:
        path = os.path.join(textPath, rf)
        # The context manager closes each file as soon as it has been read.
        with open(path, encoding='utf-8') as handle:
            texts.append(handle.read())

    dataset = Bunch()
    dataset.data = np.array(texts)
    dataset.target = targets.ravel()
    return dataset
def split_data(data_set, split_amount, random_state=None):
    """Randomly split a table into training and test features/targets.

    The last column of ``data_set.values`` is the target; all preceding
    columns are features.

    Args:
        data_set: Object with a 2-D ``values`` array (e.g. a DataFrame).
        split_amount: Fraction of rows assigned to the training part; the
            training size is ``int(split_amount * n_rows)``.
        random_state: Optional seed for a reproducible split. When ``None``
            the global NumPy random state is used, as before.

    Returns:
        Tuple ``(train_data, train_target, test_data, test_target)``.
    """
    values = data_set.values
    features = values[:, 0:-1]
    targets = values[:, -1]

    n_rows = len(features)
    split_index = int(split_amount * n_rows)

    if random_state is None:
        indices = np.random.permutation(n_rows)
    else:
        indices = np.random.RandomState(random_state).permutation(n_rows)

    train_indices = indices[:split_index]
    test_indices = indices[split_index:]

    return (features[train_indices], targets[train_indices],
            features[test_indices], targets[test_indices])
def createTwitterDast(path):
    """Read a space-delimited file and return its sentences labelled ``0``.

    Every row yields one sentence made of the fields from position 6 up to,
    but excluding, the last field. Each kept field is prefixed with a single
    space, so a non-empty sentence starts with a space.

    Returns:
        ``Bunch`` with ``data`` (one sentence per row) and ``target`` (a
        ``0`` per row).
    """
    csv.field_size_limit(sys.maxsize)

    sentences = []
    labels = []
    with open(path, 'rb') as csvfile:
        rows = csv.reader(csvfile, delimiter=' ', quotechar='|')
        for fields in rows:
            # Same fields as "counter > 5 and counter + 1 < len(row)".
            kept_words = fields[6:-1]
            sentences.append("".join(" " + word for word in kept_words))
            labels.append(0)

    docs = Bunch()
    docs.data = sentences
    docs.target = labels
    return docs
def createPWKP(path):
    """Load sentence pairs from a file of blank-line-separated packs.

    The file is read as groups ("packs") of consecutive lines, each ended by
    a line holding only a newline. From every non-empty pack the first line
    is stored with target ``0`` and the second with target ``1``; further
    lines in the pack are ignored.

    Args:
        path: Path of the text file to read.

    Returns:
        ``Bunch`` with ``data`` (the selected lines) and ``target``.

    Raises:
        IndexError: If a pack contains only one line.
    """
    data = []
    target = []

    def add_pack(sentPack):
        """Append the first two lines of a pack as a (0, 1) labelled pair."""
        if not sentPack:
            return
        if len(sentPack) < 2:
            # Fail before touching data/target so they never hold half a pair.
            raise IndexError(
                "sentence pack has a single line: %r" % (sentPack[0],))
        data.append(sentPack[0])
        target.append(0)
        data.append(sentPack[1])
        target.append(1)

    sentPack = []
    with open(path) as f:
        for line in f:
            if line != "\n":
                sentPack.append(line)
                continue
            add_pack(sentPack)
            sentPack = []

    # A file that does not end with a blank line still has a pending pack;
    # the old readline() loop never terminated in that situation.
    add_pack(sentPack)

    docs = Bunch()
    docs.data = data
    docs.target = target
    return docs
def load_subject_data(dataset, index=0, mask='mask_vt', sample_mask=None, smoothing_fwhm=4, **kwargs):
    """
    Load functional data for a single haxby subject.

    The functional image at ``dataset.func[index]`` is masked with the entry
    stored under ``mask`` in ``dataset`` (indexed by ``index`` unless that
    entry is a plain string) and transformed by a ``NiftiMasker``.

    Returns a ``Bunch`` with ``data`` (DataFrame of the masked signal), ``X``
    (the raw array), ``masker``, ``mask``, ``func`` and ``subject_code`` (the
    name of the directory containing the functional file).
    """
    # extract relevant files
    func_fn = dataset.func[index]
    mask_fn = dataset.get(mask)
    mask_is_single_path = isinstance(mask_fn, str)
    if not mask_is_single_path:
        mask_fn = mask_fn[index]

    # extract data from func using the selected mask
    masker_options = dict(
        mask_img=mask_fn,
        sample_mask=sample_mask,
        standardize=True,
        detrend=True,
        smoothing_fwhm=smoothing_fwhm,
        low_pass=0.09,
        high_pass=0.008,
        t_r=2.5,
        memory="nilearn_cache",
    )
    masker = NiftiMasker(**masker_options)
    X = masker.fit_transform(func_fn)

    # return as bunch
    subject = Bunch()
    subject.data = pd.DataFrame(X)
    subject.X = X
    subject.masker = masker
    subject.mask = mask_fn
    subject.func = func_fn
    subject_dir = os.path.dirname(func_fn)
    subject.subject_code = os.path.basename(subject_dir)
    return subject
def main():
    """Run the active-learning experiment configured by the module-level ``args``.

    For each trial a student learner queries instances from the training
    pool, a simulated expert labels them at a cost, and the retrained model
    is evaluated (accuracy and AUC) on the test split after every query. The
    first query of a trial is a bootstrap that uses ground-truth labels and
    is not charged to the budget. Results are printed at the end.
    """
    accuracies = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='latin-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(10, args.fixk)

    # A negative fixk means "use the full documents".
    if args.fixk < 0:
        args.fixk = None

    data, vct = load_from_file(args.train, categories, args.fixk, min_size, vct)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    #### COST MODEL
    parameters = parse_parameters_mat(args.cost_model)
    print("Cost Parameters %s" % parameters)
    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print("\nCost Model: %s" % cost_model.__class__.__name__)

    #### ACCURACY MODEL
    accu_parameters = parse_parameters_mat(args.accu_model)

    #### CLASSIFIER
    clf = set_classifier(args.classifier)
    print("\nClassifier: %s" % clf)

    #### EXPERT MODEL
    if "fixed" in args.expert:
        # average value of accuracy of the experts
        expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0],
                                                cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "linear" in args.expert:
        raise Exception("We do not know linear yet!!")
    elif "log" in args.expert:
        expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
    elif "direct" in args.expert:
        expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
    elif "neutral" in args.expert:
        exp_clf = LogisticRegression(penalty='l1', C=1)
        exp_clf.fit(data.test.bow, data.test.target)
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    else:
        raise Exception("We need a defined cost function options [fixed|log|linear]")

    # NOTE(review): this second model only feeds ``coef``, which is used
    # solely by the disabled print_features() call below -- confirm whether
    # the refit is still wanted.
    exp_clf = LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    print("\nExpert: %s " % expert)
    coef = exp_clf.coef_[0]
    # print_features(coef, vct.get_feature_names())

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    # Report the min_size actually used instead of a hard-coded 50.
    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(
        step_size, bootstrap_size, evaluation_points, args.fixk, min_size))

    t0 = time.time()
    tac = []
    tau = []

    ### experiment starts
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []
        print("*" * 60)
        print("Trial: %s" % t)

        # NOTE(review): ``args.student in "unc"`` is true when args.student
        # is a substring of "unc" (including ""), not when it contains
        # "unc" -- confirm the intended direction.
        if args.student in "unc":
            student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget,
                                                        seed=t, subpool=250)
        else:
            student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget,
                                                           seed=t)
        print("\nStudent: %s " % student)

        train_indices = []
        train_x = []
        train_y = []

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # full words, for training
        if args.fixk is None:
            pool.fixk = data.train.bow.tocsr()
        else:
            pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False
        current_cost = 0
        iteration = 0

        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)
                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print("Bootstrap: %s " % bt.__class__.__name__)
                print("")
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.data[query_index]
            # Every queried instance has unit size.
            query_size = [1] * query.shape[0]
            ground_truth = pool.target[query_index]

            if iteration == 0:
                ## bootstrap uses ground truth and costs nothing
                labels = ground_truth
                spent = [0] * len(ground_truth)
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            query_cost = np.array(spent).sum()
            current_cost += query_cost

            # remove queried instances from the pool
            pool.remaining.difference_update(query_index)

            # Only answers for which the expert returned a label are added
            # to the training set.
            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])
                train_x = pool.data[train_indices]  ## train with all the words
                train_y.extend(useful_answers[:, 1])

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # retrain the model
            current_model = student.train(train_x, train_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])
            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]
            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(
                len(train_indices), accu, auc, query_cost, current_cost, format_spent(spent)))

            ## the results should be based on the cost of the labeling
            if iteration > 0:  # bootstrap iteration
                student.budget -= query_cost  ## Bootstrap doesn't count

            x_axis_range = current_cost
            x_axis[x_axis_range].append(current_cost)

            ## save results
            accuracies[x_axis_range].append(accu)
            aucs[x_axis_range].append(auc)
            trial_accu.append([x_axis_range, accu])
            trial_aucs.append([x_axis_range, auc])
            iteration += 1
        # end of budget loop
        tac.append(trial_accu)
        tau.append(trial_aucs)
    # end trial loop

    # NOTE(review): substring test -- true unless args.cost_function is a
    # substring of "uniform"; confirm this is the intended check.
    if args.cost_function not in "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
# -*- coding: utf-8 -*-
import pandas
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler
from sklearn.datasets.base import Bunch

# 1. Load the training and test samples from perceptron-train.csv and
# perceptron-test.csv. The target variable is stored in the first column,
# the features in the second and third.
data = pandas.read_csv('perceptron-train.csv', header=None)
train, test = Bunch(), Bunch()
train.data, train.target = data.loc[:, 1:], data.loc[:, 0]
data = pandas.read_csv('perceptron-test.csv', header=None)
test.data, test.target = data.loc[:, 1:], data.loc[:, 0]

# 2. Train a perceptron with default parameters and random_state=241.
perc = Perceptron(random_state=241)
perc.fit(train.data, train.target)  # learning

# 3. Compute the quality (share of correctly classified objects, accuracy)
# of the resulting classifier on the test sample.
accuracy = perc.score(test.data, test.target)  # predicting
# Function-call form prints the same value on Python 2 and Python 3.
print(accuracy)

# 4. Normalize the training and test samples with the StandardScaler class.
import gzip
import pickle

import numpy as np
from skimage import io
from sklearn.datasets.base import Bunch

from dip.load_data import load_image_files, load_mask_images
from dip.mask import bounding_rect_of_mask

# Compute the bounding rectangle of every mask image and store the result,
# together with the dataset's file names and targets, in rects.pkl.gz.
datasets = load_mask_images()
data = []
for f, mask in zip(
        datasets.filenames,
        load_image_files(datasets.filenames),
):
    # NOTE(review): the element order of ``rect`` is not visible here. The
    # description stored below says (min_x, min_y, max_x, max_y), while an
    # earlier comment said (min_x, max_x, min_y, max_x) -- confirm against
    # bounding_rect_of_mask.
    rect = bounding_rect_of_mask(mask, negative=True)
    data.append(list(rect))
    print('{0}: {1}'.format(f, rect))

bunch = Bunch(name='mask rects')
bunch.data = np.array(data)
bunch.filenames = datasets.filenames
bunch.target = datasets.target
bunch.target_names = datasets.target_names
bunch.description = 'mask rects: (min_x, min_y, max_x, max_y)'

# gzip and pickle are imported above; they were previously used without an
# import, which raised NameError here.
with gzip.open('rects.pkl.gz', 'wb') as f:
    pickle.dump(bunch, f)
def mlviz_two(_, a, b, c):
    """Train an XGBoost classifier on an income dataset and return its accuracy.

    Args:
        _: Unused.
        a: DataFrame used to derive the target names, feature names and the
            categories of every ``object``-typed column. It must contain an
            ``income`` column of ``object`` dtype.
        b: Training DataFrame; its last feature name is the target column.
        c: Test DataFrame with the same columns. A trailing ``"."`` is
            stripped from its target labels before encoding.

    Returns:
        Test accuracy of the model trained with the last parameter
        combination of the (currently single-point) grid.
    """
    import itertools

    import pandas as pd
    import xgboost as xgb
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.impute import SimpleImputer
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import LabelEncoder
    from sklearn.svm import SVC
    # sklearn.datasets.base was removed from scikit-learn; Bunch is
    # available from sklearn.utils.
    from sklearn.utils import Bunch

    data = a
    train = b
    test = c

    meta = {
        'target_names': list(data.income.unique()),
        'feature_names': list(data.columns),
        'categorical_features': {
            column: list(data[column].unique())
            for column in data.columns
            if data[column].dtype == 'object'
        }
    }

    names = meta['feature_names']
    # The target is categorical too, but it is encoded separately below.
    meta['categorical_features'].pop('income')

    dataset = Bunch(data=train[names[:-1]],
                    target=train[names[-1]],
                    data_test=test[names[:-1]],
                    target_test=test[names[-1]],
                    target_names=meta['target_names'],
                    feature_names=meta['feature_names'],
                    categorical_features=meta['categorical_features'],
                    DESCR="descr")

    class EncodeCategorical(BaseEstimator, TransformerMixin):
        """
        Encodes a specified list of columns or all columns if None.
        """

        def __init__(self, columns=None):
            self.columns = columns
            self.encoders = None

        def fit(self, data, target=None):
            """
            Expects a data frame with named columns to encode.
            """
            # Encode all columns if columns is None
            if self.columns is None:
                self.columns = data.columns

            # Fit a label encoder for each column in the data frame
            self.encoders = {
                column: LabelEncoder().fit(data[column])
                for column in self.columns
            }
            return self

        def transform(self, data):
            """
            Uses the encoders to transform a data frame.
            """
            output = data.copy()
            for column, encoder in self.encoders.items():
                output[column] = encoder.transform(data[column])
            return output

    # Fit one set of encoders on train and test together, so that a category
    # maps to the same integer in both frames and no value is "unseen".
    # Fitting separately per frame gives inconsistent codes whenever the two
    # frames do not contain exactly the same categories.
    encoder = EncodeCategorical(dataset.categorical_features.keys())
    encoder.fit(pd.concat([dataset.data, dataset.data_test]))
    dataset.data = encoder.transform(dataset.data)
    dataset.data_test = encoder.transform(dataset.data_test)

    class ImputeCategorical(BaseEstimator, TransformerMixin):
        """
        Imputes a specified list of columns or all columns if None.
        """

        def __init__(self, columns=None):
            self.columns = columns
            self.imputer = None

        def fit(self, data, target=None):
            """
            Expects a data frame with named columns to impute.
            """
            # Impute all columns if columns is None
            if self.columns is None:
                self.columns = data.columns

            # One imputer replaces the value 0 by each column's most
            # frequent value.
            self.imputer = SimpleImputer(missing_values=0,
                                         strategy='most_frequent')
            self.imputer.fit(data[self.columns])
            return self

        def transform(self, data):
            """
            Uses the fitted imputer to transform a data frame.
            """
            output = data.copy()
            output[self.columns] = self.imputer.transform(output[self.columns])
            return output

    # Learn the fill values on the training frame only, then apply them to
    # both frames.
    imputer = ImputeCategorical(['workclass', 'native-country', 'occupation'])
    dataset.data = imputer.fit_transform(dataset.data)
    dataset.data_test = imputer.transform(dataset.data_test)

    X_train = dataset.data
    yencode = LabelEncoder().fit(dataset.target)
    y_train = yencode.transform(dataset.target)

    X_test = dataset.data_test
    y_test = yencode.transform([y.rstrip(".") for y in dataset.target_test])

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    def grid_test_xgboost(colsample_tree, subsample, max_depth,
                          min_child_weight, eta):
        """Train one XGBoost model and return its accuracy on the test set."""
        params = {
            'objective': 'multi:softprob',
            'num_class': 2,
            'eval_metric': 'mlogloss',
            'max_depth': max_depth,
            'min_child_weight': min_child_weight,
            'eta': eta,
            'subsample': subsample,
            'colsample_bytree': colsample_tree
        }
        model = xgb.train(params,
                          dtrain,
                          evals=[(dtrain, 'train')],
                          verbose_eval=False)

        y_proba = model.predict(dtest)
        y_pred = y_proba.argmax(axis=1)
        return accuracy_score(y_test, y_pred)

    def grid_test_svm(kernel, gamma, C):
        """Train one SVC and return its test accuracy (currently unused)."""
        clf = SVC(kernel=kernel, gamma=gamma, C=C).fit(X_train, y_train)
        return clf.score(X_test, y_test)

    # Parameter grid; wider candidate values that were tried before:
    #   max_depth = [1, 10], min_child_weight = [1, 10],
    #   eta = [.9, .3, .01, .005]
    colsample_tree = [1.0]
    subsample = [1.0]
    max_depth = [1]
    min_child_weight = [1]
    eta = [.9]

    # product() visits combinations in the same order as nested loops (last
    # list varies fastest); the accuracy of the last combination is returned.
    val = None
    for combination in itertools.product(colsample_tree, subsample, max_depth,
                                         min_child_weight, eta):
        val = grid_test_xgboost(*combination)
    return val
def _iter_category_images(parent_path):
    """Yield ``(category, file_path)`` for every file in each sub-directory."""
    for category in os.listdir(parent_path):
        full_category_path = os.path.join(parent_path, category)
        if not os.path.isdir(full_category_path):
            continue
        for file_name in os.listdir(full_category_path):
            yield category, os.path.join(full_category_path, file_name)


def _hog_descriptor(image_path):
    """Return the HOG descriptor of an image resized to 30x30 grayscale."""
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.resize(image, (30, 30), interpolation=cv2.INTER_AREA)
    image = np.array(image, 'int16')
    return hog(image.reshape((30, 30)),
               orientations=9,
               pixels_per_cell=(14, 14),
               cells_per_block=(1, 1),
               visualise=False)


def train(classify_name):
    """Train an SVM on HOG features of the images under ``classify_name``.

    Every sub-directory of ``classify_name`` is a category whose name is the
    label of the files it contains. The standardised HOG features are used to
    fit an SVC; the classifier and scaler are saved to
    ``PKL/<classify_name>.pkl``. The classifier is then run on the same
    images and every path and prediction is printed.

    Args:
        classify_name: Directory holding one sub-directory per category; also
            used as the base name of the saved pickle.

    Returns:
        Tuple ``(correct, incorrect, incorrect_list)`` where
        ``incorrect_list`` maps each misclassified file path to the
        predicted label.
    """
    samples = list(_iter_category_images(classify_name))

    # Descriptors are computed once and reused for the self-test below.
    raw_descriptors = [_hog_descriptor(path) for _, path in samples]

    labels = np.array([category for category, _ in samples], 'string')
    hog_features = np.array(raw_descriptors, 'float64')

    # Normalize the features
    pp = preprocessing.StandardScaler().fit(hog_features)
    hog_features = pp.transform(hog_features)

    print("training...")
    from sklearn import svm
    clf = svm.SVC(gamma=0.001, C=100.)
    clf.fit(hog_features, labels)

    # Save the classifier together with the scaler it depends on
    from sklearn.externals import joblib
    joblib.dump((clf, pp), 'PKL/%s.pkl' % classify_name, compress=3)

    print("testing...")
    correct = 0
    incorrect = 0
    incorrect_list = {}
    for (category, full_file_path), descriptor in zip(samples, raw_descriptors):
        pic_hog_fd = pp.transform(np.array([descriptor], 'float64'))
        print(full_file_path)
        prediction = clf.predict(pic_hog_fd)[0]
        print(prediction)
        if prediction == category:
            correct += 1
        else:
            incorrect += 1
            incorrect_list[full_file_path] = prediction
    return correct, incorrect, incorrect_list
def getCompleteDataset(self):
    """Return a new ``Bunch`` referencing the dataset's data, target and target_names."""
    source = self.dataset
    b = Bunch()
    for fieldName in ("data", "target", "target_names"):
        setattr(b, fieldName, getattr(source, fieldName))
    return b
else: print(filename, 'is not a regular file.', file=sys.stderr, flush=True) print(status_update(filename, no_samples, end), flush=True) no_samples += 1 if len(data) is 0 or len(target) is 0: print('Data array collection error: no data found.', file=sys.stderr, flush=True) exit(1) X = np.asarray(data) y = np.asarray(target) samples = Bunch() samples.data = data samples.target = target samples_file = path.join(args.directory, 'poly2d.pkl.xz') joblib.dump(samples, samples_file) cv_neighbors = 5 knn = KNeighborsClassifier(n_neighbors=cv_neighbors, n_jobs=-1) knn.fit(X, y) model_file = path.join(args.directory, 'knn_model.pkl.xz') joblib.dump(knn, model_file) cv_folds = 5 try: scores = cross_val_score(knn, X, y, cv=cv_folds) except ValueError as e: message = 'Error computing cross_val_score.'
def main():
    """Run the random-sampling active-learning experiment configured by ``args``.

    The dataset is chosen from ``args.train``. For each trial a random
    sampling learner queries instances from the training pool, a simulated
    expert labels them at a cost based on the number of tokens shown, and the
    retrained model is evaluated (accuracy and AUC) on the test split after
    every query. The first query of a trial is a bootstrap that uses
    ground-truth labels and is not charged to the budget.
    """
    # The body used both ``time.time()`` and ``time()``, which cannot both
    # work under one import; a local module import makes it consistent.
    import time

    accuracies = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(50, args.fixk)

    if "imdb" in args.train:
        ########## IMDB MOVIE REVIEWS ###########
        data = load_imdb(args.train, shuffle=True, rnd=2356, vct=vct, min_size=min_size,
                         fix_k=args.fixk)  # should brind data as is
    elif "aviation" in args.train:
        raise Exception("We are not ready for that data yet")
    elif "20news" in args.train:
        ########## 20 news groups ######
        data = load_20newsgroups(categories=categories[0], vectorizer=vct, min_size=min_size,
                                 fix_k=args.fixk)  # for testing purposes
    elif "dummy" in args.train:
        ########## DUMMY DATA###########
        data = load_dummy("C:/Users/mramire8/Documents/code/python/data/dummy", shuffle=True, rnd=2356,
                          vct=vct, min_size=0, fix_k=args.fixk)
    else:
        raise Exception("We do not know that dataset")

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    #### COST MODEL
    parameters = parse_parameters(args.cost_model)
    print("Cost Parameters %s" % parameters)
    cost_model = set_cost_model(parameters)
    print("\nCost Model: %s" % cost_model.__class__.__name__)

    #### ACCURACY MODEL
    accu_parameters = parse_parameters_mat(args.accu_model)
    print("Accuracy Parameters %s" % accu_parameters)

    #### CLASSIFIER
    alpha = 1
    clf = MultinomialNB(alpha=alpha)
    print("\nClassifier: %s" % clf)

    #### EXPERT MODEL
    if "fixed" in args.expert:
        # average value of accuracy of the experts
        expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0],
                                                cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "linear" in args.expert:
        raise Exception("We do not know linear yet!!")
    elif "log" in args.expert:
        expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
    elif "direct" in args.expert:
        expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
    else:
        raise Exception("We need a defined cost function options [fixed|log|linear]")

    print("\nExpert: %s " % expert)

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    # Report the min_size actually used instead of a hard-coded 50.
    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(
        step_size, bootstrap_size, evaluation_points, args.fixk, min_size))

    t0 = time.time()

    ### experiment starts
    for t in range(args.trials):
        print("*" * 60)
        print("Trial: %s" % t)
        # TODO shuffle the data??
        student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print("\nStudent: %s " % student)

        train_indices = []
        train_x = []
        train_y = []

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # full words, for training
        pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False
        current_cost = 0
        iteration = 0

        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)
                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print("Bootstrap: %s " % bt.__class__.__name__)
                print("")
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            # Size of each query = number of tokens in its k-word snippet.
            query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]
            ground_truth = pool.target[query_index]

            if iteration == 0:
                ## bootstrap uses ground truth
                labels = ground_truth
            else:
                labels = expert.label_instances(query_size, ground_truth)

            # The cost is estimated for every query, bootstrap included; the
            # bootstrap cost is simply never deducted from the budget below.
            spent = expert.estimate_instances(query_size)
            query_cost = np.array(spent).sum()
            current_cost += query_cost

            train_indices.extend(query_index)

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # add labels to training
            train_x = pool.data[train_indices]  ## train with all the words

            # update labels with the expert labels
            train_y.extend(labels)
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # retrain the model
            current_model = student.train(train_x, train_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])
            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]
            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(
                len(train_indices), accu, auc, query_cost, current_cost, spent))

            ## the results should be based on the cost of the labeling
            if iteration > 0:  # bootstrap iteration
                student.budget -= query_cost  ## Bootstrap doesn't count

            x_axis_range = current_cost
            x_axis[x_axis_range].append(current_cost)

            ## save results
            accuracies[x_axis_range].append(accu)
            aucs[x_axis_range].append(auc)
            iteration += 1

    print("Elapsed time %.3f" % (time.time() - t0))
    print_results(x_axis, accuracies, aucs)
def getCrossValidationSplits(self, completeDataset, nSplits):
    """Build article-grouped train/dev/test folds and pickle them to disk.

    Every learning instance is assigned to a group according to its
    "ArticleId", so that GroupKFold never places instances of the same
    article on both sides of a split.  For each of the ``nSplits`` outer
    folds the held-out part becomes the test set; the remaining part is
    split again with GroupKFold (same ``nSplits``) and only the first inner
    split is used to separate the training and development sets.

    Parameters:
        completeDataset: object exposing ``data``, ``target`` and
            ``target_names``.  ``data`` and ``target`` are indexed with
            index arrays below, so they must support that kind of indexing;
            each element of ``data`` is looked up with the "ArticleId" key.
        nSplits: number of folds, used for both the outer and inner split.

    Side effects:
        Writes two pickle files under
        ``<corpusPath>/FoldsPartition/<corpusFilename>_DatasetPartition_<nSplits>``
        with suffixes ``Folds.pkl`` (list of (train, dev, test) Bunch
        triples) and ``FoldsIndexes.pkl`` (list of the matching index-array
        triples).

    Returns:
        True once both files have been written.
    """
    articleIds = [elem["ArticleId"] for elem in completeDataset.data]

    # GroupKFold needs one group label per instance; give every distinct
    # article id its own integer key.
    groupKeyByArticleId = {
        articleId: groupKey
        for groupKey, articleId in enumerate(set(articleIds))
    }
    instanceGroups = np.asarray(
        [groupKeyByArticleId[articleId] for articleId in articleIds])

    def buildSubset(indexes):
        # Bundle the selected instances together with the shared label names.
        subset = Bunch()
        subset.data = completeDataset.data[indexes]
        subset.target = completeDataset.target[indexes]
        subset.target_names = completeDataset.target_names
        return subset

    outerFolds = GroupKFold(n_splits=nSplits).split(
        completeDataset.data, completeDataset.target, instanceGroups)

    foldsPartition = []
    foldsPartitionIndexes = []

    for trainDevIdx, testIdx in outerFolds:
        # Split the non-test part again; only the first inner split is used.
        innerFolds = GroupKFold(n_splits=nSplits).split(
            completeDataset.data[trainDevIdx],
            completeDataset.target[trainDevIdx],
            instanceGroups[trainDevIdx])
        innerTrainPositions, innerDevPositions = next(iter(innerFolds))

        # Inner positions are relative to trainDevIdx; map them back to
        # indexes into the complete dataset.
        trainIdx = trainDevIdx[innerTrainPositions]
        devIdx = trainDevIdx[innerDevPositions]

        foldsPartition.append(
            (buildSubset(trainIdx), buildSubset(devIdx), buildSubset(testIdx)))
        foldsPartitionIndexes.append((trainIdx, devIdx, testIdx))

    outputPrefix = (str(self.corpusPath) + "/" + "FoldsPartition" + "/" +
                    str(self.corpusFilename) + "_DatasetPartition_" +
                    str(nSplits))

    # Pickle streams are byte streams: write them through a binary handle
    # (not a text-encoding wrapper) and let ``with`` close the file even if
    # dumping fails.

    # store training, development and test set on pickle file
    with open(outputPrefix + "Folds.pkl", "wb") as foldsFile:
        pickle.dump(foldsPartition, foldsFile)

    # store learning instances indexes for training, development and test set
    with open(outputPrefix + "FoldsIndexes.pkl", "wb") as indexesFile:
        pickle.dump(foldsPartitionIndexes, indexesFile)

    return True
stringInput) return stringInput trainingStrings = [] trainingStringCats = [] i = 0 for cat in trainingData: for trainingString in trainingData[cat]: trainingStrings.append(doRegexReplacement(trainingString)) trainingStringCats.append(i) i += 1 chat = Bunch() chat.data = trainingStrings chat.target = trainingStringCats chat.target_names = trainingData.keys() text_clf = Pipeline([ ('vect', CountVectorizer(stop_words=['please'])), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)), ]) _ = text_clf.fit(chat.data, chat.target) nextFunc = None
def translateDataset(self, sourceLanguage, targetLanguage):
    """Translate the training, validation and test sets and pickle the result.

    For every learning instance of ``self.trainingSet``,
    ``self.validationSet`` and ``self.testSet`` the "SourceADU" and
    "TargetADU" texts are translated from ``sourceLanguage`` to
    ``targetLanguage``, re-tokenized with ``self.myTokenizer`` and stored
    together with the original "ArticleId" and target label.

    Parameters:
        sourceLanguage: language code passed as ``from_lang`` to the
            translation call.
        targetLanguage: language code passed as ``to`` to the translation
            call; also used in the output file name.

    Side effects:
        Writes ``<corpusPath>/<corpusFilename>_translatedTo_<targetLanguage>.pkl``
        containing a dict with the keys "trainingSet", "validationSet" and
        "testSet".
    """

    def translateText(text):
        # NOTE(review): `tb` is defined outside this block (presumably a
        # TextBlob alias); its result is coerced with str() as before.
        return str(tb(text).translate(from_lang=sourceLanguage,
                                      to=targetLanguage))

    def translateSplit(split):
        # Build a translated copy of one dataset split.
        translated = Bunch()
        translated.data = []
        translated.target = []
        translated.target_names = ["None", "Support"]

        for instanceIndex, label in enumerate(split.target):
            instance = split.data[instanceIndex]
            sourceADUContent = translateText(instance["SourceADU"])
            targetADUContent = translateText(instance["TargetADU"])

            translated.data.append({
                "SourceADU": sourceADUContent,
                "TargetADU": targetADUContent,
                "SourceADU_tokens": self.myTokenizer(
                    sourceADUContent, lowercase=True,
                    removePunctuationMarks=False),
                "TargetADU_tokens": self.myTokenizer(
                    targetADUContent, lowercase=True,
                    removePunctuationMarks=False),
                "ArticleId": instance["ArticleId"]
            })
            # Labels go to `.target`; the original test-set loop appended to
            # `.data.target`, which does not exist on a list.
            translated.target.append(label)

        return translated

    print("\n Translating training set ...")
    trainingSetTranslations = translateSplit(self.trainingSet)

    print("\n Translating validation set ...")
    validationSetTranslations = translateSplit(self.validationSet)

    print("\n Translating test set ...")
    testSetTranslations = translateSplit(self.testSet)

    translationsPath = (self.corpusPath + "/" + self.corpusFilename +
                        "_translatedTo_" + targetLanguage + ".pkl")

    # Pickle data is a byte stream: use a binary handle and close it even if
    # dumping fails.
    with open(translationsPath, "wb") as translationsFile:
        pickle.dump(
            {
                "trainingSet": trainingSetTranslations,
                "validationSet": validationSetTranslations,
                "testSet": testSetTranslations
            }, translationsFile)

    print("[Done] Translations")
def main(): print args print accuracies = defaultdict(lambda: []) ora_accu = defaultdict(lambda: []) oracle_accuracies =[] ora_cm = defaultdict(lambda: []) lbl_dit = defaultdict(lambda: []) aucs = defaultdict(lambda: []) x_axis = defaultdict(lambda: []) vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1), token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer()) print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = 10 args.fixk = None data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) parameters = experiment_utils.parse_parameters_mat(args.cost_model) print "Cost Parameters %s" % parameters cost_model = experiment_utils.set_cost_model(args.cost_function, parameters=parameters) print "\nCost Model: %s" % cost_model.__class__.__name__ ### SENTENCE TRANSFORMATION if args.train == "twitter": sent_detector = TwitterSentenceTokenizer() else: sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') ## delete <br> to "." 
to recognize as end of sentence data.train.data = experiment_utils.clean_html(data.train.data) data.test.data = experiment_utils.clean_html(data.test.data) print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0])) ## Get the features of the sentence dataset ## create splits of data: pool, test, oracle, sentences expert_data = Bunch() if not args.fulloracle: train_test_data = Bunch() expert_data.sentence, train_test_data.pool = split_data(data.train) expert_data.oracle, train_test_data.test = split_data(data.test) data.train.data = train_test_data.pool.train.data data.train.target = train_test_data.pool.train.target data.test.data = train_test_data.test.train.data data.test.target = train_test_data.test.train.target ## convert document to matrix data.train.bow = vct.fit_transform(data.train.data) data.test.bow = vct.transform(data.test.data) #### EXPERT CLASSIFIER: ORACLE print("Training Oracle expert") exp_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty) if not args.fulloracle: print "Training expert documents:%s" % len(expert_data.oracle.train.data) labels, sent_train = experiment_utils.split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=args.limit) expert_data.oracle.train.data = sent_train expert_data.oracle.train.target = np.array(labels) expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data) exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target) else: # expert_data.data = np.concatenate((data.train.data, data.test.data)) # expert_data.target = np.concatenate((data.train.target, data.test.target)) expert_data.data =data.train.data expert_data.target = data.train.target expert_data.target_names = data.train.target_names labels, sent_train = experiment_utils.split_data_sentences(expert_data, sent_detector, vct, limit=args.limit) expert_data.bow = vct.transform(sent_train) expert_data.target = labels expert_data.data 
= sent_train exp_clf.fit(expert_data.bow, expert_data.target) if "neutral" in args.expert: expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold, cost_function=cost_model.cost_function) elif "true" in args.expert: expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function) elif "pred" in args.expert: expert = baseexpert.PredictingExpert(exp_clf, #threshold=args.neutral_threshold, cost_function=cost_model.cost_function) elif "human" in args.expert: expert = baseexpert.HumanExpert(", ".join(["{}={}".format(a,b) for a,b in enumerate(data.train.target_names)])+"? > ") else: raise Exception("We need an expert!") print "\nExpert: %s " % expert #### EXPERT CLASSIFIER: SENTENCES print("Training sentence expert") sent_clf = None if args.cheating: labels, sent_train = experiment_utils.split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=args.limit) expert_data.sentence.train.data = sent_train expert_data.sentence.train.target = np.array(labels) expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data) sent_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty) sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target) #### STUDENT CLASSIFIER clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty) print "\nStudent Classifier: %s" % clf print "\nSentence Classifier: %s" % sent_clf print "\nExpert Oracle Classifier: %s" % exp_clf print "\nPenalty Oracle:", exp_clf.C print "\nVectorizer: %s" % vct #### ACTIVE LEARNING SETTINGS step_size = args.step_size bootstrap_size = args.bootstrap evaluation_points = 200 print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size, evaluation_points, args.fixk, min_size)) print ("Anytime active learning experiment - use objective function to pick data") t0 = time.time() tac = [] tau = [] ### experiment starts for t in 
range(args.trials): trial_accu = [] trial_aucs = [] print "*" * 60 print "Trial: %s" % t student = get_student(clf, cost_model, sent_clf, sent_detector, vct) student.human_mode = args.expert == 'human' print "\nStudent: %s " % student train_indices = [] neutral_data = [] # save the xik vectors train_x = [] train_y = [] neu_x = [] # data to train the classifier neu_y = np.array([]) pool = Bunch() pool.data = data.train.bow.tocsr() # full words, for training pool.text = data.train.data pool.target = data.train.target pool.predicted = [] pool.remaining = set(range(pool.data.shape[0])) # indices of the pool bootstrapped = False current_cost = 0 iteration = 0 query_index = None query_size = None oracle_answers = 0 calibrated=args.calibrate while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter: util = [] if not bootstrapped: ## random from each bootstrap bt = randomsampling.BootstrapFromEach(t * 10) query_index = bt.bootstrap(pool=pool, k=bootstrap_size) bootstrapped = True query = pool.data[query_index] print "Bootstrap: %s " % bt.__class__.__name__ print else: chosen = student.pick_next(pool=pool, step_size=step_size) query_index = [x for x, y in chosen] # document id of chosen instances query = [y[0] for x, y in chosen] # sentence of the document query_size = [1] * len(query_index) ground_truth = pool.target[query_index] if iteration == 0: ## bootstrap uses ground truth labels = ground_truth spent = [0] * len(ground_truth) ## bootstrap cost is ignored else: # print "ask labels" labels = expert.label_instances(query, ground_truth) spent = expert.estimate_instances(query_size) ### accumulate the cost of the query query_cost = np.array(spent).sum() current_cost += query_cost useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None]) neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \ if iteration != 0 else np.array([]) ## add data recent acquired to 
train if useful_answers.shape[0] != 0: train_indices.extend(useful_answers[:, 0]) # add labels to training train_x = pool.data[train_indices] # # train with all the words # update labels with the expert labels train_y.extend(useful_answers[:, 1]) neu_x, neu_y, neutral_data = update_sentence(neutral_data, neu_x, neu_y, labels, query_index, pool, vct) # neu_x, neu_y, neutral_data = update_sentence_query(neutral_data, neu_x, neu_y, query, labels) if neu_y.shape[0] != neu_x.shape[0]: raise Exception("Training data corrupted!") if train_x.shape[0] != len(train_y): raise Exception("Training data corrupted!") # remove labels from pool pool.remaining.difference_update(query_index) # retrain the model current_model = student.train_all(train_x, train_y, neu_x, neu_y) # evaluate and save results y_probas = current_model.predict_proba(data.test.bow) auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1]) pred_y = current_model.classes_[np.argmax(y_probas, axis=1)] correct_labels = (np.array(ground_truth) == np.array(labels).reshape(len(labels))).sum() accu = metrics.accuracy_score(data.test.target, pred_y) print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tGT:{5}\tneu:{6}\t{7}\tND:{8}\tTD:{9}\t ora_accu:{10}".format( len(train_indices), accu, auc, query_cost, current_cost, ground_truth, len(neutral_answers), neu_y.shape[0], neu_y.sum(), np.array(train_y).sum(), correct_labels)) ## the results should be based on the cost of the labeling if iteration > 0: # bootstrap iteration student.budget -= query_cost ## Bootstrap doesn't count # oracle accuracy (from queries) oracle_answers += correct_labels x_axis_range = current_cost x_axis[x_axis_range].append(current_cost) ## save results accuracies[x_axis_range].append(accu) aucs[x_axis_range].append(auc) ora_accu[x_axis_range].append(1. 
* correct_labels) ora_cm[x_axis_range].append(metrics.confusion_matrix(ground_truth, labels, labels=np.unique(train_y))) lbl_dit[x_axis_range].append(np.sum(train_y)) # partial trial results trial_accu.append([x_axis_range, accu]) trial_aucs.append([x_axis_range, auc]) # oracle_accuracies[x_axis_range].append(oracle_answers) iteration += 1 # end of budget loop tac.append(trial_accu) tau.append(trial_aucs) oracle_accuracies.append(1.*oracle_answers / (len(train_indices)-bootstrap_size)) print "Trial: {}, Oracle right answers: {}, Iteration: {}, Labels:{}, ACCU-OR:{}".format(t, oracle_answers, iteration, len(train_indices)-bootstrap_size,1.*oracle_answers / (len(train_indices)-bootstrap_size)) #end trial loop if args.cost_function not in "uniform": accuracies = experiment_utils.extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size) aucs = experiment_utils.extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size) print "\nAverage oracle accuracy: ", np.array(oracle_accuracies).mean() print("Elapsed time %.3f" % (time.time() - t0)) cheating = "CHEATING" if args.cheating else "NOCHEAT" experiment_utils.print_extrapolated_results(accuracies, aucs, file_name=args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student) experiment_utils.oracle_accuracy(ora_accu, file_name=args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student, cm=ora_cm, num_trials=args.trials)
Launch a cross-validation on three models to see which is best on custom data from: https://archive.ics.uci.edu/ml/datasets.html Chosen data: https://archive.ics.uci.edu/ml/datasets/Leaf Tested models: SVC, RandomForestClassifier, DecisionTreeClassifier. Author: Claudio Sousa, David Gonzalez """ from sklearn import datasets from cross_validation import cross_validate, plot_validation, output_csv, normalise_data from models import instanciate_svc_model, instanciate_randomforest_model, instanciate_decisiontree_model import numpy as np from sklearn.datasets.base import Bunch import pandas as pd csv = pd.read_csv("../data/leaf.csv") data = Bunch(data=np.array([list(d[1:]) for d in csv.values]), target=np.array([d[0] for d in csv.values])) data.data = normalise_data(data.data) models = [ instanciate_svc_model(), instanciate_randomforest_model(1, 11), instanciate_decisiontree_model(1, 11) ] best_model = cross_validate(data, models, 5, 10) output_csv(models, best_model, "leaf") plot_validation(models, best_model)
def main(): accuracies = defaultdict(lambda: []) aucs = defaultdict(lambda: []) x_axis = defaultdict(lambda: []) vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3), token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer()) vct_analizer = vct.build_tokenizer() print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = max(100, args.fixk) fixk_saved = "{0}{1}.p".format(args.train, args.fixk) try: fixk_file = open(fixk_saved, "rb") data = pickle.load(fixk_file) except IOError: data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5) fixk_file = open(fixk_saved, "wb") pickle.dump(data, fixk_file) # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) parameters = parse_parameters_mat(args.cost_model) print "Cost Parameters %s" % parameters cost_model = set_cost_model(args.cost_function, parameters=parameters) print "\nCost Model: %s" % cost_model.__class__.__name__ #### STUDENT CLASSIFIER clf = linear_model.LogisticRegression(penalty="l1", C=1) print "\nStudent Classifier: %s" % clf #### EXPERT CLASSIFIER exp_clf = linear_model.LogisticRegression(penalty='l1', C=.3) exp_clf.fit(data.test.bow, data.test.target) expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold, cost_function=cost_model.cost_function) print "\nExpert: %s " % expert #### ACTIVE LEARNING SETTINGS step_size = args.step_size bootstrap_size = args.bootstrap evaluation_points = 200 print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, 
minsize:{4}".format(step_size, bootstrap_size, evaluation_points, args.fixk, min_size)) print ("Cheating experiment - use full uncertainty query k words") t0 = time.time() ### experiment starts tx =[] tac = [] tau = [] for t in range(args.trials): trial_accu =[] trial_aucs = [] trial_x_axis = [] print "*" * 60 print "Trial: %s" % t student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t) print "\nStudent: %s " % student train_indices = [] train_x = [] train_y = [] pool = Bunch() pool.data = data.train.bow.tocsr() # full words, for training pool.fixk = data.train.bowk.tocsr() # k words BOW for querying pool.target = data.train.target pool.predicted = [] pool.kwords = np.array(data.train.kwords) # k words pool.remaining = set(range(pool.data.shape[0])) # indices of the pool bootstrapped = False current_cost = 0 iteration = 0 while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter: if not bootstrapped: ## random from each bootstrap bt = randomsampling.BootstrapFromEach(t * 10) query_index = bt.bootstrap(pool=pool, k=bootstrap_size) bootstrapped = True print "Bootstrap: %s " % bt.__class__.__name__ print else: query_index = student.pick_next(pool=pool, k=step_size) query = pool.fixk[query_index] # query with k words query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]] ground_truth = pool.target[query_index] #labels, spent = expert.label(unlabeled=query, target=ground_truth) if iteration == 0: ## bootstrap uses ground truth labels = ground_truth spent = [0] * len(ground_truth) ## bootstrap cost is ignored else: labels = expert.label_instances(query, ground_truth) spent = expert.estimate_instances(query_size) ## add data recent acquired to train ## CHANGE: if label is not useful, ignore and do not charge money for it useful_answers = np.array([[x, y, z] for x, y, z in zip(query_index, labels, spent) if y is not None]) # train_indices.extend(query_index) if 
useful_answers.shape[0] != 0: train_indices.extend(useful_answers[:, 0]) # add labels to training train_x = pool.data[train_indices] ## train with all the words # update labels with the expert labels train_y.extend(useful_answers[:, 1]) #count for cost ### accumulate the cost of the query # query_cost = np.array(spent).sum() # current_cost += query_cost query_cost = useful_answers[:, 2] query_cost = np.sum(query_cost) current_cost += query_cost if train_x.shape[0] != len(train_y): raise Exception("Training data corrupted!") # remove labels from pool pool.remaining.difference_update(query_index) # retrain the model current_model = student.train(train_x, train_y) # evaluate and save results y_probas = current_model.predict_proba(data.test.bow) auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1]) pred_y = current_model.classes_[np.argmax(y_probas, axis=1)] accu = metrics.accuracy_score(data.test.target, pred_y) print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices), accu, auc, query_cost, current_cost, spent)) ## the results should be based on the cost of the labeling if iteration > 0: # bootstrap iteration student.budget -= query_cost ## Bootstrap doesn't count x_axis_range = current_cost x_axis[x_axis_range].append(current_cost) ## save results accuracies[x_axis_range].append(accu) aucs[x_axis_range].append(auc) ## partial trial results trial_accu.append([x_axis_range, accu]) trial_aucs.append([x_axis_range, auc]) iteration += 1 # end of budget loop tac.append(trial_accu) tau.append(trial_aucs) #end trial loop accuracies = extrapolate_trials(tac) aucs = extrapolate_trials(tau) print("Elapsed time %.3f" % (time.time() - t0)) print_extrapolated_results(accuracies, aucs)
def main(): accuracies = defaultdict(lambda: []) aucs = defaultdict(lambda: []) x_axis = defaultdict(lambda: []) vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3), token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer()) vct_analizer = vct.build_tokenizer() print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = max(100, args.fixk) if args.fixk < 0: args.fixk = None fixk_saved = "{0}{1}.p".format(args.train, args.fixk) try: print "Loading existing file... %s " % args.train fixk_file = open(fixk_saved, "rb") data = pickle.load(fixk_file) fixk_file.close() vectorizer = open("{0}vectorizer.p".format(args.train), "rb") vct = pickle.load(vectorizer) vectorizer.close() except (IOError, ValueError): print "Loading from scratch..." 
data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5) fixk_file = open(fixk_saved, "wb") pickle.dump(data, fixk_file) fixk_file.close() vectorizer = open("{0}vectorizer.p".format(args.train), "wb") pickle.dump(vct, vectorizer) vectorizer.close() # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) parameters = parse_parameters_mat(args.cost_model) print "Cost Parameters %s" % parameters cost_model = set_cost_model(args.cost_function, parameters=parameters) print "\nCost Model: %s" % cost_model.__class__.__name__ #### STUDENT CLASSIFIER clf = linear_model.LogisticRegression(penalty="l1", C=1) # clf = set_classifier(args.classifier) print "\nStudent Classifier: %s" % clf #### EXPERT CLASSIFIER exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty) exp_clf.fit(data.test.bow, data.test.target) expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold, cost_function=cost_model.cost_function) print "\nExpert: %s " % expert #### ACTIVE LEARNING SETTINGS step_size = args.step_size bootstrap_size = args.bootstrap evaluation_points = 200 print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size, evaluation_points, args.fixk, min_size)) print ("Anytime active learning experiment - use objective function to pick data") t0 = time.time() tac = [] tau = [] ### experiment starts for t in range(args.trials): trial_accu = [] trial_aucs = [] print "*" * 60 print "Trial: %s" % t if args.student in "anyunc": student = randomsampling.AnytimeLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct, subpool=250, cost_model=cost_model) elif args.student in "lambda": student = randomsampling.AnytimeLearnerDiff(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct, subpool=250, cost_model=cost_model, lambda_value=args.lambda_value) elif 
args.student in "anyzero": student = randomsampling.AnytimeLearnerZeroUtility(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct, subpool=250, cost_model=cost_model) else: raise ValueError("Oops! We do not know that anytime strategy. Try again.") print "\nStudent: %s " % student train_indices = [] neutral_text = [] # save the raw text of the queries neutral_data = [] # save the xik vectors train_x = [] train_y = [] neu_x = [] # data to train the classifier neu_y = np.array([]) pool = Bunch() pool.data = data.train.bow.tocsr() # full words, for training pool.text = data.train.data # pool.fixk = data.train.bowk.tocsr() # k words BOW for querying pool.target = data.train.target pool.predicted = [] # pool.kwords = np.array(data.train.kwords) # k words pool.remaining = set(range(pool.data.shape[0])) # indices of the pool bootstrapped = False current_cost = 0 iteration = 0 query_index = None query_size = None while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter: util = [] if not bootstrapped: ## random from each bootstrap bt = randomsampling.BootstrapFromEach(t * 10) query_index = bt.bootstrap(pool=pool, k=bootstrap_size) bootstrapped = True query = pool.data[query_index] print "Bootstrap: %s " % bt.__class__.__name__ print else: # print "pick instance" ## chose returns: index, k ## util returns: utility, k, unc query_chosen, util = student.pick_next(pool=pool, step_size=step_size) query_index = [a for a, b in query_chosen] query_size = [b for a, b in query_chosen] # query = pool.fixk[query_index] # query with k words qk = [] for q, k in query_chosen: qk.append(" ".join(vct_analizer(pool.text[q])[0:int(k)])) query = vct.transform(qk) # query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]] ground_truth = pool.target[query_index] #labels, spent = expert.label(unlabeled=query, target=ground_truth) if iteration == 0: ## bootstrap uses ground truth labels = ground_truth spent = [0] * len(ground_truth) ## 
bootstrap cost is ignored else: # print "ask labels" labels = expert.label_instances(query, ground_truth) spent = expert.estimate_instances(query_size) ### accumulate the cost of the query query_cost = np.array(spent).sum() current_cost += query_cost # print query_index useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None]) neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \ if iteration != 0 else np.array([]) # print labels # print "label\tutility\tk\tunc" # print format_query(zip(labels, util)) ## add data recent acquired to train if useful_answers.shape[0] != 0: # print "get training" # train_indices.extend(query_index) train_indices.extend(useful_answers[:, 0]) # add labels to training train_x = pool.data[train_indices] # # train with all the words # update labels with the expert labels #train_y = pool.target[train_indices] train_y.extend(useful_answers[:, 1]) if neutral_answers.shape[0] != 0: # current query neutrals qlbl = [] for xik, lbl in zip(query, labels): # neutral_data.append(xik) if isinstance(neutral_data, list): neutral_data = xik else: neutral_data = vstack([neutral_data, xik], format='csr') qlbl.append(neutral_label(lbl)) ## append the labels of the current query neu_y = np.append(neu_y, qlbl) neu_x = neutral_data #end usefulanswers if train_x.shape[0] != len(train_y): raise Exception("Training data corrupted!") # remove labels from pool pool.remaining.difference_update(query_index) # retrain the model # current_model = student.train(train_x, train_y) # print "train models" current_model = student.train_all(train_x, train_y, neu_x, neu_y) # print "evaluate" # evaluate and save results y_probas = current_model.predict_proba(data.test.bow) auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1]) pred_y = current_model.classes_[np.argmax(y_probas, axis=1)] accu = metrics.accuracy_score(data.test.target, pred_y) print 
("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}\tneu:{6}\t{7}".format( len(train_indices), accu, auc, query_cost, current_cost, format_spent(spent), len(neutral_answers), neu_y.shape[0])) ## the results should be based on the cost of the labeling if iteration > 0: # bootstrap iteration student.budget -= query_cost ## Bootstrap doesn't count x_axis_range = current_cost x_axis[x_axis_range].append(current_cost) ## save results accuracies[x_axis_range].append(accu) aucs[x_axis_range].append(auc) # partial trial results trial_accu.append([x_axis_range, accu]) trial_aucs.append([x_axis_range, auc]) iteration += 1 # end of budget loop tac.append(trial_accu) tau.append(trial_aucs) #end trial loop if args.cost_function not in "uniform": accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size) aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size) print("Elapsed time %.3f" % (time.time() - t0)) print_extrapolated_results(accuracies, aucs)