def populate_table(table_fname, features, labels, camNames, actNames, partiNames):
    n_samples = labels.shape[0]
    pbar = start_progressbar(n_samples, '%i features to Pytable' % (n_samples))
    h5 = ta.openFile(table_fname, mode='a')
    table = h5.root.input_output_data.readout
    pp = table.row
    for i in xrange(n_samples):
        pp['frame_index'] = i
        pp['features'] = features[i, :]
        pp['label'] = labels[i]
        #pp['aviNames'] = aviNames[i][0:-4]
        pp['camNames'] = camNames[i]
        pp['actNames'] = actNames[i]
        pp['partiNames'] = partiNames[i]
        pp.append()
        update_progressbar(pbar, i)
    end_progressbar(pbar)
    # index the query columns, then save everything in the file and close it
    table.cols.camNames.createIndex()
    table.cols.actNames.createIndex()
    table.cols.partiNames.createIndex()
    table.flush()
    h5.close()
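# A minimal sketch (not part of the original module) of creating the layout
# that populate_table() assumes: an h5 file holding /input_output_data/readout.
# The column widths and dtypes below are assumptions; adjust to the real data.
def _create_readout_table_sketch(table_fname, n_features):
    class Readout(ta.IsDescription):
        frame_index = ta.UInt32Col()
        features = ta.Float64Col(shape=(n_features,))
        label = ta.StringCol(24)
        camNames = ta.StringCol(24)
        actNames = ta.StringCol(24)
        partiNames = ta.StringCol(24)
    h5 = ta.openFile(table_fname, mode='w')
    group = h5.createGroup(h5.root, 'input_output_data')
    h5.createTable(group, 'readout', Readout)
    h5.close()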
def get_bfast_splits(table_fname, settings, n_samples=N_SAMPLES,
                     n_features=N_FEATURES, n_lab=N_LAB, contig_labels=True):
    h5_all = ta.openFile(table_fname, mode='r')
    table_all = h5_all.root.input_output_data.readout
    train_p = settings['train_p']
    test_p = settings['test_p']
    cams = settings['cameras']

    labels_train = []
    features_train = []
    labels_test = []
    features_test = []

    pbar = start_progressbar(len(train_p), '%i training participants' % len(train_p))
    for jj, pat in enumerate(train_p):
        for cam in cams:
            rows = table_all.readWhere("(partiNames == '%s') & (camNames == '%s')" % (pat, cam))
            labels_train += [row['label'] for row in rows]
            features_train += [row['features'] for row in rows]
        update_progressbar(pbar, jj)
    end_progressbar(pbar)

    # was len(train_p): the test loop must be sized by the test participants
    pbar = start_progressbar(len(test_p), '%i testing participants' % len(test_p))
    for jj, pat in enumerate(test_p):
        for cam in cams:
            rows = table_all.readWhere("(partiNames == '%s') & (camNames == '%s')" % (pat, cam))
            labels_test += [row['label'] for row in rows]
            features_test += [row['features'] for row in rows]
        update_progressbar(pbar, jj)
    end_progressbar(pbar)

    tic = time.time()
    #uniqLabels = np.intersect1d(np.unique(labels_train), np.unique(labels_test))
    uniqLabels = np.unique(labels_train)
    # KILL UNUSED: drop the silence label, then cap the number of classes
    uniqLabels = uniqLabels[uniqLabels != 'SIL']
    uniqLabels = uniqLabels[:n_lab]
    print 'using', str(len(uniqLabels)), 'labels in total'

    labels_train = np.array(labels_train)
    selector = np.zeros_like(labels_train, dtype='bool')
    for uL in uniqLabels:
        selector = np.squeeze(selector | [labels_train == uL])
    labels_train = labels_train[selector]
    features_train = np.array(features_train)[selector, :n_features]

    labels_test = np.array(labels_test)
    selector = np.zeros_like(labels_test, dtype='bool')
    for uL in uniqLabels:
        selector = np.squeeze(selector | [labels_test == uL])
    labels_test = labels_test[selector]
    features_test = np.array(features_test)[selector, :n_features]

    print "Loaded features converted in", round(time.time() - tic), "seconds"
    table_all.flush()
    h5_all.close()
    return features_train, labels_train, features_test, labels_test
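# Usage sketch for get_bfast_splits(); the participant and camera ids below
# are hypothetical placeholders, the dict keys are the ones read above:
#
#   settings = {'train_p': ['P03', 'P04', 'P06'],
#               'test_p': ['P05'],
#               'cameras': ['cam01', 'cam02']}
#   X_tr, y_tr, X_te, y_te = get_bfast_splits('bfast_table.h5', settings)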
def train_adaboost(features, labels, learning_rate, n_lab, n_runs, n_estim, n_samples):
    uniqLabels = np.unique(labels)
    print 'Taking', str(n_lab), 'labels'
    uniqLabels = uniqLabels[:n_lab]
    used_labels = uniqLabels
    pbar = start_progressbar(len(uniqLabels), 'training adaboost for %i labels' % len(uniqLabels))

    allLearners = []
    for yy, targetLab in enumerate(uniqLabels):
        # one boosted one-vs-rest classifier per label, repeated n_runs times
        runs = []
        for rrr in xrange(n_runs):
            feats, labs = get_binary_sets(features, labels, targetLab, n_samples)
            baseClf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=10, min_samples_split=10)
            baseClf.fit(feats, labs)
            ada_real = AdaBoostClassifier(base_estimator=baseClf,
                                          learning_rate=learning_rate,
                                          n_estimators=n_estim,
                                          algorithm="SAMME.R")
            runs.append(ada_real.fit(feats, labs))
        allLearners.append(runs)
        update_progressbar(pbar, yy)
    end_progressbar(pbar)

    return allLearners, used_labels
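# get_binary_sets() is called above but not defined in this file. A hedged
# sketch of what it plausibly does (one-vs-rest subsampling); the real helper
# may differ:
def _get_binary_sets_sketch(features, labels, targetLab, n_samples):
    # subsample the target class, then match it with negatives of equal size
    pos = np.random.permutation(np.where(labels == targetLab)[0])[:n_samples]
    neg = np.random.permutation(np.where(labels != targetLab)[0])[:len(pos)]
    sel = np.concatenate([pos, neg])
    # +1 for the target label, -1 for everything else
    labs = np.where(labels[sel] == targetLab, 1, -1)
    return features[sel], labs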
def compute_confidence(allLearners, dada, classifier_type):
    tic = time.time()

    if classifier_type == 'adaboost':
        lab_confidence = np.zeros([dada.shape[0], len(allLearners)], dtype='float64')
        pbar = start_progressbar(len(allLearners), '%i producing weighted outputs' % len(allLearners))
        for ii, thisLab in enumerate(allLearners):
            res = np.zeros([dada.shape[0]], dtype='float64')
            for jj, thisLearner in enumerate(thisLab):
                my_weights = thisLearner.estimator_weights_
                # accumulate each weak estimator's vote, scaled by its weight
                for hh, thisEstimator in enumerate(thisLearner):
                    res = res + thisEstimator.predict(dada) * my_weights[hh]
            lab_confidence[:, ii] = np.float64(res)
            update_progressbar(pbar, ii)
        end_progressbar(pbar)

    if classifier_type == 'randomforest':
        lab_confidence = np.zeros((dada.shape[0], len(allLearners[0].classes_)), dtype='float64')
        pbar = start_progressbar(len(allLearners), '%i producing weighted outputs' % len(allLearners[0].classes_))
        for ii, thisRun in enumerate(allLearners):
            # sum the per-class probabilities over all forest runs
            lab_confidence += thisRun.predict_proba(dada)
            update_progressbar(pbar, ii)
        end_progressbar(pbar)

    return lab_confidence
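# Usage sketch (names hypothetical): the confidences are summed class scores,
# so hard predictions come from an arg-max over the class axis, e.g. for the
# random-forest branch:
#
#   conf = compute_confidence(forest_runs, X_te, 'randomforest')
#   pred = forest_runs[0].classes_[np.argmax(conf, axis=1)]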
# NOTE: redefines train_adaboost above, shadowing it; this version boosts
# decision stumps (max_depth=1) instead of depth-4 trees.
def train_adaboost(features, labels, learning_rate, n_lab, n_runs, n_estim, n_samples):
    uniqLabels = np.unique(labels)
    print 'Taking', str(n_lab), 'labels'
    uniqLabels = uniqLabels[:n_lab]
    used_labels = uniqLabels
    pbar = start_progressbar(len(uniqLabels), 'training adaboost for %i labels' % len(uniqLabels))

    allLearners = []
    for yy, targetLab in enumerate(uniqLabels):
        runs = []
        for rrr in xrange(n_runs):
            feats, labs = get_binary_sets(features, labels, targetLab, n_samples)
            baseClf = DecisionTreeClassifier(max_depth=1, min_samples_leaf=4, min_samples_split=4)
            baseClf.fit(feats, labs)
            ada_real = AdaBoostClassifier(base_estimator=baseClf,
                                          learning_rate=learning_rate,
                                          n_estimators=n_estim,
                                          algorithm="SAMME.R")
            runs.append(ada_real.fit(feats, labs))
        allLearners.append(runs)
        update_progressbar(pbar, yy)
    end_progressbar(pbar)

    return allLearners, used_labels
def getMonkeySplits_lim_old(table_fname, splitNo, n_samples=N_SAMPLES, n_features=N_FEATURES):
    h5_tr = ta.openFile(table_fname + str(splitNo) + '_train.h5', mode='r')
    table_tr = h5_tr.root.input_output_data.readout
    h5_te = ta.openFile(table_fname + str(splitNo) + '_test.h5', mode='r')
    table_te = h5_te.root.input_output_data.readout

    print 'Converting arrays to sp'
    uniqLabels = np.unique(table_tr.cols.label)
    # KILL UNUSED: drop the placeholder label
    uniqLabels = uniqLabels[uniqLabels != 'unused']

    labels_train = []
    features_train = []
    exctCnt = 0
    # was a list argument: start_progressbar expects a message string
    pbar = start_progressbar(len(uniqLabels), '%i training labels' % len(uniqLabels))
    for i, thisLab in enumerate(uniqLabels):
        tempLabels = [row['label'] for row in table_tr.where("label == thisLab")]
        try:
            selInd = random.sample(range(0, len(tempLabels)), n_samples)
        except ValueError:
            # fewer than n_samples rows for this label: take them all
            selInd = range(0, len(tempLabels))
            exctCnt = exctCnt + 1
        labels_train = labels_train + [tempLabels[gg] for gg in selInd]
        tempFeatures = [row['features'][:][:n_features] for row in table_tr.where("label == thisLab")]
        features_train = features_train + [tempFeatures[gg] for gg in selInd]
        update_progressbar(pbar, i)
    end_progressbar(pbar)
    print '%d exceptions occurred' % (exctCnt)

    features_train = sp.array(features_train)[:, :n_features]
    labels_train = sp.array(labels_train)

    labels_test = sp.array(table_te.cols.label)
    features_test = sp.array(table_te.cols.features)[labels_test != 'unused', :n_features]
    labels_test = labels_test[labels_test != 'unused']
    print 'Converted'

    table_tr.flush()
    table_te.flush()
    h5_tr.close()
    h5_te.close()
    print "feature loading completed"
    return features_train, labels_train, features_test, labels_test
def split_data_from_table(table_fname, n_samples=N_SAMPLES, n_features=N_FEATURES):
    h5 = ta.openFile(table_fname, mode='r')
    table = h5.root.input_output_data.readout
    l_features = table.cols.features
    l_index = table.cols.frame_index
    l_labels = table.cols.label
    #import ipdb; ipdb.set_trace()

    n_samples_total = len(l_labels)
    assert (2 * n_samples < n_samples_total)
    import warnings
    warnings.warn("have something that takes a split depending on classes to keep it balanced")
    #TODO: have a balanced split on each class
    ind_total = sp.random.permutation(n_samples_total)
    ind_train = ind_total[:n_samples]
    ind_test = ind_total[n_samples:2 * n_samples]
    # actually enforce the check (the original computed this but dropped it)
    assert not sp.array([(ind_train == test).any() for test in ind_test]).any()
    print "checked that train and test do not overlap"

    features_train = sp.zeros((n_samples, n_features), dtype='uint8')
    features_test = sp.zeros((n_samples, n_features), dtype='uint8')
    labels_train = []
    labels_test = []

    pbar = start_progressbar(len(ind_train), '%i train features' % (len(ind_train)))
    for i, ind in enumerate(ind_train):
        features_train[i] = l_features[ind][:n_features]
        labels_train.append(l_labels[ind])
        update_progressbar(pbar, i)
    end_progressbar(pbar)

    pbar = start_progressbar(len(ind_test), '%i test features' % (len(ind_test)))
    for i, ind in enumerate(ind_test):
        features_test[i] = l_features[ind][:n_features]
        labels_test.append(l_labels[ind])  # was l_labels[i]: loop counter used instead of row index
        update_progressbar(pbar, i)
    end_progressbar(pbar)

    labels_train = sp.array(labels_train)
    labels_test = sp.array(labels_test)
    table.flush()
    h5.close()
    return features_train, labels_train, features_test, labels_test
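# The warning above notes that the permutation split is not class-balanced.
# A hedged sketch of the per-class split the TODO asks for (not the module's
# method):
def _balanced_split_sketch(labels, n_per_class):
    ind_train, ind_test = [], []
    for lab in np.unique(labels):
        # shuffle the rows of this class, then carve disjoint train/test slices
        idx = np.random.permutation(np.where(labels == lab)[0])
        ind_train.extend(idx[:n_per_class])
        ind_test.extend(idx[n_per_class:2 * n_per_class])
    return np.array(ind_train), np.array(ind_test)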
# NOTE: shadows the richer populate_table above; `names` is accepted but unused
def populate_table(table_fname, features, labels, names):
    n_samples = labels.shape[0]
    pbar = start_progressbar(n_samples, '%i features to Pytable' % (n_samples))
    h5 = ta.openFile(table_fname, mode='a')
    table = h5.root.input_output_data.readout
    pp = table.row
    for i in xrange(n_samples):
        pp['features'] = features[i, :]
        pp['label'] = labels[i]
        pp.append()
        update_progressbar(pbar, i)
    end_progressbar(pbar)
    # save everything in the file and close it
    table.flush()
    h5.close()
# NOTE: shadows the classifier_type-aware compute_confidence above; this
# version assumes the adaboost layout of allLearners
def compute_confidence(allLearners, dada):
    lab_confidence = np.zeros([dada.shape[0], len(allLearners)])
    tic = time.time()
    pbar = start_progressbar(len(allLearners), '%i producing weighted outputs' % len(allLearners))
    for ii, thisLab in enumerate(allLearners):
        res = np.zeros([dada.shape[0]])
        for jj, thisLearner in enumerate(thisLab):
            for hh, thisEstimator in enumerate(thisLearner):
                # multiply the predictions with the weight of the learner
                res = res + thisEstimator.predict(dada) * thisLearner.estimator_weights_[hh]
        lab_confidence[:, ii] = res
        update_progressbar(pbar, ii)
    end_progressbar(pbar)
    print "time taken to produce confidence:", round(time.time() - tic, 2), "seconds"
    return lab_confidence
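# End-to-end usage sketch (parameter values hypothetical): train the
# one-vs-rest boosters above, score a test set with this compute_confidence,
# then take the arg-max label:
#
#   learners, used_labels = train_adaboost(X_tr, y_tr, 1.0, N_LAB, 3, 50, 500)
#   conf = compute_confidence(learners, X_te)
#   pred = np.array(used_labels)[np.argmax(conf, axis=1)]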
def euclidien(features1, features2=None):
    """ Builds a similarity matrix based on the squared Euclidean distance """
    if features2 is None:
        features2 = features1
    nfeat1 = len(features1)
    nfeat2 = len(features2)

    kernelmatrix = sp.empty((nfeat1, nfeat2), dtype="float")

    if features1 is features2:
        # set up progress bar
        n_iter = 0
        niter = nfeat1 * (nfeat2 + 1) / 2
        pbar = start_progressbar(niter, "Kernel Train")
        for ifeat1, feat1 in enumerate(features1):
            a_2 = (feat1 ** 2.).sum()
            for ifeat2, feat2 in enumerate(features2):
                if ifeat1 == ifeat2:
                    kernelmatrix[ifeat1, ifeat2] = 0
                # matrix is symmetric, do only the top triangle
                elif ifeat1 > ifeat2:
                    a_b = sp.dot(feat1, feat2.T)
                    b_2 = (feat2 ** 2.).sum()
                    dist = (a_2 - 2 * a_b + b_2)
                    # since the kernel matrix is symmetric
                    kernelmatrix[ifeat1, ifeat2] = dist
                    kernelmatrix[ifeat2, ifeat1] = dist
                update_progressbar(pbar, n_iter + 1)
                n_iter += 1
    else:
        # set up progress bar
        n_iter = 0
        niter = nfeat1 * nfeat2
        pbar = start_progressbar(niter, "Kernel Test")
        for ifeat1, feat1 in enumerate(features1):
            a_2 = (feat1 ** 2.).sum()
            for ifeat2, feat2 in enumerate(features2):
                a_b = sp.dot(feat1, feat2.T)
                b_2 = (feat2 ** 2.).sum()
                dist = (a_2 - 2 * a_b + b_2)
                kernelmatrix[ifeat1, ifeat2] = dist
                # was `pbar = update_progressbar(...)`: do not clobber pbar
                update_progressbar(pbar, n_iter)
                n_iter += 1
    end_progressbar(pbar)
    return kernelmatrix
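# A vectorized equivalent of the loops above, using the same
# a^2 - 2ab + b^2 expansion; a sketch kept separate from the original:
def _euclidean_matrix_sketch(features1, features2=None):
    if features2 is None:
        features2 = features1
    f1 = sp.asarray(features1, dtype='float')
    f2 = sp.asarray(features2, dtype='float')
    # row/column vectors of squared norms broadcast to the full matrix
    sq1 = (f1 ** 2).sum(axis=1)[:, None]
    sq2 = (f2 ** 2).sum(axis=1)[None, :]
    return sq1 - 2 * sp.dot(f1, f2.T) + sq2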
def get_monkey_splits_lim(table_fname, splitNo, n_samples=N_SAMPLES,
                          n_features=N_FEATURES, n_lab=N_LAB, contig_labels=True):
    h5_tr = ta.openFile(table_fname + str(splitNo) + '_train.h5', mode='r')
    table_tr = h5_tr.root.input_output_data.readout
    h5_te = ta.openFile(table_fname + str(splitNo) + '_test.h5', mode='r')
    table_te = h5_te.root.input_output_data.readout

    # keep only the labels that occur in both the train and the test split
    uniqLabels = np.intersect1d(np.unique(table_te.cols.label),
                                np.unique(table_tr.cols.label))
    # KILL UNUSED: drop the placeholder label, then cap the number of classes
    uniqLabels = uniqLabels[uniqLabels != 'unused']
    uniqLabels = uniqLabels[:n_lab]

    labels_train = []
    features_train = []
    exctCnt = 0
    pbar = start_progressbar(len(uniqLabels), 'fetching %i training labels' % len(uniqLabels))
    for i, thisLab in enumerate(uniqLabels):
        tempLabels = [row['label'] for row in table_tr.where("label == thisLab")]
        if contig_labels:
            # take a contiguous block from the start of this label's rows
            toThis = min(len(tempLabels), n_samples)
            selInd = range(0, toThis)
        else:
            try:
                selInd = random.sample(range(0, len(tempLabels)), n_samples)
            except ValueError:
                selInd = range(0, len(tempLabels))
                exctCnt = exctCnt + 1
        labels_train = labels_train + [tempLabels[gg] for gg in selInd]
        tempFeatures = [row['features'][:][:n_features] for row in table_tr.where("label == thisLab")]
        features_train = features_train + [tempFeatures[gg] for gg in selInd]
        update_progressbar(pbar, i)
    end_progressbar(pbar)
    print '%d exceptions occurred' % (exctCnt)

    pbar = start_progressbar(len(uniqLabels), 'fetching %i testing labels' % len(uniqLabels))
    labels_test = []
    features_test = []
    for i, thisLab in enumerate(uniqLabels):
        tempLabels = [row['label'] for row in table_te.where("label == thisLab")]
        labels_test = labels_test + tempLabels
        tempFeatures = [row['features'][:][:n_features] for row in table_te.where("label == thisLab")]
        features_test = features_test + tempFeatures
        update_progressbar(pbar, i)
    end_progressbar(pbar)

    features_train = sp.array(features_train)[:, :n_features]
    labels_train = sp.array(labels_train)
    features_test = sp.array(features_test)[:, :n_features]
    labels_test = sp.array(labels_test)
    print 'Converted'

    table_tr.flush()
    table_te.flush()
    h5_tr.close()
    h5_te.close()
    print "feature loading completed"
    return features_train, labels_train, features_test, labels_test
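# Usage sketch: table_fname is a path prefix that gets completed with the
# split number and '_train.h5' / '_test.h5' (filename hypothetical):
#
#   X_tr, y_tr, X_te, y_te = get_monkey_splits_lim('monkey_feats_', 1)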
def get_HMDB_splits(table_fname, vidName, vidMode, n_samples=N_SAMPLES, n_features=N_FEATURES):
    h5 = ta.openFile(table_fname, mode='r')
    table = h5.root.input_output_data.readout
    l_features = table.cols.features
    l_index = table.cols.frame_index
    l_labels = table.cols.label
    l_aviNames = table.cols.aviNames
    # assert(2*n_samples < n_samples_total)

    # sort the videos into train ('1'), test ('2') and unused ('0') sets
    trVidName = []
    teVidName = []
    noVidName = []
    for j in range(0, len(vidMode) - 1):
        innerVidMode = vidMode[j]
        trVidName = trVidName + [vidName[j][i] for i, x in enumerate(innerVidMode) if x == '1']
        teVidName = teVidName + [vidName[j][i] for i, x in enumerate(innerVidMode) if x == '2']
        noVidName = noVidName + [vidName[j][i] for i, x in enumerate(innerVidMode) if x == '0']

    features_train = []
    labels_train = []
    features_test = []
    labels_test = []

    exctCnt = 0
    pbar = start_progressbar(len(trVidName), '%i train features' % (len(trVidName)))
    for i, vid in enumerate(trVidName):
        tempLabels = [row['label'] for row in table.where("aviNames == vid")]
        try:
            selInd = random.sample(range(0, len(tempLabels)), n_samples)
        except ValueError:
            # fewer than n_samples frames for this video: take them all
            selInd = range(0, len(tempLabels))
            exctCnt = exctCnt + 1
        labels_train = labels_train + [tempLabels[gg] for gg in selInd]
        tempFeatures = [row['features'][:][:n_features] for row in table.where("aviNames == vid")]
        features_train = features_train + [tempFeatures[gg] for gg in selInd]
        update_progressbar(pbar, i)
    end_progressbar(pbar)
    print 'finished with %i exceptions' % (exctCnt)

    pbar = start_progressbar(len(teVidName), '%i test features' % (len(teVidName)))
    for i, vid in enumerate(teVidName):
        labels_test = labels_test + [row['label'] for row in table.where("aviNames == vid")]
        features_test = features_test + [row['features'][:][:n_features] for row in table.where("aviNames == vid")]
        update_progressbar(pbar, i)
    end_progressbar(pbar)

    print 'Converting arrays to sp'
    features_train = sp.array(features_train, dtype='uint8')
    features_test = sp.array(features_test, dtype='uint8')
    labels_train = sp.array(labels_train)
    labels_test = sp.array(labels_test)
    print 'Converted'

    table.flush()
    h5.close()
    print "feature loading completed"
    return features_train, labels_train, features_test, labels_test
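# Hypothetical call illustrating the split encoding consumed above
# (per-clip flags: '1' = train, '2' = test, '0' = unused):
#
#   vidName = [['brush_hair_01.avi', 'brush_hair_02.avi'], ...]  # per class
#   vidMode = [['1', '2'], ...]
#   X_tr, y_tr, X_te, y_te = get_HMDB_splits('hmdb_table.h5', vidName, vidMode)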
# NOTE: shadows the get_bfast_splits above; this version trains on a single
# camera (settings['cur_cam']) and tests on all cameras
def get_bfast_splits(table_fname, settings, n_samples=N_SAMPLES,
                     n_features=N_FEATURES, n_lab=N_LAB, contig_labels=True):
    h5_all = ta.openFile(table_fname, mode='r')
    table_all = h5_all.root.input_output_data.readout
    train_p = settings['train_p']
    test_p = settings['test_p']
    cams = settings['cameras']
    cur_cam = settings['cur_cam']

    lab_per_cam = {}
    print 'figuring out the shared labels between cams'
    for cam in cams:
        lab_per_pat = []
        for pat in train_p:
            rowz = [row['label'] for row in table_all.readWhere("(partiNames == '%s') & (camNames == '%s')" % (pat, cam))]
            lab_per_pat += list(np.unique(rowz))
        lab_per_cam[cam] = np.unique(lab_per_pat)
    aaa = lab_per_cam.values()
    # avoiding a possible bug here by making sure we take only the labels that
    # exist for all cams; used towards the end to weed out un-shared ones
    labs_for_all = set(aaa[0]).intersection(*aaa)

    len_data = 0
    print 'figuring out how many training samples we have for the cam', cur_cam
    for pat in train_p:
        rowz = [row['label'] for row in table_all.readWhere("(partiNames == '%s') & (camNames == '%s')" % (pat, cur_cam))]
        len_data += len(rowz)

    features_train = np.empty((len_data, n_features), dtype='float64')
    labels_train = np.empty(len_data, dtype='|S24')
    cnt = 0
    pbar = start_progressbar(len(train_p), str(len(train_p)) + ' training participants loading for cam ' + cur_cam)
    for jj, pat in enumerate(train_p):
        temp = [row['features'] for row in table_all.readWhere("(partiNames == '%s') & (camNames == '%s')" % (pat, cur_cam))]
        temp2 = [row['label'] for row in table_all.readWhere("(partiNames == '%s') & (camNames == '%s')" % (pat, cur_cam))]
        temp = [roww[:n_features] for roww in temp]
        if temp:
            features_train[cnt:cnt + len(temp), :] = temp
            labels_train[cnt:cnt + len(temp)] = temp2
            cnt = cnt + len(temp)
        update_progressbar(pbar, jj)
    end_progressbar(pbar)

    len_data = 0
    pbar = start_progressbar(len(test_p), 'now figuring out how many test samples we have')
    for jj, pat in enumerate(test_p):
        for cam in cams:
            len_data += len([row['label'] for row in table_all.readWhere("(partiNames == '%s') & (camNames == '%s')" % (pat, cam))])
        update_progressbar(pbar, jj)
    end_progressbar(pbar)

    features_test = np.empty((len_data, n_features), dtype='float64')
    labels_test = np.empty(len_data, dtype='|S24')
    cnt = 0
    pbar2 = start_progressbar(len(test_p), str(len(test_p)) + ' testing participants loading')
    for jj, pat in enumerate(test_p):
        for cam in cams:
            temp = [row['features'] for row in table_all.readWhere("(partiNames == '%s') & (camNames == '%s')" % (pat, cam))]
            temp2 = [row['label'] for row in table_all.readWhere("(partiNames == '%s') & (camNames == '%s')" % (pat, cam))]
            temp = [roww[:n_features] for roww in temp]
            if temp:
                features_test[cnt:cnt + len(temp), :] = temp
                labels_test[cnt:cnt + len(temp)] = temp2
                cnt = cnt + len(temp)
        update_progressbar(pbar2, jj)
    end_progressbar(pbar2)

    tic = time.time()
    # labs_for_all is a set: convert to an array before intersecting
    uniqLabels = np.intersect1d(np.array(list(labs_for_all)), np.unique(labels_test))
    # KILL UNUSED: drop the silence label, then cap the number of classes
    uniqLabels = uniqLabels[uniqLabels != 'SIL']
    uniqLabels = uniqLabels[:n_lab]
    print 'using', str(len(uniqLabels)), 'labels in total'

    labels_train = np.array(labels_train)
    selector = np.zeros_like(labels_train, dtype='bool')
    excpt = 0
    for uL in uniqLabels:
        label_all = labels_train == uL
        label_all_subs = np.where(label_all)[0]
        # was `label_all_subs.shape >= n_samples`: tuple/int comparison that
        # is always true in Python 2, so the exception counter never fired
        if label_all_subs.shape[0] >= n_samples:
            label_some_subs = label_all_subs[:n_samples]
        else:
            excpt += 1
            label_some_subs = label_all_subs
        label_lim = np.zeros_like(label_all, dtype='bool')
        label_lim[label_some_subs] = True
        selector = np.squeeze(selector | [label_lim])
    labels_train = labels_train[selector]
    features_train = features_train[selector, :n_features]

    labels_test = np.array(labels_test)
    selector = np.zeros_like(labels_test, dtype='bool')
    for uL in uniqLabels:
        selector = np.squeeze(selector | [labels_test == uL])
    labels_test = labels_test[selector]
    features_test = features_test[selector, :n_features]

    print "Loaded features converted in", round(time.time() - tic), "seconds"
    print "there were", str(excpt), "exceptions"
    table_all.flush()
    h5_all.close()

    labels_train = group_labels(labels_train)
    labels_test = group_labels(labels_test)
    print 'using', str(len(labels_test)), 'test samples in total'
    return features_train, labels_train, features_test, labels_test
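# group_labels() is called above but not defined in this file. A hypothetical
# stand-in showing the expected signature; the real helper presumably merges
# fine-grained action labels into coarser groups:
def _group_labels_sketch(labels, mapping=None):
    if mapping is None:
        # identity mapping when no grouping table is supplied
        return np.asarray(labels)
    return np.asarray([mapping.get(lab, lab) for lab in labels])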