def main():
    if len(sys.argv) < 5:  # the script reads four arguments: dataset, base_dir, out_file, phase
        print_usage()
    dataset = sys.argv[1]
    base_dir = sys.argv[2]
    out_file = sys.argv[3]
    phase = sys.argv[4]

    names = util.load_names(dataset, phase)
    lbls = util.load_labels(dataset, phase)
    centers = []

    for idx, strin in enumerate(lbls):
        # load label data
        joints = np.asarray(np.reshape(strin.split(), (21, 3)), dtype=np.float32)
        # convert label data from world coordinates to pixel locations
        joints, skel_camcoords = util.world2pixel(joints, dataset)
        # calculate centers
        c = util.get_center_fpad(joints)
        c = np.asarray(c, dtype=np.float32)
        centers.append(c.reshape((1, 3)))
        if idx % 500 == 0:
            print('{}/{}'.format(idx + 1, len(names)))

    util.save_results(centers, out_file)
def compute_validation_predictions(model_id, validation_set):
    d = importlib.import_module("nets.net_" + model_id)
    model, X, y = d.define_net()
    model.load_params_from(params.SAVE_URL + "/" + model_id + "/best_weights")

    # Lower batch size since TTA multiplies batch size by 16
    params.BATCH_SIZE = 32

    io = ImageIO()
    mean, std = io.load_mean_std()

    # Read training labels for the keys
    y = util.load_labels()
    keys = y.index.values

    model.batch_iterator_predict = TTABatchIterator(keys, params.BATCH_SIZE, std, mean, cv=True)
    print "TTAs per image: %i, augmented batch size: %i" % (model.batch_iterator_predict.ttas, model.batch_iterator_predict.ttas * params.BATCH_SIZE)

    padded_batches = int(ceil(validation_set.shape[0] / float(params.BATCH_SIZE)))

    pred = model.predict_proba(validation_set)
    pred = pred.reshape(padded_batches, model.batch_iterator_predict.ttas, params.BATCH_SIZE)
    pred = np.mean(pred, axis=1)
    pred = pred.reshape(padded_batches * params.BATCH_SIZE)

    # Remove padded lines
    pred = pred[:validation_set.shape[0]]

    return pred
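# Worked example of the reshape/averaging above (numbers hypothetical, assuming the
# 16 TTAs mentioned in the batch-size comment): with 100 validation images and
# BATCH_SIZE = 32, ceil(100 / 32) = 4 padded batches, so predict_proba returns
# 4 * 16 * 32 = 2048 rows; reshaping to (4, 16, 32), averaging over the TTA axis
# and flattening gives 128 predictions, of which only the first 100 (the real
# images) are kept.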
def get_labels(self):
    """
    get_labels()

    Read labels from memcache stored by the 'post' view.

    Returns:
        An array containing the sorted labels.
    """
    return util.load_labels(self.request, self.response)
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    mfcc = dict(zip([metadata[i][0] for i in range(1, len(metadata))],
                    util.load_features((dataset + "_features") if dataset else None)))

    # Load pyAudioAnalysis features
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))

    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False

    X2, X3 = [], []
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]

    for X in [X1, X2]:
        NUM_RUNS = 50
        Y = util.load_labels((dataset + "_metadata") if dataset else None)
        samples = range(len(X))  # range(1, len(X), 12)  # random.sample(range(len(X)), 25)
        samps = samples  # range(len(X))  # samples
        x = [X[i] for i in samps]
        y = [Y[i] for i in samples]
        N_ESTIMATORS = 20
        avg_mat = None
        for run in range(NUM_RUNS):
            clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, max_features=20, oob_score=True).fit(X, Y)
            # Count, per tree, how often two samples land in the same leaf
            similarity = dict()
            for dt in clf.estimators_:
                leaves = dt.apply(X)
                for i in samps:
                    for j in samps:
                        if leaves[i] == leaves[j]:
                            similarity[(i, j)] = similarity.get((i, j), 0) + 1
            # Turn co-occurrence counts into a squared-distance matrix
            mat = np.array([[(1.0 - similarity.get((i, j), 0) / N_ESTIMATORS) ** 2 for j in samples]
                            for i in samples])
            mat = squareform(mat)
            if avg_mat is None:
                avg_mat = mat
            else:
                avg_mat = np.add(avg_mat, mat)

        avg_mat = avg_mat / NUM_RUNS
        linkage_matrix = linkage(avg_mat, "single")
        matplotlib.rcParams['lines.linewidth'] = 2.5
        dendrogram(linkage_matrix, color_threshold=0.8, labels=y, show_leaf_counts=True)
        plt.xlabel("label")
        plt.ylabel("distance")
        plt.show()
def profile(subset=1000, multi=True, n_threads=4, batch_size=64, thread_pool=False):
    # Load a bunch of imagenames
    y = util.load_labels()
    y = y[:subset]
    keys = y.index.values

    # Create sublists (batches)
    batched_keys = util.chunks(keys, batch_size)

    if multi:
        augment_multithreaded(batched_keys, n_threads=n_threads, thread_pool=thread_pool)
    else:
        augment_singlethreaded(batched_keys)
def get_dataloader(sts, labels=None, keys=['obs1', 'obs2', 'hyp1', 'hyp2'],
                   batch_size=64, num_buckets=10, bucket_ratio=.5,
                   ctx=mx.gpu(), max_seq_length=25, sample_num=None):
    '''
    This function uses the helpers above: it takes a sentence file path, a label file
    path, and batch_size, num_buckets, bucket_ratio, and builds the dataloader for
    the model to use.

    sample_num controls how many samples of the dataset the model will use;
    it defaults to None, i.e. use all of them.
    '''
    if labels:
        sentences = load_sentences(sts, keys=keys)
        sentences = sentences[:sample_num]
        labels = load_labels(labels)[:sample_num]
        try:
            assert len(sentences) == len(labels)
        except AssertionError:
            logger.error('Sample sentence length does not equal to label\'s length!')
            exit(-1)
        dataset = to_dataset(sentences, labels, ctx=ctx, batch_size=batch_size,
                             max_seq_length=max_seq_length)
        dataloader = to_dataloader(dataset=dataset, batch_size=batch_size,
                                   num_buckets=num_buckets, bucket_ratio=bucket_ratio)
    else:
        dataset = to_dataset(sts, labels, ctx=ctx, batch_size=batch_size,
                             max_seq_length=max_seq_length)
        dataloader = []
        for sample in dataset:
            batch = []
            for emb in sample:
                batch.append(nd.array(emb.reshape(1, *emb.shape)))
            dataloader.append(batch)
    return dataloader
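# Hypothetical usage (not from the original; the file paths below are made up).
# A labeled, bucketed loader for training might be built as:
#
#     train_loader = get_dataloader('train_sentences.jsonl', labels='train_labels.txt',
#                                   batch_size=64, num_buckets=10, bucket_ratio=0.5,
#                                   ctx=mx.gpu(), max_seq_length=25)
#
# while passing labels=None with already-loaded sentences takes the else branch
# above and returns a plain list of single-sample batches.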
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    mfcc = dict(zip([metadata[i][0] for i in range(1, len(metadata))],
                    util.load_features((dataset + "_features") if dataset else None)))

    feats, files = None, None
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))

    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False

    X2, X3 = [], []
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]
    Y = util.load_labels((dataset + "_metadata") if dataset else None)  # "bbsmd.csv"

    for X in [X1, X2] if full_dataset else [X1]:
        print("------")
        classifiers = [
            RandomForestClassifier(n_estimators=50, max_features=15, oob_score=True),
            KNeighborsClassifier(3),
            svm.SVC(kernel='linear', C=1),
            svm.SVC(gamma=2, C=1),
            GaussianNB()
        ]
        for clf in classifiers:
            scores = cross_val_score(clf, X, Y, cv=5)
            score = sum(scores) / len(scores)
            print(type(clf).__name__, "\t", score)
import image_loops

##################################
# This script creates hdf5 files for RGB-D data since different processing is required
# Run for both phase='test' and phase='train' to get both datafiles
# colormap and depth2cords cython functions are used here
##################################
DIR = "../labels/"
phase = 'train'
h5_fn = os.path.join(DIR, ('rgbd_' + phase + '_data_.h5'))
base_dir = '/home/bilbeisi/REN/cropped/'

names = util.load_names('fpad', phase)
labels = util.load_labels('fpad', phase)
cnames = util.load_names('fpac', phase)
centers = util.load_centers('fpad', phase).astype(float)

imgs = np.zeros((len(names), 4, 96, 96), dtype=np.float32)
lbls = np.zeros((len(labels), 63), dtype=np.float)
cube_size = 150  # cube size in mm for cropping

for idx, name in enumerate(names):
    cname = cnames[idx]
    img = util.load_image('fpad', os.path.join('/home/bilbeisi/REN/', name))
    img[img == 0] = 1
    cimg = util.load_image('fpac', os.path.join('/home/bilbeisi/REN/', cname))
    cimg = cimg.astype(float)
cv_folds = args.cv_folds
cv_lno = args.cv_lno
n_jobs = args.n_jobs
if calibrate is None:
    calibrate = False
else:
    calibrate = bool(calibrate)
print(calibrate)
if n_jobs is not None:
    n_jobs = int(n_jobs)

# load filenames and labels
sample_images = util.load_sample_images(out_dir)
samples, cats, labels = util.load_labels(out_dir)

if sample_weight is not None:
    # get labels for sample_weight category
    c = np.where(cats == sample_weight)[0][0]
    ln = np.unique([l[c] for l in labels])
    ln.sort()
    ln = list(ln)
    if '' in ln:
        del ln[ln.index('')]
    label_names_sw = ln
    labels_sw = np.array([ln.index(l) if l in ln else -1 for l in labels[:, c]])

if group is not None:
    # get labels for group category
    if group == sample_weight:
def tf_classify():
    # TODO: python -m scripts.label_image --graph=tf_files/retrained_graph.pb --image=test/aurelia.jpeg
    import socket
    print("In tf_classify handler from {}".format(socket.getfqdn()))

    file_name = "models/mobilenet/example/3475870145_685a19116d.jpg"
    file_name = "https://www.eopugetsound.org/sites/default/files/styles/magazinewidth_592px/public/topical_article/images/moon_jellyfish.jpg?itok=Esreg6zX"

    # Get payload
    payload = request.get_json(silent=True, force=True)
    if payload is None:
        if request.get_data() is not None:
            payload = json.loads(request.get_data())
    if payload is not None:
        if payload.get("nlp").get("entities").get("url"):
            file_name = payload.get("nlp").get("entities").get("url")[0].get("raw")

    # Load model file
    model_file = "models/mobilenet/retrained_graph.pb"
    label_file = "models/mobilenet/retrained_labels.txt"
    input_height = 224
    input_width = 224
    input_mean = 128
    input_std = 128
    input_layer = "input"
    output_layer = "final_result"

    graph = util.load_graph(model_file)
    t = util.read_tensor_from_image_file(file_name,
                                         input_height=input_height,
                                         input_width=input_width,
                                         input_mean=input_mean,
                                         input_std=input_std)

    input_name = "import/" + input_layer
    output_name = "import/" + output_layer
    input_operation = graph.get_operation_by_name(input_name)
    output_operation = graph.get_operation_by_name(output_name)

    with tf.Session(graph=graph) as sess:
        start = time.time()
        results = sess.run(output_operation.outputs[0],
                           {input_operation.outputs[0]: t})
        end = time.time()
    results = np.squeeze(results)

    top_k = results.argsort()[-5:][::-1]
    labels = util.load_labels(label_file)

    print('\nEvaluation time (1-image): {:.3f}s\n'.format(end - start))
    template = "{} (score={:0.5f})"
    print(top_k)
    for i in top_k:
        print(template.format(labels[i], results[i]))

    # If the best score is low, reply with "I really don't know, my best guess is ..."
    if results[top_k[0]] < 0.1:
        response = "I really don't know, my best guess is that this looks like a " + labels[top_k[0]]
    else:
        response = 'I think this is a ' + labels[top_k[0]]

    return jsonify(
        status=200,
        replies=[{'type': 'text', 'content': response}],
        conversation={'memory': {'plankton': labels[top_k[0]]}})
def predict(model_id, raw, validation, train, n_eyes, average_over_eyes):
    params.DISABLE_CUDNN = True
    params.MULTIPROCESS = False

    d = importlib.import_module("nets.net_" + model_id)
    model, X, y = d.define_net()
    model.load_params_from(params.SAVE_URL + "/" + model_id + "/best_weights")

    f = get_iter_func(model)

    # Decrease batch size because TTA increases it 16-fold
    # Uses too much memory otherwise
    params.BATCH_SIZE = 8

    io = ImageIO()
    mean, std = io.load_mean_std()

    if validation or train:
        y = util.load_labels()
    else:
        y = util.load_sample_submission()

    keys = y.index.values

    tta_bi = TTABatchIterator(keys, params.BATCH_SIZE, std, mean, cv=validation or train, n_eyes=n_eyes)
    print "TTAs per image: %i, augmented batch size: %i" % (tta_bi.ttas, tta_bi.ttas * params.BATCH_SIZE * n_eyes)

    if validation:
        X_test = np.load(params.IMAGE_SOURCE + "/X_valid.npy")
    elif train:
        X_test = np.load(params.IMAGE_SOURCE + "/X_train.npy")
    else:
        X_test = np.arange(y.shape[0])

    padded_batches = int(ceil(X_test.shape[0] / float(params.BATCH_SIZE)))

    pred = get_activations(X_test, tta_bi, f)

    concat_preds = []
    for batch_pred in pred:
        hidden = batch_pred[0]
        output = batch_pred[1]
        concat = np.concatenate([output, hidden], axis=1)

        #if average_over_eyes:
            #means = concat.reshape(concat.shape[0] / 2, 2, concat.shape[1])
            #means = means.mean(axis = 1)
            #means = np.repeat(means, 2, axis = 0)

        concat_preds.append(concat)

    pred = np.vstack(concat_preds)
    output_units = pred.shape[1]

    #pred = model.predict_proba(X_test)
    pred = pred.reshape(padded_batches, tta_bi.ttas, params.BATCH_SIZE, output_units)
    pred = np.mean(pred, axis=1)
    pred = pred.reshape(padded_batches * params.BATCH_SIZE, output_units)

    # Remove padded lines
    pred = pred[:X_test.shape[0]]

    # Save unrounded
    #y.loc[keys] = pred
    if validation:
        filename = params.SAVE_URL + "/" + model_id + "/raw_predictions_validation.npy"
    elif train:
        filename = params.SAVE_URL + "/" + model_id + "/raw_predictions_train.npy"
    else:
        filename = params.SAVE_URL + "/" + model_id + "/raw_predictions_test.npy"

    np.save(filename, pred)
    #y.to_csv(filename)

    print "Saved raw predictions to " + filename

    if not raw and not validation and not train:
        W = np.load(params.SAVE_URL + "/" + model_id + "/optimal_thresholds.npy")
        pred = weighted_round(pred, W)
        pred = pred[:, np.newaxis]  # add axis for pd compatibility

        hist, _ = np.histogram(pred, bins=5)
        print "Distribution over class predictions on test set: ", hist / float(y.shape[0])

        y.loc[keys] = pred
        y.to_csv(params.SAVE_URL + "/" + model_id + "/submission.csv")

        print "Gzipping..."
        if not params.ON_COMA:
            call("gzip -c " + params.SAVE_URL + "/" + model_id + "/submission.csv > " +
                 params.SAVE_URL + "/" + model_id + "/submission.csv.gz", shell=True)

        print "Done! File saved to models/" + model_id + "/submission.csv"
def define_net():
    define_net_specific_parameters()
    io = ImageIO()

    # Read pandas csv labels
    y = util.load_labels()

    if params.SUBSET != 0:
        y = y[:params.SUBSET]

    X = np.arange(y.shape[0])

    mean, std = io.load_mean_std(circularized=params.CIRCULARIZED_MEAN_STD)
    keys = y.index.values

    if params.AUGMENT:
        train_iterator = AugmentingParallelBatchIterator(keys, params.BATCH_SIZE, std, mean, y_all=y)
    else:
        train_iterator = ParallelBatchIterator(keys, params.BATCH_SIZE, std, mean, y_all=y)

    test_iterator = ParallelBatchIterator(keys, params.BATCH_SIZE, std, mean, y_all=y)

    if params.REGRESSION:
        y = util.float32(y)
        y = y[:, np.newaxis]

    if 'gpu' in theano.config.device:
        # Half of coma does not support cuDNN, check whether we can use it on this node
        # If not, use cuda_convnet bindings
        from theano.sandbox.cuda.dnn import dnn_available
        if dnn_available() and not params.DISABLE_CUDNN:
            from lasagne.layers import dnn
            Conv2DLayer = dnn.Conv2DDNNLayer
            MaxPool2DLayer = dnn.MaxPool2DDNNLayer
        else:
            from lasagne.layers import cuda_convnet
            Conv2DLayer = cuda_convnet.Conv2DCCLayer
            MaxPool2DLayer = cuda_convnet.MaxPool2DCCLayer
    else:
        Conv2DLayer = layers.Conv2DLayer
        MaxPool2DLayer = layers.MaxPool2DLayer

    Maxout = layers.pool.FeaturePoolLayer

    net = NeuralNet(
        layers=[
            ('input', layers.InputLayer),
            ('conv0', Conv2DLayer),
            ('pool0', MaxPool2DLayer),
            ('conv1', Conv2DLayer),
            ('pool1', MaxPool2DLayer),
            ('conv2', Conv2DLayer),
            ('pool2', MaxPool2DLayer),
            ('conv3', Conv2DLayer),
            ('pool3', MaxPool2DLayer),
            ('conv4', Conv2DLayer),
            ('pool4', MaxPool2DLayer),
            ('dropouthidden1', layers.DropoutLayer),
            ('hidden1', layers.DenseLayer),
            ('maxout1', Maxout),
            ('dropouthidden2', layers.DropoutLayer),
            ('hidden2', layers.DenseLayer),
            ('maxout2', Maxout),
            ('dropouthidden3', layers.DropoutLayer),
            ('output', layers.DenseLayer),
        ],

        input_shape=(None, params.CHANNELS, params.PIXELS, params.PIXELS),

        conv0_num_filters=32, conv0_filter_size=(5, 5), conv0_stride=(2, 2),
        pool0_pool_size=(2, 2), pool0_stride=(2, 2),

        conv1_num_filters=64, conv1_filter_size=(3, 3), conv1_border_mode='same',
        pool1_pool_size=(2, 2), pool1_stride=(2, 2),

        conv2_num_filters=128, conv2_filter_size=(3, 3), conv2_border_mode='same',
        pool2_pool_size=(2, 2), pool2_stride=(2, 2),

        conv3_num_filters=192, conv3_filter_size=(3, 3), conv3_border_mode='same',
        pool3_pool_size=(2, 2), pool3_stride=(2, 2),

        conv4_num_filters=256, conv4_filter_size=(3, 3), conv4_border_mode='same',
        pool4_pool_size=(2, 2), pool4_stride=(2, 2),

        hidden1_num_units=1024,
        hidden2_num_units=1024,

        dropouthidden1_p=0.5,
        dropouthidden2_p=0.5,
        dropouthidden3_p=0.5,

        maxout1_pool_size=2,
        maxout2_pool_size=2,

        output_num_units=1 if params.REGRESSION else 5,
        output_nonlinearity=None if params.REGRESSION else nonlinearities.softmax,

        update_learning_rate=theano.shared(util.float32(params.START_LEARNING_RATE)),
        update_momentum=theano.shared(util.float32(params.MOMENTUM)),

        custom_score=('kappa', quadratic_kappa),
        regression=params.REGRESSION,

        batch_iterator_train=train_iterator,
        batch_iterator_test=test_iterator,

        on_epoch_finished=[
            AdjustVariable('update_learning_rate', start=params.START_LEARNING_RATE),
            stats.Stat(),
            ModelSaver()
        ],

        max_epochs=500,
        verbose=1,

        # Only relevant when create_validation_split = True
        eval_size=0.1,

        # Need to specify splits manually like indicated below!
        create_validation_split=params.SUBSET > 0,
    )

    # It is recommended to use the same training/validation split for every model,
    # for ensembling and threshold optimization.
    #
    # To set a specific training/validation split:
    net.X_train = np.load(params.IMAGE_SOURCE + "/X_train.npy")
    net.X_valid = np.load(params.IMAGE_SOURCE + "/X_valid.npy")
    net.y_train = np.load(params.IMAGE_SOURCE + "/y_train.npy")
    net.y_valid = np.load(params.IMAGE_SOURCE + "/y_valid.npy")

    return net, X, y
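# Hypothetical usage (not from the original). Assuming nolearn's standard fit API,
# a training script could consume define_net() roughly like this:
#
#     net, X, y = define_net()
#     net.fit(X, y)   # X holds indices; the batch iterators above load the actual images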
from matplotlib import pyplot as plt

np.set_printoptions(threshold=np.nan)

########################
## This is the validation script for RGB-D. The creation and cropping of the RGB-D images are done in
## create_rgbd_hdf5.py, because the "images" cannot be stored in the intermediate step between
## creation/cropping and moving to hdf5.
## Some directories may need to be created before running some validation segments if they do not exist.
## All preprocessing of the labels and centers is identical to that of depth, so there is no need to redo it.
########################
dataset = 'rgbd'
phase = 'test'
root_dir = '/home/bilbeisi/REN/'

############################# Create RGB-D Images #################################
names = util.load_names('fpad', phase)
labels = util.load_labels('fpad', phase)
cnames = util.load_names('fpac', phase)
centers = util.load_centers('fpad', phase).astype(float)

imgs = np.zeros((len(names), 4, 96, 96), dtype=np.float32)
lbls = np.zeros((len(labels), 63), dtype=np.float)
cube_size = 150  # cube size in mm for cropping

for idx, name in enumerate(names):
    if idx % 1000 == 0:
        cname = cnames[idx]
        img = util.load_image('fpad', os.path.join('/home/bilbeisi/REN/', name))
        img[img == 0] = 1
        cimg = util.load_image('fpac',
# Remember to change dataset depending on the type of data (fpad for depth, fpac for rgb)
# Run for both phase='test' and phase='train' to get both datafiles
##################################

################ Confirm dataset before running this script! ###############################
dataset = 'fpad'  # fpac or fpad

DIR = "../labels/"
phase = 'test'  # test/train
h5_fn = os.path.join(DIR, (dataset + '_' + phase + '_data.h5'))

############## Dir containing preprocessed images ###############################
base_dir = '/home/bilbeisi/REN/cropped/'

names = util.load_names(dataset, phase)
labels = util.load_labels(dataset, phase)

if dataset == 'fpad':
    imgs = np.zeros((len(names), 1, 96, 96), dtype=np.float32)  # depth
else:
    imgs = np.zeros((len(names), 3, 96, 96), dtype=np.float32)
lbls = np.zeros((len(labels), 63), dtype=np.float)

for idx, name in enumerate(names):
    if dataset == 'fpac':
        name = name.replace('.jpeg', '.png')
    img = util.load_image(dataset, os.path.join(base_dir, name))
    img = img.astype(float)
    # revert back to normalized -1,1 since images were saved in 0,255 to allow viewing/verifying
    img[:] *= 2
    img[:] /= 255
def singlePipeline(nr_centroids, nr_it, label_path="../data/preprocessed.h5", clsfr="SGD",
                   calc_centroids=True, dogfeed=True, train_model=True, cache_size=4000,
                   degree=3, tol=1e-3, max_iter=-1, kernel='rbf', model_file='UNSPECIFIED'):
    if calc_centroids:
        print "calculating centroids..."
        # Finds the features using kmeans
        kmTrainer = kmeans.kMeansTrainer(nr_centroids=nr_centroids, nr_it=nr_it)
        centroids = kmTrainer.fit()
        kmTrainer.save_centroids(centroids)

        print "calculating activations..."
        # Calculates the activation of the test set
        act_calc = act.ActivationCalculation()
        features = act_calc.pipeline(centroids)
    else:
        print "loading activations from file..."
        # loads feature data
        feature_data = h5py.File("../data/activations_train/" + str(nr_centroids) + "activationkmeans.h5")
        features = feature_data["activations"]

    print "Loading labels from file..."
    # get the labels
    labels = util.load_labels(label_path)
    label_names = util.load_label_names(label_path)
    print "Got labels"

    if clsfr == "SGD":
        if train_model:
            # Train the SGD classifier
            print "Begin training of SGD..."
            train.trainSGD(features, labels, nr_centroids)
            print "Training done"
        if not dogfeed:
            return
        print "Dogfeeding"
        # Predict based on SGD training
        print "Begin SGD predictions..."
        classified = classifier.predict(features, nr_centroids, degree=degree, cache_size=cache_size)
        print "Predicting done"
    elif clsfr == "SVC" or clsfr == "NUSVR":
        if train_model:
            print "Begin training of Model..."
            if clsfr == "SVC":
                # Train SVC classifier
                model = svc.train_svc(features, labels, nr_centroids, degree=degree, cache_size=cache_size,
                                      tol=tol, max_iter=max_iter, kernel=kernel)
            else:
                # Train SVC classifier
                model = svc.train_svc(features, labels, nr_centroids, degree=degree, cache_size=cache_size,
                                      tol=tol, max_iter=max_iter, kernel=kernel)
            print "Training done"
        else:
            print "Loading model"
            model = joblib.load(model_file)
        if not dogfeed:
            return
        print "Dogfeeding"
        # Predict based on SVC training
        print "Begin SVC predictions..."
        classified = model.predict_proba(features)
        print "Predicting done"
    else:
        print "Selected classifier not available, please use an available classifier"
        return

    print "Calculating log loss..."
    summing = 0
    correct = 0
    np.savetxt("meuk.csv", classified, delimiter=";")
    loss = metrics.log_loss(labels, classified)
    print loss
    print -np.mean(np.log(classified)[np.arange(len(labels)), labels])

    # calculate the log loss
    for i, label in enumerate(labels):
        actual = labels[i]
        if classified[i][label] == 0:
            summing += np.log(10e-15)
        else:
            summing += np.log(classified[i][label])
        if actual == np.argmax(classified[i]):
            correct += 1

    image = np.zeros((len(label_names), len(labels)))
    for j, label_index in enumerate(labels):
        image[label_index, j] = 1
    scipy.misc.imsave('correct.png', image)
    scipy.misc.imsave('predicted.png', classified.T)
    error = image - classified.T
    scipy.misc.imsave('error.png', error)

    print "Calculation finished"
    summing = -summing / len(labels)
    print "log loss: ", summing
    print "correct/amount_of_labels: ", float(correct) / len(labels)
    print "lowest classification score: ", np.min(classified)
    # print summing

    np.savetxt("realLabel.csv", labels, delimiter=";")
    # np.savetxt("SGD_label.csv", max_SGD, delimiter=";")
    if calc_centroids is False:
        feature_data.close()
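# Hypothetical invocation (argument values are illustrative only, not from the original):
#
#     singlePipeline(nr_centroids=500, nr_it=10, clsfr="SVC", kernel="rbf",
#                    calc_centroids=False, train_model=True, dogfeed=True)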
def define_net():
    define_net_specific_parameters()
    io = ImageIO()

    # Read pandas csv labels
    y = util.load_labels()

    if params.SUBSET != 0:
        y = y[:params.SUBSET]

    X = np.arange(y.shape[0])

    mean, std = io.load_mean_std(circularized=params.CIRCULARIZED_MEAN_STD)
    keys = y.index.values

    if params.AUGMENT:
        train_iterator = AugmentingParallelBatchIterator(keys, params.BATCH_SIZE, std, mean, y_all=y)
    else:
        train_iterator = ParallelBatchIterator(keys, params.BATCH_SIZE, std, mean, y_all=y)

    test_iterator = ParallelBatchIterator(keys, params.BATCH_SIZE, std, mean, y_all=y)

    if params.REGRESSION:
        y = util.float32(y)
        y = y[:, np.newaxis]

    if 'gpu' in theano.config.device:
        # Half of coma does not support cuDNN, check whether we can use it on this node
        # If not, use cuda_convnet bindings
        from theano.sandbox.cuda.dnn import dnn_available
        if dnn_available():
            from lasagne.layers import dnn
            Conv2DLayer = dnn.Conv2DDNNLayer
            MaxPool2DLayer = dnn.MaxPool2DDNNLayer
        else:
            from lasagne.layers import cuda_convnet
            Conv2DLayer = cuda_convnet.Conv2DCCLayer
            MaxPool2DLayer = cuda_convnet.MaxPool2DCCLayer
    else:
        Conv2DLayer = layers.Conv2DLayer
        MaxPool2DLayer = layers.MaxPool2DLayer

    Maxout = layers.pool.FeaturePoolLayer

    net = NeuralNet(
        layers=[
            ('input', layers.InputLayer),
            ('conv0', Conv2DLayer),
            ('pool0', MaxPool2DLayer),
            ('conv1', Conv2DLayer),
            ('pool1', MaxPool2DLayer),
            ('conv2', Conv2DLayer),
            ('pool2', MaxPool2DLayer),
            ('conv3', Conv2DLayer),
            ('pool3', MaxPool2DLayer),
            ('conv4', Conv2DLayer),
            ('pool4', MaxPool2DLayer),
            ('dropouthidden1', layers.DropoutLayer),
            ('hidden1', layers.DenseLayer),
            ('maxout1', Maxout),
            ('dropouthidden2', layers.DropoutLayer),
            ('hidden2', layers.DenseLayer),
            ('maxout2', Maxout),
            ('dropouthidden3', layers.DropoutLayer),
            ('output', layers.DenseLayer),
        ],

        input_shape=(None, params.CHANNELS, params.PIXELS, params.PIXELS),

        conv0_num_filters=32, conv0_filter_size=(5, 5), conv0_stride=(2, 2),
        pool0_pool_size=(2, 2), pool0_stride=(2, 2),

        conv1_num_filters=64, conv1_filter_size=(5, 5), conv1_border_mode='same',
        pool1_pool_size=(2, 2), pool1_stride=(2, 2),

        conv2_num_filters=128, conv2_filter_size=(3, 3), conv2_border_mode='same',
        pool2_pool_size=(2, 2), pool2_stride=(2, 2),

        conv3_num_filters=192, conv3_filter_size=(3, 3), conv3_border_mode='same',
        pool3_pool_size=(2, 2), pool3_stride=(2, 2),

        conv4_num_filters=256, conv4_filter_size=(3, 3), conv4_border_mode='same',
        pool4_pool_size=(2, 2), pool4_stride=(2, 2),

        hidden1_num_units=1024,
        hidden2_num_units=1024,

        dropouthidden1_p=0.5,
        dropouthidden2_p=0.5,
        dropouthidden3_p=0.5,

        maxout1_pool_size=2,
        maxout2_pool_size=2,

        output_num_units=1 if params.REGRESSION else 5,
        output_nonlinearity=None if params.REGRESSION else nonlinearities.softmax,

        update_learning_rate=theano.shared(util.float32(params.START_LEARNING_RATE)),
        update_momentum=theano.shared(util.float32(params.MOMENTUM)),

        custom_score=('kappa', quadratic_kappa),
        regression=params.REGRESSION,

        batch_iterator_train=train_iterator,
        batch_iterator_test=test_iterator,

        on_epoch_finished=[
            AdjustVariable('update_learning_rate', start=params.START_LEARNING_RATE),
            stats.Stat(),
            ModelSaver()
        ],

        max_epochs=500,
        verbose=1,

        # Only relevant when create_validation_split = True
        eval_size=0.1,

        # Need to specify splits manually like indicated below!
        create_validation_split=params.SUBSET > 0,
    )

    # It is recommended to use the same training/validation split for every model,
    # for ensembling and threshold optimization.
    #
    # To set a specific training/validation split:
    net.X_train = np.load(params.IMAGE_SOURCE + "/X_train.npy")
    net.X_valid = np.load(params.IMAGE_SOURCE + "/X_valid.npy")
    net.y_train = np.load(params.IMAGE_SOURCE + "/y_train.npy")
    net.y_valid = np.load(params.IMAGE_SOURCE + "/y_valid.npy")

    return net, X, y
if __name__ == "__main__": # # Serve the app with gevent # http_server = WSGIServer(("0.0.0.0", 5000), app) # http_server.serve_forever() MODEL_PATH = "models/model_cpc_1.tflite" LABELS_PATH = "models/imageLabels.txt" interpreter = tf.lite.Interpreter( model_path = MODEL_PATH) interpreter.allocate_tensors() labels = load_labels(LABELS_PATH) input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() print(f"INPUT {input_details}") print(f"OUTPUT {output_details}") floating_model = input_details[0]['dtype'] == np.float32 # NxHxWxC, H:1, W:2 height = input_details[0]['shape'][1] width = input_details[0]['shape'][2] input_mean = 127.5 input_std = 127.5 print("Running server on http://127.0.0.1:5000/")
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    mfcc = dict(
        zip([metadata[i][0] for i in range(1, len(metadata))],
            util.load_features((dataset + "_features") if dataset else None)))

    # Load pyAudioAnalysis features
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))

    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False

    X2, X3 = [], []
    if full_dataset:
        X3 = [
            np.concatenate((F[item[0]], mfcc[item[0]]), axis=0)
            for item in metadata[1:]
        ]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]
    # X = util.load_features((dataset + "_features") if dataset else None)

    for X in [X1, X2]:
        labels = []
        avg_mat = None
        all_sims = dict()
        Y = util.load_labels((dataset + "_metadata") if dataset else None)
        samples = range(len(X))  # range(1, len(X), 12)  # random.sample(range(len(X)), 25)
        samps = range(len(X))  # samples
        x = [X[i] for i in samps]
        y = [Y[i] for i in samples]
        N_ESTIMATORS = 80
        NUM_RUNS = 5
        for run in range(NUM_RUNS):
            clf = RandomForestClassifier(n_estimators=N_ESTIMATORS,
                                         max_features=25,
                                         oob_score=True).fit(X, Y)
            similarity = dict()
            for dt in clf.estimators_:
                leaves = dt.apply(X)
                for i in samps:
                    for j in samps:
                        if leaves[i] == leaves[j]:
                            similarity[(i, j)] = similarity.get((i, j), 0) + (1 / N_ESTIMATORS)

            species_similarity = dict()
            for i in samps:
                for j in samps:
                    species_similarity[(Y[i], Y[j])] = species_similarity.get(
                        (Y[i], Y[j]), 0) + similarity.get((i, j), 0)**2 / (Y.count(Y[i]) * Y.count(Y[j]))
            for k in species_similarity:
                species_similarity[k] = species_similarity[k]**(0.5)

            labels = clf.classes_
            for i in range(len(labels)):
                normal = species_similarity[(labels[i], labels[i])]
                for j in range(i, len(labels)):
                    k = labels[i], labels[j]
                    species_similarity[k] /= normal
                    species_similarity[(k[1], k[0])] = species_similarity[k]
                    all_sims[k] = all_sims.get(k, 0) + species_similarity[k] / NUM_RUNS

            mat = np.array([[(1.0 - species_similarity.get((i, j), 0))**2 for j in labels]
                            for i in labels])
            print(mat)
            mat = squareform(mat)
            if avg_mat is None:
                avg_mat = mat
            else:
                avg_mat = np.add(avg_mat, mat)

        avg_mat = avg_mat / NUM_RUNS
        print(avg_mat)
        for k in all_sims:
            if k[0] != k[1] and all_sims[k] > 0.1:
                print("{}\t{}\t{}".format(k[0], k[1], all_sims[k]))

        linkage_matrix = linkage(avg_mat, "single")
        matplotlib.rcParams['lines.linewidth'] = 2.5
        dendrogram(linkage_matrix, color_threshold=0.65, labels=labels, show_leaf_counts=True)
        plt.xlabel("label")
        plt.ylabel("distance")
        plt.show()
def main(data_path, out_path, labelfile, resume, batch_size, epochs, resnet_depth, train):
    try:
        if not os.path.isdir(out_path):
            os.makedirs(out_path)

        training_data, validation_data = load_data(data_path)
        imChannels, _, _ = training_data[0][0].shape

        # load labels in decoder format
        labels, alpha_len = load_labels(labelfile)

        if resume is None:
            # start training from beginning
            model = Model(out_path, resnet_depth, imChannels, alpha_len, labels)

            # weight_decay == l2 lambda
            # SGD tends to get better end-results
            # Learning-rate is reduced on plateau, check model.py for details.
            optimizer = optim.SGD(model.parameters(), lr=1e-4, weight_decay=0.2)
            #optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.2)

            # save the summary of the model
            with open(out_path + os.sep + "modelsummary_ctc.txt", 'w') as f:
                with redirect_stdout(f):
                    print(str(model))
        else:
            # resume from checkpoint
            if resume == "best":
                print("Resuming from best checkpoint")
                checkpoint = torch.load(os.path.join(out_path, 'checkpoint_best.pth.tar'))
            else:
                print("Resuming from last checkpoint")
                checkpoint = torch.load(os.path.join(out_path, 'checkpoint.pth.tar'))

            model = Model(checkpoint['model_params']['output'],
                          checkpoint['model_params']['resnet_depth'],
                          checkpoint['model_params']['imChannels'],
                          checkpoint['model_params']['alphabet_length'],
                          checkpoint['model_params']['labels'],
                          last_epoch=checkpoint['epoch'])
            model.load_state_dict(checkpoint['model_states'])

            optimizer = optim.SGD(model.parameters(), lr=1e-4, weight_decay=0.2)
            #optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.2)
            optimizer.load_state_dict(checkpoint['optimizer'])

            # optimizer states have to be moved to GPU manually
            if torch.cuda.is_available():
                for state in optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

        # logger visualizes training process, can be followed during training
        logger = HistoryLogger(model, out_path, batch_size, epochs,
                               validation_data[0], validation_data[1],
                               training_data[0], training_data[1])

        if train:
            model.fit(training_data, validation_data, optimizer,
                      batch_size=batch_size, epochs=epochs, logger=logger)
            torch.save(model, out_path + os.sep + "model_final.pth.tar")

    except Exception as err:
        print(err.args)
        raise
#     lbls[lid] = np.reshape(joints, (63))
#
# lbls = np.reshape(lbls, (-1, 63))
#
# x = util.normalize_pose(dataset, lbls, centers, 150, fx, fy)
#
# util.save_results(x, out_file)
# ################################################################################


########################### Test RGB Normalized joints: norm to 2D pixel to 3D World back to 2D pixel and plot #################################
### Test label normalization by projecting the normalized joints onto some RGB image samples
### this segment is only for validation
#############################################################################################################################################
lbls = util.load_labels(dataset, phase)  ### load test/train data
names = util.load_names(dataset, phase)
centers = util.load_centers(dataset, phase).astype(float)
fx, fy, ux, uy = util.get_param(dataset)

lbls = [s.split() for s in lbls]
lbls = np.reshape(np.asarray(lbls, dtype=np.float32), (-1, 63))
lbls = util.transform_pose(dataset, lbls, centers, 150, fx, fy)  # norm to 2D pixel

centers = np.reshape(centers, (-1, 3))

for idx, name in enumerate(names):
    if idx % 1000 == 0:
        lbl = util.pixel2world(lbls[idx], dataset)  # pixel to 3D world
        lbl, skel_camcoords = util.world2pixel(lbl, dataset)  # back to 2D pixel from 3D world
        img = util.load_image(dataset, os.path.join(root_dir, name))
########################
## This is the preprocessing and validation script for Depth.
## Keep the segment you would like to use and comment out the rest;
## this is because there are multiple data files (labels before/after normalization) that will cause conflicts.
########################
dataset = 'fpad'
phase = 'train'  ## test/train
root_dir = '/home/bilbeisi/REN/'

############################ Draw pose on depth samples #################################
### draw pose on some depth samples to validate world2pixel and image/label loading
### this segment is only for validation
##############################################################################################
lbls = util.load_labels(dataset, phase)  ### load test/train data
names = util.load_names(dataset, phase)
centers = util.load_centers(dataset, phase).astype(float)

for idx, name in enumerate(names):
    if idx % 1000 == 0:
        lbl = np.asarray(np.reshape(lbls[idx].split(), (21, 3)), dtype=np.float32)
        lbl, skel_camcoords = util.world2pixel(lbl, dataset)

        img = util.load_image(dataset, os.path.join(root_dir, name))
        img /= 1160
        img *= 255

        points = centers[idx]
        img = util.draw_pose(dataset, img, lbl, 3, (255, 0, 0), points)
        cv2.imwrite(root_dir + 'samples/depth/' + phase + '_' + str(idx) + '.png', img)
def train_classifier(feature_name, train_batch_num, base_npz_dir, test_batches):
    test_acc = []
    base_path = util.get_base_path()
    categories = util.get_categories()

    train_batches = range(0, train_batch_num)
    #test_batches = range(train_batch_num,train_batch_num+1) JC edit

    set_name = 'setb50k'
    label_set_name = set_name
    subset = ''  # '_pca1'

    classifier_paramstring = ''
    if do_norm:
        classifier_paramstring += 'N'
    if props['C'] != 0:
        classifier_paramstring += 'C%d' % props['C']
    out_fn = os.path.join(
        base_npz_dir, feature_name,
        '%s%s_%s%s_%d-%d.pickle' % (classifier_type, classifier_paramstring, set_name, subset,
                                    train_batches[0], train_batches[-1]))
    if do_norm:
        out_fn_norm = os.path.join(
            base_npz_dir, feature_name,
            'norm_%s%s_%d.pickle' % (set_name, subset, train_batches[0]))

    print 'Training %s...' % out_fn

    if classifier_type == 'sgd_svm':
        is_incremental = True
    else:
        is_incremental = False

    norm = dict()
    clf = None
    for i_batch, train_batch in enumerate(train_batches + test_batches):
        fn = os.path.join(base_npz_dir, feature_name,
                          '%s_%05d%s.npz' % (set_name, train_batch, subset))
        print 'Processing feature file %s.' % fn
        print fn
        with np.load(fn) as file_contents:
            data = file_contents['data']
        true_labels, _ = util.load_labels(label_set_name, train_batch)

        if do_norm:
            if i_batch == 0:
                # Initial batch to determine mean and variance for normalization
                norm['mean'] = np.expand_dims(data.mean(axis=0), 0)
                norm['std'] = np.expand_dims(data.std(axis=0), 0)
                norm['std'] = np.maximum(norm['std'], 0.01)
                with open(out_fn_norm, 'wb') as fid:
                    pickle.dump(norm, fid)
            data -= norm['mean']
            data /= norm['std']
            print 'Data after normalization: Mean %f, Std %f' % (
                data.mean(axis=0).mean(axis=0), data.std(axis=0).mean(axis=0))

        if is_incremental:
            # Incremental: Do training every training iteration
            # Do testing not just on test but also during training before feeding the new training data
            do_train = (i_batch < len(train_batches))
            do_test = (i_batch > 0)
            use_data = data
            use_true_labels = true_labels
        else:
            # Non-incremental: Train once when all training batches have been collected
            do_train = (i_batch == len(train_batches) - 1)
            do_test = (i_batch >= len(train_batches))

            # data collection phase
            if not do_test:
                if i_batch == 0:
                    data_all = data
                    all_true_labels = true_labels
                else:
                    data_all = np.concatenate((data_all, data), axis=0)
                    all_true_labels = np.concatenate((all_true_labels, true_labels), axis=0)
                use_data = data_all
                use_true_labels = all_true_labels
                print ' use data %s.' % str(use_data.shape)
                print ' use labels %s' % str(use_true_labels.shape)

        if do_test:
            # After some batch training has been done, predict performance
            pred_labels = clf.predict(data)
            acc = float(sum(pred_labels == true_labels)) / true_labels.size
            test_acc.append(acc)
            print ' Batch accuracy: %.1f%%' % (acc * 100)

        if do_train:
            if classifier_type == 'sgd_svm':
                clf = train_sgd(clf, 'hinge', use_data, use_true_labels)
            elif classifier_type == 'svm':
                clf = train_svm(clf, use_data, use_true_labels, props)
            pred_labels = clf.predict(use_data)
            acc = float(sum(pred_labels == use_true_labels)) / use_true_labels.size
            print ' Train accuracy: %.1f%%' % (acc * 100)

            # Dump classifier data at every iteration
            with open(out_fn, 'wb') as fid:
                pickle.dump(clf, fid)

    return np.mean(test_acc)