def preprocess(self, tokenizer_name, var_length=False):
    """Read the train/test CSV files, tokenize them and build vocabularies.

    Args:
        tokenizer_name: Name handed to ``self.choose_tokenizer`` to pick the
            tokenizer used for the text column.
        var_length: Forwarded as ``include_lengths`` to the text field.

    Side effects:
        Sets ``self.text``, ``self.qid``, ``self.target``, ``self.train`` and
        ``self.test``, and builds the text and qid vocabularies from both
        datasets.
    """
    started = time.time()

    # Field definitions: one per CSV column type.
    self.text = data.Field(
        batch_first=True,
        tokenize=self.choose_tokenizer(tokenizer_name),
        include_lengths=var_length,
    )
    self.qid = data.Field()
    self.target = data.Field(sequential=False, use_vocab=False, is_target=True)

    # CSV column name -> (dataset attribute name, field).
    qid_column = ('qid', self.qid)
    text_column = ('text', self.text)
    train_fields = {
        'qid': qid_column,
        'question_text': text_column,
        'target': ('target', self.target),
    }
    # The test file is read without the target column.
    test_fields = {
        'qid': qid_column,
        'question_text': text_column,
    }

    print('read and tokenize data...')
    self.train = MyTabularDataset(path=self.train_csv, format='csv',
                                  fields=train_fields)
    self.test = MyTabularDataset(path=self.test_csv, format='csv',
                                 fields=test_fields)
    print_duration(started, 'time to read and tokenize data: ')

    # Vocabularies are built over train and test together.
    self.text.build_vocab(self.train, self.test, min_freq=1)
    self.qid.build_vocab(self.train, self.test)
    print_duration(started, 'time to read, tokenize and build vocab: ')
def k_split_indices(randperm, cut_idxs, k, i, is_test):
    """Split shuffled indices into train / validation / test sets for fold ``i``.

    Args:
        randperm: Sequence of shuffled sample indices.
        cut_idxs: Fold boundaries into ``randperm``; fold ``j`` is the slice
            ``randperm[cut_idxs[j]:cut_idxs[j + 1]]``.
        k: Number of folds.
        i: Index of the fold used for validation.
        is_test: When true, the fold following ``i`` is held out as a test
            set; for the last fold this wraps around to fold 0.

    Returns:
        A tuple ``(train_index, val_index, test_index)``. ``train_index`` is a
        list of every index in ``randperm`` (original order) that is in neither
        held-out slice. ``test_index`` is ``[]`` when ``is_test`` is false.
    """
    time_start = time.time()

    # Validation fold.
    val_index = randperm[cut_idxs[i]:cut_idxs[i + 1]]

    # Optional test fold: the next fold, wrapping to the first one.
    if is_test:
        test_fold = i + 1 if i <= k - 2 else 0
        test_index = randperm[cut_idxs[test_fold]:cut_idxs[test_fold + 1]]
    else:
        test_index = []

    # Build the held-out set without ``val_index + test_index``: ``+`` only
    # concatenates for plain lists, whereas ``union`` accepts any iterable and
    # yields the same set for lists.
    held_out = set(val_index).union(test_index)
    print_duration(time_start, message='k_split_indices time')

    # Training indices: everything not held out, in shuffled order.
    train_index = [idx for idx in randperm if idx not in held_out]
    print_duration(time_start, message='k_split_indices time')
    return train_index, val_index, test_index
def read_embedding(self, embeddings, unk_std, max_vectors, to_cache):
    """Load each requested embedding and append it to ``self.vectors``.

    Args:
        embeddings: Iterable of embedding identifiers, each passed as the first
            argument to ``MyVectors``.
        unk_std: Bound as the ``std`` argument of ``normal_init``; the
            resulting callable is passed as ``unk_init``.
        max_vectors: Forwarded to ``MyVectors`` as ``max_vectors``.
        to_cache: Forwarded to ``MyVectors`` as ``to_cache``.
    """
    started = time.time()
    init_unknown = partial(normal_init, std=unk_std)
    for source in embeddings:
        loaded = MyVectors(
            source,
            cache=self.cache,
            to_cache=to_cache,
            unk_init=init_unknown,
            max_vectors=max_vectors,
        )
        self.vectors.append(loaded)
    print_duration(started, 'time to read embedding: ')
def main():
    """Evaluate the previously trained HOG/SVM classifier on the testing set.

    Loads the SVM from ``params.HOG_SVM_PATH_SAVED``, loads the negative and
    positive testing images, computes their HOG descriptors, classifies them
    in one batch and prints the resulting error rate.
    """
    # Load up the SVM stored from prior training.
    # ``except Exception`` (not a bare ``except``) so Ctrl-C and SystemExit
    # are not reported as a missing model file.
    try:
        svm = cv2.ml.SVM_load(params.HOG_SVM_PATH_SAVED)
    except Exception:
        print("Missing files SVM")
        print("-- have you performed training to produce this file ?")
        exit()

    # Load ** testing ** data sets in the same class order as training
    # (here we perform patch sampling only from the centre of the +ve
    # class and only a single sample is taken,
    # hence [0,0] sample sizes and [False,True] centre weighting flags).
    print("Loading test data as a batch ...")
    paths = [params.DATA_testing_path_neg, params.DATA_testing_path_pos]
    use_centre_weighting = [False, True]
    class_names = params.DATA_CLASS_NAMES
    imgs_data = utils.load_images(paths, class_names, [0, 0],
                                  use_centre_weighting)

    print("Computing HOG descriptors...")
    start = cv2.getTickCount()
    # Called for its side effect only, so a plain loop rather than a
    # throwaway list comprehension.
    for img_data in imgs_data:
        img_data.compute_hog_descriptor()
    utils.print_duration(start)

    # Get the example/sample HOG descriptors and class labels.
    samples = utils.get_hog_descriptors(imgs_data)
    class_labels = utils.get_class_labels(imgs_data)

    # Perform batch SVM classification over the whole set.
    print("Performing batch SVM classification over all data ...")
    results = svm.predict(samples)
    output = results[1].ravel()

    # Compute and report the error over the whole set: summed absolute
    # difference between labels and predictions, divided by the sample count.
    error = ((np.absolute(class_labels.ravel() - output).sum()) /
             float(output.shape[0]))
    print("Successfully trained SVM with {}% testing set error".format(
        round(error * 100, 2)))
    print(
        "-- meaining the SVM got {}% of the testing examples correct!".format(
            round((1.0 - error) * 100, 2)))
def main():
    """Train the HOG/SVM detector on the training image set and save it.

    Stages: load and sample the training images, compute a HOG descriptor per
    sample, auto-train an SVM on the descriptors, save it to
    ``params.HOG_SVM_PATH_TRAIN`` and report the error on the training set.
    """
    ############################################################################
    # load our training data set of image examples

    t_program = cv2.getTickCount()

    print("Loading images...")
    t_stage = cv2.getTickCount()

    # N.B. data path names are given in the same order as class names (neg, pos)
    paths = [params.DATA_training_path_neg, params.DATA_training_path_pos]

    # class names are derived from the dictionary of (name, number) pairs
    class_names = list(
        map(utils.get_class_name, range(len(params.DATA_CLASS_NAMES))))

    # number of sub-window samples to take from each negative and positive
    # example image -- again in (neg, pos) order
    sampling_sizes = [
        params.DATA_training_sample_count_neg,
        params.DATA_training_sample_count_pos,
    ]

    # sample centrally from the example image, or randomly?
    #   background (-ve, first class): no
    #   object (+ve, second class): yes
    sample_from_centre = [False, True]

    imgs_data = utils.load_images(
        paths,
        class_names,
        sampling_sizes,
        sample_from_centre,
        params.DATA_WINDOW_OFFSET_FOR_TRAINING_SAMPLES,
        params.DATA_WINDOW_SIZE,
    )

    print("Loaded {} image(s)".format(len(imgs_data)))
    utils.print_duration(t_stage)

    ############################################################################
    # perform HOG feature extraction

    print("Computing HOG descriptors...")
    t_stage = cv2.getTickCount()

    # each HOG descriptor is stored in its respective img_data instance
    for img_data in imgs_data:
        img_data.compute_hog_descriptor()

    utils.print_duration(t_stage)

    ############################################################################
    # train an SVM on the HOG descriptors

    print("Training SVM...")
    t_stage = cv2.getTickCount()

    svm = cv2.ml.SVM_create()
    svm.setType(cv2.ml.SVM_C_SVC)         # SVM type
    svm.setKernel(params.HOG_SVM_kernel)  # kernel type

    # one descriptor row per sample, gathered into a single array
    samples = utils.get_hog_descriptors(imgs_data)

    # class label for each training sample
    # (i.e. 0 for other, 1 for pedestrian... can extend)
    class_labels = utils.get_class_labels(imgs_data)

    # termination criteria for the SVM training
    stop_mode = cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_COUNT
    svm.setTermCriteria(
        (stop_mode, params.HOG_SVM_max_training_iterations, 1e-06))

    # auto training performs a grid search over the parameters of the chosen
    # kernel and the penalty cost term, C
    # (N.B. trainAuto() syntax is correct as of OpenCV 3.4.x)
    svm.trainAuto(samples, cv2.ml.ROW_SAMPLE, class_labels,
                  kFold=10, balanced=True)

    # save the trained SVM so it can be loaded again for testing / detection
    svm.save(params.HOG_SVM_PATH_TRAIN)

    ############################################################################
    # measure performance of the trained SVM on the examples it was trained on

    output = svm.predict(samples)[1].ravel()
    mismatch_total = np.absolute(class_labels.ravel() - output).sum()
    error = mismatch_total / float(output.shape[0])

    # we are successful if our prediction is better than random
    # e.g. for 2 class labels this would be 1/2 = 0.5 (i.e. 50%)
    chance_level = 1.0 / len(params.DATA_CLASS_NAMES)
    if error < chance_level:
        print("Trained SVM obtained {}% training set error".format(
            round(error * 100, 2)))
        print("-- meaining the SVM got {}% of the training examples correct!".format(
            round((1.0 - error) * 100, 2)))
    else:
        print("Failed to train SVM. {}% error".format(round(error * 100, 2)))

    utils.print_duration(t_stage)

    elapsed = format_time(get_elapsed_time(t_program))
    print("Finished training HoG detector. {}".format(elapsed))
def fit(self, epoch, n_eval, tresh, early_stop, warmup_epoch, clip):
    """Train ``self.model`` with periodic validation and early stopping.

    Args:
        epoch: Maximum number of epochs to run.
        n_eval: Divisor of the per-epoch batch count that gives the number of
            steps between validation passes.
        tresh: Forwarded to ``self.evaluate`` (presumably the decision
            threshold -- confirm against ``evaluate``).
        early_stop: Training stops once this many consecutive epochs (from
            ``warmup_epoch`` on) finish without a new best validation F1.
        warmup_epoch: First epoch at which the embedding weights are made
            trainable and early-stopping bookkeeping starts.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Side effects:
        Appends to the recorder's train/norm/val records, writes validation
        predictions via ``pred_to_csv`` at every evaluation, saves the model
        whenever validation F1 improves and finally replaces ``self.model``
        with the model returned by ``self.recorder.load``.
    """
    step = 0
    min_loss = 1e5
    max_f1 = -1
    no_improve_epoch = 0
    no_improve_in_previous_epoch = False
    fine_tuning = False
    losses = []
    torch.backends.cudnn.benchmark = False

    # Count the batches of one pass without keeping them all alive at once.
    n_batches = sum(1 for _ in self.train_dl)
    # At least 1: a zero interval would make ``step % eval_every`` raise
    # ZeroDivisionError when n_eval exceeds the number of batches.
    eval_every = max(1, int(n_batches / n_eval))

    time_start = time.time()
    print(self.model)

    for e in range(epoch):
        self.scheduler.step()

        # Early stopping, only active from the warm-up epoch on. The flag is
        # cleared below whenever validation F1 improves during an epoch.
        if e >= warmup_epoch:
            if no_improve_in_previous_epoch:
                no_improve_epoch += 1
                if no_improve_epoch >= early_stop:
                    # This epoch is not run; step back so that ``e + 1``
                    # below counts only the epochs actually trained.
                    e = e - 1
                    break
            else:
                no_improve_epoch = 0
            no_improve_in_previous_epoch = True

        # Make the embedding weights trainable once warm-up is over.
        if not fine_tuning and e >= warmup_epoch:
            self.model.embedding.weight.requires_grad = True
            fine_tuning = True

        self.train_dl.init_epoch()
        for train_batch in self.train_dl:
            step += 1
            self.model.zero_grad()
            self.model.train()

            model_input = self.to_cuda(train_batch.text)
            y = train_batch.target.type(torch.Tensor).cuda()
            pred = self.model.forward(*model_input).view(-1)
            loss = self.loss_func(pred, y)
            self.recorder.tr_record.append(
                {'tr_loss': loss.cpu().data.numpy()})

            loss.backward()
            # clip_grad_norm_ returns the total gradient norm, which is
            # recorded for diagnostics.
            total_norm = nn.utils.clip_grad_norm_(self.model.parameters(),
                                                  clip)
            self.recorder.norm_record.append({'grad_norm': total_norm})
            self.optimizer.step()

            if step % eval_every == 0:
                with torch.no_grad():
                    # Train loss: mean of the batch losses sampled at each
                    # evaluation step so far.
                    losses.append(loss.cpu().data.numpy())
                    train_loss = np.mean(losses)

                    # Validation pass.
                    val_loss, val_f1, val_ids, val_prob, val_true = \
                        self.evaluate(self.val_dl, tresh)
                    pred_to_csv(
                        val_ids, val_prob, val_true,
                        f'tmp/val_probs_{int(step / eval_every) - 1}.csv')
                    self.recorder.val_record.append({
                        'step': step,
                        'loss': val_loss,
                        'f1': val_f1
                    })
                    info = {
                        'best_ep': e,
                        'step': step,
                        'train_loss': train_loss,
                        'val_loss': val_loss,
                        'val_f1': val_f1
                    }
                    self.recorder.save_step(info, message=True)

                    # Save the model on a new best validation F1.
                    if val_f1 > max_f1:
                        self.recorder.save(self.model, info)
                        max_f1 = val_f1
                        no_improve_in_previous_epoch = False
                    if val_loss < min_loss:
                        min_loss = val_loss
                    # NOTE: test-set evaluation is currently disabled here.

    tr_time = print_duration(time_start, 'training time: ')
    self.recorder.append_info({'ep_time': tr_time / (e + 1)})
    self.recorder.append_info({'min_loss': min_loss}, 'min val loss: ')

    # Reload the model handed back by the recorder and score it on the
    # training data.
    self.model, info = self.recorder.load(message='best model:')
    train_loss, train_f1, _, _, _ = self.evaluate(self.train_dl, tresh)
    tr_info = {'train_loss': train_loss, 'train_f1': train_f1}
    self.recorder.append_info(tr_info, message='train loss and f1:')
def embedding_lookup(self):
    """Load ``self.vectors`` into the text vocabulary and report the time taken."""
    print('embedding lookup...')
    started = time.time()
    vocab = self.text.vocab
    vocab.load_vectors(self.vectors)
    print_duration(started, 'time for embedding lookup: ')
feed_dict={x: testX, y_: testY, keep_prob: 1.0}) exam_cross_entropy_avg = utils.get_moving_avg(exam_cross_entropy_avg, test_cross_entropy) exam_accuracy_avg = utils.get_moving_avg(exam_accuracy_avg, test_accuracy) exam_writer.add_summary(summary, i) if i % DISPLAY_STEP == 0: print("step %d, exam accuracy %f cross_entropy %f" % (i, exam_accuracy_avg, exam_cross_entropy_avg)) print("=====> test result training accuracy %f cross_entropy %f" % (exam_accuracy_avg, exam_cross_entropy_avg)) exam_writer.close() if PUBLISH: # start publish: test_data_source = Input(TEST_PATH, TRAIN_FILE, shuffle=False, loop=False, temp_file_path=TEMP_FILE, n_mfcc=FLATTEN_SIZE_W, fixed_sample=FLATTEN_SIZE_H) result_frame = pd.DataFrame(np.zeros([test_data_source.get_total_files(), 2]), columns=['fname', 'label']) for i in range(0, test_data_source.get_total_files()): testX, _ = test_data_source.next(1) # testX = np.pad(testX, ((0, BATCH_SIZE - 1), (0, 0)), 'constant') logits = sess.run(y_conv, feed_dict={x: testX, keep_prob: 1.0}) result_frame.iloc[i, 0] = test_data_source.get_last_read_file_name() result_frame.iloc[i, 1] = utils.trans_index_to_label(test_data_source, utils.top_n_value(tf.squeeze(logits).eval(session=sess), 3)) result_frame.to_csv(path_or_buf=SUBMIT_FILE) print("=====> publish done") sess.close() utils.print_duration(start_time) print("===== job finish =====")
# scale the disparity to 8-bit for viewing disparity_scaled = (disparity / 16.).astype(np.uint8) # crop area not seen by *both* cameras and and area with car bonnet disparity_scaled = utils.crop_image(disparity_scaled, 0, 390, 135, width) return disparity_scaled # </section>End of Functions Section # <section>~~~~~~~~~~~~~~~~~~~~~~~~~~~~Main~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Colors = utils.gen_N_colors( 81) #get N different colors for the N possible classes utils.print_duration(time_to_setup) #print how long it took to set up # cycle through the images for filename_left in left_file_list: # <section>---------------Directory Checks--------------- # skipping if requested if check_skip(skip_forward_file_pattern, filename_left): continue else: skip_forward_file_pattern = "" # from the left image filename get the correspondoning right image filename_right = filename_left.replace("_L", "_R") full_path_filenames = join_paths_both_sides(full_path_directory_left, filename_left, full_path_directory_right,