Example #1
0
    def preprocess(self, tokenizer_name, var_length=False):
        """Create the dataset fields, load the train/test CSVs and build vocabularies.

        Sets ``self.text``, ``self.qid`` and ``self.target`` (the fields),
        ``self.train`` and ``self.test`` (the datasets read from
        ``self.train_csv`` / ``self.test_csv``), then builds the ``text`` and
        ``qid`` vocabularies over both datasets. Elapsed time is reported after
        loading and again after vocabulary construction.

        Args:
            tokenizer_name: passed to ``self.choose_tokenizer`` to obtain the
                tokenizer used by the text field.
            var_length: forwarded as ``include_lengths`` of the text field.
        """
        started_at = time.time()

        # one field per csv column
        text_tokenizer = self.choose_tokenizer(tokenizer_name)
        self.text = data.Field(
            batch_first=True,
            tokenize=text_tokenizer,
            include_lengths=var_length,
        )
        self.qid = data.Field()
        self.target = data.Field(
            sequential=False,
            use_vocab=False,
            is_target=True,
        )

        # csv column name -> (example attribute name, field)
        print('read and tokenize data...')
        train_fields = {
            'qid': ('qid', self.qid),
            'question_text': ('text', self.text),
            'target': ('target', self.target),
        }
        self.train = MyTabularDataset(
            path=self.train_csv, format='csv', fields=train_fields)

        # the test csv is read without the target column
        test_fields = {
            'qid': ('qid', self.qid),
            'question_text': ('text', self.text),
        }
        self.test = MyTabularDataset(
            path=self.test_csv, format='csv', fields=test_fields)
        print_duration(started_at, 'time to read and tokenize data: ')

        # vocabularies cover train and test together
        both_splits = (self.train, self.test)
        self.text.build_vocab(*both_splits, min_freq=1)
        self.qid.build_vocab(*both_splits)
        print_duration(started_at, 'time to read, tokenize and build vocab: ')
Example #2
0
def k_split_indices(randperm, cut_idxs, k, i, is_test):
    """Split a permutation of indices into train / validation / test parts.

    Fold ``j`` is the slice ``randperm[cut_idxs[j]:cut_idxs[j + 1]]``.

    Args:
        randperm: sliceable sequence of hashable sample indices.
        cut_idxs: fold boundaries into ``randperm``.
        k: number of folds.
        i: index of the fold used for validation.
        is_test: when true, the fold following the validation fold is held
            out as the test part (the last fold wraps around to fold 0).

    Returns:
        ``(train_index, val_index, test_index)``. ``train_index`` is a list of
        the entries of ``randperm`` that are in neither held-out slice, in
        their original order. ``test_index`` is ``[]`` when ``is_test`` is
        false.
    """
    time_start = time.time()

    # validation fold
    val_index = randperm[cut_idxs[i]:cut_idxs[i + 1]]

    # test fold: the next fold, wrapping to the first one after the last
    if is_test:
        test_fold = i + 1 if i <= k - 2 else 0
        test_index = randperm[cut_idxs[test_fold]:cut_idxs[test_fold + 1]]
    else:
        test_index = []

    # Build the exclusion set with a set union rather than `val + test`:
    # `+` only concatenates for lists, for array-like slices it would add
    # element-wise (or fail against the empty list).
    held_out = set(val_index).union(test_index)

    # train index: everything that is not held out
    print_duration(time_start, message='k_split_indices time')
    train_index = [idx for idx in randperm if idx not in held_out]
    print_duration(time_start, message='k_split_indices time')
    return train_index, val_index, test_index
Example #3
0
 def read_embedding(self, embeddings, unk_std, max_vectors, to_cache):
     """Create a ``MyVectors`` for each entry of ``embeddings`` and append it to ``self.vectors``.

     Args:
         embeddings: iterable of embedding identifiers, each passed as the
             first argument of ``MyVectors``.
         unk_std: ``std`` bound into ``normal_init`` to form ``unk_init``.
         max_vectors: forwarded to ``MyVectors``.
         to_cache: forwarded to ``MyVectors``.
     """
     started_at = time.time()
     # options shared by every embedding that is loaded
     common_options = {
         'to_cache': to_cache,
         'unk_init': partial(normal_init, std=unk_std),
         'max_vectors': max_vectors,
     }
     for source in embeddings:
         self.vectors.append(
             MyVectors(source, cache=self.cache, **common_options))
     print_duration(started_at, 'time to read embedding: ')
Example #4
0
def main():
    """Evaluate the previously trained HOG/SVM classifier on the testing set.

    Loads the SVM from ``params.HOG_SVM_PATH_SAVED``, loads the testing
    images, computes a HOG descriptor per image, classifies all of them in
    one batch and prints the resulting error rate.

    Exits the process with status 1 if the SVM cannot be loaded.
    """
    import sys  # local import: only needed for the failure exit below

    # load up the SVM stored from prior training

    try:
        svm = cv2.ml.SVM_load(params.HOG_SVM_PATH_SAVED)
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C and SystemExit
        # still propagate; exit non-zero so callers can detect the failure.
        print("Missing files  SVM")
        print("-- have you performed training to produce this file ?")
        sys.exit(1)

    # load ** testing ** data sets in the same class order as training
    # (here we perform patch sampling only from the centre of the +ve
    # class and only a single sample is taken
    # hence [0,0] sample sizes and [False,True] centre weighting flags)

    print("Loading test data as a batch ...")

    paths = [params.DATA_testing_path_neg, params.DATA_testing_path_pos]
    use_centre_weighting = [False, True]
    class_names = params.DATA_CLASS_NAMES
    imgs_data = utils.load_images(paths, class_names, [0, 0],
                                  use_centre_weighting)

    print("Computing HOG descriptors...")  # for each testing image
    start = cv2.getTickCount()
    # each descriptor is stored on its own img_data instance, so a plain loop
    # is used instead of building a throw-away list
    for img_data in imgs_data:
        img_data.compute_hog_descriptor()
    utils.print_duration(start)

    # get the example/sample HOG descriptors and class labels

    samples = utils.get_hog_descriptors(imgs_data)
    class_labels = utils.get_class_labels(imgs_data)

    # perform batch SVM classification over the whole set

    print("Performing batch SVM classification over all data  ...")

    results = svm.predict(samples)
    output = results[1].ravel()

    # compute and report the error over the whole set
    # (sum of absolute label differences divided by the number of samples)

    error = ((np.absolute(class_labels.ravel() - output).sum()) /
             float(output.shape[0]))
    print("Successfully trained SVM with {}% testing set error".format(
        round(error * 100, 2)))
    print(
        "-- meaining the SVM got {}% of the testing examples correct!".format(
            round((1.0 - error) * 100, 2)))
def main():
    """Train the HOG/SVM detector on the training images and report its training error.

    Steps: load the training images, compute a HOG descriptor per image,
    auto-train a C-SVC SVM on the descriptors, save it to
    ``params.HOG_SVM_PATH_TRAIN`` and print the error measured on the same
    training samples together with the elapsed times.
    """

    ############################################################################
    # stage 1: load the training image examples

    program_start = cv2.getTickCount()

    print("Loading images...")
    stage_start = cv2.getTickCount()

    # all per-class lists below are given in class order (neg, pos)
    data_paths = [params.DATA_training_path_neg, params.DATA_training_path_pos]

    # class names are looked up from the class numbers 0..N-1
    class_names = [
        utils.get_class_name(number)
        for number in range(len(params.DATA_CLASS_NAMES))
    ]

    # number of sub-window samples taken from each negative / positive image
    samples_per_image = [
        params.DATA_training_sample_count_neg,
        params.DATA_training_sample_count_pos,
    ]

    # background (-ve) images are not sampled from the centre only,
    # object (+ve) images are
    centre_only = [False, True]

    imgs_data = utils.load_images(
        data_paths,
        class_names,
        samples_per_image,
        centre_only,
        params.DATA_WINDOW_OFFSET_FOR_TRAINING_SAMPLES,
        params.DATA_WINDOW_SIZE,
    )

    print("Loaded {} image(s)".format(len(imgs_data)))
    utils.print_duration(stage_start)

    ############################################################################
    # stage 2: HOG feature extraction

    print("Computing HOG descriptors...")  # for each training image
    stage_start = cv2.getTickCount()
    # every descriptor is kept on its own img_data instance
    for img_data in imgs_data:
        img_data.compute_hog_descriptor()
    utils.print_duration(stage_start)

    ############################################################################
    # stage 3: train an SVM on the descriptors

    print("Training SVM...")
    stage_start = cv2.getTickCount()

    # SVM configuration: C-SVC with the configured kernel, stopping on either
    # the iteration limit or the epsilon
    svm = cv2.ml.SVM_create()
    svm.setType(cv2.ml.SVM_C_SVC)
    svm.setKernel(params.HOG_SVM_kernel)
    stop_mode = cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_COUNT
    svm.setTermCriteria(
        (stop_mode, params.HOG_SVM_max_training_iterations, 1e-06))

    # one descriptor row and one class label per training image
    samples = utils.get_hog_descriptors(imgs_data)
    class_labels = utils.get_class_labels(imgs_data)

    # trainAuto() searches over the kernel parameters and the penalty term C
    # (N.B. trainAuto() syntax is correct as of OpenCV 3.4.x)
    svm.trainAuto(samples, cv2.ml.ROW_SAMPLE, class_labels,
                  kFold=10, balanced=True)

    # keep the trained SVM on disk for later testing / detection
    svm.save(params.HOG_SVM_PATH_TRAIN)

    ############################################################################
    # stage 4: measure the error on the samples we just trained on

    output = svm.predict(samples)[1].ravel()
    label_difference = np.absolute(class_labels.ravel() - output)
    error = label_difference.sum() / float(output.shape[0])

    # training counts as successful when the error beats random guessing,
    # e.g. 1/2 = 0.5 (i.e. 50%) for 2 class labels
    chance_error = 1.0 / len(params.DATA_CLASS_NAMES)
    error_percent = round(error * 100, 2)
    if error < chance_error:
        correct_percent = round((1.0 - error) * 100, 2)
        print("Trained SVM obtained {}% training set error".format(error_percent))
        print("-- meaining the SVM got {}% of the training examples correct!".format(correct_percent))
    else:
        print("Failed to train SVM. {}% error".format(error_percent))

    utils.print_duration(stage_start)

    total_time = format_time(get_elapsed_time(program_start))
    print("Finished training HoG detector. {}".format(total_time))
Example #6
0
    def fit(self, epoch, n_eval, tresh, early_stop, warmup_epoch, clip):
        """Train ``self.model`` with periodic validation and early stopping.

        Each epoch steps the scheduler and iterates over ``self.train_dl``;
        every batch does a forward/backward pass, gradient-norm clipping and
        an optimizer step. Every ``eval_every`` steps the model is evaluated
        on ``self.val_dl`` and saved through ``self.recorder`` when the
        validation F1 is a new maximum. After training, the model returned by
        ``self.recorder.load`` replaces ``self.model`` and is evaluated on
        the training data.

        Args:
            epoch: maximum number of epochs to run.
            n_eval: evaluation runs every
                ``int(number_of_train_batches / n_eval)`` training steps.
            tresh: forwarded to ``self.evaluate`` (presumably the decision
                threshold -- TODO confirm against ``evaluate``).
            early_stop: training stops once this many consecutive epochs
                (counted from ``warmup_epoch`` on) brought no new best
                validation F1.
            warmup_epoch: first epoch at which early-stopping bookkeeping
                starts and ``requires_grad`` is enabled on the embedding
                weights.
            clip: maximum gradient norm passed to ``nn.utils.clip_grad_norm_``.
        """

        step = 0
        min_loss = 1e5
        max_f1 = -1
        # only referenced by the commented-out test evaluation below
        max_test_f1 = -1
        no_improve_epoch = 0
        no_improve_in_previous_epoch = False
        fine_tuning = False
        losses = []
        # only referenced by the commented-out test evaluation below
        best_test_info = None
        torch.backends.cudnn.benchmark = False
        # Counts the training batches by materialising one full pass over the
        # iterator.
        # NOTE(review): this is 0 when n_eval exceeds the number of batches,
        # in which case `step % eval_every` below raises ZeroDivisionError.
        eval_every = int(len(list(iter(self.train_dl))) / n_eval)

        time_start = time.time()
        print(self.model)
        for e in range(epoch):
            self.scheduler.step()
            # early stopping: after warm-up, count consecutive epochs in which
            # no evaluation produced a new best validation F1
            if e >= warmup_epoch:
                if no_improve_in_previous_epoch:
                    no_improve_epoch += 1
                    if no_improve_epoch >= early_stop:
                        # this epoch is not trained, so step `e` back to keep
                        # the per-epoch time below based on completed epochs
                        e = e - 1
                        break
                else:
                    no_improve_epoch = 0
                # reset to False by the evaluation code when F1 improves
                no_improve_in_previous_epoch = True
            # once warm-up is over, let the embedding weights receive gradients
            if not fine_tuning and e >= warmup_epoch:
                self.model.embedding.weight.requires_grad = True
                fine_tuning = True
            self.train_dl.init_epoch()

            for train_batch in iter(self.train_dl):
                step += 1
                self.model.zero_grad()
                self.model.train()
                model_input = self.to_cuda(train_batch.text)
                y = train_batch.target.type(torch.Tensor).cuda()
                pred = self.model.forward(*model_input).view(-1)
                loss = self.loss_func(pred, y)
                self.recorder.tr_record.append(
                    {'tr_loss': loss.cpu().data.numpy()})
                loss.backward()
                # clips in place and returns the norm that is recorded below
                total_norm = nn.utils.clip_grad_norm_(self.model.parameters(),
                                                      clip)
                # (disabled) manual computation of the total gradient norm:
                # parameters = list(filter(lambda p: p.grad is not None, self.model.parameters()))
                # total_norm = 0
                # for p in parameters:
                #     param_norm = p.grad.data.norm(2)
                #     total_norm += param_norm.item() ** 2
                # total_norm = total_norm ** (1. / 2)

                self.recorder.norm_record.append({'grad_norm': total_norm})
                self.optimizer.step()

                if step % eval_every == 0:
                    # self.scheduler.step()
                    with torch.no_grad():
                        # train evaluation: only the loss of the batch at each
                        # evaluation step is collected, so train_loss is the
                        # mean over those batches since training started
                        losses.append(loss.cpu().data.numpy())
                        train_loss = np.mean(losses)

                        # val evaluation; predictions are dumped to one csv
                        # per evaluation, numbered from 0
                        val_loss, val_f1, val_ids, val_prob, val_true = self.evaluate(
                            self.val_dl, tresh)
                        pred_to_csv(
                            val_ids, val_prob, val_true,
                            f'tmp/val_probs_{int(step / eval_every) - 1}.csv')
                        self.recorder.val_record.append({
                            'step': step,
                            'loss': val_loss,
                            'f1': val_f1
                        })
                        info = {
                            'best_ep': e,
                            'step': step,
                            'train_loss': train_loss,
                            'val_loss': val_loss,
                            'val_f1': val_f1
                        }
                        self.recorder.save_step(info, message=True)
                        # model selection is driven by validation F1 ...
                        if val_f1 > max_f1:
                            self.recorder.save(self.model, info)
                            max_f1 = val_f1
                            no_improve_in_previous_epoch = False
                        # ... the minimum validation loss is only reported
                        if val_loss < min_loss:
                            min_loss = val_loss

                        # (disabled) test evaluation
                        # if self.args.test:
                        #    test_loss, test_f1 =  self.evaluate(self.test_dl, tresh)
                        #    test_info = {'test_ep': e, 'test_step': step, 'test_loss': test_loss, 'test_f1': test_f1}
                        #    self.recorder.test_record.append({'step': step, 'loss': test_loss, 'f1': test_f1})
                        #    print('epoch {:02} - step {:06} - test_loss {:.4f} - test_f1 {:.4f}'.format(*list(test_info.values())))
                        #    if test_f1 >= max_test_f1:
                        #        max_test_f1 = test_f1
                        #        best_test_info = test_info

        # average time per epoch; relies on print_duration returning the
        # elapsed time.
        # NOTE(review): `e` is unbound here when epoch == 0 (loop never runs).
        tr_time = print_duration(time_start, 'training time: ')
        self.recorder.append_info({'ep_time': tr_time / (e + 1)})

        #if self.args.test:
        #    self.recorder.append_info(best_test_info, message='Best results for test:')
        self.recorder.append_info({'min_loss': min_loss}, 'min val loss: ')

        # continue with the model stored by the recorder (saved above on each
        # new best validation F1)
        self.model, info = self.recorder.load(message='best model:')

        # final train evaluation
        train_loss, train_f1, _, _, _ = self.evaluate(self.train_dl, tresh)
        tr_info = {'train_loss': train_loss, 'train_f1': train_f1}
        self.recorder.append_info(tr_info, message='train loss and f1:')
Example #7
0
 def embedding_lookup(self):
     """Call ``load_vectors`` on the text vocabulary with ``self.vectors`` and report the elapsed time."""
     print('embedding lookup...')
     started_at = time.time()
     vocabulary = self.text.vocab
     vocabulary.load_vectors(self.vectors)
     print_duration(started_at, 'time for embedding lookup: ')
Example #8
0
                                                              feed_dict={x: testX, y_: testY, keep_prob: 1.0})
        exam_cross_entropy_avg = utils.get_moving_avg(exam_cross_entropy_avg, test_cross_entropy)
        exam_accuracy_avg = utils.get_moving_avg(exam_accuracy_avg, test_accuracy)
        exam_writer.add_summary(summary, i)
        if i % DISPLAY_STEP == 0:
            print("step %d, exam accuracy %f cross_entropy %f" % (i, exam_accuracy_avg, exam_cross_entropy_avg))
    print("=====> test result training accuracy %f cross_entropy %f" % (exam_accuracy_avg, exam_cross_entropy_avg))
exam_writer.close()

if PUBLISH:
    # Publish step: run the network over every file of the test data source,
    # one file at a time, and write (file name, label) rows to SUBMIT_FILE.
    test_data_source = Input(TEST_PATH, TRAIN_FILE, shuffle=False,
                                  loop=False, temp_file_path=TEMP_FILE,
                                  n_mfcc=FLATTEN_SIZE_W,
                                  fixed_sample=FLATTEN_SIZE_H)
    result_frame = pd.DataFrame(np.zeros([test_data_source.get_total_files(), 2]), columns=['fname', 'label'])
    for i in range(0, test_data_source.get_total_files()):
        testX, _ = test_data_source.next(1)
        # testX = np.pad(testX, ((0, BATCH_SIZE - 1), (0, 0)), 'constant')
        logits = sess.run(y_conv,
                          feed_dict={x: testX, keep_prob: 1.0})
        result_frame.iloc[i, 0] = test_data_source.get_last_read_file_name()
        # `logits` is already a NumPy array returned by sess.run, so squeeze
        # it with NumPy: building tf.squeeze(...) here would add a new op to
        # the graph and run the session again for every file.
        result_frame.iloc[i, 1] = utils.trans_index_to_label(test_data_source, utils.top_n_value(np.squeeze(logits), 3))

    result_frame.to_csv(path_or_buf=SUBMIT_FILE)
    print("=====> publish done")

sess.close()
utils.print_duration(start_time)
print("===== job finish =====")
Example #9
0
    # scale the disparity to 8-bit for viewing
    disparity_scaled = (disparity / 16.).astype(np.uint8)

    # crop area not seen by *both* cameras and and area with car bonnet
    disparity_scaled = utils.crop_image(disparity_scaled, 0, 390, 135, width)

    return disparity_scaled


# </section>End of Functions Section

# <section>~~~~~~~~~~~~~~~~~~~~~~~~~~~~Main~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# One colour per class: request 81 different colours, one for each of the
# possible classes.
Colors = utils.gen_N_colors(
    81)  # N = 81 possible classes

# Report how long the set-up phase took (measured from time_to_setup).
utils.print_duration(time_to_setup)

# cycle through the images
for filename_left in left_file_list:
    # <section>---------------Directory Checks---------------
    # skipping if requested
    if check_skip(skip_forward_file_pattern, filename_left):
        continue
    else:
        skip_forward_file_pattern = ""

    # from the left image filename get the correspondoning right image
    filename_right = filename_left.replace("_L", "_R")
    full_path_filenames = join_paths_both_sides(full_path_directory_left,
                                                filename_left,
                                                full_path_directory_right,