def perform_classification(corpus_dir, extn, embedding_fname, class_labels_fname):
    '''
    Classify graphs with a linear-kernel SVM and a deep-kernel SVM on one
    random 70/30 train/test split.

    Features are per-file token counts restricted to the vocabulary of the
    loaded embedding model and then passed through ``Normalizer``.

    :param corpus_dir: folder handed to ``get_files`` to locate the graph files
    :param extn: file extension handed to ``get_files``
    :param embedding_fname: file with subgraph vectors, loaded via
        ``KeyedVectors.load_word2vec_format``
    :param class_labels_fname: file handed to ``get_class_labels`` to obtain
        one label per graph file
    :return: None
    '''
    # Embedding model and the sorted vocabulary used as the feature index.
    embedding_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')
    vocab = sorted(embedding_model.vocab.keys())
    logging.info('Vocab consists of {} subgraph features'.format(len(vocab)))

    graph_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(graph_files)))

    # Count features over the fixed vocabulary, then normalise.
    counter = CountVectorizer(
        input='filename',
        tokenizer=subgraph2vec_tokenizer,
        lowercase=False,
        vocabulary=vocab,
    )
    scaler = Normalizer()
    X = scaler.fit_transform(counter.fit_transform(graph_files))
    logging.info('X (sample) matrix shape: {}'.format(X.shape))

    Y = np.array(get_class_labels(graph_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    # A fresh random seed per call, so the split differs between runs.
    split_seed = randint(0, 1000)
    split = train_test_split(X, Y, test_size=0.3, random_state=split_seed)
    X_train, X_test, Y_train, Y_test = split
    logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(
        X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))

    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)

    # The deep kernel is derived from the embedding model and the same vocabulary.
    kernel = get_subgraph_kernel(embedding_model, vocab)
    deep_kernel_svm_classify(X_train, X_test, Y_train, Y_test, kernel)
def perform_classification(corpus_dir, extension, embedding_fname, class_labels_fname):
    """
    Classify graphs from precomputed graph embeddings on one random 90/10
    train/test split.

    :param corpus_dir: folder handed to ``get_files`` to locate the graph files
    :param extension: file extension handed to ``get_files``
    :param embedding_fname: JSON file mapping each name returned by
        ``get_files`` to its embedding vector
    :param class_labels_fname: file handed to ``get_class_labels`` to obtain
        one label per graph file
    :return: whatever ``rbf_svm_classify`` returns for the split
    """
    # Graph (Weisfeiler-Lehman kernel) files and their labels.
    graph_files = get_files(corpus_dir, extension)
    labels = get_class_labels(graph_files, class_labels_fname)
    Y = np.array(labels)
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    split_seed = randint(0, 1000)

    with open(embedding_fname, 'r') as handle:
        embeddings_by_file = json.load(handle)

    # Rows of X follow the order of graph_files, so they line up with Y.
    rows = []
    for path in graph_files:
        rows.append(embeddings_by_file[path])
    X = np.array(rows)

    split = train_test_split(X, Y, test_size=0.1, random_state=split_seed)
    X_train, X_test, Y_train, Y_test = split
    shapes_msg = 'Training and Test Matrix Shapes: {}. {}. {}. {} '.format(
        X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    logging.info(shapes_msg)

    return rbf_svm_classify(X_train, X_test, Y_train, Y_test)
def main():
    """
    Load the previously saved HOG SVM and report its error over the testing
    data set.

    The SVM is loaded from ``params.HOG_SVM_PATH_SAVED``. If loading raises,
    a hint is printed and the process exits with a non-zero status.

    :return: None
    """
    # Local import: the file's top-level imports are not visible from here.
    import sys

    # Load up the SVM stored from prior training. Catch Exception rather than
    # using a bare except so Ctrl-C and SystemExit are not swallowed.
    try:
        svm = cv2.ml.SVM_load(params.HOG_SVM_PATH_SAVED)
    except Exception:
        print("Missing files SVM")
        print("-- have you performed training to produce this file ?")
        # sys.exit instead of the interactive-only ``exit`` helper; status 1
        # so callers can tell the run failed.
        sys.exit(1)

    # Load ** testing ** data sets in the same class order as training
    # (here we perform patch sampling only from the centre of the +ve
    # class and only a single sample is taken,
    # hence [0,0] sample sizes and [False,True] centre weighting flags).
    print("Loading test data as a batch ...")

    paths = [params.DATA_testing_path_neg, params.DATA_testing_path_pos]
    use_centre_weighting = [False, True]
    class_names = params.DATA_CLASS_NAMES
    imgs_data = utils.load_images(paths, class_names, [0, 0],
                                  use_centre_weighting)

    print("Computing HOG descriptors...")  # for each testing image
    start = cv2.getTickCount()
    # Plain loop: the call is made for its side effect on each image object.
    for img_data in imgs_data:
        img_data.compute_hog_descriptor()
    utils.print_duration(start)

    # Get the example/sample HOG descriptors and class labels.
    samples = utils.get_hog_descriptors(imgs_data)
    class_labels = utils.get_class_labels(imgs_data)

    # Perform batch SVM classification over the whole set.
    print("Performing batch SVM classification over all data ...")
    results = svm.predict(samples)
    output = results[1].ravel()

    # Compute and report the error over the whole set: summed absolute
    # difference between labels and predictions, divided by the sample count.
    error = ((np.absolute(class_labels.ravel() - output).sum()) /
             float(output.shape[0]))
    print("Successfully trained SVM with {}% testing set error".format(
        round(error * 100, 2)))
    print(
        "-- meaining the SVM got {}% of the testing examples correct!".format(
            round((1.0 - error) * 100, 2)))
def cross_val_accuracy(corpus_dir, extension, embedding_fname, class_labels_fname, cv=10, mode=None):
    """
    Run ``cv`` (default 10) independent random 90/10 train/test experiments
    and return the mean and standard deviation of the collected scores.

    Note that each round is a fresh random hold-out split drawn with its own
    seed, not one partition of a k-fold scheme, so test sets may overlap
    between rounds.

    :param corpus_dir: folder handed to ``get_files`` to locate the graph files
    :param extension: file extension handed to ``get_files``
    :param embedding_fname: JSON file mapping each name returned by
        ``get_files`` to its embedding vector
    :param class_labels_fname: file handed to ``get_class_labels`` to obtain
        one label per graph file
    :param cv: integer number of random-split experiments to carry out
    :param mode: ``"linear"`` selects ``linear_svm_classify``; any other
        value (including the default ``None``) selects ``rbf_svm_classify``
    :return: tuple ``(mean, std)`` of the first element of each round's
        scores (presumably the accuracy -- confirm against the classifiers)
    """
    # Weisfeiler-Lehman kernel files and their labels.
    wlk_files = get_files(corpus_dir, extension)
    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    # The embeddings do not change between rounds, so read the file and build
    # the feature matrix once rather than on every iteration.
    with open(embedding_fname, 'r') as fh:
        graph_embedding_dict = json.load(fh)
    X = np.array([graph_embedding_dict[fname] for fname in wlk_files])

    classify = linear_svm_classify if mode == "linear" else rbf_svm_classify

    acc_results = []
    for _ in range(cv):
        # A new seed per round gives each experiment a different split.
        seed = randint(0, 1000)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.1, random_state=seed)
        scores = classify(X_train, X_test, Y_train, Y_test)
        acc_results.append(scores[0])

    return np.mean(acc_results), np.std(acc_results)
def perform_classification(corpus_dir, extn, embedding_fname, class_labels_fname):
    '''
    Run a linear SVM twice on the same random 90/10 split: first on
    normalised token-count features, then on precomputed graph embeddings.

    :param corpus_dir: folder handed to ``get_files`` to locate the graph files
    :param extn: file extension handed to ``get_files``
    :param embedding_fname: JSON file mapping each name returned by
        ``get_files`` to its embedding vector
    :param class_labels_fname: file handed to ``get_class_labels`` to obtain
        one label per graph file
    :return: None
    '''
    graph_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(graph_files)))

    # First feature set: token counts per file, then normalised.
    counter = CountVectorizer(
        input='filename',
        tokenizer=subgraph2vec_tokenizer,
        lowercase=False,
    )
    scaler = Normalizer()
    count_features = scaler.fit_transform(counter.fit_transform(graph_files))
    logging.info('X (sample) matrix shape: {}'.format(count_features.shape))

    Y = np.array(get_class_labels(graph_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    # One seed shared by both runs so they use the same train/test partition.
    split_seed = randint(0, 1000)

    def _split_and_classify(features):
        # Split, log the shapes, and hand the split to the linear SVM.
        X_train, X_test, Y_train, Y_test = train_test_split(
            features, Y, test_size=0.1, random_state=split_seed)
        logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(
            X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))
        linear_svm_classify(X_train, X_test, Y_train, Y_test)

    _split_and_classify(count_features)

    # Second feature set: embeddings looked up by file name (not normalised).
    with open(embedding_fname, 'r') as handle:
        embeddings_by_file = json.load(handle)
    embedding_features = np.array([embeddings_by_file[path] for path in graph_files])

    _split_and_classify(embedding_features)
def perform_classification(corpus_dir, extn, embedding_fname, class_labels_fname):
    '''
    Classify graphs with a linear-kernel SVM and a deep-kernel SVM on one
    random 90/10 train/test split.

    Features are per-file token counts restricted to the vocabulary of the
    loaded embedding model and then passed through ``Normalizer``.

    :param corpus_dir: folder handed to ``get_files`` to locate the graph files
    :param extn: file extension handed to ``get_files``
    :param embedding_fname: file with subgraph vectors, loaded via
        ``KeyedVectors.load_word2vec_format``
    :param class_labels_fname: file handed to ``get_class_labels`` to obtain
        one label per graph file
    :return: None
    '''
    # Embedding model first; its sorted vocabulary fixes the feature columns.
    w2v = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')
    feature_names = sorted(w2v.vocab.keys())
    n_features = len(feature_names)
    logging.info('Vocab consists of {} subgraph features'.format(n_features))

    files = get_files(corpus_dir, extn)
    n_files = len(files)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(n_files))

    vectorizer_kwargs = dict(
        input='filename',
        tokenizer=subgraph2vec_tokenizer,
        lowercase=False,
        vocabulary=feature_names,
    )
    vectorizer = CountVectorizer(**vectorizer_kwargs)
    l2 = Normalizer()
    raw_counts = vectorizer.fit_transform(files)
    X = l2.fit_transform(raw_counts)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))

    Y = np.array(get_class_labels(files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    # Random seed drawn per call; the split is not reproducible across runs.
    rs = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.1, random_state=rs)
    shape_report = 'Train and Test matrix shapes: {}, {}, {}, {} '.format(
        X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    logging.info(shape_report)

    # Baseline: linear kernel on the count features.
    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)

    # Same split, with the kernel built from the embedding model.
    deep_kernel_svm_classify(
        X_train, X_test, Y_train, Y_test,
        get_subgraph_kernel(w2v, feature_names))
def perform_classification(corpus_dir, extn, embeddings, class_labels_fname):
    '''
    Evaluate a linear SVM on precomputed graph embeddings with stratified,
    shuffled 10-fold cross validation and print the mean and standard
    deviation of the per-fold results.

    :param corpus_dir: folder handed to ``get_files`` to locate the graph files
    :param extn: file extension handed to ``get_files``
    :param embeddings: feature matrix, indexed with integer index arrays
        (e.g. a numpy array); presumably one row per file in the order
        returned by ``get_files`` -- confirm against the caller
    :param class_labels_fname: file handed to ``get_class_labels`` to obtain
        one label per graph file
    :return: mean over the folds of the value returned by
        ``linear_svm_classify`` (presumably the accuracy)
    '''
    # Kept function-local, as in the original, since the file's top-level
    # imports are not visible from here.
    from sklearn.model_selection import StratifiedKFold

    wlk_files = get_files(corpus_dir, extn)
    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    X = embeddings

    # random_state=None: the fold assignment differs from run to run.
    kf = StratifiedKFold(10, shuffle=True, random_state=None)

    accs = []
    for train_index, test_index in kf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        accs.append(linear_svm_classify(X_train, X_test, Y_train, Y_test))

    mean_acc = np.mean(accs)
    print(mean_acc, np.std(accs))
    return mean_acc
def main():
    """
    Train the HOG + SVM detector: load the training images, compute a HOG
    descriptor per sample, auto-train an SVM, save it to
    ``params.HOG_SVM_PATH_TRAIN`` and report the error on the training set.

    :return: None
    """
    ############################################################################
    # Load the training images.

    overall_start = cv2.getTickCount()

    print("Loading images...")
    start = cv2.getTickCount()

    # Every per-class list below is ordered (neg, pos), matching the class names.
    paths = [params.DATA_training_path_neg, params.DATA_training_path_pos]

    # Class names looked up by class number, one per entry in DATA_CLASS_NAMES.
    n_classes = len(params.DATA_CLASS_NAMES)
    class_names = []
    for class_number in range(n_classes):
        class_names.append(utils.get_class_name(class_number))

    # Number of sub-window samples to take from each example image, per class.
    sampling_sizes = [params.DATA_training_sample_count_neg,
                      params.DATA_training_sample_count_pos]

    # Centre-only sampling: off for background (-ve) images, on for object
    # (+ve) images.
    sample_from_centre = [False, True]

    imgs_data = utils.load_images(
        paths,
        class_names,
        sampling_sizes,
        sample_from_centre,
        params.DATA_WINDOW_OFFSET_FOR_TRAINING_SAMPLES,
        params.DATA_WINDOW_SIZE,
    )

    print("Loaded {} image(s)".format(len(imgs_data)))
    utils.print_duration(start)

    ############################################################################
    # HOG feature extraction.

    print("Computing HOG descriptors...")  # for each training image
    start = cv2.getTickCount()

    # Called once per image object for its side effect.
    for img_data in imgs_data:
        img_data.compute_hog_descriptor()

    utils.print_duration(start)

    ############################################################################
    # Train an SVM on the descriptors.

    print("Training SVM...")
    start = cv2.getTickCount()

    svm = cv2.ml.SVM_create()
    svm.setType(cv2.ml.SVM_C_SVC)           # SVM type
    svm.setKernel(params.HOG_SVM_kernel)    # kernel type chosen in params

    # All descriptors gathered into one array, plus the matching class labels.
    samples = utils.get_hog_descriptors(imgs_data)
    class_labels = utils.get_class_labels(imgs_data)

    # Termination criteria: iteration cap from params, or epsilon 1e-06.
    termination = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_COUNT,
                   params.HOG_SVM_max_training_iterations,
                   1.e-06)
    svm.setTermCriteria(termination)

    # Automatic training with 10 folds and balanced=True.
    svm.trainAuto(samples, cv2.ml.ROW_SAMPLE, class_labels, kFold=10, balanced=True)

    # Persist the trained model for later testing / detection.
    svm.save(params.HOG_SVM_PATH_TRAIN)

    ############################################################################
    # Measure performance on the very samples the SVM was trained on.

    output = svm.predict(samples)[1].ravel()
    mismatch_total = np.absolute(class_labels.ravel() - output).sum()
    error = mismatch_total / float(output.shape[0])

    # Training counts as successful when the error beats random guessing,
    # e.g. 1/2 = 0.5 for two classes.
    chance_level = 1.0 / len(params.DATA_CLASS_NAMES)
    if error < chance_level:
        print("Trained SVM obtained {}% training set error".format(round(error * 100, 2)))
        print("-- meaining the SVM got {}% of the training examples correct!".format(round((1.0 - error) * 100, 2)))
    else:
        print("Failed to train SVM. {}% error".format(round(error * 100, 2)))

    utils.print_duration(start)

    elapsed = get_elapsed_time(overall_start)
    print("Finished training HoG detector. {}".format(format_time(elapsed)))